; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512

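; The llvm.ssub.with.overflow intrinsics return a {result, overflow} pair:
; the first element is the wrapping signed difference and the second is a
; per-lane i1 that is true when the subtraction overflowed.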
declare {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128>, <2 x i128>)

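; The <1 x i32> case lowers to plain scalar code: subl sets the overflow
; flag directly, seto captures it, and negl sign-extends the bit to 0/-1.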
define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
; CHECK-LABEL: ssubo_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    subl %esi, %edi
; CHECK-NEXT:    seto %al
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    movl %edi, (%rdx)
; CHECK-NEXT:    retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, <1 x i32>* %p2
  ret <1 x i32> %res
}

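; The vector lowerings have no overflow flag to read, so they recompute the
; condition from signs: for %res = %a0 - %a1, signed overflow occurred iff
; (%a1 > 0) XOR (%a0 > %res), which maps onto two pcmpgtd and a pxor.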
define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psubd %xmm1, %xmm3
; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    movq %xmm3, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: ssubo_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmovq %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, <2 x i32>* %p2
  ret <2 x i32> %res
}

define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
; SSE2-LABEL: ssubo_v3i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm1, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movq %xmm3, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, 8(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v3i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    psubd %xmm1, %xmm3
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movq %xmm3, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v3i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psubd %xmm1, %xmm3
; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pextrd $2, %xmm3, 8(%rdi)
; SSE41-NEXT:    movq %xmm3, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: ssubo_v3i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX-NEXT:    vmovq %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v3i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, <3 x i32>* %p2
  ret <3 x i32> %res
}

define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psubd %xmm1, %xmm3
; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm3, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: ssubo_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, <4 x i32>* %p2
  ret <4 x i32> %res
}

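; <6 x i32> is not a legal vector type, so under the SSE calling convention
; the elements arrive in GPRs and on the stack, the wide result is returned
; through a hidden sret pointer (note the movq %rdi, %rax), and the overflow
; logic is split across two xmm halves.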
define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
; SSE2-LABEL: ssubo_v6i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    movd %r8d, %xmm1
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movd %edx, %xmm1
; SSE2-NEXT:    movd %esi, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movd %r9d, %xmm1
; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psubd %xmm0, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psubd %xmm2, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    movq %xmm3, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm4, (%rcx)
; SSE2-NEXT:    movq %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v6i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    movd %r8d, %xmm1
; SSSE3-NEXT:    movd %ecx, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT:    movd %edx, %xmm1
; SSSE3-NEXT:    movd %esi, %xmm3
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT:    movd %r9d, %xmm1
; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    psubd %xmm0, %xmm4
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm5, %xmm5
; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
; SSSE3-NEXT:    pxor %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm3
; SSSE3-NEXT:    psubd %xmm2, %xmm3
; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
; SSSE3-NEXT:    pxor %xmm1, %xmm2
; SSSE3-NEXT:    movq %xmm3, 16(%rcx)
; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v6i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    movd %esi, %xmm1
; SSE41-NEXT:    pinsrd $1, %edx, %xmm1
; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
; SSE41-NEXT:    pinsrd $3, %r8d, %xmm1
; SSE41-NEXT:    movd %r9d, %xmm0
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    psubd %xmm3, %xmm4
; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
; SSE41-NEXT:    pxor %xmm5, %xmm5
; SSE41-NEXT:    pcmpgtd %xmm5, %xmm3
; SSE41-NEXT:    pxor %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psubd %xmm2, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm5, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    movq %xmm1, 16(%rcx)
; SSE41-NEXT:    movdqa %xmm4, (%rcx)
; SSE41-NEXT:    movq %xmm0, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm3, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v6i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v6i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v6i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, <6 x i32>* %p2
  ret <6 x i32> %res
}

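; For <8 x i32> the same sign-compare pattern is applied per 128-bit half on
; SSE and AVX1 (AVX1 recombines the halves with vinsertf128), while AVX2 and
; AVX512 handle the whole vector with one 256-bit psubd/pcmpgtd sequence.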
define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psubd %xmm2, %xmm5
; SSE-NEXT:    pcmpgtd %xmm4, %xmm2
; SSE-NEXT:    pcmpgtd %xmm5, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psubd %xmm3, %xmm2
; SSE-NEXT:    pcmpgtd %xmm4, %xmm3
; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm5, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: ssubo_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, <8 x i32>* %p2
  ret <8 x i32> %res
}

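; On AVX512 the i1 results live in mask registers: both compares write k
; registers, kxorw combines them, and a zeroing-masked vpternlogd $255
; (all-ones) materializes the 0/-1 lanes the sext requires.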
define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    movdqa %xmm0, %xmm8
; SSE-NEXT:    psubd %xmm4, %xmm8
; SSE-NEXT:    pcmpgtd %xmm9, %xmm4
; SSE-NEXT:    pcmpgtd %xmm8, %xmm0
; SSE-NEXT:    pxor %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psubd %xmm5, %xmm4
; SSE-NEXT:    pcmpgtd %xmm9, %xmm5
; SSE-NEXT:    pcmpgtd %xmm4, %xmm1
; SSE-NEXT:    pxor %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm5
; SSE-NEXT:    psubd %xmm6, %xmm5
; SSE-NEXT:    pcmpgtd %xmm9, %xmm6
; SSE-NEXT:    pcmpgtd %xmm5, %xmm2
; SSE-NEXT:    pxor %xmm6, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psubd %xmm7, %xmm6
; SSE-NEXT:    pcmpgtd %xmm9, %xmm7
; SSE-NEXT:    pcmpgtd %xmm6, %xmm3
; SSE-NEXT:    pxor %xmm7, %xmm3
; SSE-NEXT:    movdqa %xmm6, 48(%rdi)
; SSE-NEXT:    movdqa %xmm5, 32(%rdi)
; SSE-NEXT:    movdqa %xmm4, 16(%rdi)
; SSE-NEXT:    movdqa %xmm8, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: ssubo_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm4, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT:    vpsubd %xmm4, %xmm7, %xmm8
; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm7, %xmm7
; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm3, %xmm7
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm6, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsubd %xmm6, %xmm4, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm5
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT:    vmovdqa %xmm8, 48(%rdi)
; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT:    vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm3, %ymm5
; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm2, %ymm4
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %zmm2, %zmm1, %k0
; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, <16 x i32>* %p2
  ret <16 x i32> %res
}

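; For bytes no pcmpgt dance is needed: the lowering compares a saturating
; subtract (psubsb) against the wrapping subtract (psubb); the lanes differ
; exactly when the signed subtraction overflowed.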
define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
; SSE2-LABEL: ssubo_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psubsb %xmm1, %xmm2
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSE2-NEXT:    psrad $24, %xmm4
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psubsb %xmm1, %xmm2
; SSSE3-NEXT:    psubb %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSSE3-NEXT:    psrad $24, %xmm4
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubsb %xmm1, %xmm2
; SSE41-NEXT:    psubb %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm3
; SSE41-NEXT:    pmovsxbd %xmm3, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm3
; SSE41-NEXT:    psrad $31, %xmm3
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, <16 x i8>* %p2
  ret <16 x i32> %res
}

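; <8 x i16> uses the same saturation trick with psubsw/psubw, then widens
; the resulting word mask to <8 x i32> for the sext.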
define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
; SSE2-LABEL: ssubo_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psubsw %xmm1, %xmm2
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psubsw %xmm1, %xmm2
; SSSE3-NEXT:    psubw %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubsw %xmm1, %xmm2
; SSE41-NEXT:    psubw %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqw %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, <8 x i16>* %p2
  ret <8 x i32> %res
}

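; SSE2 has no 64-bit signed compare, so the v2i64 path flips the sign bits
; with a 0x80000000 splat and synthesizes the pcmpgtq semantics from 32-bit
; compares; AVX and later can use vpcmpgtq directly.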
define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; SSE-LABEL: ssubo_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    psubq %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pcmpgtd %xmm0, %xmm4
; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE-NEXT:    por %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: ssubo_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtq %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, <2 x i64>* %p2
  ret <2 x i32> %res
}

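; i24 is promoted to i32: the inputs are sign-extended in place with a
; pslld $8 / psrad $8 pair, and overflow is detected by checking that the
; 32-bit difference still round-trips through the same shift pair. The
; 3-byte stores are scalarized into movw + movb pieces.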
define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
; SSE2-LABEL: ssubo_v4i24:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pslld $8, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    pslld $8, %xmm2
; SSE2-NEXT:    psrad $8, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    pslld $8, %xmm0
; SSE2-NEXT:    psrad $8, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    movw %ax, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSE2-NEXT:    movd %xmm1, %ecx
; SSE2-NEXT:    movw %cx, 9(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, %edx
; SSE2-NEXT:    movw %dx, 6(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSE2-NEXT:    movd %xmm1, %esi
; SSE2-NEXT:    movw %si, 3(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 2(%rdi)
; SSE2-NEXT:    shrl $16, %ecx
; SSE2-NEXT:    movb %cl, 11(%rdi)
; SSE2-NEXT:    shrl $16, %edx
; SSE2-NEXT:    movb %dl, 8(%rdi)
; SSE2-NEXT:    shrl $16, %esi
; SSE2-NEXT:    movb %sil, 5(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v4i24:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pslld $8, %xmm1
; SSSE3-NEXT:    psrad $8, %xmm1
; SSSE3-NEXT:    pslld $8, %xmm2
; SSSE3-NEXT:    psrad $8, %xmm2
; SSSE3-NEXT:    psubd %xmm1, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    pslld $8, %xmm0
; SSSE3-NEXT:    psrad $8, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movd %xmm2, %eax
; SSSE3-NEXT:    movw %ax, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSSE3-NEXT:    movd %xmm1, %ecx
; SSSE3-NEXT:    movw %cx, 9(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, %edx
; SSSE3-NEXT:    movw %dx, 6(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-NEXT:    movd %xmm1, %esi
; SSSE3-NEXT:    movw %si, 3(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 2(%rdi)
; SSSE3-NEXT:    shrl $16, %ecx
; SSSE3-NEXT:    movb %cl, 11(%rdi)
; SSSE3-NEXT:    shrl $16, %edx
; SSSE3-NEXT:    movb %dl, 8(%rdi)
; SSSE3-NEXT:    shrl $16, %esi
; SSSE3-NEXT:    movb %sil, 5(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v4i24:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pslld $8, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    pslld $8, %xmm2
; SSE41-NEXT:    psrad $8, %xmm2
; SSE41-NEXT:    psubd %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    pslld $8, %xmm0
; SSE41-NEXT:    psrad $8, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pextrd $3, %xmm2, %eax
; SSE41-NEXT:    movw %ax, 9(%rdi)
; SSE41-NEXT:    pextrd $2, %xmm2, %ecx
; SSE41-NEXT:    movw %cx, 6(%rdi)
; SSE41-NEXT:    pextrd $1, %xmm2, %edx
; SSE41-NEXT:    movw %dx, 3(%rdi)
; SSE41-NEXT:    movd %xmm2, %esi
; SSE41-NEXT:    movw %si, (%rdi)
; SSE41-NEXT:    shrl $16, %eax
; SSE41-NEXT:    movb %al, 11(%rdi)
; SSE41-NEXT:    shrl $16, %ecx
; SSE41-NEXT:    movb %cl, 8(%rdi)
; SSE41-NEXT:    shrl $16, %edx
; SSE41-NEXT:    movb %dl, 5(%rdi)
; SSE41-NEXT:    shrl $16, %esi
; SSE41-NEXT:    movb %sil, 2(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: ssubo_v4i24:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpextrd $3, %xmm1, %eax
; AVX-NEXT:    movw %ax, 9(%rdi)
; AVX-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX-NEXT:    movw %cx, 6(%rdi)
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    movw %dx, 3(%rdi)
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    movw %si, (%rdi)
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    movb %al, 11(%rdi)
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    movb %cl, 8(%rdi)
; AVX-NEXT:    shrl $16, %edx
; AVX-NEXT:    movb %dl, 5(%rdi)
; AVX-NEXT:    shrl $16, %esi
; AVX-NEXT:    movb %sil, 2(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v4i24:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX512-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
; AVX512-NEXT:    movw %ax, 9(%rdi)
; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX512-NEXT:    movw %cx, 6(%rdi)
; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
; AVX512-NEXT:    movw %dx, 3(%rdi)
; AVX512-NEXT:    vmovd %xmm1, %esi
; AVX512-NEXT:    movw %si, (%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movb %al, 11(%rdi)
; AVX512-NEXT:    shrl $16, %ecx
; AVX512-NEXT:    movb %cl, 8(%rdi)
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    movb %dl, 5(%rdi)
; AVX512-NEXT:    shrl $16, %esi
; AVX512-NEXT:    movb %sil, 2(%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, <4 x i24>* %p2
  ret <4 x i32> %res
}

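; For <4 x i1> the boolean inputs are sign-extended from bit 0 (pslld $31 /
; psrad $31) before the subtract, and the packed result is stored via
; movmskps; AVX512 instead keeps the whole computation in k registers via
; vptestmd/kxorw.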
define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
; SSE-LABEL: ssubo_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    movmskps %xmm1, %eax
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm0, %xmm1
; SSE-NEXT:    movb %al, (%rdi)
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: ssubo_v4i1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpslld $31, %xmm0, %xmm1
; AVX-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmovmskps %xmm1, %eax
; AVX-NEXT:    movb %al, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k2 {%k1}
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
; AVX512-NEXT:    kshiftlw $12, %k1, %k0
; AVX512-NEXT:    kshiftrw $12, %k0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    movb %al, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, <4 x i1>* %p2
  ret <4 x i32> %res
}

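; i128 is not a legal vector element, so v2i128 is fully scalarized: each
; lane is a subq/sbbq pair whose overflow flag is captured with seto and
; then negated into the corresponding 0/-1 lane of the returned <2 x i32>.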
define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
; SSE2-LABEL: ssubo_v2i128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT:    subq %r8, %rdi
; SSE2-NEXT:    sbbq %r9, %rsi
; SSE2-NEXT:    seto %r8b
; SSE2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    seto %al
; SSE2-NEXT:    movzbl %al, %eax
; SSE2-NEXT:    negl %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movzbl %r8b, %eax
; SSE2-NEXT:    negl %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movq %rdx, 16(%r10)
; SSE2-NEXT:    movq %rdi, (%r10)
; SSE2-NEXT:    movq %rcx, 24(%r10)
; SSE2-NEXT:    movq %rsi, 8(%r10)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v2i128:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT:    subq %r8, %rdi
; SSSE3-NEXT:    sbbq %r9, %rsi
; SSSE3-NEXT:    seto %r8b
; SSSE3-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    seto %al
; SSSE3-NEXT:    movzbl %al, %eax
; SSSE3-NEXT:    negl %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movzbl %r8b, %eax
; SSSE3-NEXT:    negl %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %rdx, 16(%r10)
; SSSE3-NEXT:    movq %rdi, (%r10)
; SSSE3-NEXT:    movq %rcx, 24(%r10)
; SSSE3-NEXT:    movq %rsi, 8(%r10)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v2i128:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT:    subq %r8, %rdi
; SSE41-NEXT:    sbbq %r9, %rsi
; SSE41-NEXT:    seto %r8b
; SSE41-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    seto %al
; SSE41-NEXT:    movzbl %al, %r9d
; SSE41-NEXT:    negl %r9d
; SSE41-NEXT:    movzbl %r8b, %eax
; SSE41-NEXT:    negl %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pinsrd $1, %r9d, %xmm0
; SSE41-NEXT:    movq %rdx, 16(%r10)
; SSE41-NEXT:    movq %rdi, (%r10)
; SSE41-NEXT:    movq %rcx, 24(%r10)
; SSE41-NEXT:    movq %rsi, 8(%r10)
; SSE41-NEXT:    retq
;
; AVX-LABEL: ssubo_v2i128:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT:    subq %r8, %rdi
; AVX-NEXT:    sbbq %r9, %rsi
; AVX-NEXT:    seto %r8b
; AVX-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT:    seto %al
; AVX-NEXT:    movzbl %al, %r9d
; AVX-NEXT:    negl %r9d
; AVX-NEXT:    movzbl %r8b, %eax
; AVX-NEXT:    negl %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX-NEXT:    movq %rdx, 16(%r10)
; AVX-NEXT:    movq %rdi, (%r10)
; AVX-NEXT:    movq %rcx, 24(%r10)
; AVX-NEXT:    movq %rsi, 8(%r10)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v2i128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    seto %al
; AVX512-NEXT:    kmovd %eax, %k0
; AVX512-NEXT:    subq %r8, %rdi
; AVX512-NEXT:    sbbq %r9, %rsi
; AVX512-NEXT:    seto %al
; AVX512-NEXT:    andl $1, %eax
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    kshiftlw $1, %k0, %k0
; AVX512-NEXT:    korw %k0, %k1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    movq %rdx, 16(%r10)
; AVX512-NEXT:    movq %rdi, (%r10)
; AVX512-NEXT:    movq %rcx, 24(%r10)
; AVX512-NEXT:    movq %rsi, 8(%r10)
; AVX512-NEXT:    retq
  %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, <2 x i128>* %p2
  ret <2 x i32> %res
}