1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck -check-prefixes=CHECK,DAG %s
3; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=arm64-eabi -aarch64-neon-syntax=apple 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL,FALLBACK
4
5; FALLBACK-NOT: remark:{{.*}} sabdl8h
; sabd intrinsic on <8 x i8> whose result is zero-extended to <8 x i16>:
; the zext folds into the widening instruction, giving a single sabdl.8h.
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: sabdl8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sabdl.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
19
20; FALLBACK-NOT: remark:{{.*}} sabdl4s
; Same pattern as sabdl8h at i16->i32 width: sabd.v4i16 + zext selects sabdl.4s.
define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: sabdl4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sabdl.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}
34
35; FALLBACK-NOT: remark:{{.*}} sabdl2d
; Same pattern as sabdl8h at i32->i64 width: sabd.v2i32 + zext selects sabdl.2d.
define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: sabdl2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sabdl.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
49
; "sabdl2" variants: the inputs are the high halves of 128-bit loads
; (shufflevector selecting the upper lane indices). Codegen narrows the loads
; to just the high 8 bytes (ldr dN, [xN, #8]) and uses plain sabdl rather than
; a full q-register load plus sabdl2.
define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: sabdl2_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0, #8]
; CHECK-NEXT:    ldr d1, [x1, #8]
; CHECK-NEXT:    sabdl.8h v0, v0, v1
; CHECK-NEXT:    ret
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

; High-half i16 inputs -> sabdl.4s (same narrowing of the loads as above).
define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: sabdl2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0, #8]
; CHECK-NEXT:    ldr d1, [x1, #8]
; CHECK-NEXT:    sabdl.4s v0, v0, v1
; CHECK-NEXT:    ret
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

; High-half i32 inputs -> sabdl.2d (same narrowing of the loads as above).
define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: sabdl2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0, #8]
; CHECK-NEXT:    ldr d1, [x1, #8]
; CHECK-NEXT:    sabdl.2d v0, v0, v1
; CHECK-NEXT:    ret
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
97
; FALLBACK-NOT: remark:{{.*}} uabdl8h
; Unsigned counterpart of sabdl8h: uabd.v8i8 + zext selects uabdl.8h.
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: uabdl8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uabdl.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
112
; FALLBACK-NOT: remark:{{.*}} uabdl4s
; Unsigned counterpart of sabdl4s: uabd.v4i16 + zext selects uabdl.4s.
define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: uabdl4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uabdl.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}
127
; FALLBACK-NOT: remark:{{.*}} uabdl2d
; Unsigned counterpart of sabdl2d: uabd.v2i32 + zext selects uabdl.2d.
define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: uabdl2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uabdl.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
142
; Unsigned "uabdl2" variants: high halves of 128-bit loads feed uabd + zext.
; As with the sabdl2 tests, codegen loads only the high 8 bytes
; (ldr dN, [xN, #8]) and selects plain uabdl.
define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: uabdl2_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0, #8]
; CHECK-NEXT:    ldr d1, [x1, #8]
; CHECK-NEXT:    uabdl.8h v0, v0, v1
; CHECK-NEXT:    ret
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

; High-half i16 inputs -> uabdl.4s.
define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: uabdl2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0, #8]
; CHECK-NEXT:    ldr d1, [x1, #8]
; CHECK-NEXT:    uabdl.4s v0, v0, v1
; CHECK-NEXT:    ret
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

; High-half i32 inputs -> uabdl.2d.
define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: uabdl2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0, #8]
; CHECK-NEXT:    ldr d1, [x1, #8]
; CHECK-NEXT:    uabdl.2d v0, v0, v1
; CHECK-NEXT:    ret
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
191
192declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
193declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
194
; Add-reduction of an open-coded unsigned abs-diff (zext, sub, icmp/select
; negate) over <16 x i16>: recognized as uabd.16b followed by a widening
; add (ushll + uaddw2) and addv.8h horizontal reduction.
define i16 @uabd16b_rdx(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-LABEL: uabd16b_rdx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uabd.16b v0, v0, v1
; CHECK-NEXT:    ushll.8h v1, v0, #0
; CHECK-NEXT:    uaddw2.8h v0, v1, v0
; CHECK-NEXT:    addv.8h h0, v0
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %aload = load <16 x i8>, <16 x i8>* %a, align 1
  %bload = load <16 x i8>, <16 x i8>* %b, align 1
  %aext = zext <16 x i8> %aload to <16 x i16>
  %bext = zext <16 x i8> %bload to <16 x i16>
  %abdiff = sub nsw <16 x i16> %aext, %bext
  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
  %reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel)
  ret i16 %reduced_v
}
217
; Same abs-diff reduction with an i32 accumulator and register operands:
; selected as uabdl.8h (low halves) + uabal2.8h (high halves, accumulating)
; + uaddlv.8h widening reduction.
define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: uabd16b_rdx_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uabdl.8h v2, v0, v1
; CHECK-NEXT:    uabal2.8h v2, v0, v1
; CHECK-NEXT:    uaddlv.8h s0, v2
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %aext = zext <16 x i8> %a to <16 x i32>
  %bext = zext <16 x i8> %b to <16 x i32>
  %abdiff = sub nsw <16 x i32> %aext, %bext
  %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
  ret i32 %reduced_v
}
235
; Signed counterpart of uabd16b_rdx_i32 (sext instead of zext):
; selected as sabdl.8h + sabal2.8h + uaddlv.8h.
define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sabd16b_rdx_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sabdl.8h v2, v0, v1
; CHECK-NEXT:    sabal2.8h v2, v0, v1
; CHECK-NEXT:    uaddlv.8h s0, v2
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %aext = sext <16 x i8> %a to <16 x i32>
  %bext = sext <16 x i8> %b to <16 x i32>
  %abdiff = sub nsw <16 x i32> %aext, %bext
  %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
  ret i32 %reduced_v
}
253
254
255declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
256declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
257
; i16-element version of the unsigned abs-diff reduction:
; uabd.8h + ushll/uaddw2 widening add + addv.4s.
define i32 @uabd8h_rdx(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-LABEL: uabd8h_rdx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uabd.8h v0, v0, v1
; CHECK-NEXT:    ushll.4s v1, v0, #0
; CHECK-NEXT:    uaddw2.4s v0, v1, v0
; CHECK-NEXT:    addv.4s s0, v0
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %aload = load <8 x i16>, <8 x i16>* %a, align 1
  %bload = load <8 x i16>, <8 x i16>* %b, align 1
  %aext = zext <8 x i16> %aload to <8 x i32>
  %bext = zext <8 x i16> %bload to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}
280
; Signed i16-element abs-diff reduction on register operands:
; sabd.8h + ushll/uaddw2 widening add + addv.4s.
define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: sabd8h_rdx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sabd.8h v0, v0, v1
; CHECK-NEXT:    ushll.4s v1, v0, #0
; CHECK-NEXT:    uaddw2.4s v0, v1, v0
; CHECK-NEXT:    addv.4s s0, v0
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %aext = sext <8 x i16> %a to <8 x i32>
  %bext = sext <8 x i16> %b to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}
299
; Abs-diff of zext'd i16 inputs + add reduction. SDAG matches the whole
; pattern to uabdl.4s + addv.4s; GISel does not yet, so it emits the
; open-coded usubl/cmgt/neg/bit sequence (hence the split prefixes).
; Only whitespace was normalized here (the %abdiff line used a single-space
; indent); the IR is otherwise unchanged.
define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) {
; DAG-LABEL: uabdl4s_rdx_i32:
; DAG:       // %bb.0:
; DAG-NEXT:    uabdl.4s v0, v0, v1
; DAG-NEXT:    addv.4s s0, v0
; DAG-NEXT:    fmov w0, s0
; DAG-NEXT:    ret
;
; GISEL-LABEL: uabdl4s_rdx_i32:
; GISEL:       // %bb.0:
; GISEL-NEXT:    movi.2d v2, #0000000000000000
; GISEL-NEXT:    usubl.4s v0, v0, v1
; GISEL-NEXT:    cmgt.4s v1, v2, v0
; GISEL-NEXT:    shl.4s v1, v1, #31
; GISEL-NEXT:    neg.4s v2, v0
; GISEL-NEXT:    sshr.4s v1, v1, #31
; GISEL-NEXT:    bit.16b v0, v2, v1
; GISEL-NEXT:    addv.4s s0, v0
; GISEL-NEXT:    fmov w0, s0
; GISEL-NEXT:    ret

; GISel doesn't match this pattern yet.
  %aext = zext <4 x i16> %a to <4 x i32>
  %bext = zext <4 x i16> %b to <4 x i32>
  %abdiff = sub nsw <4 x i32> %aext, %bext
  %abcmp = icmp slt <4 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i32> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i32> %ababs, <4 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %absel)
  ret i32 %reduced_v
}
331
332declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
333declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
334
; i32-element unsigned abs-diff reduction to i64:
; uabd.4s + ushll/uaddw2 widening add + addp.2d pairwise reduction.
; NOTE(review): the %h parameter is never used in the body — presumably kept
; to pin the argument layout of the original test; confirm before removing.
define i64 @uabd4s_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; CHECK-LABEL: uabd4s_rdx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uabd.4s v0, v0, v1
; CHECK-NEXT:    ushll.2d v1, v0, #0
; CHECK-NEXT:    uaddw2.2d v0, v1, v0
; CHECK-NEXT:    addp.2d d0, v0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %aload = load <4 x i32>, <4 x i32>* %a, align 1
  %bload = load <4 x i32>, <4 x i32>* %b, align 1
  %aext = zext <4 x i32> %aload to <4 x i64>
  %bext = zext <4 x i32> %bload to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}
357
; Signed i32-element abs-diff reduction to i64 on register operands:
; sabd.4s + ushll/uaddw2 widening add + addp.2d.
define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: sabd4s_rdx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sabd.4s v0, v0, v1
; CHECK-NEXT:    ushll.2d v1, v0, #0
; CHECK-NEXT:    uaddw2.2d v0, v1, v0
; CHECK-NEXT:    addp.2d d0, v0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %aext = sext <4 x i32> %a to <4 x i64>
  %bext = sext <4 x i32> %b to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}
376
; 2xi32 -> i64 abs-diff reduction. SDAG matches uabdl.2d + addp.2d;
; GISel does not yet match the pattern and emits the open-coded
; usubl/cmgt/neg/bit sequence (hence the split DAG/GISEL prefixes).
define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) {
; DAG-LABEL: uabdl2d_rdx_i64:
; DAG:       // %bb.0:
; DAG-NEXT:    uabdl.2d v0, v0, v1
; DAG-NEXT:    addp.2d d0, v0
; DAG-NEXT:    fmov x0, d0
; DAG-NEXT:    ret
;
; GISEL-LABEL: uabdl2d_rdx_i64:
; GISEL:       // %bb.0:
; GISEL-NEXT:    movi.2d v2, #0000000000000000
; GISEL-NEXT:    usubl.2d v0, v0, v1
; GISEL-NEXT:    cmgt.2d v1, v2, v0
; GISEL-NEXT:    shl.2d v1, v1, #63
; GISEL-NEXT:    neg.2d v2, v0
; GISEL-NEXT:    sshr.2d v1, v1, #63
; GISEL-NEXT:    bit.16b v0, v2, v1
; GISEL-NEXT:    addp.2d d0, v0
; GISEL-NEXT:    fmov x0, d0
; GISEL-NEXT:    ret

; GISel doesn't match this pattern yet
  %aext = zext <2 x i32> %a to <2 x i64>
  %bext = zext <2 x i32> %b to <2 x i64>
  %abdiff = sub nsw <2 x i64> %aext, %bext
  %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
  %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %absel)
  ret i64 %reduced_v
}
408
; Direct selection of the llvm.aarch64.neon.fabd.* intrinsics to fabd.<T>.
define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: fabd_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    fabd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

; 128-bit f32 variant -> fabd.4s.
define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: fabd_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    fabd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

; f64 variant -> fabd.2d.
define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
; CHECK-LABEL: fabd_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    fabd.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}
447
448declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
449declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
450declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
451
; fabs(fsub x, y) is combined into a single fabd instruction, matching the
; codegen of the direct intrinsic tests above.
define <2 x float> @fabd_2s_from_fsub_fabs(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: fabd_2s_from_fsub_fabs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    fabd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %sub = fsub <2 x float> %tmp1, %tmp2
  %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
  ret <2 x float> %abs
}

; Same fold at 4 x f32 -> fabd.4s.
define <4 x float> @fabd_4s_from_fsub_fabs(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: fabd_4s_from_fsub_fabs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    fabd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %sub = fsub <4 x float> %tmp1, %tmp2
  %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
  ret <4 x float> %abs
}

; Same fold at 2 x f64 -> fabd.2d.
define <2 x double> @fabd_2d_from_fsub_fabs(<2 x double>* %A, <2 x double>* %B) nounwind {
; CHECK-LABEL: fabd_2d_from_fsub_fabs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    fabd.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %sub = fsub <2 x double> %tmp1, %tmp2
  %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
  ret <2 x double> %abs
}
493
494declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
495declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
496declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone
497
; Non-widening signed abs-diff: each llvm.aarch64.neon.sabd.* intrinsic maps
; directly to sabd.<T> for every legal vector type (8b/16b/4h/8h/2s/4s).
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: sabd_8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sabd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: sabd_16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sabd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: sabd_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sabd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: sabd_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sabd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: sabd_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sabd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: sabd_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sabd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}
575
576declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
577declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
578declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
579declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
580declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
581declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
582
; Non-widening unsigned abs-diff: each llvm.aarch64.neon.uabd.* intrinsic maps
; directly to uabd.<T> for every legal vector type (8b/16b/4h/8h/2s/4s).
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: uabd_8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uabd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: uabd_16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uabd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: uabd_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uabd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: uabd_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uabd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: uabd_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uabd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: uabd_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uabd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}
660
661declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
662declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
663declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
664declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
665declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
666declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
667
; Saturating absolute value: each llvm.aarch64.neon.sqabs.* intrinsic maps
; directly to sqabs.<T> for every legal vector type.
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
; CHECK-LABEL: sqabs_8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    sqabs.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
; CHECK-LABEL: sqabs_16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    sqabs.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
; CHECK-LABEL: sqabs_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    sqabs.4h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
; CHECK-LABEL: sqabs_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    sqabs.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
; CHECK-LABEL: sqabs_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    sqabs.2s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
; CHECK-LABEL: sqabs_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    sqabs.4s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}
733
734declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
735declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
736declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
737declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
738declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
739declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
740
; Saturating negate: each llvm.aarch64.neon.sqneg.* intrinsic maps directly
; to sqneg.<T> for every legal vector type.
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
; CHECK-LABEL: sqneg_8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    sqneg.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
; CHECK-LABEL: sqneg_16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    sqneg.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
; CHECK-LABEL: sqneg_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    sqneg.4h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
; CHECK-LABEL: sqneg_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    sqneg.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
; CHECK-LABEL: sqneg_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    sqneg.2s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
; CHECK-LABEL: sqneg_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    sqneg.4s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}
806
807declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
808declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
809declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
810declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
811declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
812declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
813
; The abs intrinsic on a 64-bit <8 x i8> vector selects abs.8b.
define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
; CHECK-LABEL: abs_8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    abs.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}
824
; The abs intrinsic on a 128-bit <16 x i8> vector selects abs.16b.
define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
; CHECK-LABEL: abs_16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    abs.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}
835
; The abs intrinsic on a 64-bit <4 x i16> vector selects abs.4h.
define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
; CHECK-LABEL: abs_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    abs.4h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}
846
; The abs intrinsic on a 128-bit <8 x i16> vector selects abs.8h.
define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
; CHECK-LABEL: abs_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    abs.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}
857
; The abs intrinsic on a 64-bit <2 x i32> vector selects abs.2s.
define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
; CHECK-LABEL: abs_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    abs.2s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}
868
; The abs intrinsic on a 128-bit <4 x i32> vector selects abs.4s.
define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
; CHECK-LABEL: abs_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    abs.4s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}
879
; The abs intrinsic on <1 x i64> selects the scalar FP-register form (abs d0, d0).
define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs d0, d0
; CHECK-NEXT:    ret
  %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %abs
}
888
; Scalar i64 abs intrinsic: the value is moved GPR -> FPR, abs'd, and moved back.
define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, x0
; CHECK-NEXT:    abs d0, d0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %abs
}
899
; Declarations for the abs intrinsic variants used by the tests above.
declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
908
; sabd + zext + add combines into the widening accumulate sabal.8h.
; FALLBACK-NOT: remark:{{.*}} sabal8h
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
; CHECK-LABEL: sabal8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sabal.8h v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}
926
; sabd + zext + add combines into the widening accumulate sabal.4s.
; FALLBACK-NOT: remark:{{.*}} sabal4s
define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: sabal4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sabal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}
944
; sabd + zext + add combines into the widening accumulate sabal.2d.
; (Removed a duplicated, unused zext of %tmp4 that was dead code and made
; this test inconsistent with its sabal8h/sabal4s siblings.)
; FALLBACK-NOT: remark:{{.*}} sabal2d
define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: sabal2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sabal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
963
; High-half sabd + zext + add combines into sabal.8h fed by loads of the top 8 bytes.
define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
; CHECK-LABEL: sabal2_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sabal.8h v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}
982
; High-half sabd + zext + add combines into sabal.4s fed by loads of the top 8 bytes.
define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: sabal2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sabal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}
1001
; High-half sabd + zext + add combines into sabal.2d fed by loads of the top 8 bytes.
define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: sabal2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sabal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
1020
; uabd + zext + add combines into the widening accumulate uabal.8h.
; FALLBACK-NOT: remark:{{.*}} uabal8h
define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
; CHECK-LABEL: uabal8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    uabal.8h v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}
1038
; uabd + zext + add combines into the widening accumulate uabal.4s.
; (Fixed the FALLBACK-NOT remark name: it said "uabal8s", a symbol that does
; not exist in this file, making the no-fallback check vacuous for uabal4s.)
; FALLBACK-NOT: remark:{{.*}} uabal4s
define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: uabal4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    uabal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}
1056
; uabd + zext + add combines into the widening accumulate uabal.2d.
; FALLBACK-NOT: remark:{{.*}} uabal2d
define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: uabal2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    uabal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
1074
; High-half uabd + zext + add combines into uabal.8h fed by loads of the top 8 bytes.
define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
; CHECK-LABEL: uabal2_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    uabal.8h v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}
1093
; High-half uabd + zext + add combines into uabal.4s fed by loads of the top 8 bytes.
define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: uabal2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    uabal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}
1112
; High-half uabd + zext + add combines into uabal.2d fed by loads of the top 8 bytes.
define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: uabal2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    uabal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
1131
; sabd + add (same width) combines into the accumulate form saba.8b.
define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: saba_8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr d0, [x2]
; CHECK-NEXT:    saba.8b v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>, <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}
1147
; sabd + add (same width) combines into the accumulate form saba.16b.
define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
; CHECK-LABEL: saba_16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    saba.16b v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>, <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}
1163
; sabd + add (same width) combines into the accumulate form saba.4h.
define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: saba_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr d0, [x2]
; CHECK-NEXT:    saba.4h v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>, <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}
1179
; sabd + add (same width) combines into the accumulate form saba.8h.
define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
; CHECK-LABEL: saba_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    saba.8h v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>, <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}
1195
; sabd + add (same width) combines into the accumulate form saba.2s.
define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: saba_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr d0, [x2]
; CHECK-NEXT:    saba.2s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>, <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}
1211
; sabd + add (same width) combines into the accumulate form saba.4s.
define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: saba_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    saba.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>, <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}
1227
; uabd + add (same width) combines into the accumulate form uaba.8b.
define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: uaba_8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr d0, [x2]
; CHECK-NEXT:    uaba.8b v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>, <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}
1243
; uabd + add (same width) combines into the accumulate form uaba.16b.
define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
; CHECK-LABEL: uaba_16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    uaba.16b v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>, <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}
1259
; uabd + add (same width) combines into the accumulate form uaba.4h.
define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: uaba_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr d0, [x2]
; CHECK-NEXT:    uaba.4h v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>, <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}
1275
; uabd + add (same width) combines into the accumulate form uaba.8h.
define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
; CHECK-LABEL: uaba_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    uaba.8h v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>, <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}
1291
; uabd + add (same width) combines into the accumulate form uaba.2s.
define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: uaba_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr d0, [x2]
; CHECK-NEXT:    uaba.2s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>, <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}
1307
; uabd + add (same width) combines into the accumulate form uaba.4s.
define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: uaba_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    uaba.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>, <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}
1323
; Scalar FABD
; The scalar fabd intrinsic on f32 selects the s-register fabd instruction.
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabd s0, s0, s1
; CHECK-NEXT:    ret
  %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
  ret float %vabd.i
}
1333
; The scalar fabd intrinsic on f64 selects the d-register fabd instruction.
define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabd d0, d0, d1
; CHECK-NEXT:    ret
  %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
  ret double %vabd.i
}
1342
; Declarations for the scalar (SISD) fabd intrinsics used above.
declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
1345
; fsub followed by fabs is combined into a single scalar fabd (f32).
define float @fabds_from_fsub_fabs(float %a, float %b) nounwind {
; CHECK-LABEL: fabds_from_fsub_fabs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabd s0, s0, s1
; CHECK-NEXT:    ret
  %sub = fsub float %a, %b
  %abs = tail call float @llvm.fabs.f32(float %sub)
  ret float %abs
}
1355
; fsub followed by fabs is combined into a single scalar fabd (f64).
define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd_from_fsub_fabs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabd d0, d0, d1
; CHECK-NEXT:    ret
  %sub = fsub double %a, %b
  %abs = tail call double @llvm.fabs.f64(double %sub)
  ret double %abs
}
1365
; Declarations for the generic fabs intrinsics used above.
declare float @llvm.fabs.f32(float) nounwind readnone
declare double @llvm.fabs.f64(double) nounwind readnone
1368
; uabd of a low-half extract against a dup'd scalar, then zext, combines into
; dup.2s + uabdl.2d.
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup.2s v1, w0
; CHECK-NEXT:    uabdl.2d v0, v0, v1
; CHECK-NEXT:    ret
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
1384
; uabd of a high-half extract against a dup'd scalar, then zext, combines into
; dup.4s + the high-half form uabdl2.2d.
define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl2_from_extract_dup:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup.4s v1, w0
; CHECK-NEXT:    uabdl2.2d v0, v0, v1
; CHECK-NEXT:    ret
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
1400
; sabd of a low-half extract against a dup'd scalar, then zext, combines into
; dup.2s + sabdl.2d.
define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup.2s v1, w0
; CHECK-NEXT:    sabdl.2d v0, v0, v1
; CHECK-NEXT:    ret
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
1416
; sabd of a high-half extract against a dup'd scalar, then zext, combines into
; dup.4s + the high-half form sabdl2.2d.
define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl2_from_extract_dup:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup.4s v1, w0
; CHECK-NEXT:    sabdl2.2d v0, v0, v1
; CHECK-NEXT:    ret
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
1432
; abs expressed as compare-sge + select: SelectionDAG matches abs.2s;
; GlobalISel does not yet pattern-match it and emits the long sequence.
define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
; DAG-LABEL: abspattern1:
; DAG:       // %bb.0:
; DAG-NEXT:    abs.2s v0, v0
; DAG-NEXT:    ret
;
; GISEL-LABEL: abspattern1:
; GISEL:       // %bb.0:
; GISEL-NEXT:    movi.2d v1, #0000000000000000
; GISEL-NEXT:    cmge.2s v1, v0, v1
; GISEL-NEXT:    shl.2s v1, v1, #31
; GISEL-NEXT:    neg.2s v2, v0
; GISEL-NEXT:    sshr.2s v1, v1, #31
; GISEL-NEXT:    bif.8b v0, v2, v1
; GISEL-NEXT:    ret

  %tmp1neg = sub <2 x i32> zeroinitializer, %a
  %b = icmp sge <2 x i32> %a, zeroinitializer
  %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
  ret <2 x i32> %abs
}
1454
; abs expressed as compare-sgt + select: SelectionDAG matches abs.4h;
; GlobalISel does not yet pattern-match it and emits the long sequence.
define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
; DAG-LABEL: abspattern2:
; DAG:       // %bb.0:
; DAG-NEXT:    abs.4h v0, v0
; DAG-NEXT:    ret
;
; GISEL-LABEL: abspattern2:
; GISEL:       // %bb.0:
; GISEL-NEXT:    movi.2d v1, #0000000000000000
; GISEL-NEXT:    cmgt.4h v1, v0, v1
; GISEL-NEXT:    shl.4h v1, v1, #15
; GISEL-NEXT:    neg.4h v2, v0
; GISEL-NEXT:    sshr.4h v1, v1, #15
; GISEL-NEXT:    bif.8b v0, v2, v1
; GISEL-NEXT:    ret
; For GlobalISel, this generates terrible code until we can pattern match this to abs.

  %tmp1neg = sub <4 x i16> zeroinitializer, %a
  %b = icmp sgt <4 x i16> %a, zeroinitializer
  %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
  ret <4 x i16> %abs
}
1477
; abs with the select arms swapped (icmp slt picks the negated value):
; SelectionDAG still matches abs.8b; GlobalISel emits the long sequence.
define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
; DAG-LABEL: abspattern3:
; DAG:       // %bb.0:
; DAG-NEXT:    abs.8b v0, v0
; DAG-NEXT:    ret
;
; GISEL-LABEL: abspattern3:
; GISEL:       // %bb.0:
; GISEL-NEXT:    movi.2d v1, #0000000000000000
; GISEL-NEXT:    cmgt.8b v1, v1, v0
; GISEL-NEXT:    shl.8b v1, v1, #7
; GISEL-NEXT:    neg.8b v2, v0
; GISEL-NEXT:    sshr.8b v1, v1, #7
; GISEL-NEXT:    bit.8b v0, v2, v1
; GISEL-NEXT:    ret

  %tmp1neg = sub <8 x i8> zeroinitializer, %a
  %b = icmp slt <8 x i8> %a, zeroinitializer
  %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
  ret <8 x i8> %abs
}
1499
; abs via compare-sge + select on <4 x i32>: SelectionDAG matches abs.4s;
; GlobalISel emits the long sequence.
define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
; DAG-LABEL: abspattern4:
; DAG:       // %bb.0:
; DAG-NEXT:    abs.4s v0, v0
; DAG-NEXT:    ret
;
; GISEL-LABEL: abspattern4:
; GISEL:       // %bb.0:
; GISEL-NEXT:    movi.2d v1, #0000000000000000
; GISEL-NEXT:    cmge.4s v1, v0, v1
; GISEL-NEXT:    shl.4s v1, v1, #31
; GISEL-NEXT:    neg.4s v2, v0
; GISEL-NEXT:    sshr.4s v1, v1, #31
; GISEL-NEXT:    bif.16b v0, v2, v1
; GISEL-NEXT:    ret

  %tmp1neg = sub <4 x i32> zeroinitializer, %a
  %b = icmp sge <4 x i32> %a, zeroinitializer
  %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
  ret <4 x i32> %abs
}
1521
; abs via compare-sgt + select on <8 x i16>: SelectionDAG matches abs.8h;
; GlobalISel emits the long sequence.
define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
; DAG-LABEL: abspattern5:
; DAG:       // %bb.0:
; DAG-NEXT:    abs.8h v0, v0
; DAG-NEXT:    ret
;
; GISEL-LABEL: abspattern5:
; GISEL:       // %bb.0:
; GISEL-NEXT:    movi.2d v1, #0000000000000000
; GISEL-NEXT:    cmgt.8h v1, v0, v1
; GISEL-NEXT:    shl.8h v1, v1, #15
; GISEL-NEXT:    neg.8h v2, v0
; GISEL-NEXT:    sshr.8h v1, v1, #15
; GISEL-NEXT:    bif.16b v0, v2, v1
; GISEL-NEXT:    ret

  %tmp1neg = sub <8 x i16> zeroinitializer, %a
  %b = icmp sgt <8 x i16> %a, zeroinitializer
  %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
  ret <8 x i16> %abs
}
1543
; abs with swapped select arms (icmp slt) on <16 x i8>: SelectionDAG matches
; abs.16b; GlobalISel emits the long sequence.
define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
; DAG-LABEL: abspattern6:
; DAG:       // %bb.0:
; DAG-NEXT:    abs.16b v0, v0
; DAG-NEXT:    ret
;
; GISEL-LABEL: abspattern6:
; GISEL:       // %bb.0:
; GISEL-NEXT:    movi.2d v1, #0000000000000000
; GISEL-NEXT:    cmgt.16b v1, v1, v0
; GISEL-NEXT:    shl.16b v1, v1, #7
; GISEL-NEXT:    neg.16b v2, v0
; GISEL-NEXT:    sshr.16b v1, v1, #7
; GISEL-NEXT:    bit.16b v0, v2, v1
; GISEL-NEXT:    ret

  %tmp1neg = sub <16 x i8> zeroinitializer, %a
  %b = icmp slt <16 x i8> %a, zeroinitializer
  %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
  ret <16 x i8> %abs
}
1565
; abs with swapped select arms (icmp sle) on <2 x i64>: SelectionDAG matches
; abs.2d; GlobalISel emits the long sequence.
define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
; DAG-LABEL: abspattern7:
; DAG:       // %bb.0:
; DAG-NEXT:    abs.2d v0, v0
; DAG-NEXT:    ret
;
; GISEL-LABEL: abspattern7:
; GISEL:       // %bb.0:
; GISEL-NEXT:    movi.2d v1, #0000000000000000
; GISEL-NEXT:    cmge.2d v1, v1, v0
; GISEL-NEXT:    shl.2d v1, v1, #63
; GISEL-NEXT:    neg.2d v2, v0
; GISEL-NEXT:    sshr.2d v1, v1, #63
; GISEL-NEXT:    bit.16b v0, v2, v1
; GISEL-NEXT:    ret

  %tmp1neg = sub <2 x i64> zeroinitializer, %a
  %b = icmp sle <2 x i64> %a, zeroinitializer
  %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
  ret <2 x i64> %abs
}
1587
; NOTE(review): despite the "uabd" name, this pattern uses sext, so it is a
; *signed* absolute difference (sext + sub + neg + select), which SelectionDAG
; matches to sabdl.2d; GlobalISel does not yet combine it. Consider renaming.
define <2 x i64> @uabd_i32(<2 x i32> %a, <2 x i32> %b) {
; DAG-LABEL: uabd_i32:
; DAG:       // %bb.0:
; DAG-NEXT:    sabdl.2d v0, v0, v1
; DAG-NEXT:    ret
;
; GISEL-LABEL: uabd_i32:
; GISEL:       // %bb.0:
; GISEL-NEXT:    movi.2d v2, #0000000000000000
; GISEL-NEXT:    ssubl.2d v0, v0, v1
; GISEL-NEXT:    cmgt.2d v1, v2, v0
; GISEL-NEXT:    shl.2d v1, v1, #63
; GISEL-NEXT:    neg.2d v2, v0
; GISEL-NEXT:    sshr.2d v1, v1, #63
; GISEL-NEXT:    bit.16b v0, v2, v1
; GISEL-NEXT:    ret
  %aext = sext <2 x i32> %a to <2 x i64>
  %bext = sext <2 x i32> %b to <2 x i64>
  %abdiff = sub nsw <2 x i64> %aext, %bext
  %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
  %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
  ret <2 x i64> %absel
}
1612
1613
; NOTE(review): despite the "uabd" name, this uses sext and so tests a *signed*
; absolute difference widened to i128, which is too wide for any single NEON
; instruction: it is expanded to scalar subs/sbcs/negs/csel code.
define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: uabd_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov x9, d0
; CHECK-NEXT:    fmov x12, d1
; CHECK-NEXT:    asr x10, x9, #63
; CHECK-NEXT:    asr x13, x12, #63
; CHECK-NEXT:    subs x9, x9, x12
; CHECK-NEXT:    mov.d x8, v0[1]
; CHECK-NEXT:    mov.d x11, v1[1]
; CHECK-NEXT:    sbcs x10, x10, x13
; CHECK-NEXT:    asr x12, x8, #63
; CHECK-NEXT:    asr x14, x11, #63
; CHECK-NEXT:    subs x8, x8, x11
; CHECK-NEXT:    sbcs x11, x12, x14
; CHECK-NEXT:    negs x12, x8
; CHECK-NEXT:    ngcs x13, x11
; CHECK-NEXT:    cmp x11, #0 // =0
; CHECK-NEXT:    csel x2, x12, x8, lt
; CHECK-NEXT:    csel x3, x13, x11, lt
; CHECK-NEXT:    negs x8, x9
; CHECK-NEXT:    ngcs x11, x10
; CHECK-NEXT:    cmp x10, #0 // =0
; CHECK-NEXT:    csel x8, x8, x9, lt
; CHECK-NEXT:    csel x1, x11, x10, lt
; CHECK-NEXT:    fmov d0, x8
; CHECK-NEXT:    mov.d v0[1], x1
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %aext = sext <2 x i64> %a to <2 x i128>
  %bext = sext <2 x i64> %b to <2 x i128>
  %abdiff = sub nsw <2 x i128> %aext, %bext
  %abcmp = icmp slt <2 x i128> %abdiff, zeroinitializer
  %ababs = sub nsw <2 x i128> zeroinitializer, %abdiff
  %absel = select <2 x i1> %abcmp, <2 x i128> %ababs, <2 x i128> %abdiff
  ret <2 x i128> %absel
}
1651