1; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck -check-prefixes=CHECK,DAG %s
2; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=arm64-eabi -aarch64-neon-syntax=apple 2>&1 | FileCheck %s --check-prefixes=FALLBACK,CHECK,GISEL
3
4; FALLBACK-NOT: remark:{{.*}} G_ZEXT
5; FALLBACK-NOT: remark:{{.*}} sabdl8h
; sabdl / sabdl2: signed absolute difference, widening.  Each test loads two
; narrow vectors, computes the signed absolute difference via the
; llvm.aarch64.neon.sabd.* intrinsic, and zero-extends the result; the
; zext-of-sabd pattern should select a single sabdl instruction.
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
        ret <8 x i16> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} sabdl4s
define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
        ret <4 x i32> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} sabdl2d
define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
        ret <2 x i64> %tmp4
}

; The sabdl2_* variants first extract the high half of a 128-bit input with a
; shufflevector, which should select the high-half ("2") form of the
; instruction (still printed as sabdl.* under the apple syntax used here).
define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl.8h
        %load1 = load <16 x i8>, <16 x i8>* %A
        %load2 = load <16 x i8>, <16 x i8>* %B
        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
        ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl.4s
        %load1 = load <8 x i16>, <8 x i16>* %A
        %load2 = load <8 x i16>, <8 x i16>* %B
        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
        ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl.2d
        %load1 = load <4 x i32>, <4 x i32>* %A
        %load2 = load <4 x i32>, <4 x i32>* %B
        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
        ret <2 x i64> %tmp4
}
73
; uabdl: unsigned absolute difference, widening (zext of the uabd intrinsic).
; Fixed: the FALLBACK-NOT pattern had a stray trailing ')' (not present in the
; sabdl checks above), which prevented it from ever matching a fallback remark
; for this function, making the -NOT guard vacuous.
; FALLBACK-NOT: remark:{{.*}} uabdl8h
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
84
; Fixed: removed the stray trailing ')' from the FALLBACK-NOT pattern (see
; uabdl8h) so the guard can actually match a fallback remark for uabdl4s.
; FALLBACK-NOT: remark:{{.*}} uabdl4s
define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}
95
; Fixed: removed the stray trailing ')' from the FALLBACK-NOT pattern (see
; uabdl8h) so the guard can actually match a fallback remark for uabdl2d.
; FALLBACK-NOT: remark:{{.*}} uabdl2d
define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
106
; uabdl2: high-half extract (shufflevector) + unsigned abs-diff + zext.
; Fixed: removed a stray blank line that split the body, for consistency with
; every sibling test in this file.
define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
119
; uabdl2 at the 16->32 and 32->64 widths: high-half extract + uabd + zext.
define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
143
; Reduction test: |a - b| is expressed openly (zext, sub, compare-with-zero,
; select of the negation) and then summed with vector.reduce.add; this is
; expected to lower to a uabdl2 + uabdl pair feeding the horizontal add.
declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)

define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-LABEL: uabdl8h_rdx
; CHECK: uabdl2.8h
; CHECK: uabdl.8h
  %aload = load <16 x i8>, <16 x i8>* %a, align 1
  %bload = load <16 x i8>, <16 x i8>* %b, align 1
  %aext = zext <16 x i8> %aload to <16 x i16>
  %bext = zext <16 x i8> %bload to <16 x i16>
  %abdiff = sub nsw <16 x i16> %aext, %bext
  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
  %reduced_v = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %absel)
  ret i16 %reduced_v
}
161
; Same open-coded absolute-difference reduction as uabdl8h_rdx, at the
; 16->32-bit width.
declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)

define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-LABEL: uabdl4s_rdx
; CHECK: uabdl2.4s
; CHECK: uabdl.4s
  %aload = load <8 x i16>, <8 x i16>* %a, align 1
  %bload = load <8 x i16>, <8 x i16>* %b, align 1
  %aext = zext <8 x i16> %aload to <8 x i32>
  %bext = zext <8 x i16> %bload to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}
179
; Same open-coded absolute-difference reduction as the tests above, at the
; 32->64-bit width.  %h is unused by the body — NOTE(review): presumably kept
; only to perturb the argument layout; confirm before removing.
; Fixed: the label check used plain "CHECK:" while its siblings use
; "CHECK-LABEL:"; without CHECK-LABEL the match is not anchored to this
; function and could match anywhere in the output.
declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)

define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; CHECK-LABEL: uabdl2d_rdx
; CHECK: uabdl2.2d
; CHECK: uabdl.2d
  %aload = load <4 x i32>, <4 x i32>* %a, align 1
  %bload = load <4 x i32>, <4 x i32>* %b, align 1
  %aext = zext <4 x i32> %aload to <4 x i64>
  %bext = zext <4 x i32> %bload to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}
197
; fabd: floating-point absolute difference.  The first three tests use the
; llvm.aarch64.neon.fabd.* intrinsic directly; the *_from_fsub_fabs variants
; check that an open-coded fsub followed by llvm.fabs is also folded into a
; single fabd instruction.
define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
        %tmp1 = load <2 x float>, <2 x float>* %A
        %tmp2 = load <2 x float>, <2 x float>* %B
        %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
        ret <2 x float> %tmp3
}

define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
        %tmp1 = load <4 x float>, <4 x float>* %A
        %tmp2 = load <4 x float>, <4 x float>* %B
        %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
        ret <4 x float> %tmp3
}

define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
        %tmp1 = load <2 x double>, <2 x double>* %A
        %tmp2 = load <2 x double>, <2 x double>* %B
        %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
        ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone

define <2 x float> @fabd_2s_from_fsub_fabs(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s_from_fsub_fabs:
;CHECK: fabd.2s
        %tmp1 = load <2 x float>, <2 x float>* %A
        %tmp2 = load <2 x float>, <2 x float>* %B
        %sub = fsub <2 x float> %tmp1, %tmp2
        %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
        ret <2 x float> %abs
}

define <4 x float> @fabd_4s_from_fsub_fabs(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s_from_fsub_fabs:
;CHECK: fabd.4s
        %tmp1 = load <4 x float>, <4 x float>* %A
        %tmp2 = load <4 x float>, <4 x float>* %B
        %sub = fsub <4 x float> %tmp1, %tmp2
        %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
        ret <4 x float> %abs
}

define <2 x double> @fabd_2d_from_fsub_fabs(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d_from_fsub_fabs:
;CHECK: fabd.2d
        %tmp1 = load <2 x double>, <2 x double>* %A
        %tmp2 = load <2 x double>, <2 x double>* %B
        %sub = fsub <2 x double> %tmp1, %tmp2
        %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
        ret <2 x double> %abs
}

declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone
262
; sabd: signed absolute difference, non-widening, for every legal integer
; vector type.  Each intrinsic call should select the matching sabd.<T>.
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        ret <8 x i8> %tmp3
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        ret <16 x i8> %tmp3
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        ret <4 x i16> %tmp3
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        ret <8 x i16> %tmp3
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        ret <2 x i32> %tmp3
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
323
; uabd: unsigned absolute difference, non-widening, for every legal integer
; vector type (unsigned counterpart of the sabd tests above).
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        ret <8 x i8> %tmp3
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        ret <16 x i8> %tmp3
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        ret <4 x i16> %tmp3
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        ret <8 x i16> %tmp3
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        ret <2 x i32> %tmp3
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
384
; sqabs: signed saturating absolute value (unary intrinsic), one test per
; legal integer vector type.
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
        ret <8 x i8> %tmp3
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
        ret <16 x i8> %tmp3
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
        ret <4 x i16> %tmp3
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
        ret <8 x i16> %tmp3
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
        ret <2 x i32> %tmp3
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
        ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
439
; sqneg: signed saturating negate (unary intrinsic), one test per legal
; integer vector type.
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
        ret <8 x i8> %tmp3
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
        ret <16 x i8> %tmp3
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
        ret <4 x i16> %tmp3
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
        ret <8 x i16> %tmp3
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
        ret <2 x i32> %tmp3
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
        ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
494
; abs: integer absolute value (unary intrinsic), one test per legal integer
; vector type, plus the scalar d-register forms (v1i64 and plain i64).
define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
        ret <8 x i8> %tmp3
}

define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
        ret <16 x i8> %tmp3
}

define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
        ret <4 x i16> %tmp3
}

define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
        ret <8 x i16> %tmp3
}

define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
        ret <2 x i32> %tmp3
}

define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
        ret <4 x i32> %tmp3
}

define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %abs
}

define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
  %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %abs
}

declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
565
; sabal: signed absolute difference and accumulate, widening — the zext of a
; sabd result added into a wide accumulator should select a single sabal.
; FALLBACK-NOT: remark:{{.*}} sabal8h
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
;CHECK: sabal.8h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}

; FALLBACK-NOT: remark:{{.*}} sabal4s
define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
;CHECK: sabal.4s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}

; FALLBACK-NOT: remark:{{.*}} sabal2d
; Fixed: removed a dead duplicate zext (%tmp4.1.1) whose result was never
; used; it only cluttered the pattern being tested.
define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
;CHECK: sabal.2d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}
605
; sabal2: high-half extract (shufflevector) + signed abs-diff + zext + add
; into a wide accumulator.
define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
;CHECK: sabal.8h
        %load1 = load <16 x i8>, <16 x i8>* %A
        %load2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}

define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
;CHECK: sabal.4s
        %load1 = load <8 x i16>, <8 x i16>* %A
        %load2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
;CHECK: sabal.2d
        %load1 = load <4 x i32>, <4 x i32>* %A
        %load2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}
647
; uabal: unsigned absolute difference and accumulate, widening.
; FALLBACK-NOT: remark:{{.*}} uabal8h
define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal8h:
;CHECK: uabal.8h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}
660
; Fixed: the FALLBACK-NOT pattern said "uabal8s", which names no function in
; this file (the function under test is uabal4s), so the guard was vacuous.
; FALLBACK-NOT: remark:{{.*}} uabal4s
define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal4s:
;CHECK: uabal.4s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}
673
; FALLBACK-NOT: remark:{{.*}} uabal2d
define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2d:
;CHECK: uabal.2d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}

; uabal2: high-half extract + unsigned abs-diff + zext + accumulate.
define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal2_8h:
;CHECK: uabal.8h
        %load1 = load <16 x i8>, <16 x i8>* %A
        %load2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}

define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal2_4s:
;CHECK: uabal.4s
        %load1 = load <8 x i16>, <8 x i16>* %A
        %load2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2_2d:
;CHECK: uabal.2d
        %load1 = load <4 x i32>, <4 x i32>* %A
        %load2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}
728
; saba.8b: sabd intrinsic + add of a loaded accumulator must fuse into the
; signed absolute-difference-and-accumulate instruction (no widening here).
define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: saba_8b:
;CHECK: saba.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4 = load <8 x i8>, <8 x i8>* %C
        %tmp5 = add <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}
739
; saba.16b: 128-bit variant of the sabd + add fusion.
define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
;CHECK: saba.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        %tmp4 = load <16 x i8>, <16 x i8>* %C
        %tmp5 = add <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}
750
; saba.4h: sabd + add fusion for <4 x i16>.
define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = load <4 x i16>, <4 x i16>* %C
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}
761
; saba.8h: sabd + add fusion for <8 x i16>.
define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        %tmp4 = load <8 x i16>, <8 x i16>* %C
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}
772
; saba.2s: sabd + add fusion for <2 x i32>.
define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = load <2 x i32>, <2 x i32>* %C
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}
783
; saba.4s: sabd + add fusion for <4 x i32>.
define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        %tmp4 = load <4 x i32>, <4 x i32>* %C
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}
794
; uaba.8b: unsigned counterpart of the saba tests above — uabd intrinsic + add
; of a loaded accumulator must select the accumulate instruction.
define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_8b:
;CHECK: uaba.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4 = load <8 x i8>, <8 x i8>* %C
        %tmp5 = add <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}
805
; uaba.16b: uabd + add fusion for <16 x i8>.
define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_16b:
;CHECK: uaba.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        %tmp4 = load <16 x i8>, <16 x i8>* %C
        %tmp5 = add <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}
816
; uaba.4h: uabd + add fusion for <4 x i16>.
define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_4h:
;CHECK: uaba.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = load <4 x i16>, <4 x i16>* %C
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}
827
; uaba.8h: uabd + add fusion for <8 x i16>.
define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_8h:
;CHECK: uaba.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        %tmp4 = load <8 x i16>, <8 x i16>* %C
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}
838
; uaba.2s: uabd + add fusion for <2 x i32>.
define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
;CHECK: uaba.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = load <2 x i32>, <2 x i32>* %C
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}
849
; uaba.4s: uabd + add fusion for <4 x i32>.
define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
;CHECK: uaba.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        %tmp4 = load <4 x i32>, <4 x i32>* %C
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}
860
861; Scalar FABD
; Scalar fabd via the sisd intrinsic selects the s-register form.
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
  %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
  ret float %vabd.i
}
868
; Scalar fabd via the sisd intrinsic selects the d-register form.
define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
  %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
  ret double %vabd.i
}
875
; Declarations for the scalar (SISD) fabd intrinsics used by the tests above.
declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
878
; fsub followed by llvm.fabs must also be combined into a single scalar fabd.
define float @fabds_from_fsub_fabs(float %a, float %b) nounwind {
; CHECK-LABEL: fabds_from_fsub_fabs:
; CHECK: fabd s0, s0, s1
  %sub = fsub float %a, %b
  %abs = tail call float @llvm.fabs.f32(float %sub)
  ret float %abs
}
886
; Same fsub + fabs -> fabd combine, double-precision variant.
define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd_from_fsub_fabs:
; CHECK: fabd d0, d0, d1
  %sub = fsub double %a, %b
  %abs = tail call double @llvm.fabs.f64(double %sub)
  ret double %abs
}
894
; Declarations for the generic fabs intrinsics used by the fsub+fabs tests.
declare float @llvm.fabs.f32(float) nounwind readnone
declare double @llvm.fabs.f64(double) nounwind readnone
897
; uabd of a low-half extract against a dup'd scalar, then zext: must select
; uabdl.2d directly, without materializing the extract as an ext.16b.
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
911
; Same as above but extracting the HIGH half (lanes 2-3): must select
; uabdl2.2d with no ext.16b.
define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl2_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
925
; Signed counterpart of uabdl_from_extract_dup: low-half extract + dup'd
; scalar must select sabdl.2d with no ext.16b.
define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
939
; Signed high-half variant: must select sabdl2.2d with no ext.16b.
define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl2_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
953
; Integer abs pattern (select of x vs 0-x on "x sge 0"): the DAG selector must
; fold this to a single abs.2s. The GISEL checks document GlobalISel's current
; (unfused) lowering via compare + sub + per-lane selects.
define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
; CHECK-LABEL: abspattern1:
; DAG: abs.2s
; DAG-NEXT: ret

; GISEL: cmge.2s
; GISEL: sub.2s
; GISEL: fcsel
; GISEL: fcsel
        %tmp1neg = sub <2 x i32> zeroinitializer, %a
        %b = icmp sge <2 x i32> %a, zeroinitializer
        %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
        ret <2 x i32> %abs
}
968
; Abs pattern using "sgt 0" instead of "sge 0"; still folds to abs.4h.
define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
; CHECK-LABEL: abspattern2:
; CHECK: abs.4h
; CHECK-NEXT: ret
        %tmp1neg = sub <4 x i16> zeroinitializer, %a
        %b = icmp sgt <4 x i16> %a, zeroinitializer
        %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
        ret <4 x i16> %abs
}
978
; Inverted form of the abs pattern ("slt 0" with select operands swapped);
; still folds to abs.8b.
define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
; CHECK-LABEL: abspattern3:
; CHECK: abs.8b
; CHECK-NEXT: ret
        %tmp1neg = sub <8 x i8> zeroinitializer, %a
        %b = icmp slt <8 x i8> %a, zeroinitializer
        %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
        ret <8 x i8> %abs
}
988
; "sge 0" abs pattern on <4 x i32>: DAG folds to abs.4s; GISEL checks record
; GlobalISel's current compare + per-lane fcsel lowering.
define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
; CHECK-LABEL: abspattern4:
; DAG: abs.4s
; DAG-NEXT: ret

; GISEL: cmge.4s
; GISEL: fcsel
; GISEL: fcsel
; GISEL: fcsel
; GISEL: fcsel
        %tmp1neg = sub <4 x i32> zeroinitializer, %a
        %b = icmp sge <4 x i32> %a, zeroinitializer
        %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
        ret <4 x i32> %abs
}
1004
; "sgt 0" abs pattern on <8 x i16>: DAG folds to abs.8h; GISEL checks record
; GlobalISel's current compare + sub + one csel per lane lowering.
define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
; CHECK-LABEL: abspattern5:
; DAG: abs.8h
; DAG-NEXT: ret

; GISEL: cmgt.8h
; GISEL: sub.8h
; GISEL: csel
; GISEL: csel
; GISEL: csel
; GISEL: csel
; GISEL: csel
; GISEL: csel
; GISEL: csel
; GISEL: csel
        %tmp1neg = sub <8 x i16> zeroinitializer, %a
        %b = icmp sgt <8 x i16> %a, zeroinitializer
        %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
        ret <8 x i16> %abs
}
1025
; Inverted ("slt 0") abs pattern on the full 128-bit <16 x i8>; folds to abs.16b.
define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
; CHECK-LABEL: abspattern6:
; CHECK: abs.16b
; CHECK-NEXT: ret
        %tmp1neg = sub <16 x i8> zeroinitializer, %a
        %b = icmp slt <16 x i8> %a, zeroinitializer
        %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
        ret <16 x i8> %abs
}
1035
; "sle 0" abs pattern on <2 x i64> (select operands swapped): DAG folds to
; abs.2d; GISEL checks record GlobalISel's compare + sub + fcsel lowering.
define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
; CHECK-LABEL: abspattern7:
; DAG: abs.2d
; DAG-NEXT: ret

; GISEL: cmge.2d
; GISEL: sub.2d
; GISEL: fcsel
; GISEL: fcsel
        %tmp1neg = sub <2 x i64> zeroinitializer, %a
        %b = icmp sle <2 x i64> %a, zeroinitializer
        %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
        ret <2 x i64> %abs
}
1050