1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
3
; shadd intrinsic on <8 x i8> selects to a single shadd.8b (64-bit D-reg operands).
define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: shadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.8b v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}
16
; shadd intrinsic on <16 x i8> selects to a single shadd.16b (128-bit Q-reg operands).
define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: shadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.16b v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}
29
; shadd intrinsic on <4 x i16> selects to a single shadd.4h (64-bit D-reg operands).
define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: shadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.4h v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}
42
; shadd intrinsic on <8 x i16> selects to a single shadd.8h (128-bit Q-reg operands).
define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: shadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.8h v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}
55
; shadd intrinsic on <2 x i32> selects to a single shadd.2s (64-bit D-reg operands).
define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: shadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.2s v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}
68
; shadd intrinsic on <4 x i32> selects to a single shadd.4s (128-bit Q-reg operands).
define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: shadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.4s v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}
81
; uhadd intrinsic on <8 x i8> selects to a single uhadd.8b (64-bit D-reg operands).
define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: uhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}
94
; uhadd intrinsic on <16 x i8> selects to a single uhadd.16b (128-bit Q-reg operands).
define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: uhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}
107
; uhadd intrinsic on <4 x i16> selects to a single uhadd.4h (64-bit D-reg operands).
define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: uhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}
120
; uhadd intrinsic on <8 x i16> selects to a single uhadd.8h (128-bit Q-reg operands).
define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: uhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}
133
; uhadd intrinsic on <2 x i32> selects to a single uhadd.2s (64-bit D-reg operands).
define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: uhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}
146
; uhadd intrinsic on <4 x i32> selects to a single uhadd.4s (128-bit Q-reg operands).
define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: uhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}
159
160declare <8 x i8>  @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
161declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
162declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
163
164declare <8 x i8>  @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
165declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
166declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
167
168declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
169declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
170declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
171
172declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
173declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
174declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
175
; srhadd intrinsic on <8 x i8> selects to a single srhadd.8b (64-bit D-reg operands).
define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: srhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}
188
; srhadd intrinsic on <16 x i8> selects to a single srhadd.16b (128-bit Q-reg operands).
define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: srhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}
201
; srhadd intrinsic on <4 x i16> selects to a single srhadd.4h (64-bit D-reg operands).
define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: srhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}
214
; srhadd intrinsic on <8 x i16> selects to a single srhadd.8h (128-bit Q-reg operands).
define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: srhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}
227
; srhadd intrinsic on <2 x i32> selects to a single srhadd.2s (64-bit D-reg operands).
define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: srhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}
240
; srhadd intrinsic on <4 x i32> selects to a single srhadd.4s (128-bit Q-reg operands).
define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: srhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}
253
; urhadd intrinsic on <8 x i8> selects to a single urhadd.8b (64-bit D-reg operands).
define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: urhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}
266
; urhadd intrinsic on <16 x i8> selects to a single urhadd.16b (128-bit Q-reg operands).
define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: urhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}
279
; urhadd intrinsic on <4 x i16> selects to a single urhadd.4h (64-bit D-reg operands).
define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: urhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}
292
; urhadd intrinsic on <8 x i16> selects to a single urhadd.8h (128-bit Q-reg operands).
define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: urhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}
305
; urhadd intrinsic on <2 x i32> selects to a single urhadd.2s (64-bit D-reg operands).
define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: urhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}
318
; urhadd intrinsic on <4 x i32> selects to a single urhadd.4s (128-bit Q-reg operands).
define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: urhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}
331
; The generic pattern sext-to-i16, add, +1, lshr 1, trunc should be combined
; into a single srhadd.8b (signed rounding halving add) rather than widened math.
define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
  %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
  %add1 = add <8 x i16> %sextsrc1, %sextsrc2
  %add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, <8 x i8>* %dest, align 8
  ret void
}
347
; The generic pattern sext-to-i32, add, +1, lshr 1, trunc should be combined
; into a single srhadd.4h rather than widened math.
define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
  %add1 = add <4 x i32> %sextsrc1, %sextsrc2
  %add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, <4 x i16>* %dest, align 8
  ret void
}
363
; The generic pattern sext-to-i64, add, +1, lshr 1, trunc should be combined
; into a single srhadd.2s rather than widened math.
define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
  %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
  %add1 = add <2 x i64> %sextsrc1, %sextsrc2
  %add2 = add <2 x i64> %add1, <i64 1, i64 1>
  %resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, <2 x i32>* %dest, align 8
  ret void
}
379
; The generic pattern sext-to-i16, add, +1, lshr 1, trunc should be combined
; into a single srhadd.16b (full Q-reg width) rather than widened math.
define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
  %add1 = add <16 x i16> %sextsrc1, %sextsrc2
  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, <16 x i8>* %dest, align 16
  ret void
}
395
; The generic pattern sext-to-i32, add, +1, lshr 1, trunc should be combined
; into a single srhadd.8h (full Q-reg width) rather than widened math.
define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add1 = add <8 x i32> %sextsrc1, %sextsrc2
  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, <8 x i16>* %dest, align 16
  ret void
}
411
; The generic pattern sext-to-i64, add, +1, lshr 1, trunc should be combined
; into a single srhadd.4s (full Q-reg width) rather than widened math.
define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add1 = add <4 x i64> %sextsrc1, %sextsrc2
  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, <4 x i32>* %dest, align 16
  ret void
}
427
; The generic pattern zext-to-i16, add, +1, lshr 1, trunc should be combined
; into a single urhadd.8b (unsigned rounding halving add) rather than widened math.
define void @testLowerToURHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
  %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
  %add1 = add <8 x i16> %zextsrc1, %zextsrc2
  %add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, <8 x i8>* %dest, align 8
  ret void
}
443
; The generic pattern zext-to-i32, add, +1, lshr 1, trunc should be combined
; into a single urhadd.4h rather than widened math.
define void @testLowerToURHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
  %add1 = add <4 x i32> %zextsrc1, %zextsrc2
  %add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, <4 x i16>* %dest, align 8
  ret void
}
459
; The generic pattern zext-to-i64, add, +1, lshr 1, trunc should be combined
; into a single urhadd.2s rather than widened math.
define void @testLowerToURHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
  %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
  %add1 = add <2 x i64> %zextsrc1, %zextsrc2
  %add2 = add <2 x i64> %add1, <i64 1, i64 1>
  %resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, <2 x i32>* %dest, align 8
  ret void
}
475
; The generic pattern zext-to-i16, add, +1, lshr 1, trunc should be combined
; into a single urhadd.16b (full Q-reg width) rather than widened math.
define void @testLowerToURHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
  %add1 = add <16 x i16> %zextsrc1, %zextsrc2
  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, <16 x i8>* %dest, align 16
  ret void
}
491
; The generic pattern zext-to-i32, add, +1, lshr 1, trunc should be combined
; into a single urhadd.8h (full Q-reg width) rather than widened math.
define void @testLowerToURHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add1 = add <8 x i32> %zextsrc1, %zextsrc2
  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, <8 x i16>* %dest, align 16
  ret void
}
507
; The generic pattern zext-to-i64, add, +1, lshr 1, trunc should be combined
; into a single urhadd.4s (full Q-reg width) rather than widened math.
define void @testLowerToURHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add1 = add <4 x i64> %zextsrc1, %zextsrc2
  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, <4 x i32>* %dest, align 16
  ret void
}
523
524declare <8 x i8>  @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
525declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
526declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
527
528declare <8 x i8>  @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
529declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
530declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
531
532declare <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
533declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
534declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
535
536declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
537declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
538declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
539