; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s

; addhn: add two wide vectors, keep the high half of each lane (narrowing).
; The addhn2 variants write the result into the high half of a 128-bit
; destination; each is built here from two narrowing adds + a concat shuffle,
; which ISel must match to an addhn/addhn2 pair.
define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn8b:
;CHECK: addhn.8b
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
        ret <8 x i8> %tmp3
}

define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn4h:
;CHECK: addhn.4h
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
        ret <4 x i16> %tmp3
}

define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2s:
;CHECK: addhn.2s
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp2 = load <2 x i64>, <2 x i64>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
        ret <2 x i32> %tmp3
}

define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: addhn2_16b:
;CHECK: addhn.8b
;CHECK-NEXT: addhn2.16b
  %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: addhn2_8h:
;CHECK: addhn.4h
;CHECK-NEXT: addhn2.8h
  %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: addhn2_4s:
;CHECK: addhn.2s
;CHECK-NEXT: addhn2.4s
  %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone


; raddhn: rounding variant of addhn. Same structure as the addhn tests above:
; scalar narrowing forms, then concat-shuffle pairs that must select
; raddhn + raddhn2.
define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: raddhn8b:
;CHECK: raddhn.8b
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
        ret <8 x i8> %tmp3
}

define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: raddhn4h:
;CHECK: raddhn.4h
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
        ret <4 x i16> %tmp3
}

define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: raddhn2s:
;CHECK: raddhn.2s
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp2 = load <2 x i64>, <2 x i64>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
        ret <2 x i32> %tmp3
}

define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: raddhn2_16b:
;CHECK: raddhn.8b
;CHECK-NEXT: raddhn2.16b
  %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: raddhn2_8h:
;CHECK: raddhn.4h
;CHECK-NEXT: raddhn2.8h
  %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: raddhn2_4s:
;CHECK: raddhn.2s
;CHECK-NEXT: raddhn2.4s
  %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone

; saddl: signed widening add. Expressed as sext of both operands + add;
; ISel must fold the extensions into saddl. The saddl2_* tests extract the
; high 64 bits via a bitcast/shuffle of <2 x i64> and must select saddl2
; with no separate ext instruction.
define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: saddl8h:
;CHECK: saddl.8h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}

define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: saddl4s:
;CHECK: saddl.4s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}

define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: saddl2d:
;CHECK: saddl.2d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4
        ret <2 x i64> %tmp5
}

define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  {
; CHECK-LABEL: saddl2_8h:
; CHECK-NEXT: saddl2.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
  %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
  %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <8 x i16> %add.i
}

define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  {
; CHECK-LABEL: saddl2_4s:
; CHECK-NEXT: saddl2.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <8 x i16> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
  %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
  %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <4 x i32> %add.i
}

define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  {
; CHECK-LABEL: saddl2_2d:
; CHECK-NEXT: saddl2.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <4 x i32> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
  %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
  %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <2 x i64> %add.i
}

; uaddl: unsigned widening add (zext + add folded into one instruction).
; uaddl2_* operate on the high halves, mirroring the saddl2 tests above.
define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uaddl8h:
;CHECK: uaddl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uaddl4s:
;CHECK: uaddl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uaddl2d:
;CHECK: uaddl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}


define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  {
; CHECK-LABEL: uaddl2_8h:
; CHECK-NEXT: uaddl2.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
  %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
  %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <8 x i16> %add.i
}

define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  {
; CHECK-LABEL: uaddl2_4s:
; CHECK-NEXT: uaddl2.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <8 x i16> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
  %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
  %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <4 x i32> %add.i
}

define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  {
; CHECK-LABEL: uaddl2_2d:
; CHECK-NEXT: uaddl2.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <4 x i32> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
  %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
  %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <2 x i64> %add.i
}

; uaddw: wide + zero-extended narrow operand. The uaddw2_* tests extend the
; high half of a 128-bit operand (shufflevector of the upper lanes) and must
; select uaddw2 rather than a separate extract + extend.
define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uaddw8h:
;CHECK: uaddw.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = add <8 x i16> %tmp1, %tmp3
        ret <8 x i16> %tmp4
}

define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uaddw4s:
;CHECK: uaddw.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = add <4 x i32> %tmp1, %tmp3
        ret <4 x i32> %tmp4
}

define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uaddw2d:
;CHECK: uaddw.2d
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = add <2 x i64> %tmp1, %tmp3
        ret <2 x i64> %tmp4
}

define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uaddw2_8h:
;CHECK: uaddw.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A

        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %ext2 = zext <8 x i8> %high2 to <8 x i16>

        %res = add <8 x i16> %tmp1, %ext2
        ret <8 x i16> %res
}

define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uaddw2_4s:
;CHECK: uaddw.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A

        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %ext2 = zext <4 x i16> %high2 to <4 x i32>

        %res = add <4 x i32> %tmp1, %ext2
        ret <4 x i32> %res
}

define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uaddw2_2d:
;CHECK: uaddw.2d
        %tmp1 = load <2 x i64>, <2 x i64>* %A

        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %ext2 = zext <2 x i32> %high2 to <2 x i64>

        %res = add <2 x i64> %tmp1, %ext2
        ret <2 x i64> %res
}

; saddw: wide + sign-extended narrow operand; saddw2_* take the high half,
; mirroring the uaddw tests above with sext instead of zext.
define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: saddw8h:
;CHECK: saddw.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
        %tmp4 = add <8 x i16> %tmp1, %tmp3
        ret <8 x i16> %tmp4
}

define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: saddw4s:
;CHECK: saddw.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
        %tmp4 = add <4 x i32> %tmp1, %tmp3
        ret <4 x i32> %tmp4
}

define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: saddw2d:
;CHECK: saddw.2d
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
        %tmp4 = add <2 x i64> %tmp1, %tmp3
        ret <2 x i64> %tmp4
}

define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: saddw2_8h:
;CHECK: saddw.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A

        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %ext2 = sext <8 x i8> %high2 to <8 x i16>

        %res = add <8 x i16> %tmp1, %ext2
        ret <8 x i16> %res
}

define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: saddw2_4s:
;CHECK: saddw.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A

        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %ext2 = sext <4 x i16> %high2 to <4 x i32>

        %res = add <4 x i32> %tmp1, %ext2
        ret <4 x i32> %res
}

define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: saddw2_2d:
;CHECK: saddw.2d
        %tmp1 = load <2 x i64>, <2 x i64>* %A

        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %ext2 = sext <2 x i32> %high2 to <2 x i64>

        %res = add <2 x i64> %tmp1, %ext2
        ret <2 x i64> %res
}

; saddlp: signed add of adjacent lane pairs, producing a vector of half the
; lane count at double the width. Exercised directly via the target intrinsic.
define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: saddlp4h:
;CHECK: saddlp.4h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
        ret <4 x i16> %tmp3
}

define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind {
;CHECK-LABEL: saddlp2s:
;CHECK: saddlp.2s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
        ret <2 x i32> %tmp3
}

define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind {
;CHECK-LABEL: saddlp1d:
;CHECK: saddlp.1d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
        ret <1 x i64> %tmp3
}

define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: saddlp8h:
;CHECK: saddlp.8h
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
        ret <8 x i16> %tmp3
}

define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: saddlp4s:
;CHECK: saddlp.4s
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
        ret <4 x i32> %tmp3
}

define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: saddlp2d:
;CHECK: saddlp.2d
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
        ret <2 x i64> %tmp3
}

declare <4 x i16>  @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16>  @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone

; uaddlp: unsigned add of adjacent lane pairs; mirrors the saddlp tests above.
define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: uaddlp4h:
;CHECK: uaddlp.4h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
        ret <4 x i16> %tmp3
}

define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind {
;CHECK-LABEL: uaddlp2s:
;CHECK: uaddlp.2s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
        ret <2 x i32> %tmp3
}

define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind {
;CHECK-LABEL: uaddlp1d:
;CHECK: uaddlp.1d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
        ret <1 x i64> %tmp3
}

define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: uaddlp8h:
;CHECK: uaddlp.8h
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
        ret <8 x i16> %tmp3
}

define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: uaddlp4s:
;CHECK: uaddlp.4s
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
        ret <4 x i32> %tmp3
}

define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: uaddlp2d:
;CHECK: uaddlp.2d
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
        ret <2 x i64> %tmp3
}

declare <4 x i16>  @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16>  @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone

; sadalp/uadalp: pairwise widening add folded into an accumulate. Each test is
; a [su]addlp intrinsic whose result is added to a loaded accumulator; ISel
; must combine the two into the accumulating form.
define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sadalp4h:
;CHECK: sadalp.4h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
        %tmp4 = load <4 x i16>, <4 x i16>* %B
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}

define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sadalp2s:
;CHECK: sadalp.2s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
        %tmp4 = load <2 x i32>, <2 x i32>* %B
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}

define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sadalp8h:
;CHECK: sadalp.8h
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
        %tmp4 = load <8 x i16>, <8 x i16>* %B
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}

define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sadalp4s:
;CHECK: sadalp.4s
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
        %tmp4 = load <4 x i32>, <4 x i32>* %B
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}

define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: sadalp2d:
;CHECK: sadalp.2d
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
        %tmp4 = load <2 x i64>, <2 x i64>* %B
        %tmp5 = add <2 x i64> %tmp3, %tmp4
        ret <2 x i64> %tmp5
}

define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uadalp4h:
;CHECK: uadalp.4h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
        %tmp4 = load <4 x i16>, <4 x i16>* %B
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}

define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uadalp2s:
;CHECK: uadalp.2s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
        %tmp4 = load <2 x i32>, <2 x i32>* %B
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}

define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uadalp8h:
;CHECK: uadalp.8h
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
        %tmp4 = load <8 x i16>, <8 x i16>* %B
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}

define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uadalp4s:
;CHECK: uadalp.4s
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
        %tmp4 = load <4 x i32>, <4 x i32>* %B
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}

define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: uadalp2d:
;CHECK: uadalp.2d
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
        %tmp4 = load <2 x i64>, <2 x i64>* %B
        %tmp5 = add <2 x i64> %tmp3, %tmp4
        ret <2 x i64> %tmp5
}

; addp: integer pairwise add across two source vectors, one test per
; supported vector arrangement, driven directly by the target intrinsic.
define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: addp_8b:
;CHECK: addp.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        ret <8 x i8> %tmp3
}

define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: addp_16b:
;CHECK: addp.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        ret <16 x i8> %tmp3
}

define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: addp_4h:
;CHECK: addp.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        ret <4 x i16> %tmp3
}

define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addp_8h:
;CHECK: addp.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        ret <8 x i16> %tmp3
}

define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: addp_2s:
;CHECK: addp.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        ret <2 x i32> %tmp3
}

define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addp_4s:
;CHECK: addp.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        ret <4 x i32> %tmp3
}

define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addp_2d:
;CHECK: addp.2d
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp2 = load <2 x i64>, <2 x i64>* %B
        %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
        ret <2 x i64> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

; faddp: floating-point pairwise add, via the target intrinsic.
define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: faddp_2s:
;CHECK: faddp.2s
        %tmp1 = load <2 x float>, <2 x float>* %A
        %tmp2 = load <2 x float>, <2 x float>* %B
        %tmp3 = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
        ret <2 x float> %tmp3
}

define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: faddp_4s:
;CHECK: faddp.4s
        %tmp1 = load <4 x float>, <4 x float>* %A
        %tmp2 = load <4 x float>, <4 x float>* %B
        %tmp3 = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
        ret <4 x float> %tmp3
}

define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: faddp_2d:
;CHECK: faddp.2d
        %tmp1 = load <2 x double>, <2 x double>* %A
        %tmp2 = load <2 x double>, <2 x double>* %B
        %tmp3 = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
        ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone

741define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
742; CHECK-LABEL: uaddl_duprhs
743; CHECK-NOT: ext.16b
744; CHECK: uaddl.2d
745  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
746  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
747
748  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
749
750  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
751  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
752
753  %res = add <2 x i64> %lhs.ext, %rhs.ext
754  ret <2 x i64> %res
755}
756
757define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
758; CHECK-LABEL: uaddl2_duprhs
759; CHECK-NOT: ext.16b
760; CHECK: uaddl2.2d
761  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
762  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
763
764  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
765
766  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
767  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
768
769  %res = add <2 x i64> %lhs.ext, %rhs.ext
770  ret <2 x i64> %res
771}
772
define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl_duplhs:
; CHECK-NOT: ext.16b
; CHECK: saddl.2d
; Widening signed add of a dup'd scalar LHS against the low half of %rhs;
; must select saddl with no ext.16b.
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1

  ; Mask <0,1> extracts the low half of %rhs.
  %rhs.low = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs.low to <2 x i64>

  %res = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}
788
define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl2_duplhs:
; CHECK-NOT: ext.16b
; CHECK: saddl2.2d
; Same as saddl_duplhs but taking the high half of %rhs, so the "2" form
; (saddl2) must be selected with no ext.16b.
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1

  ; Mask <2,3> extracts the high half of %rhs.
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>

  %res = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}
804
define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl_duprhs:
; CHECK-NOT: ext.16b
; CHECK: usubl.2d
; Widening unsigned subtract of a dup'd scalar from the low half of %lhs;
; must select usubl with no ext.16b.
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  ; Mask <0,1> extracts the low half of %lhs.
  %lhs.low = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

  %lhs.ext = zext <2 x i32> %lhs.low to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>

  %res = sub <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}
820
define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl2_duprhs:
; CHECK-NOT: ext.16b
; CHECK: usubl2.2d
; Same as usubl_duprhs but taking the high half of %lhs, so the "2" form
; (usubl2) must be selected with no ext.16b.
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  ; Mask <2,3> extracts the high half of %lhs.
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>

  %res = sub <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}
836
define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: ssubl_duplhs:
; CHECK-NOT: ext.16b
; CHECK: ssubl.2d
; Widening signed subtract with a dup'd scalar LHS and the low half of
; %rhs; must select ssubl with no ext.16b.
  %splat.init = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %splat = insertelement <2 x i32> %splat.init, i32 %lhs, i32 1

  ; Mask <0,1> extracts the low half of %rhs.
  %bottom = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

  %wide.lhs = sext <2 x i32> %splat to <2 x i64>
  %wide.rhs = sext <2 x i32> %bottom to <2 x i64>

  %res = sub <2 x i64> %wide.lhs, %wide.rhs
  ret <2 x i64> %res
}
852
define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: ssubl2_duplhs:
; CHECK-NOT: ext.16b
; CHECK: ssubl2.2d
; Widening signed subtract with a dup'd scalar LHS and the high half of
; %rhs; must select the "2" form (ssubl2) with no ext.16b.
  %splat.init = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %splat = insertelement <2 x i32> %splat.init, i32 %lhs, i32 1

  ; Mask <2,3> extracts the high half of %rhs.
  %top = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %wide.lhs = sext <2 x i32> %splat to <2 x i64>
  %wide.rhs = sext <2 x i32> %top to <2 x i64>

  %res = sub <2 x i64> %wide.lhs, %wide.rhs
  ret <2 x i64> %res
}
868
define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn8b_natural:
;CHECK: addhn.8b
; "Natural" IR for add-high-half-narrow: add, shift down by half the
; element width, truncate. Must be matched to a single addhn.
  %a = load <8 x i16>, <8 x i16>* %A
  %b = load <8 x i16>, <8 x i16>* %B
  %total = add <8 x i16> %a, %b
  %hi = lshr <8 x i16> %total, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %hi to <8 x i8>
  ret <8 x i8> %narrow
}
879
define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn4h_natural:
;CHECK: addhn.4h
; add + lshr 16 + trunc on i32 lanes must become addhn.4h.
  %a = load <4 x i32>, <4 x i32>* %A
  %b = load <4 x i32>, <4 x i32>* %B
  %total = add <4 x i32> %a, %b
  %hi = lshr <4 x i32> %total, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %hi to <4 x i16>
  ret <4 x i16> %narrow
}
890
define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2s_natural:
;CHECK: addhn.2s
; add + lshr 32 + trunc on i64 lanes must become addhn.2s.
  %a = load <2 x i64>, <2 x i64>* %A
  %b = load <2 x i64>, <2 x i64>* %B
  %total = add <2 x i64> %a, %b
  %hi = lshr <2 x i64> %total, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %hi to <2 x i32>
  ret <2 x i32> %narrow
}
901
define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn2_16b_natural:
;CHECK: addhn2.16b
; The narrowed high-halves are concatenated onto an existing low half, so
; the "2" form (addhn2) writing the top of the register must be selected.
  %a = load <8 x i16>, <8 x i16>* %A
  %b = load <8 x i16>, <8 x i16>* %B
  %total = add <8 x i16> %a, %b
  %hi = lshr <8 x i16> %total, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %hi to <8 x i8>
  %res = shufflevector <8 x i8> %low, <8 x i8> %narrow, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}
913
define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn2_8h_natural:
;CHECK: addhn2.8h
; Concatenating the narrowed result onto %low must select addhn2.8h.
  %a = load <4 x i32>, <4 x i32>* %A
  %b = load <4 x i32>, <4 x i32>* %B
  %total = add <4 x i32> %a, %b
  %hi = lshr <4 x i32> %total, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %hi to <4 x i16>
  %res = shufflevector <4 x i16> %low, <4 x i16> %narrow, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}
925
define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2_4s_natural:
;CHECK: addhn2.4s
; Concatenating the narrowed result onto %low must select addhn2.4s.
  %a = load <2 x i64>, <2 x i64>* %A
  %b = load <2 x i64>, <2 x i64>* %B
  %total = add <2 x i64> %a, %b
  %hi = lshr <2 x i64> %total, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %hi to <2 x i32>
  %res = shufflevector <2 x i32> %low, <2 x i32> %narrow, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}
937
define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn8b_natural:
;CHECK: subhn.8b
; "Natural" IR for subtract-high-half-narrow: sub, shift down by half the
; element width, truncate. Must be matched to a single subhn.
  %a = load <8 x i16>, <8 x i16>* %A
  %b = load <8 x i16>, <8 x i16>* %B
  %delta = sub <8 x i16> %a, %b
  %hi = lshr <8 x i16> %delta, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %hi to <8 x i8>
  ret <8 x i8> %narrow
}
948
define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: subhn4h_natural:
;CHECK: subhn.4h
; sub + lshr 16 + trunc on i32 lanes must become subhn.4h.
  %a = load <4 x i32>, <4 x i32>* %A
  %b = load <4 x i32>, <4 x i32>* %B
  %delta = sub <4 x i32> %a, %b
  %hi = lshr <4 x i32> %delta, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %hi to <4 x i16>
  ret <4 x i16> %narrow
}
959
define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: subhn2s_natural:
;CHECK: subhn.2s
; sub + lshr 32 + trunc on i64 lanes must become subhn.2s.
  %a = load <2 x i64>, <2 x i64>* %A
  %b = load <2 x i64>, <2 x i64>* %B
  %delta = sub <2 x i64> %a, %b
  %hi = lshr <2 x i64> %delta, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %hi to <2 x i32>
  ret <2 x i32> %narrow
}
970
define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn2_16b_natural:
;CHECK: subhn2.16b
; Concatenating the narrowed difference onto %low must select subhn2.16b.
  %a = load <8 x i16>, <8 x i16>* %A
  %b = load <8 x i16>, <8 x i16>* %B
  %delta = sub <8 x i16> %a, %b
  %hi = lshr <8 x i16> %delta, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %hi to <8 x i8>
  %res = shufflevector <8 x i8> %low, <8 x i8> %narrow, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}
982
define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: subhn2_8h_natural:
;CHECK: subhn2.8h
; Concatenating the narrowed difference onto %low must select subhn2.8h.
  %a = load <4 x i32>, <4 x i32>* %A
  %b = load <4 x i32>, <4 x i32>* %B
  %delta = sub <4 x i32> %a, %b
  %hi = lshr <4 x i32> %delta, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %hi to <4 x i16>
  %res = shufflevector <4 x i16> %low, <4 x i16> %narrow, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}
994
define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: subhn2_4s_natural:
;CHECK: subhn2.4s
; Concatenating the narrowed difference onto %low must select subhn2.4s.
  %a = load <2 x i64>, <2 x i64>* %A
  %b = load <2 x i64>, <2 x i64>* %B
  %delta = sub <2 x i64> %a, %b
  %hi = lshr <2 x i64> %delta, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %hi to <2 x i32>
  %res = shufflevector <2 x i32> %low, <2 x i32> %narrow, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}
1006