1; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
2
3; Simple load of v4f16: an align-8 <4 x half> load should select a single LDR of d0.
4define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
5; CHECK-LABEL: load_64:
6; CHECK: ldr d0, [x0]
7entry:
8  %0 = load <4 x half>* %a, align 8
9  ret <4 x half> %0
10}
11
12; Simple load of v8f16: an align-16 <8 x half> load should select a single LDR of q0.
13define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
14; CHECK-LABEL: load_128:
15; CHECK: ldr q0, [x0]
16entry:
17  %0 = load <8 x half>* %a, align 16
18  ret <8 x half> %0
19}
20
21; Duplicating load to v4f16: a scalar half load splatted to all 4 lanes (insertelement + zero-mask shufflevector) should select LD1R.
22define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
23; CHECK-LABEL: load_dup_64:
24; CHECK: ld1r { v0.4h }, [x0]
25entry:
26  %0 = load half* %a, align 2
27  %1 = insertelement <4 x half> undef, half %0, i32 0
28  %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
29  ret <4 x half> %2
30}
31
32; Duplicating load to v8f16: a scalar half load splatted to all 8 lanes (insertelement + zero-mask shufflevector) should select LD1R.
33define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
34; CHECK-LABEL: load_dup_128:
35; CHECK: ld1r { v0.8h }, [x0]
36entry:
37  %0 = load half* %a, align 2
38  %1 = insertelement <8 x half> undef, half %0, i32 0
39  %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
40  ret <8 x half> %2
41}
42
43; Load to one lane of v4f16: a scalar half load inserted into lane 2 of %b should select a single-lane LD1.
44define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
45; CHECK-LABEL: load_lane_64:
46; CHECK: ld1 { v0.h }[2], [x0]
47entry:
48  %0 = load half* %a, align 2
49  %1 = insertelement <4 x half> %b, half %0, i32 2
50  ret <4 x half> %1
51}
52
53; Load to one lane of v8f16: a scalar half load inserted into lane 5 of %b should select a single-lane LD1.
54define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
55; CHECK-LABEL: load_lane_128:
56; CHECK: ld1 { v0.h }[5], [x0]
57entry:
58  %0 = load half* %a, align 2
59  %1 = insertelement <8 x half> %b, half %0, i32 5
60  ret <8 x half> %1
61}
62
63; Simple store of v4f16: an align-8 <4 x half> store should select a single STR of d0.
64define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
65; CHECK-LABEL: store_64:
66; CHECK: str d0, [x0]
67entry:
68  store <4 x half> %b, <4 x half>* %a, align 8
69  ret void
70}
71
72; Simple store of v8f16: an align-16 <8 x half> store should select a single STR of q0.
73define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
74; CHECK-LABEL: store_128:
75; CHECK: str q0, [x0]
76entry:
77  store <8 x half> %b, <8 x half>* %a, align 16
78  ret void
79}
80
81; Store from one lane of v4f16: storing the half extracted from lane 2 of %b should select a single-lane ST1.
82define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
83; CHECK-LABEL: store_lane_64:
84; CHECK: st1 { v0.h }[2], [x0]
85entry:
86  %0 = extractelement <4 x half> %b, i32 2
87  store half %0, half* %a, align 2
88  ret void
89}
90
91; Store from one lane of v8f16: storing the half extracted from lane 5 of %b should select a single-lane ST1.
92define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
93; CHECK-LABEL: store_lane_128:
94; CHECK: st1 { v0.h }[5], [x0]
95entry:
96  %0 = extractelement <8 x half> %b, i32 5
97  store half %0, half* %a, align 2
98  ret void
99}
100
101; NEON intrinsics - (de-)interleaving loads and stores: ld2/ld3/ld4 and st2/st3/st4 for <4 x half> and <8 x half>
102declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
103declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
104declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
105declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
106declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
107declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
108declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
109declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
110declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
111declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
112declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
113declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
114
115; Load 2 x v4f16 with de-interleaving: the ld2 intrinsic should select LD2 into v0.4h and v1.4h.
116define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
117; CHECK-LABEL: load_interleave_64_2:
118; CHECK: ld2 { v0.4h, v1.4h }, [x0]
119entry:
120  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
121  ret { <4 x half>, <4 x half> } %0
122}
123
124; Load 3 x v4f16 with de-interleaving: the ld3 intrinsic should select LD3 into v0.4h-v2.4h.
125define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
126; CHECK-LABEL: load_interleave_64_3:
127; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
128entry:
129  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
130  ret { <4 x half>, <4 x half>, <4 x half> } %0
131}
132
133; Load 4 x v4f16 with de-interleaving: the ld4 intrinsic should select LD4 into v0.4h-v3.4h.
134define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
135; CHECK-LABEL: load_interleave_64_4:
136; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
137entry:
138  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
139  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
140}
141
142; Store 2 x v4f16 with interleaving: the st2 intrinsic should select ST2 from v0.4h and v1.4h.
143define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
144; CHECK-LABEL: store_interleave_64_2:
145; CHECK: st2 { v0.4h, v1.4h }, [x0]
146entry:
147  tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
148  ret void
149}
150
151; Store 3 x v4f16 with interleaving: the st3 intrinsic should select ST3 from v0.4h-v2.4h.
152define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
153; CHECK-LABEL: store_interleave_64_3:
154; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
155entry:
156  tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
157  ret void
158}
159
160; Store 4 x v4f16 with interleaving: the st4 intrinsic should select ST4 from v0.4h-v3.4h.
161define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
162; CHECK-LABEL: store_interleave_64_4:
163; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
164entry:
165  tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
166  ret void
167}
168
169; Load 2 x v8f16 with de-interleaving: the ld2 intrinsic should select LD2 into v0.8h and v1.8h.
170define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
171; CHECK-LABEL: load_interleave_128_2:
172; CHECK: ld2 { v0.8h, v1.8h }, [x0]
173entry:
174  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
175  ret { <8 x half>, <8 x half> } %0
176}
177
178; Load 3 x v8f16 with de-interleaving: the ld3 intrinsic should select LD3 into v0.8h-v2.8h.
179define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
180; CHECK-LABEL: load_interleave_128_3:
181; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
182entry:
183  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
184  ret { <8 x half>, <8 x half>, <8 x half> } %0
185}
186
187; Load 4 x v8f16 with de-interleaving: the ld4 intrinsic should select LD4 into v0.8h-v3.8h.
188define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
189; CHECK-LABEL: load_interleave_128_4:
190; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
191entry:
192  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
193  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
194}
195
196; Store 2 x v8f16 with interleaving: the st2 intrinsic should select ST2 from v0.8h and v1.8h.
197define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
198; CHECK-LABEL: store_interleave_128_2:
199; CHECK: st2 { v0.8h, v1.8h }, [x0]
200entry:
201  tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
202  ret void
203}
204
205; Store 3 x v8f16 with interleaving: the st3 intrinsic should select ST3 from v0.8h-v2.8h.
206define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
207; CHECK-LABEL: store_interleave_128_3:
208; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
209entry:
210  tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
211  ret void
212}
213
214; Store 4 x v8f16 with interleaving: the st4 intrinsic should select ST4 from v0.8h-v3.8h.
215define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
216; CHECK-LABEL: store_interleave_128_4:
217; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
218entry:
219  tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
220  ret void
221}
222
223; NEON intrinsics - duplicating loads: ld2r/ld3r/ld4r for <4 x half> and <8 x half>, each taking a half* address
224declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
225declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
226declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
227declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
228declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
229declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)
230
231; Load 2 x v4f16 with duplication: the ld2r intrinsic should select LD2R into v0.4h and v1.4h.
232define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
233; CHECK-LABEL: load_dup_64_2:
234; CHECK: ld2r { v0.4h, v1.4h }, [x0]
235entry:
236  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
237  ret { <4 x half>, <4 x half> } %0
238}
239
240; Load 3 x v4f16 with duplication: the ld3r intrinsic should select LD3R into v0.4h-v2.4h.
241define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
242; CHECK-LABEL: load_dup_64_3:
243; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
244entry:
245  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
246  ret { <4 x half>, <4 x half>, <4 x half> } %0
247}
248
249; Load 4 x v4f16 with duplication: the ld4r intrinsic should select LD4R into v0.4h-v3.4h.
250define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
251; CHECK-LABEL: load_dup_64_4:
252; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
253entry:
254  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
255  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
256}
257
258; Load 2 x v8f16 with duplication: the ld2r intrinsic should select LD2R into v0.8h and v1.8h.
259define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
260; CHECK-LABEL: load_dup_128_2:
261; CHECK: ld2r { v0.8h, v1.8h }, [x0]
262entry:
263  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
264  ret { <8 x half>, <8 x half> } %0
265}
266
267; Load 3 x v8f16 with duplication: the ld3r intrinsic should select LD3R into v0.8h-v2.8h.
268define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
269; CHECK-LABEL: load_dup_128_3:
270; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
271entry:
272  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
273  ret { <8 x half>, <8 x half>, <8 x half> } %0
274}
275
276; Load 4 x v8f16 with duplication: the ld4r intrinsic should select LD4R into v0.8h-v3.8h.
277define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
278; CHECK-LABEL: load_dup_128_4:
279; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
280entry:
281  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
282  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
283}
284
285
286; NEON intrinsics - loads and stores to/from one lane: ld2lane/ld3lane/ld4lane and st2lane/st3lane/st4lane for <4 x half> and <8 x half>, each taking an i64 lane index and a half* address
287declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
288declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
289declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
290declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
291declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
292declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
293declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
294declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
295declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
296declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
297declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
298declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
299
300; Load one lane of 2 x v4f16: ld2lane with lane index 2 should select the single-lane LD2 form on v0.h and v1.h.
301define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
302; CHECK-LABEL: load_lane_64_2:
303; CHECK: ld2 { v0.h, v1.h }[2], [x0]
304entry:
305  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
306  ret { <4 x half>, <4 x half> } %0
307}
308
309; Load one lane of 3 x v4f16: ld3lane with lane index 2 should select the single-lane LD3 form on v0.h-v2.h.
310define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
311; CHECK-LABEL: load_lane_64_3:
312; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
313entry:
314  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
315  ret { <4 x half>, <4 x half>, <4 x half> } %0
316}
317
318; Load one lane of 4 x v4f16: ld4lane with lane index 2 should select the single-lane LD4 form on v0.h-v3.h.
319define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
320; CHECK-LABEL: load_lane_64_4:
321; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
322entry:
323  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
324  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
325}
326
327; Store one lane of 2 x v4f16: st2lane with lane index 2 should select the single-lane ST2 form on v0.h and v1.h.
328define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
329; CHECK-LABEL: store_lane_64_2:
330; CHECK: st2 { v0.h, v1.h }[2], [x0]
331entry:
332  tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
333  ret void
334}
335
336; Store one lane of 3 x v4f16: st3lane with lane index 2 should select the single-lane ST3 form on v0.h-v2.h.
337define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
338; CHECK-LABEL: store_lane_64_3:
339; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
340entry:
341  tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
342  ret void
343}
344
345; Store one lane of 4 x v4f16: st4lane with lane index 2 should select the single-lane ST4 form on v0.h-v3.h.
346define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
347; CHECK-LABEL: store_lane_64_4:
348; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
349entry:
350  tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
351  ret void
352}
353
354; Load one lane of 2 x v8f16: ld2lane with lane index 2 should select the single-lane LD2 form on v0.h and v1.h.
355define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
356; CHECK-LABEL: load_lane_128_2:
357; CHECK: ld2 { v0.h, v1.h }[2], [x0]
358entry:
359  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
360  ret { <8 x half>, <8 x half> } %0
361}
362
363; Load one lane of 3 x v8f16: ld3lane with lane index 2 should select the single-lane LD3 form on v0.h-v2.h.
364define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
365; CHECK-LABEL: load_lane_128_3:
366; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
367entry:
368  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
369  ret { <8 x half>, <8 x half>, <8 x half> } %0
370}
371
372; Load one lane of 4 x v8f16: ld4lane with lane index 2 should select the single-lane LD4 form on v0.h-v3.h.
373define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
374; CHECK-LABEL: load_lane_128_4:
375; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
376entry:
377  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
378  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
379}
380
381; Store one lane of 2 x v8f16: st2lane with lane index 2 should select the single-lane ST2 form on v0.h and v1.h.
382define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
383; CHECK-LABEL: store_lane_128_2:
384; CHECK: st2 { v0.h, v1.h }[2], [x0]
385entry:
386  tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
387  ret void
388}
389
390; Store one lane of 3 x v8f16: st3lane with lane index 2 should select the single-lane ST3 form on v0.h-v2.h.
391define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
392; CHECK-LABEL: store_lane_128_3:
393; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
394entry:
395  tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
396  ret void
397}
398
399; Store one lane of 4 x v8f16: st4lane with lane index 2 should select the single-lane ST4 form on v0.h-v3.h.
400define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
401; CHECK-LABEL: store_lane_128_4:
402; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
403entry:
404  tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
405  ret void
406}
407
408; NEON intrinsics - load/store without interleaving: ld1x2/ld1x3/ld1x4 and st1x2/st1x3/st1x4 for <4 x half> and <8 x half>
409declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
410declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
411declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
412declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
413declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
414declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
415declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
416declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
417declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
418declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
419declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
420declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
421
422; Load 2 x v4f16 without de-interleaving: the ld1x2 intrinsic should select a two-register LD1 into v0.4h and v1.4h.
423define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
424; CHECK-LABEL: load_64_2:
425; CHECK: ld1 { v0.4h, v1.4h }, [x0]
426entry:
427  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
428  ret { <4 x half>, <4 x half> } %0
429}
430
431; Load 3 x v4f16 without de-interleaving: the ld1x3 intrinsic should select a three-register LD1 into v0.4h-v2.4h.
432define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
433; CHECK-LABEL: load_64_3:
434; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
435entry:
436  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
437  ret { <4 x half>, <4 x half>, <4 x half> } %0
438}
439
440; Load 4 x v4f16 without de-interleaving: the ld1x4 intrinsic should select a four-register LD1 into v0.4h-v3.4h.
441define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
442; CHECK-LABEL: load_64_4:
443; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
444entry:
445  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
446  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
447}
448
449; Store 2 x v4f16 without interleaving: the st1x2 intrinsic should select a two-register ST1 from v0.4h and v1.4h.
450define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
451; CHECK-LABEL: store_64_2:
452; CHECK: st1 { v0.4h, v1.4h }, [x0]
453entry:
454  tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
455  ret void
456}
457
458; Store 3 x v4f16 without interleaving: the st1x3 intrinsic should select a three-register ST1 from v0.4h-v2.4h.
459define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
460; CHECK-LABEL: store_64_3:
461; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
462entry:
463  tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
464  ret void
465}
466
467; Store 4 x v4f16 without interleaving: the st1x4 intrinsic should select a four-register ST1 from v0.4h-v3.4h.
468define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
469; CHECK-LABEL: store_64_4:
470; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
471entry:
472  tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
473  ret void
474}
475
476; Load 2 x v8f16 without de-interleaving: the ld1x2 intrinsic should select a two-register LD1 into v0.8h and v1.8h.
477define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
478; CHECK-LABEL: load_128_2:
479; CHECK: ld1 { v0.8h, v1.8h }, [x0]
480entry:
481  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
482  ret { <8 x half>, <8 x half> } %0
483}
484
485; Load 3 x v8f16 without de-interleaving: the ld1x3 intrinsic should select a three-register LD1 into v0.8h-v2.8h.
486define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
487; CHECK-LABEL: load_128_3:
488; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
489entry:
490  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
491  ret { <8 x half>, <8 x half>, <8 x half> } %0
492}
493
494; Load 4 x v8f16 without de-interleaving: the ld1x4 intrinsic should select a four-register LD1 into v0.8h-v3.8h.
495define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
496; CHECK-LABEL: load_128_4:
497; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
498entry:
499  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
500  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
501}
502
503; Store 2 x v8f16 without interleaving: the st1x2 intrinsic should select a two-register ST1 from v0.8h and v1.8h.
504define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
505; CHECK-LABEL: store_128_2:
506; CHECK: st1 { v0.8h, v1.8h }, [x0]
507entry:
508  tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
509  ret void
510}
511
512; Store 3 x v8f16 without interleaving: the st1x3 intrinsic should select a three-register ST1 from v0.8h-v2.8h.
513define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
514; CHECK-LABEL: store_128_3:
515; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
516entry:
517  tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
518  ret void
519}
520
521; Store 4 x v8f16 without interleaving: the st1x4 intrinsic should select a four-register ST1 from v0.8h-v3.8h.
522define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
523; CHECK-LABEL: store_128_4:
524; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
525entry:
526  tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
527  ret void
528}
529