; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s

; Simple load of v4f16
define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_64:
; CHECK: ldr d0, [x0]
entry:
  %0 = load <4 x half>, <4 x half>* %a, align 8
  ret <4 x half> %0
}

; Simple load of v8f16
define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_128:
; CHECK: ldr q0, [x0]
entry:
  %0 = load <8 x half>, <8 x half>* %a, align 16
  ret <8 x half> %0
}

; Duplicating load to v4f16
define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_64:
; CHECK: ld1r { v0.4h }, [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <4 x half> undef, half %0, i32 0
  %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
  ret <4 x half> %2
}

; Duplicating load to v8f16
define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_128:
; CHECK: ld1r { v0.8h }, [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <8 x half> undef, half %0, i32 0
  %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %2
}

; Load to one lane of v4f16
define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
; CHECK-LABEL: load_lane_64:
; CHECK: ld1 { v0.h }[2], [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <4 x half> %b, half %0, i32 2
  ret <4 x half> %1
}

; Load to one lane of v8f16
define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
; CHECK-LABEL: load_lane_128:
; CHECK: ld1 { v0.h }[5], [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <8 x half> %b, half %0, i32 5
  ret <8 x half> %1
}

; Simple store of v4f16
define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_64:
; CHECK: str d0, [x0]
entry:
  store <4 x half> %b, <4 x half>* %a, align 8
  ret void
}

; Simple store of v8f16
define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_128:
; CHECK: str q0, [x0]
entry:
  store <8 x half> %b, <8 x half>* %a, align 16
  ret void
}

; Store from one lane of v4f16
define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_lane_64:
; CHECK: st1 { v0.h }[2], [x0]
entry:
  %0 = extractelement <4 x half> %b, i32 2
  store half %0, half* %a, align 2
  ret void
}

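; Storing lane 0 needs no lane-indexed st1: lane 0 is the low halfword of v0,
; so a plain str of h0 suffices.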
define void @store_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_lane0_64:
; CHECK: str h0, [x0]
entry:
  %0 = extractelement <4 x half> %b, i32 0
  store half %0, half* %a, align 2
  ret void
}

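; The scaled str immediate form only encodes non-negative offsets, so a
; negative offset selects the unscaled stur form.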
define void @storeu_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: storeu_lane0_64:
; CHECK: stur h0, [x{{[0-9]+}}, #-2]
entry:
  %0 = getelementptr half, half* %a, i64 -1
  %1 = extractelement <4 x half> %b, i32 0
  store half %1, half* %0, align 2
  ret void
}

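; st1 only accepts a plain base register, so the register offset is first
; added into a GPR before the lane store.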
define void @storero_lane_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
; CHECK-LABEL: storero_lane_64:
; CHECK: st1 { v0.h }[2], [x{{[0-9]+}}]
entry:
  %0 = getelementptr half, half* %a, i64 %c
  %1 = extractelement <4 x half> %b, i32 2
  store half %1, half* %0, align 2
  ret void
}

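; Lane 0 again lowers to str, which does support the register-offset
; (lsl #1) addressing mode.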
define void @storero_lane0_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
; CHECK-LABEL: storero_lane0_64:
; CHECK: str h0, [x0, x1, lsl #1]
entry:
  %0 = getelementptr half, half* %a, i64 %c
  %1 = extractelement <4 x half> %b, i32 0
  store half %1, half* %0, align 2
  ret void
}

; Store from one lane of v8f16
define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_lane_128:
; CHECK: st1 { v0.h }[5], [x0]
entry:
  %0 = extractelement <8 x half> %b, i32 5
  store half %0, half* %a, align 2
  ret void
}

define void @store_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_lane0_128:
; CHECK: str h0, [x0]
entry:
  %0 = extractelement <8 x half> %b, i32 0
  store half %0, half* %a, align 2
  ret void
}

define void @storeu_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: storeu_lane0_128:
; CHECK: stur h0, [x{{[0-9]+}}, #-2]
entry:
  %0 = getelementptr half, half* %a, i64 -1
  %1 = extractelement <8 x half> %b, i32 0
  store half %1, half* %0, align 2
  ret void
}

define void @storero_lane_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
; CHECK-LABEL: storero_lane_128:
; CHECK: st1 { v0.h }[4], [x{{[0-9]+}}]
entry:
  %0 = getelementptr half, half* %a, i64 %c
  %1 = extractelement <8 x half> %b, i32 4
  store half %1, half* %0, align 2
  ret void
}

define void @storero_lane0_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
; CHECK-LABEL: storero_lane0_128:
; CHECK: str h0, [x0, x1, lsl #1]
entry:
  %0 = getelementptr half, half* %a, i64 %c
  %1 = extractelement <8 x half> %b, i32 0
  store half %1, half* %0, align 2
  ret void
}

; NEON intrinsics - (de-)interleaving loads and stores
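; ldN/stN move N vectors while de-interleaving/interleaving elements with a
; stride of N in memory.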
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)

; Load 2 x v4f16 with de-interleaving
define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_2:
; CHECK: ld2 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_3:
; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_4:
; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 with interleaving
define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_interleave_64_2:
; CHECK: st2 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 with interleaving
define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_interleave_64_3:
; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 with interleaving
define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_interleave_64_4:
; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}

; Load 2 x v8f16 with de-interleaving
define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_2:
; CHECK: ld2 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_3:
; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_4:
; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 with interleaving
define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_interleave_128_2:
; CHECK: st2 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 with interleaving
define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_interleave_128_3:
; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 with interleaving
define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_interleave_128_4:
; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}

; NEON intrinsics - duplicating loads
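; ldNr loads N consecutive half values and broadcasts each one to every lane
; of its result vector.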
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)

; Load 2 x v4f16 with duplication
define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
; CHECK-LABEL: load_dup_64_2:
; CHECK: ld2r { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
; CHECK-LABEL: load_dup_64_3:
; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
; CHECK-LABEL: load_dup_64_4:
; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 2 x v8f16 with duplication
define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
; CHECK-LABEL: load_dup_128_2:
; CHECK: ld2r { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
; CHECK-LABEL: load_dup_128_3:
; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
; CHECK-LABEL: load_dup_128_4:
; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}


; NEON intrinsics - loads and stores to/from one lane
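; ldNlane/stNlane move one element per vector at the given lane index; on
; loads, the remaining lanes are preserved from the input vectors.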
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)

; Load one lane of 2 x v4f16
define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: load_lane_64_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load one lane of 3 x v4f16
define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: load_lane_64_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load one lane of 4 x v4f16
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: load_lane_64_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store one lane of 2 x v4f16
define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_lane_64_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v4f16
define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_lane_64_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v4f16
define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_lane_64_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret void
}

; Load one lane of 2 x v8f16
define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: load_lane_128_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load one lane of 3 x v8f16
define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: load_lane_128_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load one lane of 4 x v8f16
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: load_lane_128_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store one lane of 2 x v8f16
define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_lane_128_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v8f16
define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_lane_128_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v8f16
define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_lane_128_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret void
}

; NEON intrinsics - load/store without interleaving
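; ld1xN/st1xN move N vectors to/from consecutive memory with no element
; rearrangement.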
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)

; Load 2 x v4f16 without de-interleaving
define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_2:
; CHECK: ld1 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_3:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_4:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 without interleaving
define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_64_2:
; CHECK: st1 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 without interleaving
define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_64_3:
; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 without interleaving
define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_64_4:
; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}

; Load 2 x v8f16 without de-interleaving
define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_2:
; CHECK: ld1 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_3:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_4:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 without interleaving
define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_128_2:
; CHECK: st1 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 without interleaving
define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_128_3:
; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 without interleaving
define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_128_4:
; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}

; Minimal definitions (assumed nounwind) for the #0/#1 attribute groups
; referenced above, so the file parses.
attributes #0 = { nounwind }
attributes #1 = { nounwind }