1; RUN: llc -mtriple aarch64-arm-none-eabi -asm-verbose=1 -mattr=+bf16 %s -o - | FileCheck %s
2
3%struct.bfloat16x4x2_t = type { [2 x <4 x bfloat>] }
4%struct.bfloat16x8x2_t = type { [2 x <8 x bfloat>] }
5%struct.bfloat16x4x3_t = type { [3 x <4 x bfloat>] }
6%struct.bfloat16x8x3_t = type { [3 x <8 x bfloat>] }
7%struct.bfloat16x4x4_t = type { [4 x <4 x bfloat>] }
8%struct.bfloat16x8x4_t = type { [4 x <8 x bfloat>] }
9
; Whole-vector load of <4 x bfloat> through a bitcast of the scalar pointer;
; the checks expect one 64-bit ldr into d0.
define <4 x bfloat> @test_vld1_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ret
entry:
  %vec.ptr = bitcast bfloat* %ptr to <4 x bfloat>*
  %vec = load <4 x bfloat>, <4 x bfloat>* %vec.ptr, align 2
  ret <4 x bfloat> %vec
}
20
; Whole-vector load of <8 x bfloat> through a bitcast of the scalar pointer;
; the checks expect one 128-bit ldr into q0.
define <8 x bfloat> @test_vld1q_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1q_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vec.ptr = bitcast bfloat* %ptr to <8 x bfloat>*
  %vec = load <8 x bfloat>, <8 x bfloat>* %vec.ptr, align 2
  ret <8 x bfloat> %vec
}
31
; Scalar bfloat load inserted into lane 0 of %src; the checks expect a
; single-lane ld1 into v0.h[0].
define <4 x bfloat> @test_vld1_lane_bf16(bfloat* nocapture readonly %ptr, <4 x bfloat> %src) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    ld1 { v0.h }[0], [x0]
; CHECK:    ret
entry:
  %elt = load bfloat, bfloat* %ptr, align 2
  %merged = insertelement <4 x bfloat> %src, bfloat %elt, i32 0
  ret <4 x bfloat> %merged
}
42
; Scalar bfloat load inserted into the last lane (7) of %src; the checks
; expect a single-lane ld1 into v0.h[7].
define <8 x bfloat> @test_vld1q_lane_bf16(bfloat* nocapture readonly %ptr, <8 x bfloat> %src) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1q_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1 { v0.h }[7], [x0]
; CHECK-NEXT:    ret
entry:
  %elt = load bfloat, bfloat* %ptr, align 2
  %merged = insertelement <8 x bfloat> %src, bfloat %elt, i32 7
  ret <8 x bfloat> %merged
}
53
; Scalar bfloat load splatted across all four lanes (insert into lane 0, then
; shuffle with an all-zero mask); the checks expect a single ld1r.
define <4 x bfloat> @test_vld1_dup_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1_dup_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1r { v0.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %elt = load bfloat, bfloat* %ptr, align 2
  %seed = insertelement <4 x bfloat> undef, bfloat %elt, i32 0
  %splat = shufflevector <4 x bfloat> %seed, <4 x bfloat> undef, <4 x i32> zeroinitializer
  ret <4 x bfloat> %splat
}
65
; ld1x2 intrinsic on <4 x bfloat>: both result vectors are repacked into the
; bfloat16x4x2_t aggregate; the checks expect one two-register ld1.
define %struct.bfloat16x4x2_t @test_vld1_bf16_x2(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1_bf16_x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1 { v0.4h, v1.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %pair = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat> } %pair, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat> } %pair, 1
  %res.0 = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x2_t %res.0, <4 x bfloat> %v1, 0, 1
  ret %struct.bfloat16x4x2_t %res.1
}
79
80declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat*) nounwind
81
; ld1x2 intrinsic on <8 x bfloat>: both result vectors are repacked into the
; bfloat16x8x2_t aggregate; the checks expect one two-register ld1.
define %struct.bfloat16x8x2_t @test_vld1q_bf16_x2(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1q_bf16_x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1 { v0.8h, v1.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %pair = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr)
  %v0 = extractvalue { <8 x bfloat>, <8 x bfloat> } %pair, 0
  %v1 = extractvalue { <8 x bfloat>, <8 x bfloat> } %pair, 1
  %res.0 = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x8x2_t %res.0, <8 x bfloat> %v1, 0, 1
  ret %struct.bfloat16x8x2_t %res.1
}
95
96; Function Attrs: argmemonly nounwind readonly
97declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat*) nounwind
98
; ld1x3 intrinsic on <4 x bfloat>: the three result vectors are repacked into
; the bfloat16x4x3_t aggregate; the checks expect one three-register ld1.
define %struct.bfloat16x4x3_t @test_vld1_bf16_x3(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1_bf16_x3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1 { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %triple = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 1
  %v2 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 2
  %res.0 = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x3_t %res.0, <4 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x4x3_t %res.1, <4 x bfloat> %v2, 0, 2
  ret %struct.bfloat16x4x3_t %res.2
}
114
115; Function Attrs: argmemonly nounwind readonly
116declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat*) nounwind
117
; ld1x3 intrinsic on <8 x bfloat>: the three result vectors are repacked into
; the bfloat16x8x3_t aggregate; the checks expect one three-register ld1.
define %struct.bfloat16x8x3_t @test_vld1q_bf16_x3(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1q_bf16_x3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1 { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %triple = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr)
  %v0 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %triple, 0
  %v1 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %triple, 1
  %v2 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %triple, 2
  %res.0 = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x8x3_t %res.0, <8 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x8x3_t %res.1, <8 x bfloat> %v2, 0, 2
  ret %struct.bfloat16x8x3_t %res.2
}
133
134; Function Attrs: argmemonly nounwind readonly
135declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat*) nounwind
136
; ld1x4 intrinsic on <4 x bfloat>: the four result vectors are repacked into
; the bfloat16x4x4_t aggregate; the checks expect one four-register ld1.
define %struct.bfloat16x4x4_t @test_vld1_bf16_x4(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1_bf16_x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %quad = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 1
  %v2 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 2
  %v3 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 3
  %res.0 = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x4_t %res.0, <4 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x4x4_t %res.1, <4 x bfloat> %v2, 0, 2
  %res.3 = insertvalue %struct.bfloat16x4x4_t %res.2, <4 x bfloat> %v3, 0, 3
  ret %struct.bfloat16x4x4_t %res.3
}
154
155; Function Attrs: argmemonly nounwind readonly
156declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat*) nounwind
157
; ld1x4 intrinsic on <8 x bfloat>: the four result vectors are repacked into
; the bfloat16x8x4_t aggregate; the checks expect one four-register ld1.
define %struct.bfloat16x8x4_t @test_vld1q_bf16_x4(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1q_bf16_x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %quad = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr)
  %v0 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 0
  %v1 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 1
  %v2 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 2
  %v3 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 3
  %res.0 = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x8x4_t %res.0, <8 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x8x4_t %res.1, <8 x bfloat> %v2, 0, 2
  %res.3 = insertvalue %struct.bfloat16x8x4_t %res.2, <8 x bfloat> %v3, 0, 3
  ret %struct.bfloat16x8x4_t %res.3
}
175
176; Function Attrs: argmemonly nounwind readonly
177declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat*) nounwind
178
; Scalar bfloat load splatted across all eight lanes (insert into lane 0, then
; shuffle with an all-zero mask); the checks expect a single ld1r.
define <8 x bfloat> @test_vld1q_dup_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld1q_dup_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1r { v0.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %elt = load bfloat, bfloat* %ptr, align 2
  %seed = insertelement <8 x bfloat> undef, bfloat %elt, i32 0
  %splat = shufflevector <8 x bfloat> %seed, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %splat
}
190
; ld2 intrinsic on <4 x bfloat> (pointer bitcast to the vector type first);
; the two results are repacked into bfloat16x4x2_t. Expects one ld2.
define %struct.bfloat16x4x2_t @test_vld2_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld2_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld2 { v0.4h, v1.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %vec.ptr = bitcast bfloat* %ptr to <4 x bfloat>*
  %pair = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %vec.ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat> } %pair, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat> } %pair, 1
  %res.0 = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x2_t %res.0, <4 x bfloat> %v1, 0, 1
  ret %struct.bfloat16x4x2_t %res.1
}
205
206; Function Attrs: argmemonly nounwind readonly
207declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>*) nounwind
208
; ld2 intrinsic on <8 x bfloat> (pointer bitcast to the vector type first);
; the two results are repacked into bfloat16x8x2_t. Expects one ld2.
define %struct.bfloat16x8x2_t @test_vld2q_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld2q_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld2 { v0.8h, v1.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %vec.ptr = bitcast bfloat* %ptr to <8 x bfloat>*
  %pair = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %vec.ptr)
  %v0 = extractvalue { <8 x bfloat>, <8 x bfloat> } %pair, 0
  %v1 = extractvalue { <8 x bfloat>, <8 x bfloat> } %pair, 1
  %res.0 = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x8x2_t %res.0, <8 x bfloat> %v1, 0, 1
  ret %struct.bfloat16x8x2_t %res.1
}
223
224; Function Attrs: argmemonly nounwind readonly
225declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>*) nounwind
; ld2lane intrinsic on <4 x bfloat>, lane 1: the two incoming vectors are
; unpacked from %src.coerce, updated by the lane load and repacked into
; bfloat16x4x2_t. Expects a single-lane ld2 on index 1.
define %struct.bfloat16x4x2_t @test_vld2_lane_bf16(bfloat* %ptr, [2 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld2_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    ld2 { v0.h, v1.h }[1], [x0]
; CHECK:    ret
entry:
  %src.0 = extractvalue [2 x <4 x bfloat>] %src.coerce, 0
  %src.1 = extractvalue [2 x <4 x bfloat>] %src.coerce, 1
  %raw.ptr = bitcast bfloat* %ptr to i8*
  %pair = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.0, <4 x bfloat> %src.1, i64 1, i8* %raw.ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat> } %pair, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat> } %pair, 1
  %res.0 = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x2_t %res.0, <4 x bfloat> %v1, 0, 1
  ret %struct.bfloat16x4x2_t %res.1
}
242
243; Function Attrs: argmemonly nounwind readonly
244declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i64, i8*) nounwind
245
; ld2lane intrinsic on <8 x bfloat>, lane 7: the two incoming vectors are
; unpacked from %src.coerce, updated by the lane load and repacked into
; bfloat16x8x2_t. Expects a single-lane ld2 on index 7.
define %struct.bfloat16x8x2_t @test_vld2q_lane_bf16(bfloat* %ptr, [2 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld2q_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    ld2 { v0.h, v1.h }[7], [x0]
; CHECK:    ret
entry:
  %src.0 = extractvalue [2 x <8 x bfloat>] %src.coerce, 0
  %src.1 = extractvalue [2 x <8 x bfloat>] %src.coerce, 1
  %raw.ptr = bitcast bfloat* %ptr to i8*
  %pair = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.0, <8 x bfloat> %src.1, i64 7, i8* %raw.ptr)
  %v0 = extractvalue { <8 x bfloat>, <8 x bfloat> } %pair, 0
  %v1 = extractvalue { <8 x bfloat>, <8 x bfloat> } %pair, 1
  %res.0 = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x8x2_t %res.0, <8 x bfloat> %v1, 0, 1
  ret %struct.bfloat16x8x2_t %res.1
}
262
263; Function Attrs: argmemonly nounwind readonly
264declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i64, i8*) nounwind
265
; ld3 intrinsic on <4 x bfloat> (pointer bitcast to the vector type first);
; the three results are repacked into bfloat16x4x3_t. Expects one ld3.
define %struct.bfloat16x4x3_t @test_vld3_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld3_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld3 { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %vec.ptr = bitcast bfloat* %ptr to <4 x bfloat>*
  %triple = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %vec.ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 1
  %v2 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 2
  %res.0 = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x3_t %res.0, <4 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x4x3_t %res.1, <4 x bfloat> %v2, 0, 2
  ret %struct.bfloat16x4x3_t %res.2
}
282
283; Function Attrs: argmemonly nounwind readonly
284declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>*) nounwind
285
; ld3 intrinsic on <8 x bfloat> (pointer bitcast to the vector type first);
; the three results are repacked into bfloat16x8x3_t. Expects one ld3.
define %struct.bfloat16x8x3_t @test_vld3q_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld3q_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld3 { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %vec.ptr = bitcast bfloat* %ptr to <8 x bfloat>*
  %triple = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %vec.ptr)
  %v0 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %triple, 0
  %v1 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %triple, 1
  %v2 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %triple, 2
  %res.0 = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x8x3_t %res.0, <8 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x8x3_t %res.1, <8 x bfloat> %v2, 0, 2
  ret %struct.bfloat16x8x3_t %res.2
}
302
303; Function Attrs: argmemonly nounwind readonly
304declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>*) nounwind
305
; ld3lane intrinsic on <4 x bfloat>, lane 1: the three incoming vectors are
; unpacked from %src.coerce, updated by the lane load and repacked into
; bfloat16x4x3_t. Expects a single-lane ld3 on index 1.
define %struct.bfloat16x4x3_t @test_vld3_lane_bf16(bfloat* %ptr, [3 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld3_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    ld3 { v0.h, v1.h, v2.h }[1], [x0]
; CHECK:    ret
entry:
  %src.0 = extractvalue [3 x <4 x bfloat>] %src.coerce, 0
  %src.1 = extractvalue [3 x <4 x bfloat>] %src.coerce, 1
  %src.2 = extractvalue [3 x <4 x bfloat>] %src.coerce, 2
  %raw.ptr = bitcast bfloat* %ptr to i8*
  %triple = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.0, <4 x bfloat> %src.1, <4 x bfloat> %src.2, i64 1, i8* %raw.ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 1
  %v2 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 2
  %res.0 = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x3_t %res.0, <4 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x4x3_t %res.1, <4 x bfloat> %v2, 0, 2
  ret %struct.bfloat16x4x3_t %res.2
}
325
326; Function Attrs: argmemonly nounwind readonly
327declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8*) nounwind
328
; ld3lane intrinsic on <8 x bfloat>, lane 7: the three incoming vectors are
; unpacked from %src.coerce, updated by the lane load and repacked into
; bfloat16x8x3_t. Expects a single-lane ld3 on index 7.
;
; The instruction and return checks below previously used a misspelled
; directive name, so FileCheck skipped them and this function's codegen was
; effectively unverified. They now use the same directive form as the other
; multi-register lane tests in this file.
define %struct.bfloat16x8x3_t @test_vld3q_lane_bf16(bfloat* %ptr, [3 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld3q_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    ld3 { v0.h, v1.h, v2.h }[7], [x0]
; CHECK:    ret
entry:
  %src.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 2
  %0 = bitcast bfloat* %ptr to i8*
  %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0)
  %vld3_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 0
  %vld3_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 1
  %vld3_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 2
  %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3_lane.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3_lane.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3_lane.fca.2.extract, 0, 2
  ret %struct.bfloat16x8x3_t %.fca.0.2.insert
}
348
349; Function Attrs: argmemonly nounwind readonly
350declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8*) nounwind
351
; ld4 intrinsic on <4 x bfloat> (pointer bitcast to the vector type first);
; the four results are repacked into bfloat16x4x4_t. Expects one ld4.
define %struct.bfloat16x4x4_t @test_vld4_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %vec.ptr = bitcast bfloat* %ptr to <4 x bfloat>*
  %quad = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %vec.ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 1
  %v2 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 2
  %v3 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 3
  %res.0 = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x4_t %res.0, <4 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x4x4_t %res.1, <4 x bfloat> %v2, 0, 2
  %res.3 = insertvalue %struct.bfloat16x4x4_t %res.2, <4 x bfloat> %v3, 0, 3
  ret %struct.bfloat16x4x4_t %res.3
}
370
371; Function Attrs: argmemonly nounwind readonly
372declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>*) nounwind
373
; ld4 intrinsic on <8 x bfloat> (pointer bitcast to the vector type first);
; the four results are repacked into bfloat16x8x4_t. Expects one ld4.
define %struct.bfloat16x8x4_t @test_vld4q_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4q_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %vec.ptr = bitcast bfloat* %ptr to <8 x bfloat>*
  %quad = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %vec.ptr)
  %v0 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 0
  %v1 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 1
  %v2 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 2
  %v3 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 3
  %res.0 = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x8x4_t %res.0, <8 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x8x4_t %res.1, <8 x bfloat> %v2, 0, 2
  %res.3 = insertvalue %struct.bfloat16x8x4_t %res.2, <8 x bfloat> %v3, 0, 3
  ret %struct.bfloat16x8x4_t %res.3
}
392
393; Function Attrs: argmemonly nounwind readonly
394declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>*) nounwind
395
; ld4lane intrinsic on <4 x bfloat>, lane 1: the four incoming vectors are
; unpacked from %src.coerce, updated by the lane load and repacked into
; bfloat16x4x4_t. Expects a single-lane ld4 on index 1.
define %struct.bfloat16x4x4_t @test_vld4_lane_bf16(bfloat* %ptr, [4 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x0]
; CHECK:    ret
entry:
  %src.0 = extractvalue [4 x <4 x bfloat>] %src.coerce, 0
  %src.1 = extractvalue [4 x <4 x bfloat>] %src.coerce, 1
  %src.2 = extractvalue [4 x <4 x bfloat>] %src.coerce, 2
  %src.3 = extractvalue [4 x <4 x bfloat>] %src.coerce, 3
  %raw.ptr = bitcast bfloat* %ptr to i8*
  %quad = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.0, <4 x bfloat> %src.1, <4 x bfloat> %src.2, <4 x bfloat> %src.3, i64 1, i8* %raw.ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 1
  %v2 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 2
  %v3 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %quad, 3
  %res.0 = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x4_t %res.0, <4 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x4x4_t %res.1, <4 x bfloat> %v2, 0, 2
  %res.3 = insertvalue %struct.bfloat16x4x4_t %res.2, <4 x bfloat> %v3, 0, 3
  ret %struct.bfloat16x4x4_t %res.3
}
418
419; Function Attrs: argmemonly nounwind readonly
420declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8*) nounwind
421
; ld4lane intrinsic on <8 x bfloat>, lane 7: the four incoming vectors are
; unpacked from %src.coerce, updated by the lane load and repacked into
; bfloat16x8x4_t. Expects a single-lane ld4 on index 7.
define %struct.bfloat16x8x4_t @test_vld4q_lane_bf16(bfloat* %ptr, [4 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4q_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0]
; CHECK:    ret
entry:
  %src.0 = extractvalue [4 x <8 x bfloat>] %src.coerce, 0
  %src.1 = extractvalue [4 x <8 x bfloat>] %src.coerce, 1
  %src.2 = extractvalue [4 x <8 x bfloat>] %src.coerce, 2
  %src.3 = extractvalue [4 x <8 x bfloat>] %src.coerce, 3
  %raw.ptr = bitcast bfloat* %ptr to i8*
  %quad = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.0, <8 x bfloat> %src.1, <8 x bfloat> %src.2, <8 x bfloat> %src.3, i64 7, i8* %raw.ptr)
  %v0 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 0
  %v1 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 1
  %v2 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 2
  %v3 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %quad, 3
  %res.0 = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x8x4_t %res.0, <8 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x8x4_t %res.1, <8 x bfloat> %v2, 0, 2
  %res.3 = insertvalue %struct.bfloat16x8x4_t %res.2, <8 x bfloat> %v3, 0, 3
  ret %struct.bfloat16x8x4_t %res.3
}
444
445; Function Attrs: argmemonly nounwind readonly
446declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8*) nounwind
447
; ld2r intrinsic on <4 x bfloat>: the two results are repacked into
; bfloat16x4x2_t. Expects one ld2r.
define %struct.bfloat16x4x2_t @test_vld2_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld2_dup_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld2r { v0.4h, v1.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %pair = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat> } %pair, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat> } %pair, 1
  %res.0 = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x2_t %res.0, <4 x bfloat> %v1, 0, 1
  ret %struct.bfloat16x4x2_t %res.1
}
461
462; Function Attrs: argmemonly nounwind readonly
463declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat*) nounwind
464
; ld2r intrinsic on <8 x bfloat>: the two results are repacked into
; bfloat16x8x2_t. Expects one ld2r.
define %struct.bfloat16x8x2_t @test_vld2q_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld2q_dup_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld2r { v0.8h, v1.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %pair = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr)
  %v0 = extractvalue { <8 x bfloat>, <8 x bfloat> } %pair, 0
  %v1 = extractvalue { <8 x bfloat>, <8 x bfloat> } %pair, 1
  %res.0 = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x8x2_t %res.0, <8 x bfloat> %v1, 0, 1
  ret %struct.bfloat16x8x2_t %res.1
}
478
479; Function Attrs: argmemonly nounwind readonly
480declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat*) nounwind
481
; ld3r intrinsic on <4 x bfloat>: the three results are repacked into
; bfloat16x4x3_t. Expects one ld3r.
define %struct.bfloat16x4x3_t @test_vld3_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld3_dup_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld3r { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %triple = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr)
  %v0 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 0
  %v1 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 1
  %v2 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %triple, 2
  %res.0 = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x4x3_t %res.0, <4 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x4x3_t %res.1, <4 x bfloat> %v2, 0, 2
  ret %struct.bfloat16x4x3_t %res.2
}
497
498; Function Attrs: argmemonly nounwind readonly
499declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat*) nounwind
500
; ld3r intrinsic on <8 x bfloat>: the three results are repacked into
; bfloat16x8x3_t. Expects one ld3r.
define %struct.bfloat16x8x3_t @test_vld3q_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld3q_dup_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld3r { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %triple = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr)
  %v0 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %triple, 0
  %v1 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %triple, 1
  %v2 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %triple, 2
  %res.0 = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %v0, 0, 0
  %res.1 = insertvalue %struct.bfloat16x8x3_t %res.0, <8 x bfloat> %v1, 0, 1
  %res.2 = insertvalue %struct.bfloat16x8x3_t %res.1, <8 x bfloat> %v2, 0, 2
  ret %struct.bfloat16x8x3_t %res.2
}
516
; Intrinsic declaration used by test_vld3q_dup_bf16: takes a bfloat* and returns
; three <8 x bfloat> vectors. Only `nounwind` is written on the declaration.
; Function Attrs: argmemonly nounwind readonly
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat*) nounwind
519
; Calls the ld4r intrinsic on %ptr and repacks its four <4 x bfloat> results
; into %struct.bfloat16x4x4_t. The checks expect exactly an ld4r into v0-v3
; from [x0] immediately followed by ret.
define %struct.bfloat16x4x4_t @test_vld4_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4_dup_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %dup = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr)
  ; Pull the individual vectors out of the intrinsic's aggregate result.
  %dup.v0 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %dup, 0
  %dup.v1 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %dup, 1
  %dup.v2 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %dup, 2
  %dup.v3 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %dup, 3
  ; Rebuild the returned struct one array element at a time.
  %ret.0 = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %dup.v0, 0, 0
  %ret.1 = insertvalue %struct.bfloat16x4x4_t %ret.0, <4 x bfloat> %dup.v1, 0, 1
  %ret.2 = insertvalue %struct.bfloat16x4x4_t %ret.1, <4 x bfloat> %dup.v2, 0, 2
  %ret.3 = insertvalue %struct.bfloat16x4x4_t %ret.2, <4 x bfloat> %dup.v3, 0, 3
  ret %struct.bfloat16x4x4_t %ret.3
}
537
; Intrinsic declaration used by test_vld4_dup_bf16: takes a bfloat* and returns
; four <4 x bfloat> vectors. Only `nounwind` is written on the declaration.
; Function Attrs: argmemonly nounwind readonly
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat*) nounwind
540
; Calls the ld4r intrinsic on %ptr and repacks its four <8 x bfloat> results
; into %struct.bfloat16x8x4_t. The checks expect exactly an ld4r into v0-v3
; (8h arrangement) from [x0] immediately followed by ret.
define %struct.bfloat16x8x4_t @test_vld4q_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4q_dup_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %dup = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr)
  ; Pull the individual vectors out of the intrinsic's aggregate result.
  %dup.v0 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %dup, 0
  %dup.v1 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %dup, 1
  %dup.v2 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %dup, 2
  %dup.v3 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %dup, 3
  ; Rebuild the returned struct one array element at a time.
  %ret.0 = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %dup.v0, 0, 0
  %ret.1 = insertvalue %struct.bfloat16x8x4_t %ret.0, <8 x bfloat> %dup.v1, 0, 1
  %ret.2 = insertvalue %struct.bfloat16x8x4_t %ret.1, <8 x bfloat> %dup.v2, 0, 2
  %ret.3 = insertvalue %struct.bfloat16x8x4_t %ret.2, <8 x bfloat> %dup.v3, 0, 3
  ret %struct.bfloat16x8x4_t %ret.3
}
558
; Intrinsic declaration used by test_vld4q_dup_bf16: takes a bfloat* and returns
; four <8 x bfloat> vectors. Only `nounwind` is written on the declaration.
; Function Attrs: argmemonly nounwind readonly
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat*) nounwind
561
; Stores the whole <4 x bfloat> %val through %ptr (reinterpreted as a vector
; pointer, align 8). The checks expect exactly `str d0, [x0]` then ret.
define void @test_vst1_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
entry:
  %vec.ptr = bitcast bfloat* %ptr to <4 x bfloat>*
  store <4 x bfloat> %val, <4 x bfloat>* %vec.ptr, align 8
  ret void
}
572
; Stores the whole <8 x bfloat> %val through %ptr (reinterpreted as a vector
; pointer, align 16). The checks expect exactly `str q0, [x0]` then ret.
define void @test_vst1q_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vec.ptr = bitcast bfloat* %ptr to <8 x bfloat>*
  store <8 x bfloat> %val, <8 x bfloat>* %vec.ptr, align 16
  ret void
}
583
; Extracts element 1 of the <4 x bfloat> %val and stores that scalar to %ptr.
; The checks look for an st1 of lane 1 of v0.h to [x0] and a ret after it
; (neither is required to be on the very next output line).
define void @test_vst1_lane_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st1 { v0.h }[1], [x0]
; CHECK:    ret
entry:
  %lane1 = extractelement <4 x bfloat> %val, i32 1
  store bfloat %lane1, bfloat* %ptr, align 2
  ret void
}
594
; Extracts element 7 of the <8 x bfloat> %val and stores that scalar to %ptr.
; The checks expect exactly an st1 of lane 7 of v0.h to [x0], then ret.
define void @test_vst1q_lane_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    st1 { v0.h }[7], [x0]
; CHECK-NEXT:    ret
entry:
  %lane7 = extractelement <8 x bfloat> %val, i32 7
  store bfloat %lane7, bfloat* %ptr, align 2
  ret void
}
605
; Splits the [2 x <4 x bfloat>] argument into its two vectors and passes them,
; with %ptr, to the st1x2 intrinsic. The checks look for a two-register st1
; (4h arrangement) to [x0] and a ret after it.
define void @test_vst1_bf16_x2(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16_x2:
; CHECK:       // %bb.0: // %entry
; CHECK:    st1 { v0.4h, v1.4h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
  tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat> %vec0, <4 x bfloat> %vec1, bfloat* %ptr)
  ret void
}
617
; Intrinsic declaration used by test_vst1_bf16_x2: two <4 x bfloat> operands
; followed by a nocapture bfloat*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, bfloat* nocapture) nounwind
620
; Splits the [2 x <8 x bfloat>] argument into its two vectors and passes them,
; with %ptr, to the st1x2 intrinsic. The checks look for a two-register st1
; (8h arrangement) to [x0] and a ret after it.
define void @test_vst1q_bf16_x2(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16_x2:
; CHECK:       // %bb.0: // %entry
; CHECK:    st1 { v0.8h, v1.8h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
  tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat> %vec0, <8 x bfloat> %vec1, bfloat* %ptr)
  ret void
}
632
; Intrinsic declaration used by test_vst1q_bf16_x2: two <8 x bfloat> operands
; followed by a nocapture bfloat*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, bfloat* nocapture) nounwind
635
; Splits the [3 x <4 x bfloat>] argument into its three vectors and passes
; them, with %ptr, to the st1x3 intrinsic. The checks look for a
; three-register st1 (4h arrangement) to [x0] and a ret after it.
define void @test_vst1_bf16_x3(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16_x3:
; CHECK:       // %bb.0: // %entry
; CHECK:    st1 { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
  tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x bfloat> %vec2, bfloat* %ptr)
  ret void
}
648
; Intrinsic declaration used by test_vst1_bf16_x3: three <4 x bfloat> operands
; followed by a nocapture bfloat*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, bfloat* nocapture) nounwind
651
; Splits the [3 x <8 x bfloat>] argument into its three vectors and passes
; them, with %ptr, to the st1x3 intrinsic. The checks look for a
; three-register st1 (8h arrangement) to [x0] and a ret after it.
define void @test_vst1q_bf16_x3(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16_x3:
; CHECK:       // %bb.0: // %entry
; CHECK:    st1 { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
  tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat> %vec0, <8 x bfloat> %vec1, <8 x bfloat> %vec2, bfloat* %ptr)
  ret void
}
664
; Intrinsic declaration used by test_vst1q_bf16_x3: three <8 x bfloat> operands
; followed by a nocapture bfloat*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, bfloat* nocapture) nounwind
667
; Splits the [4 x <4 x bfloat>] argument into its four vectors and passes
; them, with %ptr, to the st1x4 intrinsic. The checks look for a
; four-register st1 (4h arrangement) to [x0] and a ret after it.
; Function Attrs: nounwind
define void @test_vst1_bf16_x4(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16_x4:
; CHECK:       // %bb.0: // %entry
; CHECK:    st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
  %vec3 = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
  tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x bfloat> %vec2, <4 x bfloat> %vec3, bfloat* %ptr)
  ret void
}
682
; Intrinsic declaration used by test_vst1_bf16_x4: four <4 x bfloat> operands
; followed by a nocapture bfloat*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, bfloat* nocapture) nounwind
685
; Splits the [4 x <8 x bfloat>] argument into its four vectors and passes
; them, with %ptr, to the st1x4 intrinsic. The checks look for a
; four-register st1 (8h arrangement) to [x0] and a ret after it.
define void @test_vst1q_bf16_x4(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16_x4:
; CHECK:       // %bb.0: // %entry
; CHECK:    st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
  %vec3 = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
  tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat> %vec0, <8 x bfloat> %vec1, <8 x bfloat> %vec2, <8 x bfloat> %vec3, bfloat* %ptr)
  ret void
}
699
; Intrinsic declaration used by test_vst1q_bf16_x4: four <8 x bfloat> operands
; followed by a nocapture bfloat*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, bfloat* nocapture) nounwind
702
; Splits the [2 x <4 x bfloat>] argument into its two vectors and passes them
; to the st2 intrinsic together with %ptr cast to i8*. The checks look for a
; two-register st2 (4h arrangement) to [x0] and a ret after it.
define void @test_vst2_bf16(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st2 { v0.4h, v1.4h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat> %vec0, <4 x bfloat> %vec1, i8* %ptr.i8)
  ret void
}
715
; Intrinsic declaration used by test_vst2_bf16: two <4 x bfloat> operands
; followed by a nocapture i8*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i8* nocapture) nounwind
718
; Splits the [2 x <8 x bfloat>] argument into its two vectors and passes them
; to the st2 intrinsic together with %ptr cast to i8*. The checks look for a
; two-register st2 (8h arrangement) to [x0] and a ret after it.
define void @test_vst2q_bf16(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2q_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st2 { v0.8h, v1.8h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat> %vec0, <8 x bfloat> %vec1, i8* %ptr.i8)
  ret void
}
731
; Intrinsic declaration used by test_vst2q_bf16: two <8 x bfloat> operands
; followed by a nocapture i8*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i8* nocapture) nounwind
734
; Splits the [2 x <4 x bfloat>] argument into its two vectors and passes them
; to the st2lane intrinsic with lane index 1 and %ptr cast to i8*. The checks
; look for an st2 of lane 1 of v0.h/v1.h to [x0] and a ret after it.
define void @test_vst2_lane_bf16(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st2 { v0.h, v1.h }[1], [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat> %vec0, <4 x bfloat> %vec1, i64 1, i8* %ptr.i8)
  ret void
}
747
; Intrinsic declaration used by test_vst2_lane_bf16: two <4 x bfloat> operands,
; an i64 (the caller passes the lane index here), then a nocapture i8*. Only
; `nounwind` is written as a function attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) nounwind
750
; Splits the [2 x <8 x bfloat>] argument into its two vectors and passes them
; to the st2lane intrinsic with lane index 7 and %ptr cast to i8*. The checks
; look for an st2 of lane 7 of v0.h/v1.h to [x0] and a ret after it.
; Function Attrs: nounwind
define void @test_vst2q_lane_bf16(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2q_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st2 { v0.h, v1.h }[7], [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat> %vec0, <8 x bfloat> %vec1, i64 7, i8* %ptr.i8)
  ret void
}
764
; Intrinsic declaration used by test_vst2q_lane_bf16: two <8 x bfloat> operands,
; an i64 (the caller passes the lane index here), then a nocapture i8*. Only
; `nounwind` is written as a function attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) nounwind
767
; Splits the [3 x <4 x bfloat>] argument into its three vectors and passes
; them to the st3 intrinsic together with %ptr cast to i8*. The checks look
; for a three-register st3 (4h arrangement) to [x0] and a ret after it.
; Function Attrs: nounwind
define void @test_vst3_bf16(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st3 { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x bfloat> %vec2, i8* %ptr.i8)
  ret void
}
782
; Intrinsic declaration used by test_vst3_bf16: three <4 x bfloat> operands
; followed by a nocapture i8*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i8* nocapture) nounwind
785
; Splits the [3 x <8 x bfloat>] argument into its three vectors and passes
; them to the st3 intrinsic together with %ptr cast to i8*. The checks look
; for a three-register st3 (8h arrangement) to [x0] and a ret after it.
; Function Attrs: nounwind
define void @test_vst3q_bf16(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3q_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st3 { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat> %vec0, <8 x bfloat> %vec1, <8 x bfloat> %vec2, i8* %ptr.i8)
  ret void
}
800
; Intrinsic declaration used by test_vst3q_bf16: three <8 x bfloat> operands
; followed by a nocapture i8*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8* nocapture) nounwind
803
; Splits the [3 x <4 x bfloat>] argument into its three vectors and passes
; them to the st3lane intrinsic with lane index 1 and %ptr cast to i8*. The
; checks look for an st3 of lane 1 of v0.h-v2.h to [x0] and a ret after it.
; Function Attrs: nounwind
define void @test_vst3_lane_bf16(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st3 { v0.h, v1.h, v2.h }[1], [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x bfloat> %vec2, i64 1, i8* %ptr.i8)
  ret void
}
818
; Intrinsic declaration used by test_vst3_lane_bf16: three <4 x bfloat>
; operands, an i64 (the caller passes the lane index here), then a nocapture
; i8*. Only `nounwind` is written as a function attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) nounwind
821
; Splits the [3 x <8 x bfloat>] argument into its three vectors and passes
; them to the st3lane intrinsic with lane index 7 and %ptr cast to i8*. The
; checks look for an st3 of lane 7 of v0.h-v2.h to [x0] and a ret after it.
; Function Attrs: nounwind
define void @test_vst3q_lane_bf16(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3q_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st3 { v0.h, v1.h, v2.h }[7], [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat> %vec0, <8 x bfloat> %vec1, <8 x bfloat> %vec2, i64 7, i8* %ptr.i8)
  ret void
}
836
; Intrinsic declaration used by test_vst3q_lane_bf16: three <8 x bfloat>
; operands, an i64 (the caller passes the lane index here), then a nocapture
; i8*. Only `nounwind` is written as a function attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) nounwind
839
; Splits the [4 x <4 x bfloat>] argument into its four vectors and passes
; them to the st4 intrinsic together with %ptr cast to i8*. The checks look
; for a four-register st4 (4h arrangement) to [x0] and a ret after it.
; Function Attrs: nounwind
define void @test_vst4_bf16(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
  %vec3 = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x bfloat> %vec2, <4 x bfloat> %vec3, i8* %ptr.i8)
  ret void
}
855
; Intrinsic declaration used by test_vst4_bf16: four <4 x bfloat> operands
; followed by a nocapture i8*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i8* nocapture) nounwind
858
; Splits the [4 x <8 x bfloat>] argument into its four vectors and passes
; them to the st4 intrinsic together with %ptr cast to i8*. The checks look
; for a four-register st4 (8h arrangement) to [x0] and a ret after it.
; Function Attrs: nounwind
define void @test_vst4q_bf16(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4q_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
  %vec3 = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat> %vec0, <8 x bfloat> %vec1, <8 x bfloat> %vec2, <8 x bfloat> %vec3, i8* %ptr.i8)
  ret void
}
874
; Intrinsic declaration used by test_vst4q_bf16: four <8 x bfloat> operands
; followed by a nocapture i8*. Only `nounwind` is written as a function
; attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8* nocapture) nounwind
877
; Splits the [4 x <4 x bfloat>] argument into its four vectors and passes
; them to the st4lane intrinsic with lane index 1 and %ptr cast to i8*. The
; checks look for an st4 of lane 1 of v0.h-v3.h to [x0] and a ret after it.
; Function Attrs: nounwind
define void @test_vst4_lane_bf16(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st4 { v0.h, v1.h, v2.h, v3.h }[1], [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
  %vec3 = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x bfloat> %vec2, <4 x bfloat> %vec3, i64 1, i8* %ptr.i8)
  ret void
}
893
; Intrinsic declaration used by test_vst4_lane_bf16: four <4 x bfloat>
; operands, an i64 (the caller passes the lane index here), then a nocapture
; i8*. Only `nounwind` is written as a function attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) nounwind
896
; Splits the [4 x <8 x bfloat>] argument into its four vectors and passes
; them to the st4lane intrinsic with lane index 7 and %ptr cast to i8*. The
; checks look for an st4 of lane 7 of v0.h-v3.h to [x0] and a ret after it.
; Function Attrs: nounwind
define void @test_vst4q_lane_bf16(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4q_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK:    st4 { v0.h, v1.h, v2.h, v3.h }[7], [x0]
; CHECK:    ret
entry:
  %vec0 = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
  %vec1 = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
  %vec2 = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
  %vec3 = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
  ; The intrinsic is declared with an i8* pointer operand.
  %ptr.i8 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat> %vec0, <8 x bfloat> %vec1, <8 x bfloat> %vec2, <8 x bfloat> %vec3, i64 7, i8* %ptr.i8)
  ret void
}
912
; Intrinsic declaration used by test_vst4q_lane_bf16: four <8 x bfloat>
; operands, an i64 (the caller passes the lane index here), then a nocapture
; i8*. Only `nounwind` is written as a function attribute on the declaration.
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) nounwind
915
916
917