1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=armv8.6a-arm-none-eabi -mattr=+bf16,+neon,+fullfp16 < %s | FileCheck %s
3; FIXME: Remove fullfp16 once bfloat arguments and returns lowering stops
4; depending on it.
5
; Plain D-register load: %ptr is reinterpreted as <4 x bfloat>* and loaded with
; align 2. The assertions expect a single vld1.16 into d0 with no address
; alignment qualifier.
6define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_bf16(bfloat* nocapture readonly %ptr) {
7; CHECK-LABEL: test_vld1_bf16:
8; CHECK:       @ %bb.0: @ %entry
9; CHECK-NEXT:    vld1.16 {d0}, [r0]
10; CHECK-NEXT:    bx lr
11entry:
12  %0 = bitcast bfloat* %ptr to <4 x bfloat>*
13  %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2
14  ret <4 x bfloat> %1
15}
16
; Plain Q-register load: %ptr is reinterpreted as <8 x bfloat>* and loaded with
; align 2. The assertions expect a single vld1.16 filling the d0/d1 pair.
17define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_bf16(bfloat* nocapture readonly %ptr) {
18; CHECK-LABEL: test_vld1q_bf16:
19; CHECK:       @ %bb.0: @ %entry
20; CHECK-NEXT:    vld1.16 {d0, d1}, [r0]
21; CHECK-NEXT:    bx lr
22entry:
23  %0 = bitcast bfloat* %ptr to <8 x bfloat>*
24  %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2
25  ret <8 x bfloat> %1
26}
27
; Single-lane load into a D register: one bfloat is loaded (align 2) and
; inserted at index 0 of %src. The assertions expect the scalar load and the
; insertelement to be selected as one lane-form vld1.16 on d0[0].
28define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_lane_bf16(bfloat* nocapture readonly %ptr, <4 x bfloat> %src) {
29; CHECK-LABEL: test_vld1_lane_bf16:
30; CHECK:       @ %bb.0: @ %entry
31; CHECK-NEXT:    vld1.16 {d0[0]}, [r0:16]
32; CHECK-NEXT:    bx lr
33entry:
34  %0 = load bfloat, bfloat* %ptr, align 2
35  %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
36  ret <4 x bfloat> %vld1_lane
37}
38
; Single-lane load into a Q register: one bfloat is loaded (align 2) and
; inserted at index 7 of the 8-element %src. The assertions expect a lane-form
; vld1.16 on d1[3], i.e. element 7 of the vector is addressed as lane 3 of d1.
39define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_lane_bf16(bfloat* nocapture readonly %ptr, <8 x bfloat> %src) {
40; CHECK-LABEL: test_vld1q_lane_bf16:
41; CHECK:       @ %bb.0: @ %entry
42; CHECK-NEXT:    vld1.16 {d1[3]}, [r0:16]
43; CHECK-NEXT:    bx lr
44entry:
45  %0 = load bfloat, bfloat* %ptr, align 2
46  %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
47  ret <8 x bfloat> %vld1_lane
48}
49
; Load-and-duplicate into a D register: one bfloat is loaded, inserted at
; index 0 of an undef vector and splatted with a zeroinitializer shuffle mask.
; The assertions expect the whole sequence to become one all-lanes vld1.16 on
; d0[].
50define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_dup_bf16(bfloat* nocapture readonly %ptr) {
51; CHECK-LABEL: test_vld1_dup_bf16:
52; CHECK:       @ %bb.0: @ %entry
53; CHECK-NEXT:    vld1.16 {d0[]}, [r0:16]
54; CHECK-NEXT:    bx lr
55entry:
56  %0 = load bfloat, bfloat* %ptr, align 2
57  %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
58  %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
59  ret <4 x bfloat> %lane
60}
61
; Two-register vld1: calls llvm.arm.neon.vld1x2 for <4 x bfloat>, then bitcasts
; both results to <2 x i32> and packs them into the [2 x <2 x i32>] return
; value. The assertions expect one vld1.16 of d0-d1 with the :64 address
; qualifier.
62define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld1_bf16_x2(bfloat* %ptr) {
63; CHECK-LABEL: test_vld1_bf16_x2:
64; CHECK:       @ %bb.0: @ %entry
65; CHECK-NEXT:    vld1.16 {d0, d1}, [r0:64]
66; CHECK-NEXT:    bx lr
67entry:
68  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %ptr)
69  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
70  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
71  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
72  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
73  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
74  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
75  ret [2 x <2 x i32>] %.fca.1.insert
76}
77
; Two-register vld1, Q form: calls llvm.arm.neon.vld1x2 for <8 x bfloat>, then
; bitcasts both results to <4 x i32> and packs them into [2 x <4 x i32>].
; The assertions expect one vld1.16 of d0-d3 with the :256 address qualifier.
78define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld1q_bf16_x2(bfloat* %ptr) {
79; CHECK-LABEL: test_vld1q_bf16_x2:
80; CHECK:       @ %bb.0: @ %entry
81; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0:256]
82; CHECK-NEXT:    bx lr
83entry:
84  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %ptr)
85  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
86  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
87  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
88  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
89  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
90  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
91  ret [2 x <4 x i32>] %.fca.1.insert
92}
93
; Three-register vld1: calls llvm.arm.neon.vld1x3 for <4 x bfloat>, then
; bitcasts the three results to <2 x i32> and packs them into [3 x <2 x i32>].
; The assertions expect one vld1.16 of d0-d2 with the :64 address qualifier.
94define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld1_bf16_x3(bfloat* %ptr) {
95; CHECK-LABEL: test_vld1_bf16_x3:
96; CHECK:       @ %bb.0: @ %entry
97; CHECK-NEXT:    vld1.16 {d0, d1, d2}, [r0:64]
98; CHECK-NEXT:    bx lr
99entry:
100  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat* %ptr)
101  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
102  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
103  %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
104  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
105  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
106  %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
107  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
108  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
109  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
110  ret [3 x <2 x i32>] %.fca.2.insert
111}
112
; Three-register vld1, Q form: calls llvm.arm.neon.vld1x3 for <8 x bfloat>,
; then bitcasts the three results to <4 x i32> and packs them into
; [3 x <4 x i32>]. The assertions expect the load to be split into two
; three-register vld1.16 instructions (d0-d2, then d3-d5), the first one using
; the writeback form "[r0:64]!".
113define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld1q_bf16_x3(bfloat* %ptr) {
114; CHECK-LABEL: test_vld1q_bf16_x3:
115; CHECK:       @ %bb.0: @ %entry
116; CHECK-NEXT:    vld1.16 {d0, d1, d2}, [r0:64]!
117; CHECK-NEXT:    vld1.16 {d3, d4, d5}, [r0:64]
118; CHECK-NEXT:    bx lr
119entry:
120  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat* %ptr)
121  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
122  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
123  %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
124  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
125  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
126  %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
127  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
128  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
129  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
130  ret [3 x <4 x i32>] %.fca.2.insert
131}
132
; Four-register vld1: calls llvm.arm.neon.vld1x4 for <4 x bfloat>, then
; bitcasts the four results to <2 x i32> and packs them into [4 x <2 x i32>].
; The assertions expect one vld1.16 of d0-d3 with the :256 address qualifier.
133define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld1_bf16_x4(bfloat* %ptr) {
134; CHECK-LABEL: test_vld1_bf16_x4:
135; CHECK:       @ %bb.0: @ %entry
136; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0:256]
137; CHECK-NEXT:    bx lr
138entry:
139  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat* %ptr)
140  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
141  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
142  %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
143  %vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3
144  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
145  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
146  %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
147  %3 = bitcast <4 x bfloat> %vld1xN.fca.3.extract to <2 x i32>
148  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
149  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
150  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
151  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
152  ret [4 x <2 x i32>] %.fca.3.insert
153}
154
; Four-register vld1, Q form: calls llvm.arm.neon.vld1x4 for <8 x bfloat>, then
; bitcasts the four results to <4 x i32> and packs them into [4 x <4 x i32>].
; The assertions expect the load to be split into two four-register vld1.16
; instructions (d0-d3, then d4-d7), the first one using the writeback form
; "[r0:256]!".
155define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld1q_bf16_x4(bfloat* %ptr) {
156; CHECK-LABEL: test_vld1q_bf16_x4:
157; CHECK:       @ %bb.0: @ %entry
158; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0:256]!
159; CHECK-NEXT:    vld1.16 {d4, d5, d6, d7}, [r0:256]
160; CHECK-NEXT:    bx lr
161entry:
162  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat* %ptr)
163  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
164  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
165  %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
166  %vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3
167  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
168  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
169  %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
170  %3 = bitcast <8 x bfloat> %vld1xN.fca.3.extract to <4 x i32>
171  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
172  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
173  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
174  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
175  ret [4 x <4 x i32>] %.fca.3.insert
176}
177
; Load-and-duplicate into a Q register: one bfloat is loaded, inserted at
; index 0 of an undef vector and splatted across all eight elements with a
; zeroinitializer shuffle mask. The assertions expect one all-lanes vld1.16
; covering both d0[] and d1[].
178define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_dup_bf16(bfloat* nocapture readonly %ptr) {
179; CHECK-LABEL: test_vld1q_dup_bf16:
180; CHECK:       @ %bb.0: @ %entry
181; CHECK-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
182; CHECK-NEXT:    bx lr
183entry:
184  %0 = load bfloat, bfloat* %ptr, align 2
185  %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
186  %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
187  ret <8 x bfloat> %lane
188}
189
; vld2, D form: calls llvm.arm.neon.vld2 for <4 x bfloat> through an i8*
; pointer with a trailing i32 2 argument, then bitcasts both results to
; <2 x i32> and packs them into [2 x <2 x i32>]. The assertions expect one
; vld2.16 of d0-d1 with no address alignment qualifier.
190define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_bf16(bfloat* %ptr) {
191; CHECK-LABEL: test_vld2_bf16:
192; CHECK:       @ %bb.0: @ %entry
193; CHECK-NEXT:    vld2.16 {d0, d1}, [r0]
194; CHECK-NEXT:    bx lr
195entry:
196  %0 = bitcast bfloat* %ptr to i8*
197  %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2)
198  %vld2_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 0
199  %vld2_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 1
200  %1 = bitcast <4 x bfloat> %vld2_v.fca.0.extract to <2 x i32>
201  %2 = bitcast <4 x bfloat> %vld2_v.fca.1.extract to <2 x i32>
202  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %1, 0
203  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
204  ret [2 x <2 x i32>] %.fca.1.insert
205}
206
; vld2, Q form: calls llvm.arm.neon.vld2 for <8 x bfloat> through an i8*
; pointer with a trailing i32 2 argument, then bitcasts both results to
; <4 x i32> and packs them into [2 x <4 x i32>]. The assertions expect one
; vld2.16 of d0-d3.
207define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_bf16(bfloat* %ptr) {
208; CHECK-LABEL: test_vld2q_bf16:
209; CHECK:       @ %bb.0: @ %entry
210; CHECK-NEXT:    vld2.16 {d0, d1, d2, d3}, [r0]
211; CHECK-NEXT:    bx lr
212entry:
213  %0 = bitcast bfloat* %ptr to i8*
214  %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2)
215  %vld2q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 0
216  %vld2q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 1
217  %1 = bitcast <8 x bfloat> %vld2q_v.fca.0.extract to <4 x i32>
218  %2 = bitcast <8 x bfloat> %vld2q_v.fca.1.extract to <4 x i32>
219  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %1, 0
220  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
221  ret [2 x <4 x i32>] %.fca.1.insert
222}
223
; vld2 lane, D form: unpacks %src.coerce into two <4 x bfloat> vectors, calls
; llvm.arm.neon.vld2lane with lane index 1 (and a trailing i32 2 argument), and
; repacks the results as [2 x <2 x i32>]. The assertions expect the two
; register "kill" annotations followed by one lane-form vld2.16 on d0[1]/d1[1].
224define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_lane_bf16(bfloat* %ptr, [2 x <2 x i32>] %src.coerce) {
225; CHECK-LABEL: test_vld2_lane_bf16:
226; CHECK:       @ %bb.0: @ %entry
227; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
228; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
229; CHECK-NEXT:    vld2.16 {d0[1], d1[1]}, [r0]
230; CHECK-NEXT:    bx lr
231entry:
232  %src.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %src.coerce, 0
233  %src.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %src.coerce, 1
234  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
235  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
236  %2 = bitcast bfloat* %ptr to i8*
237  %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
238  %vld2_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 0
239  %vld2_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 1
240  %3 = bitcast <4 x bfloat> %vld2_lane_v.fca.0.extract to <2 x i32>
241  %4 = bitcast <4 x bfloat> %vld2_lane_v.fca.1.extract to <2 x i32>
242  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %3, 0
243  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %4, 1
244  ret [2 x <2 x i32>] %.fca.1.insert
245}
246
; vld2 lane, Q form: unpacks %src.coerce into two <8 x bfloat> vectors, calls
; llvm.arm.neon.vld2lane with lane index 7 (and a trailing i32 2 argument), and
; repacks the results as [2 x <4 x i32>]. The assertions expect one lane-form
; vld2.16 on d1[3]/d3[3], i.e. element 7 of each vector is addressed as lane 3
; of d1 and d3 respectively.
247define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_lane_bf16(bfloat* %ptr, [2 x <4 x i32>] %src.coerce) {
248; CHECK-LABEL: test_vld2q_lane_bf16:
249; CHECK:       @ %bb.0: @ %entry
250; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
251; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
252; CHECK-NEXT:    vld2.16 {d1[3], d3[3]}, [r0]
253; CHECK-NEXT:    bx lr
254entry:
255  %src.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %src.coerce, 0
256  %src.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %src.coerce, 1
257  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
258  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
259  %2 = bitcast bfloat* %ptr to i8*
260  %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
261  %vld2q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 0
262  %vld2q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 1
263  %3 = bitcast <8 x bfloat> %vld2q_lane_v.fca.0.extract to <4 x i32>
264  %4 = bitcast <8 x bfloat> %vld2q_lane_v.fca.1.extract to <4 x i32>
265  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %3, 0
266  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %4, 1
267  ret [2 x <4 x i32>] %.fca.1.insert
268}
269
; vld3, D form: calls llvm.arm.neon.vld3 for <4 x bfloat> through an i8*
; pointer with a trailing i32 2 argument, then bitcasts the three results to
; <2 x i32> and packs them into [3 x <2 x i32>]. The assertions expect one
; vld3.16 of d0-d2.
270define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_bf16(bfloat* %ptr) {
271; CHECK-LABEL: test_vld3_bf16:
272; CHECK:       @ %bb.0: @ %entry
273; CHECK-NEXT:    vld3.16 {d0, d1, d2}, [r0]
274; CHECK-NEXT:    bx lr
275entry:
276  %0 = bitcast bfloat* %ptr to i8*
277  %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8* %0, i32 2)
278  %vld3_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 0
279  %vld3_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 1
280  %vld3_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 2
281  %1 = bitcast <4 x bfloat> %vld3_v.fca.0.extract to <2 x i32>
282  %2 = bitcast <4 x bfloat> %vld3_v.fca.1.extract to <2 x i32>
283  %3 = bitcast <4 x bfloat> %vld3_v.fca.2.extract to <2 x i32>
284  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %1, 0
285  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
286  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
287  ret [3 x <2 x i32>] %.fca.2.insert
288}
289
; vld3, Q form: calls llvm.arm.neon.vld3 for <8 x bfloat> through an i8*
; pointer with a trailing i32 2 argument, then bitcasts the three results to
; <4 x i32> and packs them into [3 x <4 x i32>]. The assertions expect two
; vld3.16 instructions: the even registers d0/d2/d4 first (with writeback,
; "[r0]!"), then the odd registers d1/d3/d5.
290define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_bf16(bfloat* %ptr) {
291; CHECK-LABEL: test_vld3q_bf16:
292; CHECK:       @ %bb.0: @ %entry
293; CHECK-NEXT:    vld3.16 {d0, d2, d4}, [r0]!
294; CHECK-NEXT:    vld3.16 {d1, d3, d5}, [r0]
295; CHECK-NEXT:    bx lr
296entry:
297  %0 = bitcast bfloat* %ptr to i8*
298  %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8* %0, i32 2)
299  %vld3q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 0
300  %vld3q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 1
301  %vld3q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 2
302  %1 = bitcast <8 x bfloat> %vld3q_v.fca.0.extract to <4 x i32>
303  %2 = bitcast <8 x bfloat> %vld3q_v.fca.1.extract to <4 x i32>
304  %3 = bitcast <8 x bfloat> %vld3q_v.fca.2.extract to <4 x i32>
305  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %1, 0
306  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
307  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
308  ret [3 x <4 x i32>] %.fca.2.insert
309}
310
; vld3 lane, D form: unpacks %src.coerce into three <4 x bfloat> vectors, calls
; llvm.arm.neon.vld3lane with lane index 1 (and a trailing i32 2 argument), and
; repacks the results as [3 x <2 x i32>]. The assertions expect three register
; "kill" annotations followed by one lane-form vld3.16 on d0[1]/d1[1]/d2[1].
311define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_lane_bf16(bfloat* %ptr, [3 x <2 x i32>] %src.coerce) {
312; CHECK-LABEL: test_vld3_lane_bf16:
313; CHECK:       @ %bb.0: @ %entry
314; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
315; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
316; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
317; CHECK-NEXT:    vld3.16 {d0[1], d1[1], d2[1]}, [r0]
318; CHECK-NEXT:    bx lr
319entry:
320  %src.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %src.coerce, 0
321  %src.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %src.coerce, 1
322  %src.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %src.coerce, 2
323  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
324  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
325  %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
326  %3 = bitcast bfloat* %ptr to i8*
327  %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
328  %vld3_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 0
329  %vld3_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 1
330  %vld3_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 2
331  %4 = bitcast <4 x bfloat> %vld3_lane_v.fca.0.extract to <2 x i32>
332  %5 = bitcast <4 x bfloat> %vld3_lane_v.fca.1.extract to <2 x i32>
333  %6 = bitcast <4 x bfloat> %vld3_lane_v.fca.2.extract to <2 x i32>
334  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %4, 0
335  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %5, 1
336  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %6, 2
337  ret [3 x <2 x i32>] %.fca.2.insert
338}
339
; vld3 lane, Q form: unpacks %src.coerce into three <8 x bfloat> vectors, calls
; llvm.arm.neon.vld3lane with lane index 7 (and a trailing i32 2 argument), and
; repacks the results as [3 x <4 x i32>]. The assertions expect one lane-form
; vld3.16 on d1[3]/d3[3]/d5[3], i.e. element 7 of each vector is addressed as
; lane 3 of the odd D register.
340define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_lane_bf16(bfloat* %ptr, [3 x <4 x i32>] %src.coerce) {
341; CHECK-LABEL: test_vld3q_lane_bf16:
342; CHECK:       @ %bb.0: @ %entry
343; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
344; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
345; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
346; CHECK-NEXT:    vld3.16 {d1[3], d3[3], d5[3]}, [r0]
347; CHECK-NEXT:    bx lr
348entry:
349  %src.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %src.coerce, 0
350  %src.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %src.coerce, 1
351  %src.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %src.coerce, 2
352  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
353  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
354  %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
355  %3 = bitcast bfloat* %ptr to i8*
356  %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
357  %vld3q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 0
358  %vld3q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 1
359  %vld3q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 2
360  %4 = bitcast <8 x bfloat> %vld3q_lane_v.fca.0.extract to <4 x i32>
361  %5 = bitcast <8 x bfloat> %vld3q_lane_v.fca.1.extract to <4 x i32>
362  %6 = bitcast <8 x bfloat> %vld3q_lane_v.fca.2.extract to <4 x i32>
363  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %4, 0
364  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %5, 1
365  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %6, 2
366  ret [3 x <4 x i32>] %.fca.2.insert
367}
368
; vld4, D form: calls llvm.arm.neon.vld4 for <4 x bfloat> through an i8*
; pointer with a trailing i32 2 argument, then bitcasts the four results to
; <2 x i32> and packs them into [4 x <2 x i32>]. The assertions expect one
; vld4.16 of d0-d3.
369define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_bf16(bfloat* %ptr) {
370; CHECK-LABEL: test_vld4_bf16:
371; CHECK:       @ %bb.0: @ %entry
372; CHECK-NEXT:    vld4.16 {d0, d1, d2, d3}, [r0]
373; CHECK-NEXT:    bx lr
374entry:
375  %0 = bitcast bfloat* %ptr to i8*
376  %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8* %0, i32 2)
377  %vld4_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 0
378  %vld4_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 1
379  %vld4_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 2
380  %vld4_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 3
381  %1 = bitcast <4 x bfloat> %vld4_v.fca.0.extract to <2 x i32>
382  %2 = bitcast <4 x bfloat> %vld4_v.fca.1.extract to <2 x i32>
383  %3 = bitcast <4 x bfloat> %vld4_v.fca.2.extract to <2 x i32>
384  %4 = bitcast <4 x bfloat> %vld4_v.fca.3.extract to <2 x i32>
385  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %1, 0
386  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
387  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
388  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %4, 3
389  ret [4 x <2 x i32>] %.fca.3.insert
390}
391
; vld4, Q form: calls llvm.arm.neon.vld4 for <8 x bfloat> through an i8*
; pointer with a trailing i32 2 argument, then bitcasts the four results to
; <4 x i32> and packs them into [4 x <4 x i32>]. The assertions expect two
; vld4.16 instructions: the even registers d0/d2/d4/d6 first (with writeback,
; "[r0]!"), then the odd registers d1/d3/d5/d7.
392define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_bf16(bfloat* %ptr) {
393; CHECK-LABEL: test_vld4q_bf16:
394; CHECK:       @ %bb.0: @ %entry
395; CHECK-NEXT:    vld4.16 {d0, d2, d4, d6}, [r0]!
396; CHECK-NEXT:    vld4.16 {d1, d3, d5, d7}, [r0]
397; CHECK-NEXT:    bx lr
398entry:
399  %0 = bitcast bfloat* %ptr to i8*
400  %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8* %0, i32 2)
401  %vld4q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 0
402  %vld4q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 1
403  %vld4q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 2
404  %vld4q_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 3
405  %1 = bitcast <8 x bfloat> %vld4q_v.fca.0.extract to <4 x i32>
406  %2 = bitcast <8 x bfloat> %vld4q_v.fca.1.extract to <4 x i32>
407  %3 = bitcast <8 x bfloat> %vld4q_v.fca.2.extract to <4 x i32>
408  %4 = bitcast <8 x bfloat> %vld4q_v.fca.3.extract to <4 x i32>
409  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %1, 0
410  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
411  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
412  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %4, 3
413  ret [4 x <4 x i32>] %.fca.3.insert
414}
415
; vld4 lane, D form: unpacks %src.coerce into four <4 x bfloat> vectors, calls
; llvm.arm.neon.vld4lane with lane index 1 (and a trailing i32 2 argument), and
; repacks the results as [4 x <2 x i32>]. The assertions expect four register
; "kill" annotations followed by one lane-form vld4.16 on
; d0[1]/d1[1]/d2[1]/d3[1].
416define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_lane_bf16(bfloat* %ptr, [4 x <2 x i32>] %src.coerce) {
417; CHECK-LABEL: test_vld4_lane_bf16:
418; CHECK:       @ %bb.0: @ %entry
419; CHECK-NEXT:    @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
420; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
421; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
422; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
423; CHECK-NEXT:    vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0]
424; CHECK-NEXT:    bx lr
425entry:
426  %src.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %src.coerce, 0
427  %src.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %src.coerce, 1
428  %src.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %src.coerce, 2
429  %src.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %src.coerce, 3
430  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
431  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
432  %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
433  %3 = bitcast <2 x i32> %src.coerce.fca.3.extract to <4 x bfloat>
434  %4 = bitcast bfloat* %ptr to i8*
435  %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
436  %vld4_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 0
437  %vld4_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 1
438  %vld4_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 2
439  %vld4_lane_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 3
440  %5 = bitcast <4 x bfloat> %vld4_lane_v.fca.0.extract to <2 x i32>
441  %6 = bitcast <4 x bfloat> %vld4_lane_v.fca.1.extract to <2 x i32>
442  %7 = bitcast <4 x bfloat> %vld4_lane_v.fca.2.extract to <2 x i32>
443  %8 = bitcast <4 x bfloat> %vld4_lane_v.fca.3.extract to <2 x i32>
444  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %5, 0
445  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %6, 1
446  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %7, 2
447  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %8, 3
448  ret [4 x <2 x i32>] %.fca.3.insert
449}
450
; vld4 lane, Q form: unpacks %src.coerce into four <8 x bfloat> vectors, calls
; llvm.arm.neon.vld4lane with lane index 7 (and a trailing i32 2 argument), and
; repacks the results as [4 x <4 x i32>]. The assertions expect one lane-form
; vld4.16 on d1[3]/d3[3]/d5[3]/d7[3], i.e. element 7 of each vector is
; addressed as lane 3 of the odd D register.
451define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_lane_bf16(bfloat* %ptr, [4 x <4 x i32>] %src.coerce) {
452; CHECK-LABEL: test_vld4q_lane_bf16:
453; CHECK:       @ %bb.0: @ %entry
454; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
455; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
456; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
457; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
458; CHECK-NEXT:    vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0]
459; CHECK-NEXT:    bx lr
460entry:
461  %src.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %src.coerce, 0
462  %src.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %src.coerce, 1
463  %src.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %src.coerce, 2
464  %src.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %src.coerce, 3
465  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
466  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
467  %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
468  %3 = bitcast <4 x i32> %src.coerce.fca.3.extract to <8 x bfloat>
469  %4 = bitcast bfloat* %ptr to i8*
470  %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
471  %vld4q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 0
472  %vld4q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 1
473  %vld4q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 2
474  %vld4q_lane_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 3
475  %5 = bitcast <8 x bfloat> %vld4q_lane_v.fca.0.extract to <4 x i32>
476  %6 = bitcast <8 x bfloat> %vld4q_lane_v.fca.1.extract to <4 x i32>
477  %7 = bitcast <8 x bfloat> %vld4q_lane_v.fca.2.extract to <4 x i32>
478  %8 = bitcast <8 x bfloat> %vld4q_lane_v.fca.3.extract to <4 x i32>
479  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %5, 0
480  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %6, 1
481  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %7, 2
482  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %8, 3
483  ret [4 x <4 x i32>] %.fca.3.insert
484}
485
; vld2 dup, D form: calls llvm.arm.neon.vld2dup for <4 x bfloat> through an i8*
; pointer with a trailing i32 2 argument, then bitcasts both results to
; <2 x i32> and packs them into [2 x <2 x i32>]. The assertions expect one
; all-lanes vld2.16 on d0[]/d1[].
486define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_dup_bf16(bfloat* %ptr) {
487; CHECK-LABEL: test_vld2_dup_bf16:
488; CHECK:       @ %bb.0: @ %entry
489; CHECK-NEXT:    vld2.16 {d0[], d1[]}, [r0]
490; CHECK-NEXT:    bx lr
491entry:
492  %0 = bitcast bfloat* %ptr to i8*
493  %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, i32 2)
494  %vld2_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 0
495  %vld2_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 1
496  %1 = bitcast <4 x bfloat> %vld2_dup_v.fca.0.extract to <2 x i32>
497  %2 = bitcast <4 x bfloat> %vld2_dup_v.fca.1.extract to <2 x i32>
498  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %1, 0
499  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
500  ret [2 x <2 x i32>] %.fca.1.insert
501}
502
; vld2 dup, Q form: calls llvm.arm.neon.vld2dup for <8 x bfloat> through an i8*
; pointer with a trailing i32 2 argument, then bitcasts both results to
; <4 x i32> and packs them into [2 x <4 x i32>]. The assertions expect two
; all-lanes vld2.16 instructions, neither using writeback.
;
; NOTE(review): the first expected load targets d16[]/d18[] while the second
; targets d1[]/d3[], and no copy into d0/d2 is expected before the return. The
; sibling test_vld3q_dup_bf16 expects d0[]/d2[]/d4[] for its first load, so
; this pairing looks inconsistent. The assertions are autogenerated from llc
; output, so this presumably records a code-generation quirk rather than a
; typo in the test; confirm against current llc and regenerate with
; utils/update_llc_test_checks.py rather than editing the expected lines by
; hand.
503define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_dup_bf16(bfloat* %ptr) {
504; CHECK-LABEL: test_vld2q_dup_bf16:
505; CHECK:       @ %bb.0: @ %entry
506; CHECK-NEXT:    vld2.16 {d16[], d18[]}, [r0]
507; CHECK-NEXT:    vld2.16 {d1[], d3[]}, [r0]
508; CHECK-NEXT:    bx lr
509entry:
510  %0 = bitcast bfloat* %ptr to i8*
511  %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0, i32 2)
512  %vld2q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 0
513  %vld2q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 1
514  %1 = bitcast <8 x bfloat> %vld2q_dup_v.fca.0.extract to <4 x i32>
515  %2 = bitcast <8 x bfloat> %vld2q_dup_v.fca.1.extract to <4 x i32>
516  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %1, 0
517  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
518  ret [2 x <4 x i32>] %.fca.1.insert
519}
520
; vld3 dup, D form: calls llvm.arm.neon.vld3dup for <4 x bfloat> through an i8*
; pointer with a trailing i32 2 argument, then bitcasts the three results to
; <2 x i32> and packs them into [3 x <2 x i32>]. The assertions expect one
; all-lanes vld3.16 on d0[]/d1[]/d2[].
521define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_dup_bf16(bfloat* %ptr) {
522; CHECK-LABEL: test_vld3_dup_bf16:
523; CHECK:       @ %bb.0: @ %entry
524; CHECK-NEXT:    vld3.16 {d0[], d1[], d2[]}, [r0]
525; CHECK-NEXT:    bx lr
526entry:
527  %0 = bitcast bfloat* %ptr to i8*
528  %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8* %0, i32 2)
529  %vld3_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 0
530  %vld3_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 1
531  %vld3_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 2
532  %1 = bitcast <4 x bfloat> %vld3_dup_v.fca.0.extract to <2 x i32>
533  %2 = bitcast <4 x bfloat> %vld3_dup_v.fca.1.extract to <2 x i32>
534  %3 = bitcast <4 x bfloat> %vld3_dup_v.fca.2.extract to <2 x i32>
535  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %1, 0
536  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
537  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
538  ret [3 x <2 x i32>] %.fca.2.insert
539}
540
; <8 x bfloat> counterpart of the vld3dup test: calls
; llvm.arm.neon.vld3dup.v8bf16 and returns the results as [3 x <4 x i32>].
; The assertions expect two vld3.16 instructions from the same [r0] address,
; the first writing {d0[], d2[], d4[]} and the second {d1[], d3[], d5[]}.
541define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_dup_bf16(bfloat* %ptr) {
542; CHECK-LABEL: test_vld3q_dup_bf16:
543; CHECK:       @ %bb.0: @ %entry
544; CHECK-NEXT:    vld3.16 {d0[], d2[], d4[]}, [r0]
545; CHECK-NEXT:    vld3.16 {d1[], d3[], d5[]}, [r0]
546; CHECK-NEXT:    bx lr
547entry:
548  %0 = bitcast bfloat* %ptr to i8*
549  %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8* %0, i32 2)
550  %vld3q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 0
551  %vld3q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 1
552  %vld3q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 2
553  %1 = bitcast <8 x bfloat> %vld3q_dup_v.fca.0.extract to <4 x i32>
554  %2 = bitcast <8 x bfloat> %vld3q_dup_v.fca.1.extract to <4 x i32>
555  %3 = bitcast <8 x bfloat> %vld3q_dup_v.fca.2.extract to <4 x i32>
556  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %1, 0
557  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
558  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
559  ret [3 x <4 x i32>] %.fca.2.insert
560}
561
; Calls llvm.arm.neon.vld4dup.v4bf16 on %ptr (cast to i8*, trailing i32 2) and
; returns the four <4 x bfloat> results, bitcast to <2 x i32>, as a
; [4 x <2 x i32>]. The assertions expect a single vld4.16 with the register
; list {d0[], d1[], d2[], d3[]}.
562define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_dup_bf16(bfloat* %ptr) {
563; CHECK-LABEL: test_vld4_dup_bf16:
564; CHECK:       @ %bb.0: @ %entry
565; CHECK-NEXT:    vld4.16 {d0[], d1[], d2[], d3[]}, [r0]
566; CHECK-NEXT:    bx lr
567entry:
568  %0 = bitcast bfloat* %ptr to i8*
569  %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8* %0, i32 2)
570  %vld4_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 0
571  %vld4_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 1
572  %vld4_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 2
573  %vld4_dup_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 3
574  %1 = bitcast <4 x bfloat> %vld4_dup_v.fca.0.extract to <2 x i32>
575  %2 = bitcast <4 x bfloat> %vld4_dup_v.fca.1.extract to <2 x i32>
576  %3 = bitcast <4 x bfloat> %vld4_dup_v.fca.2.extract to <2 x i32>
577  %4 = bitcast <4 x bfloat> %vld4_dup_v.fca.3.extract to <2 x i32>
578  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %1, 0
579  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
580  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
581  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %4, 3
582  ret [4 x <2 x i32>] %.fca.3.insert
583}
584
; <8 x bfloat> counterpart of the vld4dup test: calls
; llvm.arm.neon.vld4dup.v8bf16 and returns the results as [4 x <4 x i32>].
; The assertions expect two vld4.16 instructions from the same [r0] address,
; the first writing {d0[], d2[], d4[], d6[]} and the second
; {d1[], d3[], d5[], d7[]}.
585define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_dup_bf16(bfloat* %ptr) {
586; CHECK-LABEL: test_vld4q_dup_bf16:
587; CHECK:       @ %bb.0: @ %entry
588; CHECK-NEXT:    vld4.16 {d0[], d2[], d4[], d6[]}, [r0]
589; CHECK-NEXT:    vld4.16 {d1[], d3[], d5[], d7[]}, [r0]
590; CHECK-NEXT:    bx lr
591entry:
592  %0 = bitcast bfloat* %ptr to i8*
593  %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8* %0, i32 2)
594  %vld4q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 0
595  %vld4q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 1
596  %vld4q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 2
597  %vld4q_dup_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 3
598  %1 = bitcast <8 x bfloat> %vld4q_dup_v.fca.0.extract to <4 x i32>
599  %2 = bitcast <8 x bfloat> %vld4q_dup_v.fca.1.extract to <4 x i32>
600  %3 = bitcast <8 x bfloat> %vld4q_dup_v.fca.2.extract to <4 x i32>
601  %4 = bitcast <8 x bfloat> %vld4q_dup_v.fca.3.extract to <4 x i32>
602  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %1, 0
603  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
604  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
605  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %4, 3
606  ret [4 x <4 x i32>] %.fca.3.insert
607}
608
; Passes the <4 x bfloat> argument %val to llvm.arm.neon.vst1.p0i8.v4bf16
; (pointer cast to i8*, trailing i32 2). The assertions expect a single
; vst1.16 {d0}, [r0].
609define arm_aapcs_vfpcc void @test_vst1_bf16(bfloat* %ptr, <4 x bfloat> %val) {
610; CHECK-LABEL: test_vst1_bf16:
611; CHECK:       @ %bb.0: @ %entry
612; CHECK-NEXT:    vst1.16 {d0}, [r0]
613; CHECK-NEXT:    bx lr
614entry:
615  %0 = bitcast bfloat* %ptr to i8*
616  tail call void @llvm.arm.neon.vst1.p0i8.v4bf16(i8* %0, <4 x bfloat> %val, i32 2)
617  ret void
618}
619
; Passes the <8 x bfloat> argument %val to llvm.arm.neon.vst1.p0i8.v8bf16
; (pointer cast to i8*, trailing i32 2). The assertions expect a single
; vst1.16 {d0, d1}, [r0].
620define arm_aapcs_vfpcc void @test_vst1q_bf16(bfloat* %ptr, <8 x bfloat> %val) {
621; CHECK-LABEL: test_vst1q_bf16:
622; CHECK:       @ %bb.0: @ %entry
623; CHECK-NEXT:    vst1.16 {d0, d1}, [r0]
624; CHECK-NEXT:    bx lr
625entry:
626  %0 = bitcast bfloat* %ptr to i8*
627  tail call void @llvm.arm.neon.vst1.p0i8.v8bf16(i8* %0, <8 x bfloat> %val, i32 2)
628  ret void
629}
630
; Extracts element 1 of the <4 x bfloat> %val and writes it with a plain
; `store` (align 2); no NEON intrinsic is called here. The assertions expect
; vmovx.f16 s0, s0 followed by vstr.16 s0, [r0], not a lane-form vst1.16.
631define arm_aapcs_vfpcc void @test_vst1_lane_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) {
632; CHECK-LABEL: test_vst1_lane_bf16:
633; CHECK:       @ %bb.0: @ %entry
634; CHECK-NEXT:    vmovx.f16 s0, s0
635; CHECK-NEXT:    vstr.16 s0, [r0]
636; CHECK-NEXT:    bx lr
637entry:
638  %0 = extractelement <4 x bfloat> %val, i32 1
639  store bfloat %0, bfloat* %ptr, align 2
640  ret void
641}
642
; Extracts element 7 of the <8 x bfloat> %val and writes it with a plain
; `store` (align 2); no NEON intrinsic is called here. The assertions expect
; vmovx.f16 s0, s3 followed by vstr.16 s0, [r0].
643define arm_aapcs_vfpcc void @test_vst1q_lane_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) {
644; CHECK-LABEL: test_vst1q_lane_bf16:
645; CHECK:       @ %bb.0: @ %entry
646; CHECK-NEXT:    vmovx.f16 s0, s3
647; CHECK-NEXT:    vstr.16 s0, [r0]
648; CHECK-NEXT:    bx lr
649entry:
650  %0 = extractelement <8 x bfloat> %val, i32 7
651  store bfloat %0, bfloat* %ptr, align 2
652  ret void
653}
654
; Unpacks the [2 x <2 x i32>] argument, bitcasts each element to <4 x bfloat>
; and passes both to llvm.arm.neon.vst1x2.p0bf16.v4bf16, which takes the
; bfloat* directly (no i8* cast, no trailing i32). The assertions expect
; vst1.16 {d0, d1}, [r0:64]; the `@ kill:` lines are part of the matched llc
; output.
655define arm_aapcs_vfpcc void @test_vst1_bf16_x2(bfloat* nocapture %ptr, [2 x <2 x i32>] %val.coerce) {
656; CHECK-LABEL: test_vst1_bf16_x2:
657; CHECK:       @ %bb.0: @ %entry
658; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
659; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
660; CHECK-NEXT:    vst1.16 {d0, d1}, [r0:64]
661; CHECK-NEXT:    bx lr
662entry:
663  %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
664  %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
665  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
666  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
667  tail call void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1)
668  ret void
669}
670
; Unpacks the [2 x <4 x i32>] argument, bitcasts each element to <8 x bfloat>
; and passes both to llvm.arm.neon.vst1x2.p0bf16.v8bf16 (bfloat* pointer, no
; trailing i32). The assertions expect a single
; vst1.16 {d0, d1, d2, d3}, [r0:256].
671define arm_aapcs_vfpcc void @test_vst1q_bf16_x2(bfloat* nocapture %ptr, [2 x <4 x i32>] %val.coerce) {
672; CHECK-LABEL: test_vst1q_bf16_x2:
673; CHECK:       @ %bb.0: @ %entry
674; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
675; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
676; CHECK-NEXT:    vst1.16 {d0, d1, d2, d3}, [r0:256]
677; CHECK-NEXT:    bx lr
678entry:
679  %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
680  %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
681  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
682  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
683  tail call void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1)
684  ret void
685}
686
; Unpacks the [3 x <2 x i32>] argument, bitcasts each element to <4 x bfloat>
; and passes all three to llvm.arm.neon.vst1x3.p0bf16.v4bf16 (bfloat* pointer,
; no trailing i32). The assertions expect a single
; vst1.16 {d0, d1, d2}, [r0:64].
687define arm_aapcs_vfpcc void @test_vst1_bf16_x3(bfloat* nocapture %ptr, [3 x <2 x i32>] %val.coerce) {
688; CHECK-LABEL: test_vst1_bf16_x3:
689; CHECK:       @ %bb.0: @ %entry
690; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
691; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
692; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
693; CHECK-NEXT:    vst1.16 {d0, d1, d2}, [r0:64]
694; CHECK-NEXT:    bx lr
695entry:
696  %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
697  %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
698  %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
699  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
700  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
701  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
702  tail call void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2)
703  ret void
704}
705
; Unpacks the [3 x <4 x i32>] argument, bitcasts each element to <8 x bfloat>
; and passes all three to llvm.arm.neon.vst1x3.p0bf16.v8bf16. The assertions
; expect the store to be split in two: vst1.16 {d0, d1, d2}, [r0:64]! (note
; the `!` suffix on the address) followed by vst1.16 {d3, d4, d5}, [r0:64].
706define arm_aapcs_vfpcc void @test_vst1q_bf16_x3(bfloat* nocapture %ptr, [3 x <4 x i32>] %val.coerce) {
707; CHECK-LABEL: test_vst1q_bf16_x3:
708; CHECK:       @ %bb.0: @ %entry
709; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
710; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
711; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
712; CHECK-NEXT:    vst1.16 {d0, d1, d2}, [r0:64]!
713; CHECK-NEXT:    vst1.16 {d3, d4, d5}, [r0:64]
714; CHECK-NEXT:    bx lr
715entry:
716  %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
717  %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
718  %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
719  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
720  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
721  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
722  tail call void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2)
723  ret void
724}
725
; Unpacks the [4 x <2 x i32>] argument, bitcasts each element to <4 x bfloat>
; and passes all four to llvm.arm.neon.vst1x4.p0bf16.v4bf16 (bfloat* pointer,
; no trailing i32). The assertions expect a single
; vst1.16 {d0, d1, d2, d3}, [r0:256].
726define arm_aapcs_vfpcc void @test_vst1_bf16_x4(bfloat* nocapture %ptr, [4 x <2 x i32>] %val.coerce) {
727; CHECK-LABEL: test_vst1_bf16_x4:
728; CHECK:       @ %bb.0: @ %entry
729; CHECK-NEXT:    @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
730; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
731; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
732; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
733; CHECK-NEXT:    vst1.16 {d0, d1, d2, d3}, [r0:256]
734; CHECK-NEXT:    bx lr
735entry:
736  %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
737  %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
738  %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
739  %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
740  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
741  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
742  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
743  %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
744  tail call void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3)
745  ret void
746}
747
; Unpacks the [4 x <4 x i32>] argument, bitcasts each element to <8 x bfloat>
; and passes all four to llvm.arm.neon.vst1x4.p0bf16.v8bf16. The assertions
; expect the store to be split in two: vst1.16 {d0, d1, d2, d3}, [r0:256]!
; (with the `!` suffix) followed by vst1.16 {d4, d5, d6, d7}, [r0:256].
748define arm_aapcs_vfpcc void @test_vst1q_bf16_x4(bfloat* nocapture %ptr, [4 x <4 x i32>] %val.coerce) {
749; CHECK-LABEL: test_vst1q_bf16_x4:
750; CHECK:       @ %bb.0: @ %entry
751; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
752; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
753; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
754; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
755; CHECK-NEXT:    vst1.16 {d0, d1, d2, d3}, [r0:256]!
756; CHECK-NEXT:    vst1.16 {d4, d5, d6, d7}, [r0:256]
757; CHECK-NEXT:    bx lr
758entry:
759  %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
760  %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
761  %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
762  %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
763  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
764  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
765  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
766  %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
767  tail call void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3)
768  ret void
769}
770
; Unpacks the [2 x <2 x i32>] argument, bitcasts each element to <4 x bfloat>
; and passes both to llvm.arm.neon.vst2.p0i8.v4bf16 (pointer cast to i8*,
; trailing i32 2). The assertions expect a single vst2.16 {d0, d1}, [r0].
771define arm_aapcs_vfpcc void @test_vst2_bf16(bfloat* %ptr, [2 x <2 x i32>] %val.coerce) {
772; CHECK-LABEL: test_vst2_bf16:
773; CHECK:       @ %bb.0: @ %entry
774; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
775; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
776; CHECK-NEXT:    vst2.16 {d0, d1}, [r0]
777; CHECK-NEXT:    bx lr
778entry:
779  %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
780  %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
781  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
782  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
783  %2 = bitcast bfloat* %ptr to i8*
784  tail call void @llvm.arm.neon.vst2.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 2)
785  ret void
786}
787
; Unpacks the [2 x <4 x i32>] argument, bitcasts each element to <8 x bfloat>
; and passes both to llvm.arm.neon.vst2.p0i8.v8bf16 (pointer cast to i8*,
; trailing i32 2). The assertions expect a single
; vst2.16 {d0, d1, d2, d3}, [r0].
788define arm_aapcs_vfpcc void @test_vst2q_bf16(bfloat* %ptr, [2 x <4 x i32>] %val.coerce) {
789; CHECK-LABEL: test_vst2q_bf16:
790; CHECK:       @ %bb.0: @ %entry
791; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
792; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
793; CHECK-NEXT:    vst2.16 {d0, d1, d2, d3}, [r0]
794; CHECK-NEXT:    bx lr
795entry:
796  %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
797  %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
798  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
799  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
800  %2 = bitcast bfloat* %ptr to i8*
801  tail call void @llvm.arm.neon.vst2.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 2)
802  ret void
803}
804
; Unpacks the [2 x <2 x i32>] argument, bitcasts each element to <4 x bfloat>
; and passes both to llvm.arm.neon.vst2lane.p0i8.v4bf16 with the operands
; i32 1, i32 2. The i32 1 operand shows up as lane index [1] in the expected
; vst2.16 {d0[1], d1[1]}, [r0].
805define arm_aapcs_vfpcc void @test_vst2_lane_bf16(bfloat* %ptr, [2 x <2 x i32>] %val.coerce) {
806; CHECK-LABEL: test_vst2_lane_bf16:
807; CHECK:       @ %bb.0: @ %entry
808; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
809; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
810; CHECK-NEXT:    vst2.16 {d0[1], d1[1]}, [r0]
811; CHECK-NEXT:    bx lr
812entry:
813  %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
814  %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
815  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
816  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
817  %2 = bitcast bfloat* %ptr to i8*
818  tail call void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
819  ret void
820}
821
; Unpacks the [2 x <4 x i32>] argument, bitcasts each element to <8 x bfloat>
; and passes both to llvm.arm.neon.vst2lane.p0i8.v8bf16 with the operands
; i32 7, i32 2. For the i32 7 operand the assertions expect lane [3] of d1
; and d3: vst2.16 {d1[3], d3[3]}, [r0].
822define arm_aapcs_vfpcc void @test_vst2q_lane_bf16(bfloat* %ptr, [2 x <4 x i32>] %val.coerce) {
823; CHECK-LABEL: test_vst2q_lane_bf16:
824; CHECK:       @ %bb.0: @ %entry
825; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
826; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
827; CHECK-NEXT:    vst2.16 {d1[3], d3[3]}, [r0]
828; CHECK-NEXT:    bx lr
829entry:
830  %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
831  %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
832  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
833  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
834  %2 = bitcast bfloat* %ptr to i8*
835  tail call void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
836  ret void
837}
838
; Unpacks the [3 x <2 x i32>] argument, bitcasts each element to <4 x bfloat>
; and passes all three to llvm.arm.neon.vst3.p0i8.v4bf16 (pointer cast to
; i8*, trailing i32 2). The assertions expect a single
; vst3.16 {d0, d1, d2}, [r0].
839define arm_aapcs_vfpcc void @test_vst3_bf16(bfloat* %ptr, [3 x <2 x i32>] %val.coerce) {
840; CHECK-LABEL: test_vst3_bf16:
841; CHECK:       @ %bb.0: @ %entry
842; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
843; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
844; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
845; CHECK-NEXT:    vst3.16 {d0, d1, d2}, [r0]
846; CHECK-NEXT:    bx lr
847entry:
848  %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
849  %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
850  %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
851  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
852  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
853  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
854  %3 = bitcast bfloat* %ptr to i8*
855  tail call void @llvm.arm.neon.vst3.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2)
856  ret void
857}
858
; Unpacks the [3 x <4 x i32>] argument, bitcasts each element to <8 x bfloat>
; and passes all three to llvm.arm.neon.vst3.p0i8.v8bf16 (trailing i32 2).
; The assertions expect the store to be split in two:
; vst3.16 {d0, d2, d4}, [r0]! (with the `!` suffix) followed by
; vst3.16 {d1, d3, d5}, [r0].
859define arm_aapcs_vfpcc void @test_vst3q_bf16(bfloat* %ptr, [3 x <4 x i32>] %val.coerce) {
860; CHECK-LABEL: test_vst3q_bf16:
861; CHECK:       @ %bb.0: @ %entry
862; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
863; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
864; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
865; CHECK-NEXT:    vst3.16 {d0, d2, d4}, [r0]!
866; CHECK-NEXT:    vst3.16 {d1, d3, d5}, [r0]
867; CHECK-NEXT:    bx lr
868entry:
869  %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
870  %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
871  %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
872  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
873  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
874  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
875  %3 = bitcast bfloat* %ptr to i8*
876  tail call void @llvm.arm.neon.vst3.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2)
877  ret void
878}
879
; Unpacks the [3 x <2 x i32>] argument, bitcasts each element to <4 x bfloat>
; and passes all three to llvm.arm.neon.vst3lane.p0i8.v4bf16 with the
; operands i32 1, i32 2. The i32 1 operand shows up as lane index [1] in the
; expected vst3.16 {d0[1], d1[1], d2[1]}, [r0].
880define arm_aapcs_vfpcc void @test_vst3_lane_bf16(bfloat* %ptr, [3 x <2 x i32>] %val.coerce) {
881; CHECK-LABEL: test_vst3_lane_bf16:
882; CHECK:       @ %bb.0: @ %entry
883; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
884; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
885; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
886; CHECK-NEXT:    vst3.16 {d0[1], d1[1], d2[1]}, [r0]
887; CHECK-NEXT:    bx lr
888entry:
889  %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
890  %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
891  %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
892  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
893  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
894  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
895  %3 = bitcast bfloat* %ptr to i8*
896  tail call void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
897  ret void
898}
899
; Unpacks the [3 x <4 x i32>] argument, bitcasts each element to <8 x bfloat>
; and passes all three to llvm.arm.neon.vst3lane.p0i8.v8bf16 with the
; operands i32 7, i32 2. For the i32 7 operand the assertions expect lane [3]
; of d1, d3 and d5: vst3.16 {d1[3], d3[3], d5[3]}, [r0].
900define arm_aapcs_vfpcc void @test_vst3q_lane_bf16(bfloat* %ptr, [3 x <4 x i32>] %val.coerce) {
901; CHECK-LABEL: test_vst3q_lane_bf16:
902; CHECK:       @ %bb.0: @ %entry
903; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
904; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
905; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
906; CHECK-NEXT:    vst3.16 {d1[3], d3[3], d5[3]}, [r0]
907; CHECK-NEXT:    bx lr
908entry:
909  %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
910  %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
911  %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
912  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
913  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
914  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
915  %3 = bitcast bfloat* %ptr to i8*
916  tail call void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
917  ret void
918}
919
; Unpacks the [4 x <2 x i32>] argument, bitcasts each element to <4 x bfloat>
; and passes all four to llvm.arm.neon.vst4.p0i8.v4bf16 (pointer cast to i8*,
; trailing i32 2). The assertions expect a single
; vst4.16 {d0, d1, d2, d3}, [r0].
920define arm_aapcs_vfpcc void @test_vst4_bf16(bfloat* %ptr, [4 x <2 x i32>] %val.coerce) {
921; CHECK-LABEL: test_vst4_bf16:
922; CHECK:       @ %bb.0: @ %entry
923; CHECK-NEXT:    @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
924; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
925; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
926; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
927; CHECK-NEXT:    vst4.16 {d0, d1, d2, d3}, [r0]
928; CHECK-NEXT:    bx lr
929entry:
930  %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
931  %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
932  %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
933  %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
934  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
935  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
936  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
937  %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
938  %4 = bitcast bfloat* %ptr to i8*
939  tail call void @llvm.arm.neon.vst4.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2)
940  ret void
941}
942
; Unpacks the [4 x <4 x i32>] argument, bitcasts each element to <8 x bfloat>
; and passes all four to llvm.arm.neon.vst4.p0i8.v8bf16 (trailing i32 2).
; The assertions expect the store to be split in two:
; vst4.16 {d0, d2, d4, d6}, [r0]! (with the `!` suffix) followed by
; vst4.16 {d1, d3, d5, d7}, [r0].
943define arm_aapcs_vfpcc void @test_vst4q_bf16(bfloat* %ptr, [4 x <4 x i32>] %val.coerce) {
944; CHECK-LABEL: test_vst4q_bf16:
945; CHECK:       @ %bb.0: @ %entry
946; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
947; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
948; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
949; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
950; CHECK-NEXT:    vst4.16 {d0, d2, d4, d6}, [r0]!
951; CHECK-NEXT:    vst4.16 {d1, d3, d5, d7}, [r0]
952; CHECK-NEXT:    bx lr
953entry:
954  %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
955  %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
956  %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
957  %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
958  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
959  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
960  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
961  %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
962  %4 = bitcast bfloat* %ptr to i8*
963  tail call void @llvm.arm.neon.vst4.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2)
964  ret void
965}
966
; Unpacks the [4 x <2 x i32>] argument, bitcasts each element to <4 x bfloat>
; and passes all four to llvm.arm.neon.vst4lane.p0i8.v4bf16 with the operands
; i32 1, i32 2. The i32 1 operand shows up as lane index [1] in the expected
; vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0].
967define arm_aapcs_vfpcc void @test_vst4_lane_bf16(bfloat* %ptr, [4 x <2 x i32>] %val.coerce) {
968; CHECK-LABEL: test_vst4_lane_bf16:
969; CHECK:       @ %bb.0: @ %entry
970; CHECK-NEXT:    @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
971; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
972; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
973; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
974; CHECK-NEXT:    vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0]
975; CHECK-NEXT:    bx lr
976entry:
977  %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
978  %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
979  %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
980  %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
981  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
982  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
983  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
984  %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
985  %4 = bitcast bfloat* %ptr to i8*
986  tail call void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
987  ret void
988}
989
; Unpacks the [4 x <4 x i32>] argument, bitcasts each element to <8 x bfloat>
; and passes all four to llvm.arm.neon.vst4lane.p0i8.v8bf16 with the operands
; i32 7, i32 2. For the i32 7 operand the assertions expect lane [3] of d1,
; d3, d5 and d7: vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0].
990define arm_aapcs_vfpcc void @test_vst4q_lane_bf16(bfloat* %ptr, [4 x <4 x i32>] %val.coerce) {
991; CHECK-LABEL: test_vst4q_lane_bf16:
992; CHECK:       @ %bb.0: @ %entry
993; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
994; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
995; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
996; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
997; CHECK-NEXT:    vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0]
998; CHECK-NEXT:    bx lr
999entry:
1000  %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
1001  %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
1002  %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
1003  %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
1004  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
1005  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
1006  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
1007  %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
1008  %4 = bitcast bfloat* %ptr to i8*
1009  tail call void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
1010  ret void
1011}
1012
; Declarations of the NEON intrinsics referenced by the tests in this file,
; each in a <4 x bfloat> (v4bf16) and an <8 x bfloat> (v8bf16) flavour.

; vld2/vld3/vld4: take (i8*, i32) and return 2, 3 or 4 vectors.
; NOTE(review): no callers are visible in this part of the file; presumably
; used by the load tests earlier on.
1013declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8*, i32)
1014declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8*, i32)
1015declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8*, i32)
1016declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8*, i32)
1017declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8*, i32)
1018declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8*, i32)
1019
; vld2dup/vld3dup/vld4dup: same (i8*, i32) signature as the group above;
; called by the test_vld*_dup_bf16 functions.
1020declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8*, i32)
1021declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8*, i32)
1022declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8*, i32)
1023declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8*, i32)
1024declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8*, i32)
1025declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8*, i32)
1026
; vld1x2/vld1x3/vld1x4: take a bfloat* directly and have no trailing i32.
; NOTE(review): no callers are visible in this part of the file.
1027declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat*)
1028declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat*)
1029declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat*)
1030declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat*)
1031declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat*)
1032declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat*)
1033
; vld2lane/vld3lane/vld4lane: take the i8* pointer, the incoming vectors and
; two trailing i32 operands.
; NOTE(review): no callers are visible in this part of the file.
1034declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8*, <4 x bfloat>, <4 x bfloat>, i32, i32)
1035declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8*, <8 x bfloat>, <8 x bfloat>, i32, i32)
1036declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1037declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1038declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1039declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1040
; vst1/vst2/vst3/vst4: take the i8* pointer, 1-4 vectors and a trailing i32;
; called by test_vst1_bf16 through test_vst4q_bf16.
1041declare void @llvm.arm.neon.vst1.p0i8.v4bf16(i8*, <4 x bfloat>, i32)
1042declare void @llvm.arm.neon.vst1.p0i8.v8bf16(i8*, <8 x bfloat>, i32)
1043declare void @llvm.arm.neon.vst2.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, i32)
1044declare void @llvm.arm.neon.vst2.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, i32)
1045declare void @llvm.arm.neon.vst3.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
1046declare void @llvm.arm.neon.vst3.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)
1047declare void @llvm.arm.neon.vst4.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
1048declare void @llvm.arm.neon.vst4.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)
1049
; vst1x2/vst1x3/vst1x4: take a `bfloat* nocapture` pointer and 2-4 vectors,
; with no trailing i32; called by the test_vst1*_bf16_x{2,3,4} functions.
1050declare void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>)
1051declare void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>)
1052declare void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
1053declare void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
1054declare void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
1055declare void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
1056
; vst2lane/vst3lane/vst4lane: take the i8* pointer, 2-4 vectors and two
; trailing i32 operands; called by the test_vst*_lane_bf16 functions that use
; intrinsics (the vst1 lane tests use a plain store instead).
1057declare void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, i32, i32)
1058declare void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, i32, i32)
1059declare void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1060declare void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1061declare void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1062declare void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1063