; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s | FileCheck %s

;
; BFDOT
;

; BFDOT (vectors): the intrinsic must lower to a single unpredicated bfdot.
define <vscale x 4 x float> @bfdot_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_f32:
; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

; BFDOT (indexed): lane immediate 0 must be propagated into the instruction.
define <vscale x 4 x float> @bfdot_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_0_f32:
; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[0]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}

; BFDOT (indexed): lane immediate 1.
define <vscale x 4 x float> @bfdot_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_1_f32:
; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[1]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}

; BFDOT (indexed): lane immediate 2.
define <vscale x 4 x float> @bfdot_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_2_f32:
; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[2]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}

; BFDOT (indexed): lane immediate 3 (maximum valid lane for .s elements).
define <vscale x 4 x float> @bfdot_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_3_f32:
; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[3]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}

;
; BFMLALB
;

; BFMLALB (vectors): widening multiply-add of even (bottom) bf16 elements.
define <vscale x 4 x float> @bfmlalb_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

; BFMLALB (indexed): lane immediate 0.
define <vscale x 4 x float> @bfmlalb_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_0_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[0]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}

; BFMLALB (indexed): lane immediate 1.
define <vscale x 4 x float> @bfmlalb_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_1_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[1]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}

; BFMLALB (indexed): lane immediate 2.
define <vscale x 4 x float> @bfmlalb_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_2_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[2]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}

; BFMLALB (indexed): lane immediate 3.
define <vscale x 4 x float> @bfmlalb_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_3_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[3]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}

; BFMLALB (indexed): lane immediate 4.
define <vscale x 4 x float> @bfmlalb_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_4_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[4]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
  ret <vscale x 4 x float> %out
}

; BFMLALB (indexed): lane immediate 5.
define <vscale x 4 x float> @bfmlalb_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_5_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[5]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
  ret <vscale x 4 x float> %out
}

; BFMLALB (indexed): lane immediate 6.
define <vscale x 4 x float> @bfmlalb_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_6_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[6]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
  ret <vscale x 4 x float> %out
}

; BFMLALB (indexed): lane immediate 7 (maximum valid lane for .h elements).
define <vscale x 4 x float> @bfmlalb_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_7_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[7]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
  ret <vscale x 4 x float> %out
}

;
; BFMLALT
;

; BFMLALT (vectors): widening multiply-add of odd (top) bf16 elements.
define <vscale x 4 x float> @bfmlalt_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

; BFMLALT (indexed): lane immediate 0.
define <vscale x 4 x float> @bfmlalt_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_0_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[0]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}

; BFMLALT (indexed): lane immediate 1.
define <vscale x 4 x float> @bfmlalt_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_1_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[1]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}

; BFMLALT (indexed): lane immediate 2.
define <vscale x 4 x float> @bfmlalt_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_2_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[2]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}

; BFMLALT (indexed): lane immediate 3.
define <vscale x 4 x float> @bfmlalt_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_3_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[3]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}

; BFMLALT (indexed): lane immediate 4.
define <vscale x 4 x float> @bfmlalt_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_4_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[4]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
  ret <vscale x 4 x float> %out
}

; BFMLALT (indexed): lane immediate 5.
define <vscale x 4 x float> @bfmlalt_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_5_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[5]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
  ret <vscale x 4 x float> %out
}

; BFMLALT (indexed): lane immediate 6.
define <vscale x 4 x float> @bfmlalt_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_6_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[6]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
  ret <vscale x 4 x float> %out
}

; BFMLALT (indexed): lane immediate 7 (maximum valid lane for .h elements).
define <vscale x 4 x float> @bfmlalt_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_7_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[7]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
  ret <vscale x 4 x float> %out
}

;
; BFMMLA
;

; BFMMLA: bf16 matrix multiply-accumulate must select a single bfmmla.
define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmmla_f32:
; CHECK-NEXT:  bfmmla z0.s, z1.h, z2.h
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

;
; BFCVT
;

; BFCVT: predicated (merging) f32 -> bf16 narrowing conversion.
define <vscale x 8 x bfloat> @fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
; CHECK-LABEL: fcvt_bf16_f32:
; CHECK-NEXT: bfcvt z0.h, p0/m, z1.s
; CHECK-NEXT: ret
  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
  ret <vscale x 8 x bfloat> %out
}

;
; BFCVTNT
;

; BFCVTNT: predicated f32 -> bf16 conversion writing the odd (top) lanes.
define <vscale x 8 x bfloat> @fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
; CHECK-LABEL: fcvtnt_bf16_f32:
; CHECK-NEXT: bfcvtnt z0.h, p0/m, z1.s
; CHECK-NEXT: ret
  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
  ret <vscale x 8 x bfloat> %out
}

; Declarations of the SVE BF16 intrinsics exercised by the tests above.
declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
