/* { dg-do assemble { target { arm*-*-* } } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-additional-options "-save-temps -march=armv8.2-a+fp16+bf16 -mfpu=crypto-neon-fp-armv8" } */

#include <arm_neon.h>

/* Reinterpret the int8x8_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_s8 (a);
  bfloat16x4_t _b = vreinterpret_bf16_s8 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the int16x4_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_s16 (a);
  bfloat16x4_t _b = vreinterpret_bf16_s16 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the int32x2_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_s32 (a);
  bfloat16x4_t _b = vreinterpret_bf16_s32 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the int64x1_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_s64 (a);
  bfloat16x4_t _b = vreinterpret_bf16_s64 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the uint8x8_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_u8 (a);
  bfloat16x4_t _b = vreinterpret_bf16_u8 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the uint16x4_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_u16 (a);
  bfloat16x4_t _b = vreinterpret_bf16_u16 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the uint32x2_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_u32 (a);
  bfloat16x4_t _b = vreinterpret_bf16_u32 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the uint64x1_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_u64 (a);
  bfloat16x4_t _b = vreinterpret_bf16_u64 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the poly8x8_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_p8 (a);
  bfloat16x4_t _b = vreinterpret_bf16_p8 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the poly16x4_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_p16 (a);
  bfloat16x4_t _b = vreinterpret_bf16_p16 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the poly64x1_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_p64 (a);
  bfloat16x4_t _b = vreinterpret_bf16_p64 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the float16x4_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_f16 (a);
  bfloat16x4_t _b = vreinterpret_bf16_f16 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the float32x2_t operands A and B as bfloat16x4_t and pass
   them to vbfdot_f32 with accumulator R.  */
float32x2_t
test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b)
{
  bfloat16x4_t _a = vreinterpret_bf16_f32 (a);
  bfloat16x4_t _b = vreinterpret_bf16_f32 (b);

  return vbfdot_f32 (r, _a, _b);
}

/* Reinterpret the int8x16_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_s8 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_s8 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the int16x8_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_s16 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_s16 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the int32x4_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_s32 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_s32 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the int64x2_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_s64 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_s64 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the uint8x16_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_u8 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_u8 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the uint16x8_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_u16 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_u16 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the uint32x4_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_u32 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_u32 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the uint64x2_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_u64 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_u64 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the poly8x16_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_p8 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_p8 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the poly16x8_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_p16 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_p16 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the poly64x2_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_p64 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_p64 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the poly128_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_p128 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_p128 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the float16x8_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_f16 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_f16 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* Reinterpret the float32x4_t operands A and B as bfloat16x8_t and pass
   them to vbfdotq_f32 with accumulator R.  */
float32x4_t
test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b)
{
  bfloat16x8_t _a = vreinterpretq_bf16_f32 (a);
  bfloat16x8_t _b = vreinterpretq_bf16_f32 (b);

  return vbfdotq_f32 (r, _a, _b);
}

/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 13 } } */
/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 14 } } */

/* Reinterpret the bfloat16x4_t A as int8x8_t and add it to B with
   vadd_s8.  */
int8x8_t
test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b)
{
  int8x8_t _a = vreinterpret_s8_bf16 (a);
  return vadd_s8 (_a, b);
}

/* Reinterpret the bfloat16x4_t A as int16x4_t and add it to B with
   vadd_s16.  */
int16x4_t
test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b)
{
  int16x4_t _a = vreinterpret_s16_bf16 (a);
  return vadd_s16 (_a, b);
}

/* Reinterpret the bfloat16x4_t A as int32x2_t and add it to B with
   vadd_s32.  */
int32x2_t
test_vreinterpret_s32_bf16 (bfloat16x4_t a, int32x2_t b)
{
  int32x2_t _a = vreinterpret_s32_bf16 (a);
  return vadd_s32 (_a, b);
}

/* Reinterpret the bfloat16x4_t A as int64x1_t and pass it, together
   with B, to vrshl_s64.  */
int64x1_t
test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b)
{
  int64x1_t _a = vreinterpret_s64_bf16 (a);
  return vrshl_s64 (_a, b);
}

/* Reinterpret the bfloat16x4_t A as uint8x8_t and add it to B with
   vadd_u8.  */
uint8x8_t
test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b)
{
  uint8x8_t _a = vreinterpret_u8_bf16 (a);
  return vadd_u8 (_a, b);
}

/* Reinterpret the bfloat16x4_t A as uint16x4_t and add it to B with
   vadd_u16.  */
uint16x4_t
test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b)
{
  uint16x4_t _a = vreinterpret_u16_bf16 (a);
  return vadd_u16 (_a, b);
}

/* Reinterpret the bfloat16x4_t A as uint32x2_t and add it to B with
   vadd_u32.  */
uint32x2_t
test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b)
{
  uint32x2_t _a = vreinterpret_u32_bf16 (a);
  return vadd_u32 (_a, b);
}

/* Reinterpret the bfloat16x4_t A as uint64x1_t and pass it, together
   with the int64x1_t B, to vrshl_u64.  */
uint64x1_t
test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b)
{
  uint64x1_t _a = vreinterpret_u64_bf16 (a);
  return vrshl_u64 (_a, b);
}

/* Reinterpret the bfloat16x4_t A as poly8x8_t and return the
   poly8x8x2_t result of vzip_p8 applied to it and B.  */
poly8x8x2_t
test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b)
{
  poly8x8_t _a = vreinterpret_p8_bf16 (a);
  return vzip_p8 (_a, b);
}

/* Reinterpret the bfloat16x4_t A as poly16x4_t and return the
   poly16x4x2_t result of vzip_p16 applied to it and B.  */
poly16x4x2_t
test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b)
{
  poly16x4_t _a = vreinterpret_p16_bf16 (a);
  return vzip_p16 (_a, b);
}

/* Reinterpret the bfloat16x4_t A as poly64x1_t and pass it, together
   with B and the immediate 3, to vsli_n_p64.  */
poly64x1_t
test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b)
{
  poly64x1_t _a = vreinterpret_p64_bf16 (a);
  return vsli_n_p64 (_a, b, 3);
}

/* Reinterpret the bfloat16x4_t A as float32x2_t and subtract B from it
   with vsub_f32.  */
float32x2_t
test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b)
{
  float32x2_t _a = vreinterpret_f32_bf16 (a);
  return vsub_f32 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as int8x16_t and add it to B with
   vaddq_s8.  */
int8x16_t
test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b)
{
  int8x16_t _a = vreinterpretq_s8_bf16 (a);
  return vaddq_s8 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as int16x8_t and add it to B with
   vaddq_s16.  */
int16x8_t
test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b)
{
  int16x8_t _a = vreinterpretq_s16_bf16 (a);
  return vaddq_s16 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as int32x4_t and add it to B with
   vaddq_s32.  */
int32x4_t
test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b)
{
  int32x4_t _a = vreinterpretq_s32_bf16 (a);
  return vaddq_s32 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as int64x2_t and add it to B with
   vaddq_s64.  */
int64x2_t
test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b)
{
  int64x2_t _a = vreinterpretq_s64_bf16 (a);
  return vaddq_s64 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as uint8x16_t and add it to B with
   vaddq_u8.  */
uint8x16_t
test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b)
{
  uint8x16_t _a = vreinterpretq_u8_bf16 (a);
  return vaddq_u8 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as uint16x8_t and add it to B with
   vaddq_u16.  */
uint16x8_t
test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b)
{
  uint16x8_t _a = vreinterpretq_u16_bf16 (a);
  return vaddq_u16 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as uint32x4_t and add it to B with
   vaddq_u32.  */
uint32x4_t
test_vreinterpretq_u32_bf16 (bfloat16x8_t a, uint32x4_t b)
{
  uint32x4_t _a = vreinterpretq_u32_bf16 (a);
  return vaddq_u32 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as uint64x2_t and add it to B with
   vaddq_u64.  */
uint64x2_t
test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b)
{
  uint64x2_t _a = vreinterpretq_u64_bf16 (a);
  return vaddq_u64 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as poly8x16_t and return the
   poly8x16x2_t result of vzipq_p8 applied to it and B.  */
poly8x16x2_t
test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b)
{
  poly8x16_t _a = vreinterpretq_p8_bf16 (a);
  return vzipq_p8 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as poly16x8_t and return the
   poly16x8x2_t result of vzipq_p16 applied to it and B.  */
poly16x8x2_t
test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b)
{
  poly16x8_t _a = vreinterpretq_p16_bf16 (a);
  return vzipq_p16 (_a, b);
}

/* Reinterpret the bfloat16x8_t A as poly64x2_t and pass it, together
   with B and the immediate 3, to vsliq_n_p64.  */
poly64x2_t
test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b)
{
  poly64x2_t _a = vreinterpretq_p64_bf16 (a);
  return vsliq_n_p64 (_a, b, 3);
}

/* Reinterpret the bfloat16x8_t A as poly128_t and return it unchanged.
   Parameter B is not used by the body; it is kept so the signature
   stays as it was.  */
poly128_t
test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b)
{
  poly128_t _a = vreinterpretq_p128_bf16 (a);
  return _a;
}

/* Reinterpret the bfloat16x8_t A as float32x4_t and subtract B from it
   with vsubq_f32.  */
float32x4_t
test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b)
{
  float32x4_t _a = vreinterpretq_f32_bf16 (a);
  return vsubq_f32 (_a, b);
}

/* Return the bfloat16x4_t A reinterpreted as float16x4_t.  */
float16x4_t
test_vreinterpret_f16_bf16 (bfloat16x4_t a)
{
  return vreinterpret_f16_bf16 (a);
}

/* Return the bfloat16x8_t A reinterpreted as float16x8_t.  */
float16x8_t
test_vreinterpretq_f16_bf16 (bfloat16x8_t a)
{
  return vreinterpretq_f16_bf16 (a);
}

/* { dg-final { scan-assembler-times {\tvadd.i8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */

/* { dg-final { scan-assembler-times {\tvadd.i8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i64\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */

/* { dg-final { scan-assembler {\tvsub.f32\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvsub.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} } } */

/* { dg-final { scan-assembler {\tvzip.8\td[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvzip.16\td[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvzip.8\tq[0-9]+, q[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvzip.16\tq[0-9]+, q[0-9]+\n} } } */

/* { dg-final { scan-assembler {\tvrshl.s64\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvrshl.u64\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */

/* { dg-final { scan-assembler {\tvsli.64\td[0-9]+, d[0-9]+, #3\n} } } */
/* { dg-final { scan-assembler {\tvsli.64\tq[0-9]+, q[0-9]+, #3\n} } } */
