/* { dg-do assemble { target { arm*-*-* } } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon }  */
/* { dg-additional-options "-save-temps -march=armv8.2-a+fp16+bf16 -mfpu=crypto-neon-fp-armv8" } */
5 
#include <arm_neon.h>
7 
8 float32x2_t
test_vbfdot_f32_s8(float32x2_t r,int8x8_t a,int8x8_t b)9 test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b)
10 {
11   bfloat16x4_t _a = vreinterpret_bf16_s8(a);
12   bfloat16x4_t _b = vreinterpret_bf16_s8(b);
13 
14   return vbfdot_f32 (r, _a, _b);
15 }
16 
17 float32x2_t
test_vbfdot_f32_s16(float32x2_t r,int16x4_t a,int16x4_t b)18 test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b)
19 {
20   bfloat16x4_t _a = vreinterpret_bf16_s16(a);
21   bfloat16x4_t _b = vreinterpret_bf16_s16(b);
22 
23   return vbfdot_f32 (r, _a, _b);
24 }
25 
26 float32x2_t
test_vbfdot_f32_s32(float32x2_t r,int32x2_t a,int32x2_t b)27 test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b)
28 {
29   bfloat16x4_t _a = vreinterpret_bf16_s32(a);
30   bfloat16x4_t _b = vreinterpret_bf16_s32(b);
31 
32   return vbfdot_f32 (r, _a, _b);
33 }
34 
35 float32x2_t
test_vbfdot_f32_s64(float32x2_t r,int64x1_t a,int64x1_t b)36 test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b)
37 {
38   bfloat16x4_t _a = vreinterpret_bf16_s64(a);
39   bfloat16x4_t _b = vreinterpret_bf16_s64(b);
40 
41   return vbfdot_f32 (r, _a, _b);
42 }
43 
44 float32x2_t
test_vbfdot_f32_u8(float32x2_t r,uint8x8_t a,uint8x8_t b)45 test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b)
46 {
47   bfloat16x4_t _a = vreinterpret_bf16_u8(a);
48   bfloat16x4_t _b = vreinterpret_bf16_u8(b);
49 
50   return vbfdot_f32 (r, _a, _b);
51 }
52 
53 float32x2_t
test_vbfdot_f32_u16(float32x2_t r,uint16x4_t a,uint16x4_t b)54 test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b)
55 {
56   bfloat16x4_t _a = vreinterpret_bf16_u16(a);
57   bfloat16x4_t _b = vreinterpret_bf16_u16(b);
58 
59   return vbfdot_f32 (r, _a, _b);
60 }
61 
62 float32x2_t
test_vbfdot_f32_u32(float32x2_t r,uint32x2_t a,uint32x2_t b)63 test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b)
64 {
65   bfloat16x4_t _a = vreinterpret_bf16_u32(a);
66   bfloat16x4_t _b = vreinterpret_bf16_u32(b);
67 
68   return vbfdot_f32 (r, _a, _b);
69 }
70 
71 float32x2_t
test_vbfdot_f32_u64(float32x2_t r,uint64x1_t a,uint64x1_t b)72 test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b)
73 {
74   bfloat16x4_t _a = vreinterpret_bf16_u64(a);
75   bfloat16x4_t _b = vreinterpret_bf16_u64(b);
76 
77   return vbfdot_f32 (r, _a, _b);
78 }
79 
80 float32x2_t
test_vbfdot_f32_p8(float32x2_t r,poly8x8_t a,poly8x8_t b)81 test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b)
82 {
83   bfloat16x4_t _a = vreinterpret_bf16_p8(a);
84   bfloat16x4_t _b = vreinterpret_bf16_p8(b);
85 
86   return vbfdot_f32 (r, _a, _b);
87 }
88 
89 float32x2_t
test_vbfdot_f32_p16(float32x2_t r,poly16x4_t a,poly16x4_t b)90 test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b)
91 {
92   bfloat16x4_t _a = vreinterpret_bf16_p16(a);
93   bfloat16x4_t _b = vreinterpret_bf16_p16(b);
94 
95   return vbfdot_f32 (r, _a, _b);
96 }
97 
98 float32x2_t
test_vbfdot_f32_p64(float32x2_t r,poly64x1_t a,poly64x1_t b)99 test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b)
100 {
101   bfloat16x4_t _a = vreinterpret_bf16_p64(a);
102   bfloat16x4_t _b = vreinterpret_bf16_p64(b);
103 
104   return vbfdot_f32 (r, _a, _b);
105 }
106 
107 float32x2_t
test_vbfdot_f32_f16(float32x2_t r,float16x4_t a,float16x4_t b)108 test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b)
109 {
110   bfloat16x4_t _a = vreinterpret_bf16_f16(a);
111   bfloat16x4_t _b = vreinterpret_bf16_f16(b);
112 
113   return vbfdot_f32 (r, _a, _b);
114 }
115 
116 float32x2_t
test_vbfdot_f32_f32(float32x2_t r,float32x2_t a,float32x2_t b)117 test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b)
118 {
119   bfloat16x4_t _a = vreinterpret_bf16_f32(a);
120   bfloat16x4_t _b = vreinterpret_bf16_f32(b);
121 
122   return vbfdot_f32 (r, _a, _b);
123 }
124 
125 float32x4_t
test_vbfdotq_f32_s8(float32x4_t r,int8x16_t a,int8x16_t b)126 test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b)
127 {
128   bfloat16x8_t _a = vreinterpretq_bf16_s8(a);
129   bfloat16x8_t _b = vreinterpretq_bf16_s8(b);
130 
131   return vbfdotq_f32 (r, _a, _b);
132 }
133 
134 float32x4_t
test_vbfdotq_f32_s16(float32x4_t r,int16x8_t a,int16x8_t b)135 test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b)
136 {
137   bfloat16x8_t _a = vreinterpretq_bf16_s16(a);
138   bfloat16x8_t _b = vreinterpretq_bf16_s16(b);
139 
140   return vbfdotq_f32 (r, _a, _b);
141 }
142 
143 float32x4_t
test_vbfdotq_f32_s32(float32x4_t r,int32x4_t a,int32x4_t b)144 test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b)
145 {
146   bfloat16x8_t _a = vreinterpretq_bf16_s32(a);
147   bfloat16x8_t _b = vreinterpretq_bf16_s32(b);
148 
149   return vbfdotq_f32 (r, _a, _b);
150 }
151 
152 float32x4_t
test_vbfdotq_f32_s64(float32x4_t r,int64x2_t a,int64x2_t b)153 test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b)
154 {
155   bfloat16x8_t _a = vreinterpretq_bf16_s64(a);
156   bfloat16x8_t _b = vreinterpretq_bf16_s64(b);
157 
158   return vbfdotq_f32 (r, _a, _b);
159 }
160 
161 float32x4_t
test_vbfdotq_f32_u8(float32x4_t r,uint8x16_t a,uint8x16_t b)162 test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b)
163 {
164   bfloat16x8_t _a = vreinterpretq_bf16_u8(a);
165   bfloat16x8_t _b = vreinterpretq_bf16_u8(b);
166 
167   return vbfdotq_f32 (r, _a, _b);
168 }
169 
170 float32x4_t
test_vbfdotq_f32_u16(float32x4_t r,uint16x8_t a,uint16x8_t b)171 test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b)
172 {
173   bfloat16x8_t _a = vreinterpretq_bf16_u16(a);
174   bfloat16x8_t _b = vreinterpretq_bf16_u16(b);
175 
176   return vbfdotq_f32 (r, _a, _b);
177 }
178 
179 float32x4_t
test_vbfdotq_f32_u32(float32x4_t r,uint32x4_t a,uint32x4_t b)180 test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b)
181 {
182   bfloat16x8_t _a = vreinterpretq_bf16_u32(a);
183   bfloat16x8_t _b = vreinterpretq_bf16_u32(b);
184 
185   return vbfdotq_f32 (r, _a, _b);
186 }
187 
188 float32x4_t
test_vbfdotq_f32_u64(float32x4_t r,uint64x2_t a,uint64x2_t b)189 test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b)
190 {
191   bfloat16x8_t _a = vreinterpretq_bf16_u64(a);
192   bfloat16x8_t _b = vreinterpretq_bf16_u64(b);
193 
194   return vbfdotq_f32 (r, _a, _b);
195 }
196 
197 float32x4_t
test_vbfdotq_f32_p8(float32x4_t r,poly8x16_t a,poly8x16_t b)198 test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b)
199 {
200   bfloat16x8_t _a = vreinterpretq_bf16_p8(a);
201   bfloat16x8_t _b = vreinterpretq_bf16_p8(b);
202 
203   return vbfdotq_f32 (r, _a, _b);
204 }
205 
206 float32x4_t
test_vbfdotq_f32_p16(float32x4_t r,poly16x8_t a,poly16x8_t b)207 test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b)
208 {
209   bfloat16x8_t _a = vreinterpretq_bf16_p16(a);
210   bfloat16x8_t _b = vreinterpretq_bf16_p16(b);
211 
212   return vbfdotq_f32 (r, _a, _b);
213 }
214 
215 float32x4_t
test_vbfdotq_f32_p64(float32x4_t r,poly64x2_t a,poly64x2_t b)216 test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b)
217 {
218   bfloat16x8_t _a = vreinterpretq_bf16_p64(a);
219   bfloat16x8_t _b = vreinterpretq_bf16_p64(b);
220 
221   return vbfdotq_f32 (r, _a, _b);
222 }
223 
224 float32x4_t
test_vbfdotq_f32_p128(float32x4_t r,poly128_t a,poly128_t b)225 test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b)
226 {
227   bfloat16x8_t _a = vreinterpretq_bf16_p128(a);
228   bfloat16x8_t _b = vreinterpretq_bf16_p128(b);
229 
230   return vbfdotq_f32 (r, _a, _b);
231 }
232 
233 float32x4_t
test_vbfdotq_f32_f16(float32x4_t r,float16x8_t a,float16x8_t b)234 test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b)
235 {
236   bfloat16x8_t _a = vreinterpretq_bf16_f16(a);
237   bfloat16x8_t _b = vreinterpretq_bf16_f16(b);
238 
239   return vbfdotq_f32 (r, _a, _b);
240 }
241 
242 float32x4_t
test_vbfdotq_f32_f32(float32x4_t r,float32x4_t a,float32x4_t b)243 test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b)
244 {
245   bfloat16x8_t _a = vreinterpretq_bf16_f32(a);
246   bfloat16x8_t _b = vreinterpretq_bf16_f32(b);
247 
248   return vbfdotq_f32 (r, _a, _b);
249 }
250 
/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 13 } } */
/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 14 } } */
253 
/* Reinterpret the bfloat16x4_t A as int8x8_t and add B to it with
   vadd_s8.  */
int8x8_t
test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b)
{
  int8x8_t _a = vreinterpret_s8_bf16 (a);
  return vadd_s8 (_a, b);
}
259 
/* Reinterpret the bfloat16x4_t A as int16x4_t and add B to it with
   vadd_s16.  */
int16x4_t
test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b)
{
  int16x4_t _a = vreinterpret_s16_bf16 (a);
  return vadd_s16 (_a, b);
}
265 
/* Reinterpret the bfloat16x4_t A as int32x2_t and add B to it with
   vadd_s32.  */
int32x2_t
test_vreinterpret_s32_bf16 (bfloat16x4_t a, int32x2_t b)
{
  int32x2_t _a = vreinterpret_s32_bf16 (a);
  return vadd_s32 (_a, b);
}
271 
/* Reinterpret the bfloat16x4_t A as int64x1_t and pass it with B to
   vrshl_s64.  */
int64x1_t
test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b)
{
  int64x1_t _a = vreinterpret_s64_bf16 (a);
  return vrshl_s64 (_a, b);
}
277 
/* Reinterpret the bfloat16x4_t A as uint8x8_t and add B to it with
   vadd_u8.  */
uint8x8_t
test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b)
{
  uint8x8_t _a = vreinterpret_u8_bf16 (a);
  return vadd_u8 (_a, b);
}
283 
/* Reinterpret the bfloat16x4_t A as uint16x4_t and add B to it with
   vadd_u16.  */
uint16x4_t
test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b)
{
  uint16x4_t _a = vreinterpret_u16_bf16 (a);
  return vadd_u16 (_a, b);
}
289 
/* Reinterpret the bfloat16x4_t A as uint32x2_t and add B to it with
   vadd_u32.  */
uint32x2_t
test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b)
{
  uint32x2_t _a = vreinterpret_u32_bf16 (a);
  return vadd_u32 (_a, b);
}
295 
test_vreinterpret_u64_bf16(bfloat16x4_t a,int64x1_t b)296 uint64x1_t test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b)
297 {
298   uint64x1_t _a = vreinterpret_u64_bf16 (a);
299   return vrshl_u64 (_a, b);
300 }
301 
test_vreinterpret_p8_bf16(bfloat16x4_t a,poly8x8_t b)302 poly8x8x2_t test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b)
303 {
304   poly8x8_t _a = vreinterpret_p8_bf16 (a);
305   return vzip_p8 (_a, b);
306 }
307 
test_vreinterpret_p16_bf16(bfloat16x4_t a,poly16x4_t b)308 poly16x4x2_t test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b)
309 {
310   poly16x4_t _a = vreinterpret_p16_bf16 (a);
311   return vzip_p16 (_a, b);
312 }
313 
/* Reinterpret the bfloat16x4_t A as poly64x1_t and pass it with B and
   the immediate 3 to vsli_n_p64.  */
poly64x1_t
test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b)
{
  poly64x1_t _a = vreinterpret_p64_bf16 (a);
  return vsli_n_p64 (_a, b, 3);
}
319 
/* Reinterpret the bfloat16x4_t A as float32x2_t and subtract B from it
   with vsub_f32.  */
float32x2_t
test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b)
{
  float32x2_t _a = vreinterpret_f32_bf16 (a);
  return vsub_f32 (_a, b);
}
325 
/* Reinterpret the bfloat16x8_t A as int8x16_t and add B to it with
   vaddq_s8.  */
int8x16_t
test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b)
{
  int8x16_t _a = vreinterpretq_s8_bf16 (a);
  return vaddq_s8 (_a, b);
}
331 
/* Reinterpret the bfloat16x8_t A as int16x8_t and add B to it with
   vaddq_s16.  */
int16x8_t
test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b)
{
  int16x8_t _a = vreinterpretq_s16_bf16 (a);
  return vaddq_s16 (_a, b);
}
337 
/* Reinterpret the bfloat16x8_t A as int32x4_t and add B to it with
   vaddq_s32.  */
int32x4_t
test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b)
{
  int32x4_t _a = vreinterpretq_s32_bf16 (a);
  return vaddq_s32 (_a, b);
}
343 
/* Reinterpret the bfloat16x8_t A as int64x2_t and add B to it with
   vaddq_s64.  */
int64x2_t
test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b)
{
  int64x2_t _a = vreinterpretq_s64_bf16 (a);
  return vaddq_s64 (_a, b);
}
349 
/* Reinterpret the bfloat16x8_t A as uint8x16_t and add B to it with
   vaddq_u8.  */
uint8x16_t
test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b)
{
  uint8x16_t _a = vreinterpretq_u8_bf16 (a);
  return vaddq_u8 (_a, b);
}
355 
/* Reinterpret the bfloat16x8_t A as uint16x8_t and add B to it with
   vaddq_u16.  */
uint16x8_t
test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b)
{
  uint16x8_t _a = vreinterpretq_u16_bf16 (a);
  return vaddq_u16 (_a, b);
}
361 
/* Reinterpret the bfloat16x8_t A as uint32x4_t and add B to it with
   vaddq_u32.  */
uint32x4_t
test_vreinterpretq_u32_bf16 (bfloat16x8_t a, uint32x4_t b)
{
  uint32x4_t _a = vreinterpretq_u32_bf16 (a);
  return vaddq_u32 (_a, b);
}
367 
/* Reinterpret the bfloat16x8_t A as uint64x2_t and add B to it with
   vaddq_u64.  */
uint64x2_t
test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b)
{
  uint64x2_t _a = vreinterpretq_u64_bf16 (a);
  return vaddq_u64 (_a, b);
}
373 
test_vreinterpretq_p8_bf16(bfloat16x8_t a,poly8x16_t b)374 poly8x16x2_t test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b)
375 {
376   poly8x16_t _a = vreinterpretq_p8_bf16 (a);
377   return vzipq_p8 (_a, b);
378 }
379 
test_vreinterpretq_p16_bf16(bfloat16x8_t a,poly16x8_t b)380 poly16x8x2_t test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b)
381 {
382   poly16x8_t _a = vreinterpretq_p16_bf16 (a);
383   return vzipq_p16 (_a, b);
384 }
385 
/* Reinterpret the bfloat16x8_t A as poly64x2_t and pass it with B and
   the immediate 3 to vsliq_n_p64.  */
poly64x2_t
test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b)
{
  poly64x2_t _a = vreinterpretq_p64_bf16 (a);
  return vsliq_n_p64 (_a, b, 3);
}
391 
test_vreinterpretq_p128_bf16(bfloat16x8_t a,poly16x8_t b)392 poly128_t test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b)
393 {
394   poly128_t _a = vreinterpretq_p128_bf16 (a);
395   return _a;
396 }
397 
/* Reinterpret the bfloat16x8_t A as float32x4_t and subtract B from it
   with vsubq_f32.  */
float32x4_t
test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b)
{
  float32x4_t _a = vreinterpretq_f32_bf16 (a);
  return vsubq_f32 (_a, b);
}
403 
test_vreinterpret_f16_bf16(bfloat16x4_t a)404 float16x4_t test_vreinterpret_f16_bf16 (bfloat16x4_t a)
405 {
406   return vreinterpret_f16_bf16 (a);
407 }
408 
test_vreinterpretq_f16_bf16(bfloat16x8_t a)409 float16x8_t test_vreinterpretq_f16_bf16 (bfloat16x8_t a)
410 {
411   return vreinterpretq_f16_bf16 (a);
412 }
413 
/* { dg-final { scan-assembler-times {\tvadd.i8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */

/* { dg-final { scan-assembler-times {\tvadd.i8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i64\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */

/* { dg-final { scan-assembler {\tvsub.f32\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvsub.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} } } */

/* { dg-final { scan-assembler {\tvzip.8\td[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvzip.16\td[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvzip.8\tq[0-9]+, q[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvzip.16\tq[0-9]+, q[0-9]+\n} } } */

/* { dg-final { scan-assembler {\tvrshl.s64\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvrshl.u64\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */

/* { dg-final { scan-assembler {\tvsli.64\td[0-9]+, d[0-9]+, #3\n} } } */
/* { dg-final { scan-assembler {\tvsli.64\tq[0-9]+, q[0-9]+, #3\n} } } */
436