1 /*
2 * Vector math abstractions.
3 *
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 * See https://llvm.org/LICENSE.txt for license information.
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 */
8
9 #ifndef _V_MATH_H
10 #define _V_MATH_H
11
#ifndef WANT_VMATH
/* Build the vector math code unless the includer already chose a value
   for WANT_VMATH.  */
# define WANT_VMATH 1
#endif
16 #if WANT_VMATH
17
/* The goal of this header is to allow vector and scalar builds of the
   same algorithm.  The provided intrinsic wrappers are also vector
   length agnostic, so they can be implemented for SVE (or other SIMD
   architectures) and the code should then work on those targets too.  */
23
/* V_NAME (x) gives the symbol name of routine x for this build variant:
   __s_x for scalar builds, __vn_x for AArch64 vector-PCS builds and
   __v_x otherwise.  */
#if SCALAR
#define V_NAME(x) __s_##x
#elif VPCS && __aarch64__
#define V_NAME(x) __vn_##x
/* Attribute selecting the AArch64 vector procedure call standard.  */
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
#else
#define V_NAME(x) __v_##x
#endif

/* Both hooks expand to nothing unless defined above or by the includer.  */
#ifndef VPCS_ATTR
#define VPCS_ATTR
#endif
#ifndef VPCS_ALIAS
#define VPCS_ALIAS
#endif
39
40 #include <stdint.h>
41 #include "math_config.h"
42
/* Short names for the scalar element types used by the wrappers below.  */
typedef float f32_t;
typedef uint32_t u32_t;
typedef int32_t s32_t;
typedef double f64_t;
typedef uint64_t u64_t;
typedef int64_t s64_t;
49
/* as_<type1>_<type2> (x): reinterpret the bits of the type2 value x as
   type1, without any value conversion.  */
51 static inline u32_t
as_u32_f32(f32_t x)52 as_u32_f32 (f32_t x)
53 {
54 union { f32_t f; u32_t u; } r = {x};
55 return r.u;
56 }
57 static inline f32_t
as_f32_u32(u32_t x)58 as_f32_u32 (u32_t x)
59 {
60 union { u32_t u; f32_t f; } r = {x};
61 return r.f;
62 }
63 static inline s32_t
as_s32_u32(u32_t x)64 as_s32_u32 (u32_t x)
65 {
66 union { u32_t u; s32_t i; } r = {x};
67 return r.i;
68 }
69 static inline u32_t
as_u32_s32(s32_t x)70 as_u32_s32 (s32_t x)
71 {
72 union { s32_t i; u32_t u; } r = {x};
73 return r.u;
74 }
75 static inline u64_t
as_u64_f64(f64_t x)76 as_u64_f64 (f64_t x)
77 {
78 union { f64_t f; u64_t u; } r = {x};
79 return r.u;
80 }
81 static inline f64_t
as_f64_u64(u64_t x)82 as_f64_u64 (u64_t x)
83 {
84 union { u64_t u; f64_t f; } r = {x};
85 return r.f;
86 }
87 static inline s64_t
as_s64_u64(u64_t x)88 as_s64_u64 (u64_t x)
89 {
90 union { u64_t u; s64_t i; } r = {x};
91 return r.i;
92 }
93 static inline u64_t
as_u64_s64(s64_t x)94 as_u64_s64 (s64_t x)
95 {
96 union { s64_t i; u64_t u; } r = {x};
97 return r.u;
98 }
99
100 #if SCALAR
101 #define V_SUPPORTED 1
102 typedef f32_t v_f32_t;
103 typedef u32_t v_u32_t;
104 typedef s32_t v_s32_t;
105 typedef f64_t v_f64_t;
106 typedef u64_t v_u64_t;
107 typedef s64_t v_s64_t;
108
/* Number of 32-bit lanes in a vector; 1 in the scalar build.  */
static inline int
v_lanes32 (void)
{
  return 1;
}
114
115 static inline v_f32_t
v_f32(f32_t x)116 v_f32 (f32_t x)
117 {
118 return x;
119 }
120 static inline v_u32_t
v_u32(u32_t x)121 v_u32 (u32_t x)
122 {
123 return x;
124 }
125 static inline v_s32_t
v_s32(s32_t x)126 v_s32 (s32_t x)
127 {
128 return x;
129 }
130
131 static inline f32_t
v_get_f32(v_f32_t x,int i)132 v_get_f32 (v_f32_t x, int i)
133 {
134 return x;
135 }
136 static inline u32_t
v_get_u32(v_u32_t x,int i)137 v_get_u32 (v_u32_t x, int i)
138 {
139 return x;
140 }
141 static inline s32_t
v_get_s32(v_s32_t x,int i)142 v_get_s32 (v_s32_t x, int i)
143 {
144 return x;
145 }
146
147 static inline void
v_set_f32(v_f32_t * x,int i,f32_t v)148 v_set_f32 (v_f32_t *x, int i, f32_t v)
149 {
150 *x = v;
151 }
152 static inline void
v_set_u32(v_u32_t * x,int i,u32_t v)153 v_set_u32 (v_u32_t *x, int i, u32_t v)
154 {
155 *x = v;
156 }
157 static inline void
v_set_s32(v_s32_t * x,int i,s32_t v)158 v_set_s32 (v_s32_t *x, int i, s32_t v)
159 {
160 *x = v;
161 }
162
163 /* true if any elements of a v_cond result is non-zero. */
164 static inline int
v_any_u32(v_u32_t x)165 v_any_u32 (v_u32_t x)
166 {
167 return x != 0;
168 }
169 /* to wrap the result of relational operators. */
170 static inline v_u32_t
v_cond_u32(v_u32_t x)171 v_cond_u32 (v_u32_t x)
172 {
173 return x ? -1 : 0;
174 }
175 static inline v_f32_t
v_abs_f32(v_f32_t x)176 v_abs_f32 (v_f32_t x)
177 {
178 return __builtin_fabsf (x);
179 }
180 static inline v_f32_t
v_fma_f32(v_f32_t x,v_f32_t y,v_f32_t z)181 v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
182 {
183 return __builtin_fmaf (x, y, z);
184 }
185 static inline v_f32_t
v_round_f32(v_f32_t x)186 v_round_f32 (v_f32_t x)
187 {
188 return __builtin_roundf (x);
189 }
190 static inline v_s32_t
v_round_s32(v_f32_t x)191 v_round_s32 (v_f32_t x)
192 {
193 return __builtin_lroundf (x); /* relies on -fno-math-errno. */
194 }
/* v_to_<type1>_<type2> (x): convert the value of the type2 argument x
   to type1.  */
196 static inline v_f32_t
v_to_f32_s32(v_s32_t x)197 v_to_f32_s32 (v_s32_t x)
198 {
199 return x;
200 }
201 static inline v_f32_t
v_to_f32_u32(v_u32_t x)202 v_to_f32_u32 (v_u32_t x)
203 {
204 return x;
205 }
/* v_as_<type1>_<type2> (x): reinterpret the bits of the type2 argument
   x as type1, without any value conversion.  */
207 static inline v_u32_t
v_as_u32_f32(v_f32_t x)208 v_as_u32_f32 (v_f32_t x)
209 {
210 union { v_f32_t f; v_u32_t u; } r = {x};
211 return r.u;
212 }
213 static inline v_f32_t
v_as_f32_u32(v_u32_t x)214 v_as_f32_u32 (v_u32_t x)
215 {
216 union { v_u32_t u; v_f32_t f; } r = {x};
217 return r.f;
218 }
219 static inline v_s32_t
v_as_s32_u32(v_u32_t x)220 v_as_s32_u32 (v_u32_t x)
221 {
222 union { v_u32_t u; v_s32_t i; } r = {x};
223 return r.i;
224 }
225 static inline v_u32_t
v_as_u32_s32(v_s32_t x)226 v_as_u32_s32 (v_s32_t x)
227 {
228 union { v_s32_t i; v_u32_t u; } r = {x};
229 return r.u;
230 }
231 static inline v_f32_t
v_lookup_f32(const f32_t * tab,v_u32_t idx)232 v_lookup_f32 (const f32_t *tab, v_u32_t idx)
233 {
234 return tab[idx];
235 }
236 static inline v_u32_t
v_lookup_u32(const u32_t * tab,v_u32_t idx)237 v_lookup_u32 (const u32_t *tab, v_u32_t idx)
238 {
239 return tab[idx];
240 }
241 static inline v_f32_t
v_call_f32(f32_t (* f)(f32_t),v_f32_t x,v_f32_t y,v_u32_t p)242 v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
243 {
244 return f (x);
245 }
246 static inline v_f32_t
v_call2_f32(f32_t (* f)(f32_t,f32_t),v_f32_t x1,v_f32_t x2,v_f32_t y,v_u32_t p)247 v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
248 v_u32_t p)
249 {
250 return f (x1, x2);
251 }
252
/* Number of 64-bit lanes in a vector; 1 in the scalar build.  */
static inline int
v_lanes64 (void)
{
  return 1;
}
258 static inline v_f64_t
v_f64(f64_t x)259 v_f64 (f64_t x)
260 {
261 return x;
262 }
263 static inline v_u64_t
v_u64(u64_t x)264 v_u64 (u64_t x)
265 {
266 return x;
267 }
268 static inline v_s64_t
v_s64(s64_t x)269 v_s64 (s64_t x)
270 {
271 return x;
272 }
273 static inline f64_t
v_get_f64(v_f64_t x,int i)274 v_get_f64 (v_f64_t x, int i)
275 {
276 return x;
277 }
278 static inline void
v_set_f64(v_f64_t * x,int i,f64_t v)279 v_set_f64 (v_f64_t *x, int i, f64_t v)
280 {
281 *x = v;
282 }
283 /* true if any elements of a v_cond result is non-zero. */
284 static inline int
v_any_u64(v_u64_t x)285 v_any_u64 (v_u64_t x)
286 {
287 return x != 0;
288 }
289 /* to wrap the result of relational operators. */
290 static inline v_u64_t
v_cond_u64(v_u64_t x)291 v_cond_u64 (v_u64_t x)
292 {
293 return x ? -1 : 0;
294 }
295 static inline v_f64_t
v_abs_f64(v_f64_t x)296 v_abs_f64 (v_f64_t x)
297 {
298 return __builtin_fabs (x);
299 }
300 static inline v_f64_t
v_fma_f64(v_f64_t x,v_f64_t y,v_f64_t z)301 v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
302 {
303 return __builtin_fma (x, y, z);
304 }
305 static inline v_f64_t
v_round_f64(v_f64_t x)306 v_round_f64 (v_f64_t x)
307 {
308 return __builtin_round (x);
309 }
310 static inline v_s64_t
v_round_s64(v_f64_t x)311 v_round_s64 (v_f64_t x)
312 {
313 return __builtin_lround (x); /* relies on -fno-math-errno. */
314 }
/* v_to_<type1>_<type2> (x): convert the value of the type2 argument x
   to type1.  */
316 static inline v_f64_t
v_to_f64_s64(v_s64_t x)317 v_to_f64_s64 (v_s64_t x)
318 {
319 return x;
320 }
321 static inline v_f64_t
v_to_f64_u64(v_u64_t x)322 v_to_f64_u64 (v_u64_t x)
323 {
324 return x;
325 }
/* v_as_<type1>_<type2> (x): reinterpret the bits of the type2 argument
   x as type1, without any value conversion.  */
327 static inline v_u64_t
v_as_u64_f64(v_f64_t x)328 v_as_u64_f64 (v_f64_t x)
329 {
330 union { v_f64_t f; v_u64_t u; } r = {x};
331 return r.u;
332 }
333 static inline v_f64_t
v_as_f64_u64(v_u64_t x)334 v_as_f64_u64 (v_u64_t x)
335 {
336 union { v_u64_t u; v_f64_t f; } r = {x};
337 return r.f;
338 }
339 static inline v_s64_t
v_as_s64_u64(v_u64_t x)340 v_as_s64_u64 (v_u64_t x)
341 {
342 union { v_u64_t u; v_s64_t i; } r = {x};
343 return r.i;
344 }
345 static inline v_u64_t
v_as_u64_s64(v_s64_t x)346 v_as_u64_s64 (v_s64_t x)
347 {
348 union { v_s64_t i; v_u64_t u; } r = {x};
349 return r.u;
350 }
351 static inline v_f64_t
v_lookup_f64(const f64_t * tab,v_u64_t idx)352 v_lookup_f64 (const f64_t *tab, v_u64_t idx)
353 {
354 return tab[idx];
355 }
356 static inline v_u64_t
v_lookup_u64(const u64_t * tab,v_u64_t idx)357 v_lookup_u64 (const u64_t *tab, v_u64_t idx)
358 {
359 return tab[idx];
360 }
361 static inline v_f64_t
v_call_f64(f64_t (* f)(f64_t),v_f64_t x,v_f64_t y,v_u64_t p)362 v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
363 {
364 return f (x);
365 }
366
367 #elif __aarch64__
/* The vector wrappers are available on AArch64.  */
#define V_SUPPORTED 1
369 #include <arm_neon.h>
370 typedef float32x4_t v_f32_t;
371 typedef uint32x4_t v_u32_t;
372 typedef int32x4_t v_s32_t;
373 typedef float64x2_t v_f64_t;
374 typedef uint64x2_t v_u64_t;
375 typedef int64x2_t v_s64_t;
376
/* Number of 32-bit lanes in a vector; 4 for 128-bit NEON.  */
static inline int
v_lanes32 (void)
{
  return 4;
}
382
383 static inline v_f32_t
v_f32(f32_t x)384 v_f32 (f32_t x)
385 {
386 return (v_f32_t){x, x, x, x};
387 }
388 static inline v_u32_t
v_u32(u32_t x)389 v_u32 (u32_t x)
390 {
391 return (v_u32_t){x, x, x, x};
392 }
393 static inline v_s32_t
v_s32(s32_t x)394 v_s32 (s32_t x)
395 {
396 return (v_s32_t){x, x, x, x};
397 }
398
399 static inline f32_t
v_get_f32(v_f32_t x,int i)400 v_get_f32 (v_f32_t x, int i)
401 {
402 return x[i];
403 }
404 static inline u32_t
v_get_u32(v_u32_t x,int i)405 v_get_u32 (v_u32_t x, int i)
406 {
407 return x[i];
408 }
409 static inline s32_t
v_get_s32(v_s32_t x,int i)410 v_get_s32 (v_s32_t x, int i)
411 {
412 return x[i];
413 }
414
415 static inline void
v_set_f32(v_f32_t * x,int i,f32_t v)416 v_set_f32 (v_f32_t *x, int i, f32_t v)
417 {
418 (*x)[i] = v;
419 }
420 static inline void
v_set_u32(v_u32_t * x,int i,u32_t v)421 v_set_u32 (v_u32_t *x, int i, u32_t v)
422 {
423 (*x)[i] = v;
424 }
425 static inline void
v_set_s32(v_s32_t * x,int i,s32_t v)426 v_set_s32 (v_s32_t *x, int i, s32_t v)
427 {
428 (*x)[i] = v;
429 }
430
431 /* true if any elements of a v_cond result is non-zero. */
432 static inline int
v_any_u32(v_u32_t x)433 v_any_u32 (v_u32_t x)
434 {
435 /* assume elements in x are either 0 or -1u. */
436 return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
437 }
438 /* to wrap the result of relational operators. */
439 static inline v_u32_t
v_cond_u32(v_u32_t x)440 v_cond_u32 (v_u32_t x)
441 {
442 return x;
443 }
444 static inline v_f32_t
v_abs_f32(v_f32_t x)445 v_abs_f32 (v_f32_t x)
446 {
447 return vabsq_f32 (x);
448 }
449 static inline v_f32_t
v_fma_f32(v_f32_t x,v_f32_t y,v_f32_t z)450 v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
451 {
452 return vfmaq_f32 (z, x, y);
453 }
454 static inline v_f32_t
v_round_f32(v_f32_t x)455 v_round_f32 (v_f32_t x)
456 {
457 return vrndaq_f32 (x);
458 }
459 static inline v_s32_t
v_round_s32(v_f32_t x)460 v_round_s32 (v_f32_t x)
461 {
462 return vcvtaq_s32_f32 (x);
463 }
/* v_to_<type1>_<type2> (x): convert the value of each lane of the type2
   argument x to type1.  */
465 static inline v_f32_t
v_to_f32_s32(v_s32_t x)466 v_to_f32_s32 (v_s32_t x)
467 {
468 return (v_f32_t){x[0], x[1], x[2], x[3]};
469 }
470 static inline v_f32_t
v_to_f32_u32(v_u32_t x)471 v_to_f32_u32 (v_u32_t x)
472 {
473 return (v_f32_t){x[0], x[1], x[2], x[3]};
474 }
/* v_as_<type1>_<type2> (x): reinterpret the bits of the type2 vector x
   as type1, without any value conversion.  */
476 static inline v_u32_t
v_as_u32_f32(v_f32_t x)477 v_as_u32_f32 (v_f32_t x)
478 {
479 union { v_f32_t f; v_u32_t u; } r = {x};
480 return r.u;
481 }
482 static inline v_f32_t
v_as_f32_u32(v_u32_t x)483 v_as_f32_u32 (v_u32_t x)
484 {
485 union { v_u32_t u; v_f32_t f; } r = {x};
486 return r.f;
487 }
488 static inline v_s32_t
v_as_s32_u32(v_u32_t x)489 v_as_s32_u32 (v_u32_t x)
490 {
491 union { v_u32_t u; v_s32_t i; } r = {x};
492 return r.i;
493 }
494 static inline v_u32_t
v_as_u32_s32(v_s32_t x)495 v_as_u32_s32 (v_s32_t x)
496 {
497 union { v_s32_t i; v_u32_t u; } r = {x};
498 return r.u;
499 }
500 static inline v_f32_t
v_lookup_f32(const f32_t * tab,v_u32_t idx)501 v_lookup_f32 (const f32_t *tab, v_u32_t idx)
502 {
503 return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
504 }
505 static inline v_u32_t
v_lookup_u32(const u32_t * tab,v_u32_t idx)506 v_lookup_u32 (const u32_t *tab, v_u32_t idx)
507 {
508 return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
509 }
510 static inline v_f32_t
v_call_f32(f32_t (* f)(f32_t),v_f32_t x,v_f32_t y,v_u32_t p)511 v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
512 {
513 return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
514 p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
515 }
516 static inline v_f32_t
v_call2_f32(f32_t (* f)(f32_t,f32_t),v_f32_t x1,v_f32_t x2,v_f32_t y,v_u32_t p)517 v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
518 v_u32_t p)
519 {
520 return (
521 v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
522 p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
523 }
524
/* Number of 64-bit lanes in a vector; 2 for 128-bit NEON.  */
static inline int
v_lanes64 (void)
{
  return 2;
}
530 static inline v_f64_t
v_f64(f64_t x)531 v_f64 (f64_t x)
532 {
533 return (v_f64_t){x, x};
534 }
535 static inline v_u64_t
v_u64(u64_t x)536 v_u64 (u64_t x)
537 {
538 return (v_u64_t){x, x};
539 }
540 static inline v_s64_t
v_s64(s64_t x)541 v_s64 (s64_t x)
542 {
543 return (v_s64_t){x, x};
544 }
545 static inline f64_t
v_get_f64(v_f64_t x,int i)546 v_get_f64 (v_f64_t x, int i)
547 {
548 return x[i];
549 }
550 static inline void
v_set_f64(v_f64_t * x,int i,f64_t v)551 v_set_f64 (v_f64_t *x, int i, f64_t v)
552 {
553 (*x)[i] = v;
554 }
555 /* true if any elements of a v_cond result is non-zero. */
556 static inline int
v_any_u64(v_u64_t x)557 v_any_u64 (v_u64_t x)
558 {
559 /* assume elements in x are either 0 or -1u. */
560 return vpaddd_u64 (x) != 0;
561 }
562 /* to wrap the result of relational operators. */
563 static inline v_u64_t
v_cond_u64(v_u64_t x)564 v_cond_u64 (v_u64_t x)
565 {
566 return x;
567 }
568 static inline v_f64_t
v_abs_f64(v_f64_t x)569 v_abs_f64 (v_f64_t x)
570 {
571 return vabsq_f64 (x);
572 }
573 static inline v_f64_t
v_fma_f64(v_f64_t x,v_f64_t y,v_f64_t z)574 v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
575 {
576 return vfmaq_f64 (z, x, y);
577 }
578 static inline v_f64_t
v_round_f64(v_f64_t x)579 v_round_f64 (v_f64_t x)
580 {
581 return vrndaq_f64 (x);
582 }
583 static inline v_s64_t
v_round_s64(v_f64_t x)584 v_round_s64 (v_f64_t x)
585 {
586 return vcvtaq_s64_f64 (x);
587 }
/* v_to_<type1>_<type2> (x): convert the value of each lane of the type2
   argument x to type1.  */
589 static inline v_f64_t
v_to_f64_s64(v_s64_t x)590 v_to_f64_s64 (v_s64_t x)
591 {
592 return (v_f64_t){x[0], x[1]};
593 }
594 static inline v_f64_t
v_to_f64_u64(v_u64_t x)595 v_to_f64_u64 (v_u64_t x)
596 {
597 return (v_f64_t){x[0], x[1]};
598 }
/* v_as_<type1>_<type2> (x): reinterpret the bits of the type2 vector x
   as type1, without any value conversion.  */
600 static inline v_u64_t
v_as_u64_f64(v_f64_t x)601 v_as_u64_f64 (v_f64_t x)
602 {
603 union { v_f64_t f; v_u64_t u; } r = {x};
604 return r.u;
605 }
606 static inline v_f64_t
v_as_f64_u64(v_u64_t x)607 v_as_f64_u64 (v_u64_t x)
608 {
609 union { v_u64_t u; v_f64_t f; } r = {x};
610 return r.f;
611 }
612 static inline v_s64_t
v_as_s64_u64(v_u64_t x)613 v_as_s64_u64 (v_u64_t x)
614 {
615 union { v_u64_t u; v_s64_t i; } r = {x};
616 return r.i;
617 }
618 static inline v_u64_t
v_as_u64_s64(v_s64_t x)619 v_as_u64_s64 (v_s64_t x)
620 {
621 union { v_s64_t i; v_u64_t u; } r = {x};
622 return r.u;
623 }
624 static inline v_f64_t
v_lookup_f64(const f64_t * tab,v_u64_t idx)625 v_lookup_f64 (const f64_t *tab, v_u64_t idx)
626 {
627 return (v_f64_t){tab[idx[0]], tab[idx[1]]};
628 }
629 static inline v_u64_t
v_lookup_u64(const u64_t * tab,v_u64_t idx)630 v_lookup_u64 (const u64_t *tab, v_u64_t idx)
631 {
632 return (v_u64_t){tab[idx[0]], tab[idx[1]]};
633 }
634 static inline v_f64_t
v_call_f64(f64_t (* f)(f64_t),v_f64_t x,v_f64_t y,v_u64_t p)635 v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
636 {
637 return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
638 }
639 #endif
640
641 #endif
642 #endif
643