1 /*
2 * Copyright (c), Recep Aslantas.
3 *
4 * MIT License (MIT), http://opensource.org/licenses/MIT
5 * Full license can be found in the LICENSE file
6 */
7
8 #ifndef cglm_simd_arm_h
9 #define cglm_simd_arm_h
10 #include "intrin.h"
11 #ifdef CGLM_SIMD_ARM
12
/* AArch64 detection: MSVC targets (_M_ARM64, hybrid, ARM64EC) and
   GCC/Clang (__aarch64__); 32-bit ARM NEON leaves CGLM_ARM64 undefined */
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
#  define CGLM_ARM64 1
#endif

/* load / store 4 packed floats */
#define glmm_load(p)      vld1q_f32(p)
#define glmm_store(p, a)  vst1q_f32(p, a)

/* broadcast scalar x to all four lanes */
#define glmm_set1(x) vdupq_n_f32(x)
/* native 128-bit SIMD vector type on NEON */
#define glmm_128     float32x4_t

/* broadcast one lane of x to all four lanes */
#define glmm_splat_x(x) vdupq_lane_f32(vget_low_f32(x), 0)
#define glmm_splat_y(x) vdupq_lane_f32(vget_low_f32(x), 1)
#define glmm_splat_z(x) vdupq_lane_f32(vget_high_f32(x), 0)
#define glmm_splat_w(x) vdupq_lane_f32(vget_high_f32(x), 1)

/* bitwise XOR of float vectors via s32 reinterpret
   (NEON has no float XOR; bit pattern is preserved) */
#define glmm_xor(a, b)                                                        \
  vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a),                   \
                                  vreinterpretq_s32_f32(b)))

/* swap the two 2-float halves: [x y z w] -> [z w x y] */
#define glmm_swplane(v) vextq_f32(v, v, 2)
/* low / high 2-float half of a 4-float vector */
#define glmm_low(x)     vget_low_f32(x)
#define glmm_high(x)    vget_high_f32(x)

/* combine halves of x and y into one 4-float vector;
   suffix letters pick the half: l = low, h = high (first from x, second from y) */
#define glmm_combine_ll(x, y) vcombine_f32(vget_low_f32(x),  vget_low_f32(y))
#define glmm_combine_hl(x, y) vcombine_f32(vget_high_f32(x), vget_low_f32(y))
#define glmm_combine_lh(x, y) vcombine_f32(vget_low_f32(x),  vget_high_f32(y))
#define glmm_combine_hh(x, y) vcombine_f32(vget_high_f32(x), vget_high_f32(y))
40
/*!
 * @brief componentwise absolute value of v
 */
static inline
float32x4_t
glmm_abs(float32x4_t v) {
  float32x4_t r;
  r = vabsq_f32(v);
  return r;
}
46
/*!
 * @brief horizontal add, broadcast: every lane of the result is x+y+z+w
 */
static inline
float32x4_t
glmm_vhadd(float32x4_t v) {
  float32x4_t xy, zw;
  xy = vaddq_f32(glmm_splat_x(v), glmm_splat_y(v));
  zw = vaddq_f32(glmm_splat_z(v), glmm_splat_w(v));
  return vaddq_f32(xy, zw);
  /*
   this seems slower:
   v = vaddq_f32(v, vrev64q_f32(v));
   return vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
  */
}
58
/*!
 * @brief horizontal add, scalar: returns x + y + z + w
 */
static inline
float
glmm_hadd(float32x4_t v) {
#if CGLM_ARM64
  /* AArch64 has a single across-vector add */
  return vaddvq_f32(v);
#else
  /* pairwise reductions: after two steps every lane holds the total */
  float32x4_t t;
  t = vaddq_f32(v, vrev64q_f32(v));
  t = vaddq_f32(t, vcombine_f32(vget_high_f32(t), vget_low_f32(t)));
  return vgetq_lane_f32(t, 0);
#endif
}
70
/*!
 * @brief horizontal minimum: returns min(x, y, z, w)
 */
static inline
float
glmm_hmin(float32x4_t v) {
  float32x2_t p;
  /* two pairwise-min steps reduce four lanes to one */
  p = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
  return vget_lane_f32(vpmin_f32(p, p), 0);
}
79
/*!
 * @brief horizontal maximum: returns max(x, y, z, w)
 */
static inline
float
glmm_hmax(float32x4_t v) {
  float32x2_t p;
  /* two pairwise-max steps reduce four lanes to one */
  p = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
  return vget_lane_f32(vpmax_f32(p, p), 0);
}
88
/*!
 * @brief 4D dot product of a and b
 */
static inline
float
glmm_dot(float32x4_t a, float32x4_t b) {
  float32x4_t prod;
  prod = vmulq_f32(a, b);
  return glmm_hadd(prod);
}
94
/*!
 * @brief Euclidean norm (length) of a
 */
static inline
float
glmm_norm(float32x4_t a) {
  float sq;
  sq = glmm_dot(a, a);
  return sqrtf(sq);
}
100
/*!
 * @brief squared Euclidean norm of a (avoids the sqrt)
 */
static inline
float
glmm_norm2(float32x4_t a) {
  float n2;
  n2 = glmm_dot(a, a);
  return n2;
}
106
/*!
 * @brief L1 norm of a: sum of absolute component values
 */
static inline
float
glmm_norm_one(float32x4_t a) {
  float32x4_t m;
  m = glmm_abs(a);
  return glmm_hadd(m);
}
112
/*!
 * @brief infinity norm of a: largest absolute component value
 */
static inline
float
glmm_norm_inf(float32x4_t a) {
  float32x4_t m;
  m = glmm_abs(a);
  return glmm_hmax(m);
}
118
/*!
 * @brief componentwise division a / b
 *
 * AArch64 has a hardware divide; 32-bit NEON does not, so fall back to a
 * reciprocal estimate refined by two Newton-Raphson steps, then multiply.
 */
static inline
float32x4_t
glmm_div(float32x4_t a, float32x4_t b) {
#if CGLM_ARM64
  return vdivq_f32(a, b);
#else
  float32x4_t est;
  est = vrecpeq_f32(b);                    /* initial estimate of 1/b   */
  est = vmulq_f32(vrecpsq_f32(est, b), est); /* refinement step 1       */
  est = vmulq_f32(vrecpsq_f32(est, b), est); /* refinement step 2       */
  return vmulq_f32(a, est);
#endif
}
135
/*!
 * @brief multiply-add: returns a * b + c
 *
 * On AArch64 this is a true fused multiply-add (single rounding);
 * on 32-bit NEON vmlaq_f32 may lower to separate multiply and add.
 */
static inline
float32x4_t
glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
  return vfmaq_f32(c, a, b); /* why vfmaq_f32 is slower than vmlaq_f32 ??? */
#else
  return vmlaq_f32(c, a, b);
#endif
}
145
/*!
 * @brief negated multiply-add: returns c - a * b
 *
 * On AArch64 this is fused (single rounding); on 32-bit NEON vmlsq_f32
 * may lower to separate multiply and subtract.
 */
static inline
float32x4_t
glmm_fnmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
  return vfmsq_f32(c, a, b);
#else
  return vmlsq_f32(c, a, b);
#endif
}
155
/*!
 * @brief multiply-subtract: returns a * b - c
 *
 * Fix: the previous body was byte-identical to glmm_fnmadd and returned
 * c - a * b. By the conventional FMA naming (matching x86 _mm_fmsub_ps),
 * fmsub must be a * b - c. Negating the fused c - a * b preserves the
 * single rounding on AArch64.
 */
static inline
float32x4_t
glmm_fmsub(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
  return vnegq_f32(vfmsq_f32(c, a, b)); /* -(c - a*b) == a*b - c */
#else
  return vnegq_f32(vmlsq_f32(c, a, b));
#endif
}
165
/*!
 * @brief negated multiply-subtract: returns -(a * b) - c
 */
static inline
float32x4_t
glmm_fnmsub(float32x4_t a, float32x4_t b, float32x4_t c) {
  float32x4_t s;
  s = glmm_fmadd(a, b, c);
  /* subtract from zero rather than vnegq to keep the original's
     exact sign/rounding behavior: 0 - (a*b + c) */
  return vsubq_f32(vdupq_n_f32(0.0f), s);
}
171
172 #endif
173 #endif /* cglm_simd_arm_h */
174