/*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */

#ifndef cglm_simd_arm_h
#define cglm_simd_arm_h
#include "intrin.h"
#ifdef CGLM_SIMD_ARM

#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
# define CGLM_ARM64 1
#endif

#define glmm_load(p)      vld1q_f32(p)
#define glmm_store(p, a)  vst1q_f32(p, a)

#define glmm_set1(x) vdupq_n_f32(x)
#define glmm_128     float32x4_t

#define glmm_splat_x(x) vdupq_lane_f32(vget_low_f32(x),  0)
#define glmm_splat_y(x) vdupq_lane_f32(vget_low_f32(x),  1)
#define glmm_splat_z(x) vdupq_lane_f32(vget_high_f32(x), 0)
#define glmm_splat_w(x) vdupq_lane_f32(vget_high_f32(x), 1)
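
/*
 * Illustrative note (added, hedged): each splat macro broadcasts one lane of
 * a float32x4_t to all four lanes. For example, with v = {1, 2, 3, 4}:
 *
 *   glmm_128 s = glmm_splat_y(v);   // s = {2, 2, 2, 2}
 *
 * The variable names here are hypothetical and only for illustration.
 */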

#define glmm_xor(a, b)                                                        \
  vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a),                   \
                                  vreinterpretq_s32_f32(b)))

#define glmm_swplane(v) vextq_f32(v, v, 2)
#define glmm_low(x)     vget_low_f32(x)
#define glmm_high(x)    vget_high_f32(x)

#define glmm_combine_ll(x, y) vcombine_f32(vget_low_f32(x),  vget_low_f32(y))
#define glmm_combine_hl(x, y) vcombine_f32(vget_high_f32(x), vget_low_f32(y))
#define glmm_combine_lh(x, y) vcombine_f32(vget_low_f32(x),  vget_high_f32(y))
#define glmm_combine_hh(x, y) vcombine_f32(vget_high_f32(x), vget_high_f32(y))
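
/*
 * Illustrative note (added, hedged): with v = {x, y, z, w},
 * glmm_swplane(v) = {z, w, x, y} (the two 64-bit halves swapped), and
 * glmm_combine_hl(a, b) = {a.z, a.w, b.x, b.y}; the remaining combine_*
 * variants follow the same low/high naming. A minimal sketch with
 * hypothetical values a = {1, 2, 3, 4} and b = {5, 6, 7, 8}:
 *
 *   glmm_128 r = glmm_combine_lh(a, b);   // r = {1, 2, 7, 8}
 */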

static inline
float32x4_t
glmm_abs(float32x4_t v) {
  return vabsq_f32(v);
}

static inline
float32x4_t
glmm_vhadd(float32x4_t v) {
  return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
                   vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
  /*
   this seems slower:
   v = vaddq_f32(v, vrev64q_f32(v));
   return vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
   */
}
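
/*
 * Note (added for clarity): glmm_vhadd returns the horizontal sum broadcast
 * to every lane, e.g. for v = {1, 2, 3, 4} the result is {10, 10, 10, 10};
 * glmm_hadd below returns the same sum as a scalar.
 */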

static inline
float
glmm_hadd(float32x4_t v) {
#if CGLM_ARM64
  return vaddvq_f32(v);
#else
  v = vaddq_f32(v, vrev64q_f32(v));
  v = vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
  return vgetq_lane_f32(v, 0);
#endif
}

static inline
float
glmm_hmin(float32x4_t v) {
  float32x2_t t;
  t = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
  t = vpmin_f32(t, t);
  return vget_lane_f32(t, 0);
}

static inline
float
glmm_hmax(float32x4_t v) {
  float32x2_t t;
  t = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
  t = vpmax_f32(t, t);
  return vget_lane_f32(t, 0);
}

static inline
float
glmm_dot(float32x4_t a, float32x4_t b) {
  return glmm_hadd(vmulq_f32(a, b));
}

static inline
float
glmm_norm(float32x4_t a) {
  return sqrtf(glmm_dot(a, a));
}

static inline
float
glmm_norm2(float32x4_t a) {
  return glmm_dot(a, a);
}

static inline
float
glmm_norm_one(float32x4_t a) {
  return glmm_hadd(glmm_abs(a));
}

static inline
float
glmm_norm_inf(float32x4_t a) {
  return glmm_hmax(glmm_abs(a));
}
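
/*
 * Usage sketch (added; illustrative only, not part of cglm's public API):
 * normalizing a vec4 in memory with the helpers above. The array v and the
 * locals are hypothetical.
 *
 *   float    v[4] = {3.0f, 0.0f, 4.0f, 0.0f};
 *   glmm_128 x    = glmm_load(v);
 *   float    n    = glmm_norm(x);                       // 5.0f
 *   if (n > 0.0f)
 *     glmm_store(v, vmulq_f32(x, glmm_set1(1.0f / n))); // v = {0.6, 0, 0.8, 0}
 */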

static inline
float32x4_t
glmm_div(float32x4_t a, float32x4_t b) {
#if CGLM_ARM64
  return vdivq_f32(a, b);
#else
  /* no vector divide on 32-bit NEON: approximate 1/b with vrecpeq_f32 (a
     rough initial estimate), then apply two Newton-Raphson refinement steps.
     vrecpsq_f32(r, b) computes (2 - r * b), so r * vrecpsq_f32(r, b) is one
     refinement step; two steps bring the reciprocal close to full single
     precision. */
  float32x4_t r0, r1;
  r0 = vrecpeq_f32(b);
  r1 = vrecpsq_f32(r0, b);
  r0 = vmulq_f32(r1, r0);
  r1 = vrecpsq_f32(r0, b);
  r0 = vmulq_f32(r1, r0);
  return vmulq_f32(a, r0);
#endif
}

static inline
float32x4_t
glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
  return vfmaq_f32(c, a, b); /* why is vfmaq_f32 slower than vmlaq_f32? */
#else
  return vmlaq_f32(c, a, b);
#endif
}
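
/*
 * Note (added, hedged): vfmaq_f32 is a true fused multiply-add (single
 * rounding), while vmlaq_f32 may be lowered to a separate multiply and add,
 * so the two can differ slightly in the last bit as well as in speed.
 */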

static inline
float32x4_t
glmm_fnmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
  return vfmsq_f32(c, a, b);
#else
  return vmlsq_f32(c, a, b);
#endif
}

static inline
float32x4_t
glmm_fmsub(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
  return vfmsq_f32(c, a, b);
#else
  return vmlsq_f32(c, a, b);
#endif
}

static inline
float32x4_t
glmm_fnmsub(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vsubq_f32(vdupq_n_f32(0.0f), glmm_fmadd(a, b, c));
}

#endif
#endif /* cglm_simd_arm_h */