//          Copyright Naoki Shibata 2010 - 2017.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

#ifndef __ARM_NEON
#error Please specify -mfpu=neon.
#endif

#ifdef __aarch64__
#warning This implementation is for AARCH32.
#endif

#define ENABLE_SP
#define LOG2VECTLENSP 2
#define VECTLENSP (1 << LOG2VECTLENSP)

#define ISANAME "AARCH32 NEON"
#define DFTPRIORITY 10

#define ENABLE_RECSQRT_SP
#define SPLIT_KERNEL

#include <arm_neon.h>

typedef uint32x4_t vmask;
typedef uint32x4_t vopmask;

//typedef int32x4_t vint;

typedef float32x4_t vfloat;
typedef int32x4_t vint2;

//

static INLINE void vprefetch_v_p(const void *ptr) { }

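// Reduce an opmask: AND the two halves, then take a pairwise minimum, so the
// result is nonzero only when every 32-bit lane is set (lanes are assumed to
// be either all-ones or all-zeros).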
static INLINE int vtestallones_i_vo32(vopmask g) {
  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
  uint32x2_t x1 = vpmin_u32(x0, x0);
  return vget_lane_u32(x1, 0);
}

static vfloat vloaduf(float *p) { return vld1q_f32(p); }
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }

static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static void vstoreu_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }

//

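// Bitwise helpers.  Note that the vandnot* wrappers compute y & ~x, so the
// operands of vbicq are swapped relative to the wrapper's argument order.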
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }

static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vbicq_u32(y, x); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); }

static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return veorq_u32(x, y); }

static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); }

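// Convert between the 64-bit-lane and 32-bit-lane opmask layouts: UZP packs
// the low word of each 64-bit lane together, while ZIP duplicates each
// 32-bit word across both halves of a 64-bit lane.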
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; }
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; }

//

static INLINE vmask vcast_vm_i_i(int i0, int i1) { return (vmask)vdupq_n_u64((uint64_t)i0 | (((uint64_t)i1) << 32)); }
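// 64-bit lane-wise equality built from 32-bit compares: a 64-bit lane counts
// as equal only if both of its 32-bit halves matched, so the compare result
// is ANDed with a copy whose halves are swapped within each 64-bit lane.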
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
  uint32x4_t t = vceqq_u32(x, y);
  return vandq_u32(t, vrev64q_u32(t));
}

//

static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
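// Round to nearest (ties away from zero): add 0.5 with the sign of d copied
// onto it, then let vcvtq_s32_f32 truncate toward zero.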
static INLINE vint2 vrint_vi2_vf(vfloat d) {
  return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f))));
}
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }

static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }

static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }

static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); }

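// AARCH32 NEON has no vector divide instruction.  Division starts from the
// vrecpe estimate of 1/d, refines it once with vrecps, and applies a final
// Newton-Raphson correction to the quotient, t*(2 - x*d).  vrec_vf_vf uses
// the same scheme for a plain reciprocal.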
static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
  float32x4_t x = vrecpeq_f32(d);
  x = vmulq_f32(x, vrecpsq_f32(d, x));
  float32x4_t t = vmulq_f32(n, x);
  return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d);
}

static INLINE vfloat vrec_vf_vf(vfloat d) {
  float32x4_t x = vrecpeq_f32(d);
  x = vmulq_f32(x, vrecpsq_f32(d, x));
  return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d);
}

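// Square root from the vrsqrte estimate: one vrsqrts refinement of
// x ~ 1/sqrt(d), one Newton step on u = x*d (u += (d - u*u) * x/2), and a
// final mask-out so that sqrt(0) returns 0 instead of 0*inf = NaN.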
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
  float32x4_t x = vrsqrteq_f32(d);
  x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
  float32x4_t u = vmulq_f32(x, d);
  u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
  return (float32x4_t)vbicq_u32((uint32x4_t)u, vceqq_f32(d, vdupq_n_f32(0.0f)));
}

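// Reciprocal square root: vrsqrte estimate, one vrsqrts refinement, then an
// explicit Newton step x += (1 - x*(x*d)) * x/2.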
static INLINE vfloat vrecsqrt_vf_vf(vfloat d) {
  float32x4_t x = vrsqrteq_f32(d);
  x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
  return vmlaq_f32(x, vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));
}

static INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
static INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); }

static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }

static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }

static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); }

static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vandq_u32(x, (vopmask)y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vbicq_u32((vopmask)y, x); }

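// Clang checks that the shift count of the *_n shift intrinsics is a literal
// constant even inside an inline wrapper, so macros are used for clang; GCC
// accepts the wrapper functions once they are inlined with a constant count.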
#if defined(__clang__)
#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
#else
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return (int32x4_t) vshlq_n_u32((uint32x4_t)x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return (int32x4_t) vshrq_n_u32((uint32x4_t)x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vshrq_n_s32(x, c); }
#endif

static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vceqq_s32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgtq_s32(x, y); }

static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); }

static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
  return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y);
}

static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}

static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
  float p[4];
  vst1q_f32 (p, v);
  return p[0];
}

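// Availability check used by the dispatcher: performs an actual NEON
// single-precision add, so it only reports success if NEON code executes on
// this CPU.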
static INLINE int vavailability_i(int name) {
  if (name != 2) return 0;
  return vcast_f_vf(vadd_vf_vf_vf(vcast_vf_f(name), vcast_vf_f(name))) != 0.0;
}


static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }

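// Sign-bit masks that flip the sign of either the odd-indexed (PNMASKf) or
// the even-indexed (NPMASKf) lanes; they implement the alternating
// subtract/add helpers below.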
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }

static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }

static INLINE vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }

static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }

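// Scatter the vector as two float pairs: the low pair goes to
// ptr + offset*2 and the high pair to ptr + (offset + step)*2.  The
// streaming variant falls back to ordinary stores on this target.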
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}

static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}

// TODO

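// Stub: always returns 0 ("not all lanes are <= lim"), keeping callers on
// their conservative path.  A full implementation could presumably combine
// vle_vo_vf_vf with vtestallones_i_vo32.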
static INLINE int vall_lte32_i_vf_vf(vfloat x, vfloat lim) {
  return 0;
}