/* Vectorized routines for ARM NEON intrinsics.
 *
 * This header file, unusually, provides many complete function
 * implementations so they can be inlined by the compiler.
 *
 * We expect to compile separately for ARMv7 versus AARCH64 platforms,
 * but 128-bit NEON intrinsics are essentially the same on both. A few
 * extra intrinsics are available on AARCH64 (ARMv8). Our autoconf
 * sets an extra preprocessor define, eslHAVE_NEON_AARCH64, when these
 * intrinsics are available. Code in esl_neon.[ch] works in both
 * cases.
 *
 * Contents:
 *    1. Data structures for ARM/Intel intrinsics compatibility
 *    2. Function declarations for esl_neon
 *    3. Inlined functions: horizontal max, sum
 *    4. Inlined functions: left, right shift
 *    5. Inlined functions: any_gt
 *    6. Inlined functions: select
 *
 */
#include "esl_config.h"
#ifdef  eslENABLE_NEON
#ifndef eslNEON_INCLUDED
#define eslNEON_INCLUDED
#include "esl_config.h"

#include "easel.h"
#include <stdio.h>
#include <arm_neon.h>

/*****************************************************************
 * 1. Data structures for ARM/Intel intrinsics compatibility
 *****************************************************************
 *
 * We tend to develop in x86 vector intrinsics (SSE/AVX/AVX-512),
 * then port to ARM NEON. It simplifies this process to have our ARM
 * port work in terms of variables that behave like x86 vector
 * variables.
 *
 * x86 vector code uses a single type for all views of a given
 * vector register; for example:
 *
 * __m128 a = _mm_and_ps(...)
 *
 * is used for any combination of element sizes and lane counts in
 * some Intel vector register mapped to the C variable 'a'.
 *
 * In contrast, on ARM NEON you specify both the element size and the
 * number of lanes when mapping a C variable onto a NEON register:
 *
 * int32x4_t a = vdupq_n_s32(...)
 *
 * We define here x86-style union types that encompass each different
 * NEON-style view of the 128-bit registers.
 */
typedef union
{
  int8x16_t   s8x16;
  int16x8_t   s16x8;
  int32x4_t   s32x4;
  int64x2_t   s64x2;
  int8x8x2_t  s8x8x2;
  uint8x16_t  u8x16;
  uint16x8_t  u16x8;
  uint32x4_t  u32x4;
  uint64x2_t  u64x2;
  uint8x8x2_t u8x8x2;
} esl_neon_128i_t;

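/* A minimal usage sketch (illustration, not part of the API): writing
 * through one union member and reading through another reinterprets
 * the same 128 bits at a different lane width, with no explicit
 * vreinterpretq call:
 *
 *    esl_neon_128i_t v;
 *    v.s16x8   = vdupq_n_s16(-1);            // set all eight s16 lanes to -1
 *    uint8_t b = vgetq_lane_u8(v.u8x16, 0);  // read the same bits as sixteen u8 lanes; b == 0xFF
 */
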
typedef union
{
  int8x8_t   s8x8;
  uint8x8_t  u8x8;
  int64x1_t  s64x1;
  uint64x1_t u64x1;
} esl_neon_64i_t;

/* Union type for vectorized floating point values. Note: AArch32 does
 * not support double-precision floating-point vector operations; these
 * were introduced in AArch64. */
typedef union
{
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
  float16x4_t f16x4;
#endif
  float32x2_t f32x2;
} esl_neon_64f_t;

typedef union
{
  float32x4_t f32x4;
} esl_neon_128f_t;

/* Union type for polynomial values. */
typedef union
{
  poly8x16_t p8x16;
  poly16x8_t p16x8;
} esl_neon_128p_t;

/* Composite types */
typedef union
{
  int8x8x2_t   s8x8x2;
  int16x4x2_t  s16x4x2;
  int32x2x2_t  s32x2x2;
  uint8x8x2_t  u8x8x2;
  uint16x4x2_t u16x4x2;
  uint32x2x2_t u32x2x2;
  uint64x1_t   u64x1;     /* useful for loading constants */
} esl_neon_128ic_t;

typedef union
{
  int8x16x2_t  s8x16x2;
  int16x8x2_t  s16x8x2;
  int32x4x2_t  s32x4x2;
  uint8x16x2_t u8x16x2;
  uint16x8x2_t u16x8x2;
  uint32x4x2_t u32x4x2;
} esl_neon_256ic_t;

typedef union
{
  float32x2x2_t f32x2x2;
} esl_neon_128fc_t;

typedef union
{
  float32x4x2_t f32x4x2;
} esl_neon_256fc_t;


/*****************************************************************
 * 2. Function declarations (from esl_neon.c)
 *****************************************************************/

extern esl_neon_128f_t  esl_neon_logf(esl_neon_128f_t x);
extern esl_neon_128f_t  esl_neon_expf(esl_neon_128f_t x);
extern void             esl_neon_dump_float(FILE *fp, esl_neon_128f_t v);

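/* A usage sketch (illustrative only): esl_neon_logf() and
 * esl_neon_expf() operate elementwise on all four float lanes.
 *
 *    esl_neon_128f_t v;
 *    v.f32x4 = vdupq_n_f32(1.0f);
 *    v = esl_neon_expf(v);             // each lane ~= e = 2.71828...
 *    esl_neon_dump_float(stdout, v);   // print the four lanes
 */
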
/*****************************************************************
 * 3. Inlined functions: horizontal max, sum
 *****************************************************************/

/* Function:  esl_neon_hmax_u8()
 * Synopsis:  Return max of 16 uint8_t elements in u8 vector.
 */
static inline uint8_t
esl_neon_hmax_u8(esl_neon_128i_t a)
{
#ifdef eslHAVE_NEON_AARCH64
  return vmaxvq_u8(a.u8x16);
#else
  /* ARMv7 fallback: max the vector against permutations of itself,
   * so that after four folds every lane holds the overall max. */
  a.u8x16 = vmaxq_u8(a.u8x16, vreinterpretq_u8_u32(vextq_u32(a.u32x4, a.u32x4, 2))); /* vs. 64-bit halves swapped     */
  a.u8x16 = vmaxq_u8(a.u8x16, vreinterpretq_u8_u32(vextq_u32(a.u32x4, a.u32x4, 1))); /* vs. rotation by 32 bits       */
  a.u8x16 = vmaxq_u8(a.u8x16, vreinterpretq_u8_u16(vrev64q_u16(a.u16x8)));           /* vs. u16 lanes reversed per 64 */
  a.u8x16 = vmaxq_u8(a.u8x16, vrev64q_u8(a.u8x16));                                  /* vs. u8 lanes reversed per 64  */
  return vgetq_lane_u8(a.u8x16, 15);
#endif
}

/* Function:  esl_neon_hmax_s8()
 * Synopsis:  Return max of 16 int8_t elements in s8 vector.
 */
static inline int8_t
esl_neon_hmax_s8(esl_neon_128i_t a)
{
#ifdef eslHAVE_NEON_AARCH64
  return vmaxvq_s8(a.s8x16);
#else
  /* ARMv7 fallback: the same fold as esl_neon_hmax_u8(), signed. */
  a.s8x16 = vmaxq_s8(a.s8x16, vreinterpretq_s8_s32(vextq_s32(a.s32x4, a.s32x4, 2)));
  a.s8x16 = vmaxq_s8(a.s8x16, vreinterpretq_s8_s32(vextq_s32(a.s32x4, a.s32x4, 1)));
  a.s8x16 = vmaxq_s8(a.s8x16, vreinterpretq_s8_s16(vrev64q_s16(a.s16x8)));
  a.s8x16 = vmaxq_s8(a.s8x16, vrev64q_s8(a.s8x16));
  return vgetq_lane_s8(a.s8x16, 15);
#endif
}

/* Function:  esl_neon_hmax_s16()
 * Synopsis:  Return max of 8 int16_t elements in s16 vector.
 */
static inline int16_t
esl_neon_hmax_s16(esl_neon_128i_t a)
{
#ifdef eslHAVE_NEON_AARCH64
  return vmaxvq_s16(a.s16x8);
#else
  /* ARMv7 fallback: three folds cover all 8 lanes. */
  a.s16x8 = vmaxq_s16(a.s16x8, vrev64q_s16(a.s16x8));                                  /* vs. s16 lanes reversed per 64 */
  a.s16x8 = vmaxq_s16(a.s16x8, vreinterpretq_s16_s32(vrev64q_s32(a.s32x4)));           /* vs. s32 lanes swapped per 64  */
  a.s16x8 = vmaxq_s16(a.s16x8, vreinterpretq_s16_s32(vextq_s32(a.s32x4, a.s32x4, 2))); /* vs. 64-bit halves swapped     */
  return vgetq_lane_s16(a.s16x8, 7);
#endif
}


/* Function:  esl_neon_hsum_float()
 * Synopsis:  Takes the horizontal sum of elements in a vector.
 *
 * Purpose:   Add the four float elements in vector <a>; return
 *            that sum in <*ret_sum>.
 */
static inline void
esl_neon_hsum_float(esl_neon_128f_t a, float *ret_sum)
{
#ifdef eslHAVE_NEON_AARCH64
  *ret_sum = vaddvq_f32(a.f32x4);
#else
  esl_neon_128f_t fvec;
  a.f32x4    = vaddq_f32(a.f32x4, vrev64q_f32(a.f32x4));  /* add pairs within each 64-bit half */
  fvec.f32x4 = vextq_f32(a.f32x4, a.f32x4, 2);            /* swap the two halves               */
  a.f32x4    = vaddq_f32(a.f32x4, fvec.f32x4);            /* every lane now holds the full sum */
  vst1q_lane_f32(ret_sum, a.f32x4, 0);                    /* store lane 0                      */
#endif
}
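/* A usage sketch (illustrative only):
 *
 *    float data[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
 *    esl_neon_128f_t v;
 *    float sum;
 *    v.f32x4 = vld1q_f32(data);
 *    esl_neon_hsum_float(v, &sum);   // sum == 10.0f
 */
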

/*****************************************************************
 * 4. Inlined functions: left, right shifts
 *****************************************************************/

/* Function:  esl_neon_rightshift_float()
 * Synopsis:  Shift vector elements to the right.
 *
 * Purpose:   Returns a vector containing
 *            <{ b[0] a[0] a[1] a[2] }>:
 *            i.e. shift the values in <a> to the
 *            right, and load the first value of
 *            <b> into the first slot.
 */
static inline esl_neon_128f_t
esl_neon_rightshift_float(esl_neon_128f_t a, esl_neon_128f_t b)
{
  register esl_neon_128f_t v;

  v.f32x4 = vrev64q_f32(b.f32x4);            /* b[1] b[0] b[3] b[2] */
  v.f32x4 = vextq_f32(v.f32x4, v.f32x4, 2);  /* b[3] b[2] b[1] b[0] */
  v.f32x4 = vextq_f32(v.f32x4, a.f32x4, 3);  /* b[0] a[0] a[1] a[2] */
  return v;
}
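/* For example: with a = { 1 2 3 4 } and b = { 9 8 7 6 },
 * esl_neon_rightshift_float(a, b) returns { 9 1 2 3 }. */
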

/* Function:  esl_neon_leftshift_float()
 * Synopsis:  Shift vector elements to the left.
 *
 * Purpose:   Returns a vector containing
 *            <{ a[1] a[2] a[3] b[0] }>:
 *            i.e. shift the values in <a> to the
 *            left, and load the first value of
 *            <b> into the last slot.
 */
static inline esl_neon_128f_t
esl_neon_leftshift_float(esl_neon_128f_t a, esl_neon_128f_t b)
{
  register esl_neon_128f_t v;
  v.f32x4 = vextq_f32(a.f32x4, b.f32x4, 1);  /* now a[1] a[2] a[3] b[0] */
  return v;
}
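/* For example: with a = { 1 2 3 4 } and b = { 9 8 7 6 },
 * esl_neon_leftshift_float(a, b) returns { 2 3 4 9 }. */
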


/*****************************************************************
 * 5. Inlined functions: any_gt
 *****************************************************************/

/* Function:  esl_neon_any_gt_s16()
 * Synopsis:  Returns TRUE if any a[z] > b[z].
 *
 * Purpose:   Return TRUE if any <a[z] > b[z]> for <z=0..7>
 *            in two <s16> vectors.
 */
static inline int
esl_neon_any_gt_s16(esl_neon_128i_t a, esl_neon_128i_t b)
{
  esl_neon_128i_t mask;
  int64_t         l0, l1;
  int64_t         maskbits;

  mask.u16x8 = vcgtq_s16(a.s16x8, b.s16x8);   /* lanes where a[z] > b[z] are all 1's */
  l0         = vgetq_lane_u64(mask.u64x2, 0);
  l1         = vgetq_lane_u64(mask.u64x2, 1);
  maskbits   = l0 | l1;
  return maskbits != 0;
}

/* Function:  esl_neon_any_gt_float()
 * Synopsis:  Returns TRUE if any a[z] > b[z].
 *
 * Purpose:   Returns TRUE if any a[z] > b[z] in two
 *            <ps> vectors of floats.
 *
 * Note:      Ported from esl_sse.c::esl_sse_any_gt_float().
 */
static inline int
esl_neon_any_gt_float(esl_neon_128f_t a, esl_neon_128f_t b)
{
  esl_neon_128i_t mask;
  int64_t         l0, l1;
  int64_t         maskbits;

  mask.u32x4 = vcgtq_f32(a.f32x4, b.f32x4);   /* lanes where a[z] > b[z] are all 1's */
  l0         = vgetq_lane_u64(mask.u64x2, 0); /* int64_t, not int: truncating to 32 bits would lose the odd lanes */
  l1         = vgetq_lane_u64(mask.u64x2, 1);
  maskbits   = l0 | l1;
  return maskbits != 0;
}


/*****************************************************************
 * 6. Inlined functions: select
 *****************************************************************/

/* Function:  esl_neon_select_float()
 * Synopsis:  NEON equivalent of <vec_sel()>.
 *
 * Purpose:   Vector select. Returns a vector <r[z] = a[z]> where <mask[z]>
 *            is all 0's; <r[z] = b[z]> where <mask[z]> is all 1's.
 *
 *            Useful for avoiding conditional branches. For example,
 *            to implement \ccode{if (a > 0) a += a;}:
 *
 *            \begin{cchunk}
 *              esl_neon_128f_t mask, twoa;
 *              mask.f32x4 = vreinterpretq_f32_u32(vcgtq_f32(a.f32x4, vdupq_n_f32(0.0f)));
 *              twoa.f32x4 = vaddq_f32(a.f32x4, a.f32x4);
 *              a          = esl_neon_select_float(a, twoa, mask);
 *            \end{cchunk}
 */
static inline esl_neon_128f_t
esl_neon_select_float(esl_neon_128f_t a, esl_neon_128f_t b, esl_neon_128f_t mask)
{
  esl_neon_128i_t aview, bview, maskview, masknot;
  esl_neon_128f_t ret;

  maskview.s64x2 = vreinterpretq_s64_f32(mask.f32x4);
  bview.s64x2    = vreinterpretq_s64_f32(b.f32x4);
  aview.s64x2    = vreinterpretq_s64_f32(a.f32x4);
  bview.s64x2    = vandq_s64(bview.s64x2, maskview.s64x2);  /* b & mask  */
  masknot.s32x4  = vmvnq_s32(maskview.s32x4);               /* ~mask     */
  aview.s64x2    = vandq_s64(masknot.s64x2, aview.s64x2);   /* a & ~mask */
  ret.f32x4      = vreinterpretq_f32_s64(vorrq_s64(aview.s64x2, bview.s64x2)); /* (a & ~mask) | (b & mask) */
  return ret;
}


#endif // eslNEON_INCLUDED
#endif // eslENABLE_NEON