1 /* Vectorized routines for ARM NEON intrinsics.
2 *
3 * This header file, unusually, provides many complete function
4 * implementations so they can be inlined by the compiler.
5 *
6 * We expect to compile separately for ARMv7 versus AARCH64 platforms,
7 * but 128bit NEON intrinsics are essentially the same on both. A few
8 * extra intrinsics are available on AARCH64 (ARMv8). Our autoconf
9 * sets an extra preprocessor define, eslHAVE_NEON_AARCH64, when these
10 * intrinsics are available. Code in esl_neon.[ch] works in both
11 * cases.
12 *
13 * Contents:
14 * 1. Data structures for ARM/Intel intrinsics compatibility
15 * 2. Function declarations for esl_neon
16 * 3. Inlined functions: horizontal max, sum
17 * 4. Inlined functions: left, right shift
18 * 5. Inlined functions: any_gt
19 * 6. Inlined functions: select
20 *
21 */
#include "esl_config.h"
#ifdef eslENABLE_NEON
#ifndef eslNEON_INCLUDED
#define eslNEON_INCLUDED
#include "esl_config.h"

#include "easel.h"
#include <stdio.h>
#include <stdint.h>
#include <arm_neon.h>
31
32
33 /*****************************************************************
34 * 1. Data structures for ARM/Intel intrinsics compatibility
35 *****************************************************************
36 *
37 * We tend to develop in x86 vector intrinsics (SSE/AVX/AVX512) then
38 * port to ARM NEON. It simplifies this process to have our ARM port
39 * work in terms of variables that work like x86 vector variables.
40 *
41 * x86 vector code utilizes a single type for each view of its vector
42 * registers; for example:
43 *
44 * __m128 a = _mm_and_ps(...)
45 *
46 * would be used for any combination of element sizes and lane numbers
47 * for some Intel vector register mapped to the C variable 'a'.
48 *
49 * In contrast, on ARM NEON you specify both the element size and the
50 * number of lanes when mapping a C variable onto a NEON register:
51 *
52 * uint32x4_t a = vdupq_n_s32(...)
53 *
54 * We define here x86-style union types that encompass each different
55 * NEON-style view of the 128-bit registers.
56 */
/* 128-bit integer vector.
 * Each member is a different NEON "view" (element size x lane count) of the
 * same 128 bits, mimicking x86's single __m128i type. Reading a different
 * union member than was last written is the intended use (type punning
 * through a union is well-defined in C).
 */
typedef union
{
int8x16_t s8x16;      /* 16 x int8   */
int16x8_t s16x8;      /*  8 x int16  */
int32x4_t s32x4;      /*  4 x int32  */
int64x2_t s64x2;      /*  2 x int64  */
int8x8x2_t s8x8x2;    /* pair of 8 x int8 half-vectors  */
uint8x16_t u8x16;     /* 16 x uint8  */
uint16x8_t u16x8;     /*  8 x uint16 */
uint32x4_t u32x4;     /*  4 x uint32 */
uint64x2_t u64x2;     /*  2 x uint64 */
uint8x8x2_t u8x8x2;   /* pair of 8 x uint8 half-vectors */
} esl_neon_128i_t;

/* 64-bit integer vector: views of a single NEON "D" register. */
typedef union
{
int8x8_t s8x8;        /* 8 x int8   */
uint8x8_t u8x8;       /* 8 x uint8  */
int64x1_t s64x1;      /* 1 x int64  */
uint64x1_t u64x1;     /* 1 x uint64 */
} esl_neon_64i_t;

/* Union type for vectorized floating point values. Note: AArch32 does not
 * allow double-precision floating-point vector operations; this was newly
 * introduced in AArch64. */
typedef union
{
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
float16x4_t f16x4;    /* 4 x half-precision; only when the FP16 format is available */
#endif
float32x2_t f32x2;    /* 2 x float32 */
} esl_neon_64f_t;

/* 128-bit float vector; single view, 4 x float32. */
typedef union
{
float32x4_t f32x4;
} esl_neon_128f_t;

/* Union type for polynomial values (NEON polynomial element types). */
typedef union
{
poly8x16_t p8x16;     /* 16 x poly8  */
poly16x8_t p16x8;     /*  8 x poly16 */
} esl_neon_128p_t;

/* Composite types: pairs of 64-bit vectors, as produced/consumed by
 * NEON two-register intrinsics (e.g. vzip/vuzp/vtrn families). */
typedef union
{
int8x8x2_t s8x8x2;
int16x4x2_t s16x4x2;
int32x2x2_t s32x2x2;
uint8x8x2_t u8x8x2;
uint16x4x2_t u16x4x2;
uint32x2x2_t u32x2x2;
uint64x1_t u64x1; /* useful for loading constants */
} esl_neon_128ic_t;

/* Pairs of 128-bit integer vectors: 256 bits total. */
typedef union
{
int8x16x2_t s8x16x2;
int16x8x2_t s16x8x2;
int32x4x2_t s32x4x2;
uint8x16x2_t u8x16x2;
uint16x8x2_t u16x8x2;
uint32x4x2_t u32x4x2;
} esl_neon_256ic_t;

/* Pair of 2 x float32 vectors (128 bits total). */
typedef union
{
float32x2x2_t f32x2x2;
} esl_neon_128fc_t;

/* Pair of 4 x float32 vectors (256 bits total). */
typedef union
{
float32x4x2_t f32x4x2;
} esl_neon_256fc_t;
133
134
135
136 /*****************************************************************
137 * 2. Function declarations (from esl_neon.c)
138 *****************************************************************/
139
extern esl_neon_128f_t esl_neon_logf(esl_neon_128f_t x);                 /* vectorized logf(): elementwise natural log */
extern esl_neon_128f_t esl_neon_expf(esl_neon_128f_t x);                 /* vectorized expf(): elementwise exponential  */
extern void            esl_neon_dump_float(FILE *fp, esl_neon_128f_t v); /* print vector <v> to stream <fp>, for debugging */
143
144
145 /*****************************************************************
146 * 3. Inlined functions: horizontal max, sum
147 *****************************************************************/
148
149
150
/* Function:  esl_neon_hmax_u8()
 * Synopsis:  Horizontal max: largest of the 16 uint8_t elements in <a>.
 */
static inline uint8_t
esl_neon_hmax_u8(esl_neon_128i_t a)
{
#ifdef eslHAVE_NEON_AARCH64
  return vmaxvq_u8(a.u8x16);   /* ARMv8 provides a single horizontal-max instruction */
#else
  /* Log2-style reduction: repeatedly max the vector against rotated
   * copies of itself (64-bit, 32-bit, 16-bit, then 8-bit granularity),
   * so that lane 15 ends up holding the overall maximum.
   */
  esl_neon_128i_t r = a;
  r.u8x16 = vmaxq_u8(r.u8x16, vreinterpretq_u8_u32(vextq_u32(r.u32x4, r.u32x4, 2)));
  r.u8x16 = vmaxq_u8(r.u8x16, vreinterpretq_u8_u32(vextq_u32(r.u32x4, r.u32x4, 1)));
  r.u8x16 = vmaxq_u8(r.u8x16, vreinterpretq_u8_u16(vrev64q_u16(r.u16x8)));
  r.u8x16 = vmaxq_u8(r.u8x16, vrev64q_u8(r.u8x16));
  return vgetq_lane_u8(r.u8x16, 15);
#endif
}
167
/* Function:  esl_neon_hmax_s8()
 * Synopsis:  Horizontal max: largest of the 16 int8_t elements in <a>.
 */
static inline int8_t
esl_neon_hmax_s8(esl_neon_128i_t a)
{
#ifdef eslHAVE_NEON_AARCH64
  return vmaxvq_s8(a.s8x16);   /* ARMv8 provides a single horizontal-max instruction */
#else
  /* Log2-style reduction against rotated copies of the vector; lane 15
   * ends up holding the overall maximum. Mirrors esl_neon_hmax_u8().
   */
  esl_neon_128i_t r = a;
  r.s8x16 = vmaxq_s8(r.s8x16, vreinterpretq_s8_s32(vextq_s32(r.s32x4, r.s32x4, 2)));
  r.s8x16 = vmaxq_s8(r.s8x16, vreinterpretq_s8_s32(vextq_s32(r.s32x4, r.s32x4, 1)));
  r.s8x16 = vmaxq_s8(r.s8x16, vreinterpretq_s8_s16(vrev64q_s16(r.s16x8)));
  r.s8x16 = vmaxq_s8(r.s8x16, vrev64q_s8(r.s8x16));
  return vgetq_lane_s8(r.s8x16, 15);
#endif
}
184
/* Function:  esl_neon_hmax_s16()
 * Synopsis:  Horizontal max: largest of the 8 int16_t elements in <a>.
 */
static inline int16_t
esl_neon_hmax_s16(esl_neon_128i_t a)
{
#ifdef eslHAVE_NEON_AARCH64
  return vmaxvq_s16(a.s16x8);  /* ARMv8 provides a single horizontal-max instruction */
#else
  /* Log2-style reduction at 16-, 32-, then 64-bit granularity; lane 7
   * ends up holding the overall maximum.
   */
  esl_neon_128i_t r = a;
  r.s16x8 = vmaxq_s16(r.s16x8, vrev64q_s16(r.s16x8));
  r.s16x8 = vmaxq_s16(r.s16x8, vreinterpretq_s16_s32(vrev64q_s32(r.s32x4)));
  r.s16x8 = vmaxq_s16(r.s16x8, vreinterpretq_s16_s32(vextq_s32(r.s32x4, r.s32x4, 2)));
  return vgetq_lane_s16(r.s16x8, 7);
#endif
}
200
201
/* Function:  esl_neon_hsum_float()
 * Synopsis:  Takes the horizontal sum of elements in a vector.
 *
 * Purpose:   Add the four float elements in vector <a>; return
 *            that sum in <*ret_sum>.
 */
static inline void
esl_neon_hsum_float(esl_neon_128f_t a, float *ret_sum)
{
#ifdef eslHAVE_NEON_AARCH64
  *ret_sum = vaddvq_f32(a.f32x4);   /* ARMv8 one-instruction horizontal add */
#else
  esl_neon_128f_t sum, rot;
  /* First add adjacent pairs: lanes become { a0+a1, a1+a0, a2+a3, a3+a2 } */
  sum.f32x4 = vaddq_f32(a.f32x4, vrev64q_f32(a.f32x4));
  /* Then add the two 64-bit halves; every lane now holds the total. */
  rot.f32x4 = vextq_f32(sum.f32x4, sum.f32x4, 2);
  sum.f32x4 = vaddq_f32(sum.f32x4, rot.f32x4);
  vst1q_lane_f32(ret_sum, sum.f32x4, 0);
#endif
}
221
222 /*****************************************************************
223 * 4. Inlined functions: left, right shifts
224 *****************************************************************/
225
226
/* Function:  esl_neon_rightshift_float()
 * Synopsis:  Shift vector elements to the right.
 *
 * Purpose:   Returns the vector <{ b[0] a[0] a[1] a[2] }>:
 *            i.e. the values of <a> shifted one slot to the right,
 *            with <b[0]> loaded into the vacated first slot.
 */
static inline esl_neon_128f_t
esl_neon_rightshift_float(esl_neon_128f_t a, esl_neon_128f_t b)
{
  esl_neon_128f_t r;

  r.f32x4 = vrev64q_f32(b.f32x4);           /* { b1 b0 b3 b2 } */
  r.f32x4 = vextq_f32(r.f32x4, r.f32x4, 2); /* { b3 b2 b1 b0 } */
  r.f32x4 = vextq_f32(r.f32x4, a.f32x4, 3); /* { b0 a0 a1 a2 } */
  return r;
}
246
/* Function:  esl_neon_leftshift_float()
 * Synopsis:  Shift vector elements to the left.
 *
 * Purpose:   Returns the vector <{ a[1] a[2] a[3] b[0] }>:
 *            i.e. the values of <a> shifted one slot to the left,
 *            with <b[0]> loaded into the vacated last slot.
 */
static inline esl_neon_128f_t
esl_neon_leftshift_float(esl_neon_128f_t a, esl_neon_128f_t b)
{
  esl_neon_128f_t r;

  /* A single extract gives { a1 a2 a3 b0 } directly. */
  r.f32x4 = vextq_f32(a.f32x4, b.f32x4, 1);
  return r;
}
263
264
265 /*****************************************************************
266 * 5. Inlined functions: any_gt
267 *****************************************************************/
268
/* Function:  esl_neon_any_gt_s16()
 * Synopsis:  Returns TRUE if any a[z] > b[z].
 *
 * Purpose:   Return TRUE if any <a[z] > b[z]> for <z=0..7>
 *            in two <s16> vectors.
 *            (Fixed doc: an s16 vector has 8 lanes, not 16.)
 */
static inline int
esl_neon_any_gt_s16(esl_neon_128i_t a, esl_neon_128i_t b)
{
  esl_neon_128i_t mask;
  uint64_t        l0, l1;   /* unsigned 64-bit: matches the lane type exactly */

  mask.u16x8 = vcgtq_s16(a.s16x8, b.s16x8);  /* each lane: 0xFFFF where a>b, else 0 */
  l0 = vgetq_lane_u64(mask.u64x2, 0);
  l1 = vgetq_lane_u64(mask.u64x2, 1);
  return (l0 | l1) != 0;                     /* any set bit anywhere => some a[z] > b[z] */
}
288
/* Function:  esl_neon_any_gt_float()
 * Synopsis:  Returns TRUE if any a[z] > b[z]
 *
 * Purpose:   Returns TRUE if any a[z] > b[z] in two
 *            <ps> vectors of floats.
 *
 * Note:      Ported from esl_sse.c::esl_sse_any_gt_float().
 *
 *            Bugfix: the 64-bit mask lanes were previously stored in
 *            plain <int>, truncating the upper 32 bits of each lane.
 *            That dropped the comparison results for float lanes 1 and
 *            3, giving a false negative when only those lanes had
 *            a[z] > b[z]. Use uint64_t to keep all 128 mask bits.
 */
static inline int
esl_neon_any_gt_float(esl_neon_128f_t a, esl_neon_128f_t b)
{
  esl_neon_128i_t mask;
  uint64_t        l0, l1;

  mask.u32x4 = vcgtq_f32(a.f32x4, b.f32x4);  /* each lane: 0xFFFFFFFF where a>b, else 0 */
  l0 = vgetq_lane_u64(mask.u64x2, 0);        /* covers float lanes 0,1 */
  l1 = vgetq_lane_u64(mask.u64x2, 1);        /* covers float lanes 2,3 */
  return (l0 | l1) != 0;
}
310
311
312
313 /*****************************************************************
314 * 6. Inlined functions: select
315 *****************************************************************/
316
/* Function:  esl_neon_select_float()
 * Synopsis:  NEON equivalent of <vec_sel()>
 *
 * Purpose:   Vector select. Returns a vector <r[z] = a[z]> where <mask[z]>
 *            is all 0's; <r[z] = b[z]> where <mask[z]> is all 1's.
 *
 *            Useful for avoiding conditional branches. For example,
 *            to implement \ccode{if (a > 0) a += a;}:
 *
 *            \begin{cchunk}
 *              mask = _mm_cmpgt_ps(a, _mm_setzero_ps());
 *              twoa = _mm_add_ps(a, a);
 *              a    = esl_sse_select_ps(a, twoa, mask);
 *            \end{cchunk}
 *
 * Note:      Simplified: NEON's bit-select intrinsic vbslq_f32(m, x, y)
 *            computes exactly (m & x) | (~m & y) bitwise, replacing the
 *            previous hand-rolled and/and-not/or sequence.
 */
static inline esl_neon_128f_t
esl_neon_select_float(esl_neon_128f_t a, esl_neon_128f_t b, esl_neon_128f_t mask)
{
  esl_neon_128f_t r;

  r.f32x4 = vbslq_f32(vreinterpretq_u32_f32(mask.f32x4), b.f32x4, a.f32x4);
  return r;
}
348
349
350
351 #endif // eslNEON_INCLUDED
352 #endif // eslENABLE_NEON
353