/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_

#include <arm_neon.h>

#include "aom_dsp/simd/v64_intrinsics_arm.h"

typedef int64x2_t v128;

SIMD_INLINE uint32_t v128_low_u32(v128 a) {
  return v64_low_u32(vget_low_s64(a));
}

SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }

SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }

SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }

SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
  return vcombine_s64((int64x1_t)b, (int64x1_t)a);
}

SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
}

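/* A note on argument order in the constructors above (a sketch inferred from
   the vcombine_s64() calls): the first argument supplies the high half and
   the last the low half, so e.g.
     v128 v = v128_from_32(a, b, c, d);
     // 32-bit lanes from lane 0 upward are d, c, b, a; v128_low_u32(v) == d
*/
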
SIMD_INLINE v128 v128_load_aligned(const void *p) {
  return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
}

SIMD_INLINE v128 v128_load_unaligned(const void *p) {
  return v128_load_aligned(p);
}

SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
}

SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
}

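/* A note on the loads and stores above: NEON's vld1q_u8() and vst1q_u8()
   tolerate unaligned addresses, which is why the unaligned variants can
   simply reuse the aligned implementations. */
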
SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
// The following functions require an immediate.
// Some compilers will check this during optimisation, others won't.
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
  return c ? vreinterpretq_s64_s8(
                 vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
           : b;
#else
  return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c),
                               v64_align(v128_high_v64(b), v128_low_v64(b), c))
               : v128_from_v64(
                     v64_align(v128_high_v64(a), v128_low_v64(a), c - 8),
                     v64_align(v128_low_v64(a), v128_high_v64(b), c - 8));
#endif
}

SIMD_INLINE v128 v128_zero(void) { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }

SIMD_INLINE v128 v128_ones(void) {
  return vreinterpretq_s64_u8(vdupq_n_u8(-1));
}

SIMD_INLINE v128 v128_dup_8(uint8_t x) {
  return vreinterpretq_s64_u8(vdupq_n_u8(x));
}

SIMD_INLINE v128 v128_dup_16(uint16_t x) {
  return vreinterpretq_s64_u16(vdupq_n_u16(x));
}

SIMD_INLINE v128 v128_dup_32(uint32_t x) {
  return vreinterpretq_s64_u32(vdupq_n_u32(x));
}

SIMD_INLINE v128 v128_dup_64(uint64_t x) {
  return vreinterpretq_s64_u64(vdupq_n_u64(x));
}

SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
  int16x8_t t1 = vmulq_s16(
      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))),
      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b)))));
  int16x8_t t2 = vmulq_s16(
      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))),
      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b)))));
#if defined(__aarch64__)
  return vaddlvq_s16(t1) + vaddlvq_s16(t2);
#else
  int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
#endif
}

SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
  return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
         v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
}

SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
  int64x2_t t = vpaddlq_s32(
      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
}

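/* Usage sketch for the dot products above: each multiplies lanes pairwise
   and returns the scalar sum, e.g. with all lanes equal:
     v128 x = v128_dup_16(3), y = v128_dup_16(4);
     int64_t d = v128_dotp_s16(x, y);  // 8 lanes * 3 * 4 == 96
*/
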
SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
#if defined(__aarch64__)
  return vaddlvq_u8(vreinterpretq_u8_s64(x));
#else
  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
  return vget_lane_s32(
      vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
#endif
}

SIMD_INLINE v128 v128_padd_s16(v128 a) {
  return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
}

SIMD_INLINE v128 v128_padd_u8(v128 a) {
  return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a)));
}

typedef struct {
  sad64_internal hi, lo;
} sad128_internal;

SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
  sad128_internal s;
  s.hi = s.lo = vdupq_n_u16(0);
  return s;
}

/* Implementation dependent return value. Result must be finalised with
   v128_sad_u8_sum().
   The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
  sad128_internal r;
  r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
  r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
  return r;
}

SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
#if defined(__aarch64__)
  return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
#else
  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
  return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t));
#endif
}

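/* Usage sketch for the SAD accumulator (src, ref, stride and h are
   hypothetical; h must not exceed 32 per the note above):
     sad128_internal s = v128_sad_u8_init();
     for (int i = 0; i < h; i++)
       s = v128_sad_u8(s, v128_load_unaligned(src + i * stride),
                       v128_load_unaligned(ref + i * stride));
     uint32_t sad = v128_sad_u8_sum(s);
*/
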
typedef struct {
  ssd64_internal hi, lo;
} ssd128_internal;

SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) {
  ssd128_internal s;
  s.hi = s.lo = v64_ssd_u8_init();
  return s;
}

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
  ssd128_internal r;
  r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
  r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
  return r;
}

SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
  return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo));
}

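/* Usage sketch, mirroring the SAD accumulator above (src and ref are
   hypothetical byte pointers):
     ssd128_internal s = v128_ssd_u8_init();
     s = v128_ssd_u8(s, v128_load_unaligned(src), v128_load_unaligned(ref));
     uint32_t ssd = v128_ssd_u8_sum(s);
*/
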
SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); }

SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); }

SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); }

SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); }

SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) {
  return vreinterpretq_s64_s8(
      vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
  return vreinterpretq_s64_u32(
      vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
}

SIMD_INLINE v128 v128_add_64(v128 x, v128 y) {
  return vreinterpretq_s64_u64(
      vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y)));
}

SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
}

SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) {
  return vreinterpretq_s64_s8(
      vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
  return vreinterpretq_s64_s32(
      vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); }

SIMD_INLINE v128 v128_abs_s16(v128 x) {
  return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
}

SIMD_INLINE v128 v128_abs_s8(v128 x) {
  return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x)));
}

SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
  return vreinterpretq_s64_s32(
      vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
}

SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
  return vreinterpretq_s64_s16(
      vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
}

SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
#if defined(__aarch64__)
  return vreinterpretq_s64_s16(vuzp2q_s16(
      vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
                                      vreinterpret_s16_s64(vget_low_s64(b)))),
      vreinterpretq_s16_s32(
          vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)))));
#else
  return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
                       v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
#endif
}

SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
  return vreinterpretq_s64_s32(
      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
}

SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
#if defined(__aarch64__)
  int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
                           vreinterpret_s16_s64(vget_low_s64(b)));
  int32x4_t t2 =
      vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b));
  return vreinterpretq_s64_s32(vpaddq_s32(t1, t2));
#else
  return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
                       v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
#endif
}

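/* v128_madd_s16() multiplies 16-bit lanes pairwise and sums adjacent
   products into 32-bit lanes, analogous to x86's _mm_madd_epi16(); e.g.
   16-bit lanes {1, 2, 3, 4, ...} * {1, 1, 1, 1, ...} yield 32-bit lanes
   {3, 7, ...}. */
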
SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
#if defined(__aarch64__)
  int16x8_t t1 = vmulq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))),
      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b))));
  int16x8_t t2 = vmulq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))),
      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b))));
  return vreinterpretq_s64_s16(
      vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2)));
#else
  return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
                       v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
#endif
}

SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
}

SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
}

SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
  return vreinterpretq_s64_s8(
      vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
  a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0)));
#if defined(__aarch64__)
  uint8x16_t m =
      vandq_u8(vreinterpretq_u8_s64(a),
               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
  return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8);
#else
  uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
      vandq_u8(vreinterpretq_u8_s64(a),
               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
  return v64_low_u32(
      v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
#endif
}

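/* v128_movemask_8() gathers the most significant bit of each byte lane into
   bits 0..15 of the result, e.g.:
     // v128_movemask_8(v128_dup_8(0x80)) == 0xffff
     // v128_movemask_8(v128_zero()) == 0
*/
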
SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
  c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0)));
  return v128_or(v128_and(b, c), v128_andn(a, c));
}

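/* In v128_blend_8() the top bit of each byte of c selects between the
   corresponding bytes of a and b: lanes where c has the MSB set take b,
   the rest take a. */
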
SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
  return vreinterpretq_s64_s8(
      vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) {
  return vreinterpretq_s64_s32(
      vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) {
  return vreinterpretq_s64_s32(
      vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u8(
      vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
  uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u8(
      vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
  uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1]));
}

SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u16(
      vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
  int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
  return vreinterpretq_s64_s16(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u16(
      vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
  int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
  return vreinterpretq_s64_s16(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
  uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1]));
}

SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u32(
      vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
  int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
  return vreinterpretq_s64_s32(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u32(
      vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
  int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
  return vreinterpretq_s64_s32(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
  uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x));
  return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1]));
}

SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
  return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b));
}

SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
  return v128_from_v64(vget_high_s64((int64x2_t)a),
                       vget_high_s64((int64x2_t)b));
}

SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u8(
      vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
  uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u8(
      vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
  uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u16(
      vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
  uint16x8x2_t r =
      vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
  return vreinterpretq_s64_u16(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u16(
      vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
  uint16x8x2_t r =
      vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
  return vreinterpretq_s64_u16(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u32(
      vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
  uint32x4x2_t r =
      vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
  return vreinterpretq_s64_u32(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u32(
      vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
  uint32x4x2_t r =
      vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
  return vreinterpretq_s64_u32(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a)));
}

SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a)));
}

SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))),
      vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
}

SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))),
      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b))));
}

SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
      vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b))));
}

SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))),
      vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b))));
}

SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
  return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a)));
}

SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
  return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a)));
}

SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
  return vreinterpretq_s64_u32(
      vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
  return vreinterpretq_s64_s32(
      vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
  return vreinterpretq_s64_u32(
      vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
  return vreinterpretq_s64_s32(
      vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u8(
      vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern)));
#else
  uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
                      vget_high_u8(vreinterpretq_u8_s64(x)) } };
  return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8(
                          p, vreinterpret_u8_s64(vget_high_s64(pattern)))),
                      (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
                          p, vreinterpret_u8_s64(vget_low_s64(pattern)))));
#endif
}

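/* v128_shuffle_8() is a byte table lookup: byte i of the result is
   x[pattern[i]], so for instance:
     // v128_shuffle_8(x, v128_dup_8(0)) broadcasts byte lane 0 of x
*/
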
SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) {
  return vreinterpretq_s64_u32(
      vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) {
  return vreinterpretq_s64_u32(
      vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) {
  return vreinterpretq_s64_u32(
      vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
  return (c > 7) ? v128_zero()
                 : vreinterpretq_s64_u8(
                       vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
}

SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
  return (c > 7) ? v128_zero()
                 : vreinterpretq_s64_u8(
                       vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
}

SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
  return (c > 7) ? v128_ones()
                 : vreinterpretq_s64_s8(
                       vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
}

SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
  return (c > 15) ? v128_zero()
                  : vreinterpretq_s64_u16(
                        vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c)));
}

SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
  return (c > 15) ? v128_zero()
                  : vreinterpretq_s64_u16(
                        vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c)));
}

SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
  return (c > 15) ? v128_ones()
                  : vreinterpretq_s64_s16(
                        vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c)));
}

SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
  return (c > 31) ? v128_zero()
                  : vreinterpretq_s64_u32(
                        vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c)));
}

SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
  return (c > 31) ? v128_zero()
                  : vreinterpretq_s64_u32(
                        vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c)));
}

SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
  return (c > 31) ? v128_ones()
                  : vreinterpretq_s64_s32(
                        vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
}

SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
  return (c > 63) ? v128_zero()
                  : vreinterpretq_s64_u64(
                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c)));
}

SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
  return (c > 63) ? v128_zero()
                  : vreinterpretq_s64_u64(
                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c)));
}

SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
  return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c));
}

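/* The shifts above take a run-time shift amount. The v128_shl_n_*() and
   v128_shr_n_*() variants below expect a compile-time constant on the
   optimised path; e.g. v128_shl_n_16(a, 2) behaves like v128_shl_16(a, 2)
   (the fallback branch below defines them exactly that way). */
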
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)

SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
  return n < 8
             ? v128_from_64(
                   (uint64_t)vorr_u64(
                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
                                  n * 8),
                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
                                  (8 - n) * 8)),
                   (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
                                        n * 8))
             : (n == 8 ? v128_from_64(
                             (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
                       : v128_from_64((uint64_t)vshl_n_u64(
                                          vreinterpret_u64_s64(vget_low_s64(a)),
                                          (n - 8) * 8),
                                      0));
}

SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
  return n == 0
             ? a
             : (n < 8
                    ? v128_from_64(
                          (uint64_t)vshr_n_u64(
                              vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
                          (uint64_t)vorr_u64(
                              vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
                                         n * 8),
                              vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
                                         (8 - n) * 8)))
                    : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
                                                    vget_high_s64(a)))
                              : v128_from_64(0, (uint64_t)vshr_n_u64(
                                                    vreinterpret_u64_s64(
                                                        vget_high_s64(a)),
                                                    (n - 8) * 8))));
}

SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
}

SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
}

SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)) : a;
}

SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
  return c ? vshrq_n_s64(a, c) : a;
}

#else

SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
  if (n < 8)
    return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n),
                                v64_shr_n_byte(v128_low_v64(a), 8 - n)),
                         v64_shl_n_byte(v128_low_v64(a), n));
  else
    return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero());
}

SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
  if (n < 8)
    return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n),
                         v64_or(v64_shr_n_byte(v128_low_v64(a), n),
                                v64_shl_n_byte(v128_high_v64(a), 8 - n)));
  else
    return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8));
}

SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
  return v128_shl_8(a, c);
}

SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
  return v128_shr_u8(a, c);
}

SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
  return v128_shr_s8(a, c);
}

SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
  return v128_shl_16(a, c);
}

SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
  return v128_shr_u16(a, c);
}

SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
  return v128_shr_s16(a, c);
}

SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
  return v128_shl_32(a, c);
}

SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
  return v128_shr_u32(a, c);
}

SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
  return v128_shr_s32(a, c);
}

SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
  return v128_shl_64(a, c);
}

SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
  return v128_shr_u64(a, c);
}

SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
  return v128_shr_s64(a, c);
}

#endif

typedef uint32x4_t sad128_internal_u16;

SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
  return vdupq_n_u32(0);
}

/* Implementation dependent return value. Result must be finalised with
 * v128_sad_u16_sum(). */
SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
                                             v128 b) {
  return vaddq_u32(
      s, vpaddlq_u16(vsubq_u16(
             vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)),
             vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)))));
}

SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
  uint64x2_t t = vpaddlq_u32(s);
  return (uint32_t)(uint64_t)vget_high_u64(t) +
         (uint32_t)(uint64_t)vget_low_u64(t);
}

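/* Usage sketch, as for the 8-bit SAD accumulator earlier (a and b are
   assumed to hold 16-bit pixel lanes):
     sad128_internal_u16 s = v128_sad_u16_init();
     s = v128_sad_u16(s, a, b);
     uint32_t sad = v128_sad_u16_sum(s);
*/
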
typedef v128 ssd128_internal_s16;
SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_s16_sum(). */
SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
                                             v128 b) {
  v128 d = v128_sub_16(a, b);
  d = v128_madd_s16(d, d);
  return v128_add_64(
      s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
}

SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
}

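/* Usage sketch, mirroring the accumulators above (a and b are assumed to
   hold signed 16-bit lanes):
     ssd128_internal_s16 s = v128_ssd_s16_init();
     s = v128_ssd_s16(s, a, b);
     uint64_t ssd = v128_ssd_s16_sum(s);
*/
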
#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_