/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_

#include <arm_neon.h>

#include "aom_dsp/simd/v64_intrinsics_arm.h"

typedef int64x2_t v128;

SIMD_INLINE uint32_t v128_low_u32(v128 a) {
  return v64_low_u32(vget_low_s64(a));
}

SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }

SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }

SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }

SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
  return vcombine_s64((int64x1_t)b, (int64x1_t)a);
}

SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
}

SIMD_INLINE v128 v128_load_aligned(const void *p) {
  return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
}

SIMD_INLINE v128 v128_load_unaligned(const void *p) {
  return v128_load_aligned(p);
}

SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
}

SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
}

SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
// The following functions require an immediate.
// Some compilers will check this during optimisation, others won't.
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
  return c ? vreinterpretq_s64_s8(
                 vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
           : b;
#else
  return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c),
                               v64_align(v128_high_v64(b), v128_low_v64(b), c))
               : v128_from_v64(
                     v64_align(v128_high_v64(a), v128_low_v64(a), c - 8),
                     v64_align(v128_low_v64(a), v128_high_v64(b), c - 8));
#endif
}
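
/* Usage note (editorial sketch, not upstream documentation): the byte count
   passed to v128_align(), and to the v128_*_n_* helpers further down, is
   expected to be a compile-time constant so it can map onto the immediate
   operand of the underlying instruction, e.g.

     v128 r = v128_align(hi, lo, 4);   // "hi"/"lo" are hypothetical inputs

   A runtime-variable count may fail to compile on toolchains that enforce
   the immediate-operand requirement; as the comment above notes, some
   compilers only check this when optimising. */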

SIMD_INLINE v128 v128_zero(void) { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }

SIMD_INLINE v128 v128_ones(void) {
  return vreinterpretq_s64_u8(vdupq_n_u8(-1));
}

SIMD_INLINE v128 v128_dup_8(uint8_t x) {
  return vreinterpretq_s64_u8(vdupq_n_u8(x));
}

SIMD_INLINE v128 v128_dup_16(uint16_t x) {
  return vreinterpretq_s64_u16(vdupq_n_u16(x));
}

SIMD_INLINE v128 v128_dup_32(uint32_t x) {
  return vreinterpretq_s64_u32(vdupq_n_u32(x));
}

SIMD_INLINE v128 v128_dup_64(uint64_t x) {
  return vreinterpretq_s64_u64(vdupq_n_u64(x));
}

SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
  int16x8_t t1 = vmulq_s16(
      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))),
      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b)))));
  int16x8_t t2 = vmulq_s16(
      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))),
      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b)))));
#if defined(__aarch64__)
  return vaddlvq_s16(t1) + vaddlvq_s16(t2);
#else
  int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
#endif
}

SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
  return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
         v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
}

SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
  int64x2_t t = vpaddlq_s32(
      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
}

SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
#if defined(__aarch64__)
  return vaddlvq_u8(vreinterpretq_u8_s64(x));
#else
  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
  return vget_lane_s32(
      vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
#endif
}

SIMD_INLINE v128 v128_padd_s16(v128 a) {
  return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
}

SIMD_INLINE v128 v128_padd_u8(v128 a) {
  return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a)));
}

typedef struct {
  sad64_internal hi, lo;
} sad128_internal;

SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
  sad128_internal s;
  s.hi = s.lo = vdupq_n_u16(0);
  return s;
}

/* Implementation dependent return value.  Result must be finalised with
   v128_sad_u8_sum().
   The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
  sad128_internal r;
  r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
  r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
  return r;
}

SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
#if defined(__aarch64__)
  return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
#else
  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
  return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t));
#endif
}
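
/* Usage sketch (illustrative only; "src", "ref", their uint8_t type and the
   16-byte stride are assumptions, not part of this header): accumulate at
   most 32 blocks, then finalise.

     sad128_internal acc = v128_sad_u8_init();
     for (int i = 0; i < n; i++)  // n must not exceed 32
       acc = v128_sad_u8(acc, v128_load_unaligned(src + 16 * i),
                         v128_load_unaligned(ref + 16 * i));
     uint32_t sad = v128_sad_u8_sum(acc);
*/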

typedef struct {
  ssd64_internal hi, lo;
} ssd128_internal;

SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) {
  ssd128_internal s;
  s.hi = s.lo = v64_ssd_u8_init();
  return s;
}

/* Implementation dependent return value.  Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
  ssd128_internal r;
  r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
  r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
  return r;
}

SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
  return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo));
}
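
/* The SSD accumulator follows the same init/accumulate/finalise pattern as
   v128_sad_u8() above (sketch; the pointer names are assumptions):

     ssd128_internal acc = v128_ssd_u8_init();
     acc = v128_ssd_u8(acc, v128_load_unaligned(src), v128_load_unaligned(ref));
     uint32_t ssd = v128_ssd_u8_sum(acc);
*/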

SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); }

SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); }

SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); }

SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); }

SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) {
  return vreinterpretq_s64_s8(
      vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
  return vreinterpretq_s64_u32(
      vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
}

SIMD_INLINE v128 v128_add_64(v128 x, v128 y) {
  return vreinterpretq_s64_u64(
      vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y)));
}

SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
}

SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) {
  return vreinterpretq_s64_s8(
      vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
  return vreinterpretq_s64_s32(
      vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); }

SIMD_INLINE v128 v128_abs_s16(v128 x) {
  return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
}

SIMD_INLINE v128 v128_abs_s8(v128 x) {
  return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x)));
}

SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
  return vreinterpretq_s64_s32(
      vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
}

SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
  return vreinterpretq_s64_s16(
      vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
}

SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
#if defined(__aarch64__)
  return vreinterpretq_s64_s16(vuzp2q_s16(
      vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
                                      vreinterpret_s16_s64(vget_low_s64(b)))),
      vreinterpretq_s16_s32(
          vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)))));
#else
  return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
                       v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
#endif
}

SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
  return vreinterpretq_s64_s32(
      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
}

SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
#if defined(__aarch64__)
  int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
                           vreinterpret_s16_s64(vget_low_s64(b)));
  int32x4_t t2 =
      vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b));
  return vreinterpretq_s64_s32(vpaddq_s32(t1, t2));
#else
  return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
                       v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
#endif
}

SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
#if defined(__aarch64__)
  int16x8_t t1 = vmulq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))),
      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b))));
  int16x8_t t2 = vmulq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))),
      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b))));
  return vreinterpretq_s64_s16(
      vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2)));
#else
  return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
                       v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
#endif
}

SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
}

SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
}

SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
  return vreinterpretq_s64_s8(
      vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
  a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0)));
#if defined(__aarch64__)
  uint8x16_t m =
      vandq_u8(vreinterpretq_u8_s64(a),
               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
  return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8);
#else
  uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
      vandq_u8(vreinterpretq_u8_s64(a),
               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
  return v64_low_u32(
      v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
#endif
}

SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
  c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0)));
  return v128_or(v128_and(b, c), v128_andn(a, c));
}

SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
  return vreinterpretq_s64_s8(
      vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) {
  return vreinterpretq_s64_s32(
      vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) {
  return vreinterpretq_s64_s32(
      vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u8(
      vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
  uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u8(
      vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
  uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1]));
}

SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u16(
      vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
  int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
  return vreinterpretq_s64_s16(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u16(
      vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
  int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
  return vreinterpretq_s64_s16(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
  uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1]));
}

SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u32(
      vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
  int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
  return vreinterpretq_s64_s32(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u32(
      vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
  int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
  return vreinterpretq_s64_s32(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
  uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x));
  return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1]));
}

SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
  return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b));
}

SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
  return v128_from_v64(vget_high_s64((int64x2_t)a),
                       vget_high_s64((int64x2_t)b));
}

SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u8(
      vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
  uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u8(
      vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
  uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u16(
      vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
  uint16x8x2_t r =
      vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
  return vreinterpretq_s64_u16(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u16(
      vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
  uint16x8x2_t r =
      vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
  return vreinterpretq_s64_u16(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u32(
      vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
  uint32x4x2_t r =
      vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
  return vreinterpretq_s64_u32(r.val[0]);
#endif
}

SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u32(
      vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
  uint32x4x2_t r =
      vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
  return vreinterpretq_s64_u32(r.val[1]);
#endif
}

SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a)));
}

SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a)));
}

SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))),
      vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
}

SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))),
      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b))));
}

SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
      vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b))));
}

SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))),
      vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b))));
}

SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
  return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a)));
}

SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
  return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a)));
}

SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
  return vreinterpretq_s64_u32(
      vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
  return vreinterpretq_s64_s32(
      vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
  return vreinterpretq_s64_u32(
      vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
  return vreinterpretq_s64_s32(
      vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
#if defined(__aarch64__)
  return vreinterpretq_s64_u8(
      vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern)));
#else
  uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
                      vget_high_u8(vreinterpretq_u8_s64(x)) } };
  return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8(
                          p, vreinterpret_u8_s64(vget_high_s64(pattern)))),
                      (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
                          p, vreinterpret_u8_s64(vget_low_s64(pattern)))));
#endif
}

SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) {
  return vreinterpretq_s64_u32(
      vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) {
  return vreinterpretq_s64_u32(
      vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) {
  return vreinterpretq_s64_u32(
      vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
  return (c > 7) ? v128_zero()
                 : vreinterpretq_s64_u8(
                       vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
}

SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
  return (c > 7) ? v128_zero()
                 : vreinterpretq_s64_u8(
                       vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
}

SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
  return (c > 7) ? v128_ones()
                 : vreinterpretq_s64_s8(
                       vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
}

SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
  return (c > 15) ? v128_zero()
                  : vreinterpretq_s64_u16(
                        vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c)));
}

SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
  return (c > 15) ? v128_zero()
                  : vreinterpretq_s64_u16(
                        vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c)));
}

SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
  return (c > 15) ? v128_ones()
                  : vreinterpretq_s64_s16(
                        vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c)));
}

SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
  return (c > 31) ? v128_zero()
                  : vreinterpretq_s64_u32(
                        vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c)));
}

SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
  return (c > 31) ? v128_zero()
                  : vreinterpretq_s64_u32(
                        vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c)));
}

SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
  return (c > 31) ? v128_ones()
                  : vreinterpretq_s64_s32(
                        vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
}

SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
  return (c > 63) ? v128_zero()
                  : vreinterpretq_s64_u64(
                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c)));
}

SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
  return (c > 63) ? v128_zero()
                  : vreinterpretq_s64_u64(
                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c)));
}

SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
  return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c));
}

#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)

SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
  return n < 8
             ? v128_from_64(
                   (uint64_t)vorr_u64(
                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
                                  n * 8),
                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
                                  (8 - n) * 8)),
                   (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
                                        n * 8))
             : (n == 8 ? v128_from_64(
                             (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
                       : v128_from_64((uint64_t)vshl_n_u64(
                                          vreinterpret_u64_s64(vget_low_s64(a)),
                                          (n - 8) * 8),
                                      0));
}

SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
  return n == 0
             ? a
             : (n < 8
                    ? v128_from_64(
                          (uint64_t)vshr_n_u64(
                              vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
                          (uint64_t)vorr_u64(
                              vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
                                         n * 8),
                              vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
                                         (8 - n) * 8)))
                    : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
                                                    vget_high_s64(a)))
                              : v128_from_64(0, (uint64_t)vshr_n_u64(
                                                    vreinterpret_u64_s64(
                                                        vget_high_s64(a)),
                                                    (n - 8) * 8))));
}

SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
}

SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
}

SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)) : a;
}

SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
  return c ? vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c))
           : a;
}

SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
  return c ? vshrq_n_s64(a, c) : a;
}

#else

SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
  if (n < 8)
    return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n),
                                v64_shr_n_byte(v128_low_v64(a), 8 - n)),
                         v64_shl_n_byte(v128_low_v64(a), n));
  else
    return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero());
}

SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
  if (n < 8)
    return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n),
                         v64_or(v64_shr_n_byte(v128_low_v64(a), n),
                                v64_shl_n_byte(v128_high_v64(a), 8 - n)));
  else
    return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8));
}

SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
  return v128_shl_8(a, c);
}

SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
  return v128_shr_u8(a, c);
}

SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
  return v128_shr_s8(a, c);
}

SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
  return v128_shl_16(a, c);
}

SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
  return v128_shr_u16(a, c);
}

SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
  return v128_shr_s16(a, c);
}

SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
  return v128_shl_32(a, c);
}

SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
  return v128_shr_u32(a, c);
}

SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
  return v128_shr_s32(a, c);
}

SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
  return v128_shl_64(a, c);
}

SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
  return v128_shr_u64(a, c);
}

SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
  return v128_shr_s64(a, c);
}

#endif

typedef uint32x4_t sad128_internal_u16;

SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
  return vdupq_n_u32(0);
}

/* Implementation dependent return value.  Result must be finalised with
 * v128_sad_u16_sum(). */
SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
                                             v128 b) {
  return vaddq_u32(
      s, vpaddlq_u16(vsubq_u16(
             vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)),
             vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)))));
}

SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
  uint64x2_t t = vpaddlq_u32(s);
  return (uint32_t)(uint64_t)vget_high_u64(t) +
         (uint32_t)(uint64_t)vget_low_u64(t);
}
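
/* Usage sketch for the 16-bit SAD accumulator (illustrative; "a" and "b" here
   stand for caller-provided vectors of eight unsigned 16-bit samples):

     sad128_internal_u16 acc = v128_sad_u16_init();
     acc = v128_sad_u16(acc, a, b);
     uint32_t sad = v128_sad_u16_sum(acc);
*/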

typedef v128 ssd128_internal_s16;
SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }

/* Implementation dependent return value.  Result must be finalised with
 * v128_ssd_s16_sum(). */
SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
                                             v128 b) {
  v128 d = v128_sub_16(a, b);
  d = v128_madd_s16(d, d);
  return v128_add_64(
      s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
}

SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
}
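
/* Usage sketch for the signed 16-bit SSD accumulator (illustrative; "a" and
   "b" are caller-provided vectors of eight signed 16-bit samples):

     ssd128_internal_s16 acc = v128_ssd_s16_init();
     acc = v128_ssd_s16(acc, a, b);
     uint64_t ssd = v128_ssd_s16_sum(acc);
*/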

#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_