/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V64_INTRINSICS_H
#define _V64_INTRINSICS_H

#include <arm_neon.h>
#include "./v64_intrinsics_arm.h"
#include "aom_ports/arm.h"

#ifdef AOM_INCOMPATIBLE_GCC
#error Incompatible gcc
#endif

typedef int64x1_t v64;

SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  return vget_lane_u32(vreinterpret_u32_s64(a), 0);
}

SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  return vget_lane_u32(vreinterpret_u32_s64(a), 1);
}

SIMD_INLINE int32_t v64_low_s32(v64 a) {
  return vget_lane_s32(vreinterpret_s32_s64(a), 0);
}

SIMD_INLINE int32_t v64_high_s32(v64 a) {
  return vget_lane_s32(vreinterpret_s32_s64(a), 1);
}

SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
                     d);
}

SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  return vcreate_s64((uint64_t)x << 32 | y);
}

SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }

SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }

SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
}

SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
#if defined(__clang__)
  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
                0);
#elif defined(__CC_ARM)
  *(__packed uint32_t *)p = a;
#elif defined(__GNUC__)
  *((__attribute((packed)) uint32_t *)p) = a;
#else
  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
                0);
#endif
}

SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
}

SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return v64_load_aligned(p);
}

SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}

SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}

// The following function requires an immediate.
// Some compilers will check this if it's optimising, others won't.
SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
  return c ? vreinterpret_s64_s8(
                 vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
           : b;
#else
  return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
           : b;
#endif
}
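
/* Usage sketch (illustrative only, not part of this API; `buf` is assumed to
   be a readable uint8_t array of at least 16 bytes): pass a literal byte
   count so that compilers which check can emit the VEXT immediate form.  On a
   little-endian target this selects an 8-byte window from the pair (a:b),
   with b holding the lower bytes:

     v64 lo = v64_load_unaligned(buf);      // bytes 0..7
     v64 hi = v64_load_unaligned(buf + 8);  // bytes 8..15
     v64 w = v64_align(hi, lo, 3);          // bytes 3..10 of buf

   With c == 0 the second argument is returned unchanged. */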

SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }

SIMD_INLINE v64 v64_dup_8(uint8_t x) {
  return vreinterpret_s64_u8(vdup_n_u8(x));
}

SIMD_INLINE v64 v64_dup_16(uint16_t x) {
  return vreinterpret_s64_u16(vdup_n_u16(x));
}

SIMD_INLINE v64 v64_dup_32(uint32_t x) {
  return vreinterpret_s64_u32(vdup_n_u32(x));
}

SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
  int64x2_t r = vpaddlq_s32(vpaddlq_s16(
      vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
  return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
}

SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
  int64x2_t r =
      vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
  return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
}

SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
  return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
}

SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
}

typedef uint16x8_t sad64_internal;

SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }

/* Implementation dependent return value.  Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
}

SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
  return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
}
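
/* Usage sketch (illustrative only; v64_sad_u8_example() is a hypothetical
   helper, not part of this API): accumulate row SADs between two 8-byte-wide
   blocks, then finalise with v64_sad_u8_sum().  Keep `rows` at 32 or fewer,
   per the note above.

     static uint32_t v64_sad_u8_example(const uint8_t *a, int a_stride,
                                        const uint8_t *b, int b_stride,
                                        int rows) {
       sad64_internal s = v64_sad_u8_init();
       int i;
       for (i = 0; i < rows; i++)
         s = v64_sad_u8(s, v64_load_unaligned(a + i * a_stride),
                        v64_load_unaligned(b + i * b_stride));
       return v64_sad_u8_sum(s);
     }
*/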

typedef int64x1_t ssd64_internal;

SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
  return (ssd64_internal)(uint64_t)0;
}

/* Implementation dependent return value.  Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t)));
  return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
}

SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
  return (uint32_t)(uint64_t)s;
}
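
/* Usage sketch (illustrative only; v64_ssd_u8_example() is a hypothetical
   helper, not part of this API): the same init/accumulate/finalise pattern as
   the SAD helpers above, but summing squared differences.

     static uint32_t v64_ssd_u8_example(const uint8_t *a, int a_stride,
                                        const uint8_t *b, int b_stride,
                                        int rows) {
       ssd64_internal s = v64_ssd_u8_init();
       int i;
       for (i = 0; i < rows; i++)
         s = v64_ssd_u8(s, v64_load_unaligned(a + i * a_stride),
                        v64_load_unaligned(b + i * b_stride));
       return v64_ssd_u8_sum(s);
     }
*/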

SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }

SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); }

SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); }

SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); }

SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
  return vreinterpret_s64_u32(
      vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y)));
}

SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
  return vreinterpret_s64_s32(
      vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
}

SIMD_INLINE v64 v64_abs_s16(v64 x) {
  return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
}

SIMD_INLINE v64 v64_abs_s8(v64 x) {
  return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x)));
}

SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
}

SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
  return vreinterpret_s64_s32(
      vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
}

SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
  int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
  return vreinterpret_s64_s32(
      vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))),
                vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t)))));
}

SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(
      vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)),
                         vreinterpret_s8_s64(y)),
                vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7)))));
}

SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
}

SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[1]);
}

SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
  return vreinterpret_s64_s16(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
  return vreinterpret_s64_s16(r.val[1]);
}

SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  return vreinterpret_s64_s32(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  return vreinterpret_s64_s32(r.val[1]);
}

SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}

SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}

SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
  return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a))));
}

SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
  return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a))));
}

SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(vqmovn_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
}

SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
}

SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
}

SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[0]);
}

SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[1]);
}

SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpret_s64_u16(r.val[0]);
}

SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpret_s64_u16(r.val[1]);
}

SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
  return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x))));
}

SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
  return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x))));
}

SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
  return vreinterpret_s64_s32(
      vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x))));
}

SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
  return vreinterpret_s64_u32(
      vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x))));
}

SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
  return vreinterpret_s64_u8(
      vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern)));
}

SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c)));
}

SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c)));
}

SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c)));
}

SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c)));
}

SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(
      vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c)));
}

SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  return vreinterpret_s64_s16(
      vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c)));
}

SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c)));
}

SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(
      vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c)));
}

SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  return vreinterpret_s64_s32(
      vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
}

// The following functions require an immediate.
// Some compilers will check this during optimisation, others won't.
// A short usage sketch follows this block of definitions.
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)

SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
  return vshl_n_s64(a, c * 8);
}

SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
  return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a;
}

SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
  return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
}

SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
  return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
}

SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
  return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
}

#else

SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
  return v64_from_64(v64_u64(a) << c * 8);
}

SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
  return v64_from_64(v64_u64(a) >> c * 8);
}

SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); }

SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); }

SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); }

SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); }

SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
  return v64_shr_u16(a, c);
}

SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
  return v64_shr_s16(a, c);
}

SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); }

SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
  return v64_shr_u32(a, c);
}

SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
  return v64_shr_s32(a, c);
}

#endif
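
/* Usage sketch (illustrative only, not part of this API): the _n_ variants
   expect a literal shift count so the immediate NEON forms can be used; pass
   run-time counts to the non-_n_ variants instead.

     v64 x = v64_dup_16(0x1234);
     v64 a = v64_shr_n_u16(x, 4);  // literal count: immediate form
     unsigned int c = 4;           // stand-in for a run-time value
     v64 b = v64_shr_u16(x, c);    // run-time count: non-immediate form
*/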

#endif /* _V64_INTRINSICS_H */