/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V64_INTRINSICS_H
#define _V64_INTRINSICS_H

#include <arm_neon.h>
#include "./v64_intrinsics_arm.h"
#include "aom_ports/arm.h"

#ifdef AOM_INCOMPATIBLE_GCC
#error Incompatible gcc
#endif

typedef int64x1_t v64;

SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  return vget_lane_u32(vreinterpret_u32_s64(a), 0);
}

SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  return vget_lane_u32(vreinterpret_u32_s64(a), 1);
}

SIMD_INLINE int32_t v64_low_s32(v64 a) {
  return vget_lane_s32(vreinterpret_s32_s64(a), 0);
}

SIMD_INLINE int32_t v64_high_s32(v64 a) {
  return vget_lane_s32(vreinterpret_s32_s64(a), 1);
}

SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
                     d);
}

SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  return vcreate_s64((uint64_t)x << 32 | y);
}

SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }

SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }

SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
}

SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
#if defined(__clang__)
  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
                0);
#elif defined(__CC_ARM)
  *((__packed uint32_t *)p) = a;
#elif defined(__GNUC__)
  *((__attribute((packed)) uint32_t *)p) = a;
#else
  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
                0);
#endif
}

SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
}

SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return v64_load_aligned(p);
}

SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}

SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}

// The following function requires an immediate.
// Some compilers will check this if it's optimising, others won't.
SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
  return c ? vreinterpret_s64_s8(
                 vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
           : b;
#else
  return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
           : b;
#endif
}
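
// A sketch of the intended use: v64_align(a, b, c) treats b:a as a 16-byte
// sequence (b low, a high) and returns the 8 bytes starting c bytes into b,
// so v64_align(hi, lo, 3) yields lanes 3..7 of lo followed by lanes 0..2 of
// hi. c must be a compile-time constant in the range 0..7.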

SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }

SIMD_INLINE v64 v64_dup_8(uint8_t x) {
  return vreinterpret_s64_u8(vdup_n_u8(x));
}

SIMD_INLINE v64 v64_dup_16(uint16_t x) {
  return vreinterpret_s64_u16(vdup_n_u16(x));
}

SIMD_INLINE v64 v64_dup_32(uint32_t x) {
  return vreinterpret_s64_u32(vdup_n_u32(x));
}

SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
  int64x2_t r = vpaddlq_s32(vpaddlq_s16(
      vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
  return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
}

SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
  int64x2_t r =
      vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
  return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
}

SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
  return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
}

SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
}

typedef uint16x8_t sad64_internal;

SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
}

SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
  return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
}
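
/* Typical use of the SAD accumulator (a sketch; src, ref, the strides and the
   row count are placeholders, and rows must not exceed 32):
     sad64_internal s = v64_sad_u8_init();
     for (int i = 0; i < rows; i++)
       s = v64_sad_u8(s, v64_load_unaligned(src + i * src_stride),
                      v64_load_unaligned(ref + i * ref_stride));
     uint32_t sad = v64_sad_u8_sum(s);
*/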

typedef int64x1_t ssd64_internal;

SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
  return (ssd64_internal)(uint64_t)0;
}

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t)));
  return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
}

SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
  return (uint32_t)(uint64_t)s;
}
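
/* Usage mirrors the SAD accumulator sketch above: start from
   v64_ssd_u8_init(), fold each pair of 8-byte blocks through v64_ssd_u8(),
   then read the running total with v64_ssd_u8_sum(). */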

SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }

SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); }

SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); }

SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); }

SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
  return vreinterpret_s64_u32(
      vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y)));
}

SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
  return vreinterpret_s64_s32(
      vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
}

SIMD_INLINE v64 v64_abs_s16(v64 x) {
  return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
}

SIMD_INLINE v64 v64_abs_s8(v64 x) {
  return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x)));
}

SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
}

SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
  return vreinterpret_s64_s32(
      vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
}

SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
  int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
  return vreinterpret_s64_s32(
      vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))),
                vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t)))));
}

SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(
      vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)),
                         vreinterpret_s8_s64(y)),
                vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7)))));
}

SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
}

SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[1]);
}

SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
  return vreinterpret_s64_s16(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
  return vreinterpret_s64_s16(r.val[1]);
}

SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  return vreinterpret_s64_s32(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  return vreinterpret_s64_s32(r.val[1]);
}

SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}

SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}

SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
  return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a))));
}

SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
  return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a))));
}

SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(vqmovn_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
}

SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
}

SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
}

SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[0]);
}

SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[1]);
}

SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpret_s64_u16(r.val[0]);
}

SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpret_s64_u16(r.val[1]);
}

SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
  return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x))));
}

SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
  return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x))));
}

SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
  return vreinterpret_s64_s32(
      vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x))));
}

SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
  return vreinterpret_s64_u32(
      vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x))));
}

SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
  return vreinterpret_s64_u8(
      vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern)));
}

SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c)));
}

SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c)));
}

SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c)));
}

SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c)));
}

SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(
      vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c)));
}

SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  return vreinterpret_s64_s16(
      vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c)));
}

SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c)));
}

SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(
      vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c)));
}

SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  return vreinterpret_s64_s32(
      vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
}

// The following functions require an immediate.
// Some compilers will check this during optimisation, others won't.
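// For example (sketch): v64_shr_n_u16(a, 2) with a literal count can map to a
// single immediate-shift instruction, while a count held in a variable should
// go through v64_shr_u16(a, c) instead.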
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)

SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
  return vshl_n_s64(a, c * 8);
}

SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
  return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a;
}

SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
  return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
}

SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
  return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
}

SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
  return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
}

#else

SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
  return v64_from_64(v64_u64(a) << c * 8);
}

SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
  return v64_from_64(v64_u64(a) >> c * 8);
}

SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); }

SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); }

SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); }

SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); }

SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
  return v64_shr_u16(a, c);
}

SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
  return v64_shr_s16(a, c);
}

SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); }

SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
  return v64_shr_u32(a, c);
}

SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
  return v64_shr_s32(a, c);
}

#endif

#endif /* _V64_INTRINSICS_H */