#ifndef SIMDJSON_ARM64_SIMD_H
#define SIMDJSON_ARM64_SIMD_H

#include "simdjson/base.h"
#include "simdjson/internal/simdprune_tables.h"
#include "simdjson/arm64/bitmanipulation.h"
#include <type_traits>


namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
namespace simd {

#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
namespace {
// Start of private section with Visual Studio workaround


/**
 * make_uint8x16_t initializes a SIMD register (uint8x16_t).
 * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
 * is not recognized under Visual Studio! This is a workaround.
 * Using a std::initializer_list<uint8_t> as a parameter resulted in
 * inefficient code. With the current approach, if the parameters are
 * compile-time constants, GNU GCC compiles it to ldr, the same as
 * uint8x16_t x = {1,2,3...}.
 * You should not use this function except for compile-time constants:
 * it is not efficient.
 */
simdjson_really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
                                         uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
  // Doing a load like so ends up generating worse code.
  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
  //                     x9, x10,x11,x12,x13,x14,x15,x16};
  // return vld1q_u8(array);
  uint8x16_t x{};
  // incredibly, Visual Studio does not allow x[0] = x1
  x = vsetq_lane_u8(x1, x, 0);
  x = vsetq_lane_u8(x2, x, 1);
  x = vsetq_lane_u8(x3, x, 2);
  x = vsetq_lane_u8(x4, x, 3);
  x = vsetq_lane_u8(x5, x, 4);
  x = vsetq_lane_u8(x6, x, 5);
  x = vsetq_lane_u8(x7, x, 6);
  x = vsetq_lane_u8(x8, x, 7);
  x = vsetq_lane_u8(x9, x, 8);
  x = vsetq_lane_u8(x10, x, 9);
  x = vsetq_lane_u8(x11, x, 10);
  x = vsetq_lane_u8(x12, x, 11);
  x = vsetq_lane_u8(x13, x, 12);
  x = vsetq_lane_u8(x14, x, 13);
  x = vsetq_lane_u8(x15, x, 14);
  x = vsetq_lane_u8(x16, x, 15);
  return x;
}
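
// Usage example: with compile-time constant arguments, the call below is expected to
// compile to the same single constant load as the vector literal
// uint8x16_t{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
// does under GCC/Clang (where the literal syntax is accepted):
//   const uint8x16_t iota = make_uint8x16_t(1, 2, 3, 4, 5, 6, 7, 8,
//                                           9, 10, 11, 12, 13, 14, 15, 16);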

simdjson_really_inline uint8x8_t make_uint8x8_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8) {
  uint8x8_t x{};
  x = vset_lane_u8(x1, x, 0);
  x = vset_lane_u8(x2, x, 1);
  x = vset_lane_u8(x3, x, 2);
  x = vset_lane_u8(x4, x, 3);
  x = vset_lane_u8(x5, x, 4);
  x = vset_lane_u8(x6, x, 5);
  x = vset_lane_u8(x7, x, 6);
  x = vset_lane_u8(x8, x, 7);
  return x;
}

// We have to do the same work for make_int8x16_t
simdjson_really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_t x4,
                                       int8_t x5,  int8_t x6,  int8_t x7,  int8_t x8,
                                       int8_t x9,  int8_t x10, int8_t x11, int8_t x12,
                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
  // Doing a load like so ends up generating worse code.
  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
  //                     x9, x10,x11,x12,x13,x14,x15,x16};
  // return vld1q_s8(array);
  int8x16_t x{};
  // incredibly, Visual Studio does not allow x[0] = x1
  x = vsetq_lane_s8(x1, x, 0);
  x = vsetq_lane_s8(x2, x, 1);
  x = vsetq_lane_s8(x3, x, 2);
  x = vsetq_lane_s8(x4, x, 3);
  x = vsetq_lane_s8(x5, x, 4);
  x = vsetq_lane_s8(x6, x, 5);
  x = vsetq_lane_s8(x7, x, 6);
  x = vsetq_lane_s8(x8, x, 7);
  x = vsetq_lane_s8(x9, x, 8);
  x = vsetq_lane_s8(x10, x, 9);
  x = vsetq_lane_s8(x11, x, 10);
  x = vsetq_lane_s8(x12, x, 11);
  x = vsetq_lane_s8(x13, x, 12);
  x = vsetq_lane_s8(x14, x, 13);
  x = vsetq_lane_s8(x15, x, 14);
  x = vsetq_lane_s8(x16, x, 15);
  return x;
}

// End of private section with Visual Studio workaround
} // namespace
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO


  template<typename T>
  struct simd8;

  //
  // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
  //
  template<typename T, typename Mask=simd8<bool>>
  struct base_u8 {
    uint8x16_t value;
    static const int SIZE = sizeof(value);

    // Conversion from/to SIMD register
    simdjson_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
    simdjson_really_inline operator const uint8x16_t&() const { return this->value; }
    simdjson_really_inline operator uint8x16_t&() { return this->value; }

    // Bit operations
    simdjson_really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
    simdjson_really_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
    simdjson_really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
    simdjson_really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
    simdjson_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
    simdjson_really_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
    simdjson_really_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
    simdjson_really_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }

    simdjson_really_inline Mask operator==(const simd8<T> other) const { return vceqq_u8(*this, other); }

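    // Returns this chunk as it would have appeared N bytes earlier: the last N bytes of
    // prev_chunk followed by the first 16 - N bytes of *this. For example, with N == 1,
    // if prev_chunk holds input bytes 0..15 and *this holds input bytes 16..31, the
    // result holds input bytes 15..30.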
    template<int N=1>
    simdjson_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
      return vextq_u8(prev_chunk, *this, 16 - N);
    }
  };

  // SIMD byte mask type (returned by things like eq and gt)
  template<>
  struct simd8<bool>: base_u8<bool> {
    typedef uint16_t bitmask_t;
    typedef uint32_t bitmask2_t;

    static simdjson_really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }

    simdjson_really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
    // False constructor
    simdjson_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
    // Splat constructor
    simdjson_really_inline simd8(bool _value) : simd8(splat(_value)) {}

    // We return uint32_t instead of uint16_t because that seems to be more efficient for most
    // purposes (cutting it down to uint16_t costs performance in some compilers).
    simdjson_really_inline uint32_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
      const uint8x16_t bit_mask =  make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                                   0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
      const uint8x16_t bit_mask =  {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
      auto minput = *this & bit_mask;
      uint8x16_t tmp = vpaddq_u8(minput, minput);
      tmp = vpaddq_u8(tmp, tmp);
      tmp = vpaddq_u8(tmp, tmp);
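      // At this point tmp[0] holds the sum of lanes 0..7 of minput and tmp[1] the sum of
      // lanes 8..15. Since lane i of minput is either 0 or (1 << (i % 8)), those two bytes
      // are exactly the low and high halves of the 16-bit movemask (no carries are possible:
      // each byte sums to at most 0xFF), so reading the first uint16_t lane gives a mask
      // whose bit i is set precisely when lane i of *this was 0xFF.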
      return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
    }
    simdjson_really_inline bool any() const { return vmaxvq_u8(*this) != 0; }
  };

  // Unsigned bytes
  template<>
  struct simd8<uint8_t>: base_u8<uint8_t> {
    static simdjson_really_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
    static simdjson_really_inline uint8x16_t zero() { return vdupq_n_u8(0); }
    static simdjson_really_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }

    simdjson_really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
    // Zero constructor
    simdjson_really_inline simd8() : simd8(zero()) {}
    // Array constructor
    simdjson_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
    // Splat constructor
    simdjson_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
    // Member-by-member initialization
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_really_inline simd8(
      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) : simd8(make_uint8x16_t(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    )) {}
#else
    simdjson_really_inline simd8(
      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) : simd8(uint8x16_t{
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    }) {}
#endif

    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_really_inline static simd8<uint8_t> repeat_16(
      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) {
      return simd8<uint8_t>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10,v11,v12,v13,v14,v15
      );
    }

    // Store to array
    simdjson_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }

    // Saturated math
    simdjson_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
    simdjson_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }

    // Addition/subtraction are the same for signed and unsigned
    simdjson_really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
    simdjson_really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
    simdjson_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
    simdjson_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }

    // Order-specific operations
    simdjson_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
    simdjson_really_inline uint8_t min_val() const { return vminvq_u8(*this); }
    simdjson_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
    simdjson_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
    simdjson_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
    simdjson_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
    simdjson_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
    simdjson_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
    // Same as >, but only guarantees that false = 0 and true = nonzero (on ARM it still returns all 1's).
    simdjson_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
    // Same as <, but only guarantees that false = 0 and true = nonzero (on ARM it still returns all 1's).
    simdjson_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }

    // Bit-specific operations
    simdjson_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
    simdjson_really_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
    simdjson_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
    template<int N>
    simdjson_really_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
    template<int N>
    simdjson_really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }

    // Perform a lookup assuming each value is between 0 and 15 (undefined behavior for out of range values)
    template<typename L>
    simdjson_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return lookup_table.apply_lookup_16_to(*this);
    }
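
    // Usage sketch (the table values below are made up for illustration): each byte of
    // *this selects one of the 16 table entries, so masking input bytes to their low
    // nibble and looking them up classifies every byte in a couple of instructions:
    //   simd8<uint8_t> nibbles = input & 0xf;
    //   simd8<uint8_t> classes = nibbles.lookup_16<uint8_t>(
    //       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2);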


    // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
    // Passing a 0 value for mask would be equivalent to writing out every byte to output.
    // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
    // get written.
    // Design consideration: it seems like a function with the
    // signature simd8<L> compress(uint16_t mask) would be
    // sensible, but the AVX ISA makes this kind of approach difficult.
    template<typename L>
    simdjson_really_inline void compress(uint16_t mask, L * output) const {
      using internal::thintable_epi8;
      using internal::BitsSetTable256mul2;
      using internal::pshufb_combine_table;
      // this particular implementation was inspired by work done by @animetosho
      // we do it in two steps, first 8 bytes and then second 8 bytes
      uint8_t mask1 = uint8_t(mask); // least significant 8 bits
      uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
      // the next line just loads the 64-bit values thintable_epi8[mask1] and
      // thintable_epi8[mask2] into a 128-bit register, using only
      // two instructions on most compilers.
      uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
      uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
      // we increment by 0x08 the second half of the mask
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
      uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
      uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
      shufmask = vaddq_u8(shufmask, inc);
      // this is the "nearly pruned" version
      uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
      // we still need to put the two halves together.
      // we compute the popcount of the first half:
      int pop1 = BitsSetTable256mul2[mask1];
      // then we load the corresponding shuffle mask: it writes out the first pop1 bytes
      // from the first 8 bytes, then fills in with the bytes from the second 8 bytes,
      // plus some filler at the end.
      uint8x16_t compactmask = vld1q_u8(reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
      uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
      vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
    }
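
    // Worked example: with mask = 0b0000000000000101, bits 0 and 2 are set, so bytes 0 and 2
    // of *this are dropped. The first 14 bytes written to output are bytes 1, 3, 4, 5, ..., 15
    // of *this, in order; the last 2 of the 16 bytes written are filler that the caller should
    // ignore (only 16 - count_ones(mask) bytes are significant).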

    // Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a
    // bitset) to output1, then those corresponding to a 0 in the high half to output2.
    template<typename L>
    simdjson_really_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
      using internal::thintable_epi8;
      uint8_t mask1 = uint8_t(mask); // least significant 8 bits
      uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
      uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
      uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
      // we increment by 0x08 the second half of the mask
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
      uint8x8_t inc = make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
      uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
      compactmask2 = vadd_u8(compactmask2, inc);
      // store each result (with the second store possibly overlapping the first)
      vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
      vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
    }
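
    // Note: unlike compress() above, no pshufb_combine_table step is needed here because the
    // two 8-byte halves go to separate destinations. The caller (simd8x64<T>::compress below)
    // picks output2 so that it starts exactly where the kept bytes of the first half end,
    // relying on the overlapping 8-byte stores noted above.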

    template<typename L>
    simdjson_really_inline simd8<L> lookup_16(
        L replace0,  L replace1,  L replace2,  L replace3,
        L replace4,  L replace5,  L replace6,  L replace7,
        L replace8,  L replace9,  L replace10, L replace11,
        L replace12, L replace13, L replace14, L replace15) const {
      return lookup_16(simd8<L>::repeat_16(
        replace0,  replace1,  replace2,  replace3,
        replace4,  replace5,  replace6,  replace7,
        replace8,  replace9,  replace10, replace11,
        replace12, replace13, replace14, replace15
      ));
    }

    template<typename T>
    simdjson_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
      return vqtbl1q_u8(*this, simd8<uint8_t>(original));
    }
  };

  // Signed bytes
  template<>
  struct simd8<int8_t> {
    int8x16_t value;

    static simdjson_really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
    static simdjson_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
    static simdjson_really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }

    // Conversion from/to SIMD register
    simdjson_really_inline simd8(const int8x16_t _value) : value{_value} {}
    simdjson_really_inline operator const int8x16_t&() const { return this->value; }
    simdjson_really_inline operator int8x16_t&() { return this->value; }

    // Zero constructor
    simdjson_really_inline simd8() : simd8(zero()) {}
    // Splat constructor
    simdjson_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
    // Array constructor
    simdjson_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
    // Member-by-member initialization
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_really_inline simd8(
      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) : simd8(make_int8x16_t(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    )) {}
#else
    simdjson_really_inline simd8(
      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) : simd8(int8x16_t{
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    }) {}
#endif
    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_really_inline static simd8<int8_t> repeat_16(
      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) {
      return simd8<int8_t>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10,v11,v12,v13,v14,v15
      );
    }

    // Store to array
    simdjson_really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }

    // Explicit conversion to/from unsigned
    //
    // Under Visual Studio/ARM64, uint8x16_t and int8x16_t are apparently the same type.
    // In theory, we could detect this case with std::is_same and std::enable_if, but the
    // result is relatively ugly and hard to read.
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
#endif
    simdjson_really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }

    // Math
    simdjson_really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
    simdjson_really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
    simdjson_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
    simdjson_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }

    // Order-sensitive comparisons
    simdjson_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
    simdjson_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
    simdjson_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
    simdjson_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); }
    simdjson_really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }

    template<int N=1>
    simdjson_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
      return vextq_s8(prev_chunk, *this, 16 - N);
    }

    // Perform a lookup assuming no value is larger than 15
    template<typename L>
    simdjson_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return lookup_table.apply_lookup_16_to(*this);
    }
    template<typename L>
    simdjson_really_inline simd8<L> lookup_16(
        L replace0,  L replace1,  L replace2,  L replace3,
        L replace4,  L replace5,  L replace6,  L replace7,
        L replace8,  L replace9,  L replace10, L replace11,
        L replace12, L replace13, L replace14, L replace15) const {
      return lookup_16(simd8<L>::repeat_16(
        replace0,  replace1,  replace2,  replace3,
        replace4,  replace5,  replace6,  replace7,
        replace8,  replace9,  replace10, replace11,
        replace12, replace13, replace14, replace15
      ));
    }

    template<typename T>
    simdjson_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
      return vqtbl1q_s8(*this, simd8<uint8_t>(original));
    }
  };

  template<typename T>
  struct simd8x64 {
    static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
    static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
    const simd8<T> chunks[NUM_CHUNKS];

    simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
    simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
    simd8x64() = delete; // no default constructor allowed

    simdjson_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
    simdjson_really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
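
    // Usage sketch (the buffer name below is illustrative): load a 64-byte block and build a
    // 64-bit bitset marking the positions that hold a given character, e.g. a quote:
    //   simd8x64<uint8_t> block(reinterpret_cast<const uint8_t *>(buf));
    //   uint64_t quote_bits = block.eq('"');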

    simdjson_really_inline void store(T ptr[64]) const {
      this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
      this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
      this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
      this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
    }

    simdjson_really_inline simd8<T> reduce_or() const {
      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
    }


    simdjson_really_inline uint64_t compress(uint64_t mask, T * output) const {
      uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
      // compute the prefix sum of the popcounts of each byte
      uint64_t offsets = popcounts * 0x0101010101010101;
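      // Each byte of popcounts holds the number of kept (zero-mask) bytes in the corresponding
      // 8-byte group of the input, so after the multiplication above, byte k of offsets is the
      // running total of kept bytes in groups 0..k (a prefix sum; no byte can overflow since
      // the total is at most 64). Byte k of offsets is therefore the output index at which
      // group k + 1 begins, and offsets >> 56 is the total number of kept bytes returned below.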
      this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
      this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
      this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
      this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
      return offsets >> 56;
    }

    simdjson_really_inline uint64_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
      const uint8x16_t bit_mask = make_uint8x16_t(
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
      );
#else
      const uint8x16_t bit_mask = {
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
      };
#endif
      // Pairwise-add neighboring bytes, successively, so that each chunk's 16-bit mask is
      // packed into two bytes of a single 64-bit result.
      uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
      uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
      sum0 = vpaddq_u8(sum0, sum1);
      sum0 = vpaddq_u8(sum0, sum0);
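      // At this point the first 8 bytes of sum0 hold chunk 0's mask in bytes 0..1, chunk 1's
      // in bytes 2..3, chunk 2's in bytes 4..5 and chunk 3's in bytes 6..7, so the 64-bit lane
      // read below has chunk i in bits 16*i .. 16*i + 15, matching the byte order of the
      // original 64-byte block.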
      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
    }

    simdjson_really_inline uint64_t eq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return  simd8x64<bool>(
        this->chunks[0] == mask,
        this->chunks[1] == mask,
        this->chunks[2] == mask,
        this->chunks[3] == mask
      ).to_bitmask();
    }

    simdjson_really_inline uint64_t lteq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return  simd8x64<bool>(
        this->chunks[0] <= mask,
        this->chunks[1] <= mask,
        this->chunks[2] <= mask,
        this->chunks[3] <= mask
      ).to_bitmask();
    }
  }; // struct simd8x64<T>

} // namespace simd
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson

#endif // SIMDJSON_ARM64_SIMD_H