1 /* auto-generated on 2021-06-04 17:09:21 -0400. Do not edit! */
2 /* begin file src/simdjson.cpp */
3 #include "simdjson.h"
4 
5 SIMDJSON_PUSH_DISABLE_WARNINGS
6 SIMDJSON_DISABLE_UNDESIRED_WARNINGS
7 
8 /* begin file src/to_chars.cpp */
9 #include <cstring>
10 #include <cstdint>
11 #include <array>
12 namespace simdjson {
13 namespace internal {
14 /*!
15 implements the Grisu2 algorithm for binary to decimal floating-point
16 conversion.
17 Adapted from JSON for Modern C++
18 
19 This implementation is a slightly modified version of the reference
20 implementation which may be obtained from
21 http://florian.loitsch.com/publications (bench.tar.gz).
22 The code is distributed under the MIT license, Copyright (c) 2009 Florian
23 Loitsch. For a detailed description of the algorithm see: [1] Loitsch, "Printing
24 Floating-Point Numbers Quickly and Accurately with Integers", Proceedings of the
25 ACM SIGPLAN 2010 Conference on Programming Language Design and Implementation,
26 PLDI 2010 [2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and
27 Accurately", Proceedings of the ACM SIGPLAN 1996 Conference on Programming
28 Language Design and Implementation, PLDI 1996
29 */
30 namespace dtoa_impl {
31 
// Returns a Target whose object representation is a byte-for-byte copy of
// `source`. Both types must have the same size (checked at compile time).
template <typename Target, typename Source>
Target reinterpret_bits(const Source source) {
  static_assert(sizeof(Target) == sizeof(Source), "size mismatch");

  Target result;
  // The static_assert above guarantees sizeof(result) == sizeof(source).
  std::memcpy(&result, &source, sizeof(result));
  return result;
}
40 
// Unsigned "do-it-yourself" floating-point number: value = f * 2^e, with a
// 64-bit significand f and a binary exponent e. All helpers are static and
// return a new diyfp; none of them modifies its arguments.
struct diyfp // f * 2^e
{
  static constexpr int kPrecision = 64; // = q

  std::uint64_t f = 0;
  int e = 0;

  constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {}

  /*!
  @brief returns x - y
  @pre x.e == y.e and x.f >= y.f
  @note The preconditions are not checked; the result carries x's exponent.
  */
  static diyfp sub(const diyfp &x, const diyfp &y) noexcept {
    return {x.f - y.f, x.e};
  }

  /*!
  @brief returns x * y
  @note The result is rounded. (Only the upper q bits are returned.)

  Computes
       f = round((x.f * y.f) / 2^q)
       e = x.e + y.e + q
  */
  static diyfp mul(const diyfp &x, const diyfp &y) noexcept {
    static_assert(kPrecision == 64, "internal error");

    // The 64x64 -> 128 bit product is emulated with four 32x32 -> 64 bit
    // partial products. With u = x.f, v = y.f split into 32-bit halves:
    //
    //   u * v = p0 + 2^32 (p1 + p2) + 2^64 p3
    //         = p0_lo + 2^32 (p0_hi + p1_lo + p2_lo) + 2^64 (p1_hi + p2_hi + p3)
    //         = p0_lo + 2^32 Q + 2^64 H
    //
    // Q may exceed 2^32 - 1, so its upper half is carried into the high word:
    //
    //   u * v = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H)

    const std::uint64_t u_lo = x.f & 0xFFFFFFFFu;
    const std::uint64_t u_hi = x.f >> 32u;
    const std::uint64_t v_lo = y.f & 0xFFFFFFFFu;
    const std::uint64_t v_hi = y.f >> 32u;

    const std::uint64_t p0 = u_lo * v_lo;
    const std::uint64_t p1 = u_lo * v_hi;
    const std::uint64_t p2 = u_hi * v_lo;
    const std::uint64_t p3 = u_hi * v_hi;

    const std::uint64_t p0_hi = p0 >> 32u;
    const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu;
    const std::uint64_t p1_hi = p1 >> 32u;
    const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu;
    const std::uint64_t p2_hi = p2 >> 32u;

    std::uint64_t Q = p0_hi + p1_lo + p2_lo;

    // The low 64 bits of the product are not needed; only their top bit
    // matters for rounding. Adding 2^31 to Q propagates that bit into Q_hi.
    Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up

    const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u);

    return {h, x.e + y.e + 64};
  }

  /*!
  @brief normalize x such that the significand is >= 2^(q-1)
  @pre x.f != 0
  @note Zero has no normalized form. A zero significand is returned unchanged
        instead of shifting forever (the loop below could never set the top
        bit and would eventually overflow the exponent).
  */
  static diyfp normalize(diyfp x) noexcept {
    if (x.f == 0) {
      return x;
    }

    while ((x.f >> 63u) == 0) {
      x.f <<= 1u;
      x.e--;
    }

    return x;
  }

  /*!
  @brief normalize x such that the result has the exponent target_exponent
  @pre x.e >= target_exponent and the upper (x.e - target_exponent) bits of
       x.f must be zero.
  @note The shift amount is x.e - target_exponent; it is not range-checked, so
        callers must keep it within [0, 63].
  */
  static diyfp normalize_to(const diyfp &x,
                            const int target_exponent) noexcept {
    const int delta = x.e - target_exponent;

    return {x.f << delta, target_exponent};
  }
};
149 
150 struct boundaries {
151   diyfp w;
152   diyfp minus;
153   diyfp plus;
154 };
155 
156 /*!
157 Compute the (normalized) diyfp representing the input number 'value' and its
158 boundaries.
159 @pre value must be finite and positive
160 */
compute_boundaries(FloatType value)161 template <typename FloatType> boundaries compute_boundaries(FloatType value) {
162 
163   // Convert the IEEE representation into a diyfp.
164   //
165   // If v is denormal:
166   //      value = 0.F * 2^(1 - bias) = (          F) * 2^(1 - bias - (p-1))
167   // If v is normalized:
168   //      value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1))
169 
170   static_assert(std::numeric_limits<FloatType>::is_iec559,
171                 "internal error: dtoa_short requires an IEEE-754 "
172                 "floating-point implementation");
173 
174   constexpr int kPrecision =
175       std::numeric_limits<FloatType>::digits; // = p (includes the hidden bit)
176   constexpr int kBias =
177       std::numeric_limits<FloatType>::max_exponent - 1 + (kPrecision - 1);
178   constexpr int kMinExp = 1 - kBias;
179   constexpr std::uint64_t kHiddenBit = std::uint64_t{1}
180                                        << (kPrecision - 1); // = 2^(p-1)
181 
182   using bits_type = typename std::conditional<kPrecision == 24, std::uint32_t,
183                                               std::uint64_t>::type;
184 
185   const std::uint64_t bits = reinterpret_bits<bits_type>(value);
186   const std::uint64_t E = bits >> (kPrecision - 1);
187   const std::uint64_t F = bits & (kHiddenBit - 1);
188 
189   const bool is_denormal = E == 0;
190   const diyfp v = is_denormal
191                       ? diyfp(F, kMinExp)
192                       : diyfp(F + kHiddenBit, static_cast<int>(E) - kBias);
193 
194   // Compute the boundaries m- and m+ of the floating-point value
195   // v = f * 2^e.
196   //
197   // Determine v- and v+, the floating-point predecessor and successor if v,
198   // respectively.
199   //
200   //      v- = v - 2^e        if f != 2^(p-1) or e == e_min                (A)
201   //         = v - 2^(e-1)    if f == 2^(p-1) and e > e_min                (B)
202   //
203   //      v+ = v + 2^e
204   //
205   // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_
206   // between m- and m+ round to v, regardless of how the input rounding
207   // algorithm breaks ties.
208   //
209   //      ---+-------------+-------------+-------------+-------------+---  (A)
210   //         v-            m-            v             m+            v+
211   //
212   //      -----------------+------+------+-------------+-------------+---  (B)
213   //                       v-     m-     v             m+            v+
214 
215   const bool lower_boundary_is_closer = F == 0 && E > 1;
216   const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1);
217   const diyfp m_minus = lower_boundary_is_closer
218                             ? diyfp(4 * v.f - 1, v.e - 2)  // (B)
219                             : diyfp(2 * v.f - 1, v.e - 1); // (A)
220 
221   // Determine the normalized w+ = m+.
222   const diyfp w_plus = diyfp::normalize(m_plus);
223 
224   // Determine w- = m- such that e_(w-) = e_(w+).
225   const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e);
226 
227   return {diyfp::normalize(v), w_minus, w_plus};
228 }
229 
230 // Given normalized diyfp w, Grisu needs to find a (normalized) cached
231 // power-of-ten c, such that the exponent of the product c * w = f * 2^e lies
232 // within a certain range [alpha, gamma] (Definition 3.2 from [1])
233 //
234 //      alpha <= e = e_c + e_w + q <= gamma
235 //
236 // or
237 //
238 //      f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q
239 //                          <= f_c * f_w * 2^gamma
240 //
241 // Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies
242 //
243 //      2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma
244 //
245 // or
246 //
247 //      2^(q - 2 + alpha) <= c * w < 2^(q + gamma)
248 //
249 // The choice of (alpha,gamma) determines the size of the table and the form of
250 // the digit generation procedure. Using (alpha,gamma)=(-60,-32) works out well
251 // in practice:
252 //
253 // The idea is to cut the number c * w = f * 2^e into two parts, which can be
254 // processed independently: An integral part p1, and a fractional part p2:
255 //
256 //      f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e
257 //              = (f div 2^-e) + (f mod 2^-e) * 2^e
258 //              = p1 + p2 * 2^e
259 //
260 // The conversion of p1 into decimal form requires a series of divisions and
261 // modulos by (a power of) 10. These operations are faster for 32-bit than for
262 // 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be
263 // achieved by choosing
264 //
265 //      -e >= 32   or   e <= -32 := gamma
266 //
267 // In order to convert the fractional part
268 //
269 //      p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ...
270 //
271 // into decimal form, the fraction is repeatedly multiplied by 10 and the digits
272 // d[-i] are extracted in order:
273 //
274 //      (10 * p2) div 2^-e = d[-1]
275 //      (10 * p2) mod 2^-e = d[-2] / 10^1 + ...
276 //
277 // The multiplication by 10 must not overflow. It is sufficient to choose
278 //
279 //      10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64.
280 //
281 // Since p2 = f mod 2^-e < 2^-e,
282 //
283 //      -e <= 60   or   e >= -60 := alpha
284 
// Inclusive bounds [alpha, gamma] for the binary exponent of the scaled
// product c * w.
constexpr int kAlpha{-60};
constexpr int kGamma{-32};
287 
// One precomputed power of ten in binary form: c = f * 2^e ~= 10^k.
// Plain aggregate; the member order (f, e, k) is what brace-initializers use.
struct cached_power // c = f * 2^e ~= 10^k
{
  std::uint64_t f; // 64-bit significand
  int e;           // binary exponent
  int k;           // decimal exponent that c approximates
};
294 
295 /*!
296 For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached
297 power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c
298 satisfies (Definition 3.2 from [1])
299      alpha <= e_c + e + q <= gamma.
300 */
get_cached_power_for_binary_exponent(int e)301 inline cached_power get_cached_power_for_binary_exponent(int e) {
302   // Now
303   //
304   //      alpha <= e_c + e + q <= gamma                                    (1)
305   //      ==> f_c * 2^alpha <= c * 2^e * 2^q
306   //
307   // and since the c's are normalized, 2^(q-1) <= f_c,
308   //
309   //      ==> 2^(q - 1 + alpha) <= c * 2^(e + q)
310   //      ==> 2^(alpha - e - 1) <= c
311   //
312   // If c were an exact power of ten, i.e. c = 10^k, one may determine k as
313   //
314   //      k = ceil( log_10( 2^(alpha - e - 1) ) )
315   //        = ceil( (alpha - e - 1) * log_10(2) )
316   //
317   // From the paper:
318   // "In theory the result of the procedure could be wrong since c is rounded,
319   //  and the computation itself is approximated [...]. In practice, however,
320   //  this simple function is sufficient."
321   //
322   // For IEEE double precision floating-point numbers converted into
323   // normalized diyfp's w = f * 2^e, with q = 64,
324   //
325   //      e >= -1022      (min IEEE exponent)
326   //           -52        (p - 1)
327   //           -52        (p - 1, possibly normalize denormal IEEE numbers)
328   //           -11        (normalize the diyfp)
329   //         = -1137
330   //
331   // and
332   //
333   //      e <= +1023      (max IEEE exponent)
334   //           -52        (p - 1)
335   //           -11        (normalize the diyfp)
336   //         = 960
337   //
338   // This binary exponent range [-1137,960] results in a decimal exponent
339   // range [-307,324]. One does not need to store a cached power for each
340   // k in this range. For each such k it suffices to find a cached power
341   // such that the exponent of the product lies in [alpha,gamma].
342   // This implies that the difference of the decimal exponents of adjacent
343   // table entries must be less than or equal to
344   //
345   //      floor( (gamma - alpha) * log_10(2) ) = 8.
346   //
347   // (A smaller distance gamma-alpha would require a larger table.)
348 
349   // NB:
350   // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34.
351 
352   constexpr int kCachedPowersMinDecExp = -300;
353   constexpr int kCachedPowersDecStep = 8;
354 
355   static constexpr std::array<cached_power, 79> kCachedPowers = {{
356       {0xAB70FE17C79AC6CA, -1060, -300}, {0xFF77B1FCBEBCDC4F, -1034, -292},
357       {0xBE5691EF416BD60C, -1007, -284}, {0x8DD01FAD907FFC3C, -980, -276},
358       {0xD3515C2831559A83, -954, -268},  {0x9D71AC8FADA6C9B5, -927, -260},
359       {0xEA9C227723EE8BCB, -901, -252},  {0xAECC49914078536D, -874, -244},
360       {0x823C12795DB6CE57, -847, -236},  {0xC21094364DFB5637, -821, -228},
361       {0x9096EA6F3848984F, -794, -220},  {0xD77485CB25823AC7, -768, -212},
362       {0xA086CFCD97BF97F4, -741, -204},  {0xEF340A98172AACE5, -715, -196},
363       {0xB23867FB2A35B28E, -688, -188},  {0x84C8D4DFD2C63F3B, -661, -180},
364       {0xC5DD44271AD3CDBA, -635, -172},  {0x936B9FCEBB25C996, -608, -164},
365       {0xDBAC6C247D62A584, -582, -156},  {0xA3AB66580D5FDAF6, -555, -148},
366       {0xF3E2F893DEC3F126, -529, -140},  {0xB5B5ADA8AAFF80B8, -502, -132},
367       {0x87625F056C7C4A8B, -475, -124},  {0xC9BCFF6034C13053, -449, -116},
368       {0x964E858C91BA2655, -422, -108},  {0xDFF9772470297EBD, -396, -100},
369       {0xA6DFBD9FB8E5B88F, -369, -92},   {0xF8A95FCF88747D94, -343, -84},
370       {0xB94470938FA89BCF, -316, -76},   {0x8A08F0F8BF0F156B, -289, -68},
371       {0xCDB02555653131B6, -263, -60},   {0x993FE2C6D07B7FAC, -236, -52},
372       {0xE45C10C42A2B3B06, -210, -44},   {0xAA242499697392D3, -183, -36},
373       {0xFD87B5F28300CA0E, -157, -28},   {0xBCE5086492111AEB, -130, -20},
374       {0x8CBCCC096F5088CC, -103, -12},   {0xD1B71758E219652C, -77, -4},
375       {0x9C40000000000000, -50, 4},      {0xE8D4A51000000000, -24, 12},
376       {0xAD78EBC5AC620000, 3, 20},       {0x813F3978F8940984, 30, 28},
377       {0xC097CE7BC90715B3, 56, 36},      {0x8F7E32CE7BEA5C70, 83, 44},
378       {0xD5D238A4ABE98068, 109, 52},     {0x9F4F2726179A2245, 136, 60},
379       {0xED63A231D4C4FB27, 162, 68},     {0xB0DE65388CC8ADA8, 189, 76},
380       {0x83C7088E1AAB65DB, 216, 84},     {0xC45D1DF942711D9A, 242, 92},
381       {0x924D692CA61BE758, 269, 100},    {0xDA01EE641A708DEA, 295, 108},
382       {0xA26DA3999AEF774A, 322, 116},    {0xF209787BB47D6B85, 348, 124},
383       {0xB454E4A179DD1877, 375, 132},    {0x865B86925B9BC5C2, 402, 140},
384       {0xC83553C5C8965D3D, 428, 148},    {0x952AB45CFA97A0B3, 455, 156},
385       {0xDE469FBD99A05FE3, 481, 164},    {0xA59BC234DB398C25, 508, 172},
386       {0xF6C69A72A3989F5C, 534, 180},    {0xB7DCBF5354E9BECE, 561, 188},
387       {0x88FCF317F22241E2, 588, 196},    {0xCC20CE9BD35C78A5, 614, 204},
388       {0x98165AF37B2153DF, 641, 212},    {0xE2A0B5DC971F303A, 667, 220},
389       {0xA8D9D1535CE3B396, 694, 228},    {0xFB9B7CD9A4A7443C, 720, 236},
390       {0xBB764C4CA7A44410, 747, 244},    {0x8BAB8EEFB6409C1A, 774, 252},
391       {0xD01FEF10A657842C, 800, 260},    {0x9B10A4E5E9913129, 827, 268},
392       {0xE7109BFBA19C0C9D, 853, 276},    {0xAC2820D9623BF429, 880, 284},
393       {0x80444B5E7AA7CF85, 907, 292},    {0xBF21E44003ACDD2D, 933, 300},
394       {0x8E679C2F5E44FF8F, 960, 308},    {0xD433179D9C8CB841, 986, 316},
395       {0x9E19DB92B4E31BA9, 1013, 324},
396   }};
397 
398   // This computation gives exactly the same results for k as
399   //      k = ceil((kAlpha - e - 1) * 0.30102999566398114)
400   // for |e| <= 1500, but doesn't require floating-point operations.
401   // NB: log_10(2) ~= 78913 / 2^18
402   const int f = kAlpha - e - 1;
403   const int k = (f * 78913) / (1 << 18) + static_cast<int>(f > 0);
404 
405   const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) /
406                     kCachedPowersDecStep;
407 
408   const cached_power cached = kCachedPowers[static_cast<std::size_t>(index)];
409 
410   return cached;
411 }
412 
413 /*!
414 For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k.
415 For n == 0, returns 1 and sets pow10 := 1.
416 */
inline int find_largest_pow10(const std::uint32_t n, std::uint32_t &pow10) {
  // kPowersOfTen[i] == 10^i; 10^9 is the largest power of ten that fits in
  // 32 bits.
  static constexpr std::uint32_t kPowersOfTen[10] = {
      1u,      10u,      100u,      1000u,      10000u,
      100000u, 1000000u, 10000000u, 100000000u, 1000000000u};

  // Start from the widest candidate (10 digits) and narrow down while n is
  // too small for it. The scan stops at one digit, which also covers n == 0.
  int digits = 10;
  while (digits > 1 && n < kPowersOfTen[digits - 1]) {
    --digits;
  }

  pow10 = kPowersOfTen[digits - 1];
  return digits;
}
453 
// Nudges the last generated digit downwards while doing so moves the decimal
// value V = buf * 10^k closer to w without leaving [M-, M+].
//
//   buf, len  digit buffer and number of digits in it; only buf[len - 1] is
//             modified
//   dist      M+ - w
//   delta     M+ - M-
//   rest      M+ - V for the current buffer contents
//   ten_k     one unit in the last decimal place, in the same scale
inline void grisu2_round(char *buf, int len, std::uint64_t dist,
                         std::uint64_t delta, std::uint64_t rest,
                         std::uint64_t ten_k) {

  //               <--------------------------- delta ---->
  //                                  <---- dist --------->
  // --------------[------------------+-------------------]--------------
  //               M-                 w                   M+
  //
  //                                  ten_k
  //                                <------>
  //                                       <---- rest ---->
  // --------------[------------------+----+--------------]--------------
  //                                  w    V
  //                                       = buf * 10^k

  // The checks run in this order so that none of the unsigned subtractions
  // below can wrap around.
  for (;;) {
    // V is already at or below w: stepping down would only move away.
    if (rest >= dist) {
      break;
    }
    // Another step down would fall below M-.
    if (delta - rest < ten_k) {
      break;
    }
    // The step would cross w and not end up strictly closer to it.
    const std::uint64_t stepped = rest + ten_k;
    if (stepped >= dist && dist - rest <= stepped - dist) {
      break;
    }

    buf[len - 1]--;
    rest = stepped;
  }
}
483 
484 /*!
485 Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+.
486 M- and M+ must be normalized and share the same exponent -60 <= e <= -32.
487 */
grisu2_digit_gen(char * buffer,int & length,int & decimal_exponent,diyfp M_minus,diyfp w,diyfp M_plus)488 inline void grisu2_digit_gen(char *buffer, int &length, int &decimal_exponent,
489                              diyfp M_minus, diyfp w, diyfp M_plus) {
490   static_assert(kAlpha >= -60, "internal error");
491   static_assert(kGamma <= -32, "internal error");
492 
493   // Generates the digits (and the exponent) of a decimal floating-point
494   // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's
495   // w, M- and M+ share the same exponent e, which satisfies alpha <= e <=
496   // gamma.
497   //
498   //               <--------------------------- delta ---->
499   //                                  <---- dist --------->
500   // --------------[------------------+-------------------]--------------
501   //               M-                 w                   M+
502   //
503   // Grisu2 generates the digits of M+ from left to right and stops as soon as
504   // V is in [M-,M+].
505 
506   std::uint64_t delta =
507       diyfp::sub(M_plus, M_minus)
508           .f; // (significand of (M+ - M-), implicit exponent is e)
509   std::uint64_t dist =
510       diyfp::sub(M_plus, w)
511           .f; // (significand of (M+ - w ), implicit exponent is e)
512 
513   // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0):
514   //
515   //      M+ = f * 2^e
516   //         = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e
517   //         = ((p1        ) * 2^-e + (p2        )) * 2^e
518   //         = p1 + p2 * 2^e
519 
520   const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e);
521 
522   auto p1 = static_cast<std::uint32_t>(
523       M_plus.f >>
524       -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.)
525   std::uint64_t p2 = M_plus.f & (one.f - 1); // p2 = f mod 2^-e
526 
527   // 1)
528   //
529   // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0]
530 
531   std::uint32_t pow10;
532   const int k = find_largest_pow10(p1, pow10);
533 
534   //      10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1)
535   //
536   //      p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1))
537   //         = (d[k-1]         ) * 10^(k-1) + (p1 mod 10^(k-1))
538   //
539   //      M+ = p1                                             + p2 * 2^e
540   //         = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1))          + p2 * 2^e
541   //         = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e
542   //         = d[k-1] * 10^(k-1) + (                         rest) * 2^e
543   //
544   // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0)
545   //
546   //      p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0]
547   //
548   // but stop as soon as
549   //
550   //      rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e
551 
552   int n = k;
553   while (n > 0) {
554     // Invariants:
555     //      M+ = buffer * 10^n + (p1 + p2 * 2^e)    (buffer = 0 for n = k)
556     //      pow10 = 10^(n-1) <= p1 < 10^n
557     //
558     const std::uint32_t d = p1 / pow10; // d = p1 div 10^(n-1)
559     const std::uint32_t r = p1 % pow10; // r = p1 mod 10^(n-1)
560     //
561     //      M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e
562     //         = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e)
563     //
564     buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
565     //
566     //      M+ = buffer * 10^(n-1) + (r + p2 * 2^e)
567     //
568     p1 = r;
569     n--;
570     //
571     //      M+ = buffer * 10^n + (p1 + p2 * 2^e)
572     //      pow10 = 10^n
573     //
574 
575     // Now check if enough digits have been generated.
576     // Compute
577     //
578     //      p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e
579     //
580     // Note:
581     // Since rest and delta share the same exponent e, it suffices to
582     // compare the significands.
583     const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2;
584     if (rest <= delta) {
585       // V = buffer * 10^n, with M- <= V <= M+.
586 
587       decimal_exponent += n;
588 
589       // We may now just stop. But instead look if the buffer could be
590       // decremented to bring V closer to w.
591       //
592       // pow10 = 10^n is now 1 ulp in the decimal representation V.
593       // The rounding procedure works with diyfp's with an implicit
594       // exponent of e.
595       //
596       //      10^n = (10^n * 2^-e) * 2^e = ulp * 2^e
597       //
598       const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e;
599       grisu2_round(buffer, length, dist, delta, rest, ten_n);
600 
601       return;
602     }
603 
604     pow10 /= 10;
605     //
606     //      pow10 = 10^(n-1) <= p1 < 10^n
607     // Invariants restored.
608   }
609 
610   // 2)
611   //
612   // The digits of the integral part have been generated:
613   //
614   //      M+ = d[k-1]...d[1]d[0] + p2 * 2^e
615   //         = buffer            + p2 * 2^e
616   //
617   // Now generate the digits of the fractional part p2 * 2^e.
618   //
619   // Note:
620   // No decimal point is generated: the exponent is adjusted instead.
621   //
622   // p2 actually represents the fraction
623   //
624   //      p2 * 2^e
625   //          = p2 / 2^-e
626   //          = d[-1] / 10^1 + d[-2] / 10^2 + ...
627   //
628   // Now generate the digits d[-m] of p1 from left to right (m = 1,2,...)
629   //
630   //      p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m
631   //                      + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...)
632   //
633   // using
634   //
635   //      10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e)
636   //                = (                   d) * 2^-e + (                   r)
637   //
638   // or
639   //      10^m * p2 * 2^e = d + r * 2^e
640   //
641   // i.e.
642   //
643   //      M+ = buffer + p2 * 2^e
644   //         = buffer + 10^-m * (d + r * 2^e)
645   //         = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e
646   //
647   // and stop as soon as 10^-m * r * 2^e <= delta * 2^e
648 
649   int m = 0;
650   for (;;) {
651     // Invariant:
652     //      M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...)
653     //      * 2^e
654     //         = buffer * 10^-m + 10^-m * (p2                                 )
655     //         * 2^e = buffer * 10^-m + 10^-m * (1/10 * (10 * p2) ) * 2^e =
656     //         buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e +
657     //         (10*p2 mod 2^-e)) * 2^e
658     //
659     p2 *= 10;
660     const std::uint64_t d = p2 >> -one.e;     // d = (10 * p2) div 2^-e
661     const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e
662     //
663     //      M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e
664     //         = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e))
665     //         = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e
666     //
667     buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
668     //
669     //      M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e
670     //
671     p2 = r;
672     m++;
673     //
674     //      M+ = buffer * 10^-m + 10^-m * p2 * 2^e
675     // Invariant restored.
676 
677     // Check if enough digits have been generated.
678     //
679     //      10^-m * p2 * 2^e <= delta * 2^e
680     //              p2 * 2^e <= 10^m * delta * 2^e
681     //                    p2 <= 10^m * delta
682     delta *= 10;
683     dist *= 10;
684     if (p2 <= delta) {
685       break;
686     }
687   }
688 
689   // V = buffer * 10^-m, with M- <= V <= M+.
690 
691   decimal_exponent -= m;
692 
693   // 1 ulp in the decimal representation is now 10^-m.
694   // Since delta and dist are now scaled by 10^m, we need to do the
695   // same with ulp in order to keep the units in sync.
696   //
697   //      10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e
698   //
699   const std::uint64_t ten_m = one.f;
700   grisu2_round(buffer, length, dist, delta, p2, ten_m);
701 
702   // By construction this algorithm generates the shortest possible decimal
703   // number (Loitsch, Theorem 6.2) which rounds back to w.
704   // For an input number of precision p, at least
705   //
706   //      N = 1 + ceil(p * log_10(2))
707   //
708   // decimal digits are sufficient to identify all binary floating-point
709   // numbers (Matula, "In-and-Out conversions").
710   // This implies that the algorithm does not produce more than N decimal
711   // digits.
712   //
713   //      N = 17 for p = 53 (IEEE double precision)
714   //      N = 9  for p = 24 (IEEE single precision)
715 }
716 
717 /*!
718 v = buf * 10^decimal_exponent
719 len is the length of the buffer (number of decimal digits)
720 The buffer must be large enough, i.e. >= max_digits10.
721 */
grisu2(char * buf,int & len,int & decimal_exponent,diyfp m_minus,diyfp v,diyfp m_plus)722 inline void grisu2(char *buf, int &len, int &decimal_exponent, diyfp m_minus,
723                    diyfp v, diyfp m_plus) {
724 
725   //  --------(-----------------------+-----------------------)--------    (A)
726   //          m-                      v                       m+
727   //
728   //  --------------------(-----------+-----------------------)--------    (B)
729   //                      m-          v                       m+
730   //
731   // First scale v (and m- and m+) such that the exponent is in the range
732   // [alpha, gamma].
733 
734   const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e);
735 
736   const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k
737 
738   // The exponent of the products is = v.e + c_minus_k.e + q and is in the range
739   // [alpha,gamma]
740   const diyfp w = diyfp::mul(v, c_minus_k);
741   const diyfp w_minus = diyfp::mul(m_minus, c_minus_k);
742   const diyfp w_plus = diyfp::mul(m_plus, c_minus_k);
743 
744   //  ----(---+---)---------------(---+---)---------------(---+---)----
745   //          w-                      w                       w+
746   //          = c*m-                  = c*v                   = c*m+
747   //
748   // diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and
749   // w+ are now off by a small amount.
750   // In fact:
751   //
752   //      w - v * 10^k < 1 ulp
753   //
754   // To account for this inaccuracy, add resp. subtract 1 ulp.
755   //
756   //  --------+---[---------------(---+---)---------------]---+--------
757   //          w-  M-                  w                   M+  w+
758   //
759   // Now any number in [M-, M+] (bounds included) will round to w when input,
760   // regardless of how the input rounding algorithm breaks ties.
761   //
762   // And digit_gen generates the shortest possible such number in [M-, M+].
763   // Note that this does not mean that Grisu2 always generates the shortest
764   // possible number in the interval (m-, m+).
765   const diyfp M_minus(w_minus.f + 1, w_minus.e);
766   const diyfp M_plus(w_plus.f - 1, w_plus.e);
767 
768   decimal_exponent = -cached.k; // = -(-k) = k
769 
770   grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus);
771 }
772 
773 /*!
774 v = buf * 10^decimal_exponent
775 len is the length of the buffer (number of decimal digits)
776 The buffer must be large enough, i.e. >= max_digits10.
777 */
778 template <typename FloatType>
grisu2(char * buf,int & len,int & decimal_exponent,FloatType value)779 void grisu2(char *buf, int &len, int &decimal_exponent, FloatType value) {
780   static_assert(diyfp::kPrecision >= std::numeric_limits<FloatType>::digits + 3,
781                 "internal error: not enough precision");
782 
783   // If the neighbors (and boundaries) of 'value' are always computed for
784   // double-precision numbers, all float's can be recovered using strtod (and
785   // strtof). However, the resulting decimal representations are not exactly
786   // "short".
787   //
788   // The documentation for 'std::to_chars'
789   // (https://en.cppreference.com/w/cpp/utility/to_chars) says "value is
790   // converted to a string as if by std::sprintf in the default ("C") locale"
791   // and since sprintf promotes float's to double's, I think this is exactly
792   // what 'std::to_chars' does. On the other hand, the documentation for
793   // 'std::to_chars' requires that "parsing the representation using the
794   // corresponding std::from_chars function recovers value exactly". That
795   // indicates that single precision floating-point numbers should be recovered
796   // using 'std::strtof'.
797   //
798   // NB: If the neighbors are computed for single-precision numbers, there is a
799   // single float
800   //     (7.0385307e-26f) which can't be recovered using strtod. The resulting
801   //     double precision value is off by 1 ulp.
802 #if 0
803     const boundaries w = compute_boundaries(static_cast<double>(value));
804 #else
805   const boundaries w = compute_boundaries(value);
806 #endif
807 
808   grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus);
809 }
810 
/*!
@brief writes the signed decimal exponent e at buf

The sign ('+' or '-') is always written, followed by at least two digits.
@return a pointer to the element following the exponent.
@pre -1000 < e < 1000
*/
inline char *append_exponent(char *buf, int e) {
  // Sign first; everything below works on the magnitude.
  const bool is_negative = e < 0;
  *buf++ = is_negative ? '-' : '+';
  const auto magnitude = static_cast<std::uint32_t>(is_negative ? -e : e);

  // The hundreds digit is only printed when it is needed.
  if (magnitude >= 100) {
    *buf++ = static_cast<char>('0' + magnitude / 100);
  }

  // Always print at least two digits in the exponent.
  // This is for compatibility with printf("%g").
  *buf++ = static_cast<char>('0' + (magnitude / 10) % 10);
  *buf++ = static_cast<char>('0' + magnitude % 10);

  return buf;
}
845 
846 /*!
847 @brief prettify v = buf * 10^decimal_exponent
848 If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point
849 notation. Otherwise it will be printed in exponential notation.
850 @pre min_exp < 0
851 @pre max_exp > 0
852 */
format_buffer(char * buf,int len,int decimal_exponent,int min_exp,int max_exp)853 inline char *format_buffer(char *buf, int len, int decimal_exponent,
854                            int min_exp, int max_exp) {
855 
856   const int k = len;
857   const int n = len + decimal_exponent;
858 
859   // v = buf * 10^(n-k)
860   // k is the length of the buffer (number of decimal digits)
861   // n is the position of the decimal point relative to the start of the buffer.
862 
863   if (k <= n && n <= max_exp) {
864     // digits[000]
865     // len <= max_exp + 2
866 
867     std::memset(buf + k, '0', static_cast<size_t>(n) - static_cast<size_t>(k));
868     // Make it look like a floating-point number (#362, #378)
869     buf[n + 0] = '.';
870     buf[n + 1] = '0';
871     return buf + (static_cast<size_t>(n) + 2);
872   }
873 
874   if (0 < n && n <= max_exp) {
875     // dig.its
876     // len <= max_digits10 + 1
877     std::memmove(buf + (static_cast<size_t>(n) + 1), buf + n,
878                  static_cast<size_t>(k) - static_cast<size_t>(n));
879     buf[n] = '.';
880     return buf + (static_cast<size_t>(k) + 1U);
881   }
882 
883   if (min_exp < n && n <= 0) {
884     // 0.[000]digits
885     // len <= 2 + (-min_exp - 1) + max_digits10
886 
887     std::memmove(buf + (2 + static_cast<size_t>(-n)), buf,
888                  static_cast<size_t>(k));
889     buf[0] = '0';
890     buf[1] = '.';
891     std::memset(buf + 2, '0', static_cast<size_t>(-n));
892     return buf + (2U + static_cast<size_t>(-n) + static_cast<size_t>(k));
893   }
894 
895   if (k == 1) {
896     // dE+123
897     // len <= 1 + 5
898 
899     buf += 1;
900   } else {
901     // d.igitsE+123
902     // len <= max_digits10 + 1 + 5
903 
904     std::memmove(buf + 2, buf + 1, static_cast<size_t>(k) - 1);
905     buf[1] = '.';
906     buf += 1 + static_cast<size_t>(k);
907   }
908 
909   *buf++ = 'e';
910   return append_exponent(buf, n - 1);
911 }
912 
913 } // namespace dtoa_impl
914 
915 /*!
916 The format of the resulting decimal representation is similar to printf's %g
917 format. Returns an iterator pointing past-the-end of the decimal representation.
918 @note The input number must be finite, i.e. NaN's and Inf's are not supported.
919 @note The buffer must be large enough.
920 @note The result is NOT null-terminated.
921 */
to_chars(char * first,const char * last,double value)922 char *to_chars(char *first, const char *last, double value) {
923   static_cast<void>(last); // maybe unused - fix warning
924   if (value <= -0) {
925     value = -value;
926     *first++ = '-';
927   }
928 
929   if (value == 0) // +-0
930   {
931     *first++ = '0';
932     // Make it look like a floating-point number (#362, #378)
933     *first++ = '.';
934     *first++ = '0';
935     return first;
936   }
937   // Compute v = buffer * 10^decimal_exponent.
938   // The decimal digits are stored in the buffer, which needs to be interpreted
939   // as an unsigned decimal integer.
940   // len is the length of the buffer, i.e. the number of decimal digits.
941   int len = 0;
942   int decimal_exponent = 0;
943   dtoa_impl::grisu2(first, len, decimal_exponent, value);
944   // Format the buffer like printf("%.*g", prec, value)
945   constexpr int kMinExp = -4;
946   constexpr int kMaxExp = std::numeric_limits<double>::digits10;
947 
948   return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp,
949                                   kMaxExp);
950 }
951 } // namespace internal
952 } // namespace simdjson
953 /* end file src/to_chars.cpp */
954 /* begin file src/from_chars.cpp */
955 #include <limits>
956 namespace simdjson {
957 namespace internal {
958 
959 /**
960  * The code in the internal::from_chars function is meant to handle the floating-point number parsing
961  * when we have more than 19 digits in the decimal mantissa. This should only be seen
962  * in adversarial scenarios: we do not expect production systems to even produce
963  * such floating-point numbers.
964  *
965  * The parser is based on work by Nigel Tao (at https://github.com/google/wuffs/)
966  * who credits Ken Thompson for the design (via a reference to the Go source
967  * code). See
968  * https://github.com/google/wuffs/blob/aa46859ea40c72516deffa1b146121952d6dfd3b/internal/cgen/base/floatconv-submodule-data.c
969  * https://github.com/google/wuffs/blob/46cd8105f47ca07ae2ba8e6a7818ef9c0df6c152/internal/cgen/base/floatconv-submodule-code.c
 * It is probably not very fast but it is a fallback that should almost never be
 * called in real life. Google Wuffs is published under the Apache License 2.0.
972  **/
973 
namespace {
// Capacity of decimal::digits. parse_decimal keeps at most this many digits
// and sets decimal::truncated when more were present.
constexpr uint32_t max_digits = 768;
// Bound on the magnitude of decimal::decimal_point checked by
// decimal_right_shift and compute_float after shifting: below
// -decimal_point_range the value is treated as zero, above
// +decimal_point_range as infinity.
constexpr int32_t decimal_point_range = 2047;
} // namespace
978 
// Result of a decimal-to-binary conversion: the mantissa bits together with
// the power-of-two (exponent field) value. Both start out as zero.
struct adjusted_mantissa {
  uint64_t mantissa{0};
  int power2{0};
  // Deliberately user-provided (not "= default") so that the type keeps a
  // user-written default constructor, as before.
  adjusted_mantissa() {}
};
984 
// A decimal number held as a string of digits plus the position of the
// decimal point. The members are not initialized by the struct itself;
// parse_decimal fills them in.
struct decimal {
  // Number of meaningful entries at the front of `digits`.
  uint32_t num_digits;
  // Position of the decimal point counted from digits[0]: the first
  // decimal_point digits form the integer part (see round()). It may be
  // negative or larger than num_digits.
  int32_t decimal_point;
  // Set by parse_decimal when the text starts with '-'.
  bool negative;
  // Set when non-zero digits had to be dropped because they did not fit.
  bool truncated;
  // Digit values (0..9), most significant first.
  uint8_t digits[max_digits];
};
992 
/**
 * Compile-time parameters of a binary floating-point format. The primary
 * template only declares the accessors; each supported type provides them
 * through explicit specializations (only double is specialized here).
 **/
template <typename T> struct binary_format {
  // Number of mantissa bits that are explicitly stored.
  static constexpr int mantissa_explicit_bits();
  // Smallest exponent value, used as the exponent bias (negated).
  static constexpr int minimum_exponent();
  // Exponent field value that denotes infinity.
  static constexpr int infinite_power();
  // Bit position of the sign bit.
  static constexpr int sign_index();
};

// Parameters for double (binary64).
template <> constexpr int binary_format<double>::mantissa_explicit_bits() { return 52; }
template <> constexpr int binary_format<double>::minimum_exponent() { return -1023; }
template <> constexpr int binary_format<double>::infinite_power() { return 0x7FF; }
template <> constexpr int binary_format<double>::sign_index() { return 63; }
1012 
// Returns true exactly when c is one of the ASCII decimal digits '0'..'9'.
bool is_integer(char c) noexcept {
  const bool outside_digit_range = (c < '0') || (c > '9');
  return !outside_digit_range;
}
1014 
1015 // This should always succeed since it follows a call to parse_number.
parse_decimal(const char * & p)1016 decimal parse_decimal(const char *&p) noexcept {
1017   decimal answer;
1018   answer.num_digits = 0;
1019   answer.decimal_point = 0;
1020   answer.truncated = false;
1021   answer.negative = (*p == '-');
1022   if ((*p == '-') || (*p == '+')) {
1023     ++p;
1024   }
1025 
1026   while (*p == '0') {
1027     ++p;
1028   }
1029   while (is_integer(*p)) {
1030     if (answer.num_digits < max_digits) {
1031       answer.digits[answer.num_digits] = uint8_t(*p - '0');
1032     }
1033     answer.num_digits++;
1034     ++p;
1035   }
1036   if (*p == '.') {
1037     ++p;
1038     const char *first_after_period = p;
1039     // if we have not yet encountered a zero, we have to skip it as well
1040     if (answer.num_digits == 0) {
1041       // skip zeros
1042       while (*p == '0') {
1043         ++p;
1044       }
1045     }
1046     while (is_integer(*p)) {
1047       if (answer.num_digits < max_digits) {
1048         answer.digits[answer.num_digits] = uint8_t(*p - '0');
1049       }
1050       answer.num_digits++;
1051       ++p;
1052     }
1053     answer.decimal_point = int32_t(first_after_period - p);
1054   }
1055   if(answer.num_digits > 0) {
1056     const char *preverse = p - 1;
1057     int32_t trailing_zeros = 0;
1058     while ((*preverse == '0') || (*preverse == '.')) {
1059       if(*preverse == '0') { trailing_zeros++; };
1060       --preverse;
1061     }
1062     answer.decimal_point += int32_t(answer.num_digits);
1063     answer.num_digits -= uint32_t(trailing_zeros);
1064   }
1065   if(answer.num_digits > max_digits ) {
1066     answer.num_digits = max_digits;
1067     answer.truncated = true;
1068   }
1069   if (('e' == *p) || ('E' == *p)) {
1070     ++p;
1071     bool neg_exp = false;
1072     if ('-' == *p) {
1073       neg_exp = true;
1074       ++p;
1075     } else if ('+' == *p) {
1076       ++p;
1077     }
1078     int32_t exp_number = 0; // exponential part
1079     while (is_integer(*p)) {
1080       uint8_t digit = uint8_t(*p - '0');
1081       if (exp_number < 0x10000) {
1082         exp_number = 10 * exp_number + digit;
1083       }
1084       ++p;
1085     }
1086     answer.decimal_point += (neg_exp ? -exp_number : exp_number);
1087   }
1088   return answer;
1089 }
1090 
1091 namespace {
1092 
1093 // remove all final zeroes
trim(decimal & h)1094 inline void trim(decimal &h) {
1095   while ((h.num_digits > 0) && (h.digits[h.num_digits - 1] == 0)) {
1096     h.num_digits--;
1097   }
1098 }
1099 
// Returns how many decimal digits decimal_left_shift adds in front of h when
// it shifts h left by `shift` bits. Only the low 6 bits of `shift` are used.
// h is only read, never modified.
//
// For each shift the first table stores a candidate digit count and the
// location of a reference digit string in the second table. The leading
// digits of h are compared with that reference: when h is smaller (or runs
// out of digits first) the result is the candidate count minus one, otherwise
// it is the candidate count.
uint32_t number_of_digits_decimal_left_shift(decimal &h, uint32_t shift) {
  // Keep the index within the table (entries shift and shift + 1 are read).
  shift &= 63;
  // Entry layout: the bits above bit 10 hold the candidate number of new
  // digits, the low 11 bits hold an offset into the powers-of-5 table below.
  // The reference digits for `shift` span [offset(shift), offset(shift + 1)).
  // The final entries all point to the end of the second table (0x051C) with
  // a digit count of zero.
  const static uint16_t number_of_digits_decimal_left_shift_table[65] = {
      0x0000, 0x0800, 0x0801, 0x0803, 0x1006, 0x1009, 0x100D, 0x1812, 0x1817,
      0x181D, 0x2024, 0x202B, 0x2033, 0x203C, 0x2846, 0x2850, 0x285B, 0x3067,
      0x3073, 0x3080, 0x388E, 0x389C, 0x38AB, 0x38BB, 0x40CC, 0x40DD, 0x40EF,
      0x4902, 0x4915, 0x4929, 0x513E, 0x5153, 0x5169, 0x5180, 0x5998, 0x59B0,
      0x59C9, 0x61E3, 0x61FD, 0x6218, 0x6A34, 0x6A50, 0x6A6D, 0x6A8B, 0x72AA,
      0x72C9, 0x72E9, 0x7B0A, 0x7B2B, 0x7B4D, 0x8370, 0x8393, 0x83B7, 0x83DC,
      0x8C02, 0x8C28, 0x8C4F, 0x9477, 0x949F, 0x94C8, 0x9CF2, 0x051C, 0x051C,
      0x051C, 0x051C,
  };
  uint32_t x_a = number_of_digits_decimal_left_shift_table[shift];
  uint32_t x_b = number_of_digits_decimal_left_shift_table[shift + 1];
  uint32_t num_new_digits = x_a >> 11;
  uint32_t pow5_a = 0x7FF & x_a;
  uint32_t pow5_b = 0x7FF & x_b;
  // Concatenated decimal digits of the successive powers of five
  // (5, 25, 125, 625, ...), one digit per byte.
  const static uint8_t
      number_of_digits_decimal_left_shift_table_powers_of_5[0x051C] = {
          5, 2, 5, 1, 2, 5, 6, 2, 5, 3, 1, 2, 5, 1, 5, 6, 2, 5, 7, 8, 1, 2, 5,
          3, 9, 0, 6, 2, 5, 1, 9, 5, 3, 1, 2, 5, 9, 7, 6, 5, 6, 2, 5, 4, 8, 8,
          2, 8, 1, 2, 5, 2, 4, 4, 1, 4, 0, 6, 2, 5, 1, 2, 2, 0, 7, 0, 3, 1, 2,
          5, 6, 1, 0, 3, 5, 1, 5, 6, 2, 5, 3, 0, 5, 1, 7, 5, 7, 8, 1, 2, 5, 1,
          5, 2, 5, 8, 7, 8, 9, 0, 6, 2, 5, 7, 6, 2, 9, 3, 9, 4, 5, 3, 1, 2, 5,
          3, 8, 1, 4, 6, 9, 7, 2, 6, 5, 6, 2, 5, 1, 9, 0, 7, 3, 4, 8, 6, 3, 2,
          8, 1, 2, 5, 9, 5, 3, 6, 7, 4, 3, 1, 6, 4, 0, 6, 2, 5, 4, 7, 6, 8, 3,
          7, 1, 5, 8, 2, 0, 3, 1, 2, 5, 2, 3, 8, 4, 1, 8, 5, 7, 9, 1, 0, 1, 5,
          6, 2, 5, 1, 1, 9, 2, 0, 9, 2, 8, 9, 5, 5, 0, 7, 8, 1, 2, 5, 5, 9, 6,
          0, 4, 6, 4, 4, 7, 7, 5, 3, 9, 0, 6, 2, 5, 2, 9, 8, 0, 2, 3, 2, 2, 3,
          8, 7, 6, 9, 5, 3, 1, 2, 5, 1, 4, 9, 0, 1, 1, 6, 1, 1, 9, 3, 8, 4, 7,
          6, 5, 6, 2, 5, 7, 4, 5, 0, 5, 8, 0, 5, 9, 6, 9, 2, 3, 8, 2, 8, 1, 2,
          5, 3, 7, 2, 5, 2, 9, 0, 2, 9, 8, 4, 6, 1, 9, 1, 4, 0, 6, 2, 5, 1, 8,
          6, 2, 6, 4, 5, 1, 4, 9, 2, 3, 0, 9, 5, 7, 0, 3, 1, 2, 5, 9, 3, 1, 3,
          2, 2, 5, 7, 4, 6, 1, 5, 4, 7, 8, 5, 1, 5, 6, 2, 5, 4, 6, 5, 6, 6, 1,
          2, 8, 7, 3, 0, 7, 7, 3, 9, 2, 5, 7, 8, 1, 2, 5, 2, 3, 2, 8, 3, 0, 6,
          4, 3, 6, 5, 3, 8, 6, 9, 6, 2, 8, 9, 0, 6, 2, 5, 1, 1, 6, 4, 1, 5, 3,
          2, 1, 8, 2, 6, 9, 3, 4, 8, 1, 4, 4, 5, 3, 1, 2, 5, 5, 8, 2, 0, 7, 6,
          6, 0, 9, 1, 3, 4, 6, 7, 4, 0, 7, 2, 2, 6, 5, 6, 2, 5, 2, 9, 1, 0, 3,
          8, 3, 0, 4, 5, 6, 7, 3, 3, 7, 0, 3, 6, 1, 3, 2, 8, 1, 2, 5, 1, 4, 5,
          5, 1, 9, 1, 5, 2, 2, 8, 3, 6, 6, 8, 5, 1, 8, 0, 6, 6, 4, 0, 6, 2, 5,
          7, 2, 7, 5, 9, 5, 7, 6, 1, 4, 1, 8, 3, 4, 2, 5, 9, 0, 3, 3, 2, 0, 3,
          1, 2, 5, 3, 6, 3, 7, 9, 7, 8, 8, 0, 7, 0, 9, 1, 7, 1, 2, 9, 5, 1, 6,
          6, 0, 1, 5, 6, 2, 5, 1, 8, 1, 8, 9, 8, 9, 4, 0, 3, 5, 4, 5, 8, 5, 6,
          4, 7, 5, 8, 3, 0, 0, 7, 8, 1, 2, 5, 9, 0, 9, 4, 9, 4, 7, 0, 1, 7, 7,
          2, 9, 2, 8, 2, 3, 7, 9, 1, 5, 0, 3, 9, 0, 6, 2, 5, 4, 5, 4, 7, 4, 7,
          3, 5, 0, 8, 8, 6, 4, 6, 4, 1, 1, 8, 9, 5, 7, 5, 1, 9, 5, 3, 1, 2, 5,
          2, 2, 7, 3, 7, 3, 6, 7, 5, 4, 4, 3, 2, 3, 2, 0, 5, 9, 4, 7, 8, 7, 5,
          9, 7, 6, 5, 6, 2, 5, 1, 1, 3, 6, 8, 6, 8, 3, 7, 7, 2, 1, 6, 1, 6, 0,
          2, 9, 7, 3, 9, 3, 7, 9, 8, 8, 2, 8, 1, 2, 5, 5, 6, 8, 4, 3, 4, 1, 8,
          8, 6, 0, 8, 0, 8, 0, 1, 4, 8, 6, 9, 6, 8, 9, 9, 4, 1, 4, 0, 6, 2, 5,
          2, 8, 4, 2, 1, 7, 0, 9, 4, 3, 0, 4, 0, 4, 0, 0, 7, 4, 3, 4, 8, 4, 4,
          9, 7, 0, 7, 0, 3, 1, 2, 5, 1, 4, 2, 1, 0, 8, 5, 4, 7, 1, 5, 2, 0, 2,
          0, 0, 3, 7, 1, 7, 4, 2, 2, 4, 8, 5, 3, 5, 1, 5, 6, 2, 5, 7, 1, 0, 5,
          4, 2, 7, 3, 5, 7, 6, 0, 1, 0, 0, 1, 8, 5, 8, 7, 1, 1, 2, 4, 2, 6, 7,
          5, 7, 8, 1, 2, 5, 3, 5, 5, 2, 7, 1, 3, 6, 7, 8, 8, 0, 0, 5, 0, 0, 9,
          2, 9, 3, 5, 5, 6, 2, 1, 3, 3, 7, 8, 9, 0, 6, 2, 5, 1, 7, 7, 6, 3, 5,
          6, 8, 3, 9, 4, 0, 0, 2, 5, 0, 4, 6, 4, 6, 7, 7, 8, 1, 0, 6, 6, 8, 9,
          4, 5, 3, 1, 2, 5, 8, 8, 8, 1, 7, 8, 4, 1, 9, 7, 0, 0, 1, 2, 5, 2, 3,
          2, 3, 3, 8, 9, 0, 5, 3, 3, 4, 4, 7, 2, 6, 5, 6, 2, 5, 4, 4, 4, 0, 8,
          9, 2, 0, 9, 8, 5, 0, 0, 6, 2, 6, 1, 6, 1, 6, 9, 4, 5, 2, 6, 6, 7, 2,
          3, 6, 3, 2, 8, 1, 2, 5, 2, 2, 2, 0, 4, 4, 6, 0, 4, 9, 2, 5, 0, 3, 1,
          3, 0, 8, 0, 8, 4, 7, 2, 6, 3, 3, 3, 6, 1, 8, 1, 6, 4, 0, 6, 2, 5, 1,
          1, 1, 0, 2, 2, 3, 0, 2, 4, 6, 2, 5, 1, 5, 6, 5, 4, 0, 4, 2, 3, 6, 3,
          1, 6, 6, 8, 0, 9, 0, 8, 2, 0, 3, 1, 2, 5, 5, 5, 5, 1, 1, 1, 5, 1, 2,
          3, 1, 2, 5, 7, 8, 2, 7, 0, 2, 1, 1, 8, 1, 5, 8, 3, 4, 0, 4, 5, 4, 1,
          0, 1, 5, 6, 2, 5, 2, 7, 7, 5, 5, 5, 7, 5, 6, 1, 5, 6, 2, 8, 9, 1, 3,
          5, 1, 0, 5, 9, 0, 7, 9, 1, 7, 0, 2, 2, 7, 0, 5, 0, 7, 8, 1, 2, 5, 1,
          3, 8, 7, 7, 7, 8, 7, 8, 0, 7, 8, 1, 4, 4, 5, 6, 7, 5, 5, 2, 9, 5, 3,
          9, 5, 8, 5, 1, 1, 3, 5, 2, 5, 3, 9, 0, 6, 2, 5, 6, 9, 3, 8, 8, 9, 3,
          9, 0, 3, 9, 0, 7, 2, 2, 8, 3, 7, 7, 6, 4, 7, 6, 9, 7, 9, 2, 5, 5, 6,
          7, 6, 2, 6, 9, 5, 3, 1, 2, 5, 3, 4, 6, 9, 4, 4, 6, 9, 5, 1, 9, 5, 3,
          6, 1, 4, 1, 8, 8, 8, 2, 3, 8, 4, 8, 9, 6, 2, 7, 8, 3, 8, 1, 3, 4, 7,
          6, 5, 6, 2, 5, 1, 7, 3, 4, 7, 2, 3, 4, 7, 5, 9, 7, 6, 8, 0, 7, 0, 9,
          4, 4, 1, 1, 9, 2, 4, 4, 8, 1, 3, 9, 1, 9, 0, 6, 7, 3, 8, 2, 8, 1, 2,
          5, 8, 6, 7, 3, 6, 1, 7, 3, 7, 9, 8, 8, 4, 0, 3, 5, 4, 7, 2, 0, 5, 9,
          6, 2, 2, 4, 0, 6, 9, 5, 9, 5, 3, 3, 6, 9, 1, 4, 0, 6, 2, 5,
      };
  // Reference digits for this shift and how many of them there are.
  const uint8_t *pow5 =
      &number_of_digits_decimal_left_shift_table_powers_of_5[pow5_a];
  uint32_t i = 0;
  uint32_t n = pow5_b - pow5_a;
  // Digit-by-digit comparison of h with the reference, most significant first.
  for (; i < n; i++) {
    if (i >= h.num_digits) {
      // h ran out of digits before the reference did.
      return num_new_digits - 1;
    } else if (h.digits[i] == pow5[i]) {
      continue;
    } else if (h.digits[i] < pow5[i]) {
      return num_new_digits - 1;
    } else {
      return num_new_digits;
    }
  }
  // All reference digits matched.
  return num_new_digits;
}
1194 
1195 } // end of anonymous namespace
1196 
round(decimal & h)1197 uint64_t round(decimal &h) {
1198   if ((h.num_digits == 0) || (h.decimal_point < 0)) {
1199     return 0;
1200   } else if (h.decimal_point > 18) {
1201     return UINT64_MAX;
1202   }
1203   // at this point, we know that h.decimal_point >= 0
1204   uint32_t dp = uint32_t(h.decimal_point);
1205   uint64_t n = 0;
1206   for (uint32_t i = 0; i < dp; i++) {
1207     n = (10 * n) + ((i < h.num_digits) ? h.digits[i] : 0);
1208   }
1209   bool round_up = false;
1210   if (dp < h.num_digits) {
1211     round_up = h.digits[dp] >= 5; // normally, we round up
1212     // but we may need to round to even!
1213     if ((h.digits[dp] == 5) && (dp + 1 == h.num_digits)) {
1214       round_up = h.truncated || ((dp > 0) && (1 & h.digits[dp - 1]));
1215     }
1216   }
1217   if (round_up) {
1218     n++;
1219   }
1220   return n;
1221 }
1222 
1223 // computes h * 2^-shift
decimal_left_shift(decimal & h,uint32_t shift)1224 void decimal_left_shift(decimal &h, uint32_t shift) {
1225   if (h.num_digits == 0) {
1226     return;
1227   }
1228   uint32_t num_new_digits = number_of_digits_decimal_left_shift(h, shift);
1229   int32_t read_index = int32_t(h.num_digits - 1);
1230   uint32_t write_index = h.num_digits - 1 + num_new_digits;
1231   uint64_t n = 0;
1232 
1233   while (read_index >= 0) {
1234     n += uint64_t(h.digits[read_index]) << shift;
1235     uint64_t quotient = n / 10;
1236     uint64_t remainder = n - (10 * quotient);
1237     if (write_index < max_digits) {
1238       h.digits[write_index] = uint8_t(remainder);
1239     } else if (remainder > 0) {
1240       h.truncated = true;
1241     }
1242     n = quotient;
1243     write_index--;
1244     read_index--;
1245   }
1246   while (n > 0) {
1247     uint64_t quotient = n / 10;
1248     uint64_t remainder = n - (10 * quotient);
1249     if (write_index < max_digits) {
1250       h.digits[write_index] = uint8_t(remainder);
1251     } else if (remainder > 0) {
1252       h.truncated = true;
1253     }
1254     n = quotient;
1255     write_index--;
1256   }
1257   h.num_digits += num_new_digits;
1258   if (h.num_digits > max_digits) {
1259     h.num_digits = max_digits;
1260   }
1261   h.decimal_point += int32_t(num_new_digits);
1262   trim(h);
1263 }
1264 
1265 // computes h * 2^shift
decimal_right_shift(decimal & h,uint32_t shift)1266 void decimal_right_shift(decimal &h, uint32_t shift) {
1267   uint32_t read_index = 0;
1268   uint32_t write_index = 0;
1269 
1270   uint64_t n = 0;
1271 
1272   while ((n >> shift) == 0) {
1273     if (read_index < h.num_digits) {
1274       n = (10 * n) + h.digits[read_index++];
1275     } else if (n == 0) {
1276       return;
1277     } else {
1278       while ((n >> shift) == 0) {
1279         n = 10 * n;
1280         read_index++;
1281       }
1282       break;
1283     }
1284   }
1285   h.decimal_point -= int32_t(read_index - 1);
1286   if (h.decimal_point < -decimal_point_range) { // it is zero
1287     h.num_digits = 0;
1288     h.decimal_point = 0;
1289     h.negative = false;
1290     h.truncated = false;
1291     return;
1292   }
1293   uint64_t mask = (uint64_t(1) << shift) - 1;
1294   while (read_index < h.num_digits) {
1295     uint8_t new_digit = uint8_t(n >> shift);
1296     n = (10 * (n & mask)) + h.digits[read_index++];
1297     h.digits[write_index++] = new_digit;
1298   }
1299   while (n > 0) {
1300     uint8_t new_digit = uint8_t(n >> shift);
1301     n = 10 * (n & mask);
1302     if (write_index < max_digits) {
1303       h.digits[write_index++] = new_digit;
1304     } else if (new_digit > 0) {
1305       h.truncated = true;
1306     }
1307   }
1308   h.num_digits = write_index;
1309   trim(h);
1310 }
1311 
compute_float(decimal & d)1312 template <typename binary> adjusted_mantissa compute_float(decimal &d) {
1313   adjusted_mantissa answer;
1314   if (d.num_digits == 0) {
1315     // should be zero
1316     answer.power2 = 0;
1317     answer.mantissa = 0;
1318     return answer;
1319   }
1320   // At this point, going further, we can assume that d.num_digits > 0.
1321   // We want to guard against excessive decimal point values because
1322   // they can result in long running times. Indeed, we do
1323   // shifts by at most 60 bits. We have that log(10**400)/log(2**60) ~= 22
1324   // which is fine, but log(10**299995)/log(2**60) ~= 16609 which is not
1325   // fine (runs for a long time).
1326   //
1327   if(d.decimal_point < -324) {
1328     // We have something smaller than 1e-324 which is always zero
1329     // in binary64 and binary32.
1330     // It should be zero.
1331     answer.power2 = 0;
1332     answer.mantissa = 0;
1333     return answer;
1334   } else if(d.decimal_point >= 310) {
1335     // We have something at least as large as 0.1e310 which is
1336     // always infinite.
1337     answer.power2 = binary::infinite_power();
1338     answer.mantissa = 0;
1339     return answer;
1340   }
1341 
1342   static const uint32_t max_shift = 60;
1343   static const uint32_t num_powers = 19;
1344   static const uint8_t powers[19] = {
1345       0,  3,  6,  9,  13, 16, 19, 23, 26, 29, //
1346       33, 36, 39, 43, 46, 49, 53, 56, 59,     //
1347   };
1348   int32_t exp2 = 0;
1349   while (d.decimal_point > 0) {
1350     uint32_t n = uint32_t(d.decimal_point);
1351     uint32_t shift = (n < num_powers) ? powers[n] : max_shift;
1352     decimal_right_shift(d, shift);
1353     if (d.decimal_point < -decimal_point_range) {
1354       // should be zero
1355       answer.power2 = 0;
1356       answer.mantissa = 0;
1357       return answer;
1358     }
1359     exp2 += int32_t(shift);
1360   }
1361   // We shift left toward [1/2 ... 1].
1362   while (d.decimal_point <= 0) {
1363     uint32_t shift;
1364     if (d.decimal_point == 0) {
1365       if (d.digits[0] >= 5) {
1366         break;
1367       }
1368       shift = (d.digits[0] < 2) ? 2 : 1;
1369     } else {
1370       uint32_t n = uint32_t(-d.decimal_point);
1371       shift = (n < num_powers) ? powers[n] : max_shift;
1372     }
1373     decimal_left_shift(d, shift);
1374     if (d.decimal_point > decimal_point_range) {
1375       // we want to get infinity:
1376       answer.power2 = 0xFF;
1377       answer.mantissa = 0;
1378       return answer;
1379     }
1380     exp2 -= int32_t(shift);
1381   }
1382   // We are now in the range [1/2 ... 1] but the binary format uses [1 ... 2].
1383   exp2--;
1384   constexpr int32_t minimum_exponent = binary::minimum_exponent();
1385   while ((minimum_exponent + 1) > exp2) {
1386     uint32_t n = uint32_t((minimum_exponent + 1) - exp2);
1387     if (n > max_shift) {
1388       n = max_shift;
1389     }
1390     decimal_right_shift(d, n);
1391     exp2 += int32_t(n);
1392   }
1393   if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
1394     answer.power2 = binary::infinite_power();
1395     answer.mantissa = 0;
1396     return answer;
1397   }
1398 
1399   const int mantissa_size_in_bits = binary::mantissa_explicit_bits() + 1;
1400   decimal_left_shift(d, mantissa_size_in_bits);
1401 
1402   uint64_t mantissa = round(d);
1403   // It is possible that we have an overflow, in which case we need
1404   // to shift back.
1405   if (mantissa >= (uint64_t(1) << mantissa_size_in_bits)) {
1406     decimal_right_shift(d, 1);
1407     exp2 += 1;
1408     mantissa = round(d);
1409     if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
1410       answer.power2 = binary::infinite_power();
1411       answer.mantissa = 0;
1412       return answer;
1413     }
1414   }
1415   answer.power2 = exp2 - binary::minimum_exponent();
1416   if (mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) {
1417     answer.power2--;
1418   }
1419   answer.mantissa =
1420       mantissa & ((uint64_t(1) << binary::mantissa_explicit_bits()) - 1);
1421   return answer;
1422 }
1423 
1424 template <typename binary>
parse_long_mantissa(const char * first)1425 adjusted_mantissa parse_long_mantissa(const char *first) {
1426   decimal d = parse_decimal(first);
1427   return compute_float<binary>(d);
1428 }
1429 
from_chars(const char * first)1430 double from_chars(const char *first) noexcept {
1431   bool negative = first[0] == '-';
1432   if (negative) {
1433     first++;
1434   }
1435   adjusted_mantissa am = parse_long_mantissa<binary_format<double>>(first);
1436   uint64_t word = am.mantissa;
1437   word |= uint64_t(am.power2)
1438           << binary_format<double>::mantissa_explicit_bits();
1439   word = negative ? word | (uint64_t(1) << binary_format<double>::sign_index())
1440                   : word;
1441   double value;
1442   std::memcpy(&value, &word, sizeof(double));
1443   return value;
1444 }
1445 
1446 } // internal
1447 } // simdjson
1448 /* end file src/from_chars.cpp */
1449 /* begin file src/internal/error_tables.cpp */
1450 
1451 namespace simdjson {
1452 namespace internal {
1453 
  // Table pairing each error code with its human-readable description.
  // NOTE(review): presumably looked up by the numeric value of the error code,
  // in which case the order of the entries must match the enum declaration --
  // confirm against the error_code definition before reordering.
  SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[] {
    { SUCCESS, "No error" },
    { CAPACITY, "This parser can't support a document that big" },
    { MEMALLOC, "Error allocating memory, we're most likely out of memory" },
    { TAPE_ERROR, "The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc." },
    { DEPTH_ERROR, "The JSON document was too deep (too many nested objects and arrays)" },
    { STRING_ERROR, "Problem while parsing a string" },
    { T_ATOM_ERROR, "Problem while parsing an atom starting with the letter 't'" },
    { F_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'f'" },
    { N_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'n'" },
    { NUMBER_ERROR, "Problem while parsing a number" },
    { UTF8_ERROR, "The input is not valid UTF-8" },
    { UNINITIALIZED, "Uninitialized" },
    { EMPTY, "Empty: no JSON found" },
    { UNESCAPED_CHARS, "Within strings, some characters must be escaped, we found unescaped characters" },
    { UNCLOSED_STRING, "A string is opened, but never closed." },
    { UNSUPPORTED_ARCHITECTURE, "simdjson does not have an implementation supported by this CPU architecture (perhaps it's a non-SIMD CPU?)." },
    { INCORRECT_TYPE, "The JSON element does not have the requested type." },
    { NUMBER_OUT_OF_RANGE, "The JSON number is too large or too small to fit within the requested type." },
    { INDEX_OUT_OF_BOUNDS, "Attempted to access an element of a JSON array that is beyond its length." },
    { NO_SUCH_FIELD, "The JSON field referenced does not exist in this object." },
    { IO_ERROR, "Error reading the file." },
    { INVALID_JSON_POINTER, "Invalid JSON pointer syntax." },
    { INVALID_URI_FRAGMENT, "Invalid URI fragment syntax." },
    { UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as you may have found a bug in simdjson" },
    { PARSER_IN_USE, "Cannot parse a new document while a document is still in use." },
    { OUT_OF_ORDER_ITERATION, "Objects and arrays can only be iterated when they are first encountered." },
    { INSUFFICIENT_PADDING, "simdjson requires the input JSON string to have at least SIMDJSON_PADDING extra bytes allocated, beyond the string's length." }
  }; // error_codes[]
1483 
1484 } // namespace internal
1485 } // namespace simdjson
1486 /* end file src/internal/error_tables.cpp */
1487 /* begin file src/internal/jsoncharutils_tables.cpp */
1488 
1489 namespace simdjson {
1490 namespace internal {
1491 
// The structural characters are
//   { 0x7b   } 0x7d   : 0x3a   [ 0x5b   ] 0x5d   , 0x2c   (and NULL)
// We are also interested in the four whitespace characters:
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d.
// Note: in the two lookup tables below, index 0 (NULL) is not flagged as
// structural or whitespace; only the ten characters listed above are.

// Indexed by byte value: 0 for the structural and whitespace characters
// listed above, 1 for every other byte (the negation of
// structural_or_whitespace).
SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace_negated[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
1513 
// Indexed by byte value: 1 for the JSON structural characters
// ({ } : [ ] ,) and the four whitespace characters (space, linefeed,
// horizontal tab, carriage return), 0 for every other byte.
SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1526 
1527 SIMDJSON_DLLIMPORTEXPORT const uint32_t digit_to_val32[886] = {
1528     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1529     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1530     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1531     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1532     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1533     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1534     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1535     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1536     0x0,        0x1,        0x2,        0x3,        0x4,        0x5,
1537     0x6,        0x7,        0x8,        0x9,        0xFFFFFFFF, 0xFFFFFFFF,
1538     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa,
1539     0xb,        0xc,        0xd,        0xe,        0xf,        0xFFFFFFFF,
1540     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1541     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1542     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1543     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1544     0xFFFFFFFF, 0xa,        0xb,        0xc,        0xd,        0xe,
1545     0xf,        0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1546     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1547     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1548     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1549     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1550     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1551     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1552     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1553     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1554     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1555     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1556     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1557     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1558     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1559     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1560     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1561     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1562     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1563     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1564     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1565     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1566     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1567     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1568     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1569     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1570     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1571     0x0,        0x10,       0x20,       0x30,       0x40,       0x50,
1572     0x60,       0x70,       0x80,       0x90,       0xFFFFFFFF, 0xFFFFFFFF,
1573     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0,
1574     0xb0,       0xc0,       0xd0,       0xe0,       0xf0,       0xFFFFFFFF,
1575     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1576     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1577     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1578     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1579     0xFFFFFFFF, 0xa0,       0xb0,       0xc0,       0xd0,       0xe0,
1580     0xf0,       0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1581     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1582     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1583     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1584     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1585     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1586     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1587     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1588     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1589     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1590     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1591     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1592     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1593     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1594     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1595     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1596     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1597     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1598     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1599     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1600     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1601     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1602     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1603     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1604     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1605     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1606     0x0,        0x100,      0x200,      0x300,      0x400,      0x500,
1607     0x600,      0x700,      0x800,      0x900,      0xFFFFFFFF, 0xFFFFFFFF,
1608     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00,
1609     0xb00,      0xc00,      0xd00,      0xe00,      0xf00,      0xFFFFFFFF,
1610     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1611     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1612     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1613     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1614     0xFFFFFFFF, 0xa00,      0xb00,      0xc00,      0xd00,      0xe00,
1615     0xf00,      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1616     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1617     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1618     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1619     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1620     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1621     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1622     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1623     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1624     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1625     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1626     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1627     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1628     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1629     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1630     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1631     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1632     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1633     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1634     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1635     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1636     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1637     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1638     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1639     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1640     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1641     0x0,        0x1000,     0x2000,     0x3000,     0x4000,     0x5000,
1642     0x6000,     0x7000,     0x8000,     0x9000,     0xFFFFFFFF, 0xFFFFFFFF,
1643     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000,
1644     0xb000,     0xc000,     0xd000,     0xe000,     0xf000,     0xFFFFFFFF,
1645     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1646     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1647     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1648     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1649     0xFFFFFFFF, 0xa000,     0xb000,     0xc000,     0xd000,     0xe000,
1650     0xf000,     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1651     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1652     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1653     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1654     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1655     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1656     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1657     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1658     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1659     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1660     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1661     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1662     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1663     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1664     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1665     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1666     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1667     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1668     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1669     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1670     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1671     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1672     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1673     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1674     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1675     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
1676 
1677 } // namespace internal
1678 } // namespace simdjson
1679 /* end file src/internal/jsoncharutils_tables.cpp */
1680 /* begin file src/internal/numberparsing_tables.cpp */
1681 
1682 namespace simdjson {
1683 namespace internal {
1684 
// Powers of ten from 10^0 through 10^22, indexed by exponent. Every one of
// these values is exactly representable as a double (10^22 = 2^22 * 5^22 and
// 5^22 < 2^53), so the entries carry no rounding error.
SIMDJSON_DLLIMPORTEXPORT const double power_of_ten[] = {
    1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,  1e10, 1e11,
    1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
// The array bound is deduced from the initializer list, so pin the documented
// 10^0..10^22 range at compile time: dropping or adding an entry by accident
// must not silently change which exponents are valid indexes.
static_assert(sizeof(power_of_ten) / sizeof(power_of_ten[0]) == 23,
              "power_of_ten must hold exactly 10^0 through 10^22");
1690 
/**
 * When mapping numbers from decimal to binary,
 * we go from w * 10^q to m * 2^p. Since
 * 10^q = 5^q * 2^q, we are effectively
 * trying to match w * 2^q * 5^q to m * 2^p.
 * The powers of two are not a concern, because they can be
 * represented exactly in binary notation; only the powers of
 * five affect the binary significand.
 */
1701 
1702 
1703 // The truncated powers of five from 5^-342 all the way to 5^308
1704 // The mantissa is truncated to 128 bits, and
1705 // never rounded up. Uses about 10KB.
1706 SIMDJSON_DLLIMPORTEXPORT const uint64_t power_of_five_128[]= {
1707         0xeef453d6923bd65a,0x113faa2906a13b3f,
1708         0x9558b4661b6565f8,0x4ac7ca59a424c507,
1709         0xbaaee17fa23ebf76,0x5d79bcf00d2df649,
1710         0xe95a99df8ace6f53,0xf4d82c2c107973dc,
1711         0x91d8a02bb6c10594,0x79071b9b8a4be869,
1712         0xb64ec836a47146f9,0x9748e2826cdee284,
1713         0xe3e27a444d8d98b7,0xfd1b1b2308169b25,
1714         0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7,
1715         0xb208ef855c969f4f,0xbdbd2d335e51a935,
1716         0xde8b2b66b3bc4723,0xad2c788035e61382,
1717         0x8b16fb203055ac76,0x4c3bcb5021afcc31,
1718         0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d,
1719         0xd953e8624b85dd78,0xd71d6dad34a2af0d,
1720         0x87d4713d6f33aa6b,0x8672648c40e5ad68,
1721         0xa9c98d8ccb009506,0x680efdaf511f18c2,
1722         0xd43bf0effdc0ba48,0x212bd1b2566def2,
1723         0x84a57695fe98746d,0x14bb630f7604b57,
1724         0xa5ced43b7e3e9188,0x419ea3bd35385e2d,
1725         0xcf42894a5dce35ea,0x52064cac828675b9,
1726         0x818995ce7aa0e1b2,0x7343efebd1940993,
1727         0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8,
1728         0xca66fa129f9b60a6,0xd41a26e077774ef6,
1729         0xfd00b897478238d0,0x8920b098955522b4,
1730         0x9e20735e8cb16382,0x55b46e5f5d5535b0,
1731         0xc5a890362fddbc62,0xeb2189f734aa831d,
1732         0xf712b443bbd52b7b,0xa5e9ec7501d523e4,
1733         0x9a6bb0aa55653b2d,0x47b233c92125366e,
1734         0xc1069cd4eabe89f8,0x999ec0bb696e840a,
1735         0xf148440a256e2c76,0xc00670ea43ca250d,
1736         0x96cd2a865764dbca,0x380406926a5e5728,
1737         0xbc807527ed3e12bc,0xc605083704f5ecf2,
1738         0xeba09271e88d976b,0xf7864a44c633682e,
1739         0x93445b8731587ea3,0x7ab3ee6afbe0211d,
1740         0xb8157268fdae9e4c,0x5960ea05bad82964,
1741         0xe61acf033d1a45df,0x6fb92487298e33bd,
1742         0x8fd0c16206306bab,0xa5d3b6d479f8e056,
1743         0xb3c4f1ba87bc8696,0x8f48a4899877186c,
1744         0xe0b62e2929aba83c,0x331acdabfe94de87,
1745         0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14,
1746         0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9,
1747         0xdb71e91432b1a24a,0xc9e82cd9f69d6150,
1748         0x892731ac9faf056e,0xbe311c083a225cd2,
1749         0xab70fe17c79ac6ca,0x6dbd630a48aaf406,
1750         0xd64d3d9db981787d,0x92cbbccdad5b108,
1751         0x85f0468293f0eb4e,0x25bbf56008c58ea5,
1752         0xa76c582338ed2621,0xaf2af2b80af6f24e,
1753         0xd1476e2c07286faa,0x1af5af660db4aee1,
1754         0x82cca4db847945ca,0x50d98d9fc890ed4d,
1755         0xa37fce126597973c,0xe50ff107bab528a0,
1756         0xcc5fc196fefd7d0c,0x1e53ed49a96272c8,
1757         0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a,
1758         0x9faacf3df73609b1,0x77b191618c54e9ac,
1759         0xc795830d75038c1d,0xd59df5b9ef6a2417,
1760         0xf97ae3d0d2446f25,0x4b0573286b44ad1d,
1761         0x9becce62836ac577,0x4ee367f9430aec32,
1762         0xc2e801fb244576d5,0x229c41f793cda73f,
1763         0xf3a20279ed56d48a,0x6b43527578c1110f,
1764         0x9845418c345644d6,0x830a13896b78aaa9,
1765         0xbe5691ef416bd60c,0x23cc986bc656d553,
1766         0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8,
1767         0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9,
1768         0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53,
1769         0xe858ad248f5c22c9,0xd1b3400f8f9cff68,
1770         0x91376c36d99995be,0x23100809b9c21fa1,
1771         0xb58547448ffffb2d,0xabd40a0c2832a78a,
1772         0xe2e69915b3fff9f9,0x16c90c8f323f516c,
1773         0x8dd01fad907ffc3b,0xae3da7d97f6792e3,
1774         0xb1442798f49ffb4a,0x99cd11cfdf41779c,
1775         0xdd95317f31c7fa1d,0x40405643d711d583,
1776         0x8a7d3eef7f1cfc52,0x482835ea666b2572,
1777         0xad1c8eab5ee43b66,0xda3243650005eecf,
1778         0xd863b256369d4a40,0x90bed43e40076a82,
1779         0x873e4f75e2224e68,0x5a7744a6e804a291,
1780         0xa90de3535aaae202,0x711515d0a205cb36,
1781         0xd3515c2831559a83,0xd5a5b44ca873e03,
1782         0x8412d9991ed58091,0xe858790afe9486c2,
1783         0xa5178fff668ae0b6,0x626e974dbe39a872,
1784         0xce5d73ff402d98e3,0xfb0a3d212dc8128f,
1785         0x80fa687f881c7f8e,0x7ce66634bc9d0b99,
1786         0xa139029f6a239f72,0x1c1fffc1ebc44e80,
1787         0xc987434744ac874e,0xa327ffb266b56220,
1788         0xfbe9141915d7a922,0x4bf1ff9f0062baa8,
1789         0x9d71ac8fada6c9b5,0x6f773fc3603db4a9,
1790         0xc4ce17b399107c22,0xcb550fb4384d21d3,
1791         0xf6019da07f549b2b,0x7e2a53a146606a48,
1792         0x99c102844f94e0fb,0x2eda7444cbfc426d,
1793         0xc0314325637a1939,0xfa911155fefb5308,
1794         0xf03d93eebc589f88,0x793555ab7eba27ca,
1795         0x96267c7535b763b5,0x4bc1558b2f3458de,
1796         0xbbb01b9283253ca2,0x9eb1aaedfb016f16,
1797         0xea9c227723ee8bcb,0x465e15a979c1cadc,
1798         0x92a1958a7675175f,0xbfacd89ec191ec9,
1799         0xb749faed14125d36,0xcef980ec671f667b,
1800         0xe51c79a85916f484,0x82b7e12780e7401a,
1801         0x8f31cc0937ae58d2,0xd1b2ecb8b0908810,
1802         0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15,
1803         0xdfbdcece67006ac9,0x67a791e093e1d49a,
1804         0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0,
1805         0xaecc49914078536d,0x58fae9f773886e18,
1806         0xda7f5bf590966848,0xaf39a475506a899e,
1807         0x888f99797a5e012d,0x6d8406c952429603,
1808         0xaab37fd7d8f58178,0xc8e5087ba6d33b83,
1809         0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64,
1810         0x855c3be0a17fcd26,0x5cf2eea09a55067f,
1811         0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e,
1812         0xd0601d8efc57b08b,0xf13b94daf124da26,
1813         0x823c12795db6ce57,0x76c53d08d6b70858,
1814         0xa2cb1717b52481ed,0x54768c4b0c64ca6e,
1815         0xcb7ddcdda26da268,0xa9942f5dcf7dfd09,
1816         0xfe5d54150b090b02,0xd3f93b35435d7c4c,
1817         0x9efa548d26e5a6e1,0xc47bc5014a1a6daf,
1818         0xc6b8e9b0709f109a,0x359ab6419ca1091b,
1819         0xf867241c8cc6d4c0,0xc30163d203c94b62,
1820         0x9b407691d7fc44f8,0x79e0de63425dcf1d,
1821         0xc21094364dfb5636,0x985915fc12f542e4,
1822         0xf294b943e17a2bc4,0x3e6f5b7b17b2939d,
1823         0x979cf3ca6cec5b5a,0xa705992ceecf9c42,
1824         0xbd8430bd08277231,0x50c6ff782a838353,
1825         0xece53cec4a314ebd,0xa4f8bf5635246428,
1826         0x940f4613ae5ed136,0x871b7795e136be99,
1827         0xb913179899f68584,0x28e2557b59846e3f,
1828         0xe757dd7ec07426e5,0x331aeada2fe589cf,
1829         0x9096ea6f3848984f,0x3ff0d2c85def7621,
1830         0xb4bca50b065abe63,0xfed077a756b53a9,
1831         0xe1ebce4dc7f16dfb,0xd3e8495912c62894,
1832         0x8d3360f09cf6e4bd,0x64712dd7abbbd95c,
1833         0xb080392cc4349dec,0xbd8d794d96aacfb3,
1834         0xdca04777f541c567,0xecf0d7a0fc5583a0,
1835         0x89e42caaf9491b60,0xf41686c49db57244,
1836         0xac5d37d5b79b6239,0x311c2875c522ced5,
1837         0xd77485cb25823ac7,0x7d633293366b828b,
1838         0x86a8d39ef77164bc,0xae5dff9c02033197,
1839         0xa8530886b54dbdeb,0xd9f57f830283fdfc,
1840         0xd267caa862a12d66,0xd072df63c324fd7b,
1841         0x8380dea93da4bc60,0x4247cb9e59f71e6d,
1842         0xa46116538d0deb78,0x52d9be85f074e608,
1843         0xcd795be870516656,0x67902e276c921f8b,
1844         0x806bd9714632dff6,0xba1cd8a3db53b6,
1845         0xa086cfcd97bf97f3,0x80e8a40eccd228a4,
1846         0xc8a883c0fdaf7df0,0x6122cd128006b2cd,
1847         0xfad2a4b13d1b5d6c,0x796b805720085f81,
1848         0x9cc3a6eec6311a63,0xcbe3303674053bb0,
1849         0xc3f490aa77bd60fc,0xbedbfc4411068a9c,
1850         0xf4f1b4d515acb93b,0xee92fb5515482d44,
1851         0x991711052d8bf3c5,0x751bdd152d4d1c4a,
1852         0xbf5cd54678eef0b6,0xd262d45a78a0635d,
1853         0xef340a98172aace4,0x86fb897116c87c34,
1854         0x9580869f0e7aac0e,0xd45d35e6ae3d4da0,
1855         0xbae0a846d2195712,0x8974836059cca109,
1856         0xe998d258869facd7,0x2bd1a438703fc94b,
1857         0x91ff83775423cc06,0x7b6306a34627ddcf,
1858         0xb67f6455292cbf08,0x1a3bc84c17b1d542,
1859         0xe41f3d6a7377eeca,0x20caba5f1d9e4a93,
1860         0x8e938662882af53e,0x547eb47b7282ee9c,
1861         0xb23867fb2a35b28d,0xe99e619a4f23aa43,
1862         0xdec681f9f4c31f31,0x6405fa00e2ec94d4,
1863         0x8b3c113c38f9f37e,0xde83bc408dd3dd04,
1864         0xae0b158b4738705e,0x9624ab50b148d445,
1865         0xd98ddaee19068c76,0x3badd624dd9b0957,
1866         0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6,
1867         0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c,
1868         0xd47487cc8470652b,0x7647c3200069671f,
1869         0x84c8d4dfd2c63f3b,0x29ecd9f40041e073,
1870         0xa5fb0a17c777cf09,0xf468107100525890,
1871         0xcf79cc9db955c2cc,0x7182148d4066eeb4,
1872         0x81ac1fe293d599bf,0xc6f14cd848405530,
1873         0xa21727db38cb002f,0xb8ada00e5a506a7c,
1874         0xca9cf1d206fdc03b,0xa6d90811f0e4851c,
1875         0xfd442e4688bd304a,0x908f4a166d1da663,
1876         0x9e4a9cec15763e2e,0x9a598e4e043287fe,
1877         0xc5dd44271ad3cdba,0x40eff1e1853f29fd,
1878         0xf7549530e188c128,0xd12bee59e68ef47c,
1879         0x9a94dd3e8cf578b9,0x82bb74f8301958ce,
1880         0xc13a148e3032d6e7,0xe36a52363c1faf01,
1881         0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1,
1882         0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9,
1883         0xbcb2b812db11a5de,0x7415d448f6b6f0e7,
1884         0xebdf661791d60f56,0x111b495b3464ad21,
1885         0x936b9fcebb25c995,0xcab10dd900beec34,
1886         0xb84687c269ef3bfb,0x3d5d514f40eea742,
1887         0xe65829b3046b0afa,0xcb4a5a3112a5112,
1888         0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab,
1889         0xb3f4e093db73a093,0x59ed216765690f56,
1890         0xe0f218b8d25088b8,0x306869c13ec3532c,
1891         0x8c974f7383725573,0x1e414218c73a13fb,
1892         0xafbd2350644eeacf,0xe5d1929ef90898fa,
1893         0xdbac6c247d62a583,0xdf45f746b74abf39,
1894         0x894bc396ce5da772,0x6b8bba8c328eb783,
1895         0xab9eb47c81f5114f,0x66ea92f3f326564,
1896         0xd686619ba27255a2,0xc80a537b0efefebd,
1897         0x8613fd0145877585,0xbd06742ce95f5f36,
1898         0xa798fc4196e952e7,0x2c48113823b73704,
1899         0xd17f3b51fca3a7a0,0xf75a15862ca504c5,
1900         0x82ef85133de648c4,0x9a984d73dbe722fb,
1901         0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba,
1902         0xcc963fee10b7d1b3,0x318df905079926a8,
1903         0xffbbcfe994e5c61f,0xfdf17746497f7052,
1904         0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633,
1905         0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0,
1906         0xf9bd690a1b68637b,0x3dfdce7aa3c673b0,
1907         0x9c1661a651213e2d,0x6bea10ca65c084e,
1908         0xc31bfa0fe5698db8,0x486e494fcff30a62,
1909         0xf3e2f893dec3f126,0x5a89dba3c3efccfa,
1910         0x986ddb5c6b3a76b7,0xf89629465a75e01c,
1911         0xbe89523386091465,0xf6bbb397f1135823,
1912         0xee2ba6c0678b597f,0x746aa07ded582e2c,
1913         0x94db483840b717ef,0xa8c2a44eb4571cdc,
1914         0xba121a4650e4ddeb,0x92f34d62616ce413,
1915         0xe896a0d7e51e1566,0x77b020baf9c81d17,
1916         0x915e2486ef32cd60,0xace1474dc1d122e,
1917         0xb5b5ada8aaff80b8,0xd819992132456ba,
1918         0xe3231912d5bf60e6,0x10e1fff697ed6c69,
1919         0x8df5efabc5979c8f,0xca8d3ffa1ef463c1,
1920         0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2,
1921         0xddd0467c64bce4a0,0xac7cb3f6d05ddbde,
1922         0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b,
1923         0xad4ab7112eb3929d,0x86c16c98d2c953c6,
1924         0xd89d64d57a607744,0xe871c7bf077ba8b7,
1925         0x87625f056c7c4a8b,0x11471cd764ad4972,
1926         0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf,
1927         0xd389b47879823479,0x4aff1d108d4ec2c3,
1928         0x843610cb4bf160cb,0xcedf722a585139ba,
1929         0xa54394fe1eedb8fe,0xc2974eb4ee658828,
1930         0xce947a3da6a9273e,0x733d226229feea32,
1931         0x811ccc668829b887,0x806357d5a3f525f,
1932         0xa163ff802a3426a8,0xca07c2dcb0cf26f7,
1933         0xc9bcff6034c13052,0xfc89b393dd02f0b5,
1934         0xfc2c3f3841f17c67,0xbbac2078d443ace2,
1935         0x9d9ba7832936edc0,0xd54b944b84aa4c0d,
1936         0xc5029163f384a931,0xa9e795e65d4df11,
1937         0xf64335bcf065d37d,0x4d4617b5ff4a16d5,
1938         0x99ea0196163fa42e,0x504bced1bf8e4e45,
1939         0xc06481fb9bcf8d39,0xe45ec2862f71e1d6,
1940         0xf07da27a82c37088,0x5d767327bb4e5a4c,
1941         0x964e858c91ba2655,0x3a6a07f8d510f86f,
1942         0xbbe226efb628afea,0x890489f70a55368b,
1943         0xeadab0aba3b2dbe5,0x2b45ac74ccea842e,
1944         0x92c8ae6b464fc96f,0x3b0b8bc90012929d,
1945         0xb77ada0617e3bbcb,0x9ce6ebb40173744,
1946         0xe55990879ddcaabd,0xcc420a6a101d0515,
1947         0x8f57fa54c2a9eab6,0x9fa946824a12232d,
1948         0xb32df8e9f3546564,0x47939822dc96abf9,
1949         0xdff9772470297ebd,0x59787e2b93bc56f7,
1950         0x8bfbea76c619ef36,0x57eb4edb3c55b65a,
1951         0xaefae51477a06b03,0xede622920b6b23f1,
1952         0xdab99e59958885c4,0xe95fab368e45eced,
1953         0x88b402f7fd75539b,0x11dbcb0218ebb414,
1954         0xaae103b5fcd2a881,0xd652bdc29f26a119,
1955         0xd59944a37c0752a2,0x4be76d3346f0495f,
1956         0x857fcae62d8493a5,0x6f70a4400c562ddb,
1957         0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952,
1958         0xd097ad07a71f26b2,0x7e2000a41346a7a7,
1959         0x825ecc24c873782f,0x8ed400668c0c28c8,
1960         0xa2f67f2dfa90563b,0x728900802f0f32fa,
1961         0xcbb41ef979346bca,0x4f2b40a03ad2ffb9,
1962         0xfea126b7d78186bc,0xe2f610c84987bfa8,
1963         0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9,
1964         0xc6ede63fa05d3143,0x91503d1c79720dbb,
1965         0xf8a95fcf88747d94,0x75a44c6397ce912a,
1966         0x9b69dbe1b548ce7c,0xc986afbe3ee11aba,
1967         0xc24452da229b021b,0xfbe85badce996168,
1968         0xf2d56790ab41c2a2,0xfae27299423fb9c3,
1969         0x97c560ba6b0919a5,0xdccd879fc967d41a,
1970         0xbdb6b8e905cb600f,0x5400e987bbc1c920,
1971         0xed246723473e3813,0x290123e9aab23b68,
1972         0x9436c0760c86e30b,0xf9a0b6720aaf6521,
1973         0xb94470938fa89bce,0xf808e40e8d5b3e69,
1974         0xe7958cb87392c2c2,0xb60b1d1230b20e04,
1975         0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2,
1976         0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3,
1977         0xe2280b6c20dd5232,0x25c6da63c38de1b0,
1978         0x8d590723948a535f,0x579c487e5a38ad0e,
1979         0xb0af48ec79ace837,0x2d835a9df0c6d851,
1980         0xdcdb1b2798182244,0xf8e431456cf88e65,
1981         0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff,
1982         0xac8b2d36eed2dac5,0xe272467e3d222f3f,
1983         0xd7adf884aa879177,0x5b0ed81dcc6abb0f,
1984         0x86ccbb52ea94baea,0x98e947129fc2b4e9,
1985         0xa87fea27a539e9a5,0x3f2398d747b36224,
1986         0xd29fe4b18e88640e,0x8eec7f0d19a03aad,
1987         0x83a3eeeef9153e89,0x1953cf68300424ac,
1988         0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7,
1989         0xcdb02555653131b6,0x3792f412cb06794d,
1990         0x808e17555f3ebf11,0xe2bbd88bbee40bd0,
1991         0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4,
1992         0xc8de047564d20a8b,0xf245825a5a445275,
1993         0xfb158592be068d2e,0xeed6e2f0f0d56712,
1994         0x9ced737bb6c4183d,0x55464dd69685606b,
1995         0xc428d05aa4751e4c,0xaa97e14c3c26b886,
1996         0xf53304714d9265df,0xd53dd99f4b3066a8,
1997         0x993fe2c6d07b7fab,0xe546a8038efe4029,
1998         0xbf8fdb78849a5f96,0xde98520472bdd033,
1999         0xef73d256a5c0f77c,0x963e66858f6d4440,
2000         0x95a8637627989aad,0xdde7001379a44aa8,
2001         0xbb127c53b17ec159,0x5560c018580d5d52,
2002         0xe9d71b689dde71af,0xaab8f01e6e10b4a6,
2003         0x9226712162ab070d,0xcab3961304ca70e8,
2004         0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22,
2005         0xe45c10c42a2b3b05,0x8cb89a7db77c506a,
2006         0x8eb98a7a9a5b04e3,0x77f3608e92adb242,
2007         0xb267ed1940f1c61c,0x55f038b237591ed3,
2008         0xdf01e85f912e37a3,0x6b6c46dec52f6688,
2009         0x8b61313bbabce2c6,0x2323ac4b3b3da015,
2010         0xae397d8aa96c1b77,0xabec975e0a0d081a,
2011         0xd9c7dced53c72255,0x96e7bd358c904a21,
2012         0x881cea14545c7575,0x7e50d64177da2e54,
2013         0xaa242499697392d2,0xdde50bd1d5d0b9e9,
2014         0xd4ad2dbfc3d07787,0x955e4ec64b44e864,
2015         0x84ec3c97da624ab4,0xbd5af13bef0b113e,
2016         0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e,
2017         0xcfb11ead453994ba,0x67de18eda5814af2,
2018         0x81ceb32c4b43fcf4,0x80eacf948770ced7,
2019         0xa2425ff75e14fc31,0xa1258379a94d028d,
2020         0xcad2f7f5359a3b3e,0x96ee45813a04330,
2021         0xfd87b5f28300ca0d,0x8bca9d6e188853fc,
2022         0x9e74d1b791e07e48,0x775ea264cf55347e,
2023         0xc612062576589dda,0x95364afe032a81a0,
2024         0xf79687aed3eec551,0x3a83ddbd83f52210,
2025         0x9abe14cd44753b52,0xc4926a9672793580,
2026         0xc16d9a0095928a27,0x75b7053c0f178400,
2027         0xf1c90080baf72cb1,0x5324c68b12dd6800,
2028         0x971da05074da7bee,0xd3f6fc16ebca8000,
2029         0xbce5086492111aea,0x88f4bb1ca6bd0000,
2030         0xec1e4a7db69561a5,0x2b31e9e3d0700000,
2031         0x9392ee8e921d5d07,0x3aff322e62600000,
2032         0xb877aa3236a4b449,0x9befeb9fad487c3,
2033         0xe69594bec44de15b,0x4c2ebe687989a9b4,
2034         0x901d7cf73ab0acd9,0xf9d37014bf60a11,
2035         0xb424dc35095cd80f,0x538484c19ef38c95,
2036         0xe12e13424bb40e13,0x2865a5f206b06fba,
2037         0x8cbccc096f5088cb,0xf93f87b7442e45d4,
2038         0xafebff0bcb24aafe,0xf78f69a51539d749,
2039         0xdbe6fecebdedd5be,0xb573440e5a884d1c,
2040         0x89705f4136b4a597,0x31680a88f8953031,
2041         0xabcc77118461cefc,0xfdc20d2b36ba7c3e,
2042         0xd6bf94d5e57a42bc,0x3d32907604691b4d,
2043         0x8637bd05af6c69b5,0xa63f9a49c2c1b110,
2044         0xa7c5ac471b478423,0xfcf80dc33721d54,
2045         0xd1b71758e219652b,0xd3c36113404ea4a9,
2046         0x83126e978d4fdf3b,0x645a1cac083126ea,
2047         0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4,
2048         0xcccccccccccccccc,0xcccccccccccccccd,
2049         0x8000000000000000,0x0,
2050         0xa000000000000000,0x0,
2051         0xc800000000000000,0x0,
2052         0xfa00000000000000,0x0,
2053         0x9c40000000000000,0x0,
2054         0xc350000000000000,0x0,
2055         0xf424000000000000,0x0,
2056         0x9896800000000000,0x0,
2057         0xbebc200000000000,0x0,
2058         0xee6b280000000000,0x0,
2059         0x9502f90000000000,0x0,
2060         0xba43b74000000000,0x0,
2061         0xe8d4a51000000000,0x0,
2062         0x9184e72a00000000,0x0,
2063         0xb5e620f480000000,0x0,
2064         0xe35fa931a0000000,0x0,
2065         0x8e1bc9bf04000000,0x0,
2066         0xb1a2bc2ec5000000,0x0,
2067         0xde0b6b3a76400000,0x0,
2068         0x8ac7230489e80000,0x0,
2069         0xad78ebc5ac620000,0x0,
2070         0xd8d726b7177a8000,0x0,
2071         0x878678326eac9000,0x0,
2072         0xa968163f0a57b400,0x0,
2073         0xd3c21bcecceda100,0x0,
2074         0x84595161401484a0,0x0,
2075         0xa56fa5b99019a5c8,0x0,
2076         0xcecb8f27f4200f3a,0x0,
2077         0x813f3978f8940984,0x4000000000000000,
2078         0xa18f07d736b90be5,0x5000000000000000,
2079         0xc9f2c9cd04674ede,0xa400000000000000,
2080         0xfc6f7c4045812296,0x4d00000000000000,
2081         0x9dc5ada82b70b59d,0xf020000000000000,
2082         0xc5371912364ce305,0x6c28000000000000,
2083         0xf684df56c3e01bc6,0xc732000000000000,
2084         0x9a130b963a6c115c,0x3c7f400000000000,
2085         0xc097ce7bc90715b3,0x4b9f100000000000,
2086         0xf0bdc21abb48db20,0x1e86d40000000000,
2087         0x96769950b50d88f4,0x1314448000000000,
2088         0xbc143fa4e250eb31,0x17d955a000000000,
2089         0xeb194f8e1ae525fd,0x5dcfab0800000000,
2090         0x92efd1b8d0cf37be,0x5aa1cae500000000,
2091         0xb7abc627050305ad,0xf14a3d9e40000000,
2092         0xe596b7b0c643c719,0x6d9ccd05d0000000,
2093         0x8f7e32ce7bea5c6f,0xe4820023a2000000,
2094         0xb35dbf821ae4f38b,0xdda2802c8a800000,
2095         0xe0352f62a19e306e,0xd50b2037ad200000,
2096         0x8c213d9da502de45,0x4526f422cc340000,
2097         0xaf298d050e4395d6,0x9670b12b7f410000,
2098         0xdaf3f04651d47b4c,0x3c0cdd765f114000,
2099         0x88d8762bf324cd0f,0xa5880a69fb6ac800,
2100         0xab0e93b6efee0053,0x8eea0d047a457a00,
2101         0xd5d238a4abe98068,0x72a4904598d6d880,
2102         0x85a36366eb71f041,0x47a6da2b7f864750,
2103         0xa70c3c40a64e6c51,0x999090b65f67d924,
2104         0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d,
2105         0x82818f1281ed449f,0xbff8f10e7a8921a4,
2106         0xa321f2d7226895c7,0xaff72d52192b6a0d,
2107         0xcbea6f8ceb02bb39,0x9bf4f8a69f764490,
2108         0xfee50b7025c36a08,0x2f236d04753d5b4,
2109         0x9f4f2726179a2245,0x1d762422c946590,
2110         0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5,
2111         0xf8ebad2b84e0d58b,0xd2e0898765a7deb2,
2112         0x9b934c3b330c8577,0x63cc55f49f88eb2f,
2113         0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb,
2114         0xf316271c7fc3908a,0x8bef464e3945ef7a,
2115         0x97edd871cfda3a56,0x97758bf0e3cbb5ac,
2116         0xbde94e8e43d0c8ec,0x3d52eeed1cbea317,
2117         0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd,
2118         0x945e455f24fb1cf8,0x8fe8caa93e74ef6a,
2119         0xb975d6b6ee39e436,0xb3e2fd538e122b44,
2120         0xe7d34c64a9c85d44,0x60dbbca87196b616,
2121         0x90e40fbeea1d3a4a,0xbc8955e946fe31cd,
2122         0xb51d13aea4a488dd,0x6babab6398bdbe41,
2123         0xe264589a4dcdab14,0xc696963c7eed2dd1,
2124         0x8d7eb76070a08aec,0xfc1e1de5cf543ca2,
2125         0xb0de65388cc8ada8,0x3b25a55f43294bcb,
2126         0xdd15fe86affad912,0x49ef0eb713f39ebe,
2127         0x8a2dbf142dfcc7ab,0x6e3569326c784337,
2128         0xacb92ed9397bf996,0x49c2c37f07965404,
2129         0xd7e77a8f87daf7fb,0xdc33745ec97be906,
2130         0x86f0ac99b4e8dafd,0x69a028bb3ded71a3,
2131         0xa8acd7c0222311bc,0xc40832ea0d68ce0c,
2132         0xd2d80db02aabd62b,0xf50a3fa490c30190,
2133         0x83c7088e1aab65db,0x792667c6da79e0fa,
2134         0xa4b8cab1a1563f52,0x577001b891185938,
2135         0xcde6fd5e09abcf26,0xed4c0226b55e6f86,
2136         0x80b05e5ac60b6178,0x544f8158315b05b4,
2137         0xa0dc75f1778e39d6,0x696361ae3db1c721,
2138         0xc913936dd571c84c,0x3bc3a19cd1e38e9,
2139         0xfb5878494ace3a5f,0x4ab48a04065c723,
2140         0x9d174b2dcec0e47b,0x62eb0d64283f9c76,
2141         0xc45d1df942711d9a,0x3ba5d0bd324f8394,
2142         0xf5746577930d6500,0xca8f44ec7ee36479,
2143         0x9968bf6abbe85f20,0x7e998b13cf4e1ecb,
2144         0xbfc2ef456ae276e8,0x9e3fedd8c321a67e,
2145         0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e,
2146         0x95d04aee3b80ece5,0xbba1f1d158724a12,
2147         0xbb445da9ca61281f,0x2a8a6e45ae8edc97,
2148         0xea1575143cf97226,0xf52d09d71a3293bd,
2149         0x924d692ca61be758,0x593c2626705f9c56,
2150         0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c,
2151         0xe498f455c38b997a,0xb6dfb9c0f956447,
2152         0x8edf98b59a373fec,0x4724bd4189bd5eac,
2153         0xb2977ee300c50fe7,0x58edec91ec2cb657,
2154         0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed,
2155         0x8b865b215899f46c,0xbd79e0d20082ee74,
2156         0xae67f1e9aec07187,0xecd8590680a3aa11,
2157         0xda01ee641a708de9,0xe80e6f4820cc9495,
2158         0x884134fe908658b2,0x3109058d147fdcdd,
2159         0xaa51823e34a7eede,0xbd4b46f0599fd415,
2160         0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a,
2161         0x850fadc09923329e,0x3e2cf6bc604ddb0,
2162         0xa6539930bf6bff45,0x84db8346b786151c,
2163         0xcfe87f7cef46ff16,0xe612641865679a63,
2164         0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e,
2165         0xa26da3999aef7749,0xe3be5e330f38f09d,
2166         0xcb090c8001ab551c,0x5cadf5bfd3072cc5,
2167         0xfdcb4fa002162a63,0x73d9732fc7c8f7f6,
2168         0x9e9f11c4014dda7e,0x2867e7fddcdd9afa,
2169         0xc646d63501a1511d,0xb281e1fd541501b8,
2170         0xf7d88bc24209a565,0x1f225a7ca91a4226,
2171         0x9ae757596946075f,0x3375788de9b06958,
2172         0xc1a12d2fc3978937,0x52d6b1641c83ae,
2173         0xf209787bb47d6b84,0xc0678c5dbd23a49a,
2174         0x9745eb4d50ce6332,0xf840b7ba963646e0,
2175         0xbd176620a501fbff,0xb650e5a93bc3d898,
2176         0xec5d3fa8ce427aff,0xa3e51f138ab4cebe,
2177         0x93ba47c980e98cdf,0xc66f336c36b10137,
2178         0xb8a8d9bbe123f017,0xb80b0047445d4184,
2179         0xe6d3102ad96cec1d,0xa60dc059157491e5,
2180         0x9043ea1ac7e41392,0x87c89837ad68db2f,
2181         0xb454e4a179dd1877,0x29babe4598c311fb,
2182         0xe16a1dc9d8545e94,0xf4296dd6fef3d67a,
2183         0x8ce2529e2734bb1d,0x1899e4a65f58660c,
2184         0xb01ae745b101e9e4,0x5ec05dcff72e7f8f,
2185         0xdc21a1171d42645d,0x76707543f4fa1f73,
2186         0x899504ae72497eba,0x6a06494a791c53a8,
2187         0xabfa45da0edbde69,0x487db9d17636892,
2188         0xd6f8d7509292d603,0x45a9d2845d3c42b6,
2189         0x865b86925b9bc5c2,0xb8a2392ba45a9b2,
2190         0xa7f26836f282b732,0x8e6cac7768d7141e,
2191         0xd1ef0244af2364ff,0x3207d795430cd926,
2192         0x8335616aed761f1f,0x7f44e6bd49e807b8,
2193         0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6,
2194         0xcd036837130890a1,0x36dba887c37a8c0f,
2195         0x802221226be55a64,0xc2494954da2c9789,
2196         0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c,
2197         0xc83553c5c8965d3d,0x6f92829494e5acc7,
2198         0xfa42a8b73abbf48c,0xcb772339ba1f17f9,
2199         0x9c69a97284b578d7,0xff2a760414536efb,
2200         0xc38413cf25e2d70d,0xfef5138519684aba,
2201         0xf46518c2ef5b8cd1,0x7eb258665fc25d69,
2202         0x98bf2f79d5993802,0xef2f773ffbd97a61,
2203         0xbeeefb584aff8603,0xaafb550ffacfd8fa,
2204         0xeeaaba2e5dbf6784,0x95ba2a53f983cf38,
2205         0x952ab45cfa97a0b2,0xdd945a747bf26183,
2206         0xba756174393d88df,0x94f971119aeef9e4,
2207         0xe912b9d1478ceb17,0x7a37cd5601aab85d,
2208         0x91abb422ccb812ee,0xac62e055c10ab33a,
2209         0xb616a12b7fe617aa,0x577b986b314d6009,
2210         0xe39c49765fdf9d94,0xed5a7e85fda0b80b,
2211         0x8e41ade9fbebc27d,0x14588f13be847307,
2212         0xb1d219647ae6b31c,0x596eb2d8ae258fc8,
2213         0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb,
2214         0x8aec23d680043bee,0x25de7bb9480d5854,
2215         0xada72ccc20054ae9,0xaf561aa79a10ae6a,
2216         0xd910f7ff28069da4,0x1b2ba1518094da04,
2217         0x87aa9aff79042286,0x90fb44d2f05d0842,
2218         0xa99541bf57452b28,0x353a1607ac744a53,
2219         0xd3fa922f2d1675f2,0x42889b8997915ce8,
2220         0x847c9b5d7c2e09b7,0x69956135febada11,
2221         0xa59bc234db398c25,0x43fab9837e699095,
2222         0xcf02b2c21207ef2e,0x94f967e45e03f4bb,
2223         0x8161afb94b44f57d,0x1d1be0eebac278f5,
2224         0xa1ba1ba79e1632dc,0x6462d92a69731732,
2225         0xca28a291859bbf93,0x7d7b8f7503cfdcfe,
2226         0xfcb2cb35e702af78,0x5cda735244c3d43e,
2227         0x9defbf01b061adab,0x3a0888136afa64a7,
2228         0xc56baec21c7a1916,0x88aaa1845b8fdd0,
2229         0xf6c69a72a3989f5b,0x8aad549e57273d45,
2230         0x9a3c2087a63f6399,0x36ac54e2f678864b,
2231         0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd,
2232         0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5,
2233         0x969eb7c47859e743,0x9f644ae5a4b1b325,
2234         0xbc4665b596706114,0x873d5d9f0dde1fee,
2235         0xeb57ff22fc0c7959,0xa90cb506d155a7ea,
2236         0x9316ff75dd87cbd8,0x9a7f12442d588f2,
2237         0xb7dcbf5354e9bece,0xc11ed6d538aeb2f,
2238         0xe5d3ef282a242e81,0x8f1668c8a86da5fa,
2239         0x8fa475791a569d10,0xf96e017d694487bc,
2240         0xb38d92d760ec4455,0x37c981dcc395a9ac,
2241         0xe070f78d3927556a,0x85bbe253f47b1417,
2242         0x8c469ab843b89562,0x93956d7478ccec8e,
2243         0xaf58416654a6babb,0x387ac8d1970027b2,
2244         0xdb2e51bfe9d0696a,0x6997b05fcc0319e,
2245         0x88fcf317f22241e2,0x441fece3bdf81f03,
2246         0xab3c2fddeeaad25a,0xd527e81cad7626c3,
2247         0xd60b3bd56a5586f1,0x8a71e223d8d3b074,
2248         0x85c7056562757456,0xf6872d5667844e49,
2249         0xa738c6bebb12d16c,0xb428f8ac016561db,
2250         0xd106f86e69d785c7,0xe13336d701beba52,
2251         0x82a45b450226b39c,0xecc0024661173473,
2252         0xa34d721642b06084,0x27f002d7f95d0190,
2253         0xcc20ce9bd35c78a5,0x31ec038df7b441f4,
2254         0xff290242c83396ce,0x7e67047175a15271,
2255         0x9f79a169bd203e41,0xf0062c6e984d386,
2256         0xc75809c42c684dd1,0x52c07b78a3e60868,
2257         0xf92e0c3537826145,0xa7709a56ccdf8a82,
2258         0x9bbcc7a142b17ccb,0x88a66076400bb691,
2259         0xc2abf989935ddbfe,0x6acff893d00ea435,
2260         0xf356f7ebf83552fe,0x583f6b8c4124d43,
2261         0x98165af37b2153de,0xc3727a337a8b704a,
2262         0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c,
2263         0xeda2ee1c7064130c,0x1162def06f79df73,
2264         0x9485d4d1c63e8be7,0x8addcb5645ac2ba8,
2265         0xb9a74a0637ce2ee1,0x6d953e2bd7173692,
2266         0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437,
2267         0x910ab1d4db9914a0,0x1d9c9892400a22a2,
2268         0xb54d5e4a127f59c8,0x2503beb6d00cab4b,
2269         0xe2a0b5dc971f303a,0x2e44ae64840fd61d,
2270         0x8da471a9de737e24,0x5ceaecfed289e5d2,
2271         0xb10d8e1456105dad,0x7425a83e872c5f47,
2272         0xdd50f1996b947518,0xd12f124e28f77719,
2273         0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f,
2274         0xace73cbfdc0bfb7b,0x636cc64d1001550b,
2275         0xd8210befd30efa5a,0x3c47f7e05401aa4e,
2276         0x8714a775e3e95c78,0x65acfaec34810a71,
2277         0xa8d9d1535ce3b396,0x7f1839a741a14d0d,
2278         0xd31045a8341ca07c,0x1ede48111209a050,
2279         0x83ea2b892091e44d,0x934aed0aab460432,
2280         0xa4e4b66b68b65d60,0xf81da84d5617853f,
2281         0xce1de40642e3f4b9,0x36251260ab9d668e,
2282         0x80d2ae83e9ce78f3,0xc1d72b7c6b426019,
2283         0xa1075a24e4421730,0xb24cf65b8612f81f,
2284         0xc94930ae1d529cfc,0xdee033f26797b627,
2285         0xfb9b7cd9a4a7443c,0x169840ef017da3b1,
2286         0x9d412e0806e88aa5,0x8e1f289560ee864e,
2287         0xc491798a08a2ad4e,0xf1a6f2bab92a27e2,
2288         0xf5b5d7ec8acb58a2,0xae10af696774b1db,
2289         0x9991a6f3d6bf1765,0xacca6da1e0a8ef29,
2290         0xbff610b0cc6edd3f,0x17fd090a58d32af3,
2291         0xeff394dcff8a948e,0xddfc4b4cef07f5b0,
2292         0x95f83d0a1fb69cd9,0x4abdaf101564f98e,
2293         0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1,
2294         0xea53df5fd18d5513,0x84c86189216dc5ed,
2295         0x92746b9be2f8552c,0x32fd3cf5b4e49bb4,
2296         0xb7118682dbb66a77,0x3fbc8c33221dc2a1,
2297         0xe4d5e82392a40515,0xfabaf3feaa5334a,
2298         0x8f05b1163ba6832d,0x29cb4d87f2a7400e,
2299         0xb2c71d5bca9023f8,0x743e20e9ef511012,
2300         0xdf78e4b2bd342cf6,0x914da9246b255416,
2301         0x8bab8eefb6409c1a,0x1ad089b6c2f7548e,
2302         0xae9672aba3d0c320,0xa184ac2473b529b1,
2303         0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e,
2304         0x8865899617fb1871,0x7e2fa67c7a658892,
2305         0xaa7eebfb9df9de8d,0xddbb901b98feeab7,
2306         0xd51ea6fa85785631,0x552a74227f3ea565,
2307         0x8533285c936b35de,0xd53a88958f87275f,
2308         0xa67ff273b8460356,0x8a892abaf368f137,
2309         0xd01fef10a657842c,0x2d2b7569b0432d85,
2310         0x8213f56a67f6b29b,0x9c3b29620e29fc73,
2311         0xa298f2c501f45f42,0x8349f3ba91b47b8f,
2312         0xcb3f2f7642717713,0x241c70a936219a73,
2313         0xfe0efb53d30dd4d7,0xed238cd383aa0110,
2314         0x9ec95d1463e8a506,0xf4363804324a40aa,
2315         0xc67bb4597ce2ce48,0xb143c6053edcd0d5,
2316         0xf81aa16fdc1b81da,0xdd94b7868e94050a,
2317         0x9b10a4e5e9913128,0xca7cf2b4191c8326,
2318         0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0,
2319         0xf24a01a73cf2dccf,0xbc633b39673c8cec,
2320         0x976e41088617ca01,0xd5be0503e085d813,
2321         0xbd49d14aa79dbc82,0x4b2d8644d8a74e18,
2322         0xec9c459d51852ba2,0xddf8e7d60ed1219e,
2323         0x93e1ab8252f33b45,0xcabb90e5c942b503,
2324         0xb8da1662e7b00a17,0x3d6a751f3b936243,
2325         0xe7109bfba19c0c9d,0xcc512670a783ad4,
2326         0x906a617d450187e2,0x27fb2b80668b24c5,
2327         0xb484f9dc9641e9da,0xb1f9f660802dedf6,
2328         0xe1a63853bbd26451,0x5e7873f8a0396973,
2329         0x8d07e33455637eb2,0xdb0b487b6423e1e8,
2330         0xb049dc016abc5e5f,0x91ce1a9a3d2cda62,
2331         0xdc5c5301c56b75f7,0x7641a140cc7810fb,
2332         0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d,
2333         0xac2820d9623bf429,0x546345fa9fbdcd44,
2334         0xd732290fbacaf133,0xa97c177947ad4095,
2335         0x867f59a9d4bed6c0,0x49ed8eabcccc485d,
2336         0xa81f301449ee8c70,0x5c68f256bfff5a74,
2337         0xd226fc195c6a2f8c,0x73832eec6fff3111,
2338         0x83585d8fd9c25db7,0xc831fd53c5ff7eab,
2339         0xa42e74f3d032f525,0xba3e7ca8b77f5e55,
2340         0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb,
2341         0x80444b5e7aa7cf85,0x7980d163cf5b81b3,
2342         0xa0555e361951c366,0xd7e105bcc332621f,
2343         0xc86ab5c39fa63440,0x8dd9472bf3fefaa7,
2344         0xfa856334878fc150,0xb14f98f6f0feb951,
2345         0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3,
2346         0xc3b8358109e84f07,0xa862f80ec4700c8,
2347         0xf4a642e14c6262c8,0xcd27bb612758c0fa,
2348         0x98e7e9cccfbd7dbd,0x8038d51cb897789c,
2349         0xbf21e44003acdd2c,0xe0470a63e6bd56c3,
2350         0xeeea5d5004981478,0x1858ccfce06cac74,
2351         0x95527a5202df0ccb,0xf37801e0c43ebc8,
2352         0xbaa718e68396cffd,0xd30560258f54e6ba,
2353         0xe950df20247c83fd,0x47c6b82ef32a2069,
2354         0x91d28b7416cdd27e,0x4cdc331d57fa5441,
2355         0xb6472e511c81471d,0xe0133fe4adf8e952,
2356         0xe3d8f9e563a198e5,0x58180fddd97723a6,
2357         0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,};
2358 
2359 } // namespace internal
2360 } // namespace simdjson
2361 /* end file src/internal/numberparsing_tables.cpp */
2362 /* begin file src/internal/simdprune_tables.cpp */
2363 #if SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64
2364 
2365 #include <cstdint>
2366 
2367 namespace simdjson { // table modified and copied from
2368 namespace internal { // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable
// Lookup table indexed by a byte value: entry i holds twice the number of bits
// set in i (entry 0 is 0, entry 3 is 4, entry 255 is 16).
SIMDJSON_DLLIMPORTEXPORT  const unsigned char BitsSetTable256mul2[256] = {
    0,  2,  2,  4,  2,  4,  4,  6,  2,  4,  4,  6,  4,  6,  6,  8,  2,  4,  4,
    6,  4,  6,  6,  8,  4,  6,  6,  8,  6,  8,  8,  10, 2,  4,  4,  6,  4,  6,
    6,  8,  4,  6,  6,  8,  6,  8,  8,  10, 4,  6,  6,  8,  6,  8,  8,  10, 6,
    8,  8,  10, 8,  10, 10, 12, 2,  4,  4,  6,  4,  6,  6,  8,  4,  6,  6,  8,
    6,  8,  8,  10, 4,  6,  6,  8,  6,  8,  8,  10, 6,  8,  8,  10, 8,  10, 10,
    12, 4,  6,  6,  8,  6,  8,  8,  10, 6,  8,  8,  10, 8,  10, 10, 12, 6,  8,
    8,  10, 8,  10, 10, 12, 8,  10, 10, 12, 10, 12, 12, 14, 2,  4,  4,  6,  4,
    6,  6,  8,  4,  6,  6,  8,  6,  8,  8,  10, 4,  6,  6,  8,  6,  8,  8,  10,
    6,  8,  8,  10, 8,  10, 10, 12, 4,  6,  6,  8,  6,  8,  8,  10, 6,  8,  8,
    10, 8,  10, 10, 12, 6,  8,  8,  10, 8,  10, 10, 12, 8,  10, 10, 12, 10, 12,
    12, 14, 4,  6,  6,  8,  6,  8,  8,  10, 6,  8,  8,  10, 8,  10, 10, 12, 6,
    8,  8,  10, 8,  10, 10, 12, 8,  10, 10, 12, 10, 12, 12, 14, 6,  8,  8,  10,
    8,  10, 10, 12, 8,  10, 10, 12, 10, 12, 12, 14, 8,  10, 10, 12, 10, 12, 12,
    14, 10, 12, 12, 14, 12, 14, 14, 16};
2384 
// Byte-index patterns laid out as consecutive 16-byte rows. Row k (k = 0..8)
// lists indices 0..(7-k), then indices 8..15, then k bytes of 0xff padding.
// Only these 9 rows (144 bytes) are spelled out; the array is declared with
// 272 elements, so the remaining 128 bytes are zero-initialized.
// NOTE(review): the name suggests these rows are used as byte-shuffle control
// masks; the consumers are not visible in this part of the file - confirm.
SIMDJSON_DLLIMPORTEXPORT  const uint8_t pshufb_combine_table[272] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
    0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08,
    0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03,
    0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
    0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08,
    0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b,
    0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
2399 
// 256 * 8 bytes = 2kB, easily fits in cache.
// Indexed by an 8-bit mask. Each 64-bit entry packs, starting at its least
// significant byte, the positions (0..7) of the bits that are CLEAR in the
// mask, in ascending order; unused high bytes are zero. For example mask 0x00
// gives 0x0706050403020100 and mask 0x01 gives 0x0007060504030201.
SIMDJSON_DLLIMPORTEXPORT  const uint64_t thintable_epi8[256] = {
    0x0706050403020100, 0x0007060504030201, 0x0007060504030200,
    0x0000070605040302, 0x0007060504030100, 0x0000070605040301,
    0x0000070605040300, 0x0000000706050403, 0x0007060504020100,
    0x0000070605040201, 0x0000070605040200, 0x0000000706050402,
    0x0000070605040100, 0x0000000706050401, 0x0000000706050400,
    0x0000000007060504, 0x0007060503020100, 0x0000070605030201,
    0x0000070605030200, 0x0000000706050302, 0x0000070605030100,
    0x0000000706050301, 0x0000000706050300, 0x0000000007060503,
    0x0000070605020100, 0x0000000706050201, 0x0000000706050200,
    0x0000000007060502, 0x0000000706050100, 0x0000000007060501,
    0x0000000007060500, 0x0000000000070605, 0x0007060403020100,
    0x0000070604030201, 0x0000070604030200, 0x0000000706040302,
    0x0000070604030100, 0x0000000706040301, 0x0000000706040300,
    0x0000000007060403, 0x0000070604020100, 0x0000000706040201,
    0x0000000706040200, 0x0000000007060402, 0x0000000706040100,
    0x0000000007060401, 0x0000000007060400, 0x0000000000070604,
    0x0000070603020100, 0x0000000706030201, 0x0000000706030200,
    0x0000000007060302, 0x0000000706030100, 0x0000000007060301,
    0x0000000007060300, 0x0000000000070603, 0x0000000706020100,
    0x0000000007060201, 0x0000000007060200, 0x0000000000070602,
    0x0000000007060100, 0x0000000000070601, 0x0000000000070600,
    0x0000000000000706, 0x0007050403020100, 0x0000070504030201,
    0x0000070504030200, 0x0000000705040302, 0x0000070504030100,
    0x0000000705040301, 0x0000000705040300, 0x0000000007050403,
    0x0000070504020100, 0x0000000705040201, 0x0000000705040200,
    0x0000000007050402, 0x0000000705040100, 0x0000000007050401,
    0x0000000007050400, 0x0000000000070504, 0x0000070503020100,
    0x0000000705030201, 0x0000000705030200, 0x0000000007050302,
    0x0000000705030100, 0x0000000007050301, 0x0000000007050300,
    0x0000000000070503, 0x0000000705020100, 0x0000000007050201,
    0x0000000007050200, 0x0000000000070502, 0x0000000007050100,
    0x0000000000070501, 0x0000000000070500, 0x0000000000000705,
    0x0000070403020100, 0x0000000704030201, 0x0000000704030200,
    0x0000000007040302, 0x0000000704030100, 0x0000000007040301,
    0x0000000007040300, 0x0000000000070403, 0x0000000704020100,
    0x0000000007040201, 0x0000000007040200, 0x0000000000070402,
    0x0000000007040100, 0x0000000000070401, 0x0000000000070400,
    0x0000000000000704, 0x0000000703020100, 0x0000000007030201,
    0x0000000007030200, 0x0000000000070302, 0x0000000007030100,
    0x0000000000070301, 0x0000000000070300, 0x0000000000000703,
    0x0000000007020100, 0x0000000000070201, 0x0000000000070200,
    0x0000000000000702, 0x0000000000070100, 0x0000000000000701,
    0x0000000000000700, 0x0000000000000007, 0x0006050403020100,
    0x0000060504030201, 0x0000060504030200, 0x0000000605040302,
    0x0000060504030100, 0x0000000605040301, 0x0000000605040300,
    0x0000000006050403, 0x0000060504020100, 0x0000000605040201,
    0x0000000605040200, 0x0000000006050402, 0x0000000605040100,
    0x0000000006050401, 0x0000000006050400, 0x0000000000060504,
    0x0000060503020100, 0x0000000605030201, 0x0000000605030200,
    0x0000000006050302, 0x0000000605030100, 0x0000000006050301,
    0x0000000006050300, 0x0000000000060503, 0x0000000605020100,
    0x0000000006050201, 0x0000000006050200, 0x0000000000060502,
    0x0000000006050100, 0x0000000000060501, 0x0000000000060500,
    0x0000000000000605, 0x0000060403020100, 0x0000000604030201,
    0x0000000604030200, 0x0000000006040302, 0x0000000604030100,
    0x0000000006040301, 0x0000000006040300, 0x0000000000060403,
    0x0000000604020100, 0x0000000006040201, 0x0000000006040200,
    0x0000000000060402, 0x0000000006040100, 0x0000000000060401,
    0x0000000000060400, 0x0000000000000604, 0x0000000603020100,
    0x0000000006030201, 0x0000000006030200, 0x0000000000060302,
    0x0000000006030100, 0x0000000000060301, 0x0000000000060300,
    0x0000000000000603, 0x0000000006020100, 0x0000000000060201,
    0x0000000000060200, 0x0000000000000602, 0x0000000000060100,
    0x0000000000000601, 0x0000000000000600, 0x0000000000000006,
    0x0000050403020100, 0x0000000504030201, 0x0000000504030200,
    0x0000000005040302, 0x0000000504030100, 0x0000000005040301,
    0x0000000005040300, 0x0000000000050403, 0x0000000504020100,
    0x0000000005040201, 0x0000000005040200, 0x0000000000050402,
    0x0000000005040100, 0x0000000000050401, 0x0000000000050400,
    0x0000000000000504, 0x0000000503020100, 0x0000000005030201,
    0x0000000005030200, 0x0000000000050302, 0x0000000005030100,
    0x0000000000050301, 0x0000000000050300, 0x0000000000000503,
    0x0000000005020100, 0x0000000000050201, 0x0000000000050200,
    0x0000000000000502, 0x0000000000050100, 0x0000000000000501,
    0x0000000000000500, 0x0000000000000005, 0x0000000403020100,
    0x0000000004030201, 0x0000000004030200, 0x0000000000040302,
    0x0000000004030100, 0x0000000000040301, 0x0000000000040300,
    0x0000000000000403, 0x0000000004020100, 0x0000000000040201,
    0x0000000000040200, 0x0000000000000402, 0x0000000000040100,
    0x0000000000000401, 0x0000000000000400, 0x0000000000000004,
    0x0000000003020100, 0x0000000000030201, 0x0000000000030200,
    0x0000000000000302, 0x0000000000030100, 0x0000000000000301,
    0x0000000000000300, 0x0000000000000003, 0x0000000000020100,
    0x0000000000000201, 0x0000000000000200, 0x0000000000000002,
    0x0000000000000100, 0x0000000000000001, 0x0000000000000000,
    0x0000000000000000,
}; // end of thintable_epi8[256]
2489 
2490 } // namespace internal
2491 } // namespace simdjson
2492 
2493 #endif //  SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64
2494 /* end file src/internal/simdprune_tables.cpp */
2495 /* begin file src/implementation.cpp */
2496 #include <initializer_list>
2497 
2498 namespace simdjson {
2499 
supported_by_runtime_system() const2500 bool implementation::supported_by_runtime_system() const {
2501   uint32_t required_instruction_sets = this->required_instruction_sets();
2502   uint32_t supported_instruction_sets = internal::detect_supported_architectures();
2503   return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
2504 }
2505 
2506 namespace internal {
2507 
// One const instance per implementation compiled into this build. Each
// definition is guarded by its SIMDJSON_IMPLEMENTATION_* macro, so only the
// enabled architectures produce an object. We're hoping these get baked into
// the executable without requiring a static initializer.

#if SIMDJSON_IMPLEMENTATION_HASWELL
const haswell::implementation haswell_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_HASWELL
#if SIMDJSON_IMPLEMENTATION_WESTMERE
const westmere::implementation westmere_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_WESTMERE
#if SIMDJSON_IMPLEMENTATION_ARM64
const arm64::implementation arm64_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_ARM64
#if SIMDJSON_IMPLEMENTATION_PPC64
const ppc64::implementation ppc64_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_PPC64
#if SIMDJSON_IMPLEMENTATION_FALLBACK
const fallback::implementation fallback_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_FALLBACK
2526 
2527 /**
2528  * @private Detects best supported implementation on first use, and sets it
2529  */
2530 class detect_best_supported_implementation_on_first_use final : public implementation {
2531 public:
name() const2532   const std::string &name() const noexcept final { return set_best()->name(); }
description() const2533   const std::string &description() const noexcept final { return set_best()->description(); }
required_instruction_sets() const2534   uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
create_dom_parser_implementation(size_t capacity,size_t max_length,std::unique_ptr<internal::dom_parser_implementation> & dst) const2535   simdjson_warn_unused error_code create_dom_parser_implementation(
2536     size_t capacity,
2537     size_t max_length,
2538     std::unique_ptr<internal::dom_parser_implementation>& dst
2539   ) const noexcept final {
2540     return set_best()->create_dom_parser_implementation(capacity, max_length, dst);
2541   }
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const2542   simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
2543     return set_best()->minify(buf, len, dst, dst_len);
2544   }
validate_utf8(const char * buf,size_t len) const2545   simdjson_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override {
2546     return set_best()->validate_utf8(buf, len);
2547   }
detect_best_supported_implementation_on_first_use()2548   simdjson_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
2549 private:
2550   const implementation *set_best() const noexcept;
2551 };
2552 
// The one detector instance; active_implementation initially points at it.
const detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton;

// Addresses of every implementation compiled into this build. The order is
// significant: detect_best_supported() walks the list front to back and
// returns the first entry whose required instruction sets are all available,
// so earlier entries take priority over later ones.
const std::initializer_list<const implementation *> available_implementation_pointers {
#if SIMDJSON_IMPLEMENTATION_HASWELL
  &haswell_singleton,
#endif // SIMDJSON_IMPLEMENTATION_HASWELL
#if SIMDJSON_IMPLEMENTATION_WESTMERE
  &westmere_singleton,
#endif // SIMDJSON_IMPLEMENTATION_WESTMERE
#if SIMDJSON_IMPLEMENTATION_ARM64
  &arm64_singleton,
#endif // SIMDJSON_IMPLEMENTATION_ARM64
#if SIMDJSON_IMPLEMENTATION_PPC64
  &ppc64_singleton,
#endif // SIMDJSON_IMPLEMENTATION_PPC64
#if SIMDJSON_IMPLEMENTATION_FALLBACK
  &fallback_singleton,
#endif // SIMDJSON_IMPLEMENTATION_FALLBACK
}; // available_implementation_pointers
2572 
2573 // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support
2574 class unsupported_implementation final : public implementation {
2575 public:
create_dom_parser_implementation(size_t,size_t,std::unique_ptr<internal::dom_parser_implementation> &) const2576   simdjson_warn_unused error_code create_dom_parser_implementation(
2577     size_t,
2578     size_t,
2579     std::unique_ptr<internal::dom_parser_implementation>&
2580   ) const noexcept final {
2581     return UNSUPPORTED_ARCHITECTURE;
2582   }
minify(const uint8_t *,size_t,uint8_t *,size_t &) const2583   simdjson_warn_unused error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override {
2584     return UNSUPPORTED_ARCHITECTURE;
2585   }
validate_utf8(const char *,size_t) const2586   simdjson_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override {
2587     return false; // Just refuse to validate. Given that we have a fallback implementation
2588     // it seems unlikely that unsupported_implementation will ever be used. If it is used,
2589     // then it will flag all strings as invalid. The alternative is to return an error_code
2590     // from which the user has to figure out whether the string is valid UTF-8... which seems
2591     // like a lot of work just to handle the very unlikely case that we have an unsupported
2592     // implementation. And, when it does happen (that we have an unsupported implementation),
2593     // what are the chances that the programmer has a fallback? Given that *we* provide the
2594     // fallback, it implies that the programmer would need a fallback for our fallback.
2595   }
unsupported_implementation()2596   unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
2597 };
2598 
// Shared instance returned when no listed implementation matches the CPU, or
// when SIMDJSON_FORCE_IMPLEMENTATION names an implementation that is not available.
const unsupported_implementation unsupported_singleton{};
2600 
size() const2601 size_t available_implementation_list::size() const noexcept {
2602   return internal::available_implementation_pointers.size();
2603 }
begin() const2604 const implementation * const *available_implementation_list::begin() const noexcept {
2605   return internal::available_implementation_pointers.begin();
2606 }
end() const2607 const implementation * const *available_implementation_list::end() const noexcept {
2608   return internal::available_implementation_pointers.end();
2609 }
detect_best_supported() const2610 const implementation *available_implementation_list::detect_best_supported() const noexcept {
2611   // They are prelisted in priority order, so we just go down the list
2612   uint32_t supported_instruction_sets = internal::detect_supported_architectures();
2613   for (const implementation *impl : internal::available_implementation_pointers) {
2614     uint32_t required_instruction_sets = impl->required_instruction_sets();
2615     if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; }
2616   }
2617   return &unsupported_singleton; // this should never happen?
2618 }
2619 
set_best() const2620 const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept {
2621   SIMDJSON_PUSH_DISABLE_WARNINGS
2622   SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
2623   char *force_implementation_name = getenv("SIMDJSON_FORCE_IMPLEMENTATION");
2624   SIMDJSON_POP_DISABLE_WARNINGS
2625 
2626   if (force_implementation_name) {
2627     auto force_implementation = available_implementations[force_implementation_name];
2628     if (force_implementation) {
2629       return active_implementation = force_implementation;
2630     } else {
2631       // Note: abort() and stderr usage within the library is forbidden.
2632       return active_implementation = &unsupported_singleton;
2633     }
2634   }
2635   return active_implementation = available_implementations.detect_best_supported();
2636 }
2637 
2638 } // namespace internal
2639 
// Handle on the list of implementations compiled into this build.
SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations{};
// The implementation used by the free functions below. It starts out pointing
// at the detector singleton, whose set_best() replaces it with the selected
// implementation the first time one of the detector's methods is called.
SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_implementation{&internal::detect_best_supported_implementation_on_first_use_singleton};
2642 
minify(const char * buf,size_t len,char * dst,size_t & dst_len)2643 simdjson_warn_unused error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept {
2644   return active_implementation->minify(reinterpret_cast<const uint8_t *>(buf), len, reinterpret_cast<uint8_t *>(dst), dst_len);
2645 }
validate_utf8(const char * buf,size_t len)2646 simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
2647   return active_implementation->validate_utf8(buf, len);
2648 }
2649 
builtin_implementation()2650 const implementation * builtin_implementation() {
2651   static const implementation * builtin_impl = available_implementations[STRINGIFY(SIMDJSON_BUILTIN_IMPLEMENTATION)];
2652   assert(builtin_impl);
2653   return builtin_impl;
2654 }
2655 
2656 
2657 } // namespace simdjson
2658 /* end file src/implementation.cpp */
2659 
2660 #if SIMDJSON_IMPLEMENTATION_ARM64
2661 /* begin file src/arm64/implementation.cpp */
2662 /* begin file include/simdjson/arm64/begin.h */
2663 // redefining SIMDJSON_IMPLEMENTATION to "arm64"
2664 // #define SIMDJSON_IMPLEMENTATION arm64
2665 /* end file include/simdjson/arm64/begin.h */
2666 
2667 namespace simdjson {
2668 namespace arm64 {
2669 
create_dom_parser_implementation(size_t capacity,size_t max_depth,std::unique_ptr<internal::dom_parser_implementation> & dst) const2670 simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
2671   size_t capacity,
2672   size_t max_depth,
2673   std::unique_ptr<internal::dom_parser_implementation>& dst
2674 ) const noexcept {
2675   dst.reset( new (std::nothrow) dom_parser_implementation() );
2676   if (!dst) { return MEMALLOC; }
2677   dst->set_capacity(capacity);
2678   dst->set_max_depth(max_depth);
2679   return SUCCESS;
2680 }
2681 
2682 } // namespace arm64
2683 } // namespace simdjson
2684 
2685 /* begin file include/simdjson/arm64/end.h */
2686 /* end file include/simdjson/arm64/end.h */
2687 /* end file src/arm64/implementation.cpp */
2688 /* begin file src/arm64/dom_parser_implementation.cpp */
2689 /* begin file include/simdjson/arm64/begin.h */
2690 // redefining SIMDJSON_IMPLEMENTATION to "arm64"
2691 // #define SIMDJSON_IMPLEMENTATION arm64
2692 /* end file include/simdjson/arm64/begin.h */
2693 
2694 //
2695 // Stage 1
2696 //
2697 namespace simdjson {
2698 namespace arm64 {
2699 namespace {
2700 
2701 using namespace simd;
2702 
2703 struct json_character_block {
2704   static simdjson_really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
2705 
whitespacesimdjson::arm64::__anon9bb6be6f0311::json_character_block2706   simdjson_really_inline uint64_t whitespace() const noexcept { return _whitespace; }
opsimdjson::arm64::__anon9bb6be6f0311::json_character_block2707   simdjson_really_inline uint64_t op() const noexcept { return _op; }
scalarsimdjson::arm64::__anon9bb6be6f0311::json_character_block2708   simdjson_really_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
2709 
2710   uint64_t _whitespace;
2711   uint64_t _op;
2712 };
2713 
/**
 * Classify the bytes of a block into whitespace and operator bitmasks.
 *
 * Every byte is looked up twice: its low nibble in table1 and its high nibble in
 * table2. The two results are ANDed together. In the combined value, any bit in
 * 0x07 marks an operator and any bit in 0x18 marks whitespace. For example,
 * ' ' (0x20) gives 16 & 18 = 16 (whitespace) and ',' (0x2C) gives 2 & 18 = 2 (operator).
 *
 * @param in the block to classify
 * @return the { whitespace, op } masks obtained through to_bitmask()
 */
simdjson_really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
  // Functional programming causes trouble with Visual Studio.
  // Keeping this version in comments since it is much nicer:
  // auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
  //  auto nib_lo = chunk & 0xf;
  //  auto nib_hi = chunk.shr<4>();
  //  auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
  //  auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
  //  return shuf_lo & shuf_hi;
  // });
  // table1 is indexed by the low nibble, table2 by the high nibble.
  const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
  const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);

  // Same computation as the commented-out map() above, written out once per chunk.
  simd8x64<uint8_t> v(
     (in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
     (in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
     (in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
     (in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2)
  );


  // We compute whitespace and op separately. If the code later only uses one or the
  // other, given the fact that all functions are aggressively inlined, we can
  // hope that useless computations will be omitted. This is namely the case when
  // minifying (we only need whitespace). *However* if we only need spaces,
  // it is likely that we will still compute 'v' above with two lookup_16: one
  // could do it a bit cheaper. This is in contrast with the x64 implementations
  // where we can, efficiently, do the white space and structural matching
  // separately. One reason for this difference is that on ARM NEON, the table
  // lookups either zero or leave unchanged the characters exceeding 0xF whereas
  // on x64, the equivalent instruction (pshufb) automatically applies a mask,
  // ignoring the 4 most significant bits. Thus the x64 implementation is
  // optimized differently. This being said, if you use this code strictly
  // just for minification (or just to identify the structural characters),
  // there is a small untaken optimization opportunity here. We deliberately
  // do not pick it up.

  // Operators: any of the three low bits (0x7) is set in the combined lookup.
  uint64_t op = simd8x64<bool>(
        v.chunks[0].any_bits_set(0x7),
        v.chunks[1].any_bits_set(0x7),
        v.chunks[2].any_bits_set(0x7),
        v.chunks[3].any_bits_set(0x7)
  ).to_bitmask();

  // Whitespace: bit 3 or bit 4 (0x18) is set in the combined lookup.
  uint64_t whitespace = simd8x64<bool>(
        v.chunks[0].any_bits_set(0x18),
        v.chunks[1].any_bits_set(0x18),
        v.chunks[2].any_bits_set(0x18),
        v.chunks[3].any_bits_set(0x18)
  ).to_bitmask();

  // Order matches the member order of json_character_block (_whitespace, _op).
  return { whitespace, op };
}
2767 
is_ascii(const simd8x64<uint8_t> & input)2768 simdjson_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
2769     simd8<uint8_t> bits = input.reduce_or();
2770     return bits.max_val() < 0b10000000u;
2771 }
2772 
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)2773 simdjson_unused simdjson_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
2774     simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
2775     simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
2776     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
2777     // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well.
2778     // This will work fine because we only have to report errors for cases with 0-1 lead bytes.
2779     // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is
2780     // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character.
2781     // The error will be detected there.
2782     return is_second_byte ^ is_third_byte ^ is_fourth_byte;
2783 }
2784 
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)2785 simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
2786     simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
2787     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
2788     return is_third_byte ^ is_fourth_byte;
2789 }
2790 
2791 } // unnamed namespace
2792 } // namespace arm64
2793 } // namespace simdjson
2794 
2795 /* begin file src/generic/stage1/utf8_lookup4_algorithm.h */
2796 namespace simdjson {
2797 namespace arm64 {
2798 namespace {
2799 namespace utf8_validation {
2800 
2801 using namespace simd;
2802 
  /**
   * Classify each pair of adjacent bytes (byte 1 = previous byte, byte 2 = current byte).
   *
   * Three 16-entry tables are consulted: the high nibble of byte 1, the low nibble of
   * byte 1 and the high nibble of byte 2. The three lookups are ANDed, so a bit survives
   * only when all three tables allow it for that pair.
   *
   * @param input the current bytes (byte 2 of each pair)
   * @param prev1 the previous bytes (byte 1 of each pair); check_utf8_bytes passes
   *              input.prev<1>(prev_input)
   * @return for each byte, the set of bits (listed below) that apply to its pair
   */
  simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large (second byte 1000____) and Overlong 4-byte, which share the bit
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // Same bit as TOO_LARGE_1000: both apply to a 1111____ lead followed by 1000____.
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Table 1: what the high nibble of byte 1 allows.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    // Table 2: what the low nibble of byte 1 allows. CARRY is present in every entry so
    // that the bits that do not depend on this nibble pass through the final AND.
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Table 3: what the high nibble of byte 2 allows.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // A bit remains set only if all three tables allow it for this byte pair.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)2893   simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
2894       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
2895     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
2896     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
2897     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
2898     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
2899     return must23_80 ^ sc;
2900   }
2901 
2902   //
2903   // Return nonzero if there are incomplete multibyte characters at the end of the block:
2904   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
2905   //
is_incomplete(const simd8<uint8_t> input)2906   simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
2907     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
2908     // ... 1111____ 111_____ 11______
2909     static const uint8_t max_array[32] = {
2910       255, 255, 255, 255, 255, 255, 255, 255,
2911       255, 255, 255, 255, 255, 255, 255, 255,
2912       255, 255, 255, 255, 255, 255, 255, 255,
2913       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
2914     };
2915     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
2916     return input.gt_bits(max_value);
2917   }
2918 
2919   struct utf8_checker {
2920     // If this is nonzero, there has been a UTF-8 error.
2921     simd8<uint8_t> error;
2922     // The last input we received
2923     simd8<uint8_t> prev_input_block;
2924     // Whether the last input we received was incomplete (used for ASCII fast path)
2925     simd8<uint8_t> prev_incomplete;
2926 
2927     //
2928     // Check whether the current bytes are valid UTF-8.
2929     //
check_utf8_bytessimdjson::arm64::__anon9bb6be6f0411::utf8_validation::utf8_checker2930     simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
2931       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
2932       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
2933       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
2934       simd8<uint8_t> sc = check_special_cases(input, prev1);
2935       this->error |= check_multibyte_lengths(input, prev_input, sc);
2936     }
2937 
2938     // The only problem that can happen at EOF is that a multibyte character is too short
2939     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
2940     // too large in the first of two bytes.
check_eofsimdjson::arm64::__anon9bb6be6f0411::utf8_validation::utf8_checker2941     simdjson_really_inline void check_eof() {
2942       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
2943       // possibly finish them.
2944       this->error |= this->prev_incomplete;
2945     }
2946 
check_next_inputsimdjson::arm64::__anon9bb6be6f0411::utf8_validation::utf8_checker2947     simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
2948       if(simdjson_likely(is_ascii(input))) {
2949         this->error |= this->prev_incomplete;
2950       } else {
2951         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
2952         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
2953             "We support either two or four chunks per 64-byte block.");
2954         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
2955           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
2956           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
2957         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
2958           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
2959           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
2960           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
2961           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
2962         }
2963         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
2964         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
2965 
2966       }
2967     }
2968     // do not forget to call check_eof!
errorssimdjson::arm64::__anon9bb6be6f0411::utf8_validation::utf8_checker2969     simdjson_really_inline error_code errors() {
2970       return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
2971     }
2972 
2973   }; // struct utf8_checker
2974 } // namespace utf8_validation
2975 
2976 using utf8_validation::utf8_checker;
2977 
2978 } // unnamed namespace
2979 } // namespace arm64
2980 } // namespace simdjson
2981 /* end file src/generic/stage1/utf8_lookup4_algorithm.h */
2982 /* begin file src/generic/stage1/json_structural_indexer.h */
2983 // This file contains the common code every implementation uses in stage1
2984 // It is intended to be included multiple times and compiled multiple times
2985 // We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
2987 
2988 /* begin file src/generic/stage1/buf_block_reader.h */
2989 namespace simdjson {
2990 namespace arm64 {
2991 namespace {
2992 
// Walks through a buffer in block-sized (STEP_SIZE bytes) increments; the last part
// can be loaded separately, padded with spaces.
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
  // Start at offset 0 of the _len bytes located at _buf.
  simdjson_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  // Current offset, in bytes, from the start of the buffer.
  simdjson_really_inline size_t block_index();
  // True while the current offset is below len - STEP_SIZE (never true when len < STEP_SIZE).
  simdjson_really_inline bool has_full_block() const;
  // Pointer to the current offset inside the original buffer (nothing is copied).
  simdjson_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless no bytes remain
   * (the current offset equals len, e.g. len == 0). In that case this function returns 0
   * and leaves dst untouched. In particular, if len == STEP_SIZE there will be
   * 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
   *
   * @param dst destination buffer; STEP_SIZE bytes are written when the remainder is non-empty.
   * @return the number of effective characters in the last block.
   */
  simdjson_really_inline size_t get_remainder(uint8_t *dst) const;
  // Move to the next block by adding STEP_SIZE to the current offset.
  simdjson_really_inline void advance();
private:
  // Start of the buffer
  const uint8_t *buf;
  // Total number of bytes in the buffer
  const size_t len;
  // len - STEP_SIZE, or 0 when len < STEP_SIZE
  const size_t lenminusstep;
  // Current offset into buf
  size_t idx;
};
3018 
3019 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)3020 simdjson_unused static char * format_input_text_64(const uint8_t *text) {
3021   static char buf[sizeof(simd8x64<uint8_t>) + 1];
3022   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
3023     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
3024   }
3025   buf[sizeof(simd8x64<uint8_t>)] = '\0';
3026   return buf;
3027 }
3028 
3029 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)3030 simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
3031   static char buf[sizeof(simd8x64<uint8_t>) + 1];
3032   in.store(reinterpret_cast<uint8_t*>(buf));
3033   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
3034     if (buf[i] < ' ') { buf[i] = '_'; }
3035   }
3036   buf[sizeof(simd8x64<uint8_t>)] = '\0';
3037   return buf;
3038 }
3039 
format_mask(uint64_t mask)3040 simdjson_unused static char * format_mask(uint64_t mask) {
3041   static char buf[sizeof(simd8x64<uint8_t>) + 1];
3042   for (size_t i=0; i<64; i++) {
3043     buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
3044   }
3045   buf[64] = '\0';
3046   return buf;
3047 }
3048 
3049 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)3050 simdjson_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
3051 
3052 template<size_t STEP_SIZE>
block_index()3053 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
3054 
3055 template<size_t STEP_SIZE>
has_full_block() const3056 simdjson_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
3057   return idx < lenminusstep;
3058 }
3059 
3060 template<size_t STEP_SIZE>
full_block() const3061 simdjson_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
3062   return &buf[idx];
3063 }
3064 
3065 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const3066 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
3067   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
3068   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
3069   std::memcpy(dst, buf + idx, len - idx);
3070   return len - idx;
3071 }
3072 
3073 template<size_t STEP_SIZE>
advance()3074 simdjson_really_inline void buf_block_reader<STEP_SIZE>::advance() {
3075   idx += STEP_SIZE;
3076 }
3077 
3078 } // unnamed namespace
3079 } // namespace arm64
3080 } // namespace simdjson
3081 /* end file src/generic/stage1/buf_block_reader.h */
3082 /* begin file src/generic/stage1/json_string_scanner.h */
3083 namespace simdjson {
3084 namespace arm64 {
3085 namespace {
3086 namespace stage1 {
3087 
3088 struct json_string_block {
3089   // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_string_blocksimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3090   simdjson_really_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) :
3091   _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {}
3092 
3093   // Escaped characters (characters following an escape() character)
escapedsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3094   simdjson_really_inline uint64_t escaped() const { return _escaped; }
3095   // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
escapesimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3096   simdjson_really_inline uint64_t escape() const { return _backslash & ~_escaped; }
3097   // Real (non-backslashed) quotes
quotesimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3098   simdjson_really_inline uint64_t quote() const { return _quote; }
3099   // Start quotes of strings
string_startsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3100   simdjson_really_inline uint64_t string_start() const { return _quote & _in_string; }
3101   // End quotes of strings
string_endsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3102   simdjson_really_inline uint64_t string_end() const { return _quote & ~_in_string; }
3103   // Only characters inside the string (not including the quotes)
string_contentsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3104   simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
3105   // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_inside_stringsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3106   simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
3107   // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_outside_stringsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3108   simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
3109   // Tail of string (everything except the start quote)
string_tailsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3110   simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
3111 
3112   // backslash characters
3113   uint64_t _backslash;
3114   // escaped characters (backslashed--does not include the hex characters after \u)
3115   uint64_t _escaped;
3116   // real quotes (non-backslashed ones)
3117   uint64_t _quote;
3118   // string characters (includes start quote but not end quote)
3119   uint64_t _in_string;
3120 };
3121 
// Scans blocks for string characters, storing the state necessary to do so.
// Two pieces of state are carried from one block to the next: whether the block ended
// inside a string, and whether the first character of the next block is escaped.
class json_string_scanner {
public:
  // Compute the string masks for the given block and update the carried state.
  simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
  // Returns either UNCLOSED_STRING or SUCCESS
  simdjson_really_inline error_code finish();

private:
  // Intended to be defined by the implementation
  simdjson_really_inline uint64_t find_escaped(uint64_t escape);
  // Generic version that computes the escaped-character mask without branching.
  simdjson_really_inline uint64_t find_escaped_branchless(uint64_t escape);

  // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
  uint64_t prev_in_string = 0ULL;
  // Whether the first character of the next iteration is escaped.
  uint64_t prev_escaped = 0ULL;
};
3139 
3140 //
3141 // Finds escaped characters (characters following \).
3142 //
3143 // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
3144 //
3145 // Does this by:
3146 // - Shift the escape mask to get potentially escaped characters (characters after backslashes).
3147 // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
3148 // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
3149 //
3150 // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
3151 // escape sequences, filters out the ones that start on even bits, and adds that to the mask of
3152 // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
3153 // the start bit causes a carry), and leaves even-bit sequences alone.
3154 //
3155 // Example:
3156 //
3157 // text           |  \\\ | \\\"\\\" \\\" \\"\\" |
3158 // escape         |  xxx |  xx xxx  xxx  xx xx  | Removed overflow backslash; will | it into follows_escape
3159 // odd_starts     |  x   |  x       x       x   | escape & ~even_bits & ~follows_escape
3160 // even_seq       |     c|    cxxx     c xx   c | c = carry bit -- will be masked out later
3161 // invert_mask    |      |     cxxx     c xx   c| even_seq << 1
3162 // follows_escape |   xx | x xx xxx  xxx  xx xx | Includes overflow bit
3163 // escaped        |   x  | x x  x x  x x  x  x  |
3164 // desired        |   x  | x x  x x  x x  x  x  |
3165 // text           |  \\\ | \\\"\\\" \\\" \\"\\" |
3166 //
find_escaped_branchless(uint64_t backslash)3167 simdjson_really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) {
3168   // If there was overflow, pretend the first character isn't a backslash
3169   backslash &= ~prev_escaped;
3170   uint64_t follows_escape = backslash << 1 | prev_escaped;
3171 
3172   // Get sequences starting on even bits by clearing out the odd series using +
3173   const uint64_t even_bits = 0x5555555555555555ULL;
3174   uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
3175   uint64_t sequences_starting_on_even_bits;
3176   prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
3177   uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
3178 
3179   // Mask every other backslashed character as an escaped character
3180   // Flip the mask for sequences that start on even bits, to correct them
3181   return (even_bits ^ invert_mask) & follows_escape;
3182 }
3183 
3184 //
3185 // Return a mask of all string characters plus end quotes.
3186 //
3187 // prev_escaped is overflow saying whether the next character is escaped.
3188 // prev_in_string is overflow saying whether we're still in a string.
3189 //
3190 // Backslash sequences outside of quotes will be detected in stage 2.
3191 //
next(const simd::simd8x64<uint8_t> & in)3192 simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
3193   const uint64_t backslash = in.eq('\\');
3194   const uint64_t escaped = find_escaped(backslash);
3195   const uint64_t quote = in.eq('"') & ~escaped;
3196 
3197   //
3198   // prefix_xor flips on bits inside the string (and flips off the end quote).
3199   //
3200   // Then we xor with prev_in_string: if we were in a string already, its effect is flipped
3201   // (characters inside strings are outside, and characters outside strings are inside).
3202   //
3203   const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
3204 
3205   //
3206   // Check if we're still in a string at the end of the box so the next block will know
3207   //
3208   // right shift of a signed value expected to be well-defined and standard
3209   // compliant as of C++20, John Regher from Utah U. says this is fine code
3210   //
3211   prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
3212 
3213   // Use ^ to turn the beginning quote off, and the end quote on.
3214 
3215   // We are returning a function-local object so either we get a move constructor
3216   // or we get copy elision.
3217   return json_string_block(
3218     backslash,
3219     escaped,
3220     quote,
3221     in_string
3222   );
3223 }
3224 
finish()3225 simdjson_really_inline error_code json_string_scanner::finish() {
3226   if (prev_in_string) {
3227     return UNCLOSED_STRING;
3228   }
3229   return SUCCESS;
3230 }
3231 
3232 } // namespace stage1
3233 } // unnamed namespace
3234 } // namespace arm64
3235 } // namespace simdjson
3236 /* end file src/generic/stage1/json_string_scanner.h */
3237 /* begin file src/generic/stage1/json_scanner.h */
3238 namespace simdjson {
3239 namespace arm64 {
3240 namespace {
3241 namespace stage1 {
3242 
3243 /**
3244  * A block of scanned json, with information on operators and scalars.
3245  *
3246  * We seek to identify pseudo-structural characters. Anything that is inside
3247  * a string must be omitted (hence  & ~_string.string_tail()).
3248  * Otherwise, pseudo-structural characters come in two forms.
3249  * 1. We have the structural characters ([,],{,},:, comma). The
3250  *    term 'structural character' is from the JSON RFC.
3251  * 2. We have the 'scalar pseudo-structural characters'.
3252  *    Scalars are quotes, and any character except structural characters and white space.
3253  *
3254  * To identify the scalar pseudo-structural characters, we must look at what comes
 * before them: it must be a space, a quote or a structural character.
3256  * Starting with simdjson v0.3, we identify them by
3257  * negation: we identify everything that is followed by a non-quote scalar,
3258  * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
3259  */
3260 struct json_block {
3261 public:
3262   // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_blocksimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3263   simdjson_really_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
3264   _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
json_blocksimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3265   simdjson_really_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
3266   _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
3267 
3268   /**
3269    * The start of structurals.
3270    * In simdjson prior to v0.3, these were called the pseudo-structural characters.
3271    **/
structural_startsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3272   simdjson_really_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
3273   /** All JSON whitespace (i.e. not in a string) */
whitespacesimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3274   simdjson_really_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
3275 
3276   // Helpers
3277 
3278   /** Whether the given characters are inside a string (only works on non-quotes) */
non_quote_inside_stringsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3279   simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
3280   /** Whether the given characters are outside a string (only works on non-quotes) */
non_quote_outside_stringsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3281   simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
3282 
3283   // string and escape characters
3284   json_string_block _string;
3285   // whitespace, structural characters ('operators'), scalars
3286   json_character_block _characters;
3287   // whether the previous character was a scalar
3288   uint64_t _follows_potential_nonquote_scalar;
3289 private:
3290   // Potential structurals (i.e. disregarding strings)
3291 
3292   /**
3293    * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
3294    * They may reside inside a string.
3295    **/
potential_structural_startsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3296   simdjson_really_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
3297   /**
3298    * The start of non-operator runs, like 123, true and "abc".
3299    * It main reside inside a string.
3300    **/
potential_scalar_startsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3301   simdjson_really_inline uint64_t potential_scalar_start() const noexcept {
3302     // The term "scalar" refers to anything except structural characters and white space
3303     // (so letters, numbers, quotes).
3304     // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
3305     // then we know that it is irrelevant structurally.
3306     return _characters.scalar() & ~follows_potential_scalar();
3307   }
3308   /**
3309    * Whether the given character is immediately after a non-operator like 123, true.
3310    * The characters following a quote are not included.
3311    */
follows_potential_scalarsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3312   simdjson_really_inline uint64_t follows_potential_scalar() const noexcept {
3313     // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
3314     // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
3315     // white space.
3316     // It is understood that within quoted region, anything at all could be marked (irrelevant).
3317     return _follows_potential_nonquote_scalar;
3318   }
3319 };
3320 
3321 /**
3322  * Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
3323  *
3324  * The scanner starts by calculating two distinct things:
3325  * - string characters (taking \" into account)
3326  * - structural characters or 'operators' ([]{},:, comma)
3327  *   and scalars (runs of non-operators like 123, true and "abc")
3328  *
3329  * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
3330  * in particular, the operator/scalar bit will find plenty of things that are actually part of
3331  * strings. When we're done, json_block will fuse the two together by masking out tokens that are
3332  * part of a string.
3333  */
class json_scanner {
public:
  // Creates a scanner with no carried-over state: prev_scalar starts at 0 and the
  // string scanner is value-initialized.
  json_scanner() {}
  // Scans one simd8x64 input and returns the json_block describing it. State that spans
  // inputs (prev_scalar and the string scanner) is updated, so inputs must be fed in order.
  simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
  // Returns either UNCLOSED_STRING or SUCCESS
  simdjson_really_inline error_code finish();

private:
  // Whether the last character of the previous iteration is part of a scalar token
  // (anything except whitespace or a structural character/'operator').
  // Passed to follows() as the carry between consecutive inputs.
  uint64_t prev_scalar = 0ULL;
  // Tracks string state across inputs; finish() forwards to it.
  json_string_scanner string_scanner{};
};
3347 
3348 
//
// Marks every bit position that comes immediately after a set bit of `match`.
//
// `overflow` carries state between consecutive 64-bit masks: its incoming value is OR-ed into
// the result (so a match on the last position of the previous mask marks position 0 here), and
// on return it holds bit 63 of `match` (0 or 1) for the next call.
//
// For example, this finds quotes with a backslash directly in front of them:
//
//     const uint64_t backslashed_quote = in.eq('"') & follows(in.eq('\\'), prev_backslash);
//
simdjson_really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
  const uint64_t carry_in = overflow;
  overflow = match >> 63;
  return (match << 1) | carry_in;
}
3361 
next(const simd::simd8x64<uint8_t> & in)3362 simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
3363   json_string_block strings = string_scanner.next(in);
3364   // identifies the white-space and the structurat characters
3365   json_character_block characters = json_character_block::classify(in);
3366   // The term "scalar" refers to anything except structural characters and white space
3367   // (so letters, numbers, quotes).
3368   // We want  follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
3369   //
3370   // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
3371   // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
3372   // pseudo-structural character just like we would if we had  ' "a string" true '; otherwise we
3373   // may need to add an extra check when parsing strings.
3374   //
3375   // Performance: there are many ways to skin this cat.
3376   const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote();
3377   uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar);
3378   // We are returning a function-local object so either we get a move constructor
3379   // or we get copy elision.
3380   return json_block(
3381     strings,// strings is a function-local object so either it moves or the copy is elided.
3382     characters,
3383     follows_nonquote_scalar
3384   );
3385 }
3386 
finish()3387 simdjson_really_inline error_code json_scanner::finish() {
3388   return string_scanner.finish();
3389 }
3390 
3391 } // namespace stage1
3392 } // unnamed namespace
3393 } // namespace arm64
3394 } // namespace simdjson
3395 /* end file src/generic/stage1/json_scanner.h */
3396 /* begin file src/generic/stage1/json_minifier.h */
3397 // This file contains the common code every implementation uses in stage1
3398 // It is intended to be included multiple times and compiled multiple times
3399 // We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
3401 
3402 namespace simdjson {
3403 namespace arm64 {
3404 namespace {
3405 namespace stage1 {
3406 
class json_minifier {
public:
  // Copies the `len` bytes at `buf` to `dst` with JSON whitespace (outside strings) removed,
  // consuming STEP_SIZE bytes per step. On success `dst_len` receives the number of bytes
  // written; if the scanner reports an error, `dst_len` is set to 0 and the error is returned.
  template<size_t STEP_SIZE>
  static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;

private:
  // Starts a minifier whose output cursor is _dst.
  simdjson_really_inline json_minifier(uint8_t *_dst)
  : dst{_dst}
  {}
  // Scans and minifies one STEP_SIZE-byte block, then advances the reader.
  template<size_t STEP_SIZE>
  simdjson_really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
  // Writes the non-whitespace bytes of one input at dst and moves dst past them.
  simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
  // Finishes the scan and reports the output length relative to dst_start.
  simdjson_really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
  // Scanner shared by all steps; it carries state from one input to the next.
  json_scanner scanner{};
  // Output cursor: where the next minified bytes are written.
  uint8_t *dst;
};
3423 
next(const simd::simd8x64<uint8_t> & in,const json_block & block)3424 simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
3425   uint64_t mask = block.whitespace();
3426   in.compress(mask, dst);
3427   dst += 64 - count_ones(mask);
3428 }
3429 
finish(uint8_t * dst_start,size_t & dst_len)3430 simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
3431   error_code error = scanner.finish();
3432   if (error) { dst_len = 0; return error; }
3433   dst_len = dst - dst_start;
3434   return SUCCESS;
3435 }
3436 
3437 template<>
step(const uint8_t * block_buf,buf_block_reader<128> & reader)3438 simdjson_really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
3439   simd::simd8x64<uint8_t> in_1(block_buf);
3440   simd::simd8x64<uint8_t> in_2(block_buf+64);
3441   json_block block_1 = scanner.next(in_1);
3442   json_block block_2 = scanner.next(in_2);
3443   this->next(in_1, block_1);
3444   this->next(in_2, block_2);
3445   reader.advance();
3446 }
3447 
3448 template<>
step(const uint8_t * block_buf,buf_block_reader<64> & reader)3449 simdjson_really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
3450   simd::simd8x64<uint8_t> in_1(block_buf);
3451   json_block block_1 = scanner.next(in_1);
3452   this->next(block_buf, block_1);
3453   reader.advance();
3454 }
3455 
3456 template<size_t STEP_SIZE>
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len)3457 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
3458   buf_block_reader<STEP_SIZE> reader(buf, len);
3459   json_minifier minifier(dst);
3460 
3461   // Index the first n-1 blocks
3462   while (reader.has_full_block()) {
3463     minifier.step<STEP_SIZE>(reader.full_block(), reader);
3464   }
3465 
3466   // Index the last (remainder) block, padded with spaces
3467   uint8_t block[STEP_SIZE];
3468   size_t remaining_bytes = reader.get_remainder(block);
3469   if (remaining_bytes > 0) {
3470     // We do not want to write directly to the output stream. Rather, we write
3471     // to a local buffer (for safety).
3472     uint8_t out_block[STEP_SIZE];
3473     uint8_t * const guarded_dst{minifier.dst};
3474     minifier.dst = out_block;
3475     minifier.step<STEP_SIZE>(block, reader);
3476     size_t to_write = minifier.dst - out_block;
3477     // In some cases, we could be enticed to consider the padded spaces
3478     // as part of the string. This is fine as long as we do not write more
3479     // than we consumed.
3480     if(to_write > remaining_bytes) { to_write = remaining_bytes; }
3481     memcpy(guarded_dst, out_block, to_write);
3482     minifier.dst = guarded_dst + to_write;
3483   }
3484   return minifier.finish(dst, dst_len);
3485 }
3486 
3487 } // namespace stage1
3488 } // unnamed namespace
3489 } // namespace arm64
3490 } // namespace simdjson
3491 /* end file src/generic/stage1/json_minifier.h */
3492 /* begin file src/generic/stage1/find_next_document_index.h */
3493 namespace simdjson {
3494 namespace arm64 {
3495 namespace {
3496 
3497 /**
3498   * This algorithm is used to quickly identify the last structural position that
3499   * makes up a complete document.
3500   *
3501   * It does this by going backwards and finding the last *document boundary* (a
3502   * place where one value follows another without a comma between them). If the
3503   * last document (the characters after the boundary) has an equal number of
3504   * start and end brackets, it is considered complete.
3505   *
3506   * Simply put, we iterate over the structural characters, starting from
3507   * the end. We consider that we found the end of a JSON document when the
3508   * first element of the pair is NOT one of these characters: '{' '[' ';' ','
3509   * and when the second element is NOT one of these characters: '}' '}' ';' ','.
3510   *
3511   * This simple comparison works most of the time, but it does not cover cases
3512   * where the batch's structural indexes contain a perfect amount of documents.
3513   * In such a case, we do not have access to the structural index which follows
3514   * the last document, therefore, we do not have access to the second element in
3515   * the pair, and that means we cannot identify the last document. To fix this
3516   * issue, we keep a count of the open and closed curly/square braces we found
3517   * while searching for the pair. When we find a pair AND the count of open and
3518   * closed curly/square braces is the same, we know that we just passed a
3519   * complete document, therefore the last json buffer location is the end of the
3520   * batch.
3521   */
find_next_document_index(dom_parser_implementation & parser)3522 simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
3523   // TODO don't count separately, just figure out depth
3524   auto arr_cnt = 0;
3525   auto obj_cnt = 0;
3526   for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
3527     auto idxb = parser.structural_indexes[i];
3528     switch (parser.buf[idxb]) {
3529     case ':':
3530     case ',':
3531       continue;
3532     case '}':
3533       obj_cnt--;
3534       continue;
3535     case ']':
3536       arr_cnt--;
3537       continue;
3538     case '{':
3539       obj_cnt++;
3540       break;
3541     case '[':
3542       arr_cnt++;
3543       break;
3544     }
3545     auto idxa = parser.structural_indexes[i - 1];
3546     switch (parser.buf[idxa]) {
3547     case '{':
3548     case '[':
3549     case ':':
3550     case ',':
3551       continue;
3552     }
3553     // Last document is complete, so the next document will appear after!
3554     if (!arr_cnt && !obj_cnt) {
3555       return parser.n_structural_indexes;
3556     }
3557     // Last document is incomplete; mark the document at i + 1 as the next one
3558     return i;
3559   }
3560   return 0;
3561 }
3562 
3563 } // unnamed namespace
3564 } // namespace arm64
3565 } // namespace simdjson
3566 /* end file src/generic/stage1/find_next_document_index.h */
3567 
3568 namespace simdjson {
3569 namespace arm64 {
3570 namespace {
3571 namespace stage1 {
3572 
3573 class bit_indexer {
3574 public:
3575   uint32_t *tail;
3576 
bit_indexer(uint32_t * index_buf)3577   simdjson_really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
3578 
3579   // flatten out values in 'bits' assuming that they are are to have values of idx
3580   // plus their position in the bitvector, and store these indexes at
3581   // base_ptr[base] incrementing base as we go
3582   // will potentially store extra values beyond end of valid bits, so base_ptr
3583   // needs to be large enough to handle this
write(uint32_t idx,uint64_t bits)3584   simdjson_really_inline void write(uint32_t idx, uint64_t bits) {
3585     // In some instances, the next branch is expensive because it is mispredicted.
3586     // Unfortunately, in other cases,
3587     // it helps tremendously.
3588     if (bits == 0)
3589         return;
3590     int cnt = static_cast<int>(count_ones(bits));
3591 
3592     // Do the first 8 all together
3593     for (int i=0; i<8; i++) {
3594       this->tail[i] = idx + trailing_zeroes(bits);
3595       bits = clear_lowest_bit(bits);
3596     }
3597 
3598     // Do the next 8 all together (we hope in most cases it won't happen at all
3599     // and the branch is easily predicted).
3600     if (simdjson_unlikely(cnt > 8)) {
3601       for (int i=8; i<16; i++) {
3602         this->tail[i] = idx + trailing_zeroes(bits);
3603         bits = clear_lowest_bit(bits);
3604       }
3605 
3606       // Most files don't have 16+ structurals per block, so we take several basically guaranteed
3607       // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
3608       // or the start of a value ("abc" true 123) every four characters.
3609       if (simdjson_unlikely(cnt > 16)) {
3610         int i = 16;
3611         do {
3612           this->tail[i] = idx + trailing_zeroes(bits);
3613           bits = clear_lowest_bit(bits);
3614           i++;
3615         } while (i < cnt);
3616       }
3617     }
3618 
3619     this->tail += cnt;
3620   }
3621 };
3622 
class json_structural_indexer {
public:
  /**
   * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
   *
   * @param partial Setting the partial parameter to true allows the find_structural_bits to
   *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
   *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
   */
  template<size_t STEP_SIZE>
  static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;

private:
  // Builds an indexer that writes structural positions starting at structural_indexes.
  simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes);
  // Scans one STEP_SIZE-byte block and advances the reader.
  template<size_t STEP_SIZE>
  simdjson_really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
  // Handles one scanned input located at byte offset idx.
  simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
  // Flushes the pending structurals, reports errors and finalizes the parser's index.
  simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);

  // Finds strings, operators and scalars; carries state across inputs.
  json_scanner scanner{};
  // Fed every input by next(); its accumulated errors are returned by finish().
  utf8_checker checker{};
  // Turns structural bitmasks into entries of the structural index buffer.
  bit_indexer indexer;
  // Structural bitmask of the previous input; it is written out one iteration late.
  uint64_t prev_structurals = 0;
  // Accumulates bits for bytes <= 0x1F found inside strings; non-zero means UNESCAPED_CHARS.
  uint64_t unescaped_chars_error = 0;
};
3648 
json_structural_indexer(uint32_t * structural_indexes)3649 simdjson_really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
3650 
// Returns len shortened so that the buffer does not end in the middle of a multi-byte
// UTF-8 character. Only the last three bytes (or fewer, for short buffers) are inspected.
simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  // Lead byte of a 2-, 3- or 4-byte character with only 1 byte left.
  if (len >= 1 && buf[len-1] >= 0xC0) { return len-1; }
  // Lead byte of a 3- or 4-byte character with only 2 bytes left.
  if (len >= 2 && buf[len-2] >= 0xE0) { return len-2; }
  // Lead byte of a 4-byte character with only 3 bytes left.
  if (len >= 3 && buf[len-3] >= 0xF0) { return len-3; }
  return len;
}
3671 
3672 //
3673 // PERF NOTES:
3674 // We pipe 2 inputs through these stages:
3675 // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
3677 // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
3678 //    The output of step 1 depends entirely on this information. These functions don't quite use
3679 //    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
3680 //    at a time. The second input's scans has some dependency on the first ones finishing it, but
3681 //    they can make a lot of progress before they need that information.
3682 // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
3683 //    to finish: utf-8 checks and generating the output from the last iteration.
3684 //
3685 // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
3686 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
3687 // workout.
3688 //
3689 template<size_t STEP_SIZE>
index(const uint8_t * buf,size_t len,dom_parser_implementation & parser,bool partial)3690 error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
3691   if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
3692   if (partial) { len = trim_partial_utf8(buf, len); }
3693 
3694   buf_block_reader<STEP_SIZE> reader(buf, len);
3695   json_structural_indexer indexer(parser.structural_indexes.get());
3696 
3697   // Read all but the last block
3698   while (reader.has_full_block()) {
3699     indexer.step<STEP_SIZE>(reader.full_block(), reader);
3700   }
3701 
3702   // Take care of the last block (will always be there unless file is empty)
3703   uint8_t block[STEP_SIZE];
3704   if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
3705   indexer.step<STEP_SIZE>(block, reader);
3706 
3707   return indexer.finish(parser, reader.block_index(), len, partial);
3708 }
3709 
3710 template<>
step(const uint8_t * block,buf_block_reader<128> & reader)3711 simdjson_really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
3712   simd::simd8x64<uint8_t> in_1(block);
3713   simd::simd8x64<uint8_t> in_2(block+64);
3714   json_block block_1 = scanner.next(in_1);
3715   json_block block_2 = scanner.next(in_2);
3716   this->next(in_1, block_1, reader.block_index());
3717   this->next(in_2, block_2, reader.block_index()+64);
3718   reader.advance();
3719 }
3720 
3721 template<>
step(const uint8_t * block,buf_block_reader<64> & reader)3722 simdjson_really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
3723   simd::simd8x64<uint8_t> in_1(block);
3724   json_block block_1 = scanner.next(in_1);
3725   this->next(in_1, block_1, reader.block_index());
3726   reader.advance();
3727 }
3728 
next(const simd::simd8x64<uint8_t> & in,const json_block & block,size_t idx)3729 simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
3730   uint64_t unescaped = in.lteq(0x1F);
3731   checker.check_next_input(in);
3732   indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
3733   prev_structurals = block.structural_start();
3734   unescaped_chars_error |= block.non_quote_inside_string(unescaped);
3735 }
3736 
finish(dom_parser_implementation & parser,size_t idx,size_t len,bool partial)3737 simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
3738   // Write out the final iteration's structurals
3739   indexer.write(uint32_t(idx-64), prev_structurals);
3740 
3741   error_code error = scanner.finish();
3742   // We deliberately break down the next expression so that it is
3743   // human readable.
3744   const bool should_we_exit =  partial ?
3745     ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
3746     : (error != SUCCESS); // if partial is false, we must have SUCCESS
3747   const bool have_unclosed_string = (error == UNCLOSED_STRING);
3748   if (simdjson_unlikely(should_we_exit)) { return error; }
3749 
3750   if (unescaped_chars_error) {
3751     return UNESCAPED_CHARS;
3752   }
3753 
3754   parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
3755   /***
3756    * This is related to https://github.com/simdjson/simdjson/issues/906
3757    * Basically, we want to make sure that if the parsing continues beyond the last (valid)
3758    * structural character, it quickly stops.
3759    * Only three structural characters can be repeated without triggering an error in JSON:  [,] and }.
3760    * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
3761    * continues, then it must be [,] or }.
3762    * Suppose it is ] or }. We backtrack to the first character, what could it be that would
3763    * not trigger an error? It could be ] or } but no, because you can't start a document that way.
3764    * It can't be a comma, a colon or any simple value. So the only way we could continue is
3765    * if the repeated character is [. But if so, the document must start with [. But if the document
3766    * starts with [, it should end with ]. If we enforce that rule, then we would get
3767    * ][[ which is invalid.
3768    **/
3769   parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
3770   parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
3771   parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
3772   parser.next_structural_index = 0;
3773   // a valid JSON file cannot have zero structural indexes - we should have found something
3774   if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
3775     return EMPTY;
3776   }
3777   if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
3778     return UNEXPECTED_ERROR;
3779   }
3780   if (partial) {
3781     // If we have an unclosed string, then the last structural
3782     // will be the quote and we want to make sure to omit it.
3783     if(have_unclosed_string) {
3784       parser.n_structural_indexes--;
3785       // a valid JSON file cannot have zero structural indexes - we should have found something
3786       if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
3787     }
3788     auto new_structural_indexes = find_next_document_index(parser);
3789     if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
3790       return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
3791     }
3792     parser.n_structural_indexes = new_structural_indexes;
3793   }
3794   checker.check_eof();
3795   return checker.errors();
3796 }
3797 
3798 } // namespace stage1
3799 } // unnamed namespace
3800 } // namespace arm64
3801 } // namespace simdjson
3802 /* end file src/generic/stage1/json_structural_indexer.h */
3803 /* begin file src/generic/stage1/utf8_validator.h */
3804 namespace simdjson {
3805 namespace arm64 {
3806 namespace {
3807 namespace stage1 {
3808 
3809 /**
3810  * Validates that the string is actual UTF-8.
3811  */
3812 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)3813 bool generic_validate_utf8(const uint8_t * input, size_t length) {
3814     checker c{};
3815     buf_block_reader<64> reader(input, length);
3816     while (reader.has_full_block()) {
3817       simd::simd8x64<uint8_t> in(reader.full_block());
3818       c.check_next_input(in);
3819       reader.advance();
3820     }
3821     uint8_t block[64]{};
3822     reader.get_remainder(block);
3823     simd::simd8x64<uint8_t> in(block);
3824     c.check_next_input(in);
3825     reader.advance();
3826     c.check_eof();
3827     return c.errors() == error_code::SUCCESS;
3828 }
3829 
generic_validate_utf8(const char * input,size_t length)3830 bool generic_validate_utf8(const char * input, size_t length) {
3831     return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
3832 }
3833 
3834 } // namespace stage1
3835 } // unnamed namespace
3836 } // namespace arm64
3837 } // namespace simdjson
3838 /* end file src/generic/stage1/utf8_validator.h */
3839 
3840 //
3841 // Stage 2
3842 //
3843 
3844 /* begin file src/generic/stage2/tape_builder.h */
3845 /* begin file src/generic/stage2/json_iterator.h */
3846 /* begin file src/generic/stage2/logger.h */
3847 // This is for an internal-only stage 2 specific logger.
3848 // Set LOG_ENABLED = true to log what stage 2 is doing!
3849 namespace simdjson {
3850 namespace arm64 {
3851 namespace {
3852 namespace logger {
3853 
3854   static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
3855 
// Logging is compiled in only when SIMDJSON_VERBOSE_LOGGING is set; every log function
// checks LOG_ENABLED before printing.
#if SIMDJSON_VERBOSE_LOGGING
  static constexpr const bool LOG_ENABLED = true;
#else
  static constexpr const bool LOG_ENABLED = false;
#endif
  // Column widths (in characters) of the log table.
  static constexpr const int LOG_EVENT_LEN = 20;        // event/title column
  static constexpr const int LOG_BUFFER_LEN = 30;       // bytes shown from the current position
  static constexpr const int LOG_SMALL_BUFFER_LEN = 10; // bytes shown from the next position
  static constexpr const int LOG_INDEX_LEN = 5;         // structural index column

  // Current nesting depth; log_line indents titles by two spaces per level.
  static int log_depth; // Not threadsafe. Log only.
3867 
3868   // Helper to turn unprintable or newline characters into spaces
printable_char(char c)3869   static simdjson_really_inline char printable_char(char c) {
3870     if (c >= 0x20) {
3871       return c;
3872     } else {
3873       return ' ';
3874     }
3875   }
3876 
3877   // Print the header and set up log_start
log_start()3878   static simdjson_really_inline void log_start() {
3879     if (LOG_ENABLED) {
3880       log_depth = 0;
3881       printf("\n");
3882       printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
3883       printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
3884     }
3885   }
3886 
log_string(const char * message)3887   simdjson_unused static simdjson_really_inline void log_string(const char *message) {
3888     if (LOG_ENABLED) {
3889       printf("%s\n", message);
3890     }
3891   }
3892 
3893   // Logs a single line from the stage 2 DOM parser
3894   template<typename S>
log_line(S & structurals,const char * title_prefix,const char * title,const char * detail)3895   static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
3896     if (LOG_ENABLED) {
3897       printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
3898       auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
3899       auto next_index = structurals.next_structural;
3900       auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>("                                                       ");
3901       auto next = &structurals.buf[*next_index];
3902       {
3903         // Print the next N characters in the buffer.
3904         printf("| ");
3905         // Otherwise, print the characters starting from the buffer position.
3906         // Print spaces for unprintable or newline characters.
3907         for (int i=0;i<LOG_BUFFER_LEN;i++) {
3908           printf("%c", printable_char(current[i]));
3909         }
3910         printf(" ");
3911         // Print the next N characters in the buffer.
3912         printf("| ");
3913         // Otherwise, print the characters starting from the buffer position.
3914         // Print spaces for unprintable or newline characters.
3915         for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
3916           printf("%c", printable_char(next[i]));
3917         }
3918         printf(" ");
3919       }
3920       if (current_index) {
3921         printf("| %*u ", LOG_INDEX_LEN, *current_index);
3922       } else {
3923         printf("| %-*s ", LOG_INDEX_LEN, "");
3924       }
3925       // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
3926       printf("| %-s ", detail);
3927       printf("|\n");
3928     }
3929   }
3930 
3931 } // namespace logger
3932 } // unnamed namespace
3933 } // namespace arm64
3934 } // namespace simdjson
3935 /* end file src/generic/stage2/logger.h */
3936 
3937 namespace simdjson {
3938 namespace arm64 {
3939 namespace {
3940 namespace stage2 {
3941 
3942 class json_iterator {
3943 public:
3944   const uint8_t* const buf;
3945   uint32_t *next_structural;
3946   dom_parser_implementation &dom_parser;
3947   uint32_t depth{0};
3948 
3949   /**
3950    * Walk the JSON document.
3951    *
3952    * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
3953    * the first parameter; some callbacks have other parameters as well:
3954    *
3955    * - visit_document_start() - at the beginning.
3956    * - visit_document_end() - at the end (if things were successful).
3957    *
3958    * - visit_array_start() - at the start `[` of a non-empty array.
3959    * - visit_array_end() - at the end `]` of a non-empty array.
3960    * - visit_empty_array() - when an empty array is encountered.
3961    *
3962    * - visit_object_end() - at the start `]` of a non-empty object.
3963    * - visit_object_start() - at the end `]` of a non-empty object.
3964    * - visit_empty_object() - when an empty object is encountered.
3965    * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
3966    *                                   guaranteed to point at the first quote of the string (`"key"`).
3967    * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
3968    * - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
3969    *
3970    * - increment_count(iter) - each time a value is found in an array or object.
3971    */
3972   template<bool STREAMING, typename V>
3973   simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;
3974 
3975   /**
3976    * Create an iterator capable of walking a JSON document.
3977    *
3978    * The document must have already passed through stage 1.
3979    */
3980   simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
3981 
3982   /**
3983    * Look at the next token.
3984    *
3985    * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
3986    *
3987    * They may include invalid JSON as well (such as `1.2.3` or `ture`).
3988    */
3989   simdjson_really_inline const uint8_t *peek() const noexcept;
3990   /**
3991    * Advance to the next token.
3992    *
3993    * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
3994    *
3995    * They may include invalid JSON as well (such as `1.2.3` or `ture`).
3996    */
3997   simdjson_really_inline const uint8_t *advance() noexcept;
3998   /**
3999    * Get the remaining length of the document, from the start of the current token.
4000    */
4001   simdjson_really_inline size_t remaining_len() const noexcept;
4002   /**
4003    * Check if we are at the end of the document.
4004    *
4005    * If this is true, there are no more tokens.
4006    */
4007   simdjson_really_inline bool at_eof() const noexcept;
4008   /**
4009    * Check if we are at the beginning of the document.
4010    */
4011   simdjson_really_inline bool at_beginning() const noexcept;
4012   simdjson_really_inline uint8_t last_structural() const noexcept;
4013 
4014   /**
4015    * Log that a value has been found.
4016    *
4017    * Set ENABLE_LOGGING=true in logger.h to see logging.
4018    */
4019   simdjson_really_inline void log_value(const char *type) const noexcept;
4020   /**
4021    * Log the start of a multipart value.
4022    *
4023    * Set ENABLE_LOGGING=true in logger.h to see logging.
4024    */
4025   simdjson_really_inline void log_start_value(const char *type) const noexcept;
4026   /**
4027    * Log the end of a multipart value.
4028    *
4029    * Set ENABLE_LOGGING=true in logger.h to see logging.
4030    */
4031   simdjson_really_inline void log_end_value(const char *type) const noexcept;
4032   /**
4033    * Log an error.
4034    *
4035    * Set ENABLE_LOGGING=true in logger.h to see logging.
4036    */
4037   simdjson_really_inline void log_error(const char *error) const noexcept;
4038 
4039   template<typename V>
4040   simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
4041   template<typename V>
4042   simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
4043 };
4044 
4045 template<bool STREAMING, typename V>
walk_document(V & visitor)4046 simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
4047   logger::log_start();
4048 
4049   //
4050   // Start the document
4051   //
4052   if (at_eof()) { return EMPTY; }
4053   log_start_value("document");
4054   SIMDJSON_TRY( visitor.visit_document_start(*this) );
4055 
4056   //
4057   // Read first value
4058   //
4059   {
4060     auto value = advance();
4061 
4062     // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
4063     // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
4064     if (!STREAMING) {
4065       switch (*value) {
4066         case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
4067         case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
4068       }
4069     }
4070 
4071     switch (*value) {
4072       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
4073       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
4074       default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
4075     }
4076   }
4077   goto document_end;
4078 
4079 //
4080 // Object parser states
4081 //
4082 object_begin:
4083   log_start_value("object");
4084   depth++;
4085   if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
4086   dom_parser.is_array[depth] = false;
4087   SIMDJSON_TRY( visitor.visit_object_start(*this) );
4088 
4089   {
4090     auto key = advance();
4091     if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
4092     SIMDJSON_TRY( visitor.increment_count(*this) );
4093     SIMDJSON_TRY( visitor.visit_key(*this, key) );
4094   }
4095 
4096 object_field:
4097   if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
4098   {
4099     auto value = advance();
4100     switch (*value) {
4101       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
4102       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
4103       default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
4104     }
4105   }
4106 
4107 object_continue:
4108   switch (*advance()) {
4109     case ',':
4110       SIMDJSON_TRY( visitor.increment_count(*this) );
4111       {
4112         auto key = advance();
4113         if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
4114         SIMDJSON_TRY( visitor.visit_key(*this, key) );
4115       }
4116       goto object_field;
4117     case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
4118     default: log_error("No comma between object fields"); return TAPE_ERROR;
4119   }
4120 
4121 scope_end:
4122   depth--;
4123   if (depth == 0) { goto document_end; }
4124   if (dom_parser.is_array[depth]) { goto array_continue; }
4125   goto object_continue;
4126 
4127 //
4128 // Array parser states
4129 //
4130 array_begin:
4131   log_start_value("array");
4132   depth++;
4133   if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
4134   dom_parser.is_array[depth] = true;
4135   SIMDJSON_TRY( visitor.visit_array_start(*this) );
4136   SIMDJSON_TRY( visitor.increment_count(*this) );
4137 
4138 array_value:
4139   {
4140     auto value = advance();
4141     switch (*value) {
4142       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
4143       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
4144       default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
4145     }
4146   }
4147 
4148 array_continue:
4149   switch (*advance()) {
4150     case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
4151     case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
4152     default: log_error("Missing comma between array values"); return TAPE_ERROR;
4153   }
4154 
4155 document_end:
4156   log_end_value("document");
4157   SIMDJSON_TRY( visitor.visit_document_end(*this) );
4158 
4159   dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
4160 
4161   // If we didn't make it to the end, it's an error
4162   if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
4163     log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
4164     return TAPE_ERROR;
4165   }
4166 
4167   return SUCCESS;
4168 
4169 } // walk_document()
4170 
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)4171 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
4172   : buf{_dom_parser.buf},
4173     next_structural{&_dom_parser.structural_indexes[start_structural_index]},
4174     dom_parser{_dom_parser} {
4175 }
4176 
peek() const4177 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
4178   return &buf[*(next_structural)];
4179 }
advance()4180 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
4181   return &buf[*(next_structural++)];
4182 }
remaining_len() const4183 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
4184   return dom_parser.len - *(next_structural-1);
4185 }
4186 
at_eof() const4187 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
4188   return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
4189 }
at_beginning() const4190 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
4191   return next_structural == dom_parser.structural_indexes.get();
4192 }
last_structural() const4193 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
4194   return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
4195 }
4196 
log_value(const char * type) const4197 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
4198   logger::log_line(*this, "", type, "");
4199 }
4200 
log_start_value(const char * type) const4201 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
4202   logger::log_line(*this, "+", type, "");
4203   if (logger::LOG_ENABLED) { logger::log_depth++; }
4204 }
4205 
log_end_value(const char * type) const4206 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
4207   if (logger::LOG_ENABLED) { logger::log_depth--; }
4208   logger::log_line(*this, "-", type, "");
4209 }
4210 
log_error(const char * error) const4211 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
4212   logger::log_line(*this, "", "ERROR", error);
4213 }
4214 
4215 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)4216 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
4217   switch (*value) {
4218     case '"': return visitor.visit_root_string(*this, value);
4219     case 't': return visitor.visit_root_true_atom(*this, value);
4220     case 'f': return visitor.visit_root_false_atom(*this, value);
4221     case 'n': return visitor.visit_root_null_atom(*this, value);
4222     case '-':
4223     case '0': case '1': case '2': case '3': case '4':
4224     case '5': case '6': case '7': case '8': case '9':
4225       return visitor.visit_root_number(*this, value);
4226     default:
4227       log_error("Document starts with a non-value character");
4228       return TAPE_ERROR;
4229   }
4230 }
4231 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)4232 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
4233   switch (*value) {
4234     case '"': return visitor.visit_string(*this, value);
4235     case 't': return visitor.visit_true_atom(*this, value);
4236     case 'f': return visitor.visit_false_atom(*this, value);
4237     case 'n': return visitor.visit_null_atom(*this, value);
4238     case '-':
4239     case '0': case '1': case '2': case '3': case '4':
4240     case '5': case '6': case '7': case '8': case '9':
4241       return visitor.visit_number(*this, value);
4242     default:
4243       log_error("Non-value found when value was expected!");
4244       return TAPE_ERROR;
4245   }
4246 }
4247 
4248 } // namespace stage2
4249 } // unnamed namespace
4250 } // namespace arm64
4251 } // namespace simdjson
4252 /* end file src/generic/stage2/json_iterator.h */
4253 /* begin file src/generic/stage2/tape_writer.h */
4254 namespace simdjson {
4255 namespace arm64 {
4256 namespace {
4257 namespace stage2 {
4258 
4259 struct tape_writer {
4260   /** The next place to write to tape */
4261   uint64_t *next_tape_loc;
4262 
4263   /** Write a signed 64-bit value to tape. */
4264   simdjson_really_inline void append_s64(int64_t value) noexcept;
4265 
4266   /** Write an unsigned 64-bit value to tape. */
4267   simdjson_really_inline void append_u64(uint64_t value) noexcept;
4268 
4269   /** Write a double value to tape. */
4270   simdjson_really_inline void append_double(double value) noexcept;
4271 
4272   /**
4273    * Append a tape entry (an 8-bit type,and 56 bits worth of value).
4274    */
4275   simdjson_really_inline void append(uint64_t val, internal::tape_type t) noexcept;
4276 
4277   /**
4278    * Skip the current tape entry without writing.
4279    *
4280    * Used to skip the start of the container, since we'll come back later to fill it in when the
4281    * container ends.
4282    */
4283   simdjson_really_inline void skip() noexcept;
4284 
4285   /**
4286    * Skip the number of tape entries necessary to write a large u64 or i64.
4287    */
4288   simdjson_really_inline void skip_large_integer() noexcept;
4289 
4290   /**
4291    * Skip the number of tape entries necessary to write a double.
4292    */
4293   simdjson_really_inline void skip_double() noexcept;
4294 
4295   /**
4296    * Write a value to a known location on tape.
4297    *
4298    * Used to go back and write out the start of a container after the container ends.
4299    */
4300   simdjson_really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
4301 
4302 private:
4303   /**
4304    * Append both the tape entry, and a supplementary value following it. Used for types that need
4305    * all 64 bits, such as double and uint64_t.
4306    */
4307   template<typename T>
4308   simdjson_really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
4309 }; // struct number_writer
4310 
append_s64(int64_t value)4311 simdjson_really_inline void tape_writer::append_s64(int64_t value) noexcept {
4312   append2(0, value, internal::tape_type::INT64);
4313 }
4314 
append_u64(uint64_t value)4315 simdjson_really_inline void tape_writer::append_u64(uint64_t value) noexcept {
4316   append(0, internal::tape_type::UINT64);
4317   *next_tape_loc = value;
4318   next_tape_loc++;
4319 }
4320 
4321 /** Write a double value to tape. */
append_double(double value)4322 simdjson_really_inline void tape_writer::append_double(double value) noexcept {
4323   append2(0, value, internal::tape_type::DOUBLE);
4324 }
4325 
skip()4326 simdjson_really_inline void tape_writer::skip() noexcept {
4327   next_tape_loc++;
4328 }
4329 
skip_large_integer()4330 simdjson_really_inline void tape_writer::skip_large_integer() noexcept {
4331   next_tape_loc += 2;
4332 }
4333 
skip_double()4334 simdjson_really_inline void tape_writer::skip_double() noexcept {
4335   next_tape_loc += 2;
4336 }
4337 
append(uint64_t val,internal::tape_type t)4338 simdjson_really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
4339   *next_tape_loc = val | ((uint64_t(char(t))) << 56);
4340   next_tape_loc++;
4341 }
4342 
4343 template<typename T>
append2(uint64_t val,T val2,internal::tape_type t)4344 simdjson_really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
4345   append(val, t);
4346   static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
4347   memcpy(next_tape_loc, &val2, sizeof(val2));
4348   next_tape_loc++;
4349 }
4350 
write(uint64_t & tape_loc,uint64_t val,internal::tape_type t)4351 simdjson_really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
4352   tape_loc = val | ((uint64_t(char(t))) << 56);
4353 }
4354 
4355 } // namespace stage2
4356 } // unnamed namespace
4357 } // namespace arm64
4358 } // namespace simdjson
4359 /* end file src/generic/stage2/tape_writer.h */
4360 
4361 namespace simdjson {
4362 namespace arm64 {
4363 namespace {
4364 namespace stage2 {
4365 
4366 struct tape_builder {
4367   template<bool STREAMING>
4368   simdjson_warn_unused static simdjson_really_inline error_code parse_document(
4369     dom_parser_implementation &dom_parser,
4370     dom::document &doc) noexcept;
4371 
4372   /** Called when a non-empty document starts. */
4373   simdjson_warn_unused simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
4374   /** Called when a non-empty document ends without error. */
4375   simdjson_warn_unused simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;
4376 
4377   /** Called when a non-empty array starts. */
4378   simdjson_warn_unused simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
4379   /** Called when a non-empty array ends. */
4380   simdjson_warn_unused simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
4381   /** Called when an empty array is found. */
4382   simdjson_warn_unused simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;
4383 
4384   /** Called when a non-empty object starts. */
4385   simdjson_warn_unused simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
4386   /**
4387    * Called when a key in a field is encountered.
4388    *
4389    * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
4390    * will be called after this with the field value.
4391    */
4392   simdjson_warn_unused simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
4393   /** Called when a non-empty object ends. */
4394   simdjson_warn_unused simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
4395   /** Called when an empty object is found. */
4396   simdjson_warn_unused simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;
4397 
4398   /**
4399    * Called when a string, number, boolean or null is found.
4400    */
4401   simdjson_warn_unused simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
4402   /**
4403    * Called when a string, number, boolean or null is found at the top level of a document (i.e.
4404    * when there is no array or object and the entire document is a single string, number, boolean or
4405    * null.
4406    *
4407    * This is separate from primitive() because simdjson's normal primitive parsing routines assume
4408    * there is at least one more token after the value, which is only true in an array or object.
4409    */
4410   simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
4411 
4412   simdjson_warn_unused simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
4413   simdjson_warn_unused simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
4414   simdjson_warn_unused simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
4415   simdjson_warn_unused simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
4416   simdjson_warn_unused simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
4417 
4418   simdjson_warn_unused simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
4419   simdjson_warn_unused simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
4420   simdjson_warn_unused simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
4421   simdjson_warn_unused simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
4422   simdjson_warn_unused simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
4423 
4424   /** Called each time a new field or element in an array or object is found. */
4425   simdjson_warn_unused simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;
4426 
4427   /** Next location to write to tape */
4428   tape_writer tape;
4429 private:
4430   /** Next write location in the string buf for stage 2 parsing */
4431   uint8_t *current_string_buf_loc;
4432 
4433   simdjson_really_inline tape_builder(dom::document &doc) noexcept;
4434 
4435   simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
4436   simdjson_really_inline void start_container(json_iterator &iter) noexcept;
4437   simdjson_warn_unused simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
4438   simdjson_warn_unused simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
4439   simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
4440   simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
4441 }; // class tape_builder
4442 
4443 template<bool STREAMING>
parse_document(dom_parser_implementation & dom_parser,dom::document & doc)4444 simdjson_warn_unused simdjson_really_inline error_code tape_builder::parse_document(
4445     dom_parser_implementation &dom_parser,
4446     dom::document &doc) noexcept {
4447   dom_parser.doc = &doc;
4448   json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
4449   tape_builder builder(doc);
4450   return iter.walk_document<STREAMING>(builder);
4451 }
4452 
visit_root_primitive(json_iterator & iter,const uint8_t * value)4453 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
4454   return iter.visit_root_primitive(*this, value);
4455 }
visit_primitive(json_iterator & iter,const uint8_t * value)4456 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
4457   return iter.visit_primitive(*this, value);
4458 }
visit_empty_object(json_iterator & iter)4459 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
4460   return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
4461 }
visit_empty_array(json_iterator & iter)4462 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
4463   return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
4464 }
4465 
visit_document_start(json_iterator & iter)4466 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
4467   start_container(iter);
4468   return SUCCESS;
4469 }
visit_object_start(json_iterator & iter)4470 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
4471   start_container(iter);
4472   return SUCCESS;
4473 }
visit_array_start(json_iterator & iter)4474 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
4475   start_container(iter);
4476   return SUCCESS;
4477 }
4478 
visit_object_end(json_iterator & iter)4479 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
4480   return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
4481 }
visit_array_end(json_iterator & iter)4482 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
4483   return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
4484 }
visit_document_end(json_iterator & iter)4485 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
4486   constexpr uint32_t start_tape_index = 0;
4487   tape.append(start_tape_index, internal::tape_type::ROOT);
4488   tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
4489   return SUCCESS;
4490 }
visit_key(json_iterator & iter,const uint8_t * key)4491 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
4492   return visit_string(iter, key, true);
4493 }
4494 
increment_count(json_iterator & iter)4495 simdjson_warn_unused simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
4496   iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
4497   return SUCCESS;
4498 }
4499 
tape_builder(dom::document & doc)4500 simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
4501 
visit_string(json_iterator & iter,const uint8_t * value,bool key)4502 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
4503   iter.log_value(key ? "key" : "string");
4504   uint8_t *dst = on_start_string(iter);
4505   dst = stringparsing::parse_string(value+1, dst);
4506   if (dst == nullptr) {
4507     iter.log_error("Invalid escape in string");
4508     return STRING_ERROR;
4509   }
4510   on_end_string(dst);
4511   return SUCCESS;
4512 }
4513 
visit_root_string(json_iterator & iter,const uint8_t * value)4514 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
4515   return visit_string(iter, value);
4516 }
4517 
visit_number(json_iterator & iter,const uint8_t * value)4518 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
4519   iter.log_value("number");
4520   return numberparsing::parse_number(value, tape);
4521 }
4522 
visit_root_number(json_iterator & iter,const uint8_t * value)4523 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
4524   //
4525   // We need to make a copy to make sure that the string is space terminated.
4526   // This is not about padding the input, which should already padded up
4527   // to len + SIMDJSON_PADDING. However, we have no control at this stage
4528   // on how the padding was done. What if the input string was padded with nulls?
4529   // It is quite common for an input string to have an extra null character (C string).
4530   // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
4531   // document, but the string "9\0" by itself is fine. So we make a copy and
4532   // pad the input with spaces when we know that there is just one input element.
4533   // This copy is relatively expensive, but it will almost never be called in
4534   // practice unless you are in the strange scenario where you have many JSON
4535   // documents made of single atoms.
4536   //
4537   std::unique_ptr<uint8_t[]>copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
4538   if (copy.get() == nullptr) { return MEMALLOC; }
4539   std::memcpy(copy.get(), value, iter.remaining_len());
4540   std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
4541   error_code error = visit_number(iter, copy.get());
4542   return error;
4543 }
4544 
visit_true_atom(json_iterator & iter,const uint8_t * value)4545 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
4546   iter.log_value("true");
4547   if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
4548   tape.append(0, internal::tape_type::TRUE_VALUE);
4549   return SUCCESS;
4550 }
4551 
visit_root_true_atom(json_iterator & iter,const uint8_t * value)4552 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
4553   iter.log_value("true");
4554   if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
4555   tape.append(0, internal::tape_type::TRUE_VALUE);
4556   return SUCCESS;
4557 }
4558 
visit_false_atom(json_iterator & iter,const uint8_t * value)4559 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
4560   iter.log_value("false");
4561   if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
4562   tape.append(0, internal::tape_type::FALSE_VALUE);
4563   return SUCCESS;
4564 }
4565 
visit_root_false_atom(json_iterator & iter,const uint8_t * value)4566 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
4567   iter.log_value("false");
4568   if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
4569   tape.append(0, internal::tape_type::FALSE_VALUE);
4570   return SUCCESS;
4571 }
4572 
visit_null_atom(json_iterator & iter,const uint8_t * value)4573 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
4574   iter.log_value("null");
4575   if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
4576   tape.append(0, internal::tape_type::NULL_VALUE);
4577   return SUCCESS;
4578 }
4579 
visit_root_null_atom(json_iterator & iter,const uint8_t * value)4580 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
4581   iter.log_value("null");
4582   if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
4583   tape.append(0, internal::tape_type::NULL_VALUE);
4584   return SUCCESS;
4585 }
4586 
4587 // private:
4588 
next_tape_index(json_iterator & iter) const4589 simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
4590   return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
4591 }
4592 
empty_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)4593 simdjson_warn_unused simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
4594   auto start_index = next_tape_index(iter);
4595   tape.append(start_index+2, start);
4596   tape.append(start_index, end);
4597   return SUCCESS;
4598 }
4599 
start_container(json_iterator & iter)4600 simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
4601   iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
4602   iter.dom_parser.open_containers[iter.depth].count = 0;
4603   tape.skip(); // We don't actually *write* the start element until the end.
4604 }
4605 
end_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)4606 simdjson_warn_unused simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
4607   // Write the ending tape element, pointing at the start location
4608   const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
4609   tape.append(start_tape_index, end);
4610   // Write the start tape element, pointing at the end location (and including count)
4611   // count can overflow if it exceeds 24 bits... so we saturate
4612   // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
4613   const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
4614   const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
4615   tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
4616   return SUCCESS;
4617 }
4618 
on_start_string(json_iterator & iter)4619 simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
4620   // we advance the point, accounting for the fact that we have a NULL termination
4621   tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
4622   return current_string_buf_loc + sizeof(uint32_t);
4623 }
4624 
on_end_string(uint8_t * dst)4625 simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
4626   uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
4627   // TODO check for overflow in case someone has a crazy string (>=4GB?)
4628   // But only add the overflow check when the document itself exceeds 4GB
4629   // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
4630   memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
4631   // NULL termination is still handy if you expect all your strings to
4632   // be NULL terminated? It comes at a small cost
4633   *dst = 0;
4634   current_string_buf_loc = dst + 1;
4635 }
4636 
4637 } // namespace stage2
4638 } // unnamed namespace
4639 } // namespace arm64
4640 } // namespace simdjson
4641 /* end file src/generic/stage2/tape_builder.h */
4642 
4643 //
4644 // Implementation-specific overrides
4645 //
4646 namespace simdjson {
4647 namespace arm64 {
4648 namespace {
4649 namespace stage1 {
4650 
find_escaped(uint64_t backslash)4651 simdjson_really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
4652   // On ARM, we don't short-circuit this if there are no backslashes, because the branch gives us no
4653   // benefit and therefore makes things worse.
4654   // if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
4655   return find_escaped_branchless(backslash);
4656 }
4657 
4658 } // namespace stage1
4659 } // unnamed namespace
4660 
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const4661 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
4662   return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
4663 }
4664 
stage1(const uint8_t * _buf,size_t _len,bool streaming)4665 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
4666   this->buf = _buf;
4667   this->len = _len;
4668   return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
4669 }
4670 
validate_utf8(const char * buf,size_t len) const4671 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
4672   return arm64::stage1::generic_validate_utf8(buf,len);
4673 }
4674 
stage2(dom::document & _doc)4675 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
4676   return stage2::tape_builder::parse_document<false>(*this, _doc);
4677 }
4678 
stage2_next(dom::document & _doc)4679 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
4680   return stage2::tape_builder::parse_document<true>(*this, _doc);
4681 }
4682 
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)4683 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
4684   auto error = stage1(_buf, _len, false);
4685   if (error) { return error; }
4686   return stage2(_doc);
4687 }
4688 
4689 } // namespace arm64
4690 } // namespace simdjson
4691 
4692 /* begin file include/simdjson/arm64/end.h */
4693 /* end file include/simdjson/arm64/end.h */
4694 /* end file src/arm64/dom_parser_implementation.cpp */
4695 #endif
4696 #if SIMDJSON_IMPLEMENTATION_FALLBACK
4697 /* begin file src/fallback/implementation.cpp */
4698 /* begin file include/simdjson/fallback/begin.h */
4699 // redefining SIMDJSON_IMPLEMENTATION to "fallback"
4700 // #define SIMDJSON_IMPLEMENTATION fallback
4701 /* end file include/simdjson/fallback/begin.h */
4702 
4703 namespace simdjson {
4704 namespace fallback {
4705 
create_dom_parser_implementation(size_t capacity,size_t max_depth,std::unique_ptr<internal::dom_parser_implementation> & dst) const4706 simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
4707   size_t capacity,
4708   size_t max_depth,
4709   std::unique_ptr<internal::dom_parser_implementation>& dst
4710 ) const noexcept {
4711   dst.reset( new (std::nothrow) dom_parser_implementation() );
4712   if (!dst) { return MEMALLOC; }
4713   dst->set_capacity(capacity);
4714   dst->set_max_depth(max_depth);
4715   return SUCCESS;
4716 }
4717 
4718 } // namespace fallback
4719 } // namespace simdjson
4720 
4721 /* begin file include/simdjson/fallback/end.h */
4722 /* end file include/simdjson/fallback/end.h */
4723 /* end file src/fallback/implementation.cpp */
4724 /* begin file src/fallback/dom_parser_implementation.cpp */
4725 /* begin file include/simdjson/fallback/begin.h */
4726 // redefining SIMDJSON_IMPLEMENTATION to "fallback"
4727 // #define SIMDJSON_IMPLEMENTATION fallback
4728 /* end file include/simdjson/fallback/begin.h */
4729 
4730 //
4731 // Stage 1
4732 //
4733 /* begin file src/generic/stage1/find_next_document_index.h */
4734 namespace simdjson {
4735 namespace fallback {
4736 namespace {
4737 
4738 /**
4739   * This algorithm is used to quickly identify the last structural position that
4740   * makes up a complete document.
4741   *
4742   * It does this by going backwards and finding the last *document boundary* (a
4743   * place where one value follows another without a comma between them). If the
4744   * last document (the characters after the boundary) has an equal number of
4745   * start and end brackets, it is considered complete.
4746   *
4747   * Simply put, we iterate over the structural characters, starting from
4748   * the end. We consider that we found the end of a JSON document when the
4749   * first element of the pair is NOT one of these characters: '{' '[' ';' ','
4750   * and when the second element is NOT one of these characters: '}' '}' ';' ','.
4751   *
4752   * This simple comparison works most of the time, but it does not cover cases
4753   * where the batch's structural indexes contain a perfect amount of documents.
4754   * In such a case, we do not have access to the structural index which follows
4755   * the last document, therefore, we do not have access to the second element in
4756   * the pair, and that means we cannot identify the last document. To fix this
4757   * issue, we keep a count of the open and closed curly/square braces we found
4758   * while searching for the pair. When we find a pair AND the count of open and
4759   * closed curly/square braces is the same, we know that we just passed a
4760   * complete document, therefore the last json buffer location is the end of the
4761   * batch.
4762   */
find_next_document_index(dom_parser_implementation & parser)4763 simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
4764   // TODO don't count separately, just figure out depth
4765   auto arr_cnt = 0;
4766   auto obj_cnt = 0;
4767   for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
4768     auto idxb = parser.structural_indexes[i];
4769     switch (parser.buf[idxb]) {
4770     case ':':
4771     case ',':
4772       continue;
4773     case '}':
4774       obj_cnt--;
4775       continue;
4776     case ']':
4777       arr_cnt--;
4778       continue;
4779     case '{':
4780       obj_cnt++;
4781       break;
4782     case '[':
4783       arr_cnt++;
4784       break;
4785     }
4786     auto idxa = parser.structural_indexes[i - 1];
4787     switch (parser.buf[idxa]) {
4788     case '{':
4789     case '[':
4790     case ':':
4791     case ',':
4792       continue;
4793     }
4794     // Last document is complete, so the next document will appear after!
4795     if (!arr_cnt && !obj_cnt) {
4796       return parser.n_structural_indexes;
4797     }
4798     // Last document is incomplete; mark the document at i + 1 as the next one
4799     return i;
4800   }
4801   return 0;
4802 }
4803 
4804 } // unnamed namespace
4805 } // namespace fallback
4806 } // namespace simdjson
4807 /* end file src/generic/stage1/find_next_document_index.h */
4808 
4809 namespace simdjson {
4810 namespace fallback {
4811 namespace {
4812 namespace stage1 {
4813 
4814 class structural_scanner {
4815 public:
4816 
structural_scanner(dom_parser_implementation & _parser,bool _partial)4817 simdjson_really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial)
4818   : buf{_parser.buf},
4819     next_structural_index{_parser.structural_indexes.get()},
4820     parser{_parser},
4821     len{static_cast<uint32_t>(_parser.len)},
4822     partial{_partial} {
4823 }
4824 
add_structural()4825 simdjson_really_inline void add_structural() {
4826   *next_structural_index = idx;
4827   next_structural_index++;
4828 }
4829 
is_continuation(uint8_t c)4830 simdjson_really_inline bool is_continuation(uint8_t c) {
4831   return (c & 0b11000000) == 0b10000000;
4832 }
4833 
validate_utf8_character()4834 simdjson_really_inline void validate_utf8_character() {
4835   // Continuation
4836   if (simdjson_unlikely((buf[idx] & 0b01000000) == 0)) {
4837     // extra continuation
4838     error = UTF8_ERROR;
4839     idx++;
4840     return;
4841   }
4842 
4843   // 2-byte
4844   if ((buf[idx] & 0b00100000) == 0) {
4845     // missing continuation
4846     if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
4847       if (idx+1 > len && partial) { idx = len; return; }
4848       error = UTF8_ERROR;
4849       idx++;
4850       return;
4851     }
4852     // overlong: 1100000_ 10______
4853     if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
4854     idx += 2;
4855     return;
4856   }
4857 
4858   // 3-byte
4859   if ((buf[idx] & 0b00010000) == 0) {
4860     // missing continuation
4861     if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
4862       if (idx+2 > len && partial) { idx = len; return; }
4863       error = UTF8_ERROR;
4864       idx++;
4865       return;
4866     }
4867     // overlong: 11100000 100_____ ________
4868     if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
4869     // surrogates: U+D800-U+DFFF 11101101 101_____
4870     if (buf[idx] == 0b11101101 && buf[idx+1] >= 0b10100000) { error = UTF8_ERROR; }
4871     idx += 3;
4872     return;
4873   }
4874 
4875   // 4-byte
4876   // missing continuation
4877   if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
4878     if (idx+2 > len && partial) { idx = len; return; }
4879     error = UTF8_ERROR;
4880     idx++;
4881     return;
4882   }
4883   // overlong: 11110000 1000____ ________ ________
4884   if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
4885   // too large: > U+10FFFF:
4886   // 11110100 (1001|101_)____
4887   // 1111(1___|011_|0101) 10______
4888   // also includes 5, 6, 7 and 8 byte characters:
4889   // 11111___
4890   if (buf[idx] == 0b11110100 && buf[idx+1] >= 0b10010000) { error = UTF8_ERROR; }
4891   if (buf[idx] >= 0b11110101) { error = UTF8_ERROR; }
4892   idx += 4;
4893 }
4894 
4895 // Returns true if the string is unclosed.
validate_string()4896 simdjson_really_inline bool validate_string() {
4897   idx++; // skip first quote
4898   while (idx < len && buf[idx] != '"') {
4899     if (buf[idx] == '\\') {
4900       idx += 2;
4901     } else if (simdjson_unlikely(buf[idx] & 0b10000000)) {
4902       validate_utf8_character();
4903     } else {
4904       if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; }
4905       idx++;
4906     }
4907   }
4908   if (idx >= len) { return true; }
4909   return false;
4910 }
4911 
is_whitespace_or_operator(uint8_t c)4912 simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
4913   switch (c) {
4914     case '{': case '}': case '[': case ']': case ',': case ':':
4915     case ' ': case '\r': case '\n': case '\t':
4916       return true;
4917     default:
4918       return false;
4919   }
4920 }
4921 
4922 //
4923 // Parse the entire input in STEP_SIZE-byte chunks.
4924 //
scan()4925 simdjson_really_inline error_code scan() {
4926   bool unclosed_string = false;
4927   for (;idx<len;idx++) {
4928     switch (buf[idx]) {
4929       // String
4930       case '"':
4931         add_structural();
4932         unclosed_string |= validate_string();
4933         break;
4934       // Operator
4935       case '{': case '}': case '[': case ']': case ',': case ':':
4936         add_structural();
4937         break;
4938       // Whitespace
4939       case ' ': case '\r': case '\n': case '\t':
4940         break;
4941       // Primitive or invalid character (invalid characters will be checked in stage 2)
4942       default:
4943         // Anything else, add the structural and go until we find the next one
4944         add_structural();
4945         while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
4946           idx++;
4947         };
4948         break;
4949     }
4950   }
4951   *next_structural_index = len;
4952   // We pad beyond.
4953   // https://github.com/simdjson/simdjson/issues/906
4954   next_structural_index[1] = len;
4955   next_structural_index[2] = 0;
4956   parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
4957   if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
4958   parser.next_structural_index = 0;
4959   if (partial) {
4960     if(unclosed_string) {
4961       parser.n_structural_indexes--;
4962       if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
4963     }
4964     auto new_structural_indexes = find_next_document_index(parser);
4965     if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
4966       return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
4967     }
4968     parser.n_structural_indexes = new_structural_indexes;
4969   } else if(unclosed_string) { error = UNCLOSED_STRING; }
4970   return error;
4971 }
4972 
4973 private:
4974   const uint8_t *buf;
4975   uint32_t *next_structural_index;
4976   dom_parser_implementation &parser;
4977   uint32_t len;
4978   uint32_t idx{0};
4979   error_code error{SUCCESS};
4980   bool partial;
4981 }; // structural_scanner
4982 
4983 } // namespace stage1
4984 } // unnamed namespace
4985 
stage1(const uint8_t * _buf,size_t _len,bool partial)4986 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept {
4987   this->buf = _buf;
4988   this->len = _len;
4989   stage1::structural_scanner scanner(*this, partial);
4990   return scanner.scan();
4991 }
4992 
4993 // big table for the minifier
4994 static uint8_t jump_table[256 * 3] = {
4995     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
4996     1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
4997     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
4998     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
4999     1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5000     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5001     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5002     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5003     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5004     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5005     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5006     1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5007     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5008     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5009     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5010     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5011     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5012     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5013     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5014     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5015     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5016     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5017     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5018     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5019     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5020     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5021     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5022     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5023     1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5024     1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5025     0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5026 };
5027 
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const5028 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
5029   size_t i = 0, pos = 0;
5030   uint8_t quote = 0;
5031   uint8_t nonescape = 1;
5032 
5033   while (i < len) {
5034     unsigned char c = buf[i];
5035     uint8_t *meta = jump_table + 3 * c;
5036 
5037     quote = quote ^ (meta[0] & nonescape);
5038     dst[pos] = c;
5039     pos += meta[2] | quote;
5040 
5041     i += 1;
5042     nonescape = uint8_t(~nonescape) | (meta[1]);
5043   }
5044   dst_len = pos; // we intentionally do not work with a reference
5045   // for fear of aliasing
5046   return quote ? UNCLOSED_STRING : SUCCESS;
5047 }
5048 
5049 // credit: based on code from Google Fuchsia (Apache Licensed)
validate_utf8(const char * buf,size_t len) const5050 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
5051   const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
5052   uint64_t pos = 0;
5053   uint32_t code_point = 0;
5054   while (pos < len) {
5055     // check of the next 8 bytes are ascii.
5056     uint64_t next_pos = pos + 16;
5057     if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
5058       uint64_t v1;
5059       memcpy(&v1, data + pos, sizeof(uint64_t));
5060       uint64_t v2;
5061       memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
5062       uint64_t v{v1 | v2};
5063       if ((v & 0x8080808080808080) == 0) {
5064         pos = next_pos;
5065         continue;
5066       }
5067     }
5068     unsigned char byte = data[pos];
5069     if (byte < 0b10000000) {
5070       pos++;
5071       continue;
5072     } else if ((byte & 0b11100000) == 0b11000000) {
5073       next_pos = pos + 2;
5074       if (next_pos > len) { return false; }
5075       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
5076       // range check
5077       code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
5078       if (code_point < 0x80 || 0x7ff < code_point) { return false; }
5079     } else if ((byte & 0b11110000) == 0b11100000) {
5080       next_pos = pos + 3;
5081       if (next_pos > len) { return false; }
5082       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
5083       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
5084       // range check
5085       code_point = (byte & 0b00001111) << 12 |
5086                    (data[pos + 1] & 0b00111111) << 6 |
5087                    (data[pos + 2] & 0b00111111);
5088       if (code_point < 0x800 || 0xffff < code_point ||
5089           (0xd7ff < code_point && code_point < 0xe000)) {
5090         return false;
5091       }
5092     } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
5093       next_pos = pos + 4;
5094       if (next_pos > len) { return false; }
5095       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
5096       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
5097       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
5098       // range check
5099       code_point =
5100           (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
5101           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
5102       if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
5103     } else {
5104       // we may have a continuation
5105       return false;
5106     }
5107     pos = next_pos;
5108   }
5109   return true;
5110 }
5111 
5112 } // namespace fallback
5113 } // namespace simdjson
5114 
5115 //
5116 // Stage 2
5117 //
5118 /* begin file src/generic/stage2/tape_builder.h */
5119 /* begin file src/generic/stage2/json_iterator.h */
5120 /* begin file src/generic/stage2/logger.h */
// This is an internal-only logger specific to stage 2.
// Define SIMDJSON_VERBOSE_LOGGING to a nonzero value (which sets LOG_ENABLED = true) to log what stage 2 is doing!
5123 namespace simdjson {
5124 namespace fallback {
5125 namespace {
5126 namespace logger {
5127 
5128   static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
5129 
5130 #if SIMDJSON_VERBOSE_LOGGING
5131   static constexpr const bool LOG_ENABLED = true;
5132 #else
5133   static constexpr const bool LOG_ENABLED = false;
5134 #endif
5135   static constexpr const int LOG_EVENT_LEN = 20;
5136   static constexpr const int LOG_BUFFER_LEN = 30;
5137   static constexpr const int LOG_SMALL_BUFFER_LEN = 10;
5138   static constexpr const int LOG_INDEX_LEN = 5;
5139 
5140   static int log_depth; // Not threadsafe. Log only.
5141 
5142   // Helper to turn unprintable or newline characters into spaces
printable_char(char c)5143   static simdjson_really_inline char printable_char(char c) {
5144     if (c >= 0x20) {
5145       return c;
5146     } else {
5147       return ' ';
5148     }
5149   }
5150 
5151   // Print the header and set up log_start
log_start()5152   static simdjson_really_inline void log_start() {
5153     if (LOG_ENABLED) {
5154       log_depth = 0;
5155       printf("\n");
5156       printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
5157       printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
5158     }
5159   }
5160 
log_string(const char * message)5161   simdjson_unused static simdjson_really_inline void log_string(const char *message) {
5162     if (LOG_ENABLED) {
5163       printf("%s\n", message);
5164     }
5165   }
5166 
5167   // Logs a single line from the stage 2 DOM parser
5168   template<typename S>
log_line(S & structurals,const char * title_prefix,const char * title,const char * detail)5169   static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
5170     if (LOG_ENABLED) {
5171       printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
5172       auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
5173       auto next_index = structurals.next_structural;
5174       auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>("                                                       ");
5175       auto next = &structurals.buf[*next_index];
5176       {
5177         // Print the next N characters in the buffer.
5178         printf("| ");
5179         // Otherwise, print the characters starting from the buffer position.
5180         // Print spaces for unprintable or newline characters.
5181         for (int i=0;i<LOG_BUFFER_LEN;i++) {
5182           printf("%c", printable_char(current[i]));
5183         }
5184         printf(" ");
5185         // Print the next N characters in the buffer.
5186         printf("| ");
5187         // Otherwise, print the characters starting from the buffer position.
5188         // Print spaces for unprintable or newline characters.
5189         for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
5190           printf("%c", printable_char(next[i]));
5191         }
5192         printf(" ");
5193       }
5194       if (current_index) {
5195         printf("| %*u ", LOG_INDEX_LEN, *current_index);
5196       } else {
5197         printf("| %-*s ", LOG_INDEX_LEN, "");
5198       }
5199       // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
5200       printf("| %-s ", detail);
5201       printf("|\n");
5202     }
5203   }
5204 
5205 } // namespace logger
5206 } // unnamed namespace
5207 } // namespace fallback
5208 } // namespace simdjson
5209 /* end file src/generic/stage2/logger.h */
5210 
5211 namespace simdjson {
5212 namespace fallback {
5213 namespace {
5214 namespace stage2 {
5215 
5216 class json_iterator {
5217 public:
5218   const uint8_t* const buf;
5219   uint32_t *next_structural;
5220   dom_parser_implementation &dom_parser;
5221   uint32_t depth{0};
5222 
5223   /**
5224    * Walk the JSON document.
5225    *
5226    * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
5227    * the first parameter; some callbacks have other parameters as well:
5228    *
5229    * - visit_document_start() - at the beginning.
5230    * - visit_document_end() - at the end (if things were successful).
5231    *
5232    * - visit_array_start() - at the start `[` of a non-empty array.
5233    * - visit_array_end() - at the end `]` of a non-empty array.
5234    * - visit_empty_array() - when an empty array is encountered.
5235    *
5236    * - visit_object_end() - at the start `]` of a non-empty object.
5237    * - visit_object_start() - at the end `]` of a non-empty object.
5238    * - visit_empty_object() - when an empty object is encountered.
5239    * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
5240    *                                   guaranteed to point at the first quote of the string (`"key"`).
5241    * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
5242    * - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
5243    *
5244    * - increment_count(iter) - each time a value is found in an array or object.
5245    */
5246   template<bool STREAMING, typename V>
5247   simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;
5248 
5249   /**
5250    * Create an iterator capable of walking a JSON document.
5251    *
5252    * The document must have already passed through stage 1.
5253    */
5254   simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
5255 
5256   /**
5257    * Look at the next token.
5258    *
5259    * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
5260    *
5261    * They may include invalid JSON as well (such as `1.2.3` or `ture`).
5262    */
5263   simdjson_really_inline const uint8_t *peek() const noexcept;
5264   /**
5265    * Advance to the next token.
5266    *
5267    * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
5268    *
5269    * They may include invalid JSON as well (such as `1.2.3` or `ture`).
5270    */
5271   simdjson_really_inline const uint8_t *advance() noexcept;
5272   /**
5273    * Get the remaining length of the document, from the start of the current token.
5274    */
5275   simdjson_really_inline size_t remaining_len() const noexcept;
5276   /**
5277    * Check if we are at the end of the document.
5278    *
5279    * If this is true, there are no more tokens.
5280    */
5281   simdjson_really_inline bool at_eof() const noexcept;
5282   /**
5283    * Check if we are at the beginning of the document.
5284    */
5285   simdjson_really_inline bool at_beginning() const noexcept;
5286   simdjson_really_inline uint8_t last_structural() const noexcept;
5287 
5288   /**
5289    * Log that a value has been found.
5290    *
5291    * Set ENABLE_LOGGING=true in logger.h to see logging.
5292    */
5293   simdjson_really_inline void log_value(const char *type) const noexcept;
5294   /**
5295    * Log the start of a multipart value.
5296    *
5297    * Set ENABLE_LOGGING=true in logger.h to see logging.
5298    */
5299   simdjson_really_inline void log_start_value(const char *type) const noexcept;
5300   /**
5301    * Log the end of a multipart value.
5302    *
5303    * Set ENABLE_LOGGING=true in logger.h to see logging.
5304    */
5305   simdjson_really_inline void log_end_value(const char *type) const noexcept;
5306   /**
5307    * Log an error.
5308    *
5309    * Set ENABLE_LOGGING=true in logger.h to see logging.
5310    */
5311   simdjson_really_inline void log_error(const char *error) const noexcept;
5312 
5313   template<typename V>
5314   simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
5315   template<typename V>
5316   simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
5317 };
5318 
5319 template<bool STREAMING, typename V>
walk_document(V & visitor)5320 simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
5321   logger::log_start();
5322 
5323   //
5324   // Start the document
5325   //
5326   if (at_eof()) { return EMPTY; }
5327   log_start_value("document");
5328   SIMDJSON_TRY( visitor.visit_document_start(*this) );
5329 
5330   //
5331   // Read first value
5332   //
5333   {
5334     auto value = advance();
5335 
5336     // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
5337     // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
5338     if (!STREAMING) {
5339       switch (*value) {
5340         case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
5341         case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
5342       }
5343     }
5344 
5345     switch (*value) {
5346       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
5347       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
5348       default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
5349     }
5350   }
5351   goto document_end;
5352 
5353 //
5354 // Object parser states
5355 //
5356 object_begin:
5357   log_start_value("object");
5358   depth++;
5359   if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
5360   dom_parser.is_array[depth] = false;
5361   SIMDJSON_TRY( visitor.visit_object_start(*this) );
5362 
5363   {
5364     auto key = advance();
5365     if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
5366     SIMDJSON_TRY( visitor.increment_count(*this) );
5367     SIMDJSON_TRY( visitor.visit_key(*this, key) );
5368   }
5369 
5370 object_field:
5371   if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
5372   {
5373     auto value = advance();
5374     switch (*value) {
5375       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
5376       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
5377       default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
5378     }
5379   }
5380 
5381 object_continue:
5382   switch (*advance()) {
5383     case ',':
5384       SIMDJSON_TRY( visitor.increment_count(*this) );
5385       {
5386         auto key = advance();
5387         if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
5388         SIMDJSON_TRY( visitor.visit_key(*this, key) );
5389       }
5390       goto object_field;
5391     case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
5392     default: log_error("No comma between object fields"); return TAPE_ERROR;
5393   }
5394 
5395 scope_end:
5396   depth--;
5397   if (depth == 0) { goto document_end; }
5398   if (dom_parser.is_array[depth]) { goto array_continue; }
5399   goto object_continue;
5400 
5401 //
5402 // Array parser states
5403 //
5404 array_begin:
5405   log_start_value("array");
5406   depth++;
5407   if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
5408   dom_parser.is_array[depth] = true;
5409   SIMDJSON_TRY( visitor.visit_array_start(*this) );
5410   SIMDJSON_TRY( visitor.increment_count(*this) );
5411 
5412 array_value:
5413   {
5414     auto value = advance();
5415     switch (*value) {
5416       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
5417       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
5418       default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
5419     }
5420   }
5421 
5422 array_continue:
5423   switch (*advance()) {
5424     case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
5425     case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
5426     default: log_error("Missing comma between array values"); return TAPE_ERROR;
5427   }
5428 
5429 document_end:
5430   log_end_value("document");
5431   SIMDJSON_TRY( visitor.visit_document_end(*this) );
5432 
5433   dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
5434 
5435   // If we didn't make it to the end, it's an error
5436   if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
5437     log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
5438     return TAPE_ERROR;
5439   }
5440 
5441   return SUCCESS;
5442 
5443 } // walk_document()
5444 
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)5445 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
5446   : buf{_dom_parser.buf},
5447     next_structural{&_dom_parser.structural_indexes[start_structural_index]},
5448     dom_parser{_dom_parser} {
5449 }
5450 
peek() const5451 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
5452   return &buf[*(next_structural)];
5453 }
advance()5454 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
5455   return &buf[*(next_structural++)];
5456 }
remaining_len() const5457 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
5458   return dom_parser.len - *(next_structural-1);
5459 }
5460 
at_eof() const5461 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
5462   return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
5463 }
at_beginning() const5464 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
5465   return next_structural == dom_parser.structural_indexes.get();
5466 }
last_structural() const5467 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
5468   return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
5469 }
5470 
log_value(const char * type) const5471 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
5472   logger::log_line(*this, "", type, "");
5473 }
5474 
log_start_value(const char * type) const5475 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
5476   logger::log_line(*this, "+", type, "");
5477   if (logger::LOG_ENABLED) { logger::log_depth++; }
5478 }
5479 
log_end_value(const char * type) const5480 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
5481   if (logger::LOG_ENABLED) { logger::log_depth--; }
5482   logger::log_line(*this, "-", type, "");
5483 }
5484 
log_error(const char * error) const5485 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
5486   logger::log_line(*this, "", "ERROR", error);
5487 }
5488 
5489 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)5490 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
5491   switch (*value) {
5492     case '"': return visitor.visit_root_string(*this, value);
5493     case 't': return visitor.visit_root_true_atom(*this, value);
5494     case 'f': return visitor.visit_root_false_atom(*this, value);
5495     case 'n': return visitor.visit_root_null_atom(*this, value);
5496     case '-':
5497     case '0': case '1': case '2': case '3': case '4':
5498     case '5': case '6': case '7': case '8': case '9':
5499       return visitor.visit_root_number(*this, value);
5500     default:
5501       log_error("Document starts with a non-value character");
5502       return TAPE_ERROR;
5503   }
5504 }
5505 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)5506 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
5507   switch (*value) {
5508     case '"': return visitor.visit_string(*this, value);
5509     case 't': return visitor.visit_true_atom(*this, value);
5510     case 'f': return visitor.visit_false_atom(*this, value);
5511     case 'n': return visitor.visit_null_atom(*this, value);
5512     case '-':
5513     case '0': case '1': case '2': case '3': case '4':
5514     case '5': case '6': case '7': case '8': case '9':
5515       return visitor.visit_number(*this, value);
5516     default:
5517       log_error("Non-value found when value was expected!");
5518       return TAPE_ERROR;
5519   }
5520 }
5521 
5522 } // namespace stage2
5523 } // unnamed namespace
5524 } // namespace fallback
5525 } // namespace simdjson
5526 /* end file src/generic/stage2/json_iterator.h */
5527 /* begin file src/generic/stage2/tape_writer.h */
5528 namespace simdjson {
5529 namespace fallback {
5530 namespace {
5531 namespace stage2 {
5532 
5533 struct tape_writer {
5534   /** The next place to write to tape */
5535   uint64_t *next_tape_loc;
5536 
5537   /** Write a signed 64-bit value to tape. */
5538   simdjson_really_inline void append_s64(int64_t value) noexcept;
5539 
5540   /** Write an unsigned 64-bit value to tape. */
5541   simdjson_really_inline void append_u64(uint64_t value) noexcept;
5542 
5543   /** Write a double value to tape. */
5544   simdjson_really_inline void append_double(double value) noexcept;
5545 
5546   /**
5547    * Append a tape entry (an 8-bit type,and 56 bits worth of value).
5548    */
5549   simdjson_really_inline void append(uint64_t val, internal::tape_type t) noexcept;
5550 
5551   /**
5552    * Skip the current tape entry without writing.
5553    *
5554    * Used to skip the start of the container, since we'll come back later to fill it in when the
5555    * container ends.
5556    */
5557   simdjson_really_inline void skip() noexcept;
5558 
5559   /**
5560    * Skip the number of tape entries necessary to write a large u64 or i64.
5561    */
5562   simdjson_really_inline void skip_large_integer() noexcept;
5563 
5564   /**
5565    * Skip the number of tape entries necessary to write a double.
5566    */
5567   simdjson_really_inline void skip_double() noexcept;
5568 
5569   /**
5570    * Write a value to a known location on tape.
5571    *
5572    * Used to go back and write out the start of a container after the container ends.
5573    */
5574   simdjson_really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
5575 
5576 private:
5577   /**
5578    * Append both the tape entry, and a supplementary value following it. Used for types that need
5579    * all 64 bits, such as double and uint64_t.
5580    */
5581   template<typename T>
5582   simdjson_really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
5583 }; // struct number_writer
5584 
append_s64(int64_t value)5585 simdjson_really_inline void tape_writer::append_s64(int64_t value) noexcept {
5586   append2(0, value, internal::tape_type::INT64);
5587 }
5588 
append_u64(uint64_t value)5589 simdjson_really_inline void tape_writer::append_u64(uint64_t value) noexcept {
5590   append(0, internal::tape_type::UINT64);
5591   *next_tape_loc = value;
5592   next_tape_loc++;
5593 }
5594 
5595 /** Write a double value to tape. */
append_double(double value)5596 simdjson_really_inline void tape_writer::append_double(double value) noexcept {
5597   append2(0, value, internal::tape_type::DOUBLE);
5598 }
5599 
skip()5600 simdjson_really_inline void tape_writer::skip() noexcept {
5601   next_tape_loc++;
5602 }
5603 
skip_large_integer()5604 simdjson_really_inline void tape_writer::skip_large_integer() noexcept {
5605   next_tape_loc += 2;
5606 }
5607 
skip_double()5608 simdjson_really_inline void tape_writer::skip_double() noexcept {
5609   next_tape_loc += 2;
5610 }
5611 
append(uint64_t val,internal::tape_type t)5612 simdjson_really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
5613   *next_tape_loc = val | ((uint64_t(char(t))) << 56);
5614   next_tape_loc++;
5615 }
5616 
5617 template<typename T>
append2(uint64_t val,T val2,internal::tape_type t)5618 simdjson_really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
5619   append(val, t);
5620   static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
5621   memcpy(next_tape_loc, &val2, sizeof(val2));
5622   next_tape_loc++;
5623 }
5624 
write(uint64_t & tape_loc,uint64_t val,internal::tape_type t)5625 simdjson_really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
5626   tape_loc = val | ((uint64_t(char(t))) << 56);
5627 }
5628 
5629 } // namespace stage2
5630 } // unnamed namespace
5631 } // namespace fallback
5632 } // namespace simdjson
5633 /* end file src/generic/stage2/tape_writer.h */
5634 
5635 namespace simdjson {
5636 namespace fallback {
5637 namespace {
5638 namespace stage2 {
5639 
5640 struct tape_builder {
5641   template<bool STREAMING>
5642   simdjson_warn_unused static simdjson_really_inline error_code parse_document(
5643     dom_parser_implementation &dom_parser,
5644     dom::document &doc) noexcept;
5645 
5646   /** Called when a non-empty document starts. */
5647   simdjson_warn_unused simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
5648   /** Called when a non-empty document ends without error. */
5649   simdjson_warn_unused simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;
5650 
5651   /** Called when a non-empty array starts. */
5652   simdjson_warn_unused simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
5653   /** Called when a non-empty array ends. */
5654   simdjson_warn_unused simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
5655   /** Called when an empty array is found. */
5656   simdjson_warn_unused simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;
5657 
5658   /** Called when a non-empty object starts. */
5659   simdjson_warn_unused simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
5660   /**
5661    * Called when a key in a field is encountered.
5662    *
5663    * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
5664    * will be called after this with the field value.
5665    */
5666   simdjson_warn_unused simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
5667   /** Called when a non-empty object ends. */
5668   simdjson_warn_unused simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
5669   /** Called when an empty object is found. */
5670   simdjson_warn_unused simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;
5671 
5672   /**
5673    * Called when a string, number, boolean or null is found.
5674    */
5675   simdjson_warn_unused simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
5676   /**
5677    * Called when a string, number, boolean or null is found at the top level of a document (i.e.
5678    * when there is no array or object and the entire document is a single string, number, boolean or
5679    * null.
5680    *
5681    * This is separate from primitive() because simdjson's normal primitive parsing routines assume
5682    * there is at least one more token after the value, which is only true in an array or object.
5683    */
5684   simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
5685 
5686   simdjson_warn_unused simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
5687   simdjson_warn_unused simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
5688   simdjson_warn_unused simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
5689   simdjson_warn_unused simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
5690   simdjson_warn_unused simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
5691 
5692   simdjson_warn_unused simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
5693   simdjson_warn_unused simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
5694   simdjson_warn_unused simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
5695   simdjson_warn_unused simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
5696   simdjson_warn_unused simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
5697 
5698   /** Called each time a new field or element in an array or object is found. */
5699   simdjson_warn_unused simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;
5700 
5701   /** Next location to write to tape */
5702   tape_writer tape;
5703 private:
5704   /** Next write location in the string buf for stage 2 parsing */
5705   uint8_t *current_string_buf_loc;
5706 
5707   simdjson_really_inline tape_builder(dom::document &doc) noexcept;
5708 
5709   simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
5710   simdjson_really_inline void start_container(json_iterator &iter) noexcept;
5711   simdjson_warn_unused simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
5712   simdjson_warn_unused simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
5713   simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
5714   simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
5715 }; // class tape_builder
5716 
5717 template<bool STREAMING>
parse_document(dom_parser_implementation & dom_parser,dom::document & doc)5718 simdjson_warn_unused simdjson_really_inline error_code tape_builder::parse_document(
5719     dom_parser_implementation &dom_parser,
5720     dom::document &doc) noexcept {
5721   dom_parser.doc = &doc;
5722   json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
5723   tape_builder builder(doc);
5724   return iter.walk_document<STREAMING>(builder);
5725 }
5726 
visit_root_primitive(json_iterator & iter,const uint8_t * value)5727 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
5728   return iter.visit_root_primitive(*this, value);
5729 }
visit_primitive(json_iterator & iter,const uint8_t * value)5730 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
5731   return iter.visit_primitive(*this, value);
5732 }
visit_empty_object(json_iterator & iter)5733 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
5734   return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
5735 }
visit_empty_array(json_iterator & iter)5736 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
5737   return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
5738 }
5739 
visit_document_start(json_iterator & iter)5740 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
5741   start_container(iter);
5742   return SUCCESS;
5743 }
visit_object_start(json_iterator & iter)5744 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
5745   start_container(iter);
5746   return SUCCESS;
5747 }
visit_array_start(json_iterator & iter)5748 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
5749   start_container(iter);
5750   return SUCCESS;
5751 }
5752 
visit_object_end(json_iterator & iter)5753 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
5754   return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
5755 }
visit_array_end(json_iterator & iter)5756 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
5757   return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
5758 }
visit_document_end(json_iterator & iter)5759 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
5760   constexpr uint32_t start_tape_index = 0;
5761   tape.append(start_tape_index, internal::tape_type::ROOT);
5762   tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
5763   return SUCCESS;
5764 }
visit_key(json_iterator & iter,const uint8_t * key)5765 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
5766   return visit_string(iter, key, true);
5767 }
5768 
increment_count(json_iterator & iter)5769 simdjson_warn_unused simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
5770   iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
5771   return SUCCESS;
5772 }
5773 
tape_builder(dom::document & doc)5774 simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
5775 
visit_string(json_iterator & iter,const uint8_t * value,bool key)5776 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
5777   iter.log_value(key ? "key" : "string");
5778   uint8_t *dst = on_start_string(iter);
5779   dst = stringparsing::parse_string(value+1, dst);
5780   if (dst == nullptr) {
5781     iter.log_error("Invalid escape in string");
5782     return STRING_ERROR;
5783   }
5784   on_end_string(dst);
5785   return SUCCESS;
5786 }
5787 
visit_root_string(json_iterator & iter,const uint8_t * value)5788 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
5789   return visit_string(iter, value);
5790 }
5791 
visit_number(json_iterator & iter,const uint8_t * value)5792 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
5793   iter.log_value("number");
5794   return numberparsing::parse_number(value, tape);
5795 }
5796 
visit_root_number(json_iterator & iter,const uint8_t * value)5797 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
5798   //
5799   // We need to make a copy to make sure that the string is space terminated.
5800   // This is not about padding the input, which should already padded up
5801   // to len + SIMDJSON_PADDING. However, we have no control at this stage
5802   // on how the padding was done. What if the input string was padded with nulls?
5803   // It is quite common for an input string to have an extra null character (C string).
5804   // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
5805   // document, but the string "9\0" by itself is fine. So we make a copy and
5806   // pad the input with spaces when we know that there is just one input element.
5807   // This copy is relatively expensive, but it will almost never be called in
5808   // practice unless you are in the strange scenario where you have many JSON
5809   // documents made of single atoms.
5810   //
5811   std::unique_ptr<uint8_t[]>copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
5812   if (copy.get() == nullptr) { return MEMALLOC; }
5813   std::memcpy(copy.get(), value, iter.remaining_len());
5814   std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
5815   error_code error = visit_number(iter, copy.get());
5816   return error;
5817 }
5818 
visit_true_atom(json_iterator & iter,const uint8_t * value)5819 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
5820   iter.log_value("true");
5821   if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
5822   tape.append(0, internal::tape_type::TRUE_VALUE);
5823   return SUCCESS;
5824 }
5825 
visit_root_true_atom(json_iterator & iter,const uint8_t * value)5826 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
5827   iter.log_value("true");
5828   if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
5829   tape.append(0, internal::tape_type::TRUE_VALUE);
5830   return SUCCESS;
5831 }
5832 
visit_false_atom(json_iterator & iter,const uint8_t * value)5833 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
5834   iter.log_value("false");
5835   if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
5836   tape.append(0, internal::tape_type::FALSE_VALUE);
5837   return SUCCESS;
5838 }
5839 
visit_root_false_atom(json_iterator & iter,const uint8_t * value)5840 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
5841   iter.log_value("false");
5842   if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
5843   tape.append(0, internal::tape_type::FALSE_VALUE);
5844   return SUCCESS;
5845 }
5846 
visit_null_atom(json_iterator & iter,const uint8_t * value)5847 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
5848   iter.log_value("null");
5849   if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
5850   tape.append(0, internal::tape_type::NULL_VALUE);
5851   return SUCCESS;
5852 }
5853 
visit_root_null_atom(json_iterator & iter,const uint8_t * value)5854 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
5855   iter.log_value("null");
5856   if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
5857   tape.append(0, internal::tape_type::NULL_VALUE);
5858   return SUCCESS;
5859 }
5860 
5861 // private:
5862 
next_tape_index(json_iterator & iter) const5863 simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
5864   return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
5865 }
5866 
empty_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)5867 simdjson_warn_unused simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
5868   auto start_index = next_tape_index(iter);
5869   tape.append(start_index+2, start);
5870   tape.append(start_index, end);
5871   return SUCCESS;
5872 }
5873 
start_container(json_iterator & iter)5874 simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
5875   iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
5876   iter.dom_parser.open_containers[iter.depth].count = 0;
5877   tape.skip(); // We don't actually *write* the start element until the end.
5878 }
5879 
end_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)5880 simdjson_warn_unused simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
5881   // Write the ending tape element, pointing at the start location
5882   const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
5883   tape.append(start_tape_index, end);
5884   // Write the start tape element, pointing at the end location (and including count)
5885   // count can overflow if it exceeds 24 bits... so we saturate
5886   // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
5887   const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
5888   const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
5889   tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
5890   return SUCCESS;
5891 }
5892 
on_start_string(json_iterator & iter)5893 simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
5894   // we advance the point, accounting for the fact that we have a NULL termination
5895   tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
5896   return current_string_buf_loc + sizeof(uint32_t);
5897 }
5898 
on_end_string(uint8_t * dst)5899 simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
5900   uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
5901   // TODO check for overflow in case someone has a crazy string (>=4GB?)
5902   // But only add the overflow check when the document itself exceeds 4GB
5903   // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
5904   memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
5905   // NULL termination is still handy if you expect all your strings to
5906   // be NULL terminated? It comes at a small cost
5907   *dst = 0;
5908   current_string_buf_loc = dst + 1;
5909 }
5910 
5911 } // namespace stage2
5912 } // unnamed namespace
5913 } // namespace fallback
5914 } // namespace simdjson
5915 /* end file src/generic/stage2/tape_builder.h */
5916 
5917 namespace simdjson {
5918 namespace fallback {
5919 
stage2(dom::document & _doc)5920 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
5921   return stage2::tape_builder::parse_document<false>(*this, _doc);
5922 }
5923 
stage2_next(dom::document & _doc)5924 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
5925   return stage2::tape_builder::parse_document<true>(*this, _doc);
5926 }
5927 
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)5928 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
5929   auto error = stage1(_buf, _len, false);
5930   if (error) { return error; }
5931   return stage2(_doc);
5932 }
5933 
5934 } // namespace fallback
5935 } // namespace simdjson
5936 
5937 /* begin file include/simdjson/fallback/end.h */
5938 /* end file include/simdjson/fallback/end.h */
5939 /* end file src/fallback/dom_parser_implementation.cpp */
5940 #endif
5941 #if SIMDJSON_IMPLEMENTATION_HASWELL
5942 /* begin file src/haswell/implementation.cpp */
5943 /* begin file include/simdjson/haswell/begin.h */
5944 // redefining SIMDJSON_IMPLEMENTATION to "haswell"
5945 // #define SIMDJSON_IMPLEMENTATION haswell
5946 SIMDJSON_TARGET_HASWELL
5947 /* end file include/simdjson/haswell/begin.h */
5948 
5949 namespace simdjson {
5950 namespace haswell {
5951 
create_dom_parser_implementation(size_t capacity,size_t max_depth,std::unique_ptr<internal::dom_parser_implementation> & dst) const5952 simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
5953   size_t capacity,
5954   size_t max_depth,
5955   std::unique_ptr<internal::dom_parser_implementation>& dst
5956 ) const noexcept {
5957   dst.reset( new (std::nothrow) dom_parser_implementation() );
5958   if (!dst) { return MEMALLOC; }
5959   dst->set_capacity(capacity);
5960   dst->set_max_depth(max_depth);
5961   return SUCCESS;
5962 }
5963 
5964 } // namespace haswell
5965 } // namespace simdjson
5966 
5967 /* begin file include/simdjson/haswell/end.h */
5968 SIMDJSON_UNTARGET_HASWELL
5969 /* end file include/simdjson/haswell/end.h */
5970 
5971 /* end file src/haswell/implementation.cpp */
5972 /* begin file src/haswell/dom_parser_implementation.cpp */
5973 /* begin file include/simdjson/haswell/begin.h */
5974 // redefining SIMDJSON_IMPLEMENTATION to "haswell"
5975 // #define SIMDJSON_IMPLEMENTATION haswell
5976 SIMDJSON_TARGET_HASWELL
5977 /* end file include/simdjson/haswell/begin.h */
5978 
5979 //
5980 // Stage 1
5981 //
5982 
5983 namespace simdjson {
5984 namespace haswell {
5985 namespace {
5986 
5987 using namespace simd;
5988 
5989 struct json_character_block {
5990   static simdjson_really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
5991   //  ASCII white-space ('\r','\n','\t',' ')
5992   simdjson_really_inline uint64_t whitespace() const noexcept;
5993   // non-quote structural characters (comma, colon, braces, brackets)
5994   simdjson_really_inline uint64_t op() const noexcept;
5995   // neither a structural character nor a white-space, so letters, numbers and quotes
5996   simdjson_really_inline uint64_t scalar() const noexcept;
5997 
5998   uint64_t _whitespace; // ASCII white-space ('\r','\n','\t',' ')
5999   uint64_t _op; // structural characters (comma, colon, braces, brackets but not quotes)
6000 };
6001 
whitespace() const6002 simdjson_really_inline uint64_t json_character_block::whitespace() const noexcept { return _whitespace; }
op() const6003 simdjson_really_inline uint64_t json_character_block::op() const noexcept { return _op; }
scalar() const6004 simdjson_really_inline uint64_t json_character_block::scalar() const noexcept { return ~(op() | whitespace()); }
6005 
6006 // This identifies structural characters (comma, colon, braces, brackets),
6007 // and ASCII white-space ('\r','\n','\t',' ').
classify(const simd::simd8x64<uint8_t> & in)6008 simdjson_really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
6009   // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
6010   // we can't use the generic lookup_16.
6011   const auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
6012 
6013   // The 6 operators (:,[]{}) have these values:
6014   //
6015   // , 2C
6016   // : 3A
6017   // [ 5B
6018   // { 7B
6019   // ] 5D
6020   // } 7D
6021   //
6022   // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique.
6023   // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then
6024   // match it (against | 0x20).
6025   //
6026   // To prevent recognizing other characters, everything else gets compared with 0, which cannot
6027   // match due to the | 0x20.
6028   //
6029   // NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A) like ,
6030   // and :. This gets caught in stage 2, which checks the actual character to ensure the right
6031   // operators are in the right places.
6032   const auto op_table = simd8<uint8_t>::repeat_16(
6033     0, 0, 0, 0,
6034     0, 0, 0, 0,
6035     0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B
6036     ',', '}', 0, 0  // , = 2C, ] = 5D, } = 7D
6037   );
6038 
6039   // We compute whitespace and op separately. If later code only uses one or the
6040   // other, given the fact that all functions are aggressively inlined, we can
6041   // hope that useless computations will be omitted. This is namely case when
6042   // minifying (we only need whitespace).
6043 
6044   const uint64_t whitespace = in.eq({
6045     _mm256_shuffle_epi8(whitespace_table, in.chunks[0]),
6046     _mm256_shuffle_epi8(whitespace_table, in.chunks[1])
6047   });
6048   // Turn [ and ] into { and }
6049   const simd8x64<uint8_t> curlified{
6050     in.chunks[0] | 0x20,
6051     in.chunks[1] | 0x20
6052   };
6053   const uint64_t op = curlified.eq({
6054     _mm256_shuffle_epi8(op_table, in.chunks[0]),
6055     _mm256_shuffle_epi8(op_table, in.chunks[1])
6056   });
6057 
6058   return { whitespace, op };
6059 }
6060 
is_ascii(const simd8x64<uint8_t> & input)6061 simdjson_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
6062   return input.reduce_or().is_ascii();
6063 }
6064 
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)6065 simdjson_unused simdjson_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
6066   simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
6067   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
6068   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
6069   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
6070   return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
6071 }
6072 
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)6073 simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
6074   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
6075   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
6076   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
6077   return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
6078 }
6079 
6080 } // unnamed namespace
6081 } // namespace haswell
6082 } // namespace simdjson
6083 
6084 /* begin file src/generic/stage1/utf8_lookup4_algorithm.h */
6085 namespace simdjson {
6086 namespace haswell {
6087 namespace {
6088 namespace utf8_validation {
6089 
6090 using namespace simd;
6091 
// Classifies every adjacent byte pair (prev1 holds the byte that precedes each
// byte of input) with three 16-entry nibble lookups: the high nibble of byte 1,
// the low nibble of byte 1 and the high nibble of byte 2. Each lookup yields
// the set of error classes (one bit each, see below) that this nibble is
// compatible with; the three sets are ANDed, so a bit survives only when all
// three nibbles agree on that error class.
//
// TWO_CONTS (bit 7, 0x80) is raised for any continuation-after-continuation
// pair; check_multibyte_lengths() XORs the result with a 0x80 mask built from
// must_be_2_3_continuation(), so the returned value is not the final verdict.
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)6092   simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
6093 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
6094 // Bit 1 = Too Long (ASCII followed by continuation)
6095 // Bit 2 = Overlong 3-byte
// Bit 3 = Too Large (see the TOO_LARGE patterns below)
6096 // Bit 4 = Surrogate
6097 // Bit 5 = Overlong 2-byte
// Bit 6 = Too Large with a 1000____ second byte, and Overlong 4-byte (both share this bit)
6098 // Bit 7 = Two Continuations
6099     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
6100                                                 // 11______ 11______
6101     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
6102     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
6103     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
6104     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
6105     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
6106     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
6107                                                 // 11110100 101_____
6108                                                 // 11110101 1001____
6109                                                 // 11110101 101_____
6110                                                 // 1111011_ 1001____
6111                                                 // 1111011_ 101_____
6112                                                 // 11111___ 1001____
6113                                                 // 11111___ 101_____
6114     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
6115                                                 // 11110101 1000____
6116                                                 // 1111011_ 1000____
6117                                                 // 11111___ 1000____
6118     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
6119 
    // Lookup 1: error classes compatible with the HIGH nibble of the first byte.
6120     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
6121       // 0_______ ________ <ASCII in byte 1>
6122       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
6123       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
6124       // 10______ ________ <continuation in byte 1>
6125       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
6126       // 1100____ ________ <two byte lead in byte 1>
6127       TOO_SHORT | OVERLONG_2,
6128       // 1101____ ________ <two byte lead in byte 1>
6129       TOO_SHORT,
6130       // 1110____ ________ <three byte lead in byte 1>
6131       TOO_SHORT | OVERLONG_3 | SURROGATE,
6132       // 1111____ ________ <four+ byte lead in byte 1>
6133       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
6134     );
    // Lookup 2: error classes compatible with the LOW nibble of the first byte.
    // CARRY is present in every entry so that the classes decided purely by the
    // other two lookups pass through this one unchanged.
6135     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
6136     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
6137       // ____0000 ________
6138       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
6139       // ____0001 ________
6140       CARRY | OVERLONG_2,
6141       // ____001_ ________
6142       CARRY,
6143       CARRY,
6144 
6145       // ____0100 ________
6146       CARRY | TOO_LARGE,
6147       // ____0101 ________
6148       CARRY | TOO_LARGE | TOO_LARGE_1000,
6149       // ____011_ ________
6150       CARRY | TOO_LARGE | TOO_LARGE_1000,
6151       CARRY | TOO_LARGE | TOO_LARGE_1000,
6152 
6153       // ____1___ ________
6154       CARRY | TOO_LARGE | TOO_LARGE_1000,
6155       CARRY | TOO_LARGE | TOO_LARGE_1000,
6156       CARRY | TOO_LARGE | TOO_LARGE_1000,
6157       CARRY | TOO_LARGE | TOO_LARGE_1000,
6158       CARRY | TOO_LARGE | TOO_LARGE_1000,
6159       // ____1101 ________
6160       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
6161       CARRY | TOO_LARGE | TOO_LARGE_1000,
6162       CARRY | TOO_LARGE | TOO_LARGE_1000
6163     );
    // Lookup 3: error classes compatible with the HIGH nibble of the second byte.
6164     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
6165       // ________ 0_______ <ASCII in byte 2>
6166       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
6167       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
6168 
6169       // ________ 1000____
6170       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
6171       // ________ 1001____
6172       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
6173       // ________ 101_____
6174       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
6175       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
6176 
6177       // ________ 11______
6178       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
6179     );
    // A class bit survives only if all three nibbles are compatible with it.
6180     return (byte_1_high & byte_1_low & byte_2_high);
6181   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)6182   simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
6183       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
6184     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
6185     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
6186     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
6187     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
6188     return must23_80 ^ sc;
6189   }
6190 
6191   //
6192   // Return nonzero if there are incomplete multibyte characters at the end of the block:
6193   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
6194   //
is_incomplete(const simd8<uint8_t> input)6195   simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
6196     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
6197     // ... 1111____ 111_____ 11______
6198     static const uint8_t max_array[32] = {
6199       255, 255, 255, 255, 255, 255, 255, 255,
6200       255, 255, 255, 255, 255, 255, 255, 255,
6201       255, 255, 255, 255, 255, 255, 255, 255,
6202       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
6203     };
6204     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
6205     return input.gt_bits(max_value);
6206   }
6207 
6208   struct utf8_checker {
6209     // If this is nonzero, there has been a UTF-8 error.
6210     simd8<uint8_t> error;
6211     // The last input we received
6212     simd8<uint8_t> prev_input_block;
6213     // Whether the last input we received was incomplete (used for ASCII fast path)
6214     simd8<uint8_t> prev_incomplete;
6215 
6216     //
6217     // Check whether the current bytes are valid UTF-8.
6218     //
check_utf8_bytessimdjson::haswell::__anon9bb6be6f1811::utf8_validation::utf8_checker6219     simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
6220       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
6221       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
6222       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
6223       simd8<uint8_t> sc = check_special_cases(input, prev1);
6224       this->error |= check_multibyte_lengths(input, prev_input, sc);
6225     }
6226 
6227     // The only problem that can happen at EOF is that a multibyte character is too short
6228     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
6229     // too large in the first of two bytes.
check_eofsimdjson::haswell::__anon9bb6be6f1811::utf8_validation::utf8_checker6230     simdjson_really_inline void check_eof() {
6231       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
6232       // possibly finish them.
6233       this->error |= this->prev_incomplete;
6234     }
6235 
check_next_inputsimdjson::haswell::__anon9bb6be6f1811::utf8_validation::utf8_checker6236     simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
6237       if(simdjson_likely(is_ascii(input))) {
6238         this->error |= this->prev_incomplete;
6239       } else {
6240         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
6241         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
6242             "We support either two or four chunks per 64-byte block.");
6243         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
6244           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
6245           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
6246         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
6247           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
6248           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
6249           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
6250           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
6251         }
6252         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
6253         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
6254 
6255       }
6256     }
6257     // do not forget to call check_eof!
errorssimdjson::haswell::__anon9bb6be6f1811::utf8_validation::utf8_checker6258     simdjson_really_inline error_code errors() {
6259       return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
6260     }
6261 
6262   }; // struct utf8_checker
6263 } // namespace utf8_validation
6264 
6265 using utf8_validation::utf8_checker;
6266 
6267 } // unnamed namespace
6268 } // namespace haswell
6269 } // namespace simdjson
6270 /* end file src/generic/stage1/utf8_lookup4_algorithm.h */
6271 /* begin file src/generic/stage1/json_structural_indexer.h */
6272 // This file contains the common code every implementation uses in stage1
6273 // It is intended to be included multiple times and compiled multiple times
6274 // We assume the file in which it is included already includes
6275 // "simdjson/stage1.h" (this simplifies amalgamation)
6276 
6277 /* begin file src/generic/stage1/buf_block_reader.h */
6278 namespace simdjson {
6279 namespace haswell {
6280 namespace {
6281 
6282 // Walks through a buffer in block-sized increments, loading the last part with spaces
6283 template<size_t STEP_SIZE>
6284 struct buf_block_reader {
6285 public:
6286   simdjson_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
6287   simdjson_really_inline size_t block_index();
6288   simdjson_really_inline bool has_full_block() const;
6289   simdjson_really_inline const uint8_t *full_block() const;
6290   /**
6291    * Get the last block, padded with spaces.
6292    *
6293    * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
6294    * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
6295    * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
6296    *
6297    * @return the number of effective characters in the last block.
6298    */
6299   simdjson_really_inline size_t get_remainder(uint8_t *dst) const;
6300   simdjson_really_inline void advance();
6301 private:
6302   const uint8_t *buf;
6303   const size_t len;
6304   const size_t lenminusstep;
6305   size_t idx;
6306 };
6307 
6308 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)6309 simdjson_unused static char * format_input_text_64(const uint8_t *text) {
6310   static char buf[sizeof(simd8x64<uint8_t>) + 1];
6311   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
6312     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
6313   }
6314   buf[sizeof(simd8x64<uint8_t>)] = '\0';
6315   return buf;
6316 }
6317 
6318 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)6319 simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
6320   static char buf[sizeof(simd8x64<uint8_t>) + 1];
6321   in.store(reinterpret_cast<uint8_t*>(buf));
6322   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
6323     if (buf[i] < ' ') { buf[i] = '_'; }
6324   }
6325   buf[sizeof(simd8x64<uint8_t>)] = '\0';
6326   return buf;
6327 }
6328 
format_mask(uint64_t mask)6329 simdjson_unused static char * format_mask(uint64_t mask) {
6330   static char buf[sizeof(simd8x64<uint8_t>) + 1];
6331   for (size_t i=0; i<64; i++) {
6332     buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
6333   }
6334   buf[64] = '\0';
6335   return buf;
6336 }
6337 
6338 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)6339 simdjson_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
6340 
6341 template<size_t STEP_SIZE>
block_index()6342 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
6343 
6344 template<size_t STEP_SIZE>
has_full_block() const6345 simdjson_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
6346   return idx < lenminusstep;
6347 }
6348 
6349 template<size_t STEP_SIZE>
full_block() const6350 simdjson_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
6351   return &buf[idx];
6352 }
6353 
6354 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const6355 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
6356   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
6357   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
6358   std::memcpy(dst, buf + idx, len - idx);
6359   return len - idx;
6360 }
6361 
6362 template<size_t STEP_SIZE>
advance()6363 simdjson_really_inline void buf_block_reader<STEP_SIZE>::advance() {
6364   idx += STEP_SIZE;
6365 }
6366 
6367 } // unnamed namespace
6368 } // namespace haswell
6369 } // namespace simdjson
6370 /* end file src/generic/stage1/buf_block_reader.h */
6371 /* begin file src/generic/stage1/json_string_scanner.h */
6372 namespace simdjson {
6373 namespace haswell {
6374 namespace {
6375 namespace stage1 {
6376 
6377 struct json_string_block {
6378   // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_string_blocksimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6379   simdjson_really_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) :
6380   _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {}
6381 
6382   // Escaped characters (characters following an escape() character)
escapedsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6383   simdjson_really_inline uint64_t escaped() const { return _escaped; }
6384   // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
escapesimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6385   simdjson_really_inline uint64_t escape() const { return _backslash & ~_escaped; }
6386   // Real (non-backslashed) quotes
quotesimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6387   simdjson_really_inline uint64_t quote() const { return _quote; }
6388   // Start quotes of strings
string_startsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6389   simdjson_really_inline uint64_t string_start() const { return _quote & _in_string; }
6390   // End quotes of strings
string_endsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6391   simdjson_really_inline uint64_t string_end() const { return _quote & ~_in_string; }
6392   // Only characters inside the string (not including the quotes)
string_contentsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6393   simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
6394   // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_inside_stringsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6395   simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
6396   // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_outside_stringsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6397   simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
6398   // Tail of string (everything except the start quote)
string_tailsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6399   simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
6400 
6401   // backslash characters
6402   uint64_t _backslash;
6403   // escaped characters (backslashed--does not include the hex characters after \u)
6404   uint64_t _escaped;
6405   // real quotes (non-backslashed ones)
6406   uint64_t _quote;
6407   // string characters (includes start quote but not end quote)
6408   uint64_t _in_string;
6409 };
6410 
6411 // Scans blocks for string characters, storing the state necessary to do so
6412 class json_string_scanner {
6413 public:
6414   simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
6415   // Returns either UNCLOSED_STRING or SUCCESS
6416   simdjson_really_inline error_code finish();
6417 
6418 private:
6419   // Intended to be defined by the implementation
6420   simdjson_really_inline uint64_t find_escaped(uint64_t escape);
6421   simdjson_really_inline uint64_t find_escaped_branchless(uint64_t escape);
6422 
6423   // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
6424   uint64_t prev_in_string = 0ULL;
6425   // Whether the first character of the next iteration is escaped.
6426   uint64_t prev_escaped = 0ULL;
6427 };
6428 
6429 //
6430 // Finds escaped characters (characters following \).
6431 //
6432 // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
6433 //
6434 // Does this by:
6435 // - Shift the escape mask to get potentially escaped characters (characters after backslashes).
6436 // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
6437 // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
6438 //
6439 // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
6440 // escape sequences, filters out the ones that start on even bits, and adds that to the mask of
6441 // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
6442 // the start bit causes a carry), and leaves even-bit sequences alone.
6443 //
6444 // Example:
6445 //
6446 // text           |  \\\ | \\\"\\\" \\\" \\"\\" |
6447 // escape         |  xxx |  xx xxx  xxx  xx xx  | Removed overflow backslash; will | it into follows_escape
6448 // odd_starts     |  x   |  x       x       x   | escape & ~even_bits & ~follows_escape
6449 // even_seq       |     c|    cxxx     c xx   c | c = carry bit -- will be masked out later
6450 // invert_mask    |      |     cxxx     c xx   c| even_seq << 1
6451 // follows_escape |   xx | x xx xxx  xxx  xx xx | Includes overflow bit
6452 // escaped        |   x  | x x  x x  x x  x  x  |
6453 // desired        |   x  | x x  x x  x x  x  x  |
6454 // text           |  \\\ | \\\"\\\" \\\" \\"\\" |
6455 //
find_escaped_branchless(uint64_t backslash)6456 simdjson_really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) {
6457   // If there was overflow, pretend the first character isn't a backslash
6458   backslash &= ~prev_escaped;
6459   uint64_t follows_escape = backslash << 1 | prev_escaped;
6460 
6461   // Get sequences starting on even bits by clearing out the odd series using +
6462   const uint64_t even_bits = 0x5555555555555555ULL;
6463   uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
6464   uint64_t sequences_starting_on_even_bits;
6465   prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
6466   uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
6467 
6468   // Mask every other backslashed character as an escaped character
6469   // Flip the mask for sequences that start on even bits, to correct them
6470   return (even_bits ^ invert_mask) & follows_escape;
6471 }
6472 
6473 //
6474 // Return a mask of all string characters plus end quotes.
6475 //
6476 // prev_escaped is overflow saying whether the next character is escaped.
6477 // prev_in_string is overflow saying whether we're still in a string.
6478 //
6479 // Backslash sequences outside of quotes will be detected in stage 2.
6480 //
next(const simd::simd8x64<uint8_t> & in)6481 simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
6482   const uint64_t backslash = in.eq('\\');
6483   const uint64_t escaped = find_escaped(backslash);
6484   const uint64_t quote = in.eq('"') & ~escaped;
6485 
6486   //
6487   // prefix_xor flips on bits inside the string (and flips off the end quote).
6488   //
6489   // Then we xor with prev_in_string: if we were in a string already, its effect is flipped
6490   // (characters inside strings are outside, and characters outside strings are inside).
6491   //
6492   const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
6493 
6494   //
6495   // Check if we're still in a string at the end of the box so the next block will know
6496   //
6497   // right shift of a signed value expected to be well-defined and standard
6498   // compliant as of C++20, John Regher from Utah U. says this is fine code
6499   //
6500   prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
6501 
6502   // Use ^ to turn the beginning quote off, and the end quote on.
6503 
6504   // We are returning a function-local object so either we get a move constructor
6505   // or we get copy elision.
6506   return json_string_block(
6507     backslash,
6508     escaped,
6509     quote,
6510     in_string
6511   );
6512 }
6513 
finish()6514 simdjson_really_inline error_code json_string_scanner::finish() {
6515   if (prev_in_string) {
6516     return UNCLOSED_STRING;
6517   }
6518   return SUCCESS;
6519 }
6520 
6521 } // namespace stage1
6522 } // unnamed namespace
6523 } // namespace haswell
6524 } // namespace simdjson
6525 /* end file src/generic/stage1/json_string_scanner.h */
6526 /* begin file src/generic/stage1/json_scanner.h */
6527 namespace simdjson {
6528 namespace haswell {
6529 namespace {
6530 namespace stage1 {
6531 
6532 /**
6533  * A block of scanned json, with information on operators and scalars.
6534  *
6535  * We seek to identify pseudo-structural characters. Anything that is inside
6536  * a string must be omitted (hence  & ~_string.string_tail()).
6537  * Otherwise, pseudo-structural characters come in two forms.
6538  * 1. We have the structural characters ([,],{,},:, comma). The
6539  *    term 'structural character' is from the JSON RFC.
6540  * 2. We have the 'scalar pseudo-structural characters'.
6541  *    Scalars are quotes, and any character except structural characters and white space.
6542  *
6543  * To identify the scalar pseudo-structural characters, we must look at what comes
6544  * before them: it must be a space, a quote or a structural characters.
6545  * Starting with simdjson v0.3, we identify them by
6546  * negation: we identify everything that is followed by a non-quote scalar,
6547  * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
6548  */
6549 struct json_block {
6550 public:
6551   // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_blocksimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6552   simdjson_really_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
6553   _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
json_blocksimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6554   simdjson_really_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
6555   _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
6556 
6557   /**
6558    * The start of structurals.
6559    * In simdjson prior to v0.3, these were called the pseudo-structural characters.
6560    **/
structural_startsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6561   simdjson_really_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
6562   /** All JSON whitespace (i.e. not in a string) */
whitespacesimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6563   simdjson_really_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
6564 
6565   // Helpers
6566 
6567   /** Whether the given characters are inside a string (only works on non-quotes) */
non_quote_inside_stringsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6568   simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
6569   /** Whether the given characters are outside a string (only works on non-quotes) */
non_quote_outside_stringsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6570   simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
6571 
6572   // string and escape characters
6573   json_string_block _string;
6574   // whitespace, structural characters ('operators'), scalars
6575   json_character_block _characters;
6576   // whether the previous character was a scalar
6577   uint64_t _follows_potential_nonquote_scalar;
6578 private:
6579   // Potential structurals (i.e. disregarding strings)
6580 
6581   /**
6582    * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
6583    * They may reside inside a string.
6584    **/
potential_structural_startsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6585   simdjson_really_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
6586   /**
6587    * The start of non-operator runs, like 123, true and "abc".
6588    * It main reside inside a string.
6589    **/
potential_scalar_startsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6590   simdjson_really_inline uint64_t potential_scalar_start() const noexcept {
6591     // The term "scalar" refers to anything except structural characters and white space
6592     // (so letters, numbers, quotes).
6593     // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
6594     // then we know that it is irrelevant structurally.
6595     return _characters.scalar() & ~follows_potential_scalar();
6596   }
6597   /**
6598    * Whether the given character is immediately after a non-operator like 123, true.
6599    * The characters following a quote are not included.
6600    */
follows_potential_scalarsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6601   simdjson_really_inline uint64_t follows_potential_scalar() const noexcept {
6602     // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
6603     // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
6604     // white space.
6605     // It is understood that within quoted region, anything at all could be marked (irrelevant).
6606     return _follows_potential_nonquote_scalar;
6607   }
6608 };
6609 
/**
 * Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
 *
 * The scanner starts by calculating two distinct things:
 * - string characters (taking \" into account)
 * - structural characters or 'operators' ([]{},:, comma)
 *   and scalars (runs of non-operators like 123, true and "abc")
 *
 * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
 * in particular, the operator/scalar bit will find plenty of things that are actually part of
 * strings. When we're done, json_block will fuse the two together by masking out tokens that are
 * part of a string.
 *
 * The scanner is stateful: next() must be fed consecutive blocks in order, because
 * both members below carry information from one block into the following one.
 */
class json_scanner {
public:
  json_scanner() {}
  // Scans one block of input and returns its json_block; updates the carried state.
  simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
  // Returns either UNCLOSED_STRING or SUCCESS (forwards to the string scanner).
  simdjson_really_inline error_code finish();

private:
  // Whether the last character of the previous iteration is part of a scalar token
  // (anything except whitespace or a structural character/'operator').
  // Passed to follows() as the carry into the first position of the next block.
  uint64_t prev_scalar = 0ULL;
  // Tracks strings (quotes and escapes) across blocks.
  json_string_scanner string_scanner{};
};
6636 
6637 
//
// Marks every position that immediately follows a matching position.
//
// `match` is shifted up by one bit so that bit i+1 of the result is set when
// bit i of `match` is set. `overflow` is the carry between consecutive calls:
// on entry it supplies bit 0 of the result (whether the last position of the
// previous block matched), and on exit it holds the top bit of `match` for the
// next call.
//
simdjson_really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
  const uint64_t carry_in = overflow;
  overflow = match >> 63;
  return (match << 1) | carry_in;
}
6650 
next(const simd::simd8x64<uint8_t> & in)6651 simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
6652   json_string_block strings = string_scanner.next(in);
6653   // identifies the white-space and the structurat characters
6654   json_character_block characters = json_character_block::classify(in);
6655   // The term "scalar" refers to anything except structural characters and white space
6656   // (so letters, numbers, quotes).
6657   // We want  follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
6658   //
6659   // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
6660   // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
6661   // pseudo-structural character just like we would if we had  ' "a string" true '; otherwise we
6662   // may need to add an extra check when parsing strings.
6663   //
6664   // Performance: there are many ways to skin this cat.
6665   const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote();
6666   uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar);
6667   // We are returning a function-local object so either we get a move constructor
6668   // or we get copy elision.
6669   return json_block(
6670     strings,// strings is a function-local object so either it moves or the copy is elided.
6671     characters,
6672     follows_nonquote_scalar
6673   );
6674 }
6675 
finish()6676 simdjson_really_inline error_code json_scanner::finish() {
6677   return string_scanner.finish();
6678 }
6679 
6680 } // namespace stage1
6681 } // unnamed namespace
6682 } // namespace haswell
6683 } // namespace simdjson
6684 /* end file src/generic/stage1/json_scanner.h */
6685 /* begin file src/generic/stage1/json_minifier.h */
6686 // This file contains the common code every implementation uses in stage1
6687 // It is intended to be included multiple times and compiled multiple times
6688 // We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
6690 
6691 namespace simdjson {
6692 namespace haswell {
6693 namespace {
6694 namespace stage1 {
6695 
/**
 * Copies JSON from an input buffer to an output buffer while dropping the
 * whitespace that is outside of strings.
 *
 * minify() is the only public entry point; it creates a json_minifier
 * internally and drives it block by block.
 */
class json_minifier {
public:
  /**
   * Minifies buf[0..len) into dst.
   *
   * @param buf     input bytes
   * @param len     number of input bytes
   * @param dst     output buffer
   * @param dst_len set to the number of bytes written on success, 0 on error
   * @return the result of finish(): the scanner's error, or SUCCESS
   */
  template<size_t STEP_SIZE>
  static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;

private:
  // Starts writing at _dst.
  simdjson_really_inline json_minifier(uint8_t *_dst)
  : dst{_dst}
  {}
  // Processes one STEP_SIZE-byte block (specialized for 64 and 128) and advances the reader.
  template<size_t STEP_SIZE>
  simdjson_really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
  // Writes the non-whitespace bytes of one 64-byte input to dst and advances dst.
  simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
  // Checks the scanner's final state and reports the output length.
  simdjson_really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
  // Finds strings and whitespace in each block.
  json_scanner scanner{};
  // Current write position in the output.
  uint8_t *dst;
};
6712 
next(const simd::simd8x64<uint8_t> & in,const json_block & block)6713 simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
6714   uint64_t mask = block.whitespace();
6715   in.compress(mask, dst);
6716   dst += 64 - count_ones(mask);
6717 }
6718 
finish(uint8_t * dst_start,size_t & dst_len)6719 simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
6720   error_code error = scanner.finish();
6721   if (error) { dst_len = 0; return error; }
6722   dst_len = dst - dst_start;
6723   return SUCCESS;
6724 }
6725 
6726 template<>
step(const uint8_t * block_buf,buf_block_reader<128> & reader)6727 simdjson_really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
6728   simd::simd8x64<uint8_t> in_1(block_buf);
6729   simd::simd8x64<uint8_t> in_2(block_buf+64);
6730   json_block block_1 = scanner.next(in_1);
6731   json_block block_2 = scanner.next(in_2);
6732   this->next(in_1, block_1);
6733   this->next(in_2, block_2);
6734   reader.advance();
6735 }
6736 
6737 template<>
step(const uint8_t * block_buf,buf_block_reader<64> & reader)6738 simdjson_really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
6739   simd::simd8x64<uint8_t> in_1(block_buf);
6740   json_block block_1 = scanner.next(in_1);
6741   this->next(block_buf, block_1);
6742   reader.advance();
6743 }
6744 
6745 template<size_t STEP_SIZE>
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len)6746 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
6747   buf_block_reader<STEP_SIZE> reader(buf, len);
6748   json_minifier minifier(dst);
6749 
6750   // Index the first n-1 blocks
6751   while (reader.has_full_block()) {
6752     minifier.step<STEP_SIZE>(reader.full_block(), reader);
6753   }
6754 
6755   // Index the last (remainder) block, padded with spaces
6756   uint8_t block[STEP_SIZE];
6757   size_t remaining_bytes = reader.get_remainder(block);
6758   if (remaining_bytes > 0) {
6759     // We do not want to write directly to the output stream. Rather, we write
6760     // to a local buffer (for safety).
6761     uint8_t out_block[STEP_SIZE];
6762     uint8_t * const guarded_dst{minifier.dst};
6763     minifier.dst = out_block;
6764     minifier.step<STEP_SIZE>(block, reader);
6765     size_t to_write = minifier.dst - out_block;
6766     // In some cases, we could be enticed to consider the padded spaces
6767     // as part of the string. This is fine as long as we do not write more
6768     // than we consumed.
6769     if(to_write > remaining_bytes) { to_write = remaining_bytes; }
6770     memcpy(guarded_dst, out_block, to_write);
6771     minifier.dst = guarded_dst + to_write;
6772   }
6773   return minifier.finish(dst, dst_len);
6774 }
6775 
6776 } // namespace stage1
6777 } // unnamed namespace
6778 } // namespace haswell
6779 } // namespace simdjson
6780 /* end file src/generic/stage1/json_minifier.h */
6781 /* begin file src/generic/stage1/find_next_document_index.h */
6782 namespace simdjson {
6783 namespace haswell {
6784 namespace {
6785 
6786 /**
6787   * This algorithm is used to quickly identify the last structural position that
6788   * makes up a complete document.
6789   *
6790   * It does this by going backwards and finding the last *document boundary* (a
6791   * place where one value follows another without a comma between them). If the
6792   * last document (the characters after the boundary) has an equal number of
6793   * start and end brackets, it is considered complete.
6794   *
6795   * Simply put, we iterate over the structural characters, starting from
6796   * the end. We consider that we found the end of a JSON document when the
6797   * first element of the pair is NOT one of these characters: '{' '[' ';' ','
6798   * and when the second element is NOT one of these characters: '}' '}' ';' ','.
6799   *
6800   * This simple comparison works most of the time, but it does not cover cases
6801   * where the batch's structural indexes contain a perfect amount of documents.
6802   * In such a case, we do not have access to the structural index which follows
6803   * the last document, therefore, we do not have access to the second element in
6804   * the pair, and that means we cannot identify the last document. To fix this
6805   * issue, we keep a count of the open and closed curly/square braces we found
6806   * while searching for the pair. When we find a pair AND the count of open and
6807   * closed curly/square braces is the same, we know that we just passed a
6808   * complete document, therefore the last json buffer location is the end of the
6809   * batch.
6810   */
find_next_document_index(dom_parser_implementation & parser)6811 simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
6812   // TODO don't count separately, just figure out depth
6813   auto arr_cnt = 0;
6814   auto obj_cnt = 0;
6815   for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
6816     auto idxb = parser.structural_indexes[i];
6817     switch (parser.buf[idxb]) {
6818     case ':':
6819     case ',':
6820       continue;
6821     case '}':
6822       obj_cnt--;
6823       continue;
6824     case ']':
6825       arr_cnt--;
6826       continue;
6827     case '{':
6828       obj_cnt++;
6829       break;
6830     case '[':
6831       arr_cnt++;
6832       break;
6833     }
6834     auto idxa = parser.structural_indexes[i - 1];
6835     switch (parser.buf[idxa]) {
6836     case '{':
6837     case '[':
6838     case ':':
6839     case ',':
6840       continue;
6841     }
6842     // Last document is complete, so the next document will appear after!
6843     if (!arr_cnt && !obj_cnt) {
6844       return parser.n_structural_indexes;
6845     }
6846     // Last document is incomplete; mark the document at i + 1 as the next one
6847     return i;
6848   }
6849   return 0;
6850 }
6851 
6852 } // unnamed namespace
6853 } // namespace haswell
6854 } // namespace simdjson
6855 /* end file src/generic/stage1/find_next_document_index.h */
6856 
6857 namespace simdjson {
6858 namespace haswell {
6859 namespace {
6860 namespace stage1 {
6861 
/**
 * Appends the positions of the set bits of 64-bit masks to an array of
 * uint32_t indexes.
 */
class bit_indexer {
public:
  // Next free slot in the index buffer; advanced by write().
  uint32_t *tail;

  // Starts writing at index_buf.
  simdjson_really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}

  // Flatten out the values in 'bits': each set bit is assumed to stand for the
  // value idx plus its position in the bitvector. These values are stored at
  // tail[0], tail[1], ... and tail is then advanced by the number of set bits.
  // This will potentially store extra values beyond the end of the valid bits
  // (the loops below always run in groups of 8), so the buffer behind tail
  // needs to be large enough to handle this.
  simdjson_really_inline void write(uint32_t idx, uint64_t bits) {
    // In some instances, the next branch is expensive because it is mispredicted.
    // Unfortunately, in other cases,
    // it helps tremendously.
    if (bits == 0)
        return;
    // Number of valid entries; only this many slots are kept (see tail += cnt).
    int cnt = static_cast<int>(count_ones(bits));

    // Do the first 8 all together. This runs unconditionally, so entries past
    // cnt are scratch values that a later write() overwrites.
    for (int i=0; i<8; i++) {
      this->tail[i] = idx + trailing_zeroes(bits);
      bits = clear_lowest_bit(bits);
    }

    // Do the next 8 all together (we hope in most cases it won't happen at all
    // and the branch is easily predicted).
    if (simdjson_unlikely(cnt > 8)) {
      for (int i=8; i<16; i++) {
        this->tail[i] = idx + trailing_zeroes(bits);
        bits = clear_lowest_bit(bits);
      }

      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
      // or the start of a value ("abc" true 123) every four characters.
      if (simdjson_unlikely(cnt > 16)) {
        // Remaining entries are written one at a time, exactly up to cnt.
        int i = 16;
        do {
          this->tail[i] = idx + trailing_zeroes(bits);
          bits = clear_lowest_bit(bits);
          i++;
        } while (i < cnt);
      }
    }

    // Keep only the valid entries.
    this->tail += cnt;
  }
};
6911 
/**
 * Stage 1 driver: scans the input block by block, validates UTF-8 along the
 * way, and records the positions of the structural starts in the parser's
 * structural_indexes array.
 */
class json_structural_indexer {
public:
  /**
   * Find the important bits of JSON, STEP_SIZE bytes at a time, and add them to
   * structural_indexes.
   *
   * @param buf the input bytes
   * @param len the number of input bytes
   * @param parser receives the structural indexes and their count
   * @param partial Setting the partial parameter to true allows the find_structural_bits to
   *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
   *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
   * @return CAPACITY if len exceeds parser.capacity(), EMPTY if there is no input,
   *   otherwise the result of finish()
   */
  template<size_t STEP_SIZE>
  static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;

private:
  // Starts writing indexes at structural_indexes.
  simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes);
  // Processes one STEP_SIZE-byte block (specialized for 64 and 128) and advances the reader.
  template<size_t STEP_SIZE>
  simdjson_really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
  // Handles one 64-byte input whose first byte is at offset idx.
  simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
  // Flushes the pending structurals, checks for errors and finalizes the parser's index.
  simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);

  // Finds strings, operators and scalars in each block.
  json_scanner scanner{};
  // UTF-8 validation, fed every input block.
  utf8_checker checker{};
  // Turns structural bitmasks into index entries.
  bit_indexer indexer;
  // Structural bitmask of the previous 64-byte input; written out on the
  // following next() call (or by finish() for the last one).
  uint64_t prev_structurals = 0;
  // Accumulates, across all blocks, the bytes <= 0x1F found inside strings;
  // non-zero means UNESCAPED_CHARS.
  uint64_t unescaped_chars_error = 0;
};
6937 
json_structural_indexer(uint32_t * structural_indexes)6938 simdjson_really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
6939 
// Skip the last character if it is partial.
//
// Looks at up to the last three bytes of buf[0..len) and returns the length
// with a trailing, incomplete multi-byte UTF-8 character cut off; returns len
// unchanged when no such character is found. Each check only runs when the
// buffer is long enough for the byte it inspects.
simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  // Last byte starts a 2-, 3- or 4-byte character: only 1 byte of it is present.
  if (len >= 1 && buf[len-1] >= 0b11000000) { return len-1; }
  // Second-to-last byte starts a 3- or 4-byte character: only 2 bytes are present.
  if (len >= 2 && buf[len-2] >= 0b11100000) { return len-2; }
  // Third-to-last byte starts a 4-byte character: only 3 bytes are present.
  if (len >= 3 && buf[len-3] >= 0b11110000) { return len-3; }
  return len;
}
6960 
6961 //
6962 // PERF NOTES:
6963 // We pipe 2 inputs through these stages:
6964 // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
6966 // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
6967 //    The output of step 1 depends entirely on this information. These functions don't quite use
6968 //    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
6969 //    at a time. The second input's scans has some dependency on the first ones finishing it, but
6970 //    they can make a lot of progress before they need that information.
6971 // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
6972 //    to finish: utf-8 checks and generating the output from the last iteration.
6973 //
6974 // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
6975 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
6976 // workout.
6977 //
6978 template<size_t STEP_SIZE>
index(const uint8_t * buf,size_t len,dom_parser_implementation & parser,bool partial)6979 error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
6980   if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
6981   if (partial) { len = trim_partial_utf8(buf, len); }
6982 
6983   buf_block_reader<STEP_SIZE> reader(buf, len);
6984   json_structural_indexer indexer(parser.structural_indexes.get());
6985 
6986   // Read all but the last block
6987   while (reader.has_full_block()) {
6988     indexer.step<STEP_SIZE>(reader.full_block(), reader);
6989   }
6990 
6991   // Take care of the last block (will always be there unless file is empty)
6992   uint8_t block[STEP_SIZE];
6993   if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
6994   indexer.step<STEP_SIZE>(block, reader);
6995 
6996   return indexer.finish(parser, reader.block_index(), len, partial);
6997 }
6998 
6999 template<>
step(const uint8_t * block,buf_block_reader<128> & reader)7000 simdjson_really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
7001   simd::simd8x64<uint8_t> in_1(block);
7002   simd::simd8x64<uint8_t> in_2(block+64);
7003   json_block block_1 = scanner.next(in_1);
7004   json_block block_2 = scanner.next(in_2);
7005   this->next(in_1, block_1, reader.block_index());
7006   this->next(in_2, block_2, reader.block_index()+64);
7007   reader.advance();
7008 }
7009 
7010 template<>
step(const uint8_t * block,buf_block_reader<64> & reader)7011 simdjson_really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
7012   simd::simd8x64<uint8_t> in_1(block);
7013   json_block block_1 = scanner.next(in_1);
7014   this->next(in_1, block_1, reader.block_index());
7015   reader.advance();
7016 }
7017 
next(const simd::simd8x64<uint8_t> & in,const json_block & block,size_t idx)7018 simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
7019   uint64_t unescaped = in.lteq(0x1F);
7020   checker.check_next_input(in);
7021   indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
7022   prev_structurals = block.structural_start();
7023   unescaped_chars_error |= block.non_quote_inside_string(unescaped);
7024 }
7025 
finish(dom_parser_implementation & parser,size_t idx,size_t len,bool partial)7026 simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
7027   // Write out the final iteration's structurals
7028   indexer.write(uint32_t(idx-64), prev_structurals);
7029 
7030   error_code error = scanner.finish();
7031   // We deliberately break down the next expression so that it is
7032   // human readable.
7033   const bool should_we_exit =  partial ?
7034     ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
7035     : (error != SUCCESS); // if partial is false, we must have SUCCESS
7036   const bool have_unclosed_string = (error == UNCLOSED_STRING);
7037   if (simdjson_unlikely(should_we_exit)) { return error; }
7038 
7039   if (unescaped_chars_error) {
7040     return UNESCAPED_CHARS;
7041   }
7042 
7043   parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
7044   /***
7045    * This is related to https://github.com/simdjson/simdjson/issues/906
7046    * Basically, we want to make sure that if the parsing continues beyond the last (valid)
7047    * structural character, it quickly stops.
7048    * Only three structural characters can be repeated without triggering an error in JSON:  [,] and }.
7049    * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
7050    * continues, then it must be [,] or }.
7051    * Suppose it is ] or }. We backtrack to the first character, what could it be that would
7052    * not trigger an error? It could be ] or } but no, because you can't start a document that way.
7053    * It can't be a comma, a colon or any simple value. So the only way we could continue is
7054    * if the repeated character is [. But if so, the document must start with [. But if the document
7055    * starts with [, it should end with ]. If we enforce that rule, then we would get
7056    * ][[ which is invalid.
7057    **/
7058   parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
7059   parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
7060   parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
7061   parser.next_structural_index = 0;
7062   // a valid JSON file cannot have zero structural indexes - we should have found something
7063   if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
7064     return EMPTY;
7065   }
7066   if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
7067     return UNEXPECTED_ERROR;
7068   }
7069   if (partial) {
7070     // If we have an unclosed string, then the last structural
7071     // will be the quote and we want to make sure to omit it.
7072     if(have_unclosed_string) {
7073       parser.n_structural_indexes--;
7074       // a valid JSON file cannot have zero structural indexes - we should have found something
7075       if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
7076     }
7077     auto new_structural_indexes = find_next_document_index(parser);
7078     if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
7079       return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
7080     }
7081     parser.n_structural_indexes = new_structural_indexes;
7082   }
7083   checker.check_eof();
7084   return checker.errors();
7085 }
7086 
7087 } // namespace stage1
7088 } // unnamed namespace
7089 } // namespace haswell
7090 } // namespace simdjson
7091 /* end file src/generic/stage1/json_structural_indexer.h */
7092 /* begin file src/generic/stage1/utf8_validator.h */
7093 namespace simdjson {
7094 namespace haswell {
7095 namespace {
7096 namespace stage1 {
7097 
7098 /**
7099  * Validates that the string is actual UTF-8.
7100  */
7101 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)7102 bool generic_validate_utf8(const uint8_t * input, size_t length) {
7103     checker c{};
7104     buf_block_reader<64> reader(input, length);
7105     while (reader.has_full_block()) {
7106       simd::simd8x64<uint8_t> in(reader.full_block());
7107       c.check_next_input(in);
7108       reader.advance();
7109     }
7110     uint8_t block[64]{};
7111     reader.get_remainder(block);
7112     simd::simd8x64<uint8_t> in(block);
7113     c.check_next_input(in);
7114     reader.advance();
7115     c.check_eof();
7116     return c.errors() == error_code::SUCCESS;
7117 }
7118 
generic_validate_utf8(const char * input,size_t length)7119 bool generic_validate_utf8(const char * input, size_t length) {
7120     return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
7121 }
7122 
7123 } // namespace stage1
7124 } // unnamed namespace
7125 } // namespace haswell
7126 } // namespace simdjson
7127 /* end file src/generic/stage1/utf8_validator.h */
7128 
7129 //
7130 // Stage 2
7131 //
7132 /* begin file src/generic/stage2/tape_builder.h */
7133 /* begin file src/generic/stage2/json_iterator.h */
7134 /* begin file src/generic/stage2/logger.h */
7135 // This is for an internal-only stage 2 specific logger.
7136 // Set LOG_ENABLED = true to log what stage 2 is doing!
7137 namespace simdjson {
7138 namespace haswell {
7139 namespace {
7140 namespace logger {
7141 
7142   static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
7143 
// Compile-time switch for all logging in this namespace: the log_* functions
// only print when LOG_ENABLED is true.
#if SIMDJSON_VERBOSE_LOGGING
  static constexpr const bool LOG_ENABLED = true;
#else
  static constexpr const bool LOG_ENABLED = false;
#endif
  // Column widths of the log table.
  // "Event" column (indentation + title).
  static constexpr const int LOG_EVENT_LEN = 20;
  // "Buffer" column: number of bytes shown starting at the current structural.
  static constexpr const int LOG_BUFFER_LEN = 30;
  // "Next" column: number of bytes shown starting at the next structural.
  static constexpr const int LOG_SMALL_BUFFER_LEN = 10;
  // Width of the structural index column printed by log_line.
  static constexpr const int LOG_INDEX_LEN = 5;

  // Current nesting depth; log_line indents titles by two spaces per level
  // and log_start resets it to 0.
  static int log_depth; // Not threadsafe. Log only.
7155 
7156   // Helper to turn unprintable or newline characters into spaces
printable_char(char c)7157   static simdjson_really_inline char printable_char(char c) {
7158     if (c >= 0x20) {
7159       return c;
7160     } else {
7161       return ' ';
7162     }
7163   }
7164 
7165   // Print the header and set up log_start
log_start()7166   static simdjson_really_inline void log_start() {
7167     if (LOG_ENABLED) {
7168       log_depth = 0;
7169       printf("\n");
7170       printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
7171       printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
7172     }
7173   }
7174 
  // Prints `message` followed by a newline, outside of the table layout.
  // Does nothing unless LOG_ENABLED is set.
  simdjson_unused static simdjson_really_inline void log_string(const char *message) {
    if (LOG_ENABLED) {
      printf("%s\n", message);
    }
  }
7180 
7181   // Logs a single line from the stage 2 DOM parser
7182   template<typename S>
log_line(S & structurals,const char * title_prefix,const char * title,const char * detail)7183   static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
7184     if (LOG_ENABLED) {
7185       printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
7186       auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
7187       auto next_index = structurals.next_structural;
7188       auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>("                                                       ");
7189       auto next = &structurals.buf[*next_index];
7190       {
7191         // Print the next N characters in the buffer.
7192         printf("| ");
7193         // Otherwise, print the characters starting from the buffer position.
7194         // Print spaces for unprintable or newline characters.
7195         for (int i=0;i<LOG_BUFFER_LEN;i++) {
7196           printf("%c", printable_char(current[i]));
7197         }
7198         printf(" ");
7199         // Print the next N characters in the buffer.
7200         printf("| ");
7201         // Otherwise, print the characters starting from the buffer position.
7202         // Print spaces for unprintable or newline characters.
7203         for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
7204           printf("%c", printable_char(next[i]));
7205         }
7206         printf(" ");
7207       }
7208       if (current_index) {
7209         printf("| %*u ", LOG_INDEX_LEN, *current_index);
7210       } else {
7211         printf("| %-*s ", LOG_INDEX_LEN, "");
7212       }
7213       // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
7214       printf("| %-s ", detail);
7215       printf("|\n");
7216     }
7217   }
7218 
7219 } // namespace logger
7220 } // unnamed namespace
7221 } // namespace haswell
7222 } // namespace simdjson
7223 /* end file src/generic/stage2/logger.h */
7224 
7225 namespace simdjson {
7226 namespace haswell {
7227 namespace {
7228 namespace stage2 {
7229 
/**
 * Walks the structural tokens produced by stage 1 and reports what it finds to a visitor.
 */
class json_iterator {
public:
  /** The JSON text being walked (taken from the parser's buf by the constructor). */
  const uint8_t* const buf;
  /** Points at the structural index of the token that peek()/advance() will return next. */
  uint32_t *next_structural;
  /** The parser whose structural indexes and per-depth state this iterator uses. */
  dom_parser_implementation &dom_parser;
  /** Container nesting depth; walk_document() raises it on entering a non-empty object/array and lowers it when one ends. */
  uint32_t depth{0};

  /**
   * Walk the JSON document.
   *
   * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
   * the first parameter; some callbacks have other parameters as well:
   *
   * - visit_document_start() - at the beginning.
   * - visit_document_end() - at the end (if things were successful).
   *
   * - visit_array_start() - at the start `[` of a non-empty array.
   * - visit_array_end() - at the end `]` of a non-empty array.
   * - visit_empty_array() - when an empty array is encountered.
   *
   * - visit_object_start() - at the start `{` of a non-empty object.
   * - visit_object_end() - at the end `}` of a non-empty object.
   * - visit_empty_object() - when an empty object is encountered.
   * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
   *                                   guaranteed to point at the first quote of the string (`"key"`).
   * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
   * - visit_root_primitive(const uint8_t *value) - when the top-level value is a string, number, boolean or null.
   *
   * - increment_count() - each time a value is found in an array or object.
   */
  template<bool STREAMING, typename V>
  simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;

  /**
   * Create an iterator capable of walking a JSON document.
   *
   * The document must have already passed through stage 1.
   *
   * @param _dom_parser the parser holding the buffer and the structural indexes.
   * @param start_structural_index position in the structural indexes at which to start walking.
   */
  simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);

  /**
   * Look at the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *peek() const noexcept;
  /**
   * Advance to the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *advance() noexcept;
  /**
   * Get the remaining length of the document, from the start of the current token.
   *
   * The current token is the one most recently returned by advance(), so this must not be
   * called before the first advance().
   */
  simdjson_really_inline size_t remaining_len() const noexcept;
  /**
   * Check if we are at the end of the document.
   *
   * If this is true, there are no more tokens.
   */
  simdjson_really_inline bool at_eof() const noexcept;
  /**
   * Check if we are at the beginning of the document.
   */
  simdjson_really_inline bool at_beginning() const noexcept;
  /**
   * Get the character in the buffer at the last structural index.
   */
  simdjson_really_inline uint8_t last_structural() const noexcept;

  /**
   * Log that a value has been found.
   *
   * Output is produced only when logger::LOG_ENABLED is true, i.e. when the code is compiled
   * with SIMDJSON_VERBOSE_LOGGING set to a nonzero value.
   */
  simdjson_really_inline void log_value(const char *type) const noexcept;
  /**
   * Log the start of a multipart value.
   *
   * Output is produced only when logger::LOG_ENABLED is true, i.e. when the code is compiled
   * with SIMDJSON_VERBOSE_LOGGING set to a nonzero value.
   */
  simdjson_really_inline void log_start_value(const char *type) const noexcept;
  /**
   * Log the end of a multipart value.
   *
   * Output is produced only when logger::LOG_ENABLED is true, i.e. when the code is compiled
   * with SIMDJSON_VERBOSE_LOGGING set to a nonzero value.
   */
  simdjson_really_inline void log_end_value(const char *type) const noexcept;
  /**
   * Log an error.
   *
   * Output is produced only when logger::LOG_ENABLED is true, i.e. when the code is compiled
   * with SIMDJSON_VERBOSE_LOGGING set to a nonzero value.
   */
  simdjson_really_inline void log_error(const char *error) const noexcept;

  /**
   * Dispatch a top-level primitive to the matching visit_root_* callback of the visitor, chosen
   * by the first character of the token; returns TAPE_ERROR if that character cannot start a value.
   */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
  /**
   * Dispatch a primitive found inside an array or object to the matching visit_* callback of the
   * visitor, chosen by the first character of the token; returns TAPE_ERROR if that character
   * cannot start a value.
   */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
};
7332 
/**
 * Walk one JSON document, reporting every value to the visitor.
 *
 * Written as a goto-driven state machine. The labels are the states (object_begin, object_field,
 * object_continue, array_begin, array_value, array_continue, scope_end, document_end); the kind
 * of each open container is remembered in dom_parser.is_array[depth], so no recursion is needed.
 *
 * @tparam STREAMING when false, a root object/array must be closed by the last structural and
 *         every structural index must have been consumed at the end; both checks are skipped
 *         when true.
 * @param visitor object receiving the visit_* and increment_count callbacks. Each call is wrapped
 *        in SIMDJSON_TRY (a macro defined elsewhere; presumably it returns the callback's error
 *        code when it is not SUCCESS).
 * @return EMPTY when there is no token to read, DEPTH_ERROR when nesting reaches
 *         dom_parser.max_depth(), TAPE_ERROR on a structural error, otherwise SUCCESS.
 */
template<bool STREAMING, typename V>
simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
  logger::log_start();

  //
  // Start the document
  //
  if (at_eof()) { return EMPTY; }
  log_start_value("document");
  SIMDJSON_TRY( visitor.visit_document_start(*this) );

  //
  // Read first value
  //
  {
    auto value = advance();

    // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
    // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
    if (!STREAMING) {
      switch (*value) {
        case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
        case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
      }
    }

    // Empty containers are reported in one callback; non-empty ones enter the matching state.
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
    }
  }
  goto document_end;

//
// Object parser states
//
object_begin:
  // Reached after consuming a `{` that is not immediately followed by `}`.
  log_start_value("object");
  depth++;
  if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
  dom_parser.is_array[depth] = false;
  SIMDJSON_TRY( visitor.visit_object_start(*this) );

  // The first field of the object: its key must be a string.
  {
    auto key = advance();
    if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
    SIMDJSON_TRY( visitor.increment_count(*this) );
    SIMDJSON_TRY( visitor.visit_key(*this, key) );
  }

object_field:
  // A key has just been visited: expect `:` followed by the field's value.
  if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
  {
    auto value = advance();
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
    }
  }

object_continue:
  // A field value has just ended: `,` starts another field, `}` closes the object.
  switch (*advance()) {
    case ',':
      SIMDJSON_TRY( visitor.increment_count(*this) );
      {
        auto key = advance();
        if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
        SIMDJSON_TRY( visitor.visit_key(*this, key) );
      }
      goto object_field;
    case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
    default: log_error("No comma between object fields"); return TAPE_ERROR;
  }

scope_end:
  // A container has just been closed: go up one level and resume in the state that matches the
  // enclosing container, or finish the document when the root container was the one closed.
  depth--;
  if (depth == 0) { goto document_end; }
  if (dom_parser.is_array[depth]) { goto array_continue; }
  goto object_continue;

//
// Array parser states
//
array_begin:
  // Reached after consuming a `[` that is not immediately followed by `]`.
  log_start_value("array");
  depth++;
  if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
  dom_parser.is_array[depth] = true;
  SIMDJSON_TRY( visitor.visit_array_start(*this) );
  SIMDJSON_TRY( visitor.increment_count(*this) );

array_value:
  {
    auto value = advance();
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
    }
  }

array_continue:
  // An element has just ended: `,` starts another element, `]` closes the array.
  switch (*advance()) {
    case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
    case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
    default: log_error("Missing comma between array values"); return TAPE_ERROR;
  }

document_end:
  log_end_value("document");
  SIMDJSON_TRY( visitor.visit_document_end(*this) );

  // Store how far we got, as an offset into the structural indexes, back in the parser.
  dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);

  // If we didn't make it to the end, it's an error
  if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
    log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
    return TAPE_ERROR;
  }

  return SUCCESS;

} // walk_document()
7458 
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)7459 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
7460   : buf{_dom_parser.buf},
7461     next_structural{&_dom_parser.structural_indexes[start_structural_index]},
7462     dom_parser{_dom_parser} {
7463 }
7464 
peek() const7465 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
7466   return &buf[*(next_structural)];
7467 }
advance()7468 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
7469   return &buf[*(next_structural++)];
7470 }
remaining_len() const7471 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
7472   return dom_parser.len - *(next_structural-1);
7473 }
7474 
at_eof() const7475 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
7476   return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
7477 }
at_beginning() const7478 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
7479   return next_structural == dom_parser.structural_indexes.get();
7480 }
last_structural() const7481 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
7482   return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
7483 }
7484 
log_value(const char * type) const7485 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
7486   logger::log_line(*this, "", type, "");
7487 }
7488 
log_start_value(const char * type) const7489 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
7490   logger::log_line(*this, "+", type, "");
7491   if (logger::LOG_ENABLED) { logger::log_depth++; }
7492 }
7493 
log_end_value(const char * type) const7494 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
7495   if (logger::LOG_ENABLED) { logger::log_depth--; }
7496   logger::log_line(*this, "-", type, "");
7497 }
7498 
log_error(const char * error) const7499 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
7500   logger::log_line(*this, "", "ERROR", error);
7501 }
7502 
7503 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)7504 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
7505   switch (*value) {
7506     case '"': return visitor.visit_root_string(*this, value);
7507     case 't': return visitor.visit_root_true_atom(*this, value);
7508     case 'f': return visitor.visit_root_false_atom(*this, value);
7509     case 'n': return visitor.visit_root_null_atom(*this, value);
7510     case '-':
7511     case '0': case '1': case '2': case '3': case '4':
7512     case '5': case '6': case '7': case '8': case '9':
7513       return visitor.visit_root_number(*this, value);
7514     default:
7515       log_error("Document starts with a non-value character");
7516       return TAPE_ERROR;
7517   }
7518 }
7519 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)7520 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
7521   switch (*value) {
7522     case '"': return visitor.visit_string(*this, value);
7523     case 't': return visitor.visit_true_atom(*this, value);
7524     case 'f': return visitor.visit_false_atom(*this, value);
7525     case 'n': return visitor.visit_null_atom(*this, value);
7526     case '-':
7527     case '0': case '1': case '2': case '3': case '4':
7528     case '5': case '6': case '7': case '8': case '9':
7529       return visitor.visit_number(*this, value);
7530     default:
7531       log_error("Non-value found when value was expected!");
7532       return TAPE_ERROR;
7533   }
7534 }
7535 
7536 } // namespace stage2
7537 } // unnamed namespace
7538 } // namespace haswell
7539 } // namespace simdjson
7540 /* end file src/generic/stage2/json_iterator.h */
7541 /* begin file src/generic/stage2/tape_writer.h */
7542 namespace simdjson {
7543 namespace haswell {
7544 namespace {
7545 namespace stage2 {
7546 
/**
 * Write cursor over the tape: every append stores one or two 64-bit entries at next_tape_loc
 * and moves the cursor past them.
 */
struct tape_writer {
  /** The next place to write to tape */
  uint64_t *next_tape_loc;

  /** Write a signed 64-bit value to tape. */
  simdjson_really_inline void append_s64(int64_t value) noexcept;

  /** Write an unsigned 64-bit value to tape. */
  simdjson_really_inline void append_u64(uint64_t value) noexcept;

  /** Write a double value to tape. */
  simdjson_really_inline void append_double(double value) noexcept;

  /**
   * Append a tape entry (an 8-bit type, and 56 bits worth of value).
   */
  simdjson_really_inline void append(uint64_t val, internal::tape_type t) noexcept;

  /**
   * Skip the current tape entry without writing.
   *
   * Used to skip the start of the container, since we'll come back later to fill it in when the
   * container ends.
   */
  simdjson_really_inline void skip() noexcept;

  /**
   * Skip the number of tape entries necessary to write a large u64 or i64.
   */
  simdjson_really_inline void skip_large_integer() noexcept;

  /**
   * Skip the number of tape entries necessary to write a double.
   */
  simdjson_really_inline void skip_double() noexcept;

  /**
   * Write a value to a known location on tape.
   *
   * Used to go back and write out the start of a container after the container ends.
   */
  simdjson_really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;

private:
  /**
   * Append both the tape entry, and a supplementary value following it. Used for types that need
   * all 64 bits, such as double and uint64_t.
   */
  template<typename T>
  simdjson_really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
}; // struct tape_writer
7598 
append_s64(int64_t value)7599 simdjson_really_inline void tape_writer::append_s64(int64_t value) noexcept {
7600   append2(0, value, internal::tape_type::INT64);
7601 }
7602 
append_u64(uint64_t value)7603 simdjson_really_inline void tape_writer::append_u64(uint64_t value) noexcept {
7604   append(0, internal::tape_type::UINT64);
7605   *next_tape_loc = value;
7606   next_tape_loc++;
7607 }
7608 
7609 /** Write a double value to tape. */
append_double(double value)7610 simdjson_really_inline void tape_writer::append_double(double value) noexcept {
7611   append2(0, value, internal::tape_type::DOUBLE);
7612 }
7613 
skip()7614 simdjson_really_inline void tape_writer::skip() noexcept {
7615   next_tape_loc++;
7616 }
7617 
skip_large_integer()7618 simdjson_really_inline void tape_writer::skip_large_integer() noexcept {
7619   next_tape_loc += 2;
7620 }
7621 
skip_double()7622 simdjson_really_inline void tape_writer::skip_double() noexcept {
7623   next_tape_loc += 2;
7624 }
7625 
append(uint64_t val,internal::tape_type t)7626 simdjson_really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
7627   *next_tape_loc = val | ((uint64_t(char(t))) << 56);
7628   next_tape_loc++;
7629 }
7630 
7631 template<typename T>
append2(uint64_t val,T val2,internal::tape_type t)7632 simdjson_really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
7633   append(val, t);
7634   static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
7635   memcpy(next_tape_loc, &val2, sizeof(val2));
7636   next_tape_loc++;
7637 }
7638 
write(uint64_t & tape_loc,uint64_t val,internal::tape_type t)7639 simdjson_really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
7640   tape_loc = val | ((uint64_t(char(t))) << 56);
7641 }
7642 
7643 } // namespace stage2
7644 } // unnamed namespace
7645 } // namespace haswell
7646 } // namespace simdjson
7647 /* end file src/generic/stage2/tape_writer.h */
7648 
7649 namespace simdjson {
7650 namespace haswell {
7651 namespace {
7652 namespace stage2 {
7653 
/**
 * Visitor handed to json_iterator::walk_document() by parse_document(). It holds the tape write
 * cursor and the next write location in the document's string buffer.
 */
struct tape_builder {
  /**
   * Point the parser at doc, then walk the document with a fresh tape_builder as the visitor.
   *
   * When STREAMING is true the walk starts at dom_parser.next_structural_index, otherwise at 0.
   */
  template<bool STREAMING>
  simdjson_warn_unused static simdjson_really_inline error_code parse_document(
    dom_parser_implementation &dom_parser,
    dom::document &doc) noexcept;

  /** Called when a non-empty document starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
  /** Called when a non-empty document ends without error. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;

  /** Called when a non-empty array starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
  /** Called when a non-empty array ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
  /** Called when an empty array is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;

  /** Called when a non-empty object starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
  /**
   * Called when a key in a field is encountered.
   *
   * visit_primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
   * will be called after this with the field value.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
  /** Called when a non-empty object ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
  /** Called when an empty object is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;

  /**
   * Called when a string, number, boolean or null is found.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
  /**
   * Called when a string, number, boolean or null is found at the top level of a document (i.e.
   * when there is no array or object and the entire document is a single string, number, boolean or
   * null).
   *
   * This is separate from visit_primitive() because simdjson's normal primitive parsing routines assume
   * there is at least one more token after the value, which is only true in an array or object.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;

  /** Per-type callbacks for values inside an array or object; visit_key() calls visit_string() with key = true. */
  simdjson_warn_unused simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /** Per-type callbacks for a value that is the whole document. */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /** Called each time a new field or element in an array or object is found. */
  simdjson_warn_unused simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;

  /** Next location to write to tape */
  tape_writer tape;
private:
  /** Next write location in the string buf for stage 2 parsing */
  uint8_t *current_string_buf_loc;

  /** Start writing at the beginning of doc's tape and string buffer. */
  simdjson_really_inline tape_builder(dom::document &doc) noexcept;

  simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
  simdjson_really_inline void start_container(json_iterator &iter) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
  simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
}; // struct tape_builder
7730 
7731 template<bool STREAMING>
parse_document(dom_parser_implementation & dom_parser,dom::document & doc)7732 simdjson_warn_unused simdjson_really_inline error_code tape_builder::parse_document(
7733     dom_parser_implementation &dom_parser,
7734     dom::document &doc) noexcept {
7735   dom_parser.doc = &doc;
7736   json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
7737   tape_builder builder(doc);
7738   return iter.walk_document<STREAMING>(builder);
7739 }
7740 
visit_root_primitive(json_iterator & iter,const uint8_t * value)7741 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
7742   return iter.visit_root_primitive(*this, value);
7743 }
visit_primitive(json_iterator & iter,const uint8_t * value)7744 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
7745   return iter.visit_primitive(*this, value);
7746 }
visit_empty_object(json_iterator & iter)7747 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
7748   return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
7749 }
visit_empty_array(json_iterator & iter)7750 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
7751   return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
7752 }
7753 
visit_document_start(json_iterator & iter)7754 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
7755   start_container(iter);
7756   return SUCCESS;
7757 }
visit_object_start(json_iterator & iter)7758 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
7759   start_container(iter);
7760   return SUCCESS;
7761 }
visit_array_start(json_iterator & iter)7762 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
7763   start_container(iter);
7764   return SUCCESS;
7765 }
7766 
visit_object_end(json_iterator & iter)7767 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
7768   return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
7769 }
visit_array_end(json_iterator & iter)7770 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
7771   return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
7772 }
visit_document_end(json_iterator & iter)7773 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
7774   constexpr uint32_t start_tape_index = 0;
7775   tape.append(start_tape_index, internal::tape_type::ROOT);
7776   tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
7777   return SUCCESS;
7778 }
visit_key(json_iterator & iter,const uint8_t * key)7779 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
7780   return visit_string(iter, key, true);
7781 }
7782 
increment_count(json_iterator & iter)7783 simdjson_warn_unused simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
7784   iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
7785   return SUCCESS;
7786 }
7787 
tape_builder(dom::document & doc)7788 simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
7789 
visit_string(json_iterator & iter,const uint8_t * value,bool key)7790 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
7791   iter.log_value(key ? "key" : "string");
7792   uint8_t *dst = on_start_string(iter);
7793   dst = stringparsing::parse_string(value+1, dst);
7794   if (dst == nullptr) {
7795     iter.log_error("Invalid escape in string");
7796     return STRING_ERROR;
7797   }
7798   on_end_string(dst);
7799   return SUCCESS;
7800 }
7801 
visit_root_string(json_iterator & iter,const uint8_t * value)7802 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
7803   return visit_string(iter, value);
7804 }
7805 
visit_number(json_iterator & iter,const uint8_t * value)7806 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
7807   iter.log_value("number");
7808   return numberparsing::parse_number(value, tape);
7809 }
7810 
visit_root_number(json_iterator & iter,const uint8_t * value)7811 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
7812   //
7813   // We need to make a copy to make sure that the string is space terminated.
7814   // This is not about padding the input, which should already padded up
7815   // to len + SIMDJSON_PADDING. However, we have no control at this stage
7816   // on how the padding was done. What if the input string was padded with nulls?
7817   // It is quite common for an input string to have an extra null character (C string).
7818   // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
7819   // document, but the string "9\0" by itself is fine. So we make a copy and
7820   // pad the input with spaces when we know that there is just one input element.
7821   // This copy is relatively expensive, but it will almost never be called in
7822   // practice unless you are in the strange scenario where you have many JSON
7823   // documents made of single atoms.
7824   //
7825   std::unique_ptr<uint8_t[]>copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
7826   if (copy.get() == nullptr) { return MEMALLOC; }
7827   std::memcpy(copy.get(), value, iter.remaining_len());
7828   std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
7829   error_code error = visit_number(iter, copy.get());
7830   return error;
7831 }
7832 
visit_true_atom(json_iterator & iter,const uint8_t * value)7833 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
7834   iter.log_value("true");
7835   if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
7836   tape.append(0, internal::tape_type::TRUE_VALUE);
7837   return SUCCESS;
7838 }
7839 
visit_root_true_atom(json_iterator & iter,const uint8_t * value)7840 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
7841   iter.log_value("true");
7842   if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
7843   tape.append(0, internal::tape_type::TRUE_VALUE);
7844   return SUCCESS;
7845 }
7846 
visit_false_atom(json_iterator & iter,const uint8_t * value)7847 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
7848   iter.log_value("false");
7849   if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
7850   tape.append(0, internal::tape_type::FALSE_VALUE);
7851   return SUCCESS;
7852 }
7853 
visit_root_false_atom(json_iterator & iter,const uint8_t * value)7854 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
7855   iter.log_value("false");
7856   if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
7857   tape.append(0, internal::tape_type::FALSE_VALUE);
7858   return SUCCESS;
7859 }
7860 
visit_null_atom(json_iterator & iter,const uint8_t * value)7861 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
7862   iter.log_value("null");
7863   if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
7864   tape.append(0, internal::tape_type::NULL_VALUE);
7865   return SUCCESS;
7866 }
7867 
visit_root_null_atom(json_iterator & iter,const uint8_t * value)7868 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
7869   iter.log_value("null");
7870   if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
7871   tape.append(0, internal::tape_type::NULL_VALUE);
7872   return SUCCESS;
7873 }
7874 
7875 // private:
7876 
next_tape_index(json_iterator & iter) const7877 simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
7878   return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
7879 }
7880 
empty_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)7881 simdjson_warn_unused simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
7882   auto start_index = next_tape_index(iter);
7883   tape.append(start_index+2, start);
7884   tape.append(start_index, end);
7885   return SUCCESS;
7886 }
7887 
start_container(json_iterator & iter)7888 simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
7889   iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
7890   iter.dom_parser.open_containers[iter.depth].count = 0;
7891   tape.skip(); // We don't actually *write* the start element until the end.
7892 }
7893 
end_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)7894 simdjson_warn_unused simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
7895   // Write the ending tape element, pointing at the start location
7896   const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
7897   tape.append(start_tape_index, end);
7898   // Write the start tape element, pointing at the end location (and including count)
7899   // count can overflow if it exceeds 24 bits... so we saturate
7900   // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
7901   const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
7902   const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
7903   tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
7904   return SUCCESS;
7905 }
7906 
on_start_string(json_iterator & iter)7907 simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
7908   // we advance the point, accounting for the fact that we have a NULL termination
7909   tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
7910   return current_string_buf_loc + sizeof(uint32_t);
7911 }
7912 
on_end_string(uint8_t * dst)7913 simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
7914   uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
7915   // TODO check for overflow in case someone has a crazy string (>=4GB?)
7916   // But only add the overflow check when the document itself exceeds 4GB
7917   // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
7918   memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
7919   // NULL termination is still handy if you expect all your strings to
7920   // be NULL terminated? It comes at a small cost
7921   *dst = 0;
7922   current_string_buf_loc = dst + 1;
7923 }
7924 
7925 } // namespace stage2
7926 } // unnamed namespace
7927 } // namespace haswell
7928 } // namespace simdjson
7929 /* end file src/generic/stage2/tape_builder.h */
7930 
7931 //
7932 // Implementation-specific overrides
7933 //
7934 namespace simdjson {
7935 namespace haswell {
7936 namespace {
7937 namespace stage1 {
7938 
find_escaped(uint64_t backslash)7939 simdjson_really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
7940   if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
7941   return find_escaped_branchless(backslash);
7942 }
7943 
7944 } // namespace stage1
7945 } // unnamed namespace
7946 
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const7947 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
7948   return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
7949 }
7950 
stage1(const uint8_t * _buf,size_t _len,bool streaming)7951 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
7952   this->buf = _buf;
7953   this->len = _len;
7954   return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
7955 }
7956 
validate_utf8(const char * buf,size_t len) const7957 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
7958   return haswell::stage1::generic_validate_utf8(buf,len);
7959 }
7960 
stage2(dom::document & _doc)7961 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
7962   return stage2::tape_builder::parse_document<false>(*this, _doc);
7963 }
7964 
stage2_next(dom::document & _doc)7965 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
7966   return stage2::tape_builder::parse_document<true>(*this, _doc);
7967 }
7968 
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)7969 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
7970   auto error = stage1(_buf, _len, false);
7971   if (error) { return error; }
7972   return stage2(_doc);
7973 }
7974 
7975 } // namespace haswell
7976 } // namespace simdjson
7977 
7978 /* begin file include/simdjson/haswell/end.h */
7979 SIMDJSON_UNTARGET_HASWELL
7980 /* end file include/simdjson/haswell/end.h */
7981 /* end file src/haswell/dom_parser_implementation.cpp */
7982 #endif
7983 #if SIMDJSON_IMPLEMENTATION_PPC64
7984 /* begin file src/ppc64/implementation.cpp */
7985 /* begin file include/simdjson/ppc64/begin.h */
7986 // redefining SIMDJSON_IMPLEMENTATION to "ppc64"
7987 // #define SIMDJSON_IMPLEMENTATION ppc64
7988 /* end file include/simdjson/ppc64/begin.h */
7989 
7990 namespace simdjson {
7991 namespace ppc64 {
7992 
create_dom_parser_implementation(size_t capacity,size_t max_depth,std::unique_ptr<internal::dom_parser_implementation> & dst) const7993 simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
7994   size_t capacity,
7995   size_t max_depth,
7996   std::unique_ptr<internal::dom_parser_implementation>& dst
7997 ) const noexcept {
7998   dst.reset( new (std::nothrow) dom_parser_implementation() );
7999   if (!dst) { return MEMALLOC; }
8000   dst->set_capacity(capacity);
8001   dst->set_max_depth(max_depth);
8002   return SUCCESS;
8003 }
8004 
8005 } // namespace ppc64
8006 } // namespace simdjson
8007 
8008 /* begin file include/simdjson/ppc64/end.h */
8009 /* end file include/simdjson/ppc64/end.h */
8010 /* end file src/ppc64/implementation.cpp */
8011 /* begin file src/ppc64/dom_parser_implementation.cpp */
8012 /* begin file include/simdjson/ppc64/begin.h */
8013 // redefining SIMDJSON_IMPLEMENTATION to "ppc64"
8014 // #define SIMDJSON_IMPLEMENTATION ppc64
8015 /* end file include/simdjson/ppc64/begin.h */
8016 
8017 //
8018 // Stage 1
8019 //
8020 namespace simdjson {
8021 namespace ppc64 {
8022 namespace {
8023 
8024 using namespace simd;
8025 
8026 struct json_character_block {
8027   static simdjson_really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
8028 
whitespacesimdjson::ppc64::__anon9bb6be6f2511::json_character_block8029   simdjson_really_inline uint64_t whitespace() const noexcept { return _whitespace; }
opsimdjson::ppc64::__anon9bb6be6f2511::json_character_block8030   simdjson_really_inline uint64_t op() const noexcept { return _op; }
scalarsimdjson::ppc64::__anon9bb6be6f2511::json_character_block8031   simdjson_really_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
8032 
8033   uint64_t _whitespace;
8034   uint64_t _op;
8035 };
8036 
classify(const simd::simd8x64<uint8_t> & in)8037 simdjson_really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
8038   const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
8039   const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
8040 
8041   simd8x64<uint8_t> v(
8042      (in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
8043      (in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
8044      (in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
8045      (in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2)
8046   );
8047 
8048   uint64_t op = simd8x64<bool>(
8049         v.chunks[0].any_bits_set(0x7),
8050         v.chunks[1].any_bits_set(0x7),
8051         v.chunks[2].any_bits_set(0x7),
8052         v.chunks[3].any_bits_set(0x7)
8053   ).to_bitmask();
8054 
8055   uint64_t whitespace = simd8x64<bool>(
8056         v.chunks[0].any_bits_set(0x18),
8057         v.chunks[1].any_bits_set(0x18),
8058         v.chunks[2].any_bits_set(0x18),
8059         v.chunks[3].any_bits_set(0x18)
8060   ).to_bitmask();
8061 
8062   return { whitespace, op };
8063 }
8064 
is_ascii(const simd8x64<uint8_t> & input)8065 simdjson_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
8066   // careful: 0x80 is not ascii.
8067   return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
8068 }
8069 
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)8070 simdjson_unused simdjson_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
8071   simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
8072   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
8073   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
8074   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
8075   return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
8076 }
8077 
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)8078 simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
8079   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
8080   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
8081   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
8082   return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
8083 }
8084 
8085 } // unnamed namespace
8086 } // namespace ppc64
8087 } // namespace simdjson
8088 
8089 /* begin file src/generic/stage1/utf8_lookup4_algorithm.h */
8090 namespace simdjson {
8091 namespace ppc64 {
8092 namespace {
8093 namespace utf8_validation {
8094 
8095 using namespace simd;
8096 
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)8097   simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
8098 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
8099 // Bit 1 = Too Long (ASCII followed by continuation)
8100 // Bit 2 = Overlong 3-byte
8101 // Bit 4 = Surrogate
8102 // Bit 5 = Overlong 2-byte
8103 // Bit 7 = Two Continuations
8104     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
8105                                                 // 11______ 11______
8106     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
8107     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
8108     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
8109     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
8110     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
8111     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
8112                                                 // 11110100 101_____
8113                                                 // 11110101 1001____
8114                                                 // 11110101 101_____
8115                                                 // 1111011_ 1001____
8116                                                 // 1111011_ 101_____
8117                                                 // 11111___ 1001____
8118                                                 // 11111___ 101_____
8119     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
8120                                                 // 11110101 1000____
8121                                                 // 1111011_ 1000____
8122                                                 // 11111___ 1000____
8123     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
8124 
8125     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
8126       // 0_______ ________ <ASCII in byte 1>
8127       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
8128       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
8129       // 10______ ________ <continuation in byte 1>
8130       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
8131       // 1100____ ________ <two byte lead in byte 1>
8132       TOO_SHORT | OVERLONG_2,
8133       // 1101____ ________ <two byte lead in byte 1>
8134       TOO_SHORT,
8135       // 1110____ ________ <three byte lead in byte 1>
8136       TOO_SHORT | OVERLONG_3 | SURROGATE,
8137       // 1111____ ________ <four+ byte lead in byte 1>
8138       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
8139     );
8140     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
8141     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
8142       // ____0000 ________
8143       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
8144       // ____0001 ________
8145       CARRY | OVERLONG_2,
8146       // ____001_ ________
8147       CARRY,
8148       CARRY,
8149 
8150       // ____0100 ________
8151       CARRY | TOO_LARGE,
8152       // ____0101 ________
8153       CARRY | TOO_LARGE | TOO_LARGE_1000,
8154       // ____011_ ________
8155       CARRY | TOO_LARGE | TOO_LARGE_1000,
8156       CARRY | TOO_LARGE | TOO_LARGE_1000,
8157 
8158       // ____1___ ________
8159       CARRY | TOO_LARGE | TOO_LARGE_1000,
8160       CARRY | TOO_LARGE | TOO_LARGE_1000,
8161       CARRY | TOO_LARGE | TOO_LARGE_1000,
8162       CARRY | TOO_LARGE | TOO_LARGE_1000,
8163       CARRY | TOO_LARGE | TOO_LARGE_1000,
8164       // ____1101 ________
8165       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
8166       CARRY | TOO_LARGE | TOO_LARGE_1000,
8167       CARRY | TOO_LARGE | TOO_LARGE_1000
8168     );
8169     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
8170       // ________ 0_______ <ASCII in byte 2>
8171       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
8172       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
8173 
8174       // ________ 1000____
8175       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
8176       // ________ 1001____
8177       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
8178       // ________ 101_____
8179       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
8180       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
8181 
8182       // ________ 11______
8183       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
8184     );
8185     return (byte_1_high & byte_1_low & byte_2_high);
8186   }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)8187   simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
8188       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
8189     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
8190     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
8191     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
8192     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
8193     return must23_80 ^ sc;
8194   }
8195 
8196   //
8197   // Return nonzero if there are incomplete multibyte characters at the end of the block:
8198   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
8199   //
is_incomplete(const simd8<uint8_t> input)8200   simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
8201     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
8202     // ... 1111____ 111_____ 11______
8203     static const uint8_t max_array[32] = {
8204       255, 255, 255, 255, 255, 255, 255, 255,
8205       255, 255, 255, 255, 255, 255, 255, 255,
8206       255, 255, 255, 255, 255, 255, 255, 255,
8207       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
8208     };
8209     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
8210     return input.gt_bits(max_value);
8211   }
8212 
8213   struct utf8_checker {
8214     // If this is nonzero, there has been a UTF-8 error.
8215     simd8<uint8_t> error;
8216     // The last input we received
8217     simd8<uint8_t> prev_input_block;
8218     // Whether the last input we received was incomplete (used for ASCII fast path)
8219     simd8<uint8_t> prev_incomplete;
8220 
8221     //
8222     // Check whether the current bytes are valid UTF-8.
8223     //
check_utf8_bytessimdjson::ppc64::__anon9bb6be6f2611::utf8_validation::utf8_checker8224     simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
8225       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
8226       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
8227       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
8228       simd8<uint8_t> sc = check_special_cases(input, prev1);
8229       this->error |= check_multibyte_lengths(input, prev_input, sc);
8230     }
8231 
8232     // The only problem that can happen at EOF is that a multibyte character is too short
8233     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
8234     // too large in the first of two bytes.
check_eofsimdjson::ppc64::__anon9bb6be6f2611::utf8_validation::utf8_checker8235     simdjson_really_inline void check_eof() {
8236       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
8237       // possibly finish them.
8238       this->error |= this->prev_incomplete;
8239     }
8240 
check_next_inputsimdjson::ppc64::__anon9bb6be6f2611::utf8_validation::utf8_checker8241     simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
8242       if(simdjson_likely(is_ascii(input))) {
8243         this->error |= this->prev_incomplete;
8244       } else {
8245         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
8246         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
8247             "We support either two or four chunks per 64-byte block.");
8248         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
8249           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
8250           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
8251         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
8252           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
8253           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
8254           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
8255           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
8256         }
8257         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
8258         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
8259 
8260       }
8261     }
8262     // do not forget to call check_eof!
errorssimdjson::ppc64::__anon9bb6be6f2611::utf8_validation::utf8_checker8263     simdjson_really_inline error_code errors() {
8264       return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
8265     }
8266 
8267   }; // struct utf8_checker
8268 } // namespace utf8_validation
8269 
8270 using utf8_validation::utf8_checker;
8271 
8272 } // unnamed namespace
8273 } // namespace ppc64
8274 } // namespace simdjson
8275 /* end file src/generic/stage1/utf8_lookup4_algorithm.h */
8276 /* begin file src/generic/stage1/json_structural_indexer.h */
8277 // This file contains the common code every implementation uses in stage1
8278 // It is intended to be included multiple times and compiled multiple times
8279 // We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
8281 
8282 /* begin file src/generic/stage1/buf_block_reader.h */
8283 namespace simdjson {
8284 namespace ppc64 {
8285 namespace {
8286 
8287 // Walks through a buffer in block-sized increments, loading the last part with spaces
8288 template<size_t STEP_SIZE>
8289 struct buf_block_reader {
8290 public:
8291   simdjson_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
8292   simdjson_really_inline size_t block_index();
8293   simdjson_really_inline bool has_full_block() const;
8294   simdjson_really_inline const uint8_t *full_block() const;
8295   /**
8296    * Get the last block, padded with spaces.
8297    *
8298    * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
8299    * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
8300    * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
8301    *
8302    * @return the number of effective characters in the last block.
8303    */
8304   simdjson_really_inline size_t get_remainder(uint8_t *dst) const;
8305   simdjson_really_inline void advance();
8306 private:
8307   const uint8_t *buf;
8308   const size_t len;
8309   const size_t lenminusstep;
8310   size_t idx;
8311 };
8312 
8313 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)8314 simdjson_unused static char * format_input_text_64(const uint8_t *text) {
8315   static char buf[sizeof(simd8x64<uint8_t>) + 1];
8316   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
8317     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
8318   }
8319   buf[sizeof(simd8x64<uint8_t>)] = '\0';
8320   return buf;
8321 }
8322 
8323 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)8324 simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
8325   static char buf[sizeof(simd8x64<uint8_t>) + 1];
8326   in.store(reinterpret_cast<uint8_t*>(buf));
8327   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
8328     if (buf[i] < ' ') { buf[i] = '_'; }
8329   }
8330   buf[sizeof(simd8x64<uint8_t>)] = '\0';
8331   return buf;
8332 }
8333 
format_mask(uint64_t mask)8334 simdjson_unused static char * format_mask(uint64_t mask) {
8335   static char buf[sizeof(simd8x64<uint8_t>) + 1];
8336   for (size_t i=0; i<64; i++) {
8337     buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
8338   }
8339   buf[64] = '\0';
8340   return buf;
8341 }
8342 
8343 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)8344 simdjson_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
8345 
8346 template<size_t STEP_SIZE>
block_index()8347 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
8348 
8349 template<size_t STEP_SIZE>
has_full_block() const8350 simdjson_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
8351   return idx < lenminusstep;
8352 }
8353 
8354 template<size_t STEP_SIZE>
full_block() const8355 simdjson_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
8356   return &buf[idx];
8357 }
8358 
8359 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const8360 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
8361   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
8362   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
8363   std::memcpy(dst, buf + idx, len - idx);
8364   return len - idx;
8365 }
8366 
8367 template<size_t STEP_SIZE>
advance()8368 simdjson_really_inline void buf_block_reader<STEP_SIZE>::advance() {
8369   idx += STEP_SIZE;
8370 }
8371 
8372 } // unnamed namespace
8373 } // namespace ppc64
8374 } // namespace simdjson
8375 /* end file src/generic/stage1/buf_block_reader.h */
8376 /* begin file src/generic/stage1/json_string_scanner.h */
8377 namespace simdjson {
8378 namespace ppc64 {
8379 namespace {
8380 namespace stage1 {
8381 
8382 struct json_string_block {
8383   // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_string_blocksimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8384   simdjson_really_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) :
8385   _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {}
8386 
8387   // Escaped characters (characters following an escape() character)
escapedsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8388   simdjson_really_inline uint64_t escaped() const { return _escaped; }
8389   // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
escapesimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8390   simdjson_really_inline uint64_t escape() const { return _backslash & ~_escaped; }
8391   // Real (non-backslashed) quotes
quotesimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8392   simdjson_really_inline uint64_t quote() const { return _quote; }
8393   // Start quotes of strings
string_startsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8394   simdjson_really_inline uint64_t string_start() const { return _quote & _in_string; }
8395   // End quotes of strings
string_endsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8396   simdjson_really_inline uint64_t string_end() const { return _quote & ~_in_string; }
8397   // Only characters inside the string (not including the quotes)
string_contentsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8398   simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
8399   // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_inside_stringsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8400   simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
8401   // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_outside_stringsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8402   simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
8403   // Tail of string (everything except the start quote)
string_tailsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8404   simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
8405 
8406   // backslash characters
8407   uint64_t _backslash;
8408   // escaped characters (backslashed--does not include the hex characters after \u)
8409   uint64_t _escaped;
8410   // real quotes (non-backslashed ones)
8411   uint64_t _quote;
8412   // string characters (includes start quote but not end quote)
8413   uint64_t _in_string;
8414 };
8415 
8416 // Scans blocks for string characters, storing the state necessary to do so
8417 class json_string_scanner {
8418 public:
8419   simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
8420   // Returns either UNCLOSED_STRING or SUCCESS
8421   simdjson_really_inline error_code finish();
8422 
8423 private:
8424   // Intended to be defined by the implementation
8425   simdjson_really_inline uint64_t find_escaped(uint64_t escape);
8426   simdjson_really_inline uint64_t find_escaped_branchless(uint64_t escape);
8427 
8428   // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
8429   uint64_t prev_in_string = 0ULL;
8430   // Whether the first character of the next iteration is escaped.
8431   uint64_t prev_escaped = 0ULL;
8432 };
8433 
8434 //
8435 // Finds escaped characters (characters following \).
8436 //
8437 // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
8438 //
8439 // Does this by:
8440 // - Shift the escape mask to get potentially escaped characters (characters after backslashes).
8441 // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
8442 // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
8443 //
8444 // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
8445 // escape sequences, filters out the ones that start on even bits, and adds that to the mask of
8446 // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
8447 // the start bit causes a carry), and leaves even-bit sequences alone.
8448 //
8449 // Example:
8450 //
8451 // text           |  \\\ | \\\"\\\" \\\" \\"\\" |
8452 // escape         |  xxx |  xx xxx  xxx  xx xx  | Removed overflow backslash; will | it into follows_escape
8453 // odd_starts     |  x   |  x       x       x   | escape & ~even_bits & ~follows_escape
8454 // even_seq       |     c|    cxxx     c xx   c | c = carry bit -- will be masked out later
8455 // invert_mask    |      |     cxxx     c xx   c| even_seq << 1
8456 // follows_escape |   xx | x xx xxx  xxx  xx xx | Includes overflow bit
8457 // escaped        |   x  | x x  x x  x x  x  x  |
8458 // desired        |   x  | x x  x x  x x  x  x  |
8459 // text           |  \\\ | \\\"\\\" \\\" \\"\\" |
8460 //
find_escaped_branchless(uint64_t backslash)8461 simdjson_really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) {
8462   // If there was overflow, pretend the first character isn't a backslash
8463   backslash &= ~prev_escaped;
8464   uint64_t follows_escape = backslash << 1 | prev_escaped;
8465 
8466   // Get sequences starting on even bits by clearing out the odd series using +
8467   const uint64_t even_bits = 0x5555555555555555ULL;
8468   uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
8469   uint64_t sequences_starting_on_even_bits;
8470   prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
8471   uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
8472 
8473   // Mask every other backslashed character as an escaped character
8474   // Flip the mask for sequences that start on even bits, to correct them
8475   return (even_bits ^ invert_mask) & follows_escape;
8476 }
8477 
8478 //
8479 // Return a mask of all string characters plus end quotes.
8480 //
8481 // prev_escaped is overflow saying whether the next character is escaped.
8482 // prev_in_string is overflow saying whether we're still in a string.
8483 //
8484 // Backslash sequences outside of quotes will be detected in stage 2.
8485 //
next(const simd::simd8x64<uint8_t> & in)8486 simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
8487   const uint64_t backslash = in.eq('\\');
8488   const uint64_t escaped = find_escaped(backslash);
8489   const uint64_t quote = in.eq('"') & ~escaped;
8490 
8491   //
8492   // prefix_xor flips on bits inside the string (and flips off the end quote).
8493   //
8494   // Then we xor with prev_in_string: if we were in a string already, its effect is flipped
8495   // (characters inside strings are outside, and characters outside strings are inside).
8496   //
8497   const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
8498 
8499   //
8500   // Check if we're still in a string at the end of the box so the next block will know
8501   //
8502   // right shift of a signed value expected to be well-defined and standard
8503   // compliant as of C++20, John Regher from Utah U. says this is fine code
8504   //
8505   prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
8506 
8507   // Use ^ to turn the beginning quote off, and the end quote on.
8508 
8509   // We are returning a function-local object so either we get a move constructor
8510   // or we get copy elision.
8511   return json_string_block(
8512     backslash,
8513     escaped,
8514     quote,
8515     in_string
8516   );
8517 }
8518 
finish()8519 simdjson_really_inline error_code json_string_scanner::finish() {
8520   if (prev_in_string) {
8521     return UNCLOSED_STRING;
8522   }
8523   return SUCCESS;
8524 }
8525 
8526 } // namespace stage1
8527 } // unnamed namespace
8528 } // namespace ppc64
8529 } // namespace simdjson
8530 /* end file src/generic/stage1/json_string_scanner.h */
8531 /* begin file src/generic/stage1/json_scanner.h */
8532 namespace simdjson {
8533 namespace ppc64 {
8534 namespace {
8535 namespace stage1 {
8536 
8537 /**
8538  * A block of scanned json, with information on operators and scalars.
8539  *
8540  * We seek to identify pseudo-structural characters. Anything that is inside
8541  * a string must be omitted (hence  & ~_string.string_tail()).
8542  * Otherwise, pseudo-structural characters come in two forms.
8543  * 1. We have the structural characters ([,],{,},:, comma). The
8544  *    term 'structural character' is from the JSON RFC.
8545  * 2. We have the 'scalar pseudo-structural characters'.
8546  *    Scalars are quotes, and any character except structural characters and white space.
8547  *
8548  * To identify the scalar pseudo-structural characters, we must look at what comes
8549  * before them: it must be a space, a quote or a structural characters.
8550  * Starting with simdjson v0.3, we identify them by
8551  * negation: we identify everything that is followed by a non-quote scalar,
8552  * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
8553  */
8554 struct json_block {
8555 public:
8556   // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_blocksimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8557   simdjson_really_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
8558   _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
json_blocksimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8559   simdjson_really_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
8560   _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
8561 
8562   /**
8563    * The start of structurals.
8564    * In simdjson prior to v0.3, these were called the pseudo-structural characters.
8565    **/
structural_startsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8566   simdjson_really_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
8567   /** All JSON whitespace (i.e. not in a string) */
whitespacesimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8568   simdjson_really_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
8569 
8570   // Helpers
8571 
8572   /** Whether the given characters are inside a string (only works on non-quotes) */
non_quote_inside_stringsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8573   simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
8574   /** Whether the given characters are outside a string (only works on non-quotes) */
non_quote_outside_stringsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8575   simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
8576 
8577   // string and escape characters
8578   json_string_block _string;
8579   // whitespace, structural characters ('operators'), scalars
8580   json_character_block _characters;
8581   // whether the previous character was a scalar
8582   uint64_t _follows_potential_nonquote_scalar;
8583 private:
8584   // Potential structurals (i.e. disregarding strings)
8585 
8586   /**
8587    * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
8588    * They may reside inside a string.
8589    **/
potential_structural_startsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8590   simdjson_really_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
8591   /**
8592    * The start of non-operator runs, like 123, true and "abc".
8593    * It main reside inside a string.
8594    **/
potential_scalar_startsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8595   simdjson_really_inline uint64_t potential_scalar_start() const noexcept {
8596     // The term "scalar" refers to anything except structural characters and white space
8597     // (so letters, numbers, quotes).
8598     // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
8599     // then we know that it is irrelevant structurally.
8600     return _characters.scalar() & ~follows_potential_scalar();
8601   }
8602   /**
8603    * Whether the given character is immediately after a non-operator like 123, true.
8604    * The characters following a quote are not included.
8605    */
follows_potential_scalarsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8606   simdjson_really_inline uint64_t follows_potential_scalar() const noexcept {
8607     // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
8608     // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
8609     // white space.
8610     // It is understood that within quoted region, anything at all could be marked (irrelevant).
8611     return _follows_potential_nonquote_scalar;
8612   }
8613 };
8614 
8615 /**
8616  * Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
8617  *
8618  * The scanner starts by calculating two distinct things:
8619  * - string characters (taking \" into account)
8620  * - structural characters or 'operators' ([]{},:, comma)
8621  *   and scalars (runs of non-operators like 123, true and "abc")
8622  *
8623  * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
8624  * in particular, the operator/scalar bit will find plenty of things that are actually part of
8625  * strings. When we're done, json_block will fuse the two together by masking out tokens that are
8626  * part of a string.
8627  */
8628 class json_scanner {
8629 public:
json_scanner()8630   json_scanner() {}
8631   simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
8632   // Returns either UNCLOSED_STRING or SUCCESS
8633   simdjson_really_inline error_code finish();
8634 
8635 private:
8636   // Whether the last character of the previous iteration is part of a scalar token
8637   // (anything except whitespace or a structural character/'operator').
8638   uint64_t prev_scalar = 0ULL;
8639   json_string_scanner string_scanner{};
8640 };
8641 
8642 
8643 //
8644 // Check if the current character immediately follows a matching character.
8645 //
8646 // For example, this checks for quotes with backslashes in front of them:
8647 //
8648 //     const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
8649 //
follows(const uint64_t match,uint64_t & overflow)8650 simdjson_really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
8651   const uint64_t result = match << 1 | overflow;
8652   overflow = match >> 63;
8653   return result;
8654 }
8655 
next(const simd::simd8x64<uint8_t> & in)8656 simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
8657   json_string_block strings = string_scanner.next(in);
8658   // identifies the white-space and the structurat characters
8659   json_character_block characters = json_character_block::classify(in);
8660   // The term "scalar" refers to anything except structural characters and white space
8661   // (so letters, numbers, quotes).
8662   // We want  follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
8663   //
8664   // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
8665   // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
8666   // pseudo-structural character just like we would if we had  ' "a string" true '; otherwise we
8667   // may need to add an extra check when parsing strings.
8668   //
8669   // Performance: there are many ways to skin this cat.
8670   const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote();
8671   uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar);
8672   // We are returning a function-local object so either we get a move constructor
8673   // or we get copy elision.
8674   return json_block(
8675     strings,// strings is a function-local object so either it moves or the copy is elided.
8676     characters,
8677     follows_nonquote_scalar
8678   );
8679 }
8680 
finish()8681 simdjson_really_inline error_code json_scanner::finish() {
8682   return string_scanner.finish();
8683 }
8684 
8685 } // namespace stage1
8686 } // unnamed namespace
8687 } // namespace ppc64
8688 } // namespace simdjson
8689 /* end file src/generic/stage1/json_scanner.h */
8690 /* begin file src/generic/stage1/json_minifier.h */
8691 // This file contains the common code every implementation uses in stage1
8692 // It is intended to be included multiple times and compiled multiple times
8693 // We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
8695 
8696 namespace simdjson {
8697 namespace ppc64 {
8698 namespace {
8699 namespace stage1 {
8700 
8701 class json_minifier {
8702 public:
8703   template<size_t STEP_SIZE>
8704   static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
8705 
8706 private:
json_minifier(uint8_t * _dst)8707   simdjson_really_inline json_minifier(uint8_t *_dst)
8708   : dst{_dst}
8709   {}
8710   template<size_t STEP_SIZE>
8711   simdjson_really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
8712   simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
8713   simdjson_really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
8714   json_scanner scanner{};
8715   uint8_t *dst;
8716 };
8717 
next(const simd::simd8x64<uint8_t> & in,const json_block & block)8718 simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
8719   uint64_t mask = block.whitespace();
8720   in.compress(mask, dst);
8721   dst += 64 - count_ones(mask);
8722 }
8723 
finish(uint8_t * dst_start,size_t & dst_len)8724 simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
8725   error_code error = scanner.finish();
8726   if (error) { dst_len = 0; return error; }
8727   dst_len = dst - dst_start;
8728   return SUCCESS;
8729 }
8730 
8731 template<>
step(const uint8_t * block_buf,buf_block_reader<128> & reader)8732 simdjson_really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
8733   simd::simd8x64<uint8_t> in_1(block_buf);
8734   simd::simd8x64<uint8_t> in_2(block_buf+64);
8735   json_block block_1 = scanner.next(in_1);
8736   json_block block_2 = scanner.next(in_2);
8737   this->next(in_1, block_1);
8738   this->next(in_2, block_2);
8739   reader.advance();
8740 }
8741 
8742 template<>
step(const uint8_t * block_buf,buf_block_reader<64> & reader)8743 simdjson_really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
8744   simd::simd8x64<uint8_t> in_1(block_buf);
8745   json_block block_1 = scanner.next(in_1);
8746   this->next(block_buf, block_1);
8747   reader.advance();
8748 }
8749 
8750 template<size_t STEP_SIZE>
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len)8751 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
8752   buf_block_reader<STEP_SIZE> reader(buf, len);
8753   json_minifier minifier(dst);
8754 
8755   // Index the first n-1 blocks
8756   while (reader.has_full_block()) {
8757     minifier.step<STEP_SIZE>(reader.full_block(), reader);
8758   }
8759 
8760   // Index the last (remainder) block, padded with spaces
8761   uint8_t block[STEP_SIZE];
8762   size_t remaining_bytes = reader.get_remainder(block);
8763   if (remaining_bytes > 0) {
8764     // We do not want to write directly to the output stream. Rather, we write
8765     // to a local buffer (for safety).
8766     uint8_t out_block[STEP_SIZE];
8767     uint8_t * const guarded_dst{minifier.dst};
8768     minifier.dst = out_block;
8769     minifier.step<STEP_SIZE>(block, reader);
8770     size_t to_write = minifier.dst - out_block;
8771     // In some cases, we could be enticed to consider the padded spaces
8772     // as part of the string. This is fine as long as we do not write more
8773     // than we consumed.
8774     if(to_write > remaining_bytes) { to_write = remaining_bytes; }
8775     memcpy(guarded_dst, out_block, to_write);
8776     minifier.dst = guarded_dst + to_write;
8777   }
8778   return minifier.finish(dst, dst_len);
8779 }
8780 
8781 } // namespace stage1
8782 } // unnamed namespace
8783 } // namespace ppc64
8784 } // namespace simdjson
8785 /* end file src/generic/stage1/json_minifier.h */
8786 /* begin file src/generic/stage1/find_next_document_index.h */
8787 namespace simdjson {
8788 namespace ppc64 {
8789 namespace {
8790 
8791 /**
8792   * This algorithm is used to quickly identify the last structural position that
8793   * makes up a complete document.
8794   *
8795   * It does this by going backwards and finding the last *document boundary* (a
8796   * place where one value follows another without a comma between them). If the
8797   * last document (the characters after the boundary) has an equal number of
8798   * start and end brackets, it is considered complete.
8799   *
8800   * Simply put, we iterate over the structural characters, starting from
8801   * the end. We consider that we found the end of a JSON document when the
8802   * first element of the pair is NOT one of these characters: '{' '[' ';' ','
8803   * and when the second element is NOT one of these characters: '}' '}' ';' ','.
8804   *
8805   * This simple comparison works most of the time, but it does not cover cases
8806   * where the batch's structural indexes contain a perfect amount of documents.
8807   * In such a case, we do not have access to the structural index which follows
8808   * the last document, therefore, we do not have access to the second element in
8809   * the pair, and that means we cannot identify the last document. To fix this
8810   * issue, we keep a count of the open and closed curly/square braces we found
8811   * while searching for the pair. When we find a pair AND the count of open and
8812   * closed curly/square braces is the same, we know that we just passed a
8813   * complete document, therefore the last json buffer location is the end of the
8814   * batch.
8815   */
find_next_document_index(dom_parser_implementation & parser)8816 simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
8817   // TODO don't count separately, just figure out depth
8818   auto arr_cnt = 0;
8819   auto obj_cnt = 0;
8820   for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
8821     auto idxb = parser.structural_indexes[i];
8822     switch (parser.buf[idxb]) {
8823     case ':':
8824     case ',':
8825       continue;
8826     case '}':
8827       obj_cnt--;
8828       continue;
8829     case ']':
8830       arr_cnt--;
8831       continue;
8832     case '{':
8833       obj_cnt++;
8834       break;
8835     case '[':
8836       arr_cnt++;
8837       break;
8838     }
8839     auto idxa = parser.structural_indexes[i - 1];
8840     switch (parser.buf[idxa]) {
8841     case '{':
8842     case '[':
8843     case ':':
8844     case ',':
8845       continue;
8846     }
8847     // Last document is complete, so the next document will appear after!
8848     if (!arr_cnt && !obj_cnt) {
8849       return parser.n_structural_indexes;
8850     }
8851     // Last document is incomplete; mark the document at i + 1 as the next one
8852     return i;
8853   }
8854   return 0;
8855 }
8856 
8857 } // unnamed namespace
8858 } // namespace ppc64
8859 } // namespace simdjson
8860 /* end file src/generic/stage1/find_next_document_index.h */
8861 
8862 namespace simdjson {
8863 namespace ppc64 {
8864 namespace {
8865 namespace stage1 {
8866 
8867 class bit_indexer {
8868 public:
8869   uint32_t *tail;
8870 
bit_indexer(uint32_t * index_buf)8871   simdjson_really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
8872 
8873   // flatten out values in 'bits' assuming that they are are to have values of idx
8874   // plus their position in the bitvector, and store these indexes at
8875   // base_ptr[base] incrementing base as we go
8876   // will potentially store extra values beyond end of valid bits, so base_ptr
8877   // needs to be large enough to handle this
write(uint32_t idx,uint64_t bits)8878   simdjson_really_inline void write(uint32_t idx, uint64_t bits) {
8879     // In some instances, the next branch is expensive because it is mispredicted.
8880     // Unfortunately, in other cases,
8881     // it helps tremendously.
8882     if (bits == 0)
8883         return;
8884     int cnt = static_cast<int>(count_ones(bits));
8885 
8886     // Do the first 8 all together
8887     for (int i=0; i<8; i++) {
8888       this->tail[i] = idx + trailing_zeroes(bits);
8889       bits = clear_lowest_bit(bits);
8890     }
8891 
8892     // Do the next 8 all together (we hope in most cases it won't happen at all
8893     // and the branch is easily predicted).
8894     if (simdjson_unlikely(cnt > 8)) {
8895       for (int i=8; i<16; i++) {
8896         this->tail[i] = idx + trailing_zeroes(bits);
8897         bits = clear_lowest_bit(bits);
8898       }
8899 
8900       // Most files don't have 16+ structurals per block, so we take several basically guaranteed
8901       // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
8902       // or the start of a value ("abc" true 123) every four characters.
8903       if (simdjson_unlikely(cnt > 16)) {
8904         int i = 16;
8905         do {
8906           this->tail[i] = idx + trailing_zeroes(bits);
8907           bits = clear_lowest_bit(bits);
8908           i++;
8909         } while (i < cnt);
8910       }
8911     }
8912 
8913     this->tail += cnt;
8914   }
8915 };
8916 
8917 class json_structural_indexer {
8918 public:
8919   /**
8920    * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
8921    *
8922    * @param partial Setting the partial parameter to true allows the find_structural_bits to
8923    *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
8924    *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
8925    */
8926   template<size_t STEP_SIZE>
8927   static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
8928 
8929 private:
8930   simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes);
8931   template<size_t STEP_SIZE>
8932   simdjson_really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
8933   simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
8934   simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
8935 
8936   json_scanner scanner{};
8937   utf8_checker checker{};
8938   bit_indexer indexer;
8939   uint64_t prev_structurals = 0;
8940   uint64_t unescaped_chars_error = 0;
8941 };
8942 
json_structural_indexer(uint32_t * structural_indexes)8943 simdjson_really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
8944 
8945 // Skip the last character if it is partial
trim_partial_utf8(const uint8_t * buf,size_t len)8946 simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
8947   if (simdjson_unlikely(len < 3)) {
8948     switch (len) {
8949       case 2:
8950         if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
8951         if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
8952         return len;
8953       case 1:
8954         if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
8955         return len;
8956       case 0:
8957         return len;
8958     }
8959   }
8960   if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
8961   if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
8962   if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
8963   return len;
8964 }
8965 
8966 //
8967 // PERF NOTES:
8968 // We pipe 2 inputs through these stages:
8969 // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
8971 // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
8972 //    The output of step 1 depends entirely on this information. These functions don't quite use
8973 //    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
8974 //    at a time. The second input's scans has some dependency on the first ones finishing it, but
8975 //    they can make a lot of progress before they need that information.
8976 // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
8977 //    to finish: utf-8 checks and generating the output from the last iteration.
8978 //
8979 // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
8980 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
8981 // workout.
8982 //
8983 template<size_t STEP_SIZE>
index(const uint8_t * buf,size_t len,dom_parser_implementation & parser,bool partial)8984 error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
8985   if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
8986   if (partial) { len = trim_partial_utf8(buf, len); }
8987 
8988   buf_block_reader<STEP_SIZE> reader(buf, len);
8989   json_structural_indexer indexer(parser.structural_indexes.get());
8990 
8991   // Read all but the last block
8992   while (reader.has_full_block()) {
8993     indexer.step<STEP_SIZE>(reader.full_block(), reader);
8994   }
8995 
8996   // Take care of the last block (will always be there unless file is empty)
8997   uint8_t block[STEP_SIZE];
8998   if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
8999   indexer.step<STEP_SIZE>(block, reader);
9000 
9001   return indexer.finish(parser, reader.block_index(), len, partial);
9002 }
9003 
9004 template<>
step(const uint8_t * block,buf_block_reader<128> & reader)9005 simdjson_really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
9006   simd::simd8x64<uint8_t> in_1(block);
9007   simd::simd8x64<uint8_t> in_2(block+64);
9008   json_block block_1 = scanner.next(in_1);
9009   json_block block_2 = scanner.next(in_2);
9010   this->next(in_1, block_1, reader.block_index());
9011   this->next(in_2, block_2, reader.block_index()+64);
9012   reader.advance();
9013 }
9014 
9015 template<>
step(const uint8_t * block,buf_block_reader<64> & reader)9016 simdjson_really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
9017   simd::simd8x64<uint8_t> in_1(block);
9018   json_block block_1 = scanner.next(in_1);
9019   this->next(in_1, block_1, reader.block_index());
9020   reader.advance();
9021 }
9022 
next(const simd::simd8x64<uint8_t> & in,const json_block & block,size_t idx)9023 simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
9024   uint64_t unescaped = in.lteq(0x1F);
9025   checker.check_next_input(in);
9026   indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
9027   prev_structurals = block.structural_start();
9028   unescaped_chars_error |= block.non_quote_inside_string(unescaped);
9029 }
9030 
finish(dom_parser_implementation & parser,size_t idx,size_t len,bool partial)9031 simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
9032   // Write out the final iteration's structurals
9033   indexer.write(uint32_t(idx-64), prev_structurals);
9034 
9035   error_code error = scanner.finish();
9036   // We deliberately break down the next expression so that it is
9037   // human readable.
9038   const bool should_we_exit =  partial ?
9039     ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
9040     : (error != SUCCESS); // if partial is false, we must have SUCCESS
9041   const bool have_unclosed_string = (error == UNCLOSED_STRING);
9042   if (simdjson_unlikely(should_we_exit)) { return error; }
9043 
9044   if (unescaped_chars_error) {
9045     return UNESCAPED_CHARS;
9046   }
9047 
9048   parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
9049   /***
9050    * This is related to https://github.com/simdjson/simdjson/issues/906
9051    * Basically, we want to make sure that if the parsing continues beyond the last (valid)
9052    * structural character, it quickly stops.
9053    * Only three structural characters can be repeated without triggering an error in JSON:  [,] and }.
9054    * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
9055    * continues, then it must be [,] or }.
9056    * Suppose it is ] or }. We backtrack to the first character, what could it be that would
9057    * not trigger an error? It could be ] or } but no, because you can't start a document that way.
9058    * It can't be a comma, a colon or any simple value. So the only way we could continue is
9059    * if the repeated character is [. But if so, the document must start with [. But if the document
9060    * starts with [, it should end with ]. If we enforce that rule, then we would get
9061    * ][[ which is invalid.
9062    **/
9063   parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
9064   parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
9065   parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
9066   parser.next_structural_index = 0;
9067   // a valid JSON file cannot have zero structural indexes - we should have found something
9068   if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
9069     return EMPTY;
9070   }
9071   if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
9072     return UNEXPECTED_ERROR;
9073   }
9074   if (partial) {
9075     // If we have an unclosed string, then the last structural
9076     // will be the quote and we want to make sure to omit it.
9077     if(have_unclosed_string) {
9078       parser.n_structural_indexes--;
9079       // a valid JSON file cannot have zero structural indexes - we should have found something
9080       if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
9081     }
9082     auto new_structural_indexes = find_next_document_index(parser);
9083     if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
9084       return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
9085     }
9086     parser.n_structural_indexes = new_structural_indexes;
9087   }
9088   checker.check_eof();
9089   return checker.errors();
9090 }
9091 
9092 } // namespace stage1
9093 } // unnamed namespace
9094 } // namespace ppc64
9095 } // namespace simdjson
9096 /* end file src/generic/stage1/json_structural_indexer.h */
9097 /* begin file src/generic/stage1/utf8_validator.h */
9098 namespace simdjson {
9099 namespace ppc64 {
9100 namespace {
9101 namespace stage1 {
9102 
9103 /**
9104  * Validates that the string is actual UTF-8.
9105  */
9106 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)9107 bool generic_validate_utf8(const uint8_t * input, size_t length) {
9108     checker c{};
9109     buf_block_reader<64> reader(input, length);
9110     while (reader.has_full_block()) {
9111       simd::simd8x64<uint8_t> in(reader.full_block());
9112       c.check_next_input(in);
9113       reader.advance();
9114     }
9115     uint8_t block[64]{};
9116     reader.get_remainder(block);
9117     simd::simd8x64<uint8_t> in(block);
9118     c.check_next_input(in);
9119     reader.advance();
9120     c.check_eof();
9121     return c.errors() == error_code::SUCCESS;
9122 }
9123 
generic_validate_utf8(const char * input,size_t length)9124 bool generic_validate_utf8(const char * input, size_t length) {
9125     return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
9126 }
9127 
9128 } // namespace stage1
9129 } // unnamed namespace
9130 } // namespace ppc64
9131 } // namespace simdjson
9132 /* end file src/generic/stage1/utf8_validator.h */
9133 
9134 //
9135 // Stage 2
9136 //
9137 
9138 /* begin file src/generic/stage2/tape_builder.h */
9139 /* begin file src/generic/stage2/json_iterator.h */
9140 /* begin file src/generic/stage2/logger.h */
// This is for an internal-only stage 2 specific logger.
// Define SIMDJSON_VERBOSE_LOGGING to a nonzero value (which makes LOG_ENABLED true) to log what stage 2 is doing!
9143 namespace simdjson {
9144 namespace ppc64 {
9145 namespace {
9146 namespace logger {
9147 
9148   static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
9149 
#if SIMDJSON_VERBOSE_LOGGING
  // Logging is compiled in: the log_* helpers below print to stdout.
  static constexpr const bool LOG_ENABLED = true;
#else
  // Logging is compiled out: the log_* helpers below skip their printing code.
  static constexpr const bool LOG_ENABLED = false;
#endif
  // Column widths (in characters) of the log table printed by log_start() and log_line().
  static constexpr const int LOG_EVENT_LEN = 20;        // event title column
  static constexpr const int LOG_BUFFER_LEN = 30;       // characters shown from the current structural
  static constexpr const int LOG_SMALL_BUFFER_LEN = 10; // characters shown from the next structural
  static constexpr const int LOG_INDEX_LEN = 5;         // structural index column

  // Nesting depth used to indent event titles; reset to 0 by log_start().
  static int log_depth; // Not threadsafe. Log only.
9161 
9162   // Helper to turn unprintable or newline characters into spaces
printable_char(char c)9163   static simdjson_really_inline char printable_char(char c) {
9164     if (c >= 0x20) {
9165       return c;
9166     } else {
9167       return ' ';
9168     }
9169   }
9170 
9171   // Print the header and set up log_start
log_start()9172   static simdjson_really_inline void log_start() {
9173     if (LOG_ENABLED) {
9174       log_depth = 0;
9175       printf("\n");
9176       printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
9177       printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
9178     }
9179   }
9180 
log_string(const char * message)9181   simdjson_unused static simdjson_really_inline void log_string(const char *message) {
9182     if (LOG_ENABLED) {
9183       printf("%s\n", message);
9184     }
9185   }
9186 
9187   // Logs a single line from the stage 2 DOM parser
9188   template<typename S>
log_line(S & structurals,const char * title_prefix,const char * title,const char * detail)9189   static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
9190     if (LOG_ENABLED) {
9191       printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
9192       auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
9193       auto next_index = structurals.next_structural;
9194       auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>("                                                       ");
9195       auto next = &structurals.buf[*next_index];
9196       {
9197         // Print the next N characters in the buffer.
9198         printf("| ");
9199         // Otherwise, print the characters starting from the buffer position.
9200         // Print spaces for unprintable or newline characters.
9201         for (int i=0;i<LOG_BUFFER_LEN;i++) {
9202           printf("%c", printable_char(current[i]));
9203         }
9204         printf(" ");
9205         // Print the next N characters in the buffer.
9206         printf("| ");
9207         // Otherwise, print the characters starting from the buffer position.
9208         // Print spaces for unprintable or newline characters.
9209         for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
9210           printf("%c", printable_char(next[i]));
9211         }
9212         printf(" ");
9213       }
9214       if (current_index) {
9215         printf("| %*u ", LOG_INDEX_LEN, *current_index);
9216       } else {
9217         printf("| %-*s ", LOG_INDEX_LEN, "");
9218       }
9219       // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
9220       printf("| %-s ", detail);
9221       printf("|\n");
9222     }
9223   }
9224 
9225 } // namespace logger
9226 } // unnamed namespace
9227 } // namespace ppc64
9228 } // namespace simdjson
9229 /* end file src/generic/stage2/logger.h */
9230 
9231 namespace simdjson {
9232 namespace ppc64 {
9233 namespace {
9234 namespace stage2 {
9235 
/**
 * Walks the structural indexes of a document and reports what it finds to a visitor.
 */
class json_iterator {
public:
  /** The JSON text; structural indexes are offsets into this buffer. */
  const uint8_t* const buf;
  /** Points at the structural index that the next call to advance() will consume. */
  uint32_t *next_structural;
  /** The parser whose structural indexes, length and per-depth state are used. */
  dom_parser_implementation &dom_parser;
  /** Current container nesting depth; 0 while at the document root. */
  uint32_t depth{0};

  /**
   * Walk the JSON document.
   *
   * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
   * the first parameter; some callbacks have other parameters as well:
   *
   * - visit_document_start() - at the beginning.
   * - visit_document_end() - at the end (if things were successful).
   *
   * - visit_array_start() - at the start `[` of a non-empty array.
   * - visit_array_end() - at the end `]` of a non-empty array.
   * - visit_empty_array() - when an empty array is encountered.
   *
   * - visit_object_start() - at the start `{` of a non-empty object.
   * - visit_object_end() - at the end `}` of a non-empty object.
   * - visit_empty_object() - when an empty object is encountered.
   * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
   *                                   guaranteed to point at the first quote of the string (`"key"`).
   * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
   * - visit_root_primitive(const uint8_t *value) - when the top-level value is a string, number, boolean or null.
   *
   * - increment_count() - each time a value is found in an array or object.
   */
  template<bool STREAMING, typename V>
  simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;

  /**
   * Create an iterator capable of walking a JSON document.
   *
   * The document must have already passed through stage 1.
   */
  simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);

  /**
   * Look at the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *peek() const noexcept;
  /**
   * Advance to the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *advance() noexcept;
  /**
   * Get the remaining length of the document, from the start of the current token.
   */
  simdjson_really_inline size_t remaining_len() const noexcept;
  /**
   * Check if we are at the end of the document.
   *
   * If this is true, there are no more tokens.
   */
  simdjson_really_inline bool at_eof() const noexcept;
  /**
   * Check if we are at the beginning of the document.
   */
  simdjson_really_inline bool at_beginning() const noexcept;
  /**
   * Get the character at the last structural index of the document.
   */
  simdjson_really_inline uint8_t last_structural() const noexcept;

  /**
   * Log that a value has been found.
   *
   * Only prints when logger::LOG_ENABLED is true (see logger.h).
   */
  simdjson_really_inline void log_value(const char *type) const noexcept;
  /**
   * Log the start of a multipart value.
   *
   * Only prints when logger::LOG_ENABLED is true (see logger.h).
   */
  simdjson_really_inline void log_start_value(const char *type) const noexcept;
  /**
   * Log the end of a multipart value.
   *
   * Only prints when logger::LOG_ENABLED is true (see logger.h).
   */
  simdjson_really_inline void log_end_value(const char *type) const noexcept;
  /**
   * Log an error.
   *
   * Only prints when logger::LOG_ENABLED is true (see logger.h).
   */
  simdjson_really_inline void log_error(const char *error) const noexcept;

  /**
   * Dispatch a top-level primitive to the matching visitor.visit_root_*() callback, chosen by
   * the first character of the value. Returns TAPE_ERROR if that character cannot start a value.
   */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
  /**
   * Dispatch a primitive inside an array or object to the matching visitor callback, chosen by
   * the first character of the value. Returns TAPE_ERROR if that character cannot start a value.
   */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
};
9338 
/**
 * Walks one document, driving the visitor through a goto-based state machine.
 *
 * States: object_begin / object_field / object_continue handle objects, array_begin /
 * array_value / array_continue handle arrays, scope_end pops one nesting level and
 * document_end finishes up. dom_parser.is_array[depth] records, per nesting level, whether
 * the open container is an array so that scope_end knows which state to resume.
 *
 * Returns EMPTY when there are no tokens, TAPE_ERROR on malformed structure, DEPTH_ERROR
 * when the nesting reaches dom_parser.max_depth(), any error returned by the visitor
 * (through SIMDJSON_TRY), and SUCCESS otherwise.
 *
 * When STREAMING is false, the whole set of structural indexes must be consumed by this
 * one document; when it is true that check and the "outer container is closed" check are
 * skipped.
 */
template<bool STREAMING, typename V>
simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
  logger::log_start();

  //
  // Start the document
  //
  if (at_eof()) { return EMPTY; }
  log_start_value("document");
  SIMDJSON_TRY( visitor.visit_document_start(*this) );

  //
  // Read first value
  //
  {
    auto value = advance();

    // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
    // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
    if (!STREAMING) {
      switch (*value) {
        case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
        case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
      }
    }

    // Empty containers are reported directly; non-empty ones enter the state machine.
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
    }
  }
  goto document_end;

//
// Object parser states
//
object_begin:
  // Entered just after the `{` of a non-empty object has been consumed.
  log_start_value("object");
  depth++;
  if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
  dom_parser.is_array[depth] = false;
  SIMDJSON_TRY( visitor.visit_object_start(*this) );

  {
    // The first token of a non-empty object must be a key string.
    auto key = advance();
    if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
    SIMDJSON_TRY( visitor.increment_count(*this) );
    SIMDJSON_TRY( visitor.visit_key(*this, key) );
  }

object_field:
  // Entered after a key: expect `:` followed by the field value.
  if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
  {
    auto value = advance();
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
    }
  }

object_continue:
  // Entered after a field value: expect `,` (another field) or `}` (end of object).
  switch (*advance()) {
    case ',':
      SIMDJSON_TRY( visitor.increment_count(*this) );
      {
        auto key = advance();
        if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
        SIMDJSON_TRY( visitor.visit_key(*this, key) );
      }
      goto object_field;
    case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
    default: log_error("No comma between object fields"); return TAPE_ERROR;
  }

scope_end:
  // A container just closed: pop one level and resume the parent container's state.
  depth--;
  if (depth == 0) { goto document_end; }
  if (dom_parser.is_array[depth]) { goto array_continue; }
  goto object_continue;

//
// Array parser states
//
array_begin:
  // Entered just after the `[` of a non-empty array has been consumed.
  log_start_value("array");
  depth++;
  if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
  dom_parser.is_array[depth] = true;
  SIMDJSON_TRY( visitor.visit_array_start(*this) );
  SIMDJSON_TRY( visitor.increment_count(*this) );

array_value:
  // Expect an element value.
  {
    auto value = advance();
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
    }
  }

array_continue:
  // Entered after an element: expect `,` (another element) or `]` (end of array).
  switch (*advance()) {
    case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
    case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
    default: log_error("Missing comma between array values"); return TAPE_ERROR;
  }

document_end:
  log_end_value("document");
  SIMDJSON_TRY( visitor.visit_document_end(*this) );

  // Record how far we got as an index into the parser's structural index array.
  dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);

  // If we didn't make it to the end, it's an error
  if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
    log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
    return TAPE_ERROR;
  }

  return SUCCESS;

} // walk_document()
9464 
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)9465 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
9466   : buf{_dom_parser.buf},
9467     next_structural{&_dom_parser.structural_indexes[start_structural_index]},
9468     dom_parser{_dom_parser} {
9469 }
9470 
peek() const9471 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
9472   return &buf[*(next_structural)];
9473 }
advance()9474 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
9475   return &buf[*(next_structural++)];
9476 }
remaining_len() const9477 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
9478   return dom_parser.len - *(next_structural-1);
9479 }
9480 
at_eof() const9481 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
9482   return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
9483 }
at_beginning() const9484 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
9485   return next_structural == dom_parser.structural_indexes.get();
9486 }
last_structural() const9487 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
9488   return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
9489 }
9490 
log_value(const char * type) const9491 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
9492   logger::log_line(*this, "", type, "");
9493 }
9494 
log_start_value(const char * type) const9495 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
9496   logger::log_line(*this, "+", type, "");
9497   if (logger::LOG_ENABLED) { logger::log_depth++; }
9498 }
9499 
log_end_value(const char * type) const9500 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
9501   if (logger::LOG_ENABLED) { logger::log_depth--; }
9502   logger::log_line(*this, "-", type, "");
9503 }
9504 
log_error(const char * error) const9505 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
9506   logger::log_line(*this, "", "ERROR", error);
9507 }
9508 
9509 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)9510 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
9511   switch (*value) {
9512     case '"': return visitor.visit_root_string(*this, value);
9513     case 't': return visitor.visit_root_true_atom(*this, value);
9514     case 'f': return visitor.visit_root_false_atom(*this, value);
9515     case 'n': return visitor.visit_root_null_atom(*this, value);
9516     case '-':
9517     case '0': case '1': case '2': case '3': case '4':
9518     case '5': case '6': case '7': case '8': case '9':
9519       return visitor.visit_root_number(*this, value);
9520     default:
9521       log_error("Document starts with a non-value character");
9522       return TAPE_ERROR;
9523   }
9524 }
9525 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)9526 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
9527   switch (*value) {
9528     case '"': return visitor.visit_string(*this, value);
9529     case 't': return visitor.visit_true_atom(*this, value);
9530     case 'f': return visitor.visit_false_atom(*this, value);
9531     case 'n': return visitor.visit_null_atom(*this, value);
9532     case '-':
9533     case '0': case '1': case '2': case '3': case '4':
9534     case '5': case '6': case '7': case '8': case '9':
9535       return visitor.visit_number(*this, value);
9536     default:
9537       log_error("Non-value found when value was expected!");
9538       return TAPE_ERROR;
9539   }
9540 }
9541 
9542 } // namespace stage2
9543 } // unnamed namespace
9544 } // namespace ppc64
9545 } // namespace simdjson
9546 /* end file src/generic/stage2/json_iterator.h */
9547 /* begin file src/generic/stage2/tape_writer.h */
9548 namespace simdjson {
9549 namespace ppc64 {
9550 namespace {
9551 namespace stage2 {
9552 
/**
 * Sequential writer of 64-bit tape entries: each append/skip moves next_tape_loc forward.
 */
struct tape_writer {
  /** The next place to write to tape */
  uint64_t *next_tape_loc;

  /** Write a signed 64-bit value to tape. */
  simdjson_really_inline void append_s64(int64_t value) noexcept;

  /** Write an unsigned 64-bit value to tape. */
  simdjson_really_inline void append_u64(uint64_t value) noexcept;

  /** Write a double value to tape. */
  simdjson_really_inline void append_double(double value) noexcept;

  /**
   * Append a tape entry (an 8-bit type, and 56 bits worth of value).
   */
  simdjson_really_inline void append(uint64_t val, internal::tape_type t) noexcept;

  /**
   * Skip the current tape entry without writing.
   *
   * Used to skip the start of the container, since we'll come back later to fill it in when the
   * container ends.
   */
  simdjson_really_inline void skip() noexcept;

  /**
   * Skip the number of tape entries necessary to write a large u64 or i64.
   */
  simdjson_really_inline void skip_large_integer() noexcept;

  /**
   * Skip the number of tape entries necessary to write a double.
   */
  simdjson_really_inline void skip_double() noexcept;

  /**
   * Write a value to a known location on tape.
   *
   * Used to go back and write out the start of a container after the container ends.
   */
  simdjson_really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;

private:
  /**
   * Append both the tape entry, and a supplementary value following it. Used for types that need
   * all 64 bits, such as double and uint64_t.
   */
  template<typename T>
  simdjson_really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
}; // struct tape_writer
9604 
append_s64(int64_t value)9605 simdjson_really_inline void tape_writer::append_s64(int64_t value) noexcept {
9606   append2(0, value, internal::tape_type::INT64);
9607 }
9608 
append_u64(uint64_t value)9609 simdjson_really_inline void tape_writer::append_u64(uint64_t value) noexcept {
9610   append(0, internal::tape_type::UINT64);
9611   *next_tape_loc = value;
9612   next_tape_loc++;
9613 }
9614 
9615 /** Write a double value to tape. */
append_double(double value)9616 simdjson_really_inline void tape_writer::append_double(double value) noexcept {
9617   append2(0, value, internal::tape_type::DOUBLE);
9618 }
9619 
skip()9620 simdjson_really_inline void tape_writer::skip() noexcept {
9621   next_tape_loc++;
9622 }
9623 
skip_large_integer()9624 simdjson_really_inline void tape_writer::skip_large_integer() noexcept {
9625   next_tape_loc += 2;
9626 }
9627 
skip_double()9628 simdjson_really_inline void tape_writer::skip_double() noexcept {
9629   next_tape_loc += 2;
9630 }
9631 
append(uint64_t val,internal::tape_type t)9632 simdjson_really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
9633   *next_tape_loc = val | ((uint64_t(char(t))) << 56);
9634   next_tape_loc++;
9635 }
9636 
9637 template<typename T>
append2(uint64_t val,T val2,internal::tape_type t)9638 simdjson_really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
9639   append(val, t);
9640   static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
9641   memcpy(next_tape_loc, &val2, sizeof(val2));
9642   next_tape_loc++;
9643 }
9644 
write(uint64_t & tape_loc,uint64_t val,internal::tape_type t)9645 simdjson_really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
9646   tape_loc = val | ((uint64_t(char(t))) << 56);
9647 }
9648 
9649 } // namespace stage2
9650 } // unnamed namespace
9651 } // namespace ppc64
9652 } // namespace simdjson
9653 /* end file src/generic/stage2/tape_writer.h */
9654 
9655 namespace simdjson {
9656 namespace ppc64 {
9657 namespace {
9658 namespace stage2 {
9659 
/**
 * Visitor for json_iterator::walk_document() that writes what it is told about to a
 * dom::document's tape and string buffer.
 */
struct tape_builder {
  /**
   * Parse one document from the parser's structural indexes into doc.
   *
   * When STREAMING is true, parsing starts at dom_parser.next_structural_index instead of 0.
   */
  template<bool STREAMING>
  simdjson_warn_unused static simdjson_really_inline error_code parse_document(
    dom_parser_implementation &dom_parser,
    dom::document &doc) noexcept;

  /** Called when a non-empty document starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
  /** Called when a non-empty document ends without error. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;

  /** Called when a non-empty array starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
  /** Called when a non-empty array ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
  /** Called when an empty array is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;

  /** Called when a non-empty object starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
  /**
   * Called when a key in a field is encountered.
   *
   * visit_primitive, visit_object_start, visit_empty_object, visit_array_start, or
   * visit_empty_array will be called after this with the field value.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
  /** Called when a non-empty object ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
  /** Called when an empty object is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;

  /**
   * Called when a string, number, boolean or null is found.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
  /**
   * Called when a string, number, boolean or null is found at the top level of a document (i.e.
   * when there is no array or object and the entire document is a single string, number, boolean or
   * null).
   *
   * This is separate from visit_primitive() because simdjson's normal primitive parsing routines
   * assume there is at least one more token after the value, which is only true in an array or
   * object.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;

  /** Type-specific callbacks for primitives inside an array or object. */
  simdjson_warn_unused simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /** Type-specific callbacks for a primitive that is the whole document. */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /** Called each time a new field or element in an array or object is found. */
  simdjson_warn_unused simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;

  /** Next location to write to tape */
  tape_writer tape;
private:
  /** Next write location in the string buf for stage 2 parsing */
  uint8_t *current_string_buf_loc;

  /** Private: builders are created by parse_document(). */
  simdjson_really_inline tape_builder(dom::document &doc) noexcept;

  simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
  simdjson_really_inline void start_container(json_iterator &iter) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
  simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
}; // struct tape_builder
9736 
9737 template<bool STREAMING>
parse_document(dom_parser_implementation & dom_parser,dom::document & doc)9738 simdjson_warn_unused simdjson_really_inline error_code tape_builder::parse_document(
9739     dom_parser_implementation &dom_parser,
9740     dom::document &doc) noexcept {
9741   dom_parser.doc = &doc;
9742   json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
9743   tape_builder builder(doc);
9744   return iter.walk_document<STREAMING>(builder);
9745 }
9746 
visit_root_primitive(json_iterator & iter,const uint8_t * value)9747 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
9748   return iter.visit_root_primitive(*this, value);
9749 }
visit_primitive(json_iterator & iter,const uint8_t * value)9750 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
9751   return iter.visit_primitive(*this, value);
9752 }
visit_empty_object(json_iterator & iter)9753 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
9754   return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
9755 }
visit_empty_array(json_iterator & iter)9756 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
9757   return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
9758 }
9759 
visit_document_start(json_iterator & iter)9760 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
9761   start_container(iter);
9762   return SUCCESS;
9763 }
visit_object_start(json_iterator & iter)9764 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
9765   start_container(iter);
9766   return SUCCESS;
9767 }
visit_array_start(json_iterator & iter)9768 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
9769   start_container(iter);
9770   return SUCCESS;
9771 }
9772 
visit_object_end(json_iterator & iter)9773 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
9774   return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
9775 }
visit_array_end(json_iterator & iter)9776 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
9777   return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
9778 }
visit_document_end(json_iterator & iter)9779 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
9780   constexpr uint32_t start_tape_index = 0;
9781   tape.append(start_tape_index, internal::tape_type::ROOT);
9782   tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
9783   return SUCCESS;
9784 }
visit_key(json_iterator & iter,const uint8_t * key)9785 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
9786   return visit_string(iter, key, true);
9787 }
9788 
increment_count(json_iterator & iter)9789 simdjson_warn_unused simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
9790   iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
9791   return SUCCESS;
9792 }
9793 
tape_builder(dom::document & doc)9794 simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
9795 
visit_string(json_iterator & iter,const uint8_t * value,bool key)9796 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
9797   iter.log_value(key ? "key" : "string");
9798   uint8_t *dst = on_start_string(iter);
9799   dst = stringparsing::parse_string(value+1, dst);
9800   if (dst == nullptr) {
9801     iter.log_error("Invalid escape in string");
9802     return STRING_ERROR;
9803   }
9804   on_end_string(dst);
9805   return SUCCESS;
9806 }
9807 
visit_root_string(json_iterator & iter,const uint8_t * value)9808 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
9809   return visit_string(iter, value);
9810 }
9811 
visit_number(json_iterator & iter,const uint8_t * value)9812 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
9813   iter.log_value("number");
9814   return numberparsing::parse_number(value, tape);
9815 }
9816 
visit_root_number(json_iterator & iter,const uint8_t * value)9817 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
9818   //
9819   // We need to make a copy to make sure that the string is space terminated.
9820   // This is not about padding the input, which should already padded up
9821   // to len + SIMDJSON_PADDING. However, we have no control at this stage
9822   // on how the padding was done. What if the input string was padded with nulls?
9823   // It is quite common for an input string to have an extra null character (C string).
9824   // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
9825   // document, but the string "9\0" by itself is fine. So we make a copy and
9826   // pad the input with spaces when we know that there is just one input element.
9827   // This copy is relatively expensive, but it will almost never be called in
9828   // practice unless you are in the strange scenario where you have many JSON
9829   // documents made of single atoms.
9830   //
9831   std::unique_ptr<uint8_t[]>copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
9832   if (copy.get() == nullptr) { return MEMALLOC; }
9833   std::memcpy(copy.get(), value, iter.remaining_len());
9834   std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
9835   error_code error = visit_number(iter, copy.get());
9836   return error;
9837 }
9838 
visit_true_atom(json_iterator & iter,const uint8_t * value)9839 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
9840   iter.log_value("true");
9841   if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
9842   tape.append(0, internal::tape_type::TRUE_VALUE);
9843   return SUCCESS;
9844 }
9845 
visit_root_true_atom(json_iterator & iter,const uint8_t * value)9846 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
9847   iter.log_value("true");
9848   if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
9849   tape.append(0, internal::tape_type::TRUE_VALUE);
9850   return SUCCESS;
9851 }
9852 
visit_false_atom(json_iterator & iter,const uint8_t * value)9853 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
9854   iter.log_value("false");
9855   if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
9856   tape.append(0, internal::tape_type::FALSE_VALUE);
9857   return SUCCESS;
9858 }
9859 
visit_root_false_atom(json_iterator & iter,const uint8_t * value)9860 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
9861   iter.log_value("false");
9862   if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
9863   tape.append(0, internal::tape_type::FALSE_VALUE);
9864   return SUCCESS;
9865 }
9866 
visit_null_atom(json_iterator & iter,const uint8_t * value)9867 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
9868   iter.log_value("null");
9869   if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
9870   tape.append(0, internal::tape_type::NULL_VALUE);
9871   return SUCCESS;
9872 }
9873 
visit_root_null_atom(json_iterator & iter,const uint8_t * value)9874 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
9875   iter.log_value("null");
9876   if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
9877   tape.append(0, internal::tape_type::NULL_VALUE);
9878   return SUCCESS;
9879 }
9880 
9881 // private:
9882 
next_tape_index(json_iterator & iter) const9883 simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
9884   return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
9885 }
9886 
empty_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)9887 simdjson_warn_unused simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
9888   auto start_index = next_tape_index(iter);
9889   tape.append(start_index+2, start);
9890   tape.append(start_index, end);
9891   return SUCCESS;
9892 }
9893 
start_container(json_iterator & iter)9894 simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
9895   iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
9896   iter.dom_parser.open_containers[iter.depth].count = 0;
9897   tape.skip(); // We don't actually *write* the start element until the end.
9898 }
9899 
end_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)9900 simdjson_warn_unused simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
9901   // Write the ending tape element, pointing at the start location
9902   const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
9903   tape.append(start_tape_index, end);
9904   // Write the start tape element, pointing at the end location (and including count)
9905   // count can overflow if it exceeds 24 bits... so we saturate
9906   // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
9907   const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
9908   const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
9909   tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
9910   return SUCCESS;
9911 }
9912 
on_start_string(json_iterator & iter)9913 simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
9914   // we advance the point, accounting for the fact that we have a NULL termination
9915   tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
9916   return current_string_buf_loc + sizeof(uint32_t);
9917 }
9918 
on_end_string(uint8_t * dst)9919 simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
9920   uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
9921   // TODO check for overflow in case someone has a crazy string (>=4GB?)
9922   // But only add the overflow check when the document itself exceeds 4GB
9923   // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
9924   memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
9925   // NULL termination is still handy if you expect all your strings to
9926   // be NULL terminated? It comes at a small cost
9927   *dst = 0;
9928   current_string_buf_loc = dst + 1;
9929 }
9930 
9931 } // namespace stage2
9932 } // unnamed namespace
9933 } // namespace ppc64
9934 } // namespace simdjson
9935 /* end file src/generic/stage2/tape_builder.h */
9936 
9937 //
9938 // Implementation-specific overrides
9939 //
9940 namespace simdjson {
9941 namespace ppc64 {
9942 namespace {
9943 namespace stage1 {
9944 
find_escaped(uint64_t backslash)9945 simdjson_really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
9946   // On PPC, we don't short-circuit this if there are no backslashes, because the branch gives us no
9947   // benefit and therefore makes things worse.
9948   // if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
9949   return find_escaped_branchless(backslash);
9950 }
9951 
9952 } // namespace stage1
9953 } // unnamed namespace
9954 
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const9955 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
9956   return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
9957 }
9958 
stage1(const uint8_t * _buf,size_t _len,bool streaming)9959 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
9960   this->buf = _buf;
9961   this->len = _len;
9962   return ppc64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
9963 }
9964 
validate_utf8(const char * buf,size_t len) const9965 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
9966   return ppc64::stage1::generic_validate_utf8(buf,len);
9967 }
9968 
stage2(dom::document & _doc)9969 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
9970   return stage2::tape_builder::parse_document<false>(*this, _doc);
9971 }
9972 
stage2_next(dom::document & _doc)9973 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
9974   return stage2::tape_builder::parse_document<true>(*this, _doc);
9975 }
9976 
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)9977 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
9978   auto error = stage1(_buf, _len, false);
9979   if (error) { return error; }
9980   return stage2(_doc);
9981 }
9982 
9983 } // namespace ppc64
9984 } // namespace simdjson
9985 
9986 /* begin file include/simdjson/ppc64/end.h */
9987 /* end file include/simdjson/ppc64/end.h */
9988 /* end file src/ppc64/dom_parser_implementation.cpp */
9989 #endif
9990 #if SIMDJSON_IMPLEMENTATION_WESTMERE
9991 /* begin file src/westmere/implementation.cpp */
9992 /* begin file include/simdjson/westmere/begin.h */
9993 // redefining SIMDJSON_IMPLEMENTATION to "westmere"
9994 // #define SIMDJSON_IMPLEMENTATION westmere
9995 SIMDJSON_TARGET_WESTMERE
9996 /* end file include/simdjson/westmere/begin.h */
9997 
9998 namespace simdjson {
9999 namespace westmere {
10000 
create_dom_parser_implementation(size_t capacity,size_t max_depth,std::unique_ptr<internal::dom_parser_implementation> & dst) const10001 simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
10002   size_t capacity,
10003   size_t max_depth,
10004   std::unique_ptr<internal::dom_parser_implementation>& dst
10005 ) const noexcept {
10006   dst.reset( new (std::nothrow) dom_parser_implementation() );
10007   if (!dst) { return MEMALLOC; }
10008   dst->set_capacity(capacity);
10009   dst->set_max_depth(max_depth);
10010   return SUCCESS;
10011 }
10012 
10013 } // namespace westmere
10014 } // namespace simdjson
10015 
10016 /* begin file include/simdjson/westmere/end.h */
10017 SIMDJSON_UNTARGET_WESTMERE
10018 /* end file include/simdjson/westmere/end.h */
10019 /* end file src/westmere/implementation.cpp */
10020 /* begin file src/westmere/dom_parser_implementation.cpp */
10021 /* begin file include/simdjson/westmere/begin.h */
10022 // redefining SIMDJSON_IMPLEMENTATION to "westmere"
10023 // #define SIMDJSON_IMPLEMENTATION westmere
10024 SIMDJSON_TARGET_WESTMERE
10025 /* end file include/simdjson/westmere/begin.h */
10026 
10027 //
10028 // Stage 1
10029 //
10030 
10031 namespace simdjson {
10032 namespace westmere {
10033 namespace {
10034 
10035 using namespace simd;
10036 
10037 struct json_character_block {
10038   static simdjson_really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
10039 
whitespacesimdjson::westmere::__anon9bb6be6f3311::json_character_block10040   simdjson_really_inline uint64_t whitespace() const noexcept { return _whitespace; }
opsimdjson::westmere::__anon9bb6be6f3311::json_character_block10041   simdjson_really_inline uint64_t op() const noexcept { return _op; }
scalarsimdjson::westmere::__anon9bb6be6f3311::json_character_block10042   simdjson_really_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
10043 
10044   uint64_t _whitespace;
10045   uint64_t _op;
10046 };
10047 
classify(const simd::simd8x64<uint8_t> & in)10048 simdjson_really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
10049   // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
10050   // we can't use the generic lookup_16.
10051   auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
10052 
10053   // The 6 operators (:,[]{}) have these values:
10054   //
10055   // , 2C
10056   // : 3A
10057   // [ 5B
10058   // { 7B
10059   // ] 5D
10060   // } 7D
10061   //
10062   // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique.
10063   // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then
10064   // match it (against | 0x20).
10065   //
10066   // To prevent recognizing other characters, everything else gets compared with 0, which cannot
10067   // match due to the | 0x20.
10068   //
10069   // NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A) like ,
10070   // and :. This gets caught in stage 2, which checks the actual character to ensure the right
10071   // operators are in the right places.
10072   const auto op_table = simd8<uint8_t>::repeat_16(
10073     0, 0, 0, 0,
10074     0, 0, 0, 0,
10075     0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B
10076     ',', '}', 0, 0  // , = 2C, ] = 5D, } = 7D
10077   );
10078 
10079   // We compute whitespace and op separately. If the code later only use one or the
10080   // other, given the fact that all functions are aggressively inlined, we can
10081   // hope that useless computations will be omitted. This is namely case when
10082   // minifying (we only need whitespace).
10083 
10084 
10085   const uint64_t whitespace = in.eq({
10086     _mm_shuffle_epi8(whitespace_table, in.chunks[0]),
10087     _mm_shuffle_epi8(whitespace_table, in.chunks[1]),
10088     _mm_shuffle_epi8(whitespace_table, in.chunks[2]),
10089     _mm_shuffle_epi8(whitespace_table, in.chunks[3])
10090   });
10091   // Turn [ and ] into { and }
10092   const simd8x64<uint8_t> curlified{
10093     in.chunks[0] | 0x20,
10094     in.chunks[1] | 0x20,
10095     in.chunks[2] | 0x20,
10096     in.chunks[3] | 0x20
10097   };
10098   const uint64_t op = curlified.eq({
10099     _mm_shuffle_epi8(op_table, in.chunks[0]),
10100     _mm_shuffle_epi8(op_table, in.chunks[1]),
10101     _mm_shuffle_epi8(op_table, in.chunks[2]),
10102     _mm_shuffle_epi8(op_table, in.chunks[3])
10103   });
10104     return { whitespace, op };
10105 }
10106 
is_ascii(const simd8x64<uint8_t> & input)10107 simdjson_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
10108   return input.reduce_or().is_ascii();
10109 }
10110 
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)10111 simdjson_unused simdjson_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
10112   simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
10113   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
10114   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
10115   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
10116   return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
10117 }
10118 
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)10119 simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
10120   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
10121   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
10122   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
10123   return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
10124 }
10125 
10126 } // unnamed namespace
10127 } // namespace westmere
10128 } // namespace simdjson
10129 
10130 /* begin file src/generic/stage1/utf8_lookup4_algorithm.h */
10131 namespace simdjson {
10132 namespace westmere {
10133 namespace {
10134 namespace utf8_validation {
10135 
10136 using namespace simd;
10137 
  // Classify every (previous byte, current byte) pair and return, per lane, the set of
  // error bits that apply to it.
  //
  // Three 16-entry lookups are performed: on the high nibble of the previous byte, on the
  // low nibble of the previous byte, and on the high nibble of the current byte. Each lookup
  // yields the set of error classes that are still possible given that nibble; the AND of
  // the three is the set of error classes all three nibbles agree on.
  //
  // @param input the current bytes ("byte 2" in the patterns below)
  // @param prev1 for each lane, the byte preceding it ("byte 1" in the patterns below)
  simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large (second byte 1000____) / Overlong 4-byte -- these two share a bit
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // Same bit as TOO_LARGE_1000: both apply to a 1111____ lead followed by 1000____.
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Error classes that remain possible given the HIGH nibble of the previous byte.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    // Classes that do not depend on the low nibble of byte 1 are allowed by every entry of
    // the low-nibble table, so that the final AND does not filter them out.
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Error classes that remain possible given the LOW nibble of the previous byte.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Error classes that remain possible given the HIGH nibble of the current byte.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // A class survives only if all three nibbles are compatible with it.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)10228   simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
10229       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
10230     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
10231     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
10232     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
10233     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
10234     return must23_80 ^ sc;
10235   }
10236 
10237   //
10238   // Return nonzero if there are incomplete multibyte characters at the end of the block:
10239   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
10240   //
is_incomplete(const simd8<uint8_t> input)10241   simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
10242     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
10243     // ... 1111____ 111_____ 11______
10244     static const uint8_t max_array[32] = {
10245       255, 255, 255, 255, 255, 255, 255, 255,
10246       255, 255, 255, 255, 255, 255, 255, 255,
10247       255, 255, 255, 255, 255, 255, 255, 255,
10248       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
10249     };
10250     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
10251     return input.gt_bits(max_value);
10252   }
10253 
10254   struct utf8_checker {
10255     // If this is nonzero, there has been a UTF-8 error.
10256     simd8<uint8_t> error;
10257     // The last input we received
10258     simd8<uint8_t> prev_input_block;
10259     // Whether the last input we received was incomplete (used for ASCII fast path)
10260     simd8<uint8_t> prev_incomplete;
10261 
10262     //
10263     // Check whether the current bytes are valid UTF-8.
10264     //
check_utf8_bytessimdjson::westmere::__anon9bb6be6f3411::utf8_validation::utf8_checker10265     simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
10266       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
10267       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
10268       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
10269       simd8<uint8_t> sc = check_special_cases(input, prev1);
10270       this->error |= check_multibyte_lengths(input, prev_input, sc);
10271     }
10272 
10273     // The only problem that can happen at EOF is that a multibyte character is too short
10274     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
10275     // too large in the first of two bytes.
check_eofsimdjson::westmere::__anon9bb6be6f3411::utf8_validation::utf8_checker10276     simdjson_really_inline void check_eof() {
10277       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
10278       // possibly finish them.
10279       this->error |= this->prev_incomplete;
10280     }
10281 
check_next_inputsimdjson::westmere::__anon9bb6be6f3411::utf8_validation::utf8_checker10282     simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
10283       if(simdjson_likely(is_ascii(input))) {
10284         this->error |= this->prev_incomplete;
10285       } else {
10286         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
10287         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
10288             "We support either two or four chunks per 64-byte block.");
10289         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
10290           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
10291           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
10292         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
10293           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
10294           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
10295           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
10296           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
10297         }
10298         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
10299         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
10300 
10301       }
10302     }
10303     // do not forget to call check_eof!
errorssimdjson::westmere::__anon9bb6be6f3411::utf8_validation::utf8_checker10304     simdjson_really_inline error_code errors() {
10305       return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
10306     }
10307 
10308   }; // struct utf8_checker
10309 } // namespace utf8_validation
10310 
10311 using utf8_validation::utf8_checker;
10312 
10313 } // unnamed namespace
10314 } // namespace westmere
10315 } // namespace simdjson
10316 /* end file src/generic/stage1/utf8_lookup4_algorithm.h */
10317 /* begin file src/generic/stage1/json_structural_indexer.h */
10318 // This file contains the common code every implementation uses in stage1
10319 // It is intended to be included multiple times and compiled multiple times
10320 // We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
10322 
10323 /* begin file src/generic/stage1/buf_block_reader.h */
10324 namespace simdjson {
10325 namespace westmere {
10326 namespace {
10327 
10328 // Walks through a buffer in block-sized increments, loading the last part with spaces
10329 template<size_t STEP_SIZE>
10330 struct buf_block_reader {
10331 public:
10332   simdjson_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
10333   simdjson_really_inline size_t block_index();
10334   simdjson_really_inline bool has_full_block() const;
10335   simdjson_really_inline const uint8_t *full_block() const;
10336   /**
10337    * Get the last block, padded with spaces.
10338    *
10339    * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
10340    * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
10341    * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
10342    *
10343    * @return the number of effective characters in the last block.
10344    */
10345   simdjson_really_inline size_t get_remainder(uint8_t *dst) const;
10346   simdjson_really_inline void advance();
10347 private:
10348   const uint8_t *buf;
10349   const size_t len;
10350   const size_t lenminusstep;
10351   size_t idx;
10352 };
10353 
10354 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)10355 simdjson_unused static char * format_input_text_64(const uint8_t *text) {
10356   static char buf[sizeof(simd8x64<uint8_t>) + 1];
10357   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
10358     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
10359   }
10360   buf[sizeof(simd8x64<uint8_t>)] = '\0';
10361   return buf;
10362 }
10363 
10364 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)10365 simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
10366   static char buf[sizeof(simd8x64<uint8_t>) + 1];
10367   in.store(reinterpret_cast<uint8_t*>(buf));
10368   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
10369     if (buf[i] < ' ') { buf[i] = '_'; }
10370   }
10371   buf[sizeof(simd8x64<uint8_t>)] = '\0';
10372   return buf;
10373 }
10374 
// Debugging helper: renders a 64-bit mask as 64 characters, least-significant bit first.
//
// A set bit is shown as 'X' and a clear bit as ' '. The returned pointer refers to a
// function-local static buffer, so the next call overwrites it.
simdjson_unused static char * format_mask(uint64_t mask) {
  constexpr size_t bit_count = 64;
  static char buf[bit_count + 1];
  for (size_t i=0; i<bit_count; i++) {
    // The probe bit must be 64 bits wide: shifting a size_t by 32 or more is undefined behavior
    // wherever size_t is only 32 bits.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[bit_count] = '\0';
  return buf;
}
10383 
10384 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)10385 simdjson_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
10386 
10387 template<size_t STEP_SIZE>
block_index()10388 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
10389 
10390 template<size_t STEP_SIZE>
has_full_block() const10391 simdjson_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
10392   return idx < lenminusstep;
10393 }
10394 
10395 template<size_t STEP_SIZE>
full_block() const10396 simdjson_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
10397   return &buf[idx];
10398 }
10399 
10400 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const10401 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
10402   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
10403   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
10404   std::memcpy(dst, buf + idx, len - idx);
10405   return len - idx;
10406 }
10407 
10408 template<size_t STEP_SIZE>
advance()10409 simdjson_really_inline void buf_block_reader<STEP_SIZE>::advance() {
10410   idx += STEP_SIZE;
10411 }
10412 
10413 } // unnamed namespace
10414 } // namespace westmere
10415 } // namespace simdjson
10416 /* end file src/generic/stage1/buf_block_reader.h */
10417 /* begin file src/generic/stage1/json_string_scanner.h */
10418 namespace simdjson {
10419 namespace westmere {
10420 namespace {
10421 namespace stage1 {
10422 
// String-related bitmasks for one scanned block: each accessor returns a 64-bit mask derived from
// the four masks stored below.
struct json_string_block {
  // The constructor is spelled out in the hope of resolving inlining issues with Visual Studio 2017
  simdjson_really_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string)
    : _backslash(backslash),
      _escaped(escaped),
      _quote(quote),
      _in_string(in_string) {}

  // Escaped characters (characters following an escape() character)
  simdjson_really_inline uint64_t escaped() const {
    return _escaped;
  }
  // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
  simdjson_really_inline uint64_t escape() const {
    return ~_escaped & _backslash;
  }
  // Real (non-backslashed) quotes
  simdjson_really_inline uint64_t quote() const {
    return _quote;
  }
  // Start quotes of strings: quotes that are flagged as in-string
  simdjson_really_inline uint64_t string_start() const {
    return _in_string & _quote;
  }
  // End quotes of strings: quotes that are not flagged as in-string
  simdjson_really_inline uint64_t string_end() const {
    return ~_in_string & _quote;
  }
  // Only characters inside the string (not including the quotes)
  simdjson_really_inline uint64_t string_content() const {
    return ~_quote & _in_string;
  }
  // Restrict the given mask to characters inside a string (only works on non-quotes)
  simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const {
    return _in_string & mask;
  }
  // Restrict the given mask to characters outside a string (only works on non-quotes)
  simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const {
    return ~_in_string & mask;
  }
  // Tail of string (everything except the start quote)
  simdjson_really_inline uint64_t string_tail() const {
    return _quote ^ _in_string;
  }

  // backslash characters
  uint64_t _backslash;
  // escaped characters (backslashed--does not include the hex characters after \u)
  uint64_t _escaped;
  // real quotes (non-backslashed ones)
  uint64_t _quote;
  // string characters (includes start quote but not end quote)
  uint64_t _in_string;
};
10456 
// Scans blocks for string characters, storing the state necessary to do so.
//
// The scanner is stateful: next() must be fed consecutive blocks in input order, because the
// "inside a string" and "next character is escaped" conditions are carried from block to block.
class json_string_scanner {
public:
  // Classifies the backslashes, escaped characters, quotes and in-string characters of one block
  // and updates the state carried over to the following block.
  simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
  // Returns either UNCLOSED_STRING or SUCCESS
  simdjson_really_inline error_code finish();

private:
  // Intended to be defined by the implementation
  simdjson_really_inline uint64_t find_escaped(uint64_t escape);
  // Generic branch-free implementation of the escaped-character search.
  simdjson_really_inline uint64_t find_escaped_branchless(uint64_t escape);

  // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
  uint64_t prev_in_string = 0ULL;
  // Whether the first character of the next iteration is escaped.
  uint64_t prev_escaped = 0ULL;
};
10474 
10475 //
10476 // Finds escaped characters (characters following \).
10477 //
10478 // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
10479 //
10480 // Does this by:
10481 // - Shift the escape mask to get potentially escaped characters (characters after backslashes).
10482 // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
10483 // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
10484 //
10485 // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
10486 // escape sequences, filters out the ones that start on even bits, and adds that to the mask of
10487 // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
10488 // the start bit causes a carry), and leaves even-bit sequences alone.
10489 //
10490 // Example:
10491 //
10492 // text           |  \\\ | \\\"\\\" \\\" \\"\\" |
10493 // escape         |  xxx |  xx xxx  xxx  xx xx  | Removed overflow backslash; will | it into follows_escape
10494 // odd_starts     |  x   |  x       x       x   | escape & ~even_bits & ~follows_escape
10495 // even_seq       |     c|    cxxx     c xx   c | c = carry bit -- will be masked out later
10496 // invert_mask    |      |     cxxx     c xx   c| even_seq << 1
10497 // follows_escape |   xx | x xx xxx  xxx  xx xx | Includes overflow bit
10498 // escaped        |   x  | x x  x x  x x  x  x  |
10499 // desired        |   x  | x x  x x  x x  x  x  |
10500 // text           |  \\\ | \\\"\\\" \\\" \\"\\" |
10501 //
find_escaped_branchless(uint64_t backslash)10502 simdjson_really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) {
10503   // If there was overflow, pretend the first character isn't a backslash
10504   backslash &= ~prev_escaped;
10505   uint64_t follows_escape = backslash << 1 | prev_escaped;
10506 
10507   // Get sequences starting on even bits by clearing out the odd series using +
10508   const uint64_t even_bits = 0x5555555555555555ULL;
10509   uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
10510   uint64_t sequences_starting_on_even_bits;
10511   prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
10512   uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
10513 
10514   // Mask every other backslashed character as an escaped character
10515   // Flip the mask for sequences that start on even bits, to correct them
10516   return (even_bits ^ invert_mask) & follows_escape;
10517 }
10518 
10519 //
10520 // Return a mask of all string characters plus end quotes.
10521 //
10522 // prev_escaped is overflow saying whether the next character is escaped.
10523 // prev_in_string is overflow saying whether we're still in a string.
10524 //
10525 // Backslash sequences outside of quotes will be detected in stage 2.
10526 //
next(const simd::simd8x64<uint8_t> & in)10527 simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
10528   const uint64_t backslash = in.eq('\\');
10529   const uint64_t escaped = find_escaped(backslash);
10530   const uint64_t quote = in.eq('"') & ~escaped;
10531 
10532   //
10533   // prefix_xor flips on bits inside the string (and flips off the end quote).
10534   //
10535   // Then we xor with prev_in_string: if we were in a string already, its effect is flipped
10536   // (characters inside strings are outside, and characters outside strings are inside).
10537   //
10538   const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
10539 
10540   //
10541   // Check if we're still in a string at the end of the box so the next block will know
10542   //
10543   // right shift of a signed value expected to be well-defined and standard
10544   // compliant as of C++20, John Regher from Utah U. says this is fine code
10545   //
10546   prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
10547 
10548   // Use ^ to turn the beginning quote off, and the end quote on.
10549 
10550   // We are returning a function-local object so either we get a move constructor
10551   // or we get copy elision.
10552   return json_string_block(
10553     backslash,
10554     escaped,
10555     quote,
10556     in_string
10557   );
10558 }
10559 
finish()10560 simdjson_really_inline error_code json_string_scanner::finish() {
10561   if (prev_in_string) {
10562     return UNCLOSED_STRING;
10563   }
10564   return SUCCESS;
10565 }
10566 
10567 } // namespace stage1
10568 } // unnamed namespace
10569 } // namespace westmere
10570 } // namespace simdjson
10571 /* end file src/generic/stage1/json_string_scanner.h */
10572 /* begin file src/generic/stage1/json_scanner.h */
10573 namespace simdjson {
10574 namespace westmere {
10575 namespace {
10576 namespace stage1 {
10577 
10578 /**
10579  * A block of scanned json, with information on operators and scalars.
10580  *
10581  * We seek to identify pseudo-structural characters. Anything that is inside
10582  * a string must be omitted (hence  & ~_string.string_tail()).
10583  * Otherwise, pseudo-structural characters come in two forms.
10584  * 1. We have the structural characters ([,],{,},:, comma). The
10585  *    term 'structural character' is from the JSON RFC.
10586  * 2. We have the 'scalar pseudo-structural characters'.
10587  *    Scalars are quotes, and any character except structural characters and white space.
10588  *
10589  * To identify the scalar pseudo-structural characters, we must look at what comes
10590  * before them: it must be a space, a quote or a structural characters.
10591  * Starting with simdjson v0.3, we identify them by
10592  * negation: we identify everything that is followed by a non-quote scalar,
10593  * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
10594  */
10595 struct json_block {
10596 public:
10597   // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_blocksimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10598   simdjson_really_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
10599   _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
json_blocksimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10600   simdjson_really_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
10601   _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
10602 
10603   /**
10604    * The start of structurals.
10605    * In simdjson prior to v0.3, these were called the pseudo-structural characters.
10606    **/
structural_startsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10607   simdjson_really_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
10608   /** All JSON whitespace (i.e. not in a string) */
whitespacesimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10609   simdjson_really_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
10610 
10611   // Helpers
10612 
10613   /** Whether the given characters are inside a string (only works on non-quotes) */
non_quote_inside_stringsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10614   simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
10615   /** Whether the given characters are outside a string (only works on non-quotes) */
non_quote_outside_stringsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10616   simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
10617 
10618   // string and escape characters
10619   json_string_block _string;
10620   // whitespace, structural characters ('operators'), scalars
10621   json_character_block _characters;
10622   // whether the previous character was a scalar
10623   uint64_t _follows_potential_nonquote_scalar;
10624 private:
10625   // Potential structurals (i.e. disregarding strings)
10626 
10627   /**
10628    * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
10629    * They may reside inside a string.
10630    **/
potential_structural_startsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10631   simdjson_really_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
10632   /**
10633    * The start of non-operator runs, like 123, true and "abc".
10634    * It main reside inside a string.
10635    **/
potential_scalar_startsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10636   simdjson_really_inline uint64_t potential_scalar_start() const noexcept {
10637     // The term "scalar" refers to anything except structural characters and white space
10638     // (so letters, numbers, quotes).
10639     // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
10640     // then we know that it is irrelevant structurally.
10641     return _characters.scalar() & ~follows_potential_scalar();
10642   }
10643   /**
10644    * Whether the given character is immediately after a non-operator like 123, true.
10645    * The characters following a quote are not included.
10646    */
follows_potential_scalarsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10647   simdjson_really_inline uint64_t follows_potential_scalar() const noexcept {
10648     // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
10649     // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
10650     // white space.
10651     // It is understood that within quoted region, anything at all could be marked (irrelevant).
10652     return _follows_potential_nonquote_scalar;
10653   }
10654 };
10655 
/**
 * Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
 *
 * The scanner starts by calculating two distinct things:
 * - string characters (taking \" into account)
 * - structural characters or 'operators' ([]{},:, comma)
 *   and scalars (runs of non-operators like 123, true and "abc")
 *
 * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
 * in particular, the operator/scalar bit will find plenty of things that are actually part of
 * strings. When we're done, json_block will fuse the two together by masking out tokens that are
 * part of a string.
 *
 * The scanner carries state from one block to the next, so next() must be called on consecutive
 * blocks in input order.
 */
class json_scanner {
public:
  json_scanner() {}
  // Scans one block and returns its string, character-class and "follows a scalar" masks.
  simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
  // Returns either UNCLOSED_STRING or SUCCESS
  simdjson_really_inline error_code finish();

private:
  // Whether the last character of the previous iteration is part of a scalar token
  // (anything except whitespace or a structural character/'operator').
  uint64_t prev_scalar = 0ULL;
  // Tracks string and escape state across blocks.
  json_string_scanner string_scanner{};
};
10682 
10683 
//
// Check if the current character immediately follows a matching character.
//
// Returns `match` shifted up by one position, with the lowest bit taken from `overflow` (the
// carry from the previous block). `overflow` is then updated with the top bit of `match`, ready
// for the next block.
//
// For example, this checks for quotes with backslashes in front of them:
//
//     const uint64_t backslashed_quote = in.eq('"') & follows(in.eq('\'), prev_backslash);
//
simdjson_really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
  const uint64_t carry_in = overflow;
  overflow = match >> 63;
  return (match << 1) | carry_in;
}
10696 
next(const simd::simd8x64<uint8_t> & in)10697 simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
10698   json_string_block strings = string_scanner.next(in);
10699   // identifies the white-space and the structurat characters
10700   json_character_block characters = json_character_block::classify(in);
10701   // The term "scalar" refers to anything except structural characters and white space
10702   // (so letters, numbers, quotes).
10703   // We want  follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
10704   //
10705   // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
10706   // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
10707   // pseudo-structural character just like we would if we had  ' "a string" true '; otherwise we
10708   // may need to add an extra check when parsing strings.
10709   //
10710   // Performance: there are many ways to skin this cat.
10711   const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote();
10712   uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar);
10713   // We are returning a function-local object so either we get a move constructor
10714   // or we get copy elision.
10715   return json_block(
10716     strings,// strings is a function-local object so either it moves or the copy is elided.
10717     characters,
10718     follows_nonquote_scalar
10719   );
10720 }
10721 
finish()10722 simdjson_really_inline error_code json_scanner::finish() {
10723   return string_scanner.finish();
10724 }
10725 
10726 } // namespace stage1
10727 } // unnamed namespace
10728 } // namespace westmere
10729 } // namespace simdjson
10730 /* end file src/generic/stage1/json_scanner.h */
10731 /* begin file src/generic/stage1/json_minifier.h */
10732 // This file contains the common code every implementation uses in stage1
10733 // It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
10736 
10737 namespace simdjson {
10738 namespace westmere {
10739 namespace {
10740 namespace stage1 {
10741 
// Removes JSON whitespace that lies outside of strings, block by block.
class json_minifier {
public:
  // Minifies the len bytes at buf into dst and stores the number of bytes produced in dst_len.
  // Returns the error reported by the scanner (dst_len is then set to 0), or SUCCESS.
  template<size_t STEP_SIZE>
  static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;

private:
  // Starts writing at _dst.
  simdjson_really_inline json_minifier(uint8_t *_dst)
  : dst{_dst}
  {}
  // Scans and minifies one STEP_SIZE block, then advances the reader.
  template<size_t STEP_SIZE>
  simdjson_really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
  // Writes the non-whitespace bytes of one 64-byte input to dst and moves dst past them.
  simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
  // Finishes the scan and computes dst_len relative to dst_start.
  simdjson_really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
  // Carries string/scalar state across blocks.
  json_scanner scanner{};
  // Current write position in the output.
  uint8_t *dst;
};
10758 
next(const simd::simd8x64<uint8_t> & in,const json_block & block)10759 simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
10760   uint64_t mask = block.whitespace();
10761   in.compress(mask, dst);
10762   dst += 64 - count_ones(mask);
10763 }
10764 
finish(uint8_t * dst_start,size_t & dst_len)10765 simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
10766   error_code error = scanner.finish();
10767   if (error) { dst_len = 0; return error; }
10768   dst_len = dst - dst_start;
10769   return SUCCESS;
10770 }
10771 
10772 template<>
step(const uint8_t * block_buf,buf_block_reader<128> & reader)10773 simdjson_really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
10774   simd::simd8x64<uint8_t> in_1(block_buf);
10775   simd::simd8x64<uint8_t> in_2(block_buf+64);
10776   json_block block_1 = scanner.next(in_1);
10777   json_block block_2 = scanner.next(in_2);
10778   this->next(in_1, block_1);
10779   this->next(in_2, block_2);
10780   reader.advance();
10781 }
10782 
10783 template<>
step(const uint8_t * block_buf,buf_block_reader<64> & reader)10784 simdjson_really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
10785   simd::simd8x64<uint8_t> in_1(block_buf);
10786   json_block block_1 = scanner.next(in_1);
10787   this->next(block_buf, block_1);
10788   reader.advance();
10789 }
10790 
10791 template<size_t STEP_SIZE>
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len)10792 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
10793   buf_block_reader<STEP_SIZE> reader(buf, len);
10794   json_minifier minifier(dst);
10795 
10796   // Index the first n-1 blocks
10797   while (reader.has_full_block()) {
10798     minifier.step<STEP_SIZE>(reader.full_block(), reader);
10799   }
10800 
10801   // Index the last (remainder) block, padded with spaces
10802   uint8_t block[STEP_SIZE];
10803   size_t remaining_bytes = reader.get_remainder(block);
10804   if (remaining_bytes > 0) {
10805     // We do not want to write directly to the output stream. Rather, we write
10806     // to a local buffer (for safety).
10807     uint8_t out_block[STEP_SIZE];
10808     uint8_t * const guarded_dst{minifier.dst};
10809     minifier.dst = out_block;
10810     minifier.step<STEP_SIZE>(block, reader);
10811     size_t to_write = minifier.dst - out_block;
10812     // In some cases, we could be enticed to consider the padded spaces
10813     // as part of the string. This is fine as long as we do not write more
10814     // than we consumed.
10815     if(to_write > remaining_bytes) { to_write = remaining_bytes; }
10816     memcpy(guarded_dst, out_block, to_write);
10817     minifier.dst = guarded_dst + to_write;
10818   }
10819   return minifier.finish(dst, dst_len);
10820 }
10821 
10822 } // namespace stage1
10823 } // unnamed namespace
10824 } // namespace westmere
10825 } // namespace simdjson
10826 /* end file src/generic/stage1/json_minifier.h */
10827 /* begin file src/generic/stage1/find_next_document_index.h */
10828 namespace simdjson {
10829 namespace westmere {
10830 namespace {
10831 
10832 /**
10833   * This algorithm is used to quickly identify the last structural position that
10834   * makes up a complete document.
10835   *
10836   * It does this by going backwards and finding the last *document boundary* (a
10837   * place where one value follows another without a comma between them). If the
10838   * last document (the characters after the boundary) has an equal number of
10839   * start and end brackets, it is considered complete.
10840   *
10841   * Simply put, we iterate over the structural characters, starting from
10842   * the end. We consider that we found the end of a JSON document when the
10843   * first element of the pair is NOT one of these characters: '{' '[' ';' ','
10844   * and when the second element is NOT one of these characters: '}' '}' ';' ','.
10845   *
10846   * This simple comparison works most of the time, but it does not cover cases
10847   * where the batch's structural indexes contain a perfect amount of documents.
10848   * In such a case, we do not have access to the structural index which follows
10849   * the last document, therefore, we do not have access to the second element in
10850   * the pair, and that means we cannot identify the last document. To fix this
10851   * issue, we keep a count of the open and closed curly/square braces we found
10852   * while searching for the pair. When we find a pair AND the count of open and
10853   * closed curly/square braces is the same, we know that we just passed a
10854   * complete document, therefore the last json buffer location is the end of the
10855   * batch.
10856   */
find_next_document_index(dom_parser_implementation & parser)10857 simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
10858   // TODO don't count separately, just figure out depth
10859   auto arr_cnt = 0;
10860   auto obj_cnt = 0;
10861   for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
10862     auto idxb = parser.structural_indexes[i];
10863     switch (parser.buf[idxb]) {
10864     case ':':
10865     case ',':
10866       continue;
10867     case '}':
10868       obj_cnt--;
10869       continue;
10870     case ']':
10871       arr_cnt--;
10872       continue;
10873     case '{':
10874       obj_cnt++;
10875       break;
10876     case '[':
10877       arr_cnt++;
10878       break;
10879     }
10880     auto idxa = parser.structural_indexes[i - 1];
10881     switch (parser.buf[idxa]) {
10882     case '{':
10883     case '[':
10884     case ':':
10885     case ',':
10886       continue;
10887     }
10888     // Last document is complete, so the next document will appear after!
10889     if (!arr_cnt && !obj_cnt) {
10890       return parser.n_structural_indexes;
10891     }
10892     // Last document is incomplete; mark the document at i + 1 as the next one
10893     return i;
10894   }
10895   return 0;
10896 }
10897 
10898 } // unnamed namespace
10899 } // namespace westmere
10900 } // namespace simdjson
10901 /* end file src/generic/stage1/find_next_document_index.h */
10902 
10903 namespace simdjson {
10904 namespace westmere {
10905 namespace {
10906 namespace stage1 {
10907 
// Converts bitmasks into a list of positions: for every set bit in a 64-bit mask, writes
// `idx + bit position` to the output array and moves `tail` past the entries written.
class bit_indexer {
public:
  // Next free slot in the output index array.
  uint32_t *tail;

  // Starts writing at index_buf.
  simdjson_really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}

  // flatten out values in 'bits' assuming that they are to have values of idx
  // plus their position in the bitvector, and store these indexes at
  // base_ptr[base] incrementing base as we go
  // will potentially store extra values beyond end of valid bits, so base_ptr
  // needs to be large enough to handle this
  //
  // Entries are written in fixed batches (8, then 8 more, then one at a time), so up to 8 slots
  // past `tail` are written even when fewer bits are set; only `cnt` of them are kept, since
  // tail advances by cnt.
  simdjson_really_inline void write(uint32_t idx, uint64_t bits) {
    // In some instances, the next branch is expensive because it is mispredicted.
    // Unfortunately, in other cases,
    // it helps tremendously.
    if (bits == 0)
        return;
    // Number of set bits, i.e. number of valid entries to keep.
    int cnt = static_cast<int>(count_ones(bits));

    // Do the first 8 all together
    for (int i=0; i<8; i++) {
      this->tail[i] = idx + trailing_zeroes(bits);
      bits = clear_lowest_bit(bits);
    }

    // Do the next 8 all together (we hope in most cases it won't happen at all
    // and the branch is easily predicted).
    if (simdjson_unlikely(cnt > 8)) {
      for (int i=8; i<16; i++) {
        this->tail[i] = idx + trailing_zeroes(bits);
        bits = clear_lowest_bit(bits);
      }

      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
      // or the start of a value ("abc" true 123) every four characters.
      if (simdjson_unlikely(cnt > 16)) {
        int i = 16;
        do {
          this->tail[i] = idx + trailing_zeroes(bits);
          bits = clear_lowest_bit(bits);
          i++;
        } while (i < cnt);
      }
    }

    // Keep only the cnt valid entries; anything written beyond them is overwritten later.
    this->tail += cnt;
  }
};
10957 
// Stage 1 driver: scans the input block by block and records the positions of the structural
// characters through a bit_indexer.
class json_structural_indexer {
public:
  /**
   * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
   *
   * @param partial Setting the partial parameter to true allows the find_structural_bits to
   *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
   *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
   */
  template<size_t STEP_SIZE>
  static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;

private:
  // Starts writing structural indexes at structural_indexes.
  simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes);
  template<size_t STEP_SIZE>
  simdjson_really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
  simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
  simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);

  // Carries string/scalar state across blocks.
  json_scanner scanner{};
  // NOTE(review): presumably validates UTF-8 as blocks go by; its use is not visible in this declaration.
  utf8_checker checker{};
  // Writes structural positions to the output array.
  bit_indexer indexer;
  // NOTE(review): presumably the structural mask of the previous block, kept so it can be written
  // out one iteration later -- confirm against next().
  uint64_t prev_structurals = 0;
  // NOTE(review): presumably accumulates a non-zero value when an unescaped control character is
  // found inside a string -- confirm against next()/finish().
  uint64_t unescaped_chars_error = 0;
};
10983 
json_structural_indexer(uint32_t * structural_indexes)10984 simdjson_really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
10985 
// Skip the last character if it is partial.
//
// Looks at up to the last three bytes of buf[0..len) and returns the length with a truncated
// trailing multi-byte UTF-8 sequence removed:
// - last byte is a lead byte (>= 0xC0): the character has only 1 byte present;
// - second-to-last byte leads a 3- or 4-byte character (>= 0xE0): only 2 bytes present;
// - third-to-last byte leads a 4-byte character (>= 0xF0): only 3 bytes present.
// Otherwise len is returned unchanged. Inputs shorter than 3 bytes only apply the checks that
// stay within the buffer.
simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  const uint8_t lead_of_2_or_more = 0xC0; // 0b11000000
  const uint8_t lead_of_3_or_more = 0xE0; // 0b11100000
  const uint8_t lead_of_4 = 0xF0;         // 0b11110000

  if (simdjson_unlikely(len < 3)) {
    if (len == 0) { return len; }
    // len is 1 or 2 here.
    if (buf[len-1] >= lead_of_2_or_more) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
    if (len == 2 && buf[len-2] >= lead_of_3_or_more) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
    return len;
  }

  if (buf[len-1] >= lead_of_2_or_more) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
  if (buf[len-2] >= lead_of_3_or_more) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
  if (buf[len-3] >= lead_of_4) { return len-3; }         // 4-byte characters with only 3 bytes left
  return len;
}
11006 
11007 //
11008 // PERF NOTES:
11009 // We pipe 2 inputs through these stages:
11010 // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
11012 // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
11013 //    The output of step 1 depends entirely on this information. These functions don't quite use
11014 //    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
11015 //    at a time. The second input's scans has some dependency on the first ones finishing it, but
11016 //    they can make a lot of progress before they need that information.
11017 // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
11018 //    to finish: utf-8 checks and generating the output from the last iteration.
11019 //
11020 // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
11021 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
11022 // workout.
11023 //
11024 template<size_t STEP_SIZE>
index(const uint8_t * buf,size_t len,dom_parser_implementation & parser,bool partial)11025 error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
11026   if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
11027   if (partial) { len = trim_partial_utf8(buf, len); }
11028 
11029   buf_block_reader<STEP_SIZE> reader(buf, len);
11030   json_structural_indexer indexer(parser.structural_indexes.get());
11031 
11032   // Read all but the last block
11033   while (reader.has_full_block()) {
11034     indexer.step<STEP_SIZE>(reader.full_block(), reader);
11035   }
11036 
11037   // Take care of the last block (will always be there unless file is empty)
11038   uint8_t block[STEP_SIZE];
11039   if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
11040   indexer.step<STEP_SIZE>(block, reader);
11041 
11042   return indexer.finish(parser, reader.block_index(), len, partial);
11043 }
11044 
11045 template<>
step(const uint8_t * block,buf_block_reader<128> & reader)11046 simdjson_really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
11047   simd::simd8x64<uint8_t> in_1(block);
11048   simd::simd8x64<uint8_t> in_2(block+64);
11049   json_block block_1 = scanner.next(in_1);
11050   json_block block_2 = scanner.next(in_2);
11051   this->next(in_1, block_1, reader.block_index());
11052   this->next(in_2, block_2, reader.block_index()+64);
11053   reader.advance();
11054 }
11055 
11056 template<>
step(const uint8_t * block,buf_block_reader<64> & reader)11057 simdjson_really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
11058   simd::simd8x64<uint8_t> in_1(block);
11059   json_block block_1 = scanner.next(in_1);
11060   this->next(in_1, block_1, reader.block_index());
11061   reader.advance();
11062 }
11063 
next(const simd::simd8x64<uint8_t> & in,const json_block & block,size_t idx)11064 simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
11065   uint64_t unescaped = in.lteq(0x1F);
11066   checker.check_next_input(in);
11067   indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
11068   prev_structurals = block.structural_start();
11069   unescaped_chars_error |= block.non_quote_inside_string(unescaped);
11070 }
11071 
finish(dom_parser_implementation & parser,size_t idx,size_t len,bool partial)11072 simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
11073   // Write out the final iteration's structurals
11074   indexer.write(uint32_t(idx-64), prev_structurals);
11075 
11076   error_code error = scanner.finish();
11077   // We deliberately break down the next expression so that it is
11078   // human readable.
11079   const bool should_we_exit =  partial ?
11080     ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
11081     : (error != SUCCESS); // if partial is false, we must have SUCCESS
11082   const bool have_unclosed_string = (error == UNCLOSED_STRING);
11083   if (simdjson_unlikely(should_we_exit)) { return error; }
11084 
11085   if (unescaped_chars_error) {
11086     return UNESCAPED_CHARS;
11087   }
11088 
11089   parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
11090   /***
11091    * This is related to https://github.com/simdjson/simdjson/issues/906
11092    * Basically, we want to make sure that if the parsing continues beyond the last (valid)
11093    * structural character, it quickly stops.
11094    * Only three structural characters can be repeated without triggering an error in JSON:  [,] and }.
11095    * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
11096    * continues, then it must be [,] or }.
11097    * Suppose it is ] or }. We backtrack to the first character, what could it be that would
11098    * not trigger an error? It could be ] or } but no, because you can't start a document that way.
11099    * It can't be a comma, a colon or any simple value. So the only way we could continue is
11100    * if the repeated character is [. But if so, the document must start with [. But if the document
11101    * starts with [, it should end with ]. If we enforce that rule, then we would get
11102    * ][[ which is invalid.
11103    **/
11104   parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
11105   parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
11106   parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
11107   parser.next_structural_index = 0;
11108   // a valid JSON file cannot have zero structural indexes - we should have found something
11109   if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
11110     return EMPTY;
11111   }
11112   if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
11113     return UNEXPECTED_ERROR;
11114   }
11115   if (partial) {
11116     // If we have an unclosed string, then the last structural
11117     // will be the quote and we want to make sure to omit it.
11118     if(have_unclosed_string) {
11119       parser.n_structural_indexes--;
11120       // a valid JSON file cannot have zero structural indexes - we should have found something
11121       if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
11122     }
11123     auto new_structural_indexes = find_next_document_index(parser);
11124     if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
11125       return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
11126     }
11127     parser.n_structural_indexes = new_structural_indexes;
11128   }
11129   checker.check_eof();
11130   return checker.errors();
11131 }
11132 
11133 } // namespace stage1
11134 } // unnamed namespace
11135 } // namespace westmere
11136 } // namespace simdjson
11137 /* end file src/generic/stage1/json_structural_indexer.h */
11138 /* begin file src/generic/stage1/utf8_validator.h */
11139 namespace simdjson {
11140 namespace westmere {
11141 namespace {
11142 namespace stage1 {
11143 
11144 /**
11145  * Validates that the string is actual UTF-8.
11146  */
11147 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)11148 bool generic_validate_utf8(const uint8_t * input, size_t length) {
11149     checker c{};
11150     buf_block_reader<64> reader(input, length);
11151     while (reader.has_full_block()) {
11152       simd::simd8x64<uint8_t> in(reader.full_block());
11153       c.check_next_input(in);
11154       reader.advance();
11155     }
11156     uint8_t block[64]{};
11157     reader.get_remainder(block);
11158     simd::simd8x64<uint8_t> in(block);
11159     c.check_next_input(in);
11160     reader.advance();
11161     c.check_eof();
11162     return c.errors() == error_code::SUCCESS;
11163 }
11164 
generic_validate_utf8(const char * input,size_t length)11165 bool generic_validate_utf8(const char * input, size_t length) {
11166     return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
11167 }
11168 
11169 } // namespace stage1
11170 } // unnamed namespace
11171 } // namespace westmere
11172 } // namespace simdjson
11173 /* end file src/generic/stage1/utf8_validator.h */
11174 
11175 //
11176 // Stage 2
11177 //
11178 /* begin file src/generic/stage2/tape_builder.h */
11179 /* begin file src/generic/stage2/json_iterator.h */
11180 /* begin file src/generic/stage2/logger.h */
11181 // This is for an internal-only stage 2 specific logger.
11182 // Set LOG_ENABLED = true to log what stage 2 is doing!
11183 namespace simdjson {
11184 namespace westmere {
11185 namespace {
11186 namespace logger {
11187 
11188   static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
11189 
11190 #if SIMDJSON_VERBOSE_LOGGING
11191   static constexpr const bool LOG_ENABLED = true;
11192 #else
11193   static constexpr const bool LOG_ENABLED = false;
11194 #endif
11195   static constexpr const int LOG_EVENT_LEN = 20;
11196   static constexpr const int LOG_BUFFER_LEN = 30;
11197   static constexpr const int LOG_SMALL_BUFFER_LEN = 10;
11198   static constexpr const int LOG_INDEX_LEN = 5;
11199 
11200   static int log_depth; // Not threadsafe. Log only.
11201 
11202   // Helper to turn unprintable or newline characters into spaces
printable_char(char c)11203   static simdjson_really_inline char printable_char(char c) {
11204     if (c >= 0x20) {
11205       return c;
11206     } else {
11207       return ' ';
11208     }
11209   }
11210 
11211   // Print the header and set up log_start
log_start()11212   static simdjson_really_inline void log_start() {
11213     if (LOG_ENABLED) {
11214       log_depth = 0;
11215       printf("\n");
11216       printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
11217       printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
11218     }
11219   }
11220 
log_string(const char * message)11221   simdjson_unused static simdjson_really_inline void log_string(const char *message) {
11222     if (LOG_ENABLED) {
11223       printf("%s\n", message);
11224     }
11225   }
11226 
11227   // Logs a single line from the stage 2 DOM parser
11228   template<typename S>
log_line(S & structurals,const char * title_prefix,const char * title,const char * detail)11229   static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
11230     if (LOG_ENABLED) {
11231       printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
11232       auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
11233       auto next_index = structurals.next_structural;
11234       auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>("                                                       ");
11235       auto next = &structurals.buf[*next_index];
11236       {
11237         // Print the next N characters in the buffer.
11238         printf("| ");
11239         // Otherwise, print the characters starting from the buffer position.
11240         // Print spaces for unprintable or newline characters.
11241         for (int i=0;i<LOG_BUFFER_LEN;i++) {
11242           printf("%c", printable_char(current[i]));
11243         }
11244         printf(" ");
11245         // Print the next N characters in the buffer.
11246         printf("| ");
11247         // Otherwise, print the characters starting from the buffer position.
11248         // Print spaces for unprintable or newline characters.
11249         for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
11250           printf("%c", printable_char(next[i]));
11251         }
11252         printf(" ");
11253       }
11254       if (current_index) {
11255         printf("| %*u ", LOG_INDEX_LEN, *current_index);
11256       } else {
11257         printf("| %-*s ", LOG_INDEX_LEN, "");
11258       }
11259       // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
11260       printf("| %-s ", detail);
11261       printf("|\n");
11262     }
11263   }
11264 
11265 } // namespace logger
11266 } // unnamed namespace
11267 } // namespace westmere
11268 } // namespace simdjson
11269 /* end file src/generic/stage2/logger.h */
11270 
11271 namespace simdjson {
11272 namespace westmere {
11273 namespace {
11274 namespace stage2 {
11275 
11276 class json_iterator {
11277 public:
11278   const uint8_t* const buf;
11279   uint32_t *next_structural;
11280   dom_parser_implementation &dom_parser;
11281   uint32_t depth{0};
11282 
11283   /**
11284    * Walk the JSON document.
11285    *
11286    * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
11287    * the first parameter; some callbacks have other parameters as well:
11288    *
11289    * - visit_document_start() - at the beginning.
11290    * - visit_document_end() - at the end (if things were successful).
11291    *
11292    * - visit_array_start() - at the start `[` of a non-empty array.
11293    * - visit_array_end() - at the end `]` of a non-empty array.
11294    * - visit_empty_array() - when an empty array is encountered.
11295    *
11296    * - visit_object_end() - at the start `]` of a non-empty object.
11297    * - visit_object_start() - at the end `]` of a non-empty object.
11298    * - visit_empty_object() - when an empty object is encountered.
11299    * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
11300    *                                   guaranteed to point at the first quote of the string (`"key"`).
11301    * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
11302    * - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
11303    *
11304    * - increment_count(iter) - each time a value is found in an array or object.
11305    */
11306   template<bool STREAMING, typename V>
11307   simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;
11308 
11309   /**
11310    * Create an iterator capable of walking a JSON document.
11311    *
11312    * The document must have already passed through stage 1.
11313    */
11314   simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
11315 
11316   /**
11317    * Look at the next token.
11318    *
11319    * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
11320    *
11321    * They may include invalid JSON as well (such as `1.2.3` or `ture`).
11322    */
11323   simdjson_really_inline const uint8_t *peek() const noexcept;
11324   /**
11325    * Advance to the next token.
11326    *
11327    * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
11328    *
11329    * They may include invalid JSON as well (such as `1.2.3` or `ture`).
11330    */
11331   simdjson_really_inline const uint8_t *advance() noexcept;
11332   /**
11333    * Get the remaining length of the document, from the start of the current token.
11334    */
11335   simdjson_really_inline size_t remaining_len() const noexcept;
11336   /**
11337    * Check if we are at the end of the document.
11338    *
11339    * If this is true, there are no more tokens.
11340    */
11341   simdjson_really_inline bool at_eof() const noexcept;
11342   /**
11343    * Check if we are at the beginning of the document.
11344    */
11345   simdjson_really_inline bool at_beginning() const noexcept;
11346   simdjson_really_inline uint8_t last_structural() const noexcept;
11347 
11348   /**
11349    * Log that a value has been found.
11350    *
11351    * Set ENABLE_LOGGING=true in logger.h to see logging.
11352    */
11353   simdjson_really_inline void log_value(const char *type) const noexcept;
11354   /**
11355    * Log the start of a multipart value.
11356    *
11357    * Set ENABLE_LOGGING=true in logger.h to see logging.
11358    */
11359   simdjson_really_inline void log_start_value(const char *type) const noexcept;
11360   /**
11361    * Log the end of a multipart value.
11362    *
11363    * Set ENABLE_LOGGING=true in logger.h to see logging.
11364    */
11365   simdjson_really_inline void log_end_value(const char *type) const noexcept;
11366   /**
11367    * Log an error.
11368    *
11369    * Set ENABLE_LOGGING=true in logger.h to see logging.
11370    */
11371   simdjson_really_inline void log_error(const char *error) const noexcept;
11372 
11373   template<typename V>
11374   simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
11375   template<typename V>
11376   simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
11377 };
11378 
11379 template<bool STREAMING, typename V>
walk_document(V & visitor)11380 simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
11381   logger::log_start();
11382 
11383   //
11384   // Start the document
11385   //
11386   if (at_eof()) { return EMPTY; }
11387   log_start_value("document");
11388   SIMDJSON_TRY( visitor.visit_document_start(*this) );
11389 
11390   //
11391   // Read first value
11392   //
11393   {
11394     auto value = advance();
11395 
11396     // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
11397     // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
11398     if (!STREAMING) {
11399       switch (*value) {
11400         case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
11401         case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
11402       }
11403     }
11404 
11405     switch (*value) {
11406       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
11407       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
11408       default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
11409     }
11410   }
11411   goto document_end;
11412 
11413 //
11414 // Object parser states
11415 //
11416 object_begin:
11417   log_start_value("object");
11418   depth++;
11419   if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
11420   dom_parser.is_array[depth] = false;
11421   SIMDJSON_TRY( visitor.visit_object_start(*this) );
11422 
11423   {
11424     auto key = advance();
11425     if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
11426     SIMDJSON_TRY( visitor.increment_count(*this) );
11427     SIMDJSON_TRY( visitor.visit_key(*this, key) );
11428   }
11429 
11430 object_field:
11431   if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
11432   {
11433     auto value = advance();
11434     switch (*value) {
11435       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
11436       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
11437       default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
11438     }
11439   }
11440 
11441 object_continue:
11442   switch (*advance()) {
11443     case ',':
11444       SIMDJSON_TRY( visitor.increment_count(*this) );
11445       {
11446         auto key = advance();
11447         if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
11448         SIMDJSON_TRY( visitor.visit_key(*this, key) );
11449       }
11450       goto object_field;
11451     case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
11452     default: log_error("No comma between object fields"); return TAPE_ERROR;
11453   }
11454 
11455 scope_end:
11456   depth--;
11457   if (depth == 0) { goto document_end; }
11458   if (dom_parser.is_array[depth]) { goto array_continue; }
11459   goto object_continue;
11460 
11461 //
11462 // Array parser states
11463 //
11464 array_begin:
11465   log_start_value("array");
11466   depth++;
11467   if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
11468   dom_parser.is_array[depth] = true;
11469   SIMDJSON_TRY( visitor.visit_array_start(*this) );
11470   SIMDJSON_TRY( visitor.increment_count(*this) );
11471 
11472 array_value:
11473   {
11474     auto value = advance();
11475     switch (*value) {
11476       case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
11477       case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
11478       default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
11479     }
11480   }
11481 
11482 array_continue:
11483   switch (*advance()) {
11484     case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
11485     case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
11486     default: log_error("Missing comma between array values"); return TAPE_ERROR;
11487   }
11488 
11489 document_end:
11490   log_end_value("document");
11491   SIMDJSON_TRY( visitor.visit_document_end(*this) );
11492 
11493   dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
11494 
11495   // If we didn't make it to the end, it's an error
11496   if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
11497     log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
11498     return TAPE_ERROR;
11499   }
11500 
11501   return SUCCESS;
11502 
11503 } // walk_document()
11504 
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)11505 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
11506   : buf{_dom_parser.buf},
11507     next_structural{&_dom_parser.structural_indexes[start_structural_index]},
11508     dom_parser{_dom_parser} {
11509 }
11510 
peek() const11511 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
11512   return &buf[*(next_structural)];
11513 }
advance()11514 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
11515   return &buf[*(next_structural++)];
11516 }
remaining_len() const11517 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
11518   return dom_parser.len - *(next_structural-1);
11519 }
11520 
at_eof() const11521 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
11522   return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
11523 }
at_beginning() const11524 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
11525   return next_structural == dom_parser.structural_indexes.get();
11526 }
last_structural() const11527 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
11528   return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
11529 }
11530 
log_value(const char * type) const11531 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
11532   logger::log_line(*this, "", type, "");
11533 }
11534 
log_start_value(const char * type) const11535 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
11536   logger::log_line(*this, "+", type, "");
11537   if (logger::LOG_ENABLED) { logger::log_depth++; }
11538 }
11539 
log_end_value(const char * type) const11540 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
11541   if (logger::LOG_ENABLED) { logger::log_depth--; }
11542   logger::log_line(*this, "-", type, "");
11543 }
11544 
log_error(const char * error) const11545 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
11546   logger::log_line(*this, "", "ERROR", error);
11547 }
11548 
11549 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)11550 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
11551   switch (*value) {
11552     case '"': return visitor.visit_root_string(*this, value);
11553     case 't': return visitor.visit_root_true_atom(*this, value);
11554     case 'f': return visitor.visit_root_false_atom(*this, value);
11555     case 'n': return visitor.visit_root_null_atom(*this, value);
11556     case '-':
11557     case '0': case '1': case '2': case '3': case '4':
11558     case '5': case '6': case '7': case '8': case '9':
11559       return visitor.visit_root_number(*this, value);
11560     default:
11561       log_error("Document starts with a non-value character");
11562       return TAPE_ERROR;
11563   }
11564 }
11565 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)11566 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
11567   switch (*value) {
11568     case '"': return visitor.visit_string(*this, value);
11569     case 't': return visitor.visit_true_atom(*this, value);
11570     case 'f': return visitor.visit_false_atom(*this, value);
11571     case 'n': return visitor.visit_null_atom(*this, value);
11572     case '-':
11573     case '0': case '1': case '2': case '3': case '4':
11574     case '5': case '6': case '7': case '8': case '9':
11575       return visitor.visit_number(*this, value);
11576     default:
11577       log_error("Non-value found when value was expected!");
11578       return TAPE_ERROR;
11579   }
11580 }
11581 
11582 } // namespace stage2
11583 } // unnamed namespace
11584 } // namespace westmere
11585 } // namespace simdjson
11586 /* end file src/generic/stage2/json_iterator.h */
11587 /* begin file src/generic/stage2/tape_writer.h */
11588 namespace simdjson {
11589 namespace westmere {
11590 namespace {
11591 namespace stage2 {
11592 
11593 struct tape_writer {
11594   /** The next place to write to tape */
11595   uint64_t *next_tape_loc;
11596 
11597   /** Write a signed 64-bit value to tape. */
11598   simdjson_really_inline void append_s64(int64_t value) noexcept;
11599 
11600   /** Write an unsigned 64-bit value to tape. */
11601   simdjson_really_inline void append_u64(uint64_t value) noexcept;
11602 
11603   /** Write a double value to tape. */
11604   simdjson_really_inline void append_double(double value) noexcept;
11605 
11606   /**
11607    * Append a tape entry (an 8-bit type,and 56 bits worth of value).
11608    */
11609   simdjson_really_inline void append(uint64_t val, internal::tape_type t) noexcept;
11610 
11611   /**
11612    * Skip the current tape entry without writing.
11613    *
11614    * Used to skip the start of the container, since we'll come back later to fill it in when the
11615    * container ends.
11616    */
11617   simdjson_really_inline void skip() noexcept;
11618 
11619   /**
11620    * Skip the number of tape entries necessary to write a large u64 or i64.
11621    */
11622   simdjson_really_inline void skip_large_integer() noexcept;
11623 
11624   /**
11625    * Skip the number of tape entries necessary to write a double.
11626    */
11627   simdjson_really_inline void skip_double() noexcept;
11628 
11629   /**
11630    * Write a value to a known location on tape.
11631    *
11632    * Used to go back and write out the start of a container after the container ends.
11633    */
11634   simdjson_really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
11635 
11636 private:
11637   /**
11638    * Append both the tape entry, and a supplementary value following it. Used for types that need
11639    * all 64 bits, such as double and uint64_t.
11640    */
11641   template<typename T>
11642   simdjson_really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
11643 }; // struct number_writer
11644 
append_s64(int64_t value)11645 simdjson_really_inline void tape_writer::append_s64(int64_t value) noexcept {
11646   append2(0, value, internal::tape_type::INT64);
11647 }
11648 
append_u64(uint64_t value)11649 simdjson_really_inline void tape_writer::append_u64(uint64_t value) noexcept {
11650   append(0, internal::tape_type::UINT64);
11651   *next_tape_loc = value;
11652   next_tape_loc++;
11653 }
11654 
11655 /** Write a double value to tape. */
append_double(double value)11656 simdjson_really_inline void tape_writer::append_double(double value) noexcept {
11657   append2(0, value, internal::tape_type::DOUBLE);
11658 }
11659 
skip()11660 simdjson_really_inline void tape_writer::skip() noexcept {
11661   next_tape_loc++;
11662 }
11663 
skip_large_integer()11664 simdjson_really_inline void tape_writer::skip_large_integer() noexcept {
11665   next_tape_loc += 2;
11666 }
11667 
skip_double()11668 simdjson_really_inline void tape_writer::skip_double() noexcept {
11669   next_tape_loc += 2;
11670 }
11671 
append(uint64_t val,internal::tape_type t)11672 simdjson_really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
11673   *next_tape_loc = val | ((uint64_t(char(t))) << 56);
11674   next_tape_loc++;
11675 }
11676 
11677 template<typename T>
append2(uint64_t val,T val2,internal::tape_type t)11678 simdjson_really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
11679   append(val, t);
11680   static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
11681   memcpy(next_tape_loc, &val2, sizeof(val2));
11682   next_tape_loc++;
11683 }
11684 
/**
 * Overwrite the tape entry at tape_loc with val tagged by t (the type goes in the most
 * significant byte, bits 56-63, exactly as in append()).
 *
 * This is a static helper: it does not read or move next_tape_loc.
 */
simdjson_really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
  tape_loc = val | ((uint64_t(char(t))) << 56);
}
11688 
11689 } // namespace stage2
11690 } // unnamed namespace
11691 } // namespace westmere
11692 } // namespace simdjson
11693 /* end file src/generic/stage2/tape_writer.h */
11694 
11695 namespace simdjson {
11696 namespace westmere {
11697 namespace {
11698 namespace stage2 {
11699 
/**
 * Visitor used in stage 2 to turn a walk over the JSON document into DOM tape entries and string
 * buffer contents.
 *
 * parse_document() pairs a tape_builder with a json_iterator and passes the builder to
 * json_iterator::walk_document(); the visit_* members are the hooks declared for that walk.
 */
struct tape_builder {
  /**
   * Build the tape for the document held by dom_parser, writing into doc.
   *
   * When STREAMING is true the walk starts at dom_parser.next_structural_index rather than 0.
   */
  template<bool STREAMING>
  simdjson_warn_unused static simdjson_really_inline error_code parse_document(
    dom_parser_implementation &dom_parser,
    dom::document &doc) noexcept;

  /** Called when a non-empty document starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
  /** Called when a non-empty document ends without error. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;

  /** Called when a non-empty array starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
  /** Called when a non-empty array ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
  /** Called when an empty array is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;

  /** Called when a non-empty object starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
  /**
   * Called when a key in a field is encountered.
   *
   * visit_primitive, visit_object_start, visit_empty_object, visit_array_start, or
   * visit_empty_array will be called after this with the field value.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
  /** Called when a non-empty object ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
  /** Called when an empty object is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;

  /**
   * Called when a string, number, boolean or null is found.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
  /**
   * Called when a string, number, boolean or null is found at the top level of a document (i.e.
   * when there is no array or object and the entire document is a single string, number, boolean
   * or null).
   *
   * This is separate from visit_primitive() because simdjson's normal primitive parsing routines
   * assume there is at least one more token after the value, which is only true in an array or
   * object.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;

  /**
   * Handlers for each kind of primitive found inside an array or object. visit_string() is also
   * used for object keys (key = true), which only changes the label passed to the iterator's
   * logger.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /**
   * Counterparts of the handlers above for a primitive that is the whole document. The atom
   * variants additionally pass iter.remaining_len() to the validators, and visit_root_number()
   * parses from a space-padded copy of the input.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /** Called each time a new field or element in an array or object is found. */
  simdjson_warn_unused simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;

  /** Next location to write to tape */
  tape_writer tape;
private:
  /** Next write location in the string buf for stage 2 parsing */
  uint8_t *current_string_buf_loc;

  /** Starts writing at the beginning of doc's tape and at the beginning of doc's string buffer. */
  simdjson_really_inline tape_builder(dom::document &doc) noexcept;

  /** Index, within the document's tape, of the next entry that will be written. */
  simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
  /** Record where a container starts and reserve its (not yet written) start entry. */
  simdjson_really_inline void start_container(json_iterator &iter) noexcept;
  /** Append a container's end entry and fill in the start entry reserved by start_container(). */
  simdjson_warn_unused simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  /** Write the start and end entries of a container that has no elements. */
  simdjson_warn_unused simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  /** Append the STRING tape entry and return where the string's bytes should be written. */
  simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
  /** Store the string's length prefix and null terminator; dst is one past the last string byte. */
  simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
}; // struct tape_builder
11776 
11777 template<bool STREAMING>
parse_document(dom_parser_implementation & dom_parser,dom::document & doc)11778 simdjson_warn_unused simdjson_really_inline error_code tape_builder::parse_document(
11779     dom_parser_implementation &dom_parser,
11780     dom::document &doc) noexcept {
11781   dom_parser.doc = &doc;
11782   json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
11783   tape_builder builder(doc);
11784   return iter.walk_document<STREAMING>(builder);
11785 }
11786 
/**
 * Called for a primitive that makes up the whole document. Forwards to
 * json_iterator::visit_root_primitive() with this builder as the visitor.
 */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  return iter.visit_root_primitive(*this, value);
}
/**
 * Called when a string, number, boolean or null is found. Forwards to
 * json_iterator::visit_primitive() with this builder as the visitor.
 */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  return iter.visit_primitive(*this, value);
}
/** Called when an empty object is found: writes a START_OBJECT / END_OBJECT pair via empty_container(). */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
  return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
}
/** Called when an empty array is found: writes a START_ARRAY / END_ARRAY pair via empty_container(). */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
  return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
}
11799 
/**
 * Called when a non-empty document starts. The document root is opened like any other container:
 * start_container() records the current tape index and reserves the start entry. Always SUCCESS.
 */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
/**
 * Called when a non-empty object starts. start_container() records the current tape index and
 * reserves the start entry, which visit_object_end() fills in later. Always SUCCESS.
 */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
/**
 * Called when a non-empty array starts. start_container() records the current tape index and
 * reserves the start entry, which visit_array_end() fills in later. Always SUCCESS.
 */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
11812 
/**
 * Called when a non-empty object ends: end_container() appends the END_OBJECT entry and writes the
 * START_OBJECT entry that was reserved when the object started.
 */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
  return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
}
/**
 * Called when a non-empty array ends: end_container() appends the END_ARRAY entry and writes the
 * START_ARRAY entry that was reserved when the array started.
 */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
  return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
}
visit_document_end(json_iterator & iter)11819 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
11820   constexpr uint32_t start_tape_index = 0;
11821   tape.append(start_tape_index, internal::tape_type::ROOT);
11822   tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
11823   return SUCCESS;
11824 }
/**
 * Called when a key in a field is encountered. Keys are written exactly like string values:
 * visit_string() is called with key = true, which only changes the label it logs.
 */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
  return visit_string(iter, key, true);
}
11828 
increment_count(json_iterator & iter)11829 simdjson_warn_unused simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
11830   iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
11831   return SUCCESS;
11832 }
11833 
/**
 * Start a builder whose tape writer begins at the start of doc.tape and whose string cursor begins
 * at the start of doc.string_buf.
 */
simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
11835 
visit_string(json_iterator & iter,const uint8_t * value,bool key)11836 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
11837   iter.log_value(key ? "key" : "string");
11838   uint8_t *dst = on_start_string(iter);
11839   dst = stringparsing::parse_string(value+1, dst);
11840   if (dst == nullptr) {
11841     iter.log_error("Invalid escape in string");
11842     return STRING_ERROR;
11843   }
11844   on_end_string(dst);
11845   return SUCCESS;
11846 }
11847 
/** A string that is the whole document gets no special treatment: forwards to visit_string(). */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
  return visit_string(iter, value);
}
11851 
/**
 * Log the value, then hand the bytes at value and the tape writer to
 * numberparsing::parse_number(), returning its result.
 */
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("number");
  return numberparsing::parse_number(value, tape);
}
11856 
visit_root_number(json_iterator & iter,const uint8_t * value)11857 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
11858   //
11859   // We need to make a copy to make sure that the string is space terminated.
11860   // This is not about padding the input, which should already padded up
11861   // to len + SIMDJSON_PADDING. However, we have no control at this stage
11862   // on how the padding was done. What if the input string was padded with nulls?
11863   // It is quite common for an input string to have an extra null character (C string).
11864   // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
11865   // document, but the string "9\0" by itself is fine. So we make a copy and
11866   // pad the input with spaces when we know that there is just one input element.
11867   // This copy is relatively expensive, but it will almost never be called in
11868   // practice unless you are in the strange scenario where you have many JSON
11869   // documents made of single atoms.
11870   //
11871   std::unique_ptr<uint8_t[]>copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
11872   if (copy.get() == nullptr) { return MEMALLOC; }
11873   std::memcpy(copy.get(), value, iter.remaining_len());
11874   std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
11875   error_code error = visit_number(iter, copy.get());
11876   return error;
11877 }
11878 
visit_true_atom(json_iterator & iter,const uint8_t * value)11879 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
11880   iter.log_value("true");
11881   if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
11882   tape.append(0, internal::tape_type::TRUE_VALUE);
11883   return SUCCESS;
11884 }
11885 
visit_root_true_atom(json_iterator & iter,const uint8_t * value)11886 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
11887   iter.log_value("true");
11888   if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
11889   tape.append(0, internal::tape_type::TRUE_VALUE);
11890   return SUCCESS;
11891 }
11892 
visit_false_atom(json_iterator & iter,const uint8_t * value)11893 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
11894   iter.log_value("false");
11895   if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
11896   tape.append(0, internal::tape_type::FALSE_VALUE);
11897   return SUCCESS;
11898 }
11899 
visit_root_false_atom(json_iterator & iter,const uint8_t * value)11900 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
11901   iter.log_value("false");
11902   if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
11903   tape.append(0, internal::tape_type::FALSE_VALUE);
11904   return SUCCESS;
11905 }
11906 
visit_null_atom(json_iterator & iter,const uint8_t * value)11907 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
11908   iter.log_value("null");
11909   if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
11910   tape.append(0, internal::tape_type::NULL_VALUE);
11911   return SUCCESS;
11912 }
11913 
visit_root_null_atom(json_iterator & iter,const uint8_t * value)11914 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
11915   iter.log_value("null");
11916   if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
11917   tape.append(0, internal::tape_type::NULL_VALUE);
11918   return SUCCESS;
11919 }
11920 
11921 // private:
11922 
next_tape_index(json_iterator & iter) const11923 simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
11924   return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
11925 }
11926 
empty_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)11927 simdjson_warn_unused simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
11928   auto start_index = next_tape_index(iter);
11929   tape.append(start_index+2, start);
11930   tape.append(start_index, end);
11931   return SUCCESS;
11932 }
11933 
start_container(json_iterator & iter)11934 simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
11935   iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
11936   iter.dom_parser.open_containers[iter.depth].count = 0;
11937   tape.skip(); // We don't actually *write* the start element until the end.
11938 }
11939 
end_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)11940 simdjson_warn_unused simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
11941   // Write the ending tape element, pointing at the start location
11942   const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
11943   tape.append(start_tape_index, end);
11944   // Write the start tape element, pointing at the end location (and including count)
11945   // count can overflow if it exceeds 24 bits... so we saturate
11946   // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
11947   const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
11948   const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
11949   tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
11950   return SUCCESS;
11951 }
11952 
on_start_string(json_iterator & iter)11953 simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
11954   // we advance the point, accounting for the fact that we have a NULL termination
11955   tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
11956   return current_string_buf_loc + sizeof(uint32_t);
11957 }
11958 
on_end_string(uint8_t * dst)11959 simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
11960   uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
11961   // TODO check for overflow in case someone has a crazy string (>=4GB?)
11962   // But only add the overflow check when the document itself exceeds 4GB
11963   // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
11964   memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
11965   // NULL termination is still handy if you expect all your strings to
11966   // be NULL terminated? It comes at a small cost
11967   *dst = 0;
11968   current_string_buf_loc = dst + 1;
11969 }
11970 
11971 } // namespace stage2
11972 } // unnamed namespace
11973 } // namespace westmere
11974 } // namespace simdjson
11975 /* end file src/generic/stage2/tape_builder.h */
11976 
11977 //
11978 // Implementation-specific overrides
11979 //
11980 
11981 namespace simdjson {
11982 namespace westmere {
11983 namespace {
11984 namespace stage1 {
11985 
find_escaped(uint64_t backslash)11986 simdjson_really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
11987   if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
11988   return find_escaped_branchless(backslash);
11989 }
11990 
11991 } // namespace stage1
11992 } // unnamed namespace
11993 
/**
 * Westmere implementation of minify: forwards buf/len and the output dst/dst_len to the generic
 * json_minifier instantiated with 64 (presumably the step size in bytes; confirm in json_minifier).
 */
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
11997 
/**
 * Stage 1 for Westmere: remembers the input buffer and its length on this parser, then runs the
 * generic json_structural_indexer (instantiated with 64) over it.
 *
 * @param streaming Passed through unchanged to the indexer.
 */
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
  this->buf = _buf;
  this->len = _len;
  return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
12003 
/** Westmere implementation of validate_utf8: forwards to the generic generic_validate_utf8(). */
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return westmere::stage1::generic_validate_utf8(buf,len);
}
12007 
/** Stage 2: build the tape for _doc with the tape builder in non-streaming mode (STREAMING = false). */
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  return stage2::tape_builder::parse_document<false>(*this, _doc);
}
12011 
/**
 * Stage 2 in streaming mode (STREAMING = true): the tape builder starts at this parser's
 * next_structural_index rather than at 0.
 */
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  return stage2::tape_builder::parse_document<true>(*this, _doc);
}
12015 
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)12016 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
12017   auto error = stage1(_buf, _len, false);
12018   if (error) { return error; }
12019   return stage2(_doc);
12020 }
12021 
12022 } // namespace westmere
12023 } // namespace simdjson
12024 
12025 /* begin file include/simdjson/westmere/end.h */
12026 SIMDJSON_UNTARGET_WESTMERE
12027 /* end file include/simdjson/westmere/end.h */
12028 /* end file src/westmere/dom_parser_implementation.cpp */
12029 #endif
12030 
12031 SIMDJSON_POP_DISABLE_WARNINGS
12032 /* end file src/simdjson.cpp */
12033