1 /* auto-generated on 2021-06-04 17:09:21 -0400. Do not edit! */
2 /* begin file src/simdjson.cpp */
3 #include "simdjson.h"
4
5 SIMDJSON_PUSH_DISABLE_WARNINGS
6 SIMDJSON_DISABLE_UNDESIRED_WARNINGS
7
8 /* begin file src/to_chars.cpp */
#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>
12 namespace simdjson {
13 namespace internal {
14 /*!
15 implements the Grisu2 algorithm for binary to decimal floating-point
16 conversion.
17 Adapted from JSON for Modern C++
18
19 This implementation is a slightly modified version of the reference
20 implementation which may be obtained from
21 http://florian.loitsch.com/publications (bench.tar.gz).
22 The code is distributed under the MIT license, Copyright (c) 2009 Florian
23 Loitsch. For a detailed description of the algorithm see: [1] Loitsch, "Printing
24 Floating-Point Numbers Quickly and Accurately with Integers", Proceedings of the
25 ACM SIGPLAN 2010 Conference on Programming Language Design and Implementation,
26 PLDI 2010 [2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and
27 Accurately", Proceedings of the ACM SIGPLAN 1996 Conference on Programming
28 Language Design and Implementation, PLDI 1996
29 */
30 namespace dtoa_impl {
31
32 template <typename Target, typename Source>
33 Target reinterpret_bits(const Source source) {
34 static_assert(sizeof(Target) == sizeof(Source), "size mismatch");
35
36 Target target;
37 std::memcpy(&target, &source, sizeof(Source));
38 return target;
39 }
40
41 struct diyfp // f * 2^e
42 {
43 static constexpr int kPrecision = 64; // = q
44
45 std::uint64_t f = 0;
46 int e = 0;
47
diyfpsimdjson::internal::dtoa_impl::diyfp48 constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {}
49
50 /*!
51 @brief returns x - y
52 @pre x.e == y.e and x.f >= y.f
53 */
subsimdjson::internal::dtoa_impl::diyfp54 static diyfp sub(const diyfp &x, const diyfp &y) noexcept {
55
56 return {x.f - y.f, x.e};
57 }
58
59 /*!
60 @brief returns x * y
61 @note The result is rounded. (Only the upper q bits are returned.)
62 */
mulsimdjson::internal::dtoa_impl::diyfp63 static diyfp mul(const diyfp &x, const diyfp &y) noexcept {
64 static_assert(kPrecision == 64, "internal error");
65
66 // Computes:
67 // f = round((x.f * y.f) / 2^q)
68 // e = x.e + y.e + q
69
70 // Emulate the 64-bit * 64-bit multiplication:
71 //
72 // p = u * v
73 // = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi)
74 // = (u_lo v_lo ) + 2^32 ((u_lo v_hi ) + (u_hi v_lo )) +
75 // 2^64 (u_hi v_hi ) = (p0 ) + 2^32 ((p1 ) + (p2 ))
76 // + 2^64 (p3 ) = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo +
77 // 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3 ) =
78 // (p0_lo ) + 2^32 (p0_hi + p1_lo + p2_lo ) + 2^64 (p1_hi +
79 // p2_hi + p3) = (p0_lo ) + 2^32 (Q ) + 2^64 (H ) = (p0_lo ) +
80 // 2^32 (Q_lo + 2^32 Q_hi ) + 2^64 (H )
81 //
82 // (Since Q might be larger than 2^32 - 1)
83 //
84 // = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H)
85 //
86 // (Q_hi + H does not overflow a 64-bit int)
87 //
88 // = p_lo + 2^64 p_hi
89
90 const std::uint64_t u_lo = x.f & 0xFFFFFFFFu;
91 const std::uint64_t u_hi = x.f >> 32u;
92 const std::uint64_t v_lo = y.f & 0xFFFFFFFFu;
93 const std::uint64_t v_hi = y.f >> 32u;
94
95 const std::uint64_t p0 = u_lo * v_lo;
96 const std::uint64_t p1 = u_lo * v_hi;
97 const std::uint64_t p2 = u_hi * v_lo;
98 const std::uint64_t p3 = u_hi * v_hi;
99
100 const std::uint64_t p0_hi = p0 >> 32u;
101 const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu;
102 const std::uint64_t p1_hi = p1 >> 32u;
103 const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu;
104 const std::uint64_t p2_hi = p2 >> 32u;
105
106 std::uint64_t Q = p0_hi + p1_lo + p2_lo;
107
108 // The full product might now be computed as
109 //
110 // p_hi = p3 + p2_hi + p1_hi + (Q >> 32)
111 // p_lo = p0_lo + (Q << 32)
112 //
113 // But in this particular case here, the full p_lo is not required.
114 // Effectively we only need to add the highest bit in p_lo to p_hi (and
115 // Q_hi + 1 does not overflow).
116
117 Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up
118
119 const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u);
120
121 return {h, x.e + y.e + 64};
122 }
123
124 /*!
125 @brief normalize x such that the significand is >= 2^(q-1)
126 @pre x.f != 0
127 */
normalizesimdjson::internal::dtoa_impl::diyfp128 static diyfp normalize(diyfp x) noexcept {
129
130 while ((x.f >> 63u) == 0) {
131 x.f <<= 1u;
132 x.e--;
133 }
134
135 return x;
136 }
137
138 /*!
139 @brief normalize x such that the result has the exponent E
140 @pre e >= x.e and the upper e - x.e bits of x.f must be zero.
141 */
normalize_tosimdjson::internal::dtoa_impl::diyfp142 static diyfp normalize_to(const diyfp &x,
143 const int target_exponent) noexcept {
144 const int delta = x.e - target_exponent;
145
146 return {x.f << delta, target_exponent};
147 }
148 };
149
150 struct boundaries {
151 diyfp w;
152 diyfp minus;
153 diyfp plus;
154 };
155
156 /*!
157 Compute the (normalized) diyfp representing the input number 'value' and its
158 boundaries.
159 @pre value must be finite and positive
160 */
compute_boundaries(FloatType value)161 template <typename FloatType> boundaries compute_boundaries(FloatType value) {
162
163 // Convert the IEEE representation into a diyfp.
164 //
165 // If v is denormal:
166 // value = 0.F * 2^(1 - bias) = ( F) * 2^(1 - bias - (p-1))
167 // If v is normalized:
168 // value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1))
169
170 static_assert(std::numeric_limits<FloatType>::is_iec559,
171 "internal error: dtoa_short requires an IEEE-754 "
172 "floating-point implementation");
173
174 constexpr int kPrecision =
175 std::numeric_limits<FloatType>::digits; // = p (includes the hidden bit)
176 constexpr int kBias =
177 std::numeric_limits<FloatType>::max_exponent - 1 + (kPrecision - 1);
178 constexpr int kMinExp = 1 - kBias;
179 constexpr std::uint64_t kHiddenBit = std::uint64_t{1}
180 << (kPrecision - 1); // = 2^(p-1)
181
182 using bits_type = typename std::conditional<kPrecision == 24, std::uint32_t,
183 std::uint64_t>::type;
184
185 const std::uint64_t bits = reinterpret_bits<bits_type>(value);
186 const std::uint64_t E = bits >> (kPrecision - 1);
187 const std::uint64_t F = bits & (kHiddenBit - 1);
188
189 const bool is_denormal = E == 0;
190 const diyfp v = is_denormal
191 ? diyfp(F, kMinExp)
192 : diyfp(F + kHiddenBit, static_cast<int>(E) - kBias);
193
194 // Compute the boundaries m- and m+ of the floating-point value
195 // v = f * 2^e.
196 //
197 // Determine v- and v+, the floating-point predecessor and successor if v,
198 // respectively.
199 //
200 // v- = v - 2^e if f != 2^(p-1) or e == e_min (A)
201 // = v - 2^(e-1) if f == 2^(p-1) and e > e_min (B)
202 //
203 // v+ = v + 2^e
204 //
205 // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_
206 // between m- and m+ round to v, regardless of how the input rounding
207 // algorithm breaks ties.
208 //
209 // ---+-------------+-------------+-------------+-------------+--- (A)
210 // v- m- v m+ v+
211 //
212 // -----------------+------+------+-------------+-------------+--- (B)
213 // v- m- v m+ v+
214
215 const bool lower_boundary_is_closer = F == 0 && E > 1;
216 const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1);
217 const diyfp m_minus = lower_boundary_is_closer
218 ? diyfp(4 * v.f - 1, v.e - 2) // (B)
219 : diyfp(2 * v.f - 1, v.e - 1); // (A)
220
221 // Determine the normalized w+ = m+.
222 const diyfp w_plus = diyfp::normalize(m_plus);
223
224 // Determine w- = m- such that e_(w-) = e_(w+).
225 const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e);
226
227 return {diyfp::normalize(v), w_minus, w_plus};
228 }
229
230 // Given normalized diyfp w, Grisu needs to find a (normalized) cached
231 // power-of-ten c, such that the exponent of the product c * w = f * 2^e lies
232 // within a certain range [alpha, gamma] (Definition 3.2 from [1])
233 //
234 // alpha <= e = e_c + e_w + q <= gamma
235 //
236 // or
237 //
238 // f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q
239 // <= f_c * f_w * 2^gamma
240 //
241 // Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies
242 //
243 // 2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma
244 //
245 // or
246 //
247 // 2^(q - 2 + alpha) <= c * w < 2^(q + gamma)
248 //
249 // The choice of (alpha,gamma) determines the size of the table and the form of
250 // the digit generation procedure. Using (alpha,gamma)=(-60,-32) works out well
251 // in practice:
252 //
253 // The idea is to cut the number c * w = f * 2^e into two parts, which can be
254 // processed independently: An integral part p1, and a fractional part p2:
255 //
256 // f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e
257 // = (f div 2^-e) + (f mod 2^-e) * 2^e
258 // = p1 + p2 * 2^e
259 //
260 // The conversion of p1 into decimal form requires a series of divisions and
261 // modulos by (a power of) 10. These operations are faster for 32-bit than for
262 // 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be
263 // achieved by choosing
264 //
265 // -e >= 32 or e <= -32 := gamma
266 //
267 // In order to convert the fractional part
268 //
269 // p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ...
270 //
271 // into decimal form, the fraction is repeatedly multiplied by 10 and the digits
272 // d[-i] are extracted in order:
273 //
274 // (10 * p2) div 2^-e = d[-1]
275 // (10 * p2) mod 2^-e = d[-2] / 10^1 + ...
276 //
277 // The multiplication by 10 must not overflow. It is sufficient to choose
278 //
279 // 10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64.
280 //
281 // Since p2 = f mod 2^-e < 2^-e,
282 //
283 // -e <= 60 or e >= -60 := alpha
284
285 constexpr int kAlpha = -60;
286 constexpr int kGamma = -32;
287
288 struct cached_power // c = f * 2^e ~= 10^k
289 {
290 std::uint64_t f;
291 int e;
292 int k;
293 };
294
295 /*!
296 For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached
297 power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c
298 satisfies (Definition 3.2 from [1])
299 alpha <= e_c + e + q <= gamma.
300 */
get_cached_power_for_binary_exponent(int e)301 inline cached_power get_cached_power_for_binary_exponent(int e) {
302 // Now
303 //
304 // alpha <= e_c + e + q <= gamma (1)
305 // ==> f_c * 2^alpha <= c * 2^e * 2^q
306 //
307 // and since the c's are normalized, 2^(q-1) <= f_c,
308 //
309 // ==> 2^(q - 1 + alpha) <= c * 2^(e + q)
310 // ==> 2^(alpha - e - 1) <= c
311 //
312 // If c were an exact power of ten, i.e. c = 10^k, one may determine k as
313 //
314 // k = ceil( log_10( 2^(alpha - e - 1) ) )
315 // = ceil( (alpha - e - 1) * log_10(2) )
316 //
317 // From the paper:
318 // "In theory the result of the procedure could be wrong since c is rounded,
319 // and the computation itself is approximated [...]. In practice, however,
320 // this simple function is sufficient."
321 //
322 // For IEEE double precision floating-point numbers converted into
323 // normalized diyfp's w = f * 2^e, with q = 64,
324 //
325 // e >= -1022 (min IEEE exponent)
326 // -52 (p - 1)
327 // -52 (p - 1, possibly normalize denormal IEEE numbers)
328 // -11 (normalize the diyfp)
329 // = -1137
330 //
331 // and
332 //
333 // e <= +1023 (max IEEE exponent)
334 // -52 (p - 1)
335 // -11 (normalize the diyfp)
336 // = 960
337 //
338 // This binary exponent range [-1137,960] results in a decimal exponent
339 // range [-307,324]. One does not need to store a cached power for each
340 // k in this range. For each such k it suffices to find a cached power
341 // such that the exponent of the product lies in [alpha,gamma].
342 // This implies that the difference of the decimal exponents of adjacent
343 // table entries must be less than or equal to
344 //
345 // floor( (gamma - alpha) * log_10(2) ) = 8.
346 //
347 // (A smaller distance gamma-alpha would require a larger table.)
348
349 // NB:
350 // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34.
351
352 constexpr int kCachedPowersMinDecExp = -300;
353 constexpr int kCachedPowersDecStep = 8;
354
355 static constexpr std::array<cached_power, 79> kCachedPowers = {{
356 {0xAB70FE17C79AC6CA, -1060, -300}, {0xFF77B1FCBEBCDC4F, -1034, -292},
357 {0xBE5691EF416BD60C, -1007, -284}, {0x8DD01FAD907FFC3C, -980, -276},
358 {0xD3515C2831559A83, -954, -268}, {0x9D71AC8FADA6C9B5, -927, -260},
359 {0xEA9C227723EE8BCB, -901, -252}, {0xAECC49914078536D, -874, -244},
360 {0x823C12795DB6CE57, -847, -236}, {0xC21094364DFB5637, -821, -228},
361 {0x9096EA6F3848984F, -794, -220}, {0xD77485CB25823AC7, -768, -212},
362 {0xA086CFCD97BF97F4, -741, -204}, {0xEF340A98172AACE5, -715, -196},
363 {0xB23867FB2A35B28E, -688, -188}, {0x84C8D4DFD2C63F3B, -661, -180},
364 {0xC5DD44271AD3CDBA, -635, -172}, {0x936B9FCEBB25C996, -608, -164},
365 {0xDBAC6C247D62A584, -582, -156}, {0xA3AB66580D5FDAF6, -555, -148},
366 {0xF3E2F893DEC3F126, -529, -140}, {0xB5B5ADA8AAFF80B8, -502, -132},
367 {0x87625F056C7C4A8B, -475, -124}, {0xC9BCFF6034C13053, -449, -116},
368 {0x964E858C91BA2655, -422, -108}, {0xDFF9772470297EBD, -396, -100},
369 {0xA6DFBD9FB8E5B88F, -369, -92}, {0xF8A95FCF88747D94, -343, -84},
370 {0xB94470938FA89BCF, -316, -76}, {0x8A08F0F8BF0F156B, -289, -68},
371 {0xCDB02555653131B6, -263, -60}, {0x993FE2C6D07B7FAC, -236, -52},
372 {0xE45C10C42A2B3B06, -210, -44}, {0xAA242499697392D3, -183, -36},
373 {0xFD87B5F28300CA0E, -157, -28}, {0xBCE5086492111AEB, -130, -20},
374 {0x8CBCCC096F5088CC, -103, -12}, {0xD1B71758E219652C, -77, -4},
375 {0x9C40000000000000, -50, 4}, {0xE8D4A51000000000, -24, 12},
376 {0xAD78EBC5AC620000, 3, 20}, {0x813F3978F8940984, 30, 28},
377 {0xC097CE7BC90715B3, 56, 36}, {0x8F7E32CE7BEA5C70, 83, 44},
378 {0xD5D238A4ABE98068, 109, 52}, {0x9F4F2726179A2245, 136, 60},
379 {0xED63A231D4C4FB27, 162, 68}, {0xB0DE65388CC8ADA8, 189, 76},
380 {0x83C7088E1AAB65DB, 216, 84}, {0xC45D1DF942711D9A, 242, 92},
381 {0x924D692CA61BE758, 269, 100}, {0xDA01EE641A708DEA, 295, 108},
382 {0xA26DA3999AEF774A, 322, 116}, {0xF209787BB47D6B85, 348, 124},
383 {0xB454E4A179DD1877, 375, 132}, {0x865B86925B9BC5C2, 402, 140},
384 {0xC83553C5C8965D3D, 428, 148}, {0x952AB45CFA97A0B3, 455, 156},
385 {0xDE469FBD99A05FE3, 481, 164}, {0xA59BC234DB398C25, 508, 172},
386 {0xF6C69A72A3989F5C, 534, 180}, {0xB7DCBF5354E9BECE, 561, 188},
387 {0x88FCF317F22241E2, 588, 196}, {0xCC20CE9BD35C78A5, 614, 204},
388 {0x98165AF37B2153DF, 641, 212}, {0xE2A0B5DC971F303A, 667, 220},
389 {0xA8D9D1535CE3B396, 694, 228}, {0xFB9B7CD9A4A7443C, 720, 236},
390 {0xBB764C4CA7A44410, 747, 244}, {0x8BAB8EEFB6409C1A, 774, 252},
391 {0xD01FEF10A657842C, 800, 260}, {0x9B10A4E5E9913129, 827, 268},
392 {0xE7109BFBA19C0C9D, 853, 276}, {0xAC2820D9623BF429, 880, 284},
393 {0x80444B5E7AA7CF85, 907, 292}, {0xBF21E44003ACDD2D, 933, 300},
394 {0x8E679C2F5E44FF8F, 960, 308}, {0xD433179D9C8CB841, 986, 316},
395 {0x9E19DB92B4E31BA9, 1013, 324},
396 }};
397
398 // This computation gives exactly the same results for k as
399 // k = ceil((kAlpha - e - 1) * 0.30102999566398114)
400 // for |e| <= 1500, but doesn't require floating-point operations.
401 // NB: log_10(2) ~= 78913 / 2^18
402 const int f = kAlpha - e - 1;
403 const int k = (f * 78913) / (1 << 18) + static_cast<int>(f > 0);
404
405 const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) /
406 kCachedPowersDecStep;
407
408 const cached_power cached = kCachedPowers[static_cast<std::size_t>(index)];
409
410 return cached;
411 }
412
413 /*!
414 For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k.
415 For n == 0, returns 1 and sets pow10 := 1.
416 */
find_largest_pow10(const std::uint32_t n,std::uint32_t & pow10)417 inline int find_largest_pow10(const std::uint32_t n, std::uint32_t &pow10) {
418 // LCOV_EXCL_START
419 if (n >= 1000000000) {
420 pow10 = 1000000000;
421 return 10;
422 }
423 // LCOV_EXCL_STOP
424 else if (n >= 100000000) {
425 pow10 = 100000000;
426 return 9;
427 } else if (n >= 10000000) {
428 pow10 = 10000000;
429 return 8;
430 } else if (n >= 1000000) {
431 pow10 = 1000000;
432 return 7;
433 } else if (n >= 100000) {
434 pow10 = 100000;
435 return 6;
436 } else if (n >= 10000) {
437 pow10 = 10000;
438 return 5;
439 } else if (n >= 1000) {
440 pow10 = 1000;
441 return 4;
442 } else if (n >= 100) {
443 pow10 = 100;
444 return 3;
445 } else if (n >= 10) {
446 pow10 = 10;
447 return 2;
448 } else {
449 pow10 = 1;
450 return 1;
451 }
452 }
453
grisu2_round(char * buf,int len,std::uint64_t dist,std::uint64_t delta,std::uint64_t rest,std::uint64_t ten_k)454 inline void grisu2_round(char *buf, int len, std::uint64_t dist,
455 std::uint64_t delta, std::uint64_t rest,
456 std::uint64_t ten_k) {
457
458 // <--------------------------- delta ---->
459 // <---- dist --------->
460 // --------------[------------------+-------------------]--------------
461 // M- w M+
462 //
463 // ten_k
464 // <------>
465 // <---- rest ---->
466 // --------------[------------------+----+--------------]--------------
467 // w V
468 // = buf * 10^k
469 //
470 // ten_k represents a unit-in-the-last-place in the decimal representation
471 // stored in buf.
472 // Decrement buf by ten_k while this takes buf closer to w.
473
474 // The tests are written in this order to avoid overflow in unsigned
475 // integer arithmetic.
476
477 while (rest < dist && delta - rest >= ten_k &&
478 (rest + ten_k < dist || dist - rest > rest + ten_k - dist)) {
479 buf[len - 1]--;
480 rest += ten_k;
481 }
482 }
483
484 /*!
485 Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+.
486 M- and M+ must be normalized and share the same exponent -60 <= e <= -32.
487 */
grisu2_digit_gen(char * buffer,int & length,int & decimal_exponent,diyfp M_minus,diyfp w,diyfp M_plus)488 inline void grisu2_digit_gen(char *buffer, int &length, int &decimal_exponent,
489 diyfp M_minus, diyfp w, diyfp M_plus) {
490 static_assert(kAlpha >= -60, "internal error");
491 static_assert(kGamma <= -32, "internal error");
492
493 // Generates the digits (and the exponent) of a decimal floating-point
494 // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's
495 // w, M- and M+ share the same exponent e, which satisfies alpha <= e <=
496 // gamma.
497 //
498 // <--------------------------- delta ---->
499 // <---- dist --------->
500 // --------------[------------------+-------------------]--------------
501 // M- w M+
502 //
503 // Grisu2 generates the digits of M+ from left to right and stops as soon as
504 // V is in [M-,M+].
505
506 std::uint64_t delta =
507 diyfp::sub(M_plus, M_minus)
508 .f; // (significand of (M+ - M-), implicit exponent is e)
509 std::uint64_t dist =
510 diyfp::sub(M_plus, w)
511 .f; // (significand of (M+ - w ), implicit exponent is e)
512
513 // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0):
514 //
515 // M+ = f * 2^e
516 // = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e
517 // = ((p1 ) * 2^-e + (p2 )) * 2^e
518 // = p1 + p2 * 2^e
519
520 const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e);
521
522 auto p1 = static_cast<std::uint32_t>(
523 M_plus.f >>
524 -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.)
525 std::uint64_t p2 = M_plus.f & (one.f - 1); // p2 = f mod 2^-e
526
527 // 1)
528 //
529 // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0]
530
531 std::uint32_t pow10;
532 const int k = find_largest_pow10(p1, pow10);
533
534 // 10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1)
535 //
536 // p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1))
537 // = (d[k-1] ) * 10^(k-1) + (p1 mod 10^(k-1))
538 //
539 // M+ = p1 + p2 * 2^e
540 // = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1)) + p2 * 2^e
541 // = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e
542 // = d[k-1] * 10^(k-1) + ( rest) * 2^e
543 //
544 // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0)
545 //
546 // p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0]
547 //
548 // but stop as soon as
549 //
550 // rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e
551
552 int n = k;
553 while (n > 0) {
554 // Invariants:
555 // M+ = buffer * 10^n + (p1 + p2 * 2^e) (buffer = 0 for n = k)
556 // pow10 = 10^(n-1) <= p1 < 10^n
557 //
558 const std::uint32_t d = p1 / pow10; // d = p1 div 10^(n-1)
559 const std::uint32_t r = p1 % pow10; // r = p1 mod 10^(n-1)
560 //
561 // M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e
562 // = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e)
563 //
564 buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
565 //
566 // M+ = buffer * 10^(n-1) + (r + p2 * 2^e)
567 //
568 p1 = r;
569 n--;
570 //
571 // M+ = buffer * 10^n + (p1 + p2 * 2^e)
572 // pow10 = 10^n
573 //
574
575 // Now check if enough digits have been generated.
576 // Compute
577 //
578 // p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e
579 //
580 // Note:
581 // Since rest and delta share the same exponent e, it suffices to
582 // compare the significands.
583 const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2;
584 if (rest <= delta) {
585 // V = buffer * 10^n, with M- <= V <= M+.
586
587 decimal_exponent += n;
588
589 // We may now just stop. But instead look if the buffer could be
590 // decremented to bring V closer to w.
591 //
592 // pow10 = 10^n is now 1 ulp in the decimal representation V.
593 // The rounding procedure works with diyfp's with an implicit
594 // exponent of e.
595 //
596 // 10^n = (10^n * 2^-e) * 2^e = ulp * 2^e
597 //
598 const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e;
599 grisu2_round(buffer, length, dist, delta, rest, ten_n);
600
601 return;
602 }
603
604 pow10 /= 10;
605 //
606 // pow10 = 10^(n-1) <= p1 < 10^n
607 // Invariants restored.
608 }
609
610 // 2)
611 //
612 // The digits of the integral part have been generated:
613 //
614 // M+ = d[k-1]...d[1]d[0] + p2 * 2^e
615 // = buffer + p2 * 2^e
616 //
617 // Now generate the digits of the fractional part p2 * 2^e.
618 //
619 // Note:
620 // No decimal point is generated: the exponent is adjusted instead.
621 //
622 // p2 actually represents the fraction
623 //
624 // p2 * 2^e
625 // = p2 / 2^-e
626 // = d[-1] / 10^1 + d[-2] / 10^2 + ...
627 //
628 // Now generate the digits d[-m] of p1 from left to right (m = 1,2,...)
629 //
630 // p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m
631 // + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...)
632 //
633 // using
634 //
635 // 10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e)
636 // = ( d) * 2^-e + ( r)
637 //
638 // or
639 // 10^m * p2 * 2^e = d + r * 2^e
640 //
641 // i.e.
642 //
643 // M+ = buffer + p2 * 2^e
644 // = buffer + 10^-m * (d + r * 2^e)
645 // = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e
646 //
647 // and stop as soon as 10^-m * r * 2^e <= delta * 2^e
648
649 int m = 0;
650 for (;;) {
651 // Invariant:
652 // M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...)
653 // * 2^e
654 // = buffer * 10^-m + 10^-m * (p2 )
655 // * 2^e = buffer * 10^-m + 10^-m * (1/10 * (10 * p2) ) * 2^e =
656 // buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e +
657 // (10*p2 mod 2^-e)) * 2^e
658 //
659 p2 *= 10;
660 const std::uint64_t d = p2 >> -one.e; // d = (10 * p2) div 2^-e
661 const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e
662 //
663 // M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e
664 // = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e))
665 // = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e
666 //
667 buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
668 //
669 // M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e
670 //
671 p2 = r;
672 m++;
673 //
674 // M+ = buffer * 10^-m + 10^-m * p2 * 2^e
675 // Invariant restored.
676
677 // Check if enough digits have been generated.
678 //
679 // 10^-m * p2 * 2^e <= delta * 2^e
680 // p2 * 2^e <= 10^m * delta * 2^e
681 // p2 <= 10^m * delta
682 delta *= 10;
683 dist *= 10;
684 if (p2 <= delta) {
685 break;
686 }
687 }
688
689 // V = buffer * 10^-m, with M- <= V <= M+.
690
691 decimal_exponent -= m;
692
693 // 1 ulp in the decimal representation is now 10^-m.
694 // Since delta and dist are now scaled by 10^m, we need to do the
695 // same with ulp in order to keep the units in sync.
696 //
697 // 10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e
698 //
699 const std::uint64_t ten_m = one.f;
700 grisu2_round(buffer, length, dist, delta, p2, ten_m);
701
702 // By construction this algorithm generates the shortest possible decimal
703 // number (Loitsch, Theorem 6.2) which rounds back to w.
704 // For an input number of precision p, at least
705 //
706 // N = 1 + ceil(p * log_10(2))
707 //
708 // decimal digits are sufficient to identify all binary floating-point
709 // numbers (Matula, "In-and-Out conversions").
710 // This implies that the algorithm does not produce more than N decimal
711 // digits.
712 //
713 // N = 17 for p = 53 (IEEE double precision)
714 // N = 9 for p = 24 (IEEE single precision)
715 }
716
717 /*!
718 v = buf * 10^decimal_exponent
719 len is the length of the buffer (number of decimal digits)
720 The buffer must be large enough, i.e. >= max_digits10.
721 */
grisu2(char * buf,int & len,int & decimal_exponent,diyfp m_minus,diyfp v,diyfp m_plus)722 inline void grisu2(char *buf, int &len, int &decimal_exponent, diyfp m_minus,
723 diyfp v, diyfp m_plus) {
724
725 // --------(-----------------------+-----------------------)-------- (A)
726 // m- v m+
727 //
728 // --------------------(-----------+-----------------------)-------- (B)
729 // m- v m+
730 //
731 // First scale v (and m- and m+) such that the exponent is in the range
732 // [alpha, gamma].
733
734 const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e);
735
736 const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k
737
738 // The exponent of the products is = v.e + c_minus_k.e + q and is in the range
739 // [alpha,gamma]
740 const diyfp w = diyfp::mul(v, c_minus_k);
741 const diyfp w_minus = diyfp::mul(m_minus, c_minus_k);
742 const diyfp w_plus = diyfp::mul(m_plus, c_minus_k);
743
744 // ----(---+---)---------------(---+---)---------------(---+---)----
745 // w- w w+
746 // = c*m- = c*v = c*m+
747 //
748 // diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and
749 // w+ are now off by a small amount.
750 // In fact:
751 //
752 // w - v * 10^k < 1 ulp
753 //
754 // To account for this inaccuracy, add resp. subtract 1 ulp.
755 //
756 // --------+---[---------------(---+---)---------------]---+--------
757 // w- M- w M+ w+
758 //
759 // Now any number in [M-, M+] (bounds included) will round to w when input,
760 // regardless of how the input rounding algorithm breaks ties.
761 //
762 // And digit_gen generates the shortest possible such number in [M-, M+].
763 // Note that this does not mean that Grisu2 always generates the shortest
764 // possible number in the interval (m-, m+).
765 const diyfp M_minus(w_minus.f + 1, w_minus.e);
766 const diyfp M_plus(w_plus.f - 1, w_plus.e);
767
768 decimal_exponent = -cached.k; // = -(-k) = k
769
770 grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus);
771 }
772
773 /*!
774 v = buf * 10^decimal_exponent
775 len is the length of the buffer (number of decimal digits)
776 The buffer must be large enough, i.e. >= max_digits10.
777 */
778 template <typename FloatType>
grisu2(char * buf,int & len,int & decimal_exponent,FloatType value)779 void grisu2(char *buf, int &len, int &decimal_exponent, FloatType value) {
780 static_assert(diyfp::kPrecision >= std::numeric_limits<FloatType>::digits + 3,
781 "internal error: not enough precision");
782
783 // If the neighbors (and boundaries) of 'value' are always computed for
784 // double-precision numbers, all float's can be recovered using strtod (and
785 // strtof). However, the resulting decimal representations are not exactly
786 // "short".
787 //
788 // The documentation for 'std::to_chars'
789 // (https://en.cppreference.com/w/cpp/utility/to_chars) says "value is
790 // converted to a string as if by std::sprintf in the default ("C") locale"
791 // and since sprintf promotes float's to double's, I think this is exactly
792 // what 'std::to_chars' does. On the other hand, the documentation for
793 // 'std::to_chars' requires that "parsing the representation using the
794 // corresponding std::from_chars function recovers value exactly". That
795 // indicates that single precision floating-point numbers should be recovered
796 // using 'std::strtof'.
797 //
798 // NB: If the neighbors are computed for single-precision numbers, there is a
799 // single float
800 // (7.0385307e-26f) which can't be recovered using strtod. The resulting
801 // double precision value is off by 1 ulp.
802 #if 0
803 const boundaries w = compute_boundaries(static_cast<double>(value));
804 #else
805 const boundaries w = compute_boundaries(value);
806 #endif
807
808 grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus);
809 }
810
811 /*!
812 @brief appends a decimal representation of e to buf
813 @return a pointer to the element following the exponent.
814 @pre -1000 < e < 1000
815 */
append_exponent(char * buf,int e)816 inline char *append_exponent(char *buf, int e) {
817
818 if (e < 0) {
819 e = -e;
820 *buf++ = '-';
821 } else {
822 *buf++ = '+';
823 }
824
825 auto k = static_cast<std::uint32_t>(e);
826 if (k < 10) {
827 // Always print at least two digits in the exponent.
828 // This is for compatibility with printf("%g").
829 *buf++ = '0';
830 *buf++ = static_cast<char>('0' + k);
831 } else if (k < 100) {
832 *buf++ = static_cast<char>('0' + k / 10);
833 k %= 10;
834 *buf++ = static_cast<char>('0' + k);
835 } else {
836 *buf++ = static_cast<char>('0' + k / 100);
837 k %= 100;
838 *buf++ = static_cast<char>('0' + k / 10);
839 k %= 10;
840 *buf++ = static_cast<char>('0' + k);
841 }
842
843 return buf;
844 }
845
846 /*!
847 @brief prettify v = buf * 10^decimal_exponent
848 If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point
849 notation. Otherwise it will be printed in exponential notation.
850 @pre min_exp < 0
851 @pre max_exp > 0
852 */
format_buffer(char * buf,int len,int decimal_exponent,int min_exp,int max_exp)853 inline char *format_buffer(char *buf, int len, int decimal_exponent,
854 int min_exp, int max_exp) {
855
856 const int k = len;
857 const int n = len + decimal_exponent;
858
859 // v = buf * 10^(n-k)
860 // k is the length of the buffer (number of decimal digits)
861 // n is the position of the decimal point relative to the start of the buffer.
862
863 if (k <= n && n <= max_exp) {
864 // digits[000]
865 // len <= max_exp + 2
866
867 std::memset(buf + k, '0', static_cast<size_t>(n) - static_cast<size_t>(k));
868 // Make it look like a floating-point number (#362, #378)
869 buf[n + 0] = '.';
870 buf[n + 1] = '0';
871 return buf + (static_cast<size_t>(n) + 2);
872 }
873
874 if (0 < n && n <= max_exp) {
875 // dig.its
876 // len <= max_digits10 + 1
877 std::memmove(buf + (static_cast<size_t>(n) + 1), buf + n,
878 static_cast<size_t>(k) - static_cast<size_t>(n));
879 buf[n] = '.';
880 return buf + (static_cast<size_t>(k) + 1U);
881 }
882
883 if (min_exp < n && n <= 0) {
884 // 0.[000]digits
885 // len <= 2 + (-min_exp - 1) + max_digits10
886
887 std::memmove(buf + (2 + static_cast<size_t>(-n)), buf,
888 static_cast<size_t>(k));
889 buf[0] = '0';
890 buf[1] = '.';
891 std::memset(buf + 2, '0', static_cast<size_t>(-n));
892 return buf + (2U + static_cast<size_t>(-n) + static_cast<size_t>(k));
893 }
894
895 if (k == 1) {
896 // dE+123
897 // len <= 1 + 5
898
899 buf += 1;
900 } else {
901 // d.igitsE+123
902 // len <= max_digits10 + 1 + 5
903
904 std::memmove(buf + 2, buf + 1, static_cast<size_t>(k) - 1);
905 buf[1] = '.';
906 buf += 1 + static_cast<size_t>(k);
907 }
908
909 *buf++ = 'e';
910 return append_exponent(buf, n - 1);
911 }
912
913 } // namespace dtoa_impl
914
915 /*!
916 The format of the resulting decimal representation is similar to printf's %g
917 format. Returns an iterator pointing past-the-end of the decimal representation.
918 @note The input number must be finite, i.e. NaN's and Inf's are not supported.
919 @note The buffer must be large enough.
920 @note The result is NOT null-terminated.
921 */
to_chars(char * first,const char * last,double value)922 char *to_chars(char *first, const char *last, double value) {
923 static_cast<void>(last); // maybe unused - fix warning
924 if (value <= -0) {
925 value = -value;
926 *first++ = '-';
927 }
928
929 if (value == 0) // +-0
930 {
931 *first++ = '0';
932 // Make it look like a floating-point number (#362, #378)
933 *first++ = '.';
934 *first++ = '0';
935 return first;
936 }
937 // Compute v = buffer * 10^decimal_exponent.
938 // The decimal digits are stored in the buffer, which needs to be interpreted
939 // as an unsigned decimal integer.
940 // len is the length of the buffer, i.e. the number of decimal digits.
941 int len = 0;
942 int decimal_exponent = 0;
943 dtoa_impl::grisu2(first, len, decimal_exponent, value);
944 // Format the buffer like printf("%.*g", prec, value)
945 constexpr int kMinExp = -4;
946 constexpr int kMaxExp = std::numeric_limits<double>::digits10;
947
948 return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp,
949 kMaxExp);
950 }
951 } // namespace internal
952 } // namespace simdjson
953 /* end file src/to_chars.cpp */
954 /* begin file src/from_chars.cpp */
955 #include <limits>
956 namespace simdjson {
957 namespace internal {
958
959 /**
960 * The code in the internal::from_chars function is meant to handle the floating-point number parsing
961 * when we have more than 19 digits in the decimal mantissa. This should only be seen
962 * in adversarial scenarios: we do not expect production systems to even produce
963 * such floating-point numbers.
964 *
965 * The parser is based on work by Nigel Tao (at https://github.com/google/wuffs/)
966 * who credits Ken Thompson for the design (via a reference to the Go source
967 * code). See
968 * https://github.com/google/wuffs/blob/aa46859ea40c72516deffa1b146121952d6dfd3b/internal/cgen/base/floatconv-submodule-data.c
969 * https://github.com/google/wuffs/blob/46cd8105f47ca07ae2ba8e6a7818ef9c0df6c152/internal/cgen/base/floatconv-submodule-code.c
970 * It is probably not very fast but it is a fallback that should almost never be
971 * called in real life. Google Wuffs is published under APL 2.0.
972 **/
973
namespace {
// Capacity of decimal::digits: the largest number of decimal digits kept.
constexpr uint32_t max_digits{768};
// Bound on the magnitude of decimal::decimal_point; the shift routines and
// compute_float treat values beyond it as zero or infinity.
constexpr int32_t decimal_point_range{2047};
} // namespace
978
// Result of the decimal-to-binary conversion: the explicit mantissa bits and
// the biased binary exponent, both zero until filled in.
struct adjusted_mantissa {
  uint64_t mantissa{0};
  int power2{0};
  // Kept user-provided (not defaulted) so the type's properties do not change.
  adjusted_mantissa() {}
};
984
// Decimal number held as a digit string, used by the slow-path parser.
// The value is 0.d0d1d2... * 10^decimal_point, where d0 is digits[0].
struct decimal {
  // Number of digits; parse_decimal may count past max_digits before clamping.
  uint32_t num_digits;
  // Position of the decimal point relative to digits[0] (may be negative).
  int32_t decimal_point;
  // True when the parsed text started with '-'.
  bool negative;
  // True when non-zero digits had to be dropped for lack of room.
  bool truncated;
  // Digit values 0..9 (not ASCII characters), most significant first.
  uint8_t digits[max_digits];
};
992
// Layout constants of a binary floating-point type. The primary template only
// declares the queries; each supported type supplies the values.
template <typename T> struct binary_format {
  static constexpr int mantissa_explicit_bits();
  static constexpr int minimum_exponent();
  static constexpr int infinite_power();
  static constexpr int sign_index();
};

// Values for double.
template <> struct binary_format<double> {
  // Number of mantissa bits stored explicitly.
  static constexpr int mantissa_explicit_bits() { return 52; }
  // Subtracted from the binary exponent to obtain the biased exponent field.
  static constexpr int minimum_exponent() { return -1023; }
  // Biased exponent value used for infinity.
  static constexpr int infinite_power() { return 0x7FF; }
  // Bit position of the sign.
  static constexpr int sign_index() { return 63; }
};
1012
// Returns true when c is an ASCII decimal digit ('0'..'9').
bool is_integer(char c) noexcept {
  // Anything below '0' wraps to a large unsigned value, so one comparison
  // covers both ends of the range.
  return static_cast<unsigned>(c - '0') <= 9u;
}
1014
1015 // This should always succeed since it follows a call to parse_number.
parse_decimal(const char * & p)1016 decimal parse_decimal(const char *&p) noexcept {
1017 decimal answer;
1018 answer.num_digits = 0;
1019 answer.decimal_point = 0;
1020 answer.truncated = false;
1021 answer.negative = (*p == '-');
1022 if ((*p == '-') || (*p == '+')) {
1023 ++p;
1024 }
1025
1026 while (*p == '0') {
1027 ++p;
1028 }
1029 while (is_integer(*p)) {
1030 if (answer.num_digits < max_digits) {
1031 answer.digits[answer.num_digits] = uint8_t(*p - '0');
1032 }
1033 answer.num_digits++;
1034 ++p;
1035 }
1036 if (*p == '.') {
1037 ++p;
1038 const char *first_after_period = p;
1039 // if we have not yet encountered a zero, we have to skip it as well
1040 if (answer.num_digits == 0) {
1041 // skip zeros
1042 while (*p == '0') {
1043 ++p;
1044 }
1045 }
1046 while (is_integer(*p)) {
1047 if (answer.num_digits < max_digits) {
1048 answer.digits[answer.num_digits] = uint8_t(*p - '0');
1049 }
1050 answer.num_digits++;
1051 ++p;
1052 }
1053 answer.decimal_point = int32_t(first_after_period - p);
1054 }
1055 if(answer.num_digits > 0) {
1056 const char *preverse = p - 1;
1057 int32_t trailing_zeros = 0;
1058 while ((*preverse == '0') || (*preverse == '.')) {
1059 if(*preverse == '0') { trailing_zeros++; };
1060 --preverse;
1061 }
1062 answer.decimal_point += int32_t(answer.num_digits);
1063 answer.num_digits -= uint32_t(trailing_zeros);
1064 }
1065 if(answer.num_digits > max_digits ) {
1066 answer.num_digits = max_digits;
1067 answer.truncated = true;
1068 }
1069 if (('e' == *p) || ('E' == *p)) {
1070 ++p;
1071 bool neg_exp = false;
1072 if ('-' == *p) {
1073 neg_exp = true;
1074 ++p;
1075 } else if ('+' == *p) {
1076 ++p;
1077 }
1078 int32_t exp_number = 0; // exponential part
1079 while (is_integer(*p)) {
1080 uint8_t digit = uint8_t(*p - '0');
1081 if (exp_number < 0x10000) {
1082 exp_number = 10 * exp_number + digit;
1083 }
1084 ++p;
1085 }
1086 answer.decimal_point += (neg_exp ? -exp_number : exp_number);
1087 }
1088 return answer;
1089 }
1090
1091 namespace {
1092
1093 // remove all final zeroes
trim(decimal & h)1094 inline void trim(decimal &h) {
1095 while ((h.num_digits > 0) && (h.digits[h.num_digits - 1] == 0)) {
1096 h.num_digits--;
1097 }
1098 }
1099
number_of_digits_decimal_left_shift(decimal & h,uint32_t shift)1100 uint32_t number_of_digits_decimal_left_shift(decimal &h, uint32_t shift) {
1101 shift &= 63;
1102 const static uint16_t number_of_digits_decimal_left_shift_table[65] = {
1103 0x0000, 0x0800, 0x0801, 0x0803, 0x1006, 0x1009, 0x100D, 0x1812, 0x1817,
1104 0x181D, 0x2024, 0x202B, 0x2033, 0x203C, 0x2846, 0x2850, 0x285B, 0x3067,
1105 0x3073, 0x3080, 0x388E, 0x389C, 0x38AB, 0x38BB, 0x40CC, 0x40DD, 0x40EF,
1106 0x4902, 0x4915, 0x4929, 0x513E, 0x5153, 0x5169, 0x5180, 0x5998, 0x59B0,
1107 0x59C9, 0x61E3, 0x61FD, 0x6218, 0x6A34, 0x6A50, 0x6A6D, 0x6A8B, 0x72AA,
1108 0x72C9, 0x72E9, 0x7B0A, 0x7B2B, 0x7B4D, 0x8370, 0x8393, 0x83B7, 0x83DC,
1109 0x8C02, 0x8C28, 0x8C4F, 0x9477, 0x949F, 0x94C8, 0x9CF2, 0x051C, 0x051C,
1110 0x051C, 0x051C,
1111 };
1112 uint32_t x_a = number_of_digits_decimal_left_shift_table[shift];
1113 uint32_t x_b = number_of_digits_decimal_left_shift_table[shift + 1];
1114 uint32_t num_new_digits = x_a >> 11;
1115 uint32_t pow5_a = 0x7FF & x_a;
1116 uint32_t pow5_b = 0x7FF & x_b;
1117 const static uint8_t
1118 number_of_digits_decimal_left_shift_table_powers_of_5[0x051C] = {
1119 5, 2, 5, 1, 2, 5, 6, 2, 5, 3, 1, 2, 5, 1, 5, 6, 2, 5, 7, 8, 1, 2, 5,
1120 3, 9, 0, 6, 2, 5, 1, 9, 5, 3, 1, 2, 5, 9, 7, 6, 5, 6, 2, 5, 4, 8, 8,
1121 2, 8, 1, 2, 5, 2, 4, 4, 1, 4, 0, 6, 2, 5, 1, 2, 2, 0, 7, 0, 3, 1, 2,
1122 5, 6, 1, 0, 3, 5, 1, 5, 6, 2, 5, 3, 0, 5, 1, 7, 5, 7, 8, 1, 2, 5, 1,
1123 5, 2, 5, 8, 7, 8, 9, 0, 6, 2, 5, 7, 6, 2, 9, 3, 9, 4, 5, 3, 1, 2, 5,
1124 3, 8, 1, 4, 6, 9, 7, 2, 6, 5, 6, 2, 5, 1, 9, 0, 7, 3, 4, 8, 6, 3, 2,
1125 8, 1, 2, 5, 9, 5, 3, 6, 7, 4, 3, 1, 6, 4, 0, 6, 2, 5, 4, 7, 6, 8, 3,
1126 7, 1, 5, 8, 2, 0, 3, 1, 2, 5, 2, 3, 8, 4, 1, 8, 5, 7, 9, 1, 0, 1, 5,
1127 6, 2, 5, 1, 1, 9, 2, 0, 9, 2, 8, 9, 5, 5, 0, 7, 8, 1, 2, 5, 5, 9, 6,
1128 0, 4, 6, 4, 4, 7, 7, 5, 3, 9, 0, 6, 2, 5, 2, 9, 8, 0, 2, 3, 2, 2, 3,
1129 8, 7, 6, 9, 5, 3, 1, 2, 5, 1, 4, 9, 0, 1, 1, 6, 1, 1, 9, 3, 8, 4, 7,
1130 6, 5, 6, 2, 5, 7, 4, 5, 0, 5, 8, 0, 5, 9, 6, 9, 2, 3, 8, 2, 8, 1, 2,
1131 5, 3, 7, 2, 5, 2, 9, 0, 2, 9, 8, 4, 6, 1, 9, 1, 4, 0, 6, 2, 5, 1, 8,
1132 6, 2, 6, 4, 5, 1, 4, 9, 2, 3, 0, 9, 5, 7, 0, 3, 1, 2, 5, 9, 3, 1, 3,
1133 2, 2, 5, 7, 4, 6, 1, 5, 4, 7, 8, 5, 1, 5, 6, 2, 5, 4, 6, 5, 6, 6, 1,
1134 2, 8, 7, 3, 0, 7, 7, 3, 9, 2, 5, 7, 8, 1, 2, 5, 2, 3, 2, 8, 3, 0, 6,
1135 4, 3, 6, 5, 3, 8, 6, 9, 6, 2, 8, 9, 0, 6, 2, 5, 1, 1, 6, 4, 1, 5, 3,
1136 2, 1, 8, 2, 6, 9, 3, 4, 8, 1, 4, 4, 5, 3, 1, 2, 5, 5, 8, 2, 0, 7, 6,
1137 6, 0, 9, 1, 3, 4, 6, 7, 4, 0, 7, 2, 2, 6, 5, 6, 2, 5, 2, 9, 1, 0, 3,
1138 8, 3, 0, 4, 5, 6, 7, 3, 3, 7, 0, 3, 6, 1, 3, 2, 8, 1, 2, 5, 1, 4, 5,
1139 5, 1, 9, 1, 5, 2, 2, 8, 3, 6, 6, 8, 5, 1, 8, 0, 6, 6, 4, 0, 6, 2, 5,
1140 7, 2, 7, 5, 9, 5, 7, 6, 1, 4, 1, 8, 3, 4, 2, 5, 9, 0, 3, 3, 2, 0, 3,
1141 1, 2, 5, 3, 6, 3, 7, 9, 7, 8, 8, 0, 7, 0, 9, 1, 7, 1, 2, 9, 5, 1, 6,
1142 6, 0, 1, 5, 6, 2, 5, 1, 8, 1, 8, 9, 8, 9, 4, 0, 3, 5, 4, 5, 8, 5, 6,
1143 4, 7, 5, 8, 3, 0, 0, 7, 8, 1, 2, 5, 9, 0, 9, 4, 9, 4, 7, 0, 1, 7, 7,
1144 2, 9, 2, 8, 2, 3, 7, 9, 1, 5, 0, 3, 9, 0, 6, 2, 5, 4, 5, 4, 7, 4, 7,
1145 3, 5, 0, 8, 8, 6, 4, 6, 4, 1, 1, 8, 9, 5, 7, 5, 1, 9, 5, 3, 1, 2, 5,
1146 2, 2, 7, 3, 7, 3, 6, 7, 5, 4, 4, 3, 2, 3, 2, 0, 5, 9, 4, 7, 8, 7, 5,
1147 9, 7, 6, 5, 6, 2, 5, 1, 1, 3, 6, 8, 6, 8, 3, 7, 7, 2, 1, 6, 1, 6, 0,
1148 2, 9, 7, 3, 9, 3, 7, 9, 8, 8, 2, 8, 1, 2, 5, 5, 6, 8, 4, 3, 4, 1, 8,
1149 8, 6, 0, 8, 0, 8, 0, 1, 4, 8, 6, 9, 6, 8, 9, 9, 4, 1, 4, 0, 6, 2, 5,
1150 2, 8, 4, 2, 1, 7, 0, 9, 4, 3, 0, 4, 0, 4, 0, 0, 7, 4, 3, 4, 8, 4, 4,
1151 9, 7, 0, 7, 0, 3, 1, 2, 5, 1, 4, 2, 1, 0, 8, 5, 4, 7, 1, 5, 2, 0, 2,
1152 0, 0, 3, 7, 1, 7, 4, 2, 2, 4, 8, 5, 3, 5, 1, 5, 6, 2, 5, 7, 1, 0, 5,
1153 4, 2, 7, 3, 5, 7, 6, 0, 1, 0, 0, 1, 8, 5, 8, 7, 1, 1, 2, 4, 2, 6, 7,
1154 5, 7, 8, 1, 2, 5, 3, 5, 5, 2, 7, 1, 3, 6, 7, 8, 8, 0, 0, 5, 0, 0, 9,
1155 2, 9, 3, 5, 5, 6, 2, 1, 3, 3, 7, 8, 9, 0, 6, 2, 5, 1, 7, 7, 6, 3, 5,
1156 6, 8, 3, 9, 4, 0, 0, 2, 5, 0, 4, 6, 4, 6, 7, 7, 8, 1, 0, 6, 6, 8, 9,
1157 4, 5, 3, 1, 2, 5, 8, 8, 8, 1, 7, 8, 4, 1, 9, 7, 0, 0, 1, 2, 5, 2, 3,
1158 2, 3, 3, 8, 9, 0, 5, 3, 3, 4, 4, 7, 2, 6, 5, 6, 2, 5, 4, 4, 4, 0, 8,
1159 9, 2, 0, 9, 8, 5, 0, 0, 6, 2, 6, 1, 6, 1, 6, 9, 4, 5, 2, 6, 6, 7, 2,
1160 3, 6, 3, 2, 8, 1, 2, 5, 2, 2, 2, 0, 4, 4, 6, 0, 4, 9, 2, 5, 0, 3, 1,
1161 3, 0, 8, 0, 8, 4, 7, 2, 6, 3, 3, 3, 6, 1, 8, 1, 6, 4, 0, 6, 2, 5, 1,
1162 1, 1, 0, 2, 2, 3, 0, 2, 4, 6, 2, 5, 1, 5, 6, 5, 4, 0, 4, 2, 3, 6, 3,
1163 1, 6, 6, 8, 0, 9, 0, 8, 2, 0, 3, 1, 2, 5, 5, 5, 5, 1, 1, 1, 5, 1, 2,
1164 3, 1, 2, 5, 7, 8, 2, 7, 0, 2, 1, 1, 8, 1, 5, 8, 3, 4, 0, 4, 5, 4, 1,
1165 0, 1, 5, 6, 2, 5, 2, 7, 7, 5, 5, 5, 7, 5, 6, 1, 5, 6, 2, 8, 9, 1, 3,
1166 5, 1, 0, 5, 9, 0, 7, 9, 1, 7, 0, 2, 2, 7, 0, 5, 0, 7, 8, 1, 2, 5, 1,
1167 3, 8, 7, 7, 7, 8, 7, 8, 0, 7, 8, 1, 4, 4, 5, 6, 7, 5, 5, 2, 9, 5, 3,
1168 9, 5, 8, 5, 1, 1, 3, 5, 2, 5, 3, 9, 0, 6, 2, 5, 6, 9, 3, 8, 8, 9, 3,
1169 9, 0, 3, 9, 0, 7, 2, 2, 8, 3, 7, 7, 6, 4, 7, 6, 9, 7, 9, 2, 5, 5, 6,
1170 7, 6, 2, 6, 9, 5, 3, 1, 2, 5, 3, 4, 6, 9, 4, 4, 6, 9, 5, 1, 9, 5, 3,
1171 6, 1, 4, 1, 8, 8, 8, 2, 3, 8, 4, 8, 9, 6, 2, 7, 8, 3, 8, 1, 3, 4, 7,
1172 6, 5, 6, 2, 5, 1, 7, 3, 4, 7, 2, 3, 4, 7, 5, 9, 7, 6, 8, 0, 7, 0, 9,
1173 4, 4, 1, 1, 9, 2, 4, 4, 8, 1, 3, 9, 1, 9, 0, 6, 7, 3, 8, 2, 8, 1, 2,
1174 5, 8, 6, 7, 3, 6, 1, 7, 3, 7, 9, 8, 8, 4, 0, 3, 5, 4, 7, 2, 0, 5, 9,
1175 6, 2, 2, 4, 0, 6, 9, 5, 9, 5, 3, 3, 6, 9, 1, 4, 0, 6, 2, 5,
1176 };
1177 const uint8_t *pow5 =
1178 &number_of_digits_decimal_left_shift_table_powers_of_5[pow5_a];
1179 uint32_t i = 0;
1180 uint32_t n = pow5_b - pow5_a;
1181 for (; i < n; i++) {
1182 if (i >= h.num_digits) {
1183 return num_new_digits - 1;
1184 } else if (h.digits[i] == pow5[i]) {
1185 continue;
1186 } else if (h.digits[i] < pow5[i]) {
1187 return num_new_digits - 1;
1188 } else {
1189 return num_new_digits;
1190 }
1191 }
1192 return num_new_digits;
1193 }
1194
1195 } // end of anonymous namespace
1196
round(decimal & h)1197 uint64_t round(decimal &h) {
1198 if ((h.num_digits == 0) || (h.decimal_point < 0)) {
1199 return 0;
1200 } else if (h.decimal_point > 18) {
1201 return UINT64_MAX;
1202 }
1203 // at this point, we know that h.decimal_point >= 0
1204 uint32_t dp = uint32_t(h.decimal_point);
1205 uint64_t n = 0;
1206 for (uint32_t i = 0; i < dp; i++) {
1207 n = (10 * n) + ((i < h.num_digits) ? h.digits[i] : 0);
1208 }
1209 bool round_up = false;
1210 if (dp < h.num_digits) {
1211 round_up = h.digits[dp] >= 5; // normally, we round up
1212 // but we may need to round to even!
1213 if ((h.digits[dp] == 5) && (dp + 1 == h.num_digits)) {
1214 round_up = h.truncated || ((dp > 0) && (1 & h.digits[dp - 1]));
1215 }
1216 }
1217 if (round_up) {
1218 n++;
1219 }
1220 return n;
1221 }
1222
1223 // computes h * 2^-shift
decimal_left_shift(decimal & h,uint32_t shift)1224 void decimal_left_shift(decimal &h, uint32_t shift) {
1225 if (h.num_digits == 0) {
1226 return;
1227 }
1228 uint32_t num_new_digits = number_of_digits_decimal_left_shift(h, shift);
1229 int32_t read_index = int32_t(h.num_digits - 1);
1230 uint32_t write_index = h.num_digits - 1 + num_new_digits;
1231 uint64_t n = 0;
1232
1233 while (read_index >= 0) {
1234 n += uint64_t(h.digits[read_index]) << shift;
1235 uint64_t quotient = n / 10;
1236 uint64_t remainder = n - (10 * quotient);
1237 if (write_index < max_digits) {
1238 h.digits[write_index] = uint8_t(remainder);
1239 } else if (remainder > 0) {
1240 h.truncated = true;
1241 }
1242 n = quotient;
1243 write_index--;
1244 read_index--;
1245 }
1246 while (n > 0) {
1247 uint64_t quotient = n / 10;
1248 uint64_t remainder = n - (10 * quotient);
1249 if (write_index < max_digits) {
1250 h.digits[write_index] = uint8_t(remainder);
1251 } else if (remainder > 0) {
1252 h.truncated = true;
1253 }
1254 n = quotient;
1255 write_index--;
1256 }
1257 h.num_digits += num_new_digits;
1258 if (h.num_digits > max_digits) {
1259 h.num_digits = max_digits;
1260 }
1261 h.decimal_point += int32_t(num_new_digits);
1262 trim(h);
1263 }
1264
1265 // computes h * 2^shift
decimal_right_shift(decimal & h,uint32_t shift)1266 void decimal_right_shift(decimal &h, uint32_t shift) {
1267 uint32_t read_index = 0;
1268 uint32_t write_index = 0;
1269
1270 uint64_t n = 0;
1271
1272 while ((n >> shift) == 0) {
1273 if (read_index < h.num_digits) {
1274 n = (10 * n) + h.digits[read_index++];
1275 } else if (n == 0) {
1276 return;
1277 } else {
1278 while ((n >> shift) == 0) {
1279 n = 10 * n;
1280 read_index++;
1281 }
1282 break;
1283 }
1284 }
1285 h.decimal_point -= int32_t(read_index - 1);
1286 if (h.decimal_point < -decimal_point_range) { // it is zero
1287 h.num_digits = 0;
1288 h.decimal_point = 0;
1289 h.negative = false;
1290 h.truncated = false;
1291 return;
1292 }
1293 uint64_t mask = (uint64_t(1) << shift) - 1;
1294 while (read_index < h.num_digits) {
1295 uint8_t new_digit = uint8_t(n >> shift);
1296 n = (10 * (n & mask)) + h.digits[read_index++];
1297 h.digits[write_index++] = new_digit;
1298 }
1299 while (n > 0) {
1300 uint8_t new_digit = uint8_t(n >> shift);
1301 n = 10 * (n & mask);
1302 if (write_index < max_digits) {
1303 h.digits[write_index++] = new_digit;
1304 } else if (new_digit > 0) {
1305 h.truncated = true;
1306 }
1307 }
1308 h.num_digits = write_index;
1309 trim(h);
1310 }
1311
compute_float(decimal & d)1312 template <typename binary> adjusted_mantissa compute_float(decimal &d) {
1313 adjusted_mantissa answer;
1314 if (d.num_digits == 0) {
1315 // should be zero
1316 answer.power2 = 0;
1317 answer.mantissa = 0;
1318 return answer;
1319 }
1320 // At this point, going further, we can assume that d.num_digits > 0.
1321 // We want to guard against excessive decimal point values because
1322 // they can result in long running times. Indeed, we do
1323 // shifts by at most 60 bits. We have that log(10**400)/log(2**60) ~= 22
1324 // which is fine, but log(10**299995)/log(2**60) ~= 16609 which is not
1325 // fine (runs for a long time).
1326 //
1327 if(d.decimal_point < -324) {
1328 // We have something smaller than 1e-324 which is always zero
1329 // in binary64 and binary32.
1330 // It should be zero.
1331 answer.power2 = 0;
1332 answer.mantissa = 0;
1333 return answer;
1334 } else if(d.decimal_point >= 310) {
1335 // We have something at least as large as 0.1e310 which is
1336 // always infinite.
1337 answer.power2 = binary::infinite_power();
1338 answer.mantissa = 0;
1339 return answer;
1340 }
1341
1342 static const uint32_t max_shift = 60;
1343 static const uint32_t num_powers = 19;
1344 static const uint8_t powers[19] = {
1345 0, 3, 6, 9, 13, 16, 19, 23, 26, 29, //
1346 33, 36, 39, 43, 46, 49, 53, 56, 59, //
1347 };
1348 int32_t exp2 = 0;
1349 while (d.decimal_point > 0) {
1350 uint32_t n = uint32_t(d.decimal_point);
1351 uint32_t shift = (n < num_powers) ? powers[n] : max_shift;
1352 decimal_right_shift(d, shift);
1353 if (d.decimal_point < -decimal_point_range) {
1354 // should be zero
1355 answer.power2 = 0;
1356 answer.mantissa = 0;
1357 return answer;
1358 }
1359 exp2 += int32_t(shift);
1360 }
1361 // We shift left toward [1/2 ... 1].
1362 while (d.decimal_point <= 0) {
1363 uint32_t shift;
1364 if (d.decimal_point == 0) {
1365 if (d.digits[0] >= 5) {
1366 break;
1367 }
1368 shift = (d.digits[0] < 2) ? 2 : 1;
1369 } else {
1370 uint32_t n = uint32_t(-d.decimal_point);
1371 shift = (n < num_powers) ? powers[n] : max_shift;
1372 }
1373 decimal_left_shift(d, shift);
1374 if (d.decimal_point > decimal_point_range) {
1375 // we want to get infinity:
1376 answer.power2 = 0xFF;
1377 answer.mantissa = 0;
1378 return answer;
1379 }
1380 exp2 -= int32_t(shift);
1381 }
1382 // We are now in the range [1/2 ... 1] but the binary format uses [1 ... 2].
1383 exp2--;
1384 constexpr int32_t minimum_exponent = binary::minimum_exponent();
1385 while ((minimum_exponent + 1) > exp2) {
1386 uint32_t n = uint32_t((minimum_exponent + 1) - exp2);
1387 if (n > max_shift) {
1388 n = max_shift;
1389 }
1390 decimal_right_shift(d, n);
1391 exp2 += int32_t(n);
1392 }
1393 if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
1394 answer.power2 = binary::infinite_power();
1395 answer.mantissa = 0;
1396 return answer;
1397 }
1398
1399 const int mantissa_size_in_bits = binary::mantissa_explicit_bits() + 1;
1400 decimal_left_shift(d, mantissa_size_in_bits);
1401
1402 uint64_t mantissa = round(d);
1403 // It is possible that we have an overflow, in which case we need
1404 // to shift back.
1405 if (mantissa >= (uint64_t(1) << mantissa_size_in_bits)) {
1406 decimal_right_shift(d, 1);
1407 exp2 += 1;
1408 mantissa = round(d);
1409 if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
1410 answer.power2 = binary::infinite_power();
1411 answer.mantissa = 0;
1412 return answer;
1413 }
1414 }
1415 answer.power2 = exp2 - binary::minimum_exponent();
1416 if (mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) {
1417 answer.power2--;
1418 }
1419 answer.mantissa =
1420 mantissa & ((uint64_t(1) << binary::mantissa_explicit_bits()) - 1);
1421 return answer;
1422 }
1423
1424 template <typename binary>
parse_long_mantissa(const char * first)1425 adjusted_mantissa parse_long_mantissa(const char *first) {
1426 decimal d = parse_decimal(first);
1427 return compute_float<binary>(d);
1428 }
1429
from_chars(const char * first)1430 double from_chars(const char *first) noexcept {
1431 bool negative = first[0] == '-';
1432 if (negative) {
1433 first++;
1434 }
1435 adjusted_mantissa am = parse_long_mantissa<binary_format<double>>(first);
1436 uint64_t word = am.mantissa;
1437 word |= uint64_t(am.power2)
1438 << binary_format<double>::mantissa_explicit_bits();
1439 word = negative ? word | (uint64_t(1) << binary_format<double>::sign_index())
1440 : word;
1441 double value;
1442 std::memcpy(&value, &word, sizeof(double));
1443 return value;
1444 }
1445
1446 } // internal
1447 } // simdjson
1448 /* end file src/from_chars.cpp */
1449 /* begin file src/internal/error_tables.cpp */
1450
1451 namespace simdjson {
1452 namespace internal {
1453
// Table pairing each error code with its human-readable message.
// NOTE(review): presumably the entries must stay in the same order as the
// error_code enumeration -- confirm against its declaration before reordering.
SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[] {
  { SUCCESS, "No error" },
  { CAPACITY, "This parser can't support a document that big" },
  { MEMALLOC, "Error allocating memory, we're most likely out of memory" },
  { TAPE_ERROR, "The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc." },
  { DEPTH_ERROR, "The JSON document was too deep (too many nested objects and arrays)" },
  { STRING_ERROR, "Problem while parsing a string" },
  { T_ATOM_ERROR, "Problem while parsing an atom starting with the letter 't'" },
  { F_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'f'" },
  { N_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'n'" },
  { NUMBER_ERROR, "Problem while parsing a number" },
  { UTF8_ERROR, "The input is not valid UTF-8" },
  { UNINITIALIZED, "Uninitialized" },
  { EMPTY, "Empty: no JSON found" },
  { UNESCAPED_CHARS, "Within strings, some characters must be escaped, we found unescaped characters" },
  { UNCLOSED_STRING, "A string is opened, but never closed." },
  { UNSUPPORTED_ARCHITECTURE, "simdjson does not have an implementation supported by this CPU architecture (perhaps it's a non-SIMD CPU?)." },
  { INCORRECT_TYPE, "The JSON element does not have the requested type." },
  { NUMBER_OUT_OF_RANGE, "The JSON number is too large or too small to fit within the requested type." },
  { INDEX_OUT_OF_BOUNDS, "Attempted to access an element of a JSON array that is beyond its length." },
  { NO_SUCH_FIELD, "The JSON field referenced does not exist in this object." },
  { IO_ERROR, "Error reading the file." },
  { INVALID_JSON_POINTER, "Invalid JSON pointer syntax." },
  { INVALID_URI_FRAGMENT, "Invalid URI fragment syntax." },
  { UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as you may have found a bug in simdjson" },
  { PARSER_IN_USE, "Cannot parse a new document while a document is still in use." },
  { OUT_OF_ORDER_ITERATION, "Objects and arrays can only be iterated when they are first encountered." },
  { INSUFFICIENT_PADDING, "simdjson requires the input JSON string to have at least SIMDJSON_PADDING extra bytes allocated, beyond the string's length." }
}; // error_codes[]
1483
1484 } // namespace internal
1485 } // namespace simdjson
1486 /* end file src/internal/error_tables.cpp */
1487 /* begin file src/internal/jsoncharutils_tables.cpp */
1488
1489 namespace simdjson {
1490 namespace internal {
1491
// The structural characters are:
//   { 0x7b   } 0x7d   : 0x3a   [ 0x5b   ] 0x5d   , 0x2c
// (an earlier note also listed NULL, but index 0 is not flagged in the two
// tables below).
// We are also interested in the four whitespace characters:
//   space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
1496
// Lookup table indexed by byte value: 0 for the six structural characters
// ({ } : [ ] ,) and the four whitespace characters (space, tab, linefeed,
// carriage return), 1 for every other byte.
SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace_negated[256] = {
    // 0x00 - 0x3f: tab, linefeed, carriage return, space, ',' and ':'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,

    // 0x40 - 0x7f: '[', ']', '{' and '}'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,

    // 0x80 - 0xbf: none
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 0xc0 - 0xff: none
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
1513
// Lookup table indexed by byte value: 1 for the six structural characters
// ({ } : [ ] ,) and the four whitespace characters (space, tab, linefeed,
// carriage return), 0 for every other byte. Rows hold 24 entries each.
SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1526
1527 SIMDJSON_DLLIMPORTEXPORT const uint32_t digit_to_val32[886] = {
1528 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1529 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1530 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1531 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1532 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1533 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1534 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1535 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1536 0x0, 0x1, 0x2, 0x3, 0x4, 0x5,
1537 0x6, 0x7, 0x8, 0x9, 0xFFFFFFFF, 0xFFFFFFFF,
1538 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa,
1539 0xb, 0xc, 0xd, 0xe, 0xf, 0xFFFFFFFF,
1540 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1541 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1542 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1543 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1544 0xFFFFFFFF, 0xa, 0xb, 0xc, 0xd, 0xe,
1545 0xf, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1546 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1547 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1548 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1549 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1550 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1551 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1552 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1553 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1554 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1555 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1556 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1557 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1558 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1559 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1560 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1561 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1562 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1563 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1564 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1565 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1566 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1567 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1568 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1569 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1570 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1571 0x0, 0x10, 0x20, 0x30, 0x40, 0x50,
1572 0x60, 0x70, 0x80, 0x90, 0xFFFFFFFF, 0xFFFFFFFF,
1573 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0,
1574 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xFFFFFFFF,
1575 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1576 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1577 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1578 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1579 0xFFFFFFFF, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0,
1580 0xf0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1581 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1582 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1583 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1584 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1585 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1586 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1587 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1588 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1589 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1590 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1591 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1592 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1593 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1594 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1595 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1596 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1597 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1598 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1599 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1600 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1601 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1602 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1603 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1604 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1605 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1606 0x0, 0x100, 0x200, 0x300, 0x400, 0x500,
1607 0x600, 0x700, 0x800, 0x900, 0xFFFFFFFF, 0xFFFFFFFF,
1608 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00,
1609 0xb00, 0xc00, 0xd00, 0xe00, 0xf00, 0xFFFFFFFF,
1610 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1611 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1612 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1613 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1614 0xFFFFFFFF, 0xa00, 0xb00, 0xc00, 0xd00, 0xe00,
1615 0xf00, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1616 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1617 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1618 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1619 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1620 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1621 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1622 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1623 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1624 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1625 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1626 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1627 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1628 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1629 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1630 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1631 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1632 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1633 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1634 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1635 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1636 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1637 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1638 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1639 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1640 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1641 0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000,
1642 0x6000, 0x7000, 0x8000, 0x9000, 0xFFFFFFFF, 0xFFFFFFFF,
1643 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000,
1644 0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0xFFFFFFFF,
1645 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1646 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1647 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1648 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1649 0xFFFFFFFF, 0xa000, 0xb000, 0xc000, 0xd000, 0xe000,
1650 0xf000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1651 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1652 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1653 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1654 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1655 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1656 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1657 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1658 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1659 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1660 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1661 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1662 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1663 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1664 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1665 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1666 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1667 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1668 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1669 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1670 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1671 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1672 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1673 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1674 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1675 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
1676
1677 } // namespace internal
1678 } // namespace simdjson
1679 /* end file src/internal/jsoncharutils_tables.cpp */
1680 /* begin file src/internal/numberparsing_tables.cpp */
1681
1682 namespace simdjson {
1683 namespace internal {
1684
1685 // Precomputed powers of ten from 10^0 to 10^22. These
1686 // can be represented exactly using the double type.
1687 SIMDJSON_DLLIMPORTEXPORT const double power_of_ten[] = {
1688 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11,
1689 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
1690
1691 /**
1692 * When mapping numbers from decimal to binary,
1693 * we go from w * 10^q to m * 2^p but we have
1694 * 10^q = 5^q * 2^q, so effectively
1695 * we are trying to match
1696 * w * 2^q * 5^q to m * 2^p. Thus the powers of two
1697 * are not a concern since they can be represented
1698 * exactly using the binary notation, only the powers of five
1699 * affect the binary significand.
1700 */
1701
1702
1703 // The truncated powers of five from 5^-342 all the way to 5^308
1704 // The mantissa is truncated to 128 bits, and
1705 // never rounded up. Uses about 10KB.
1706 SIMDJSON_DLLIMPORTEXPORT const uint64_t power_of_five_128[]= {
1707 0xeef453d6923bd65a,0x113faa2906a13b3f,
1708 0x9558b4661b6565f8,0x4ac7ca59a424c507,
1709 0xbaaee17fa23ebf76,0x5d79bcf00d2df649,
1710 0xe95a99df8ace6f53,0xf4d82c2c107973dc,
1711 0x91d8a02bb6c10594,0x79071b9b8a4be869,
1712 0xb64ec836a47146f9,0x9748e2826cdee284,
1713 0xe3e27a444d8d98b7,0xfd1b1b2308169b25,
1714 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7,
1715 0xb208ef855c969f4f,0xbdbd2d335e51a935,
1716 0xde8b2b66b3bc4723,0xad2c788035e61382,
1717 0x8b16fb203055ac76,0x4c3bcb5021afcc31,
1718 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d,
1719 0xd953e8624b85dd78,0xd71d6dad34a2af0d,
1720 0x87d4713d6f33aa6b,0x8672648c40e5ad68,
1721 0xa9c98d8ccb009506,0x680efdaf511f18c2,
1722 0xd43bf0effdc0ba48,0x212bd1b2566def2,
1723 0x84a57695fe98746d,0x14bb630f7604b57,
1724 0xa5ced43b7e3e9188,0x419ea3bd35385e2d,
1725 0xcf42894a5dce35ea,0x52064cac828675b9,
1726 0x818995ce7aa0e1b2,0x7343efebd1940993,
1727 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8,
1728 0xca66fa129f9b60a6,0xd41a26e077774ef6,
1729 0xfd00b897478238d0,0x8920b098955522b4,
1730 0x9e20735e8cb16382,0x55b46e5f5d5535b0,
1731 0xc5a890362fddbc62,0xeb2189f734aa831d,
1732 0xf712b443bbd52b7b,0xa5e9ec7501d523e4,
1733 0x9a6bb0aa55653b2d,0x47b233c92125366e,
1734 0xc1069cd4eabe89f8,0x999ec0bb696e840a,
1735 0xf148440a256e2c76,0xc00670ea43ca250d,
1736 0x96cd2a865764dbca,0x380406926a5e5728,
1737 0xbc807527ed3e12bc,0xc605083704f5ecf2,
1738 0xeba09271e88d976b,0xf7864a44c633682e,
1739 0x93445b8731587ea3,0x7ab3ee6afbe0211d,
1740 0xb8157268fdae9e4c,0x5960ea05bad82964,
1741 0xe61acf033d1a45df,0x6fb92487298e33bd,
1742 0x8fd0c16206306bab,0xa5d3b6d479f8e056,
1743 0xb3c4f1ba87bc8696,0x8f48a4899877186c,
1744 0xe0b62e2929aba83c,0x331acdabfe94de87,
1745 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14,
1746 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9,
1747 0xdb71e91432b1a24a,0xc9e82cd9f69d6150,
1748 0x892731ac9faf056e,0xbe311c083a225cd2,
1749 0xab70fe17c79ac6ca,0x6dbd630a48aaf406,
1750 0xd64d3d9db981787d,0x92cbbccdad5b108,
1751 0x85f0468293f0eb4e,0x25bbf56008c58ea5,
1752 0xa76c582338ed2621,0xaf2af2b80af6f24e,
1753 0xd1476e2c07286faa,0x1af5af660db4aee1,
1754 0x82cca4db847945ca,0x50d98d9fc890ed4d,
1755 0xa37fce126597973c,0xe50ff107bab528a0,
1756 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8,
1757 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a,
1758 0x9faacf3df73609b1,0x77b191618c54e9ac,
1759 0xc795830d75038c1d,0xd59df5b9ef6a2417,
1760 0xf97ae3d0d2446f25,0x4b0573286b44ad1d,
1761 0x9becce62836ac577,0x4ee367f9430aec32,
1762 0xc2e801fb244576d5,0x229c41f793cda73f,
1763 0xf3a20279ed56d48a,0x6b43527578c1110f,
1764 0x9845418c345644d6,0x830a13896b78aaa9,
1765 0xbe5691ef416bd60c,0x23cc986bc656d553,
1766 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8,
1767 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9,
1768 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53,
1769 0xe858ad248f5c22c9,0xd1b3400f8f9cff68,
1770 0x91376c36d99995be,0x23100809b9c21fa1,
1771 0xb58547448ffffb2d,0xabd40a0c2832a78a,
1772 0xe2e69915b3fff9f9,0x16c90c8f323f516c,
1773 0x8dd01fad907ffc3b,0xae3da7d97f6792e3,
1774 0xb1442798f49ffb4a,0x99cd11cfdf41779c,
1775 0xdd95317f31c7fa1d,0x40405643d711d583,
1776 0x8a7d3eef7f1cfc52,0x482835ea666b2572,
1777 0xad1c8eab5ee43b66,0xda3243650005eecf,
1778 0xd863b256369d4a40,0x90bed43e40076a82,
1779 0x873e4f75e2224e68,0x5a7744a6e804a291,
1780 0xa90de3535aaae202,0x711515d0a205cb36,
1781 0xd3515c2831559a83,0xd5a5b44ca873e03,
1782 0x8412d9991ed58091,0xe858790afe9486c2,
1783 0xa5178fff668ae0b6,0x626e974dbe39a872,
1784 0xce5d73ff402d98e3,0xfb0a3d212dc8128f,
1785 0x80fa687f881c7f8e,0x7ce66634bc9d0b99,
1786 0xa139029f6a239f72,0x1c1fffc1ebc44e80,
1787 0xc987434744ac874e,0xa327ffb266b56220,
1788 0xfbe9141915d7a922,0x4bf1ff9f0062baa8,
1789 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9,
1790 0xc4ce17b399107c22,0xcb550fb4384d21d3,
1791 0xf6019da07f549b2b,0x7e2a53a146606a48,
1792 0x99c102844f94e0fb,0x2eda7444cbfc426d,
1793 0xc0314325637a1939,0xfa911155fefb5308,
1794 0xf03d93eebc589f88,0x793555ab7eba27ca,
1795 0x96267c7535b763b5,0x4bc1558b2f3458de,
1796 0xbbb01b9283253ca2,0x9eb1aaedfb016f16,
1797 0xea9c227723ee8bcb,0x465e15a979c1cadc,
1798 0x92a1958a7675175f,0xbfacd89ec191ec9,
1799 0xb749faed14125d36,0xcef980ec671f667b,
1800 0xe51c79a85916f484,0x82b7e12780e7401a,
1801 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810,
1802 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15,
1803 0xdfbdcece67006ac9,0x67a791e093e1d49a,
1804 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0,
1805 0xaecc49914078536d,0x58fae9f773886e18,
1806 0xda7f5bf590966848,0xaf39a475506a899e,
1807 0x888f99797a5e012d,0x6d8406c952429603,
1808 0xaab37fd7d8f58178,0xc8e5087ba6d33b83,
1809 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64,
1810 0x855c3be0a17fcd26,0x5cf2eea09a55067f,
1811 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e,
1812 0xd0601d8efc57b08b,0xf13b94daf124da26,
1813 0x823c12795db6ce57,0x76c53d08d6b70858,
1814 0xa2cb1717b52481ed,0x54768c4b0c64ca6e,
1815 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09,
1816 0xfe5d54150b090b02,0xd3f93b35435d7c4c,
1817 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf,
1818 0xc6b8e9b0709f109a,0x359ab6419ca1091b,
1819 0xf867241c8cc6d4c0,0xc30163d203c94b62,
1820 0x9b407691d7fc44f8,0x79e0de63425dcf1d,
1821 0xc21094364dfb5636,0x985915fc12f542e4,
1822 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d,
1823 0x979cf3ca6cec5b5a,0xa705992ceecf9c42,
1824 0xbd8430bd08277231,0x50c6ff782a838353,
1825 0xece53cec4a314ebd,0xa4f8bf5635246428,
1826 0x940f4613ae5ed136,0x871b7795e136be99,
1827 0xb913179899f68584,0x28e2557b59846e3f,
1828 0xe757dd7ec07426e5,0x331aeada2fe589cf,
1829 0x9096ea6f3848984f,0x3ff0d2c85def7621,
1830 0xb4bca50b065abe63,0xfed077a756b53a9,
1831 0xe1ebce4dc7f16dfb,0xd3e8495912c62894,
1832 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c,
1833 0xb080392cc4349dec,0xbd8d794d96aacfb3,
1834 0xdca04777f541c567,0xecf0d7a0fc5583a0,
1835 0x89e42caaf9491b60,0xf41686c49db57244,
1836 0xac5d37d5b79b6239,0x311c2875c522ced5,
1837 0xd77485cb25823ac7,0x7d633293366b828b,
1838 0x86a8d39ef77164bc,0xae5dff9c02033197,
1839 0xa8530886b54dbdeb,0xd9f57f830283fdfc,
1840 0xd267caa862a12d66,0xd072df63c324fd7b,
1841 0x8380dea93da4bc60,0x4247cb9e59f71e6d,
1842 0xa46116538d0deb78,0x52d9be85f074e608,
1843 0xcd795be870516656,0x67902e276c921f8b,
1844 0x806bd9714632dff6,0xba1cd8a3db53b6,
1845 0xa086cfcd97bf97f3,0x80e8a40eccd228a4,
1846 0xc8a883c0fdaf7df0,0x6122cd128006b2cd,
1847 0xfad2a4b13d1b5d6c,0x796b805720085f81,
1848 0x9cc3a6eec6311a63,0xcbe3303674053bb0,
1849 0xc3f490aa77bd60fc,0xbedbfc4411068a9c,
1850 0xf4f1b4d515acb93b,0xee92fb5515482d44,
1851 0x991711052d8bf3c5,0x751bdd152d4d1c4a,
1852 0xbf5cd54678eef0b6,0xd262d45a78a0635d,
1853 0xef340a98172aace4,0x86fb897116c87c34,
1854 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0,
1855 0xbae0a846d2195712,0x8974836059cca109,
1856 0xe998d258869facd7,0x2bd1a438703fc94b,
1857 0x91ff83775423cc06,0x7b6306a34627ddcf,
1858 0xb67f6455292cbf08,0x1a3bc84c17b1d542,
1859 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93,
1860 0x8e938662882af53e,0x547eb47b7282ee9c,
1861 0xb23867fb2a35b28d,0xe99e619a4f23aa43,
1862 0xdec681f9f4c31f31,0x6405fa00e2ec94d4,
1863 0x8b3c113c38f9f37e,0xde83bc408dd3dd04,
1864 0xae0b158b4738705e,0x9624ab50b148d445,
1865 0xd98ddaee19068c76,0x3badd624dd9b0957,
1866 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6,
1867 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c,
1868 0xd47487cc8470652b,0x7647c3200069671f,
1869 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073,
1870 0xa5fb0a17c777cf09,0xf468107100525890,
1871 0xcf79cc9db955c2cc,0x7182148d4066eeb4,
1872 0x81ac1fe293d599bf,0xc6f14cd848405530,
1873 0xa21727db38cb002f,0xb8ada00e5a506a7c,
1874 0xca9cf1d206fdc03b,0xa6d90811f0e4851c,
1875 0xfd442e4688bd304a,0x908f4a166d1da663,
1876 0x9e4a9cec15763e2e,0x9a598e4e043287fe,
1877 0xc5dd44271ad3cdba,0x40eff1e1853f29fd,
1878 0xf7549530e188c128,0xd12bee59e68ef47c,
1879 0x9a94dd3e8cf578b9,0x82bb74f8301958ce,
1880 0xc13a148e3032d6e7,0xe36a52363c1faf01,
1881 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1,
1882 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9,
1883 0xbcb2b812db11a5de,0x7415d448f6b6f0e7,
1884 0xebdf661791d60f56,0x111b495b3464ad21,
1885 0x936b9fcebb25c995,0xcab10dd900beec34,
1886 0xb84687c269ef3bfb,0x3d5d514f40eea742,
1887 0xe65829b3046b0afa,0xcb4a5a3112a5112,
1888 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab,
1889 0xb3f4e093db73a093,0x59ed216765690f56,
1890 0xe0f218b8d25088b8,0x306869c13ec3532c,
1891 0x8c974f7383725573,0x1e414218c73a13fb,
1892 0xafbd2350644eeacf,0xe5d1929ef90898fa,
1893 0xdbac6c247d62a583,0xdf45f746b74abf39,
1894 0x894bc396ce5da772,0x6b8bba8c328eb783,
1895 0xab9eb47c81f5114f,0x66ea92f3f326564,
1896 0xd686619ba27255a2,0xc80a537b0efefebd,
1897 0x8613fd0145877585,0xbd06742ce95f5f36,
1898 0xa798fc4196e952e7,0x2c48113823b73704,
1899 0xd17f3b51fca3a7a0,0xf75a15862ca504c5,
1900 0x82ef85133de648c4,0x9a984d73dbe722fb,
1901 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba,
1902 0xcc963fee10b7d1b3,0x318df905079926a8,
1903 0xffbbcfe994e5c61f,0xfdf17746497f7052,
1904 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633,
1905 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0,
1906 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0,
1907 0x9c1661a651213e2d,0x6bea10ca65c084e,
1908 0xc31bfa0fe5698db8,0x486e494fcff30a62,
1909 0xf3e2f893dec3f126,0x5a89dba3c3efccfa,
1910 0x986ddb5c6b3a76b7,0xf89629465a75e01c,
1911 0xbe89523386091465,0xf6bbb397f1135823,
1912 0xee2ba6c0678b597f,0x746aa07ded582e2c,
1913 0x94db483840b717ef,0xa8c2a44eb4571cdc,
1914 0xba121a4650e4ddeb,0x92f34d62616ce413,
1915 0xe896a0d7e51e1566,0x77b020baf9c81d17,
1916 0x915e2486ef32cd60,0xace1474dc1d122e,
1917 0xb5b5ada8aaff80b8,0xd819992132456ba,
1918 0xe3231912d5bf60e6,0x10e1fff697ed6c69,
1919 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1,
1920 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2,
1921 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde,
1922 0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b,
1923 0xad4ab7112eb3929d,0x86c16c98d2c953c6,
1924 0xd89d64d57a607744,0xe871c7bf077ba8b7,
1925 0x87625f056c7c4a8b,0x11471cd764ad4972,
1926 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf,
1927 0xd389b47879823479,0x4aff1d108d4ec2c3,
1928 0x843610cb4bf160cb,0xcedf722a585139ba,
1929 0xa54394fe1eedb8fe,0xc2974eb4ee658828,
1930 0xce947a3da6a9273e,0x733d226229feea32,
1931 0x811ccc668829b887,0x806357d5a3f525f,
1932 0xa163ff802a3426a8,0xca07c2dcb0cf26f7,
1933 0xc9bcff6034c13052,0xfc89b393dd02f0b5,
1934 0xfc2c3f3841f17c67,0xbbac2078d443ace2,
1935 0x9d9ba7832936edc0,0xd54b944b84aa4c0d,
1936 0xc5029163f384a931,0xa9e795e65d4df11,
1937 0xf64335bcf065d37d,0x4d4617b5ff4a16d5,
1938 0x99ea0196163fa42e,0x504bced1bf8e4e45,
1939 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6,
1940 0xf07da27a82c37088,0x5d767327bb4e5a4c,
1941 0x964e858c91ba2655,0x3a6a07f8d510f86f,
1942 0xbbe226efb628afea,0x890489f70a55368b,
1943 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e,
1944 0x92c8ae6b464fc96f,0x3b0b8bc90012929d,
1945 0xb77ada0617e3bbcb,0x9ce6ebb40173744,
1946 0xe55990879ddcaabd,0xcc420a6a101d0515,
1947 0x8f57fa54c2a9eab6,0x9fa946824a12232d,
1948 0xb32df8e9f3546564,0x47939822dc96abf9,
1949 0xdff9772470297ebd,0x59787e2b93bc56f7,
1950 0x8bfbea76c619ef36,0x57eb4edb3c55b65a,
1951 0xaefae51477a06b03,0xede622920b6b23f1,
1952 0xdab99e59958885c4,0xe95fab368e45eced,
1953 0x88b402f7fd75539b,0x11dbcb0218ebb414,
1954 0xaae103b5fcd2a881,0xd652bdc29f26a119,
1955 0xd59944a37c0752a2,0x4be76d3346f0495f,
1956 0x857fcae62d8493a5,0x6f70a4400c562ddb,
1957 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952,
1958 0xd097ad07a71f26b2,0x7e2000a41346a7a7,
1959 0x825ecc24c873782f,0x8ed400668c0c28c8,
1960 0xa2f67f2dfa90563b,0x728900802f0f32fa,
1961 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9,
1962 0xfea126b7d78186bc,0xe2f610c84987bfa8,
1963 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9,
1964 0xc6ede63fa05d3143,0x91503d1c79720dbb,
1965 0xf8a95fcf88747d94,0x75a44c6397ce912a,
1966 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba,
1967 0xc24452da229b021b,0xfbe85badce996168,
1968 0xf2d56790ab41c2a2,0xfae27299423fb9c3,
1969 0x97c560ba6b0919a5,0xdccd879fc967d41a,
1970 0xbdb6b8e905cb600f,0x5400e987bbc1c920,
1971 0xed246723473e3813,0x290123e9aab23b68,
1972 0x9436c0760c86e30b,0xf9a0b6720aaf6521,
1973 0xb94470938fa89bce,0xf808e40e8d5b3e69,
1974 0xe7958cb87392c2c2,0xb60b1d1230b20e04,
1975 0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2,
1976 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3,
1977 0xe2280b6c20dd5232,0x25c6da63c38de1b0,
1978 0x8d590723948a535f,0x579c487e5a38ad0e,
1979 0xb0af48ec79ace837,0x2d835a9df0c6d851,
1980 0xdcdb1b2798182244,0xf8e431456cf88e65,
1981 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff,
1982 0xac8b2d36eed2dac5,0xe272467e3d222f3f,
1983 0xd7adf884aa879177,0x5b0ed81dcc6abb0f,
1984 0x86ccbb52ea94baea,0x98e947129fc2b4e9,
1985 0xa87fea27a539e9a5,0x3f2398d747b36224,
1986 0xd29fe4b18e88640e,0x8eec7f0d19a03aad,
1987 0x83a3eeeef9153e89,0x1953cf68300424ac,
1988 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7,
1989 0xcdb02555653131b6,0x3792f412cb06794d,
1990 0x808e17555f3ebf11,0xe2bbd88bbee40bd0,
1991 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4,
1992 0xc8de047564d20a8b,0xf245825a5a445275,
1993 0xfb158592be068d2e,0xeed6e2f0f0d56712,
1994 0x9ced737bb6c4183d,0x55464dd69685606b,
1995 0xc428d05aa4751e4c,0xaa97e14c3c26b886,
1996 0xf53304714d9265df,0xd53dd99f4b3066a8,
1997 0x993fe2c6d07b7fab,0xe546a8038efe4029,
1998 0xbf8fdb78849a5f96,0xde98520472bdd033,
1999 0xef73d256a5c0f77c,0x963e66858f6d4440,
2000 0x95a8637627989aad,0xdde7001379a44aa8,
2001 0xbb127c53b17ec159,0x5560c018580d5d52,
2002 0xe9d71b689dde71af,0xaab8f01e6e10b4a6,
2003 0x9226712162ab070d,0xcab3961304ca70e8,
2004 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22,
2005 0xe45c10c42a2b3b05,0x8cb89a7db77c506a,
2006 0x8eb98a7a9a5b04e3,0x77f3608e92adb242,
2007 0xb267ed1940f1c61c,0x55f038b237591ed3,
2008 0xdf01e85f912e37a3,0x6b6c46dec52f6688,
2009 0x8b61313bbabce2c6,0x2323ac4b3b3da015,
2010 0xae397d8aa96c1b77,0xabec975e0a0d081a,
2011 0xd9c7dced53c72255,0x96e7bd358c904a21,
2012 0x881cea14545c7575,0x7e50d64177da2e54,
2013 0xaa242499697392d2,0xdde50bd1d5d0b9e9,
2014 0xd4ad2dbfc3d07787,0x955e4ec64b44e864,
2015 0x84ec3c97da624ab4,0xbd5af13bef0b113e,
2016 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e,
2017 0xcfb11ead453994ba,0x67de18eda5814af2,
2018 0x81ceb32c4b43fcf4,0x80eacf948770ced7,
2019 0xa2425ff75e14fc31,0xa1258379a94d028d,
2020 0xcad2f7f5359a3b3e,0x96ee45813a04330,
2021 0xfd87b5f28300ca0d,0x8bca9d6e188853fc,
2022 0x9e74d1b791e07e48,0x775ea264cf55347e,
2023 0xc612062576589dda,0x95364afe032a81a0,
2024 0xf79687aed3eec551,0x3a83ddbd83f52210,
2025 0x9abe14cd44753b52,0xc4926a9672793580,
2026 0xc16d9a0095928a27,0x75b7053c0f178400,
2027 0xf1c90080baf72cb1,0x5324c68b12dd6800,
2028 0x971da05074da7bee,0xd3f6fc16ebca8000,
2029 0xbce5086492111aea,0x88f4bb1ca6bd0000,
2030 0xec1e4a7db69561a5,0x2b31e9e3d0700000,
2031 0x9392ee8e921d5d07,0x3aff322e62600000,
2032 0xb877aa3236a4b449,0x9befeb9fad487c3,
2033 0xe69594bec44de15b,0x4c2ebe687989a9b4,
2034 0x901d7cf73ab0acd9,0xf9d37014bf60a11,
2035 0xb424dc35095cd80f,0x538484c19ef38c95,
2036 0xe12e13424bb40e13,0x2865a5f206b06fba,
2037 0x8cbccc096f5088cb,0xf93f87b7442e45d4,
2038 0xafebff0bcb24aafe,0xf78f69a51539d749,
2039 0xdbe6fecebdedd5be,0xb573440e5a884d1c,
2040 0x89705f4136b4a597,0x31680a88f8953031,
2041 0xabcc77118461cefc,0xfdc20d2b36ba7c3e,
2042 0xd6bf94d5e57a42bc,0x3d32907604691b4d,
2043 0x8637bd05af6c69b5,0xa63f9a49c2c1b110,
2044 0xa7c5ac471b478423,0xfcf80dc33721d54,
2045 0xd1b71758e219652b,0xd3c36113404ea4a9,
2046 0x83126e978d4fdf3b,0x645a1cac083126ea,
2047 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4,
2048 0xcccccccccccccccc,0xcccccccccccccccd,
2049 0x8000000000000000,0x0,
2050 0xa000000000000000,0x0,
2051 0xc800000000000000,0x0,
2052 0xfa00000000000000,0x0,
2053 0x9c40000000000000,0x0,
2054 0xc350000000000000,0x0,
2055 0xf424000000000000,0x0,
2056 0x9896800000000000,0x0,
2057 0xbebc200000000000,0x0,
2058 0xee6b280000000000,0x0,
2059 0x9502f90000000000,0x0,
2060 0xba43b74000000000,0x0,
2061 0xe8d4a51000000000,0x0,
2062 0x9184e72a00000000,0x0,
2063 0xb5e620f480000000,0x0,
2064 0xe35fa931a0000000,0x0,
2065 0x8e1bc9bf04000000,0x0,
2066 0xb1a2bc2ec5000000,0x0,
2067 0xde0b6b3a76400000,0x0,
2068 0x8ac7230489e80000,0x0,
2069 0xad78ebc5ac620000,0x0,
2070 0xd8d726b7177a8000,0x0,
2071 0x878678326eac9000,0x0,
2072 0xa968163f0a57b400,0x0,
2073 0xd3c21bcecceda100,0x0,
2074 0x84595161401484a0,0x0,
2075 0xa56fa5b99019a5c8,0x0,
2076 0xcecb8f27f4200f3a,0x0,
2077 0x813f3978f8940984,0x4000000000000000,
2078 0xa18f07d736b90be5,0x5000000000000000,
2079 0xc9f2c9cd04674ede,0xa400000000000000,
2080 0xfc6f7c4045812296,0x4d00000000000000,
2081 0x9dc5ada82b70b59d,0xf020000000000000,
2082 0xc5371912364ce305,0x6c28000000000000,
2083 0xf684df56c3e01bc6,0xc732000000000000,
2084 0x9a130b963a6c115c,0x3c7f400000000000,
2085 0xc097ce7bc90715b3,0x4b9f100000000000,
2086 0xf0bdc21abb48db20,0x1e86d40000000000,
2087 0x96769950b50d88f4,0x1314448000000000,
2088 0xbc143fa4e250eb31,0x17d955a000000000,
2089 0xeb194f8e1ae525fd,0x5dcfab0800000000,
2090 0x92efd1b8d0cf37be,0x5aa1cae500000000,
2091 0xb7abc627050305ad,0xf14a3d9e40000000,
2092 0xe596b7b0c643c719,0x6d9ccd05d0000000,
2093 0x8f7e32ce7bea5c6f,0xe4820023a2000000,
2094 0xb35dbf821ae4f38b,0xdda2802c8a800000,
2095 0xe0352f62a19e306e,0xd50b2037ad200000,
2096 0x8c213d9da502de45,0x4526f422cc340000,
2097 0xaf298d050e4395d6,0x9670b12b7f410000,
2098 0xdaf3f04651d47b4c,0x3c0cdd765f114000,
2099 0x88d8762bf324cd0f,0xa5880a69fb6ac800,
2100 0xab0e93b6efee0053,0x8eea0d047a457a00,
2101 0xd5d238a4abe98068,0x72a4904598d6d880,
2102 0x85a36366eb71f041,0x47a6da2b7f864750,
2103 0xa70c3c40a64e6c51,0x999090b65f67d924,
2104 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d,
2105 0x82818f1281ed449f,0xbff8f10e7a8921a4,
2106 0xa321f2d7226895c7,0xaff72d52192b6a0d,
2107 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490,
2108 0xfee50b7025c36a08,0x2f236d04753d5b4,
2109 0x9f4f2726179a2245,0x1d762422c946590,
2110 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5,
2111 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2,
2112 0x9b934c3b330c8577,0x63cc55f49f88eb2f,
2113 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb,
2114 0xf316271c7fc3908a,0x8bef464e3945ef7a,
2115 0x97edd871cfda3a56,0x97758bf0e3cbb5ac,
2116 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317,
2117 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd,
2118 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a,
2119 0xb975d6b6ee39e436,0xb3e2fd538e122b44,
2120 0xe7d34c64a9c85d44,0x60dbbca87196b616,
2121 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd,
2122 0xb51d13aea4a488dd,0x6babab6398bdbe41,
2123 0xe264589a4dcdab14,0xc696963c7eed2dd1,
2124 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2,
2125 0xb0de65388cc8ada8,0x3b25a55f43294bcb,
2126 0xdd15fe86affad912,0x49ef0eb713f39ebe,
2127 0x8a2dbf142dfcc7ab,0x6e3569326c784337,
2128 0xacb92ed9397bf996,0x49c2c37f07965404,
2129 0xd7e77a8f87daf7fb,0xdc33745ec97be906,
2130 0x86f0ac99b4e8dafd,0x69a028bb3ded71a3,
2131 0xa8acd7c0222311bc,0xc40832ea0d68ce0c,
2132 0xd2d80db02aabd62b,0xf50a3fa490c30190,
2133 0x83c7088e1aab65db,0x792667c6da79e0fa,
2134 0xa4b8cab1a1563f52,0x577001b891185938,
2135 0xcde6fd5e09abcf26,0xed4c0226b55e6f86,
2136 0x80b05e5ac60b6178,0x544f8158315b05b4,
2137 0xa0dc75f1778e39d6,0x696361ae3db1c721,
2138 0xc913936dd571c84c,0x3bc3a19cd1e38e9,
2139 0xfb5878494ace3a5f,0x4ab48a04065c723,
2140 0x9d174b2dcec0e47b,0x62eb0d64283f9c76,
2141 0xc45d1df942711d9a,0x3ba5d0bd324f8394,
2142 0xf5746577930d6500,0xca8f44ec7ee36479,
2143 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb,
2144 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e,
2145 0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e,
2146 0x95d04aee3b80ece5,0xbba1f1d158724a12,
2147 0xbb445da9ca61281f,0x2a8a6e45ae8edc97,
2148 0xea1575143cf97226,0xf52d09d71a3293bd,
2149 0x924d692ca61be758,0x593c2626705f9c56,
2150 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c,
2151 0xe498f455c38b997a,0xb6dfb9c0f956447,
2152 0x8edf98b59a373fec,0x4724bd4189bd5eac,
2153 0xb2977ee300c50fe7,0x58edec91ec2cb657,
2154 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed,
2155 0x8b865b215899f46c,0xbd79e0d20082ee74,
2156 0xae67f1e9aec07187,0xecd8590680a3aa11,
2157 0xda01ee641a708de9,0xe80e6f4820cc9495,
2158 0x884134fe908658b2,0x3109058d147fdcdd,
2159 0xaa51823e34a7eede,0xbd4b46f0599fd415,
2160 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a,
2161 0x850fadc09923329e,0x3e2cf6bc604ddb0,
2162 0xa6539930bf6bff45,0x84db8346b786151c,
2163 0xcfe87f7cef46ff16,0xe612641865679a63,
2164 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e,
2165 0xa26da3999aef7749,0xe3be5e330f38f09d,
2166 0xcb090c8001ab551c,0x5cadf5bfd3072cc5,
2167 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6,
2168 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa,
2169 0xc646d63501a1511d,0xb281e1fd541501b8,
2170 0xf7d88bc24209a565,0x1f225a7ca91a4226,
2171 0x9ae757596946075f,0x3375788de9b06958,
2172 0xc1a12d2fc3978937,0x52d6b1641c83ae,
2173 0xf209787bb47d6b84,0xc0678c5dbd23a49a,
2174 0x9745eb4d50ce6332,0xf840b7ba963646e0,
2175 0xbd176620a501fbff,0xb650e5a93bc3d898,
2176 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe,
2177 0x93ba47c980e98cdf,0xc66f336c36b10137,
2178 0xb8a8d9bbe123f017,0xb80b0047445d4184,
2179 0xe6d3102ad96cec1d,0xa60dc059157491e5,
2180 0x9043ea1ac7e41392,0x87c89837ad68db2f,
2181 0xb454e4a179dd1877,0x29babe4598c311fb,
2182 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a,
2183 0x8ce2529e2734bb1d,0x1899e4a65f58660c,
2184 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f,
2185 0xdc21a1171d42645d,0x76707543f4fa1f73,
2186 0x899504ae72497eba,0x6a06494a791c53a8,
2187 0xabfa45da0edbde69,0x487db9d17636892,
2188 0xd6f8d7509292d603,0x45a9d2845d3c42b6,
2189 0x865b86925b9bc5c2,0xb8a2392ba45a9b2,
2190 0xa7f26836f282b732,0x8e6cac7768d7141e,
2191 0xd1ef0244af2364ff,0x3207d795430cd926,
2192 0x8335616aed761f1f,0x7f44e6bd49e807b8,
2193 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6,
2194 0xcd036837130890a1,0x36dba887c37a8c0f,
2195 0x802221226be55a64,0xc2494954da2c9789,
2196 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c,
2197 0xc83553c5c8965d3d,0x6f92829494e5acc7,
2198 0xfa42a8b73abbf48c,0xcb772339ba1f17f9,
2199 0x9c69a97284b578d7,0xff2a760414536efb,
2200 0xc38413cf25e2d70d,0xfef5138519684aba,
2201 0xf46518c2ef5b8cd1,0x7eb258665fc25d69,
2202 0x98bf2f79d5993802,0xef2f773ffbd97a61,
2203 0xbeeefb584aff8603,0xaafb550ffacfd8fa,
2204 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38,
2205 0x952ab45cfa97a0b2,0xdd945a747bf26183,
2206 0xba756174393d88df,0x94f971119aeef9e4,
2207 0xe912b9d1478ceb17,0x7a37cd5601aab85d,
2208 0x91abb422ccb812ee,0xac62e055c10ab33a,
2209 0xb616a12b7fe617aa,0x577b986b314d6009,
2210 0xe39c49765fdf9d94,0xed5a7e85fda0b80b,
2211 0x8e41ade9fbebc27d,0x14588f13be847307,
2212 0xb1d219647ae6b31c,0x596eb2d8ae258fc8,
2213 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb,
2214 0x8aec23d680043bee,0x25de7bb9480d5854,
2215 0xada72ccc20054ae9,0xaf561aa79a10ae6a,
2216 0xd910f7ff28069da4,0x1b2ba1518094da04,
2217 0x87aa9aff79042286,0x90fb44d2f05d0842,
2218 0xa99541bf57452b28,0x353a1607ac744a53,
2219 0xd3fa922f2d1675f2,0x42889b8997915ce8,
2220 0x847c9b5d7c2e09b7,0x69956135febada11,
2221 0xa59bc234db398c25,0x43fab9837e699095,
2222 0xcf02b2c21207ef2e,0x94f967e45e03f4bb,
2223 0x8161afb94b44f57d,0x1d1be0eebac278f5,
2224 0xa1ba1ba79e1632dc,0x6462d92a69731732,
2225 0xca28a291859bbf93,0x7d7b8f7503cfdcfe,
2226 0xfcb2cb35e702af78,0x5cda735244c3d43e,
2227 0x9defbf01b061adab,0x3a0888136afa64a7,
2228 0xc56baec21c7a1916,0x88aaa1845b8fdd0,
2229 0xf6c69a72a3989f5b,0x8aad549e57273d45,
2230 0x9a3c2087a63f6399,0x36ac54e2f678864b,
2231 0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd,
2232 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5,
2233 0x969eb7c47859e743,0x9f644ae5a4b1b325,
2234 0xbc4665b596706114,0x873d5d9f0dde1fee,
2235 0xeb57ff22fc0c7959,0xa90cb506d155a7ea,
2236 0x9316ff75dd87cbd8,0x9a7f12442d588f2,
2237 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f,
2238 0xe5d3ef282a242e81,0x8f1668c8a86da5fa,
2239 0x8fa475791a569d10,0xf96e017d694487bc,
2240 0xb38d92d760ec4455,0x37c981dcc395a9ac,
2241 0xe070f78d3927556a,0x85bbe253f47b1417,
2242 0x8c469ab843b89562,0x93956d7478ccec8e,
2243 0xaf58416654a6babb,0x387ac8d1970027b2,
2244 0xdb2e51bfe9d0696a,0x6997b05fcc0319e,
2245 0x88fcf317f22241e2,0x441fece3bdf81f03,
2246 0xab3c2fddeeaad25a,0xd527e81cad7626c3,
2247 0xd60b3bd56a5586f1,0x8a71e223d8d3b074,
2248 0x85c7056562757456,0xf6872d5667844e49,
2249 0xa738c6bebb12d16c,0xb428f8ac016561db,
2250 0xd106f86e69d785c7,0xe13336d701beba52,
2251 0x82a45b450226b39c,0xecc0024661173473,
2252 0xa34d721642b06084,0x27f002d7f95d0190,
2253 0xcc20ce9bd35c78a5,0x31ec038df7b441f4,
2254 0xff290242c83396ce,0x7e67047175a15271,
2255 0x9f79a169bd203e41,0xf0062c6e984d386,
2256 0xc75809c42c684dd1,0x52c07b78a3e60868,
2257 0xf92e0c3537826145,0xa7709a56ccdf8a82,
2258 0x9bbcc7a142b17ccb,0x88a66076400bb691,
2259 0xc2abf989935ddbfe,0x6acff893d00ea435,
2260 0xf356f7ebf83552fe,0x583f6b8c4124d43,
2261 0x98165af37b2153de,0xc3727a337a8b704a,
2262 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c,
2263 0xeda2ee1c7064130c,0x1162def06f79df73,
2264 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8,
2265 0xb9a74a0637ce2ee1,0x6d953e2bd7173692,
2266 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437,
2267 0x910ab1d4db9914a0,0x1d9c9892400a22a2,
2268 0xb54d5e4a127f59c8,0x2503beb6d00cab4b,
2269 0xe2a0b5dc971f303a,0x2e44ae64840fd61d,
2270 0x8da471a9de737e24,0x5ceaecfed289e5d2,
2271 0xb10d8e1456105dad,0x7425a83e872c5f47,
2272 0xdd50f1996b947518,0xd12f124e28f77719,
2273 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f,
2274 0xace73cbfdc0bfb7b,0x636cc64d1001550b,
2275 0xd8210befd30efa5a,0x3c47f7e05401aa4e,
2276 0x8714a775e3e95c78,0x65acfaec34810a71,
2277 0xa8d9d1535ce3b396,0x7f1839a741a14d0d,
2278 0xd31045a8341ca07c,0x1ede48111209a050,
2279 0x83ea2b892091e44d,0x934aed0aab460432,
2280 0xa4e4b66b68b65d60,0xf81da84d5617853f,
2281 0xce1de40642e3f4b9,0x36251260ab9d668e,
2282 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019,
2283 0xa1075a24e4421730,0xb24cf65b8612f81f,
2284 0xc94930ae1d529cfc,0xdee033f26797b627,
2285 0xfb9b7cd9a4a7443c,0x169840ef017da3b1,
2286 0x9d412e0806e88aa5,0x8e1f289560ee864e,
2287 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2,
2288 0xf5b5d7ec8acb58a2,0xae10af696774b1db,
2289 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29,
2290 0xbff610b0cc6edd3f,0x17fd090a58d32af3,
2291 0xeff394dcff8a948e,0xddfc4b4cef07f5b0,
2292 0x95f83d0a1fb69cd9,0x4abdaf101564f98e,
2293 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1,
2294 0xea53df5fd18d5513,0x84c86189216dc5ed,
2295 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4,
2296 0xb7118682dbb66a77,0x3fbc8c33221dc2a1,
2297 0xe4d5e82392a40515,0xfabaf3feaa5334a,
2298 0x8f05b1163ba6832d,0x29cb4d87f2a7400e,
2299 0xb2c71d5bca9023f8,0x743e20e9ef511012,
2300 0xdf78e4b2bd342cf6,0x914da9246b255416,
2301 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e,
2302 0xae9672aba3d0c320,0xa184ac2473b529b1,
2303 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e,
2304 0x8865899617fb1871,0x7e2fa67c7a658892,
2305 0xaa7eebfb9df9de8d,0xddbb901b98feeab7,
2306 0xd51ea6fa85785631,0x552a74227f3ea565,
2307 0x8533285c936b35de,0xd53a88958f87275f,
2308 0xa67ff273b8460356,0x8a892abaf368f137,
2309 0xd01fef10a657842c,0x2d2b7569b0432d85,
2310 0x8213f56a67f6b29b,0x9c3b29620e29fc73,
2311 0xa298f2c501f45f42,0x8349f3ba91b47b8f,
2312 0xcb3f2f7642717713,0x241c70a936219a73,
2313 0xfe0efb53d30dd4d7,0xed238cd383aa0110,
2314 0x9ec95d1463e8a506,0xf4363804324a40aa,
2315 0xc67bb4597ce2ce48,0xb143c6053edcd0d5,
2316 0xf81aa16fdc1b81da,0xdd94b7868e94050a,
2317 0x9b10a4e5e9913128,0xca7cf2b4191c8326,
2318 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0,
2319 0xf24a01a73cf2dccf,0xbc633b39673c8cec,
2320 0x976e41088617ca01,0xd5be0503e085d813,
2321 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18,
2322 0xec9c459d51852ba2,0xddf8e7d60ed1219e,
2323 0x93e1ab8252f33b45,0xcabb90e5c942b503,
2324 0xb8da1662e7b00a17,0x3d6a751f3b936243,
2325 0xe7109bfba19c0c9d,0xcc512670a783ad4,
2326 0x906a617d450187e2,0x27fb2b80668b24c5,
2327 0xb484f9dc9641e9da,0xb1f9f660802dedf6,
2328 0xe1a63853bbd26451,0x5e7873f8a0396973,
2329 0x8d07e33455637eb2,0xdb0b487b6423e1e8,
2330 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62,
2331 0xdc5c5301c56b75f7,0x7641a140cc7810fb,
2332 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d,
2333 0xac2820d9623bf429,0x546345fa9fbdcd44,
2334 0xd732290fbacaf133,0xa97c177947ad4095,
2335 0x867f59a9d4bed6c0,0x49ed8eabcccc485d,
2336 0xa81f301449ee8c70,0x5c68f256bfff5a74,
2337 0xd226fc195c6a2f8c,0x73832eec6fff3111,
2338 0x83585d8fd9c25db7,0xc831fd53c5ff7eab,
2339 0xa42e74f3d032f525,0xba3e7ca8b77f5e55,
2340 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb,
2341 0x80444b5e7aa7cf85,0x7980d163cf5b81b3,
2342 0xa0555e361951c366,0xd7e105bcc332621f,
2343 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7,
2344 0xfa856334878fc150,0xb14f98f6f0feb951,
2345 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3,
2346 0xc3b8358109e84f07,0xa862f80ec4700c8,
2347 0xf4a642e14c6262c8,0xcd27bb612758c0fa,
2348 0x98e7e9cccfbd7dbd,0x8038d51cb897789c,
2349 0xbf21e44003acdd2c,0xe0470a63e6bd56c3,
2350 0xeeea5d5004981478,0x1858ccfce06cac74,
2351 0x95527a5202df0ccb,0xf37801e0c43ebc8,
2352 0xbaa718e68396cffd,0xd30560258f54e6ba,
2353 0xe950df20247c83fd,0x47c6b82ef32a2069,
2354 0x91d28b7416cdd27e,0x4cdc331d57fa5441,
2355 0xb6472e511c81471d,0xe0133fe4adf8e952,
2356 0xe3d8f9e563a198e5,0x58180fddd97723a6,
2357 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,};
2358
2359 } // namespace internal
2360 } // namespace simdjson
2361 /* end file src/internal/numberparsing_tables.cpp */
2362 /* begin file src/internal/simdprune_tables.cpp */
2363 #if SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64
2364
2365 #include <cstdint>
2366
2367 namespace simdjson { // table modified and copied from
2368 namespace internal { // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable
2369 SIMDJSON_DLLIMPORTEXPORT const unsigned char BitsSetTable256mul2[256] = {
2370 0, 2, 2, 4, 2, 4, 4, 6, 2, 4, 4, 6, 4, 6, 6, 8, 2, 4, 4,
2371 6, 4, 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 2, 4, 4, 6, 4, 6,
2372 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6,
2373 8, 8, 10, 8, 10, 10, 12, 2, 4, 4, 6, 4, 6, 6, 8, 4, 6, 6, 8,
2374 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10,
2375 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, 8,
2376 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 2, 4, 4, 6, 4,
2377 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10,
2378 6, 8, 8, 10, 8, 10, 10, 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8,
2379 10, 8, 10, 10, 12, 6, 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12,
2380 12, 14, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6,
2381 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 6, 8, 8, 10,
2382 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 8, 10, 10, 12, 10, 12, 12,
2383 14, 10, 12, 12, 14, 12, 14, 14, 16};
2384
2385 SIMDJSON_DLLIMPORTEXPORT const uint8_t pshufb_combine_table[272] = {
2386 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
2387 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08,
2388 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03,
2389 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
2390 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
2391 0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
2392 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08,
2393 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff,
2394 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
2395 0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
2396 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b,
2397 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
2398 };
2399
2400 // 256 * 8 bytes = 2kB, easily fits in cache.
2401 SIMDJSON_DLLIMPORTEXPORT const uint64_t thintable_epi8[256] = {
2402 0x0706050403020100, 0x0007060504030201, 0x0007060504030200,
2403 0x0000070605040302, 0x0007060504030100, 0x0000070605040301,
2404 0x0000070605040300, 0x0000000706050403, 0x0007060504020100,
2405 0x0000070605040201, 0x0000070605040200, 0x0000000706050402,
2406 0x0000070605040100, 0x0000000706050401, 0x0000000706050400,
2407 0x0000000007060504, 0x0007060503020100, 0x0000070605030201,
2408 0x0000070605030200, 0x0000000706050302, 0x0000070605030100,
2409 0x0000000706050301, 0x0000000706050300, 0x0000000007060503,
2410 0x0000070605020100, 0x0000000706050201, 0x0000000706050200,
2411 0x0000000007060502, 0x0000000706050100, 0x0000000007060501,
2412 0x0000000007060500, 0x0000000000070605, 0x0007060403020100,
2413 0x0000070604030201, 0x0000070604030200, 0x0000000706040302,
2414 0x0000070604030100, 0x0000000706040301, 0x0000000706040300,
2415 0x0000000007060403, 0x0000070604020100, 0x0000000706040201,
2416 0x0000000706040200, 0x0000000007060402, 0x0000000706040100,
2417 0x0000000007060401, 0x0000000007060400, 0x0000000000070604,
2418 0x0000070603020100, 0x0000000706030201, 0x0000000706030200,
2419 0x0000000007060302, 0x0000000706030100, 0x0000000007060301,
2420 0x0000000007060300, 0x0000000000070603, 0x0000000706020100,
2421 0x0000000007060201, 0x0000000007060200, 0x0000000000070602,
2422 0x0000000007060100, 0x0000000000070601, 0x0000000000070600,
2423 0x0000000000000706, 0x0007050403020100, 0x0000070504030201,
2424 0x0000070504030200, 0x0000000705040302, 0x0000070504030100,
2425 0x0000000705040301, 0x0000000705040300, 0x0000000007050403,
2426 0x0000070504020100, 0x0000000705040201, 0x0000000705040200,
2427 0x0000000007050402, 0x0000000705040100, 0x0000000007050401,
2428 0x0000000007050400, 0x0000000000070504, 0x0000070503020100,
2429 0x0000000705030201, 0x0000000705030200, 0x0000000007050302,
2430 0x0000000705030100, 0x0000000007050301, 0x0000000007050300,
2431 0x0000000000070503, 0x0000000705020100, 0x0000000007050201,
2432 0x0000000007050200, 0x0000000000070502, 0x0000000007050100,
2433 0x0000000000070501, 0x0000000000070500, 0x0000000000000705,
2434 0x0000070403020100, 0x0000000704030201, 0x0000000704030200,
2435 0x0000000007040302, 0x0000000704030100, 0x0000000007040301,
2436 0x0000000007040300, 0x0000000000070403, 0x0000000704020100,
2437 0x0000000007040201, 0x0000000007040200, 0x0000000000070402,
2438 0x0000000007040100, 0x0000000000070401, 0x0000000000070400,
2439 0x0000000000000704, 0x0000000703020100, 0x0000000007030201,
2440 0x0000000007030200, 0x0000000000070302, 0x0000000007030100,
2441 0x0000000000070301, 0x0000000000070300, 0x0000000000000703,
2442 0x0000000007020100, 0x0000000000070201, 0x0000000000070200,
2443 0x0000000000000702, 0x0000000000070100, 0x0000000000000701,
2444 0x0000000000000700, 0x0000000000000007, 0x0006050403020100,
2445 0x0000060504030201, 0x0000060504030200, 0x0000000605040302,
2446 0x0000060504030100, 0x0000000605040301, 0x0000000605040300,
2447 0x0000000006050403, 0x0000060504020100, 0x0000000605040201,
2448 0x0000000605040200, 0x0000000006050402, 0x0000000605040100,
2449 0x0000000006050401, 0x0000000006050400, 0x0000000000060504,
2450 0x0000060503020100, 0x0000000605030201, 0x0000000605030200,
2451 0x0000000006050302, 0x0000000605030100, 0x0000000006050301,
2452 0x0000000006050300, 0x0000000000060503, 0x0000000605020100,
2453 0x0000000006050201, 0x0000000006050200, 0x0000000000060502,
2454 0x0000000006050100, 0x0000000000060501, 0x0000000000060500,
2455 0x0000000000000605, 0x0000060403020100, 0x0000000604030201,
2456 0x0000000604030200, 0x0000000006040302, 0x0000000604030100,
2457 0x0000000006040301, 0x0000000006040300, 0x0000000000060403,
2458 0x0000000604020100, 0x0000000006040201, 0x0000000006040200,
2459 0x0000000000060402, 0x0000000006040100, 0x0000000000060401,
2460 0x0000000000060400, 0x0000000000000604, 0x0000000603020100,
2461 0x0000000006030201, 0x0000000006030200, 0x0000000000060302,
2462 0x0000000006030100, 0x0000000000060301, 0x0000000000060300,
2463 0x0000000000000603, 0x0000000006020100, 0x0000000000060201,
2464 0x0000000000060200, 0x0000000000000602, 0x0000000000060100,
2465 0x0000000000000601, 0x0000000000000600, 0x0000000000000006,
2466 0x0000050403020100, 0x0000000504030201, 0x0000000504030200,
2467 0x0000000005040302, 0x0000000504030100, 0x0000000005040301,
2468 0x0000000005040300, 0x0000000000050403, 0x0000000504020100,
2469 0x0000000005040201, 0x0000000005040200, 0x0000000000050402,
2470 0x0000000005040100, 0x0000000000050401, 0x0000000000050400,
2471 0x0000000000000504, 0x0000000503020100, 0x0000000005030201,
2472 0x0000000005030200, 0x0000000000050302, 0x0000000005030100,
2473 0x0000000000050301, 0x0000000000050300, 0x0000000000000503,
2474 0x0000000005020100, 0x0000000000050201, 0x0000000000050200,
2475 0x0000000000000502, 0x0000000000050100, 0x0000000000000501,
2476 0x0000000000000500, 0x0000000000000005, 0x0000000403020100,
2477 0x0000000004030201, 0x0000000004030200, 0x0000000000040302,
2478 0x0000000004030100, 0x0000000000040301, 0x0000000000040300,
2479 0x0000000000000403, 0x0000000004020100, 0x0000000000040201,
2480 0x0000000000040200, 0x0000000000000402, 0x0000000000040100,
2481 0x0000000000000401, 0x0000000000000400, 0x0000000000000004,
2482 0x0000000003020100, 0x0000000000030201, 0x0000000000030200,
2483 0x0000000000000302, 0x0000000000030100, 0x0000000000000301,
2484 0x0000000000000300, 0x0000000000000003, 0x0000000000020100,
2485 0x0000000000000201, 0x0000000000000200, 0x0000000000000002,
2486 0x0000000000000100, 0x0000000000000001, 0x0000000000000000,
2487 0x0000000000000000,
2488 }; //static uint64_t thintable_epi8[256]
2489
2490 } // namespace internal
2491 } // namespace simdjson
2492
2493 #endif // SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64
2494 /* end file src/internal/simdprune_tables.cpp */
2495 /* begin file src/implementation.cpp */
2496 #include <initializer_list>
2497
2498 namespace simdjson {
2499
supported_by_runtime_system() const2500 bool implementation::supported_by_runtime_system() const {
2501 uint32_t required_instruction_sets = this->required_instruction_sets();
2502 uint32_t supported_instruction_sets = internal::detect_supported_architectures();
2503 return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
2504 }
2505
2506 namespace internal {
2507
// One constant instance of each implementation that was compiled into this
// build. The hope is that they end up baked into the executable image rather
// than being constructed by a static initializer at start-up.

#if SIMDJSON_IMPLEMENTATION_HASWELL
const haswell::implementation haswell_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_HASWELL

#if SIMDJSON_IMPLEMENTATION_WESTMERE
const westmere::implementation westmere_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_WESTMERE

#if SIMDJSON_IMPLEMENTATION_ARM64
const arm64::implementation arm64_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_ARM64

#if SIMDJSON_IMPLEMENTATION_PPC64
const ppc64::implementation ppc64_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_PPC64

#if SIMDJSON_IMPLEMENTATION_FALLBACK
const fallback::implementation fallback_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_FALLBACK
2526
2527 /**
2528 * @private Detects best supported implementation on first use, and sets it
2529 */
2530 class detect_best_supported_implementation_on_first_use final : public implementation {
2531 public:
name() const2532 const std::string &name() const noexcept final { return set_best()->name(); }
description() const2533 const std::string &description() const noexcept final { return set_best()->description(); }
required_instruction_sets() const2534 uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
create_dom_parser_implementation(size_t capacity,size_t max_length,std::unique_ptr<internal::dom_parser_implementation> & dst) const2535 simdjson_warn_unused error_code create_dom_parser_implementation(
2536 size_t capacity,
2537 size_t max_length,
2538 std::unique_ptr<internal::dom_parser_implementation>& dst
2539 ) const noexcept final {
2540 return set_best()->create_dom_parser_implementation(capacity, max_length, dst);
2541 }
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const2542 simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
2543 return set_best()->minify(buf, len, dst, dst_len);
2544 }
validate_utf8(const char * buf,size_t len) const2545 simdjson_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override {
2546 return set_best()->validate_utf8(buf, len);
2547 }
detect_best_supported_implementation_on_first_use()2548 simdjson_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
2549 private:
2550 const implementation *set_best() const noexcept;
2551 };
2552
2553 const detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton;
2554
2555 const std::initializer_list<const implementation *> available_implementation_pointers {
2556 #if SIMDJSON_IMPLEMENTATION_HASWELL
2557 &haswell_singleton,
2558 #endif
2559 #if SIMDJSON_IMPLEMENTATION_WESTMERE
2560 &westmere_singleton,
2561 #endif
2562 #if SIMDJSON_IMPLEMENTATION_ARM64
2563 &arm64_singleton,
2564 #endif
2565 #if SIMDJSON_IMPLEMENTATION_PPC64
2566 &ppc64_singleton,
2567 #endif
2568 #if SIMDJSON_IMPLEMENTATION_FALLBACK
2569 &fallback_singleton,
2570 #endif
2571 }; // available_implementation_pointers
2572
2573 // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support
2574 class unsupported_implementation final : public implementation {
2575 public:
create_dom_parser_implementation(size_t,size_t,std::unique_ptr<internal::dom_parser_implementation> &) const2576 simdjson_warn_unused error_code create_dom_parser_implementation(
2577 size_t,
2578 size_t,
2579 std::unique_ptr<internal::dom_parser_implementation>&
2580 ) const noexcept final {
2581 return UNSUPPORTED_ARCHITECTURE;
2582 }
minify(const uint8_t *,size_t,uint8_t *,size_t &) const2583 simdjson_warn_unused error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override {
2584 return UNSUPPORTED_ARCHITECTURE;
2585 }
validate_utf8(const char *,size_t) const2586 simdjson_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override {
2587 return false; // Just refuse to validate. Given that we have a fallback implementation
2588 // it seems unlikely that unsupported_implementation will ever be used. If it is used,
2589 // then it will flag all strings as invalid. The alternative is to return an error_code
2590 // from which the user has to figure out whether the string is valid UTF-8... which seems
2591 // like a lot of work just to handle the very unlikely case that we have an unsupported
2592 // implementation. And, when it does happen (that we have an unsupported implementation),
2593 // what are the chances that the programmer has a fallback? Given that *we* provide the
2594 // fallback, it implies that the programmer would need a fallback for our fallback.
2595 }
unsupported_implementation()2596 unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
2597 };
2598
2599 const unsupported_implementation unsupported_singleton{};
2600
size() const2601 size_t available_implementation_list::size() const noexcept {
2602 return internal::available_implementation_pointers.size();
2603 }
begin() const2604 const implementation * const *available_implementation_list::begin() const noexcept {
2605 return internal::available_implementation_pointers.begin();
2606 }
end() const2607 const implementation * const *available_implementation_list::end() const noexcept {
2608 return internal::available_implementation_pointers.end();
2609 }
detect_best_supported() const2610 const implementation *available_implementation_list::detect_best_supported() const noexcept {
2611 // They are prelisted in priority order, so we just go down the list
2612 uint32_t supported_instruction_sets = internal::detect_supported_architectures();
2613 for (const implementation *impl : internal::available_implementation_pointers) {
2614 uint32_t required_instruction_sets = impl->required_instruction_sets();
2615 if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; }
2616 }
2617 return &unsupported_singleton; // this should never happen?
2618 }
2619
set_best() const2620 const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept {
2621 SIMDJSON_PUSH_DISABLE_WARNINGS
2622 SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
2623 char *force_implementation_name = getenv("SIMDJSON_FORCE_IMPLEMENTATION");
2624 SIMDJSON_POP_DISABLE_WARNINGS
2625
2626 if (force_implementation_name) {
2627 auto force_implementation = available_implementations[force_implementation_name];
2628 if (force_implementation) {
2629 return active_implementation = force_implementation;
2630 } else {
2631 // Note: abort() and stderr usage within the library is forbidden.
2632 return active_implementation = &unsupported_singleton;
2633 }
2634 }
2635 return active_implementation = available_implementations.detect_best_supported();
2636 }
2637
2638 } // namespace internal
2639
2640 SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations{};
2641 SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_implementation{&internal::detect_best_supported_implementation_on_first_use_singleton};
2642
minify(const char * buf,size_t len,char * dst,size_t & dst_len)2643 simdjson_warn_unused error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept {
2644 return active_implementation->minify(reinterpret_cast<const uint8_t *>(buf), len, reinterpret_cast<uint8_t *>(dst), dst_len);
2645 }
validate_utf8(const char * buf,size_t len)2646 simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
2647 return active_implementation->validate_utf8(buf, len);
2648 }
2649
builtin_implementation()2650 const implementation * builtin_implementation() {
2651 static const implementation * builtin_impl = available_implementations[STRINGIFY(SIMDJSON_BUILTIN_IMPLEMENTATION)];
2652 assert(builtin_impl);
2653 return builtin_impl;
2654 }
2655
2656
2657 } // namespace simdjson
2658 /* end file src/implementation.cpp */
2659
2660 #if SIMDJSON_IMPLEMENTATION_ARM64
2661 /* begin file src/arm64/implementation.cpp */
2662 /* begin file include/simdjson/arm64/begin.h */
2663 // redefining SIMDJSON_IMPLEMENTATION to "arm64"
2664 // #define SIMDJSON_IMPLEMENTATION arm64
2665 /* end file include/simdjson/arm64/begin.h */
2666
2667 namespace simdjson {
2668 namespace arm64 {
2669
create_dom_parser_implementation(size_t capacity,size_t max_depth,std::unique_ptr<internal::dom_parser_implementation> & dst) const2670 simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
2671 size_t capacity,
2672 size_t max_depth,
2673 std::unique_ptr<internal::dom_parser_implementation>& dst
2674 ) const noexcept {
2675 dst.reset( new (std::nothrow) dom_parser_implementation() );
2676 if (!dst) { return MEMALLOC; }
2677 dst->set_capacity(capacity);
2678 dst->set_max_depth(max_depth);
2679 return SUCCESS;
2680 }
2681
2682 } // namespace arm64
2683 } // namespace simdjson
2684
2685 /* begin file include/simdjson/arm64/end.h */
2686 /* end file include/simdjson/arm64/end.h */
2687 /* end file src/arm64/implementation.cpp */
2688 /* begin file src/arm64/dom_parser_implementation.cpp */
2689 /* begin file include/simdjson/arm64/begin.h */
2690 // redefining SIMDJSON_IMPLEMENTATION to "arm64"
2691 // #define SIMDJSON_IMPLEMENTATION arm64
2692 /* end file include/simdjson/arm64/begin.h */
2693
2694 //
2695 // Stage 1
2696 //
2697 namespace simdjson {
2698 namespace arm64 {
2699 namespace {
2700
2701 using namespace simd;
2702
2703 struct json_character_block {
2704 static simdjson_really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
2705
whitespacesimdjson::arm64::__anon9bb6be6f0311::json_character_block2706 simdjson_really_inline uint64_t whitespace() const noexcept { return _whitespace; }
opsimdjson::arm64::__anon9bb6be6f0311::json_character_block2707 simdjson_really_inline uint64_t op() const noexcept { return _op; }
scalarsimdjson::arm64::__anon9bb6be6f0311::json_character_block2708 simdjson_really_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
2709
2710 uint64_t _whitespace;
2711 uint64_t _op;
2712 };
2713
classify(const simd::simd8x64<uint8_t> & in)2714 simdjson_really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
2715 // Functional programming causes trouble with Visual Studio.
2716 // Keeping this version in comments since it is much nicer:
2717 // auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
2718 // auto nib_lo = chunk & 0xf;
2719 // auto nib_hi = chunk.shr<4>();
2720 // auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
2721 // auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
2722 // return shuf_lo & shuf_hi;
2723 // });
2724 const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
2725 const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
2726
2727 simd8x64<uint8_t> v(
2728 (in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
2729 (in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
2730 (in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
2731 (in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2)
2732 );
2733
2734
2735 // We compute whitespace and op separately. If the code later only use one or the
2736 // other, given the fact that all functions are aggressively inlined, we can
2737 // hope that useless computations will be omitted. This is namely case when
2738 // minifying (we only need whitespace). *However* if we only need spaces,
2739 // it is likely that we will still compute 'v' above with two lookup_16: one
2740 // could do it a bit cheaper. This is in contrast with the x64 implementations
2741 // where we can, efficiently, do the white space and structural matching
2742 // separately. One reason for this difference is that on ARM NEON, the table
2743 // lookups either zero or leave unchanged the characters exceeding 0xF whereas
2744 // on x64, the equivalent instruction (pshufb) automatically applies a mask,
2745 // ignoring the 4 most significant bits. Thus the x64 implementation is
2746 // optimized differently. This being said, if you use this code strictly
2747 // just for minification (or just to identify the structural characters),
2748 // there is a small untaken optimization opportunity here. We deliberately
2749 // do not pick it up.
2750
2751 uint64_t op = simd8x64<bool>(
2752 v.chunks[0].any_bits_set(0x7),
2753 v.chunks[1].any_bits_set(0x7),
2754 v.chunks[2].any_bits_set(0x7),
2755 v.chunks[3].any_bits_set(0x7)
2756 ).to_bitmask();
2757
2758 uint64_t whitespace = simd8x64<bool>(
2759 v.chunks[0].any_bits_set(0x18),
2760 v.chunks[1].any_bits_set(0x18),
2761 v.chunks[2].any_bits_set(0x18),
2762 v.chunks[3].any_bits_set(0x18)
2763 ).to_bitmask();
2764
2765 return { whitespace, op };
2766 }
2767
is_ascii(const simd8x64<uint8_t> & input)2768 simdjson_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
2769 simd8<uint8_t> bits = input.reduce_or();
2770 return bits.max_val() < 0b10000000u;
2771 }
2772
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)2773 simdjson_unused simdjson_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
2774 simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
2775 simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
2776 simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
2777 // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well.
2778 // This will work fine because we only have to report errors for cases with 0-1 lead bytes.
2779 // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is
2780 // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character.
2781 // The error will be detected there.
2782 return is_second_byte ^ is_third_byte ^ is_fourth_byte;
2783 }
2784
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)2785 simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
2786 simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
2787 simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
2788 return is_third_byte ^ is_fourth_byte;
2789 }
2790
2791 } // unnamed namespace
2792 } // namespace arm64
2793 } // namespace simdjson
2794
2795 /* begin file src/generic/stage1/utf8_lookup4_algorithm.h */
2796 namespace simdjson {
2797 namespace arm64 {
2798 namespace {
2799 namespace utf8_validation {
2800
2801 using namespace simd;
2802
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)2803 simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
2804 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
2805 // Bit 1 = Too Long (ASCII followed by continuation)
2806 // Bit 2 = Overlong 3-byte
2807 // Bit 4 = Surrogate
2808 // Bit 5 = Overlong 2-byte
2809 // Bit 7 = Two Continuations
2810 constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______
2811 // 11______ 11______
2812 constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______
2813 constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____
2814 constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____
2815 constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______
2816 constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______
2817 constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____
2818 // 11110100 101_____
2819 // 11110101 1001____
2820 // 11110101 101_____
2821 // 1111011_ 1001____
2822 // 1111011_ 101_____
2823 // 11111___ 1001____
2824 // 11111___ 101_____
2825 constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
2826 // 11110101 1000____
2827 // 1111011_ 1000____
2828 // 11111___ 1000____
2829 constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____
2830
2831 const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
2832 // 0_______ ________ <ASCII in byte 1>
2833 TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
2834 TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
2835 // 10______ ________ <continuation in byte 1>
2836 TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
2837 // 1100____ ________ <two byte lead in byte 1>
2838 TOO_SHORT | OVERLONG_2,
2839 // 1101____ ________ <two byte lead in byte 1>
2840 TOO_SHORT,
2841 // 1110____ ________ <three byte lead in byte 1>
2842 TOO_SHORT | OVERLONG_3 | SURROGATE,
2843 // 1111____ ________ <four+ byte lead in byte 1>
2844 TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
2845 );
2846 constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
2847 const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
2848 // ____0000 ________
2849 CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
2850 // ____0001 ________
2851 CARRY | OVERLONG_2,
2852 // ____001_ ________
2853 CARRY,
2854 CARRY,
2855
2856 // ____0100 ________
2857 CARRY | TOO_LARGE,
2858 // ____0101 ________
2859 CARRY | TOO_LARGE | TOO_LARGE_1000,
2860 // ____011_ ________
2861 CARRY | TOO_LARGE | TOO_LARGE_1000,
2862 CARRY | TOO_LARGE | TOO_LARGE_1000,
2863
2864 // ____1___ ________
2865 CARRY | TOO_LARGE | TOO_LARGE_1000,
2866 CARRY | TOO_LARGE | TOO_LARGE_1000,
2867 CARRY | TOO_LARGE | TOO_LARGE_1000,
2868 CARRY | TOO_LARGE | TOO_LARGE_1000,
2869 CARRY | TOO_LARGE | TOO_LARGE_1000,
2870 // ____1101 ________
2871 CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
2872 CARRY | TOO_LARGE | TOO_LARGE_1000,
2873 CARRY | TOO_LARGE | TOO_LARGE_1000
2874 );
2875 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
2876 // ________ 0_______ <ASCII in byte 2>
2877 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
2878 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
2879
2880 // ________ 1000____
2881 TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
2882 // ________ 1001____
2883 TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
2884 // ________ 101_____
2885 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
2886 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
2887
2888 // ________ 11______
2889 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
2890 );
2891 return (byte_1_high & byte_1_low & byte_2_high);
2892 }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)2893 simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
2894 const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
2895 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
2896 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
2897 simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
2898 simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
2899 return must23_80 ^ sc;
2900 }
2901
2902 //
2903 // Return nonzero if there are incomplete multibyte characters at the end of the block:
2904 // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
2905 //
is_incomplete(const simd8<uint8_t> input)2906 simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
2907 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
2908 // ... 1111____ 111_____ 11______
2909 static const uint8_t max_array[32] = {
2910 255, 255, 255, 255, 255, 255, 255, 255,
2911 255, 255, 255, 255, 255, 255, 255, 255,
2912 255, 255, 255, 255, 255, 255, 255, 255,
2913 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
2914 };
2915 const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
2916 return input.gt_bits(max_value);
2917 }
2918
2919 struct utf8_checker {
2920 // If this is nonzero, there has been a UTF-8 error.
2921 simd8<uint8_t> error;
2922 // The last input we received
2923 simd8<uint8_t> prev_input_block;
2924 // Whether the last input we received was incomplete (used for ASCII fast path)
2925 simd8<uint8_t> prev_incomplete;
2926
2927 //
2928 // Check whether the current bytes are valid UTF-8.
2929 //
check_utf8_bytessimdjson::arm64::__anon9bb6be6f0411::utf8_validation::utf8_checker2930 simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
2931 // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
2932 // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
2933 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
2934 simd8<uint8_t> sc = check_special_cases(input, prev1);
2935 this->error |= check_multibyte_lengths(input, prev_input, sc);
2936 }
2937
2938 // The only problem that can happen at EOF is that a multibyte character is too short
2939 // or a byte value too large in the last bytes: check_special_cases only checks for bytes
2940 // too large in the first of two bytes.
check_eofsimdjson::arm64::__anon9bb6be6f0411::utf8_validation::utf8_checker2941 simdjson_really_inline void check_eof() {
2942 // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
2943 // possibly finish them.
2944 this->error |= this->prev_incomplete;
2945 }
2946
check_next_inputsimdjson::arm64::__anon9bb6be6f0411::utf8_validation::utf8_checker2947 simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
2948 if(simdjson_likely(is_ascii(input))) {
2949 this->error |= this->prev_incomplete;
2950 } else {
2951 // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
2952 static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
2953 "We support either two or four chunks per 64-byte block.");
2954 if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
2955 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
2956 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
2957 } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
2958 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
2959 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
2960 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
2961 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
2962 }
2963 this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
2964 this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
2965
2966 }
2967 }
2968 // do not forget to call check_eof!
errorssimdjson::arm64::__anon9bb6be6f0411::utf8_validation::utf8_checker2969 simdjson_really_inline error_code errors() {
2970 return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
2971 }
2972
2973 }; // struct utf8_checker
2974 } // namespace utf8_validation
2975
2976 using utf8_validation::utf8_checker;
2977
2978 } // unnamed namespace
2979 } // namespace arm64
2980 } // namespace simdjson
2981 /* end file src/generic/stage1/utf8_lookup4_algorithm.h */
2982 /* begin file src/generic/stage1/json_structural_indexer.h */
2983 // This file contains the common code every implementation uses in stage1
2984 // It is intended to be included multiple times and compiled multiple times
2985 // We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
2987
2988 /* begin file src/generic/stage1/buf_block_reader.h */
2989 namespace simdjson {
2990 namespace arm64 {
2991 namespace {
2992
2993 // Walks through a buffer in block-sized increments, loading the last part with spaces
2994 template<size_t STEP_SIZE>
2995 struct buf_block_reader {
2996 public:
2997 simdjson_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
2998 simdjson_really_inline size_t block_index();
2999 simdjson_really_inline bool has_full_block() const;
3000 simdjson_really_inline const uint8_t *full_block() const;
3001 /**
3002 * Get the last block, padded with spaces.
3003 *
3004 * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
3005 * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
3006 * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
3007 *
3008 * @return the number of effective characters in the last block.
3009 */
3010 simdjson_really_inline size_t get_remainder(uint8_t *dst) const;
3011 simdjson_really_inline void advance();
3012 private:
3013 const uint8_t *buf;
3014 const size_t len;
3015 const size_t lenminusstep;
3016 size_t idx;
3017 };
3018
3019 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)3020 simdjson_unused static char * format_input_text_64(const uint8_t *text) {
3021 static char buf[sizeof(simd8x64<uint8_t>) + 1];
3022 for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
3023 buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
3024 }
3025 buf[sizeof(simd8x64<uint8_t>)] = '\0';
3026 return buf;
3027 }
3028
3029 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)3030 simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
3031 static char buf[sizeof(simd8x64<uint8_t>) + 1];
3032 in.store(reinterpret_cast<uint8_t*>(buf));
3033 for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
3034 if (buf[i] < ' ') { buf[i] = '_'; }
3035 }
3036 buf[sizeof(simd8x64<uint8_t>)] = '\0';
3037 return buf;
3038 }
3039
format_mask(uint64_t mask)3040 simdjson_unused static char * format_mask(uint64_t mask) {
3041 static char buf[sizeof(simd8x64<uint8_t>) + 1];
3042 for (size_t i=0; i<64; i++) {
3043 buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
3044 }
3045 buf[64] = '\0';
3046 return buf;
3047 }
3048
3049 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)3050 simdjson_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
3051
3052 template<size_t STEP_SIZE>
block_index()3053 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
3054
3055 template<size_t STEP_SIZE>
has_full_block() const3056 simdjson_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
3057 return idx < lenminusstep;
3058 }
3059
3060 template<size_t STEP_SIZE>
full_block() const3061 simdjson_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
3062 return &buf[idx];
3063 }
3064
3065 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const3066 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
3067 if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
3068 std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
3069 std::memcpy(dst, buf + idx, len - idx);
3070 return len - idx;
3071 }
3072
3073 template<size_t STEP_SIZE>
advance()3074 simdjson_really_inline void buf_block_reader<STEP_SIZE>::advance() {
3075 idx += STEP_SIZE;
3076 }
3077
3078 } // unnamed namespace
3079 } // namespace arm64
3080 } // namespace simdjson
3081 /* end file src/generic/stage1/buf_block_reader.h */
3082 /* begin file src/generic/stage1/json_string_scanner.h */
3083 namespace simdjson {
3084 namespace arm64 {
3085 namespace {
3086 namespace stage1 {
3087
3088 struct json_string_block {
3089 // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_string_blocksimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3090 simdjson_really_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) :
3091 _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {}
3092
3093 // Escaped characters (characters following an escape() character)
escapedsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3094 simdjson_really_inline uint64_t escaped() const { return _escaped; }
3095 // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
escapesimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3096 simdjson_really_inline uint64_t escape() const { return _backslash & ~_escaped; }
3097 // Real (non-backslashed) quotes
quotesimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3098 simdjson_really_inline uint64_t quote() const { return _quote; }
3099 // Start quotes of strings
string_startsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3100 simdjson_really_inline uint64_t string_start() const { return _quote & _in_string; }
3101 // End quotes of strings
string_endsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3102 simdjson_really_inline uint64_t string_end() const { return _quote & ~_in_string; }
3103 // Only characters inside the string (not including the quotes)
string_contentsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3104 simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
3105 // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_inside_stringsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3106 simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
3107 // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_outside_stringsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3108 simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
3109 // Tail of string (everything except the start quote)
string_tailsimdjson::arm64::__anon9bb6be6f0611::stage1::json_string_block3110 simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
3111
3112 // backslash characters
3113 uint64_t _backslash;
3114 // escaped characters (backslashed--does not include the hex characters after \u)
3115 uint64_t _escaped;
3116 // real quotes (non-backslashed ones)
3117 uint64_t _quote;
3118 // string characters (includes start quote but not end quote)
3119 uint64_t _in_string;
3120 };
3121
3122 // Scans blocks for string characters, storing the state necessary to do so
3123 class json_string_scanner {
3124 public:
3125 simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
3126 // Returns either UNCLOSED_STRING or SUCCESS
3127 simdjson_really_inline error_code finish();
3128
3129 private:
3130 // Intended to be defined by the implementation
3131 simdjson_really_inline uint64_t find_escaped(uint64_t escape);
3132 simdjson_really_inline uint64_t find_escaped_branchless(uint64_t escape);
3133
3134 // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
3135 uint64_t prev_in_string = 0ULL;
3136 // Whether the first character of the next iteration is escaped.
3137 uint64_t prev_escaped = 0ULL;
3138 };
3139
3140 //
3141 // Finds escaped characters (characters following \).
3142 //
3143 // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
3144 //
3145 // Does this by:
3146 // - Shift the escape mask to get potentially escaped characters (characters after backslashes).
3147 // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
3148 // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
3149 //
3150 // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
3151 // escape sequences, filters out the ones that start on even bits, and adds that to the mask of
3152 // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
3153 // the start bit causes a carry), and leaves even-bit sequences alone.
3154 //
3155 // Example:
3156 //
3157 // text | \\\ | \\\"\\\" \\\" \\"\\" |
3158 // escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape
3159 // odd_starts | x | x x x | escape & ~even_bits & ~follows_escape
3160 // even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later
3161 // invert_mask | | cxxx c xx c| even_seq << 1
3162 // follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit
3163 // escaped | x | x x x x x x x x |
3164 // desired | x | x x x x x x x x |
3165 // text | \\\ | \\\"\\\" \\\" \\"\\" |
3166 //
find_escaped_branchless(uint64_t backslash)3167 simdjson_really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) {
3168 // If there was overflow, pretend the first character isn't a backslash
3169 backslash &= ~prev_escaped;
3170 uint64_t follows_escape = backslash << 1 | prev_escaped;
3171
3172 // Get sequences starting on even bits by clearing out the odd series using +
3173 const uint64_t even_bits = 0x5555555555555555ULL;
3174 uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
3175 uint64_t sequences_starting_on_even_bits;
3176 prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
3177 uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
3178
3179 // Mask every other backslashed character as an escaped character
3180 // Flip the mask for sequences that start on even bits, to correct them
3181 return (even_bits ^ invert_mask) & follows_escape;
3182 }
3183
3184 //
3185 // Return a mask of all string characters plus end quotes.
3186 //
3187 // prev_escaped is overflow saying whether the next character is escaped.
3188 // prev_in_string is overflow saying whether we're still in a string.
3189 //
3190 // Backslash sequences outside of quotes will be detected in stage 2.
3191 //
next(const simd::simd8x64<uint8_t> & in)3192 simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
3193 const uint64_t backslash = in.eq('\\');
3194 const uint64_t escaped = find_escaped(backslash);
3195 const uint64_t quote = in.eq('"') & ~escaped;
3196
3197 //
3198 // prefix_xor flips on bits inside the string (and flips off the end quote).
3199 //
3200 // Then we xor with prev_in_string: if we were in a string already, its effect is flipped
3201 // (characters inside strings are outside, and characters outside strings are inside).
3202 //
3203 const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
3204
3205 //
3206 // Check if we're still in a string at the end of the box so the next block will know
3207 //
3208 // right shift of a signed value expected to be well-defined and standard
3209 // compliant as of C++20, John Regher from Utah U. says this is fine code
3210 //
3211 prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
3212
3213 // Use ^ to turn the beginning quote off, and the end quote on.
3214
3215 // We are returning a function-local object so either we get a move constructor
3216 // or we get copy elision.
3217 return json_string_block(
3218 backslash,
3219 escaped,
3220 quote,
3221 in_string
3222 );
3223 }
3224
finish()3225 simdjson_really_inline error_code json_string_scanner::finish() {
3226 if (prev_in_string) {
3227 return UNCLOSED_STRING;
3228 }
3229 return SUCCESS;
3230 }
3231
3232 } // namespace stage1
3233 } // unnamed namespace
3234 } // namespace arm64
3235 } // namespace simdjson
3236 /* end file src/generic/stage1/json_string_scanner.h */
3237 /* begin file src/generic/stage1/json_scanner.h */
3238 namespace simdjson {
3239 namespace arm64 {
3240 namespace {
3241 namespace stage1 {
3242
3243 /**
3244 * A block of scanned json, with information on operators and scalars.
3245 *
3246 * We seek to identify pseudo-structural characters. Anything that is inside
3247 * a string must be omitted (hence & ~_string.string_tail()).
3248 * Otherwise, pseudo-structural characters come in two forms.
3249 * 1. We have the structural characters ([,],{,},:, comma). The
3250 * term 'structural character' is from the JSON RFC.
3251 * 2. We have the 'scalar pseudo-structural characters'.
3252 * Scalars are quotes, and any character except structural characters and white space.
3253 *
3254 * To identify the scalar pseudo-structural characters, we must look at what comes
3255 * before them: it must be a space, a quote or a structural characters.
3256 * Starting with simdjson v0.3, we identify them by
3257 * negation: we identify everything that is followed by a non-quote scalar,
3258 * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
3259 */
3260 struct json_block {
3261 public:
3262 // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_blocksimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3263 simdjson_really_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
3264 _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
json_blocksimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3265 simdjson_really_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
3266 _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
3267
3268 /**
3269 * The start of structurals.
3270 * In simdjson prior to v0.3, these were called the pseudo-structural characters.
3271 **/
structural_startsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3272 simdjson_really_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
3273 /** All JSON whitespace (i.e. not in a string) */
whitespacesimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3274 simdjson_really_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
3275
3276 // Helpers
3277
3278 /** Whether the given characters are inside a string (only works on non-quotes) */
non_quote_inside_stringsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3279 simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
3280 /** Whether the given characters are outside a string (only works on non-quotes) */
non_quote_outside_stringsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3281 simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
3282
3283 // string and escape characters
3284 json_string_block _string;
3285 // whitespace, structural characters ('operators'), scalars
3286 json_character_block _characters;
3287 // whether the previous character was a scalar
3288 uint64_t _follows_potential_nonquote_scalar;
3289 private:
3290 // Potential structurals (i.e. disregarding strings)
3291
3292 /**
3293 * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
3294 * They may reside inside a string.
3295 **/
potential_structural_startsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3296 simdjson_really_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
3297 /**
3298 * The start of non-operator runs, like 123, true and "abc".
3299 * It main reside inside a string.
3300 **/
potential_scalar_startsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3301 simdjson_really_inline uint64_t potential_scalar_start() const noexcept {
3302 // The term "scalar" refers to anything except structural characters and white space
3303 // (so letters, numbers, quotes).
3304 // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
3305 // then we know that it is irrelevant structurally.
3306 return _characters.scalar() & ~follows_potential_scalar();
3307 }
3308 /**
3309 * Whether the given character is immediately after a non-operator like 123, true.
3310 * The characters following a quote are not included.
3311 */
follows_potential_scalarsimdjson::arm64::__anon9bb6be6f0711::stage1::json_block3312 simdjson_really_inline uint64_t follows_potential_scalar() const noexcept {
3313 // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
3314 // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
3315 // white space.
3316 // It is understood that within quoted region, anything at all could be marked (irrelevant).
3317 return _follows_potential_nonquote_scalar;
3318 }
3319 };
3320
3321 /**
3322 * Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
3323 *
3324 * The scanner starts by calculating two distinct things:
3325 * - string characters (taking \" into account)
3326 * - structural characters or 'operators' ([]{},:, comma)
3327 * and scalars (runs of non-operators like 123, true and "abc")
3328 *
3329 * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
3330 * in particular, the operator/scalar bit will find plenty of things that are actually part of
3331 * strings. When we're done, json_block will fuse the two together by masking out tokens that are
3332 * part of a string.
3333 */
3334 class json_scanner {
3335 public:
json_scanner()3336 json_scanner() {}
3337 simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
3338 // Returns either UNCLOSED_STRING or SUCCESS
3339 simdjson_really_inline error_code finish();
3340
3341 private:
3342 // Whether the last character of the previous iteration is part of a scalar token
3343 // (anything except whitespace or a structural character/'operator').
3344 uint64_t prev_scalar = 0ULL;
3345 json_string_scanner string_scanner{};
3346 };
3347
3348
3349 //
3350 // Check if the current character immediately follows a matching character.
3351 //
3352 // For example, this checks for quotes with backslashes in front of them:
3353 //
//   const uint64_t backslashed_quote = in.eq('"') & follows(in.eq('\\'), prev_backslash);
3355 //
follows(const uint64_t match,uint64_t & overflow)3356 simdjson_really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
3357 const uint64_t result = match << 1 | overflow;
3358 overflow = match >> 63;
3359 return result;
3360 }
3361
next(const simd::simd8x64<uint8_t> & in)3362 simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
3363 json_string_block strings = string_scanner.next(in);
3364 // identifies the white-space and the structurat characters
3365 json_character_block characters = json_character_block::classify(in);
3366 // The term "scalar" refers to anything except structural characters and white space
3367 // (so letters, numbers, quotes).
3368 // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
3369 //
3370 // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
3371 // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
3372 // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we
3373 // may need to add an extra check when parsing strings.
3374 //
3375 // Performance: there are many ways to skin this cat.
3376 const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote();
3377 uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar);
3378 // We are returning a function-local object so either we get a move constructor
3379 // or we get copy elision.
3380 return json_block(
3381 strings,// strings is a function-local object so either it moves or the copy is elided.
3382 characters,
3383 follows_nonquote_scalar
3384 );
3385 }
3386
finish()3387 simdjson_really_inline error_code json_scanner::finish() {
3388 return string_scanner.finish();
3389 }
3390
3391 } // namespace stage1
3392 } // unnamed namespace
3393 } // namespace arm64
3394 } // namespace simdjson
3395 /* end file src/generic/stage1/json_scanner.h */
3396 /* begin file src/generic/stage1/json_minifier.h */
3397 // This file contains the common code every implementation uses in stage1
3398 // It is intended to be included multiple times and compiled multiple times
3399 // We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
3401
3402 namespace simdjson {
3403 namespace arm64 {
3404 namespace {
3405 namespace stage1 {
3406
3407 class json_minifier {
3408 public:
3409 template<size_t STEP_SIZE>
3410 static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
3411
3412 private:
json_minifier(uint8_t * _dst)3413 simdjson_really_inline json_minifier(uint8_t *_dst)
3414 : dst{_dst}
3415 {}
3416 template<size_t STEP_SIZE>
3417 simdjson_really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
3418 simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
3419 simdjson_really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
3420 json_scanner scanner{};
3421 uint8_t *dst;
3422 };
3423
next(const simd::simd8x64<uint8_t> & in,const json_block & block)3424 simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
3425 uint64_t mask = block.whitespace();
3426 in.compress(mask, dst);
3427 dst += 64 - count_ones(mask);
3428 }
3429
finish(uint8_t * dst_start,size_t & dst_len)3430 simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
3431 error_code error = scanner.finish();
3432 if (error) { dst_len = 0; return error; }
3433 dst_len = dst - dst_start;
3434 return SUCCESS;
3435 }
3436
3437 template<>
step(const uint8_t * block_buf,buf_block_reader<128> & reader)3438 simdjson_really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
3439 simd::simd8x64<uint8_t> in_1(block_buf);
3440 simd::simd8x64<uint8_t> in_2(block_buf+64);
3441 json_block block_1 = scanner.next(in_1);
3442 json_block block_2 = scanner.next(in_2);
3443 this->next(in_1, block_1);
3444 this->next(in_2, block_2);
3445 reader.advance();
3446 }
3447
3448 template<>
step(const uint8_t * block_buf,buf_block_reader<64> & reader)3449 simdjson_really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
3450 simd::simd8x64<uint8_t> in_1(block_buf);
3451 json_block block_1 = scanner.next(in_1);
3452 this->next(block_buf, block_1);
3453 reader.advance();
3454 }
3455
3456 template<size_t STEP_SIZE>
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len)3457 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
3458 buf_block_reader<STEP_SIZE> reader(buf, len);
3459 json_minifier minifier(dst);
3460
3461 // Index the first n-1 blocks
3462 while (reader.has_full_block()) {
3463 minifier.step<STEP_SIZE>(reader.full_block(), reader);
3464 }
3465
3466 // Index the last (remainder) block, padded with spaces
3467 uint8_t block[STEP_SIZE];
3468 size_t remaining_bytes = reader.get_remainder(block);
3469 if (remaining_bytes > 0) {
3470 // We do not want to write directly to the output stream. Rather, we write
3471 // to a local buffer (for safety).
3472 uint8_t out_block[STEP_SIZE];
3473 uint8_t * const guarded_dst{minifier.dst};
3474 minifier.dst = out_block;
3475 minifier.step<STEP_SIZE>(block, reader);
3476 size_t to_write = minifier.dst - out_block;
3477 // In some cases, we could be enticed to consider the padded spaces
3478 // as part of the string. This is fine as long as we do not write more
3479 // than we consumed.
3480 if(to_write > remaining_bytes) { to_write = remaining_bytes; }
3481 memcpy(guarded_dst, out_block, to_write);
3482 minifier.dst = guarded_dst + to_write;
3483 }
3484 return minifier.finish(dst, dst_len);
3485 }
3486
3487 } // namespace stage1
3488 } // unnamed namespace
3489 } // namespace arm64
3490 } // namespace simdjson
3491 /* end file src/generic/stage1/json_minifier.h */
3492 /* begin file src/generic/stage1/find_next_document_index.h */
3493 namespace simdjson {
3494 namespace arm64 {
3495 namespace {
3496
3497 /**
3498 * This algorithm is used to quickly identify the last structural position that
3499 * makes up a complete document.
3500 *
3501 * It does this by going backwards and finding the last *document boundary* (a
3502 * place where one value follows another without a comma between them). If the
3503 * last document (the characters after the boundary) has an equal number of
3504 * start and end brackets, it is considered complete.
3505 *
3506 * Simply put, we iterate over the structural characters, starting from
3507 * the end. We consider that we found the end of a JSON document when the
 * first element of the pair is NOT one of these characters: '{' '[' ':' ','
 * and when the second element is NOT one of these characters: '}' ']' ':' ','.
3510 *
3511 * This simple comparison works most of the time, but it does not cover cases
3512 * where the batch's structural indexes contain a perfect amount of documents.
3513 * In such a case, we do not have access to the structural index which follows
3514 * the last document, therefore, we do not have access to the second element in
3515 * the pair, and that means we cannot identify the last document. To fix this
3516 * issue, we keep a count of the open and closed curly/square braces we found
3517 * while searching for the pair. When we find a pair AND the count of open and
3518 * closed curly/square braces is the same, we know that we just passed a
3519 * complete document, therefore the last json buffer location is the end of the
3520 * batch.
3521 */
find_next_document_index(dom_parser_implementation & parser)3522 simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
3523 // TODO don't count separately, just figure out depth
3524 auto arr_cnt = 0;
3525 auto obj_cnt = 0;
3526 for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
3527 auto idxb = parser.structural_indexes[i];
3528 switch (parser.buf[idxb]) {
3529 case ':':
3530 case ',':
3531 continue;
3532 case '}':
3533 obj_cnt--;
3534 continue;
3535 case ']':
3536 arr_cnt--;
3537 continue;
3538 case '{':
3539 obj_cnt++;
3540 break;
3541 case '[':
3542 arr_cnt++;
3543 break;
3544 }
3545 auto idxa = parser.structural_indexes[i - 1];
3546 switch (parser.buf[idxa]) {
3547 case '{':
3548 case '[':
3549 case ':':
3550 case ',':
3551 continue;
3552 }
3553 // Last document is complete, so the next document will appear after!
3554 if (!arr_cnt && !obj_cnt) {
3555 return parser.n_structural_indexes;
3556 }
3557 // Last document is incomplete; mark the document at i + 1 as the next one
3558 return i;
3559 }
3560 return 0;
3561 }
3562
3563 } // unnamed namespace
3564 } // namespace arm64
3565 } // namespace simdjson
3566 /* end file src/generic/stage1/find_next_document_index.h */
3567
3568 namespace simdjson {
3569 namespace arm64 {
3570 namespace {
3571 namespace stage1 {
3572
3573 class bit_indexer {
3574 public:
3575 uint32_t *tail;
3576
bit_indexer(uint32_t * index_buf)3577 simdjson_really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
3578
3579 // flatten out values in 'bits' assuming that they are are to have values of idx
3580 // plus their position in the bitvector, and store these indexes at
3581 // base_ptr[base] incrementing base as we go
3582 // will potentially store extra values beyond end of valid bits, so base_ptr
3583 // needs to be large enough to handle this
write(uint32_t idx,uint64_t bits)3584 simdjson_really_inline void write(uint32_t idx, uint64_t bits) {
3585 // In some instances, the next branch is expensive because it is mispredicted.
3586 // Unfortunately, in other cases,
3587 // it helps tremendously.
3588 if (bits == 0)
3589 return;
3590 int cnt = static_cast<int>(count_ones(bits));
3591
3592 // Do the first 8 all together
3593 for (int i=0; i<8; i++) {
3594 this->tail[i] = idx + trailing_zeroes(bits);
3595 bits = clear_lowest_bit(bits);
3596 }
3597
3598 // Do the next 8 all together (we hope in most cases it won't happen at all
3599 // and the branch is easily predicted).
3600 if (simdjson_unlikely(cnt > 8)) {
3601 for (int i=8; i<16; i++) {
3602 this->tail[i] = idx + trailing_zeroes(bits);
3603 bits = clear_lowest_bit(bits);
3604 }
3605
3606 // Most files don't have 16+ structurals per block, so we take several basically guaranteed
3607 // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
3608 // or the start of a value ("abc" true 123) every four characters.
3609 if (simdjson_unlikely(cnt > 16)) {
3610 int i = 16;
3611 do {
3612 this->tail[i] = idx + trailing_zeroes(bits);
3613 bits = clear_lowest_bit(bits);
3614 i++;
3615 } while (i < cnt);
3616 }
3617 }
3618
3619 this->tail += cnt;
3620 }
3621 };
3622
/**
 * Stage 1 driver: scans the input block by block, validates UTF-8 along the way and records
 * the position of every structural character.
 */
class json_structural_indexer {
public:
  /**
   * Find the important bits of JSON in buf, STEP_SIZE bytes at a time, and add them to
   * parser.structural_indexes.
   *
   * @param partial Setting the partial parameter to true allows the find_structural_bits to
   *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
   *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
   */
  template<size_t STEP_SIZE>
  static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;

private:
  // Only index() creates instances; structural_indexes is where the indexes get written.
  simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes);
  // Process one STEP_SIZE-byte block and advance the reader.
  template<size_t STEP_SIZE>
  simdjson_really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
  // Process one 64-byte chunk whose scan result is `block`; idx is the chunk's offset.
  simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
  // Flush the last pending structurals, publish the results to the parser and report errors.
  simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);

  json_scanner scanner{};
  utf8_checker checker{};
  bit_indexer indexer;
  // Structural mask of the previous 64-byte chunk; next()/finish() write it out one call late.
  uint64_t prev_structurals = 0;
  // Accumulated mask of control characters found inside strings; non-zero means UNESCAPED_CHARS.
  uint64_t unescaped_chars_error = 0;
};
3648
json_structural_indexer(uint32_t * structural_indexes)3649 simdjson_really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
3650
// Returns len shortened so that the buffer does not end in the middle of a multi-byte UTF-8
// character: if one of the last three bytes is a lead byte whose character would need more
// bytes than remain, the returned length stops just before that lead byte. Only the last
// (up to) three bytes are inspected.
simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  constexpr uint8_t lead_of_2_or_more = 0b11000000; // first byte of a 2-, 3- or 4-byte character
  constexpr uint8_t lead_of_3_or_more = 0b11100000; // first byte of a 3- or 4-byte character
  constexpr uint8_t lead_of_4         = 0b11110000; // first byte of a 4-byte character

  if (simdjson_unlikely(len < 3)) {
    // Short input: only look at the bytes that actually exist.
    if (len == 0) { return len; }
    if (buf[len-1] >= lead_of_2_or_more) { return len-1; }          // lead byte with nothing after it
    if (len == 2 && buf[0] >= lead_of_3_or_more) { return 0; }      // 3-/4-byte character with only 2 bytes present
    return len;
  }

  if (buf[len-1] >= lead_of_2_or_more) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte present
  if (buf[len-2] >= lead_of_3_or_more) { return len-2; } // 3- and 4-byte characters with only 2 bytes present
  if (buf[len-3] >= lead_of_4)         { return len-3; } // 4-byte characters with only 3 bytes present
  return len;
}
3671
3672 //
3673 // PERF NOTES:
3674 // We pipe 2 inputs through these stages:
3675 // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
3677 // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
3678 // The output of step 1 depends entirely on this information. These functions don't quite use
3679 // up enough CPU: the second half of the functions is highly serial, only using 1 execution core
3680 // at a time. The second input's scans has some dependency on the first ones finishing it, but
3681 // they can make a lot of progress before they need that information.
3682 // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
3683 // to finish: utf-8 checks and generating the output from the last iteration.
3684 //
3685 // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
3686 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
3687 // workout.
3688 //
3689 template<size_t STEP_SIZE>
index(const uint8_t * buf,size_t len,dom_parser_implementation & parser,bool partial)3690 error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
3691 if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
3692 if (partial) { len = trim_partial_utf8(buf, len); }
3693
3694 buf_block_reader<STEP_SIZE> reader(buf, len);
3695 json_structural_indexer indexer(parser.structural_indexes.get());
3696
3697 // Read all but the last block
3698 while (reader.has_full_block()) {
3699 indexer.step<STEP_SIZE>(reader.full_block(), reader);
3700 }
3701
3702 // Take care of the last block (will always be there unless file is empty)
3703 uint8_t block[STEP_SIZE];
3704 if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
3705 indexer.step<STEP_SIZE>(block, reader);
3706
3707 return indexer.finish(parser, reader.block_index(), len, partial);
3708 }
3709
3710 template<>
step(const uint8_t * block,buf_block_reader<128> & reader)3711 simdjson_really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
3712 simd::simd8x64<uint8_t> in_1(block);
3713 simd::simd8x64<uint8_t> in_2(block+64);
3714 json_block block_1 = scanner.next(in_1);
3715 json_block block_2 = scanner.next(in_2);
3716 this->next(in_1, block_1, reader.block_index());
3717 this->next(in_2, block_2, reader.block_index()+64);
3718 reader.advance();
3719 }
3720
3721 template<>
step(const uint8_t * block,buf_block_reader<64> & reader)3722 simdjson_really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
3723 simd::simd8x64<uint8_t> in_1(block);
3724 json_block block_1 = scanner.next(in_1);
3725 this->next(in_1, block_1, reader.block_index());
3726 reader.advance();
3727 }
3728
// Processes one 64-byte chunk: feeds it to the UTF-8 checker, writes out the structurals found
// in the *previous* chunk, and remembers this chunk's structurals for the next call.
// idx is the offset of the current chunk, hence the idx-64 when writing the previous one.
simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Mask of bytes <= 0x1F (control characters) in this chunk.
  uint64_t unescaped = in.lteq(0x1F);
  checker.check_next_input(in);
  // On the very first call prev_structurals is still 0, so write() returns immediately and the
  // wrapped-around idx-64 is never used.
  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
  prev_structurals = block.structural_start();
  // Accumulate control characters that fall inside a string; finish() turns this into UNESCAPED_CHARS.
  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
3736
// Completes stage 1: flushes the structurals of the last chunk, checks the scanner and
// unescaped-character errors, publishes the index count to the parser, appends the sentinel
// entries, and (in partial mode) trims the indexes via find_next_document_index().
// Returns the first error found, otherwise the UTF-8 checker's verdict.
simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
  // Write out the final iteration's structurals
  indexer.write(uint32_t(idx-64), prev_structurals);

  error_code error = scanner.finish();
  // We deliberately break down the next expression so that it is
  // human readable.
  const bool should_we_exit = partial ?
    ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
    : (error != SUCCESS); // if partial is false, we must have SUCCESS
  const bool have_unclosed_string = (error == UNCLOSED_STRING);
  if (simdjson_unlikely(should_we_exit)) { return error; }

  if (unescaped_chars_error) {
    return UNESCAPED_CHARS;
  }

  // Number of indexes written so far = distance the indexer's tail has moved.
  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
  /***
   * This is related to https://github.com/simdjson/simdjson/issues/906
   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
   * structural character, it quickly stops.
   * Only three structural characters can be repeated without triggering an error in JSON:  [,] and }.
   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
   * continues, then it must be [,] or }.
   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
   * It can't be a comma, a colon or any simple value. So the only way we could continue is
   * if the repeated character is [. But if so, the document must start with [. But if the document
   * starts with [, it should end with ]. If we enforce that rule, then we would get
   * ][[ which is invalid.
   **/
  parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
  parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
  parser.next_structural_index = 0;
  // a valid JSON file cannot have zero structural indexes - we should have found something
  if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
    return EMPTY;
  }
  // Sanity check: the last real index must not point past the end of the input.
  if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
    return UNEXPECTED_ERROR;
  }
  if (partial) {
    // If we have an unclosed string, then the last structural
    // will be the quote and we want to make sure to omit it.
    if(have_unclosed_string) {
      parser.n_structural_indexes--;
      // a valid JSON file cannot have zero structural indexes - we should have found something
      // (note that this case is reported as CAPACITY, not EMPTY)
      if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
    }
    auto new_structural_indexes = find_next_document_index(parser);
    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
    }
    parser.n_structural_indexes = new_structural_indexes;
  }
  checker.check_eof();
  return checker.errors();
}
3797
3798 } // namespace stage1
3799 } // unnamed namespace
3800 } // namespace arm64
3801 } // namespace simdjson
3802 /* end file src/generic/stage1/json_structural_indexer.h */
3803 /* begin file src/generic/stage1/utf8_validator.h */
3804 namespace simdjson {
3805 namespace arm64 {
3806 namespace {
3807 namespace stage1 {
3808
3809 /**
3810 * Validates that the string is actual UTF-8.
3811 */
3812 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)3813 bool generic_validate_utf8(const uint8_t * input, size_t length) {
3814 checker c{};
3815 buf_block_reader<64> reader(input, length);
3816 while (reader.has_full_block()) {
3817 simd::simd8x64<uint8_t> in(reader.full_block());
3818 c.check_next_input(in);
3819 reader.advance();
3820 }
3821 uint8_t block[64]{};
3822 reader.get_remainder(block);
3823 simd::simd8x64<uint8_t> in(block);
3824 c.check_next_input(in);
3825 reader.advance();
3826 c.check_eof();
3827 return c.errors() == error_code::SUCCESS;
3828 }
3829
generic_validate_utf8(const char * input,size_t length)3830 bool generic_validate_utf8(const char * input, size_t length) {
3831 return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
3832 }
3833
3834 } // namespace stage1
3835 } // unnamed namespace
3836 } // namespace arm64
3837 } // namespace simdjson
3838 /* end file src/generic/stage1/utf8_validator.h */
3839
3840 //
3841 // Stage 2
3842 //
3843
3844 /* begin file src/generic/stage2/tape_builder.h */
3845 /* begin file src/generic/stage2/json_iterator.h */
3846 /* begin file src/generic/stage2/logger.h */
3847 // This is for an internal-only stage 2 specific logger.
3848 // Set LOG_ENABLED = true to log what stage 2 is doing!
3849 namespace simdjson {
3850 namespace arm64 {
3851 namespace {
3852 namespace logger {
3853
3854 static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
3855
// Logging is compiled in only when SIMDJSON_VERBOSE_LOGGING is set; every logging function in
// this namespace checks LOG_ENABLED before printing.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr const bool LOG_ENABLED = true;
#else
static constexpr const bool LOG_ENABLED = false;
#endif
// Column widths (in characters) of the log table.
static constexpr const int LOG_EVENT_LEN = 20;        // event title column
static constexpr const int LOG_BUFFER_LEN = 30;       // bytes shown from the current token
static constexpr const int LOG_SMALL_BUFFER_LEN = 10; // bytes shown from the next token
static constexpr const int LOG_INDEX_LEN = 5;         // structural index column

// Current nesting level, used to indent event titles.
static int log_depth; // Not threadsafe. Log only.
3867
// Helper for log output: characters below 0x20 (control characters, including newline) are
// shown as a space; everything else is returned unchanged.
static simdjson_really_inline char printable_char(char c) {
  return (c >= 0x20) ? c : ' ';
}
3876
3877 // Print the header and set up log_start
log_start()3878 static simdjson_really_inline void log_start() {
3879 if (LOG_ENABLED) {
3880 log_depth = 0;
3881 printf("\n");
3882 printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
3883 printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
3884 }
3885 }
3886
log_string(const char * message)3887 simdjson_unused static simdjson_really_inline void log_string(const char *message) {
3888 if (LOG_ENABLED) {
3889 printf("%s\n", message);
3890 }
3891 }
3892
3893 // Logs a single line from the stage 2 DOM parser
3894 template<typename S>
log_line(S & structurals,const char * title_prefix,const char * title,const char * detail)3895 static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
3896 if (LOG_ENABLED) {
3897 printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
3898 auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
3899 auto next_index = structurals.next_structural;
3900 auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
3901 auto next = &structurals.buf[*next_index];
3902 {
3903 // Print the next N characters in the buffer.
3904 printf("| ");
3905 // Otherwise, print the characters starting from the buffer position.
3906 // Print spaces for unprintable or newline characters.
3907 for (int i=0;i<LOG_BUFFER_LEN;i++) {
3908 printf("%c", printable_char(current[i]));
3909 }
3910 printf(" ");
3911 // Print the next N characters in the buffer.
3912 printf("| ");
3913 // Otherwise, print the characters starting from the buffer position.
3914 // Print spaces for unprintable or newline characters.
3915 for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
3916 printf("%c", printable_char(next[i]));
3917 }
3918 printf(" ");
3919 }
3920 if (current_index) {
3921 printf("| %*u ", LOG_INDEX_LEN, *current_index);
3922 } else {
3923 printf("| %-*s ", LOG_INDEX_LEN, "");
3924 }
3925 // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
3926 printf("| %-s ", detail);
3927 printf("|\n");
3928 }
3929 }
3930
3931 } // namespace logger
3932 } // unnamed namespace
3933 } // namespace arm64
3934 } // namespace simdjson
3935 /* end file src/generic/stage2/logger.h */
3936
3937 namespace simdjson {
3938 namespace arm64 {
3939 namespace {
3940 namespace stage2 {
3941
/**
 * Walks the structural indexes produced by stage 1 and reports what it finds to a visitor.
 */
class json_iterator {
public:
  /** The JSON text being parsed. */
  const uint8_t* const buf;
  /** Points at the structural index of the next token to be read. */
  uint32_t *next_structural;
  dom_parser_implementation &dom_parser;
  /** Current container nesting depth (0 at the root). */
  uint32_t depth{0};

  /**
   * Walk the JSON document.
   *
   * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
   * the first parameter; some callbacks have other parameters as well:
   *
   * - visit_document_start() - at the beginning.
   * - visit_document_end() - at the end (if things were successful).
   *
   * - visit_array_start() - at the start `[` of a non-empty array.
   * - visit_array_end() - at the end `]` of a non-empty array.
   * - visit_empty_array() - when an empty array is encountered.
   *
   * - visit_object_start() - at the start `{` of a non-empty object.
   * - visit_object_end() - at the end `}` of a non-empty object.
   * - visit_empty_object() - when an empty object is encountered.
   * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
   *                                   guaranteed to point at the first quote of the string (`"key"`).
   * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
   * - visit_root_primitive(const uint8_t *value) - when the top-level value is a string, number, boolean or null.
   *
   * - increment_count() - each time a value is found in an array or object.
   */
  template<bool STREAMING, typename V>
  simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;

  /**
   * Create an iterator capable of walking a JSON document.
   *
   * The document must have already passed through stage 1.
   */
  simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);

  /**
   * Look at the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *peek() const noexcept;
  /**
   * Advance to the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *advance() noexcept;
  /**
   * Get the remaining length of the document, from the start of the current token.
   */
  simdjson_really_inline size_t remaining_len() const noexcept;
  /**
   * Check if we are at the end of the document.
   *
   * If this is true, there are no more tokens.
   */
  simdjson_really_inline bool at_eof() const noexcept;
  /**
   * Check if we are at the beginning of the document.
   */
  simdjson_really_inline bool at_beginning() const noexcept;
  /**
   * Get the character at the last structural index of the document.
   */
  simdjson_really_inline uint8_t last_structural() const noexcept;

  /**
   * Log that a value has been found.
   *
   * Build with SIMDJSON_VERBOSE_LOGGING (which sets logger::LOG_ENABLED) to see logging.
   */
  simdjson_really_inline void log_value(const char *type) const noexcept;
  /**
   * Log the start of a multipart value.
   *
   * Build with SIMDJSON_VERBOSE_LOGGING (which sets logger::LOG_ENABLED) to see logging.
   */
  simdjson_really_inline void log_start_value(const char *type) const noexcept;
  /**
   * Log the end of a multipart value.
   *
   * Build with SIMDJSON_VERBOSE_LOGGING (which sets logger::LOG_ENABLED) to see logging.
   */
  simdjson_really_inline void log_end_value(const char *type) const noexcept;
  /**
   * Log an error.
   *
   * Build with SIMDJSON_VERBOSE_LOGGING (which sets logger::LOG_ENABLED) to see logging.
   */
  simdjson_really_inline void log_error(const char *error) const noexcept;

  /**
   * Dispatch a top-level scalar to the matching visit_root_*() callback, based on its first
   * character.
   */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
  /**
   * Dispatch a scalar inside an array or object to the matching visit_*() callback, based on
   * its first character.
   */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
};
4044
// Walks the whole document as a goto-driven state machine, calling back into visitor for every
// value, key and container boundary. Containers are not handled recursively: depth and
// dom_parser.is_array[] record what kind of container each nesting level is, so scope_end can
// resume the right "continue" state after a container closes.
//
// Returns EMPTY if there are no tokens, DEPTH_ERROR if nesting reaches dom_parser.max_depth(),
// TAPE_ERROR on malformed structure, any error returned by the visitor, or SUCCESS.
// When STREAMING is false the root container must be closed by the last structural and no
// tokens may follow the root value.
template<bool STREAMING, typename V>
simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
  logger::log_start();

  //
  // Start the document
  //
  if (at_eof()) { return EMPTY; }
  log_start_value("document");
  SIMDJSON_TRY( visitor.visit_document_start(*this) );

  //
  // Read first value
  //
  {
    auto value = advance();

    // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
    // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
    if (!STREAMING) {
      switch (*value) {
        case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
        case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
      }
    }

    // Empty containers are reported directly; non-empty ones enter the matching state below.
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
    }
  }
  goto document_end;

//
// Object parser states
//
// object_begin: just consumed the `{` of a non-empty object; reads the first key.
object_begin:
  log_start_value("object");
  depth++;
  if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
  dom_parser.is_array[depth] = false;
  SIMDJSON_TRY( visitor.visit_object_start(*this) );

  {
    auto key = advance();
    if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
    SIMDJSON_TRY( visitor.increment_count(*this) );
    SIMDJSON_TRY( visitor.visit_key(*this, key) );
  }

// object_field: a key has been visited; expects `:` followed by the field's value.
object_field:
  if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
  {
    auto value = advance();
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
    }
  }

// object_continue: a field value is complete; expects `,` (another field) or `}` (end).
object_continue:
  switch (*advance()) {
    case ',':
      SIMDJSON_TRY( visitor.increment_count(*this) );
      {
        auto key = advance();
        if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
        SIMDJSON_TRY( visitor.visit_key(*this, key) );
      }
      goto object_field;
    case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
    default: log_error("No comma between object fields"); return TAPE_ERROR;
  }

// scope_end: a container just closed; pop one level and resume in the parent's continue state.
scope_end:
  depth--;
  if (depth == 0) { goto document_end; }
  if (dom_parser.is_array[depth]) { goto array_continue; }
  goto object_continue;

//
// Array parser states
//
// array_begin: just consumed the `[` of a non-empty array.
array_begin:
  log_start_value("array");
  depth++;
  if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
  dom_parser.is_array[depth] = true;
  SIMDJSON_TRY( visitor.visit_array_start(*this) );
  SIMDJSON_TRY( visitor.increment_count(*this) );

// array_value: expects the next element of the array.
array_value:
  {
    auto value = advance();
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
    }
  }

// array_continue: an element is complete; expects `,` (another element) or `]` (end).
array_continue:
  switch (*advance()) {
    case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
    case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
    default: log_error("Missing comma between array values"); return TAPE_ERROR;
  }

// document_end: the root value is complete.
document_end:
  log_end_value("document");
  SIMDJSON_TRY( visitor.visit_document_end(*this) );

  // Record how far we got, as an index into the structural index array.
  dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);

  // If we didn't make it to the end, it's an error
  if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
    log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
    return TAPE_ERROR;
  }

  return SUCCESS;

} // walk_document()
4170
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)4171 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
4172 : buf{_dom_parser.buf},
4173 next_structural{&_dom_parser.structural_indexes[start_structural_index]},
4174 dom_parser{_dom_parser} {
4175 }
4176
peek() const4177 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
4178 return &buf[*(next_structural)];
4179 }
advance()4180 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
4181 return &buf[*(next_structural++)];
4182 }
remaining_len() const4183 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
4184 return dom_parser.len - *(next_structural-1);
4185 }
4186
at_eof() const4187 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
4188 return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
4189 }
at_beginning() const4190 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
4191 return next_structural == dom_parser.structural_indexes.get();
4192 }
last_structural() const4193 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
4194 return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
4195 }
4196
log_value(const char * type) const4197 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
4198 logger::log_line(*this, "", type, "");
4199 }
4200
log_start_value(const char * type) const4201 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
4202 logger::log_line(*this, "+", type, "");
4203 if (logger::LOG_ENABLED) { logger::log_depth++; }
4204 }
4205
log_end_value(const char * type) const4206 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
4207 if (logger::LOG_ENABLED) { logger::log_depth--; }
4208 logger::log_line(*this, "-", type, "");
4209 }
4210
log_error(const char * error) const4211 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
4212 logger::log_line(*this, "", "ERROR", error);
4213 }
4214
4215 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)4216 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
4217 switch (*value) {
4218 case '"': return visitor.visit_root_string(*this, value);
4219 case 't': return visitor.visit_root_true_atom(*this, value);
4220 case 'f': return visitor.visit_root_false_atom(*this, value);
4221 case 'n': return visitor.visit_root_null_atom(*this, value);
4222 case '-':
4223 case '0': case '1': case '2': case '3': case '4':
4224 case '5': case '6': case '7': case '8': case '9':
4225 return visitor.visit_root_number(*this, value);
4226 default:
4227 log_error("Document starts with a non-value character");
4228 return TAPE_ERROR;
4229 }
4230 }
4231 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)4232 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
4233 switch (*value) {
4234 case '"': return visitor.visit_string(*this, value);
4235 case 't': return visitor.visit_true_atom(*this, value);
4236 case 'f': return visitor.visit_false_atom(*this, value);
4237 case 'n': return visitor.visit_null_atom(*this, value);
4238 case '-':
4239 case '0': case '1': case '2': case '3': case '4':
4240 case '5': case '6': case '7': case '8': case '9':
4241 return visitor.visit_number(*this, value);
4242 default:
4243 log_error("Non-value found when value was expected!");
4244 return TAPE_ERROR;
4245 }
4246 }
4247
4248 } // namespace stage2
4249 } // unnamed namespace
4250 } // namespace arm64
4251 } // namespace simdjson
4252 /* end file src/generic/stage2/json_iterator.h */
4253 /* begin file src/generic/stage2/tape_writer.h */
4254 namespace simdjson {
4255 namespace arm64 {
4256 namespace {
4257 namespace stage2 {
4258
/**
 * Sequential writer for the 64-bit tape: each call stores one or two entries at next_tape_loc
 * and moves it forward.
 */
struct tape_writer {
  /** The next place to write to tape */
  uint64_t *next_tape_loc;

  /** Write a signed 64-bit value to tape. */
  simdjson_really_inline void append_s64(int64_t value) noexcept;

  /** Write an unsigned 64-bit value to tape. */
  simdjson_really_inline void append_u64(uint64_t value) noexcept;

  /** Write a double value to tape. */
  simdjson_really_inline void append_double(double value) noexcept;

  /**
   * Append a tape entry (an 8-bit type,and 56 bits worth of value).
   */
  simdjson_really_inline void append(uint64_t val, internal::tape_type t) noexcept;

  /**
   * Skip the current tape entry without writing.
   *
   * Used to skip the start of the container, since we'll come back later to fill it in when the
   * container ends.
   */
  simdjson_really_inline void skip() noexcept;

  /**
   * Skip the number of tape entries necessary to write a large u64 or i64.
   */
  simdjson_really_inline void skip_large_integer() noexcept;

  /**
   * Skip the number of tape entries necessary to write a double.
   */
  simdjson_really_inline void skip_double() noexcept;

  /**
   * Write a value to a known location on tape.
   *
   * Used to go back and write out the start of a container after the container ends.
   */
  simdjson_really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;

private:
  /**
   * Append both the tape entry, and a supplementary value following it. Used for types that need
   * all 64 bits, such as double and uint64_t.
   */
  template<typename T>
  simdjson_really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
}; // struct tape_writer
4310
append_s64(int64_t value)4311 simdjson_really_inline void tape_writer::append_s64(int64_t value) noexcept {
4312 append2(0, value, internal::tape_type::INT64);
4313 }
4314
append_u64(uint64_t value)4315 simdjson_really_inline void tape_writer::append_u64(uint64_t value) noexcept {
4316 append(0, internal::tape_type::UINT64);
4317 *next_tape_loc = value;
4318 next_tape_loc++;
4319 }
4320
4321 /** Write a double value to tape. */
append_double(double value)4322 simdjson_really_inline void tape_writer::append_double(double value) noexcept {
4323 append2(0, value, internal::tape_type::DOUBLE);
4324 }
4325
skip()4326 simdjson_really_inline void tape_writer::skip() noexcept {
4327 next_tape_loc++;
4328 }
4329
skip_large_integer()4330 simdjson_really_inline void tape_writer::skip_large_integer() noexcept {
4331 next_tape_loc += 2;
4332 }
4333
skip_double()4334 simdjson_really_inline void tape_writer::skip_double() noexcept {
4335 next_tape_loc += 2;
4336 }
4337
append(uint64_t val,internal::tape_type t)4338 simdjson_really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
4339 *next_tape_loc = val | ((uint64_t(char(t))) << 56);
4340 next_tape_loc++;
4341 }
4342
4343 template<typename T>
append2(uint64_t val,T val2,internal::tape_type t)4344 simdjson_really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
4345 append(val, t);
4346 static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
4347 memcpy(next_tape_loc, &val2, sizeof(val2));
4348 next_tape_loc++;
4349 }
4350
write(uint64_t & tape_loc,uint64_t val,internal::tape_type t)4351 simdjson_really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
4352 tape_loc = val | ((uint64_t(char(t))) << 56);
4353 }
4354
4355 } // namespace stage2
4356 } // unnamed namespace
4357 } // namespace arm64
4358 } // namespace simdjson
4359 /* end file src/generic/stage2/tape_writer.h */
4360
4361 namespace simdjson {
4362 namespace arm64 {
4363 namespace {
4364 namespace stage2 {
4365
4366 struct tape_builder {
4367 template<bool STREAMING>
4368 simdjson_warn_unused static simdjson_really_inline error_code parse_document(
4369 dom_parser_implementation &dom_parser,
4370 dom::document &doc) noexcept;
4371
4372 /** Called when a non-empty document starts. */
4373 simdjson_warn_unused simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
4374 /** Called when a non-empty document ends without error. */
4375 simdjson_warn_unused simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;
4376
4377 /** Called when a non-empty array starts. */
4378 simdjson_warn_unused simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
4379 /** Called when a non-empty array ends. */
4380 simdjson_warn_unused simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
4381 /** Called when an empty array is found. */
4382 simdjson_warn_unused simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;
4383
4384 /** Called when a non-empty object starts. */
4385 simdjson_warn_unused simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
4386 /**
4387 * Called when a key in a field is encountered.
4388 *
4389 * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
4390 * will be called after this with the field value.
4391 */
4392 simdjson_warn_unused simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
4393 /** Called when a non-empty object ends. */
4394 simdjson_warn_unused simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
4395 /** Called when an empty object is found. */
4396 simdjson_warn_unused simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;
4397
4398 /**
4399 * Called when a string, number, boolean or null is found.
4400 */
4401 simdjson_warn_unused simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
4402 /**
4403 * Called when a string, number, boolean or null is found at the top level of a document (i.e.
4404 * when there is no array or object and the entire document is a single string, number, boolean or
4405 * null.
4406 *
4407 * This is separate from primitive() because simdjson's normal primitive parsing routines assume
4408 * there is at least one more token after the value, which is only true in an array or object.
4409 */
4410 simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
4411
4412 simdjson_warn_unused simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
4413 simdjson_warn_unused simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
4414 simdjson_warn_unused simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
4415 simdjson_warn_unused simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
4416 simdjson_warn_unused simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
4417
4418 simdjson_warn_unused simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
4419 simdjson_warn_unused simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
4420 simdjson_warn_unused simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
4421 simdjson_warn_unused simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
4422 simdjson_warn_unused simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
4423
4424 /** Called each time a new field or element in an array or object is found. */
4425 simdjson_warn_unused simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;
4426
4427 /** Next location to write to tape */
4428 tape_writer tape;
4429 private:
4430 /** Next write location in the string buf for stage 2 parsing */
4431 uint8_t *current_string_buf_loc;
4432
4433 simdjson_really_inline tape_builder(dom::document &doc) noexcept;
4434
4435 simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
4436 simdjson_really_inline void start_container(json_iterator &iter) noexcept;
4437 simdjson_warn_unused simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
4438 simdjson_warn_unused simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
4439 simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
4440 simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
4441 }; // class tape_builder
4442
4443 template<bool STREAMING>
parse_document(dom_parser_implementation & dom_parser,dom::document & doc)4444 simdjson_warn_unused simdjson_really_inline error_code tape_builder::parse_document(
4445 dom_parser_implementation &dom_parser,
4446 dom::document &doc) noexcept {
4447 dom_parser.doc = &doc;
4448 json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
4449 tape_builder builder(doc);
4450 return iter.walk_document<STREAMING>(builder);
4451 }
4452
visit_root_primitive(json_iterator & iter,const uint8_t * value)4453 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
4454 return iter.visit_root_primitive(*this, value);
4455 }
visit_primitive(json_iterator & iter,const uint8_t * value)4456 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
4457 return iter.visit_primitive(*this, value);
4458 }
visit_empty_object(json_iterator & iter)4459 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
4460 return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
4461 }
visit_empty_array(json_iterator & iter)4462 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
4463 return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
4464 }
4465
visit_document_start(json_iterator & iter)4466 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
4467 start_container(iter);
4468 return SUCCESS;
4469 }
visit_object_start(json_iterator & iter)4470 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
4471 start_container(iter);
4472 return SUCCESS;
4473 }
visit_array_start(json_iterator & iter)4474 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
4475 start_container(iter);
4476 return SUCCESS;
4477 }
4478
visit_object_end(json_iterator & iter)4479 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
4480 return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
4481 }
visit_array_end(json_iterator & iter)4482 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
4483 return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
4484 }
visit_document_end(json_iterator & iter)4485 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
4486 constexpr uint32_t start_tape_index = 0;
4487 tape.append(start_tape_index, internal::tape_type::ROOT);
4488 tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
4489 return SUCCESS;
4490 }
visit_key(json_iterator & iter,const uint8_t * key)4491 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
4492 return visit_string(iter, key, true);
4493 }
4494
increment_count(json_iterator & iter)4495 simdjson_warn_unused simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
4496 iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
4497 return SUCCESS;
4498 }
4499
tape_builder(dom::document & doc)4500 simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
4501
visit_string(json_iterator & iter,const uint8_t * value,bool key)4502 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
4503 iter.log_value(key ? "key" : "string");
4504 uint8_t *dst = on_start_string(iter);
4505 dst = stringparsing::parse_string(value+1, dst);
4506 if (dst == nullptr) {
4507 iter.log_error("Invalid escape in string");
4508 return STRING_ERROR;
4509 }
4510 on_end_string(dst);
4511 return SUCCESS;
4512 }
4513
visit_root_string(json_iterator & iter,const uint8_t * value)4514 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
4515 return visit_string(iter, value);
4516 }
4517
visit_number(json_iterator & iter,const uint8_t * value)4518 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
4519 iter.log_value("number");
4520 return numberparsing::parse_number(value, tape);
4521 }
4522
visit_root_number(json_iterator & iter,const uint8_t * value)4523 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
4524 //
4525 // We need to make a copy to make sure that the string is space terminated.
4526 // This is not about padding the input, which should already padded up
4527 // to len + SIMDJSON_PADDING. However, we have no control at this stage
4528 // on how the padding was done. What if the input string was padded with nulls?
4529 // It is quite common for an input string to have an extra null character (C string).
4530 // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
4531 // document, but the string "9\0" by itself is fine. So we make a copy and
4532 // pad the input with spaces when we know that there is just one input element.
4533 // This copy is relatively expensive, but it will almost never be called in
4534 // practice unless you are in the strange scenario where you have many JSON
4535 // documents made of single atoms.
4536 //
4537 std::unique_ptr<uint8_t[]>copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
4538 if (copy.get() == nullptr) { return MEMALLOC; }
4539 std::memcpy(copy.get(), value, iter.remaining_len());
4540 std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
4541 error_code error = visit_number(iter, copy.get());
4542 return error;
4543 }
4544
visit_true_atom(json_iterator & iter,const uint8_t * value)4545 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
4546 iter.log_value("true");
4547 if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
4548 tape.append(0, internal::tape_type::TRUE_VALUE);
4549 return SUCCESS;
4550 }
4551
visit_root_true_atom(json_iterator & iter,const uint8_t * value)4552 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
4553 iter.log_value("true");
4554 if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
4555 tape.append(0, internal::tape_type::TRUE_VALUE);
4556 return SUCCESS;
4557 }
4558
visit_false_atom(json_iterator & iter,const uint8_t * value)4559 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
4560 iter.log_value("false");
4561 if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
4562 tape.append(0, internal::tape_type::FALSE_VALUE);
4563 return SUCCESS;
4564 }
4565
visit_root_false_atom(json_iterator & iter,const uint8_t * value)4566 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
4567 iter.log_value("false");
4568 if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
4569 tape.append(0, internal::tape_type::FALSE_VALUE);
4570 return SUCCESS;
4571 }
4572
visit_null_atom(json_iterator & iter,const uint8_t * value)4573 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
4574 iter.log_value("null");
4575 if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
4576 tape.append(0, internal::tape_type::NULL_VALUE);
4577 return SUCCESS;
4578 }
4579
visit_root_null_atom(json_iterator & iter,const uint8_t * value)4580 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
4581 iter.log_value("null");
4582 if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
4583 tape.append(0, internal::tape_type::NULL_VALUE);
4584 return SUCCESS;
4585 }
4586
4587 // private:
4588
next_tape_index(json_iterator & iter) const4589 simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
4590 return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
4591 }
4592
empty_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)4593 simdjson_warn_unused simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
4594 auto start_index = next_tape_index(iter);
4595 tape.append(start_index+2, start);
4596 tape.append(start_index, end);
4597 return SUCCESS;
4598 }
4599
start_container(json_iterator & iter)4600 simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
4601 iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
4602 iter.dom_parser.open_containers[iter.depth].count = 0;
4603 tape.skip(); // We don't actually *write* the start element until the end.
4604 }
4605
end_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)4606 simdjson_warn_unused simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
4607 // Write the ending tape element, pointing at the start location
4608 const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
4609 tape.append(start_tape_index, end);
4610 // Write the start tape element, pointing at the end location (and including count)
4611 // count can overflow if it exceeds 24 bits... so we saturate
4612 // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
4613 const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
4614 const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
4615 tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
4616 return SUCCESS;
4617 }
4618
on_start_string(json_iterator & iter)4619 simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
4620 // we advance the point, accounting for the fact that we have a NULL termination
4621 tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
4622 return current_string_buf_loc + sizeof(uint32_t);
4623 }
4624
on_end_string(uint8_t * dst)4625 simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
4626 uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
4627 // TODO check for overflow in case someone has a crazy string (>=4GB?)
4628 // But only add the overflow check when the document itself exceeds 4GB
4629 // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
4630 memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
4631 // NULL termination is still handy if you expect all your strings to
4632 // be NULL terminated? It comes at a small cost
4633 *dst = 0;
4634 current_string_buf_loc = dst + 1;
4635 }
4636
4637 } // namespace stage2
4638 } // unnamed namespace
4639 } // namespace arm64
4640 } // namespace simdjson
4641 /* end file src/generic/stage2/tape_builder.h */
4642
4643 //
4644 // Implementation-specific overrides
4645 //
4646 namespace simdjson {
4647 namespace arm64 {
4648 namespace {
4649 namespace stage1 {
4650
find_escaped(uint64_t backslash)4651 simdjson_really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
4652 // On ARM, we don't short-circuit this if there are no backslashes, because the branch gives us no
4653 // benefit and therefore makes things worse.
4654 // if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
4655 return find_escaped_branchless(backslash);
4656 }
4657
4658 } // namespace stage1
4659 } // unnamed namespace
4660
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const4661 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
4662 return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
4663 }
4664
stage1(const uint8_t * _buf,size_t _len,bool streaming)4665 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
4666 this->buf = _buf;
4667 this->len = _len;
4668 return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
4669 }
4670
validate_utf8(const char * buf,size_t len) const4671 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
4672 return arm64::stage1::generic_validate_utf8(buf,len);
4673 }
4674
stage2(dom::document & _doc)4675 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
4676 return stage2::tape_builder::parse_document<false>(*this, _doc);
4677 }
4678
stage2_next(dom::document & _doc)4679 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
4680 return stage2::tape_builder::parse_document<true>(*this, _doc);
4681 }
4682
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)4683 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
4684 auto error = stage1(_buf, _len, false);
4685 if (error) { return error; }
4686 return stage2(_doc);
4687 }
4688
4689 } // namespace arm64
4690 } // namespace simdjson
4691
4692 /* begin file include/simdjson/arm64/end.h */
4693 /* end file include/simdjson/arm64/end.h */
4694 /* end file src/arm64/dom_parser_implementation.cpp */
4695 #endif
4696 #if SIMDJSON_IMPLEMENTATION_FALLBACK
4697 /* begin file src/fallback/implementation.cpp */
4698 /* begin file include/simdjson/fallback/begin.h */
4699 // redefining SIMDJSON_IMPLEMENTATION to "fallback"
4700 // #define SIMDJSON_IMPLEMENTATION fallback
4701 /* end file include/simdjson/fallback/begin.h */
4702
4703 namespace simdjson {
4704 namespace fallback {
4705
create_dom_parser_implementation(size_t capacity,size_t max_depth,std::unique_ptr<internal::dom_parser_implementation> & dst) const4706 simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
4707 size_t capacity,
4708 size_t max_depth,
4709 std::unique_ptr<internal::dom_parser_implementation>& dst
4710 ) const noexcept {
4711 dst.reset( new (std::nothrow) dom_parser_implementation() );
4712 if (!dst) { return MEMALLOC; }
4713 dst->set_capacity(capacity);
4714 dst->set_max_depth(max_depth);
4715 return SUCCESS;
4716 }
4717
4718 } // namespace fallback
4719 } // namespace simdjson
4720
4721 /* begin file include/simdjson/fallback/end.h */
4722 /* end file include/simdjson/fallback/end.h */
4723 /* end file src/fallback/implementation.cpp */
4724 /* begin file src/fallback/dom_parser_implementation.cpp */
4725 /* begin file include/simdjson/fallback/begin.h */
4726 // redefining SIMDJSON_IMPLEMENTATION to "fallback"
4727 // #define SIMDJSON_IMPLEMENTATION fallback
4728 /* end file include/simdjson/fallback/begin.h */
4729
4730 //
4731 // Stage 1
4732 //
4733 /* begin file src/generic/stage1/find_next_document_index.h */
4734 namespace simdjson {
4735 namespace fallback {
4736 namespace {
4737
4738 /**
4739 * This algorithm is used to quickly identify the last structural position that
4740 * makes up a complete document.
4741 *
4742 * It does this by going backwards and finding the last *document boundary* (a
4743 * place where one value follows another without a comma between them). If the
4744 * last document (the characters after the boundary) has an equal number of
4745 * start and end brackets, it is considered complete.
4746 *
4747 * Simply put, we iterate over the structural characters, starting from
4748 * the end. We consider that we found the end of a JSON document when the
4749 * first element of the pair is NOT one of these characters: '{' '[' ';' ','
4750 * and when the second element is NOT one of these characters: '}' '}' ';' ','.
4751 *
4752 * This simple comparison works most of the time, but it does not cover cases
4753 * where the batch's structural indexes contain a perfect amount of documents.
4754 * In such a case, we do not have access to the structural index which follows
4755 * the last document, therefore, we do not have access to the second element in
4756 * the pair, and that means we cannot identify the last document. To fix this
4757 * issue, we keep a count of the open and closed curly/square braces we found
4758 * while searching for the pair. When we find a pair AND the count of open and
4759 * closed curly/square braces is the same, we know that we just passed a
4760 * complete document, therefore the last json buffer location is the end of the
4761 * batch.
4762 */
find_next_document_index(dom_parser_implementation & parser)4763 simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
4764 // TODO don't count separately, just figure out depth
4765 auto arr_cnt = 0;
4766 auto obj_cnt = 0;
4767 for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
4768 auto idxb = parser.structural_indexes[i];
4769 switch (parser.buf[idxb]) {
4770 case ':':
4771 case ',':
4772 continue;
4773 case '}':
4774 obj_cnt--;
4775 continue;
4776 case ']':
4777 arr_cnt--;
4778 continue;
4779 case '{':
4780 obj_cnt++;
4781 break;
4782 case '[':
4783 arr_cnt++;
4784 break;
4785 }
4786 auto idxa = parser.structural_indexes[i - 1];
4787 switch (parser.buf[idxa]) {
4788 case '{':
4789 case '[':
4790 case ':':
4791 case ',':
4792 continue;
4793 }
4794 // Last document is complete, so the next document will appear after!
4795 if (!arr_cnt && !obj_cnt) {
4796 return parser.n_structural_indexes;
4797 }
4798 // Last document is incomplete; mark the document at i + 1 as the next one
4799 return i;
4800 }
4801 return 0;
4802 }
4803
4804 } // unnamed namespace
4805 } // namespace fallback
4806 } // namespace simdjson
4807 /* end file src/generic/stage1/find_next_document_index.h */
4808
4809 namespace simdjson {
4810 namespace fallback {
4811 namespace {
4812 namespace stage1 {
4813
4814 class structural_scanner {
4815 public:
4816
structural_scanner(dom_parser_implementation & _parser,bool _partial)4817 simdjson_really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial)
4818 : buf{_parser.buf},
4819 next_structural_index{_parser.structural_indexes.get()},
4820 parser{_parser},
4821 len{static_cast<uint32_t>(_parser.len)},
4822 partial{_partial} {
4823 }
4824
add_structural()4825 simdjson_really_inline void add_structural() {
4826 *next_structural_index = idx;
4827 next_structural_index++;
4828 }
4829
is_continuation(uint8_t c)4830 simdjson_really_inline bool is_continuation(uint8_t c) {
4831 return (c & 0b11000000) == 0b10000000;
4832 }
4833
validate_utf8_character()4834 simdjson_really_inline void validate_utf8_character() {
4835 // Continuation
4836 if (simdjson_unlikely((buf[idx] & 0b01000000) == 0)) {
4837 // extra continuation
4838 error = UTF8_ERROR;
4839 idx++;
4840 return;
4841 }
4842
4843 // 2-byte
4844 if ((buf[idx] & 0b00100000) == 0) {
4845 // missing continuation
4846 if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
4847 if (idx+1 > len && partial) { idx = len; return; }
4848 error = UTF8_ERROR;
4849 idx++;
4850 return;
4851 }
4852 // overlong: 1100000_ 10______
4853 if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
4854 idx += 2;
4855 return;
4856 }
4857
4858 // 3-byte
4859 if ((buf[idx] & 0b00010000) == 0) {
4860 // missing continuation
4861 if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
4862 if (idx+2 > len && partial) { idx = len; return; }
4863 error = UTF8_ERROR;
4864 idx++;
4865 return;
4866 }
4867 // overlong: 11100000 100_____ ________
4868 if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
4869 // surrogates: U+D800-U+DFFF 11101101 101_____
4870 if (buf[idx] == 0b11101101 && buf[idx+1] >= 0b10100000) { error = UTF8_ERROR; }
4871 idx += 3;
4872 return;
4873 }
4874
4875 // 4-byte
4876 // missing continuation
4877 if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
4878 if (idx+2 > len && partial) { idx = len; return; }
4879 error = UTF8_ERROR;
4880 idx++;
4881 return;
4882 }
4883 // overlong: 11110000 1000____ ________ ________
4884 if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
4885 // too large: > U+10FFFF:
4886 // 11110100 (1001|101_)____
4887 // 1111(1___|011_|0101) 10______
4888 // also includes 5, 6, 7 and 8 byte characters:
4889 // 11111___
4890 if (buf[idx] == 0b11110100 && buf[idx+1] >= 0b10010000) { error = UTF8_ERROR; }
4891 if (buf[idx] >= 0b11110101) { error = UTF8_ERROR; }
4892 idx += 4;
4893 }
4894
4895 // Returns true if the string is unclosed.
validate_string()4896 simdjson_really_inline bool validate_string() {
4897 idx++; // skip first quote
4898 while (idx < len && buf[idx] != '"') {
4899 if (buf[idx] == '\\') {
4900 idx += 2;
4901 } else if (simdjson_unlikely(buf[idx] & 0b10000000)) {
4902 validate_utf8_character();
4903 } else {
4904 if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; }
4905 idx++;
4906 }
4907 }
4908 if (idx >= len) { return true; }
4909 return false;
4910 }
4911
is_whitespace_or_operator(uint8_t c)4912 simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
4913 switch (c) {
4914 case '{': case '}': case '[': case ']': case ',': case ':':
4915 case ' ': case '\r': case '\n': case '\t':
4916 return true;
4917 default:
4918 return false;
4919 }
4920 }
4921
4922 //
4923 // Parse the entire input in STEP_SIZE-byte chunks.
4924 //
scan()4925 simdjson_really_inline error_code scan() {
4926 bool unclosed_string = false;
4927 for (;idx<len;idx++) {
4928 switch (buf[idx]) {
4929 // String
4930 case '"':
4931 add_structural();
4932 unclosed_string |= validate_string();
4933 break;
4934 // Operator
4935 case '{': case '}': case '[': case ']': case ',': case ':':
4936 add_structural();
4937 break;
4938 // Whitespace
4939 case ' ': case '\r': case '\n': case '\t':
4940 break;
4941 // Primitive or invalid character (invalid characters will be checked in stage 2)
4942 default:
4943 // Anything else, add the structural and go until we find the next one
4944 add_structural();
4945 while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
4946 idx++;
4947 };
4948 break;
4949 }
4950 }
4951 *next_structural_index = len;
4952 // We pad beyond.
4953 // https://github.com/simdjson/simdjson/issues/906
4954 next_structural_index[1] = len;
4955 next_structural_index[2] = 0;
4956 parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
4957 if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
4958 parser.next_structural_index = 0;
4959 if (partial) {
4960 if(unclosed_string) {
4961 parser.n_structural_indexes--;
4962 if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
4963 }
4964 auto new_structural_indexes = find_next_document_index(parser);
4965 if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
4966 return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
4967 }
4968 parser.n_structural_indexes = new_structural_indexes;
4969 } else if(unclosed_string) { error = UNCLOSED_STRING; }
4970 return error;
4971 }
4972
4973 private:
4974 const uint8_t *buf;
4975 uint32_t *next_structural_index;
4976 dom_parser_implementation &parser;
4977 uint32_t len;
4978 uint32_t idx{0};
4979 error_code error{SUCCESS};
4980 bool partial;
4981 }; // structural_scanner
4982
4983 } // namespace stage1
4984 } // unnamed namespace
4985
stage1(const uint8_t * _buf,size_t _len,bool partial)4986 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept {
4987 this->buf = _buf;
4988 this->len = _len;
4989 stage1::structural_scanner scanner(*this, partial);
4990 return scanner.scan();
4991 }
4992
4993 // big table for the minifier
4994 static uint8_t jump_table[256 * 3] = {
4995 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
4996 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
4997 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
4998 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
4999 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5000 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5001 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5002 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5003 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5004 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5005 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5006 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5007 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5008 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5009 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5010 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5011 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5012 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5013 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5014 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5015 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5016 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5017 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5018 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5019 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5020 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5021 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5022 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
5023 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
5024 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5025 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
5026 };
5027
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const5028 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
5029 size_t i = 0, pos = 0;
5030 uint8_t quote = 0;
5031 uint8_t nonescape = 1;
5032
5033 while (i < len) {
5034 unsigned char c = buf[i];
5035 uint8_t *meta = jump_table + 3 * c;
5036
5037 quote = quote ^ (meta[0] & nonescape);
5038 dst[pos] = c;
5039 pos += meta[2] | quote;
5040
5041 i += 1;
5042 nonescape = uint8_t(~nonescape) | (meta[1]);
5043 }
5044 dst_len = pos; // we intentionally do not work with a reference
5045 // for fear of aliasing
5046 return quote ? UNCLOSED_STRING : SUCCESS;
5047 }
5048
5049 // credit: based on code from Google Fuchsia (Apache Licensed)
validate_utf8(const char * buf,size_t len) const5050 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
5051 const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
5052 uint64_t pos = 0;
5053 uint32_t code_point = 0;
5054 while (pos < len) {
5055 // check of the next 8 bytes are ascii.
5056 uint64_t next_pos = pos + 16;
5057 if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
5058 uint64_t v1;
5059 memcpy(&v1, data + pos, sizeof(uint64_t));
5060 uint64_t v2;
5061 memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
5062 uint64_t v{v1 | v2};
5063 if ((v & 0x8080808080808080) == 0) {
5064 pos = next_pos;
5065 continue;
5066 }
5067 }
5068 unsigned char byte = data[pos];
5069 if (byte < 0b10000000) {
5070 pos++;
5071 continue;
5072 } else if ((byte & 0b11100000) == 0b11000000) {
5073 next_pos = pos + 2;
5074 if (next_pos > len) { return false; }
5075 if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
5076 // range check
5077 code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
5078 if (code_point < 0x80 || 0x7ff < code_point) { return false; }
5079 } else if ((byte & 0b11110000) == 0b11100000) {
5080 next_pos = pos + 3;
5081 if (next_pos > len) { return false; }
5082 if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
5083 if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
5084 // range check
5085 code_point = (byte & 0b00001111) << 12 |
5086 (data[pos + 1] & 0b00111111) << 6 |
5087 (data[pos + 2] & 0b00111111);
5088 if (code_point < 0x800 || 0xffff < code_point ||
5089 (0xd7ff < code_point && code_point < 0xe000)) {
5090 return false;
5091 }
5092 } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
5093 next_pos = pos + 4;
5094 if (next_pos > len) { return false; }
5095 if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
5096 if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
5097 if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
5098 // range check
5099 code_point =
5100 (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
5101 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
5102 if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
5103 } else {
5104 // we may have a continuation
5105 return false;
5106 }
5107 pos = next_pos;
5108 }
5109 return true;
5110 }
5111
5112 } // namespace fallback
5113 } // namespace simdjson
5114
5115 //
5116 // Stage 2
5117 //
5118 /* begin file src/generic/stage2/tape_builder.h */
5119 /* begin file src/generic/stage2/json_iterator.h */
5120 /* begin file src/generic/stage2/logger.h */
// This is for an internal-only stage 2 specific logger.
// Build with SIMDJSON_VERBOSE_LOGGING set to a nonzero value (this makes LOG_ENABLED true)
// to log what stage 2 is doing!
5123 namespace simdjson {
5124 namespace fallback {
5125 namespace {
5126 namespace logger {
5127
5128 static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
5129
// Compile-time switch for the stage 2 logger: true only when SIMDJSON_VERBOSE_LOGGING is nonzero.
#if SIMDJSON_VERBOSE_LOGGING
  static constexpr const bool LOG_ENABLED = true;
#else
  static constexpr const bool LOG_ENABLED = false;
#endif
// Width of the "Event" column.
static constexpr const int LOG_EVENT_LEN = 20;
// Number of input bytes shown starting at the current token.
static constexpr const int LOG_BUFFER_LEN = 30;
// Number of input bytes shown starting at the next token.
static constexpr const int LOG_SMALL_BUFFER_LEN = 10;
// Width of the structural index column.
static constexpr const int LOG_INDEX_LEN = 5;

// Current nesting level, used to indent the event title by two columns per level.
static int log_depth; // Not threadsafe. Log only.
5141
5142 // Helper to turn unprintable or newline characters into spaces
printable_char(char c)5143 static simdjson_really_inline char printable_char(char c) {
5144 if (c >= 0x20) {
5145 return c;
5146 } else {
5147 return ' ';
5148 }
5149 }
5150
5151 // Print the header and set up log_start
log_start()5152 static simdjson_really_inline void log_start() {
5153 if (LOG_ENABLED) {
5154 log_depth = 0;
5155 printf("\n");
5156 printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
5157 printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
5158 }
5159 }
5160
log_string(const char * message)5161 simdjson_unused static simdjson_really_inline void log_string(const char *message) {
5162 if (LOG_ENABLED) {
5163 printf("%s\n", message);
5164 }
5165 }
5166
5167 // Logs a single line from the stage 2 DOM parser
5168 template<typename S>
log_line(S & structurals,const char * title_prefix,const char * title,const char * detail)5169 static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
5170 if (LOG_ENABLED) {
5171 printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
5172 auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
5173 auto next_index = structurals.next_structural;
5174 auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
5175 auto next = &structurals.buf[*next_index];
5176 {
5177 // Print the next N characters in the buffer.
5178 printf("| ");
5179 // Otherwise, print the characters starting from the buffer position.
5180 // Print spaces for unprintable or newline characters.
5181 for (int i=0;i<LOG_BUFFER_LEN;i++) {
5182 printf("%c", printable_char(current[i]));
5183 }
5184 printf(" ");
5185 // Print the next N characters in the buffer.
5186 printf("| ");
5187 // Otherwise, print the characters starting from the buffer position.
5188 // Print spaces for unprintable or newline characters.
5189 for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
5190 printf("%c", printable_char(next[i]));
5191 }
5192 printf(" ");
5193 }
5194 if (current_index) {
5195 printf("| %*u ", LOG_INDEX_LEN, *current_index);
5196 } else {
5197 printf("| %-*s ", LOG_INDEX_LEN, "");
5198 }
5199 // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
5200 printf("| %-s ", detail);
5201 printf("|\n");
5202 }
5203 }
5204
5205 } // namespace logger
5206 } // unnamed namespace
5207 } // namespace fallback
5208 } // namespace simdjson
5209 /* end file src/generic/stage2/logger.h */
5210
5211 namespace simdjson {
5212 namespace fallback {
5213 namespace {
5214 namespace stage2 {
5215
/**
 * Walks the structural indexes of a document and reports what it finds to a visitor.
 */
class json_iterator {
public:
  /** Start of the JSON input; structural indexes are offsets into this buffer. */
  const uint8_t* const buf;
  /** Points at the structural index of the next token to be read. */
  uint32_t *next_structural;
  /** The parser whose structural indexes are being walked. */
  dom_parser_implementation &dom_parser;
  /** Current container nesting depth; 0 outside of any array or object. */
  uint32_t depth{0};

  /**
   * Walk the JSON document.
   *
   * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
   * the first parameter; some callbacks have other parameters as well:
   *
   * - visit_document_start() - at the beginning.
   * - visit_document_end() - at the end (if things were successful).
   *
   * - visit_array_start() - at the start `[` of a non-empty array.
   * - visit_array_end() - at the end `]` of a non-empty array.
   * - visit_empty_array() - when an empty array is encountered.
   *
   * - visit_object_start() - at the start `{` of a non-empty object.
   * - visit_object_end() - at the end `}` of a non-empty object.
   * - visit_empty_object() - when an empty object is encountered.
   * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
   *                                   guaranteed to point at the first quote of the string (`"key"`).
   * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
   * - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
   *
   * - increment_count(iter) - each time a value is found in an array or object.
   */
  template<bool STREAMING, typename V>
  simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;

  /**
   * Create an iterator capable of walking a JSON document.
   *
   * The document must have already passed through stage 1.
   *
   * @param _dom_parser the parser holding the input buffer and its structural indexes
   * @param start_structural_index position in the structural index array to start from
   */
  simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);

  /**
   * Look at the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *peek() const noexcept;
  /**
   * Advance to the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *advance() noexcept;
  /**
   * Get the remaining length of the document, from the start of the current token.
   */
  simdjson_really_inline size_t remaining_len() const noexcept;
  /**
   * Check if we are at the end of the document.
   *
   * If this is true, there are no more tokens.
   */
  simdjson_really_inline bool at_eof() const noexcept;
  /**
   * Check if we are at the beginning of the document.
   */
  simdjson_really_inline bool at_beginning() const noexcept;
  /**
   * Get the input byte at the last structural index of the document.
   */
  simdjson_really_inline uint8_t last_structural() const noexcept;

  /**
   * Log that a value has been found.
   *
   * Only prints when logger::LOG_ENABLED is true (SIMDJSON_VERBOSE_LOGGING).
   */
  simdjson_really_inline void log_value(const char *type) const noexcept;
  /**
   * Log the start of a multipart value.
   *
   * Only prints when logger::LOG_ENABLED is true (SIMDJSON_VERBOSE_LOGGING).
   */
  simdjson_really_inline void log_start_value(const char *type) const noexcept;
  /**
   * Log the end of a multipart value.
   *
   * Only prints when logger::LOG_ENABLED is true (SIMDJSON_VERBOSE_LOGGING).
   */
  simdjson_really_inline void log_end_value(const char *type) const noexcept;
  /**
   * Log an error.
   *
   * Only prints when logger::LOG_ENABLED is true (SIMDJSON_VERBOSE_LOGGING).
   */
  simdjson_really_inline void log_error(const char *error) const noexcept;

  /**
   * Dispatch a top-level scalar to the matching visit_root_* callback of the visitor, based on
   * the first byte of the token.
   */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
  /**
   * Dispatch a scalar found inside an array or object to the matching visit_* callback of the
   * visitor, based on the first byte of the token.
   */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
};
5318
/**
 * Walk the tokens found by stage 1 and report them to `visitor`.
 *
 * Implemented as a goto-driven state machine. `depth` counts the open containers and
 * dom_parser.is_array[depth] remembers whether the container at each level is an array, so that
 * scope_end knows which "continue" state to return to after a nested container closes.
 *
 * Returns EMPTY when there is no token at all, DEPTH_ERROR when nesting reaches
 * dom_parser.max_depth(), TAPE_ERROR on structurally invalid input, and SUCCESS otherwise.
 * Visitor callbacks are wrapped in SIMDJSON_TRY (presumably returning early with the callback's
 * error code on failure - the macro is defined elsewhere).
 *
 * When STREAMING is false, the outermost container must be closed by the last structural and
 * every structural must have been consumed when the document ends.
 */
template<bool STREAMING, typename V>
simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
  logger::log_start();

  //
  // Start the document
  //
  if (at_eof()) { return EMPTY; }
  log_start_value("document");
  SIMDJSON_TRY( visitor.visit_document_start(*this) );

  //
  // Read first value
  //
  {
    auto value = advance();

    // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
    // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
    if (!STREAMING) {
      switch (*value) {
        case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
        case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
      }
    }

    // Empty containers are reported in one step; non-empty ones enter the matching state.
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
    }
  }
  goto document_end;

//
// Object parser states
//
object_begin:
  log_start_value("object");
  depth++;
  if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
  dom_parser.is_array[depth] = false;
  SIMDJSON_TRY( visitor.visit_object_start(*this) );

  // First key of a non-empty object.
  {
    auto key = advance();
    if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
    SIMDJSON_TRY( visitor.increment_count(*this) );
    SIMDJSON_TRY( visitor.visit_key(*this, key) );
  }

// A key has just been read: expect `:` followed by the field value.
object_field:
  if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
  {
    auto value = advance();
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
    }
  }

// A field value has just been read: expect `,` and another key, or `}`.
object_continue:
  switch (*advance()) {
    case ',':
      SIMDJSON_TRY( visitor.increment_count(*this) );
      {
        auto key = advance();
        if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
        SIMDJSON_TRY( visitor.visit_key(*this, key) );
      }
      goto object_field;
    case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
    default: log_error("No comma between object fields"); return TAPE_ERROR;
  }

// A container has just closed: resume the enclosing container, or finish the document.
scope_end:
  depth--;
  if (depth == 0) { goto document_end; }
  if (dom_parser.is_array[depth]) { goto array_continue; }
  goto object_continue;

//
// Array parser states
//
array_begin:
  log_start_value("array");
  depth++;
  if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
  dom_parser.is_array[depth] = true;
  SIMDJSON_TRY( visitor.visit_array_start(*this) );
  SIMDJSON_TRY( visitor.increment_count(*this) );

array_value:
  {
    auto value = advance();
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
    }
  }

// An array element has just been read: expect `,` and another element, or `]`.
array_continue:
  switch (*advance()) {
    case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
    case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
    default: log_error("Missing comma between array values"); return TAPE_ERROR;
  }

document_end:
  log_end_value("document");
  SIMDJSON_TRY( visitor.visit_document_end(*this) );

  // Remember how far we got in the structural index array.
  dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);

  // If we didn't make it to the end, it's an error
  if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
    log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
    return TAPE_ERROR;
  }

  return SUCCESS;

} // walk_document()
5444
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)5445 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
5446 : buf{_dom_parser.buf},
5447 next_structural{&_dom_parser.structural_indexes[start_structural_index]},
5448 dom_parser{_dom_parser} {
5449 }
5450
peek() const5451 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
5452 return &buf[*(next_structural)];
5453 }
advance()5454 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
5455 return &buf[*(next_structural++)];
5456 }
remaining_len() const5457 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
5458 return dom_parser.len - *(next_structural-1);
5459 }
5460
at_eof() const5461 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
5462 return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
5463 }
at_beginning() const5464 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
5465 return next_structural == dom_parser.structural_indexes.get();
5466 }
last_structural() const5467 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
5468 return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
5469 }
5470
log_value(const char * type) const5471 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
5472 logger::log_line(*this, "", type, "");
5473 }
5474
log_start_value(const char * type) const5475 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
5476 logger::log_line(*this, "+", type, "");
5477 if (logger::LOG_ENABLED) { logger::log_depth++; }
5478 }
5479
log_end_value(const char * type) const5480 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
5481 if (logger::LOG_ENABLED) { logger::log_depth--; }
5482 logger::log_line(*this, "-", type, "");
5483 }
5484
log_error(const char * error) const5485 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
5486 logger::log_line(*this, "", "ERROR", error);
5487 }
5488
5489 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)5490 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
5491 switch (*value) {
5492 case '"': return visitor.visit_root_string(*this, value);
5493 case 't': return visitor.visit_root_true_atom(*this, value);
5494 case 'f': return visitor.visit_root_false_atom(*this, value);
5495 case 'n': return visitor.visit_root_null_atom(*this, value);
5496 case '-':
5497 case '0': case '1': case '2': case '3': case '4':
5498 case '5': case '6': case '7': case '8': case '9':
5499 return visitor.visit_root_number(*this, value);
5500 default:
5501 log_error("Document starts with a non-value character");
5502 return TAPE_ERROR;
5503 }
5504 }
5505 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)5506 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
5507 switch (*value) {
5508 case '"': return visitor.visit_string(*this, value);
5509 case 't': return visitor.visit_true_atom(*this, value);
5510 case 'f': return visitor.visit_false_atom(*this, value);
5511 case 'n': return visitor.visit_null_atom(*this, value);
5512 case '-':
5513 case '0': case '1': case '2': case '3': case '4':
5514 case '5': case '6': case '7': case '8': case '9':
5515 return visitor.visit_number(*this, value);
5516 default:
5517 log_error("Non-value found when value was expected!");
5518 return TAPE_ERROR;
5519 }
5520 }
5521
5522 } // namespace stage2
5523 } // unnamed namespace
5524 } // namespace fallback
5525 } // namespace simdjson
5526 /* end file src/generic/stage2/json_iterator.h */
5527 /* begin file src/generic/stage2/tape_writer.h */
5528 namespace simdjson {
5529 namespace fallback {
5530 namespace {
5531 namespace stage2 {
5532
/**
 * Writes 64-bit entries to the tape, one after another, through a moving cursor.
 */
struct tape_writer {
  /** The next place to write to tape */
  uint64_t *next_tape_loc;

  /** Write a signed 64-bit value to tape. */
  simdjson_really_inline void append_s64(int64_t value) noexcept;

  /** Write an unsigned 64-bit value to tape. */
  simdjson_really_inline void append_u64(uint64_t value) noexcept;

  /** Write a double value to tape. */
  simdjson_really_inline void append_double(double value) noexcept;

  /**
   * Append a tape entry (an 8-bit type,and 56 bits worth of value).
   *
   * The type is stored in the top 8 bits of the entry and OR-ed with val.
   */
  simdjson_really_inline void append(uint64_t val, internal::tape_type t) noexcept;

  /**
   * Skip the current tape entry without writing.
   *
   * Used to skip the start of the container, since we'll come back later to fill it in when the
   * container ends.
   */
  simdjson_really_inline void skip() noexcept;

  /**
   * Skip the number of tape entries necessary to write a large u64 or i64.
   */
  simdjson_really_inline void skip_large_integer() noexcept;

  /**
   * Skip the number of tape entries necessary to write a double.
   */
  simdjson_really_inline void skip_double() noexcept;

  /**
   * Write a value to a known location on tape.
   *
   * Used to go back and write out the start of a container after the container ends.
   */
  simdjson_really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;

private:
  /**
   * Append both the tape entry, and a supplementary value following it. Used for types that need
   * all 64 bits, such as double and uint64_t.
   *
   * T must be exactly 64 bits wide; its bytes are copied verbatim into the second entry.
   */
  template<typename T>
  simdjson_really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
}; // struct tape_writer
5584
append_s64(int64_t value)5585 simdjson_really_inline void tape_writer::append_s64(int64_t value) noexcept {
5586 append2(0, value, internal::tape_type::INT64);
5587 }
5588
append_u64(uint64_t value)5589 simdjson_really_inline void tape_writer::append_u64(uint64_t value) noexcept {
5590 append(0, internal::tape_type::UINT64);
5591 *next_tape_loc = value;
5592 next_tape_loc++;
5593 }
5594
5595 /** Write a double value to tape. */
append_double(double value)5596 simdjson_really_inline void tape_writer::append_double(double value) noexcept {
5597 append2(0, value, internal::tape_type::DOUBLE);
5598 }
5599
skip()5600 simdjson_really_inline void tape_writer::skip() noexcept {
5601 next_tape_loc++;
5602 }
5603
skip_large_integer()5604 simdjson_really_inline void tape_writer::skip_large_integer() noexcept {
5605 next_tape_loc += 2;
5606 }
5607
skip_double()5608 simdjson_really_inline void tape_writer::skip_double() noexcept {
5609 next_tape_loc += 2;
5610 }
5611
append(uint64_t val,internal::tape_type t)5612 simdjson_really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
5613 *next_tape_loc = val | ((uint64_t(char(t))) << 56);
5614 next_tape_loc++;
5615 }
5616
5617 template<typename T>
append2(uint64_t val,T val2,internal::tape_type t)5618 simdjson_really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
5619 append(val, t);
5620 static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
5621 memcpy(next_tape_loc, &val2, sizeof(val2));
5622 next_tape_loc++;
5623 }
5624
write(uint64_t & tape_loc,uint64_t val,internal::tape_type t)5625 simdjson_really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
5626 tape_loc = val | ((uint64_t(char(t))) << 56);
5627 }
5628
5629 } // namespace stage2
5630 } // unnamed namespace
5631 } // namespace fallback
5632 } // namespace simdjson
5633 /* end file src/generic/stage2/tape_writer.h */
5634
5635 namespace simdjson {
5636 namespace fallback {
5637 namespace {
5638 namespace stage2 {
5639
/**
 * Visitor for json_iterator::walk_document() that builds the tape of a dom::document.
 */
struct tape_builder {
  /**
   * Run stage 2 on dom_parser and store the result in doc.
   *
   * When STREAMING is true, parsing starts at dom_parser.next_structural_index instead of at the
   * first structural.
   */
  template<bool STREAMING>
  simdjson_warn_unused static simdjson_really_inline error_code parse_document(
    dom_parser_implementation &dom_parser,
    dom::document &doc) noexcept;

  /** Called when a non-empty document starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
  /** Called when a non-empty document ends without error. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;

  /** Called when a non-empty array starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
  /** Called when a non-empty array ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
  /** Called when an empty array is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;

  /** Called when a non-empty object starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
  /**
   * Called when a key in a field is encountered.
   *
   * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
   * will be called after this with the field value.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
  /** Called when a non-empty object ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
  /** Called when an empty object is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;

  /**
   * Called when a string, number, boolean or null is found.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
  /**
   * Called when a string, number, boolean or null is found at the top level of a document (i.e.
   * when there is no array or object and the entire document is a single string, number, boolean or
   * null.
   *
   * This is separate from primitive() because simdjson's normal primitive parsing routines assume
   * there is at least one more token after the value, which is only true in an array or object.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;

  // Callbacks for the individual kinds of scalar found inside an array or object.
  // visit_string is also used for object keys, with key = true.
  simdjson_warn_unused simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  // Callbacks for the individual kinds of scalar found at the top level of the document.
  simdjson_warn_unused simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /** Called each time a new field or element in an array or object is found. */
  simdjson_warn_unused simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;

  /** Next location to write to tape */
  tape_writer tape;
private:
  /** Next write location in the string buf for stage 2 parsing */
  uint8_t *current_string_buf_loc;

  // Private: builders are only created by parse_document().
  simdjson_really_inline tape_builder(dom::document &doc) noexcept;

  simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
  simdjson_really_inline void start_container(json_iterator &iter) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
  simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
}; // struct tape_builder
5716
5717 template<bool STREAMING>
parse_document(dom_parser_implementation & dom_parser,dom::document & doc)5718 simdjson_warn_unused simdjson_really_inline error_code tape_builder::parse_document(
5719 dom_parser_implementation &dom_parser,
5720 dom::document &doc) noexcept {
5721 dom_parser.doc = &doc;
5722 json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
5723 tape_builder builder(doc);
5724 return iter.walk_document<STREAMING>(builder);
5725 }
5726
visit_root_primitive(json_iterator & iter,const uint8_t * value)5727 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
5728 return iter.visit_root_primitive(*this, value);
5729 }
visit_primitive(json_iterator & iter,const uint8_t * value)5730 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
5731 return iter.visit_primitive(*this, value);
5732 }
visit_empty_object(json_iterator & iter)5733 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
5734 return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
5735 }
visit_empty_array(json_iterator & iter)5736 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
5737 return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
5738 }
5739
visit_document_start(json_iterator & iter)5740 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
5741 start_container(iter);
5742 return SUCCESS;
5743 }
visit_object_start(json_iterator & iter)5744 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
5745 start_container(iter);
5746 return SUCCESS;
5747 }
visit_array_start(json_iterator & iter)5748 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
5749 start_container(iter);
5750 return SUCCESS;
5751 }
5752
visit_object_end(json_iterator & iter)5753 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
5754 return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
5755 }
visit_array_end(json_iterator & iter)5756 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
5757 return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
5758 }
visit_document_end(json_iterator & iter)5759 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
5760 constexpr uint32_t start_tape_index = 0;
5761 tape.append(start_tape_index, internal::tape_type::ROOT);
5762 tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
5763 return SUCCESS;
5764 }
visit_key(json_iterator & iter,const uint8_t * key)5765 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
5766 return visit_string(iter, key, true);
5767 }
5768
increment_count(json_iterator & iter)5769 simdjson_warn_unused simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
5770 iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
5771 return SUCCESS;
5772 }
5773
tape_builder(dom::document & doc)5774 simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
5775
visit_string(json_iterator & iter,const uint8_t * value,bool key)5776 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
5777 iter.log_value(key ? "key" : "string");
5778 uint8_t *dst = on_start_string(iter);
5779 dst = stringparsing::parse_string(value+1, dst);
5780 if (dst == nullptr) {
5781 iter.log_error("Invalid escape in string");
5782 return STRING_ERROR;
5783 }
5784 on_end_string(dst);
5785 return SUCCESS;
5786 }
5787
visit_root_string(json_iterator & iter,const uint8_t * value)5788 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
5789 return visit_string(iter, value);
5790 }
5791
visit_number(json_iterator & iter,const uint8_t * value)5792 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
5793 iter.log_value("number");
5794 return numberparsing::parse_number(value, tape);
5795 }
5796
visit_root_number(json_iterator & iter,const uint8_t * value)5797 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
5798 //
5799 // We need to make a copy to make sure that the string is space terminated.
5800 // This is not about padding the input, which should already padded up
5801 // to len + SIMDJSON_PADDING. However, we have no control at this stage
5802 // on how the padding was done. What if the input string was padded with nulls?
5803 // It is quite common for an input string to have an extra null character (C string).
5804 // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
5805 // document, but the string "9\0" by itself is fine. So we make a copy and
5806 // pad the input with spaces when we know that there is just one input element.
5807 // This copy is relatively expensive, but it will almost never be called in
5808 // practice unless you are in the strange scenario where you have many JSON
5809 // documents made of single atoms.
5810 //
5811 std::unique_ptr<uint8_t[]>copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
5812 if (copy.get() == nullptr) { return MEMALLOC; }
5813 std::memcpy(copy.get(), value, iter.remaining_len());
5814 std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
5815 error_code error = visit_number(iter, copy.get());
5816 return error;
5817 }
5818
visit_true_atom(json_iterator & iter,const uint8_t * value)5819 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
5820 iter.log_value("true");
5821 if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
5822 tape.append(0, internal::tape_type::TRUE_VALUE);
5823 return SUCCESS;
5824 }
5825
visit_root_true_atom(json_iterator & iter,const uint8_t * value)5826 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
5827 iter.log_value("true");
5828 if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
5829 tape.append(0, internal::tape_type::TRUE_VALUE);
5830 return SUCCESS;
5831 }
5832
visit_false_atom(json_iterator & iter,const uint8_t * value)5833 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
5834 iter.log_value("false");
5835 if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
5836 tape.append(0, internal::tape_type::FALSE_VALUE);
5837 return SUCCESS;
5838 }
5839
visit_root_false_atom(json_iterator & iter,const uint8_t * value)5840 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
5841 iter.log_value("false");
5842 if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
5843 tape.append(0, internal::tape_type::FALSE_VALUE);
5844 return SUCCESS;
5845 }
5846
visit_null_atom(json_iterator & iter,const uint8_t * value)5847 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
5848 iter.log_value("null");
5849 if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
5850 tape.append(0, internal::tape_type::NULL_VALUE);
5851 return SUCCESS;
5852 }
5853
visit_root_null_atom(json_iterator & iter,const uint8_t * value)5854 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
5855 iter.log_value("null");
5856 if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
5857 tape.append(0, internal::tape_type::NULL_VALUE);
5858 return SUCCESS;
5859 }
5860
5861 // private:
5862
next_tape_index(json_iterator & iter) const5863 simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
5864 return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
5865 }
5866
empty_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)5867 simdjson_warn_unused simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
5868 auto start_index = next_tape_index(iter);
5869 tape.append(start_index+2, start);
5870 tape.append(start_index, end);
5871 return SUCCESS;
5872 }
5873
start_container(json_iterator & iter)5874 simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
5875 iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
5876 iter.dom_parser.open_containers[iter.depth].count = 0;
5877 tape.skip(); // We don't actually *write* the start element until the end.
5878 }
5879
end_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)5880 simdjson_warn_unused simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
5881 // Write the ending tape element, pointing at the start location
5882 const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
5883 tape.append(start_tape_index, end);
5884 // Write the start tape element, pointing at the end location (and including count)
5885 // count can overflow if it exceeds 24 bits... so we saturate
5886 // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
5887 const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
5888 const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
5889 tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
5890 return SUCCESS;
5891 }
5892
on_start_string(json_iterator & iter)5893 simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
5894 // we advance the point, accounting for the fact that we have a NULL termination
5895 tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
5896 return current_string_buf_loc + sizeof(uint32_t);
5897 }
5898
on_end_string(uint8_t * dst)5899 simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
5900 uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
5901 // TODO check for overflow in case someone has a crazy string (>=4GB?)
5902 // But only add the overflow check when the document itself exceeds 4GB
5903 // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
5904 memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
5905 // NULL termination is still handy if you expect all your strings to
5906 // be NULL terminated? It comes at a small cost
5907 *dst = 0;
5908 current_string_buf_loc = dst + 1;
5909 }
5910
5911 } // namespace stage2
5912 } // unnamed namespace
5913 } // namespace fallback
5914 } // namespace simdjson
5915 /* end file src/generic/stage2/tape_builder.h */
5916
5917 namespace simdjson {
5918 namespace fallback {
5919
stage2(dom::document & _doc)5920 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
5921 return stage2::tape_builder::parse_document<false>(*this, _doc);
5922 }
5923
stage2_next(dom::document & _doc)5924 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
5925 return stage2::tape_builder::parse_document<true>(*this, _doc);
5926 }
5927
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)5928 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
5929 auto error = stage1(_buf, _len, false);
5930 if (error) { return error; }
5931 return stage2(_doc);
5932 }
5933
5934 } // namespace fallback
5935 } // namespace simdjson
5936
5937 /* begin file include/simdjson/fallback/end.h */
5938 /* end file include/simdjson/fallback/end.h */
5939 /* end file src/fallback/dom_parser_implementation.cpp */
5940 #endif
5941 #if SIMDJSON_IMPLEMENTATION_HASWELL
5942 /* begin file src/haswell/implementation.cpp */
5943 /* begin file include/simdjson/haswell/begin.h */
5944 // redefining SIMDJSON_IMPLEMENTATION to "haswell"
5945 // #define SIMDJSON_IMPLEMENTATION haswell
5946 SIMDJSON_TARGET_HASWELL
5947 /* end file include/simdjson/haswell/begin.h */
5948
5949 namespace simdjson {
5950 namespace haswell {
5951
create_dom_parser_implementation(size_t capacity,size_t max_depth,std::unique_ptr<internal::dom_parser_implementation> & dst) const5952 simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
5953 size_t capacity,
5954 size_t max_depth,
5955 std::unique_ptr<internal::dom_parser_implementation>& dst
5956 ) const noexcept {
5957 dst.reset( new (std::nothrow) dom_parser_implementation() );
5958 if (!dst) { return MEMALLOC; }
5959 dst->set_capacity(capacity);
5960 dst->set_max_depth(max_depth);
5961 return SUCCESS;
5962 }
5963
5964 } // namespace haswell
5965 } // namespace simdjson
5966
5967 /* begin file include/simdjson/haswell/end.h */
5968 SIMDJSON_UNTARGET_HASWELL
5969 /* end file include/simdjson/haswell/end.h */
5970
5971 /* end file src/haswell/implementation.cpp */
5972 /* begin file src/haswell/dom_parser_implementation.cpp */
5973 /* begin file include/simdjson/haswell/begin.h */
5974 // redefining SIMDJSON_IMPLEMENTATION to "haswell"
5975 // #define SIMDJSON_IMPLEMENTATION haswell
5976 SIMDJSON_TARGET_HASWELL
5977 /* end file include/simdjson/haswell/begin.h */
5978
5979 //
5980 // Stage 1
5981 //
5982
5983 namespace simdjson {
5984 namespace haswell {
5985 namespace {
5986
5987 using namespace simd;
5988
5989 struct json_character_block {
5990 static simdjson_really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
5991 // ASCII white-space ('\r','\n','\t',' ')
5992 simdjson_really_inline uint64_t whitespace() const noexcept;
5993 // non-quote structural characters (comma, colon, braces, brackets)
5994 simdjson_really_inline uint64_t op() const noexcept;
5995 // neither a structural character nor a white-space, so letters, numbers and quotes
5996 simdjson_really_inline uint64_t scalar() const noexcept;
5997
5998 uint64_t _whitespace; // ASCII white-space ('\r','\n','\t',' ')
5999 uint64_t _op; // structural characters (comma, colon, braces, brackets but not quotes)
6000 };
6001
whitespace() const6002 simdjson_really_inline uint64_t json_character_block::whitespace() const noexcept { return _whitespace; }
op() const6003 simdjson_really_inline uint64_t json_character_block::op() const noexcept { return _op; }
scalar() const6004 simdjson_really_inline uint64_t json_character_block::scalar() const noexcept { return ~(op() | whitespace()); }
6005
6006 // This identifies structural characters (comma, colon, braces, brackets),
6007 // and ASCII white-space ('\r','\n','\t',' ').
classify(const simd::simd8x64<uint8_t> & in)6008 simdjson_really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
6009 // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
6010 // we can't use the generic lookup_16.
6011 const auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
6012
6013 // The 6 operators (:,[]{}) have these values:
6014 //
6015 // , 2C
6016 // : 3A
6017 // [ 5B
6018 // { 7B
6019 // ] 5D
6020 // } 7D
6021 //
6022 // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique.
6023 // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then
6024 // match it (against | 0x20).
6025 //
6026 // To prevent recognizing other characters, everything else gets compared with 0, which cannot
6027 // match due to the | 0x20.
6028 //
6029 // NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A) like ,
6030 // and :. This gets caught in stage 2, which checks the actual character to ensure the right
6031 // operators are in the right places.
6032 const auto op_table = simd8<uint8_t>::repeat_16(
6033 0, 0, 0, 0,
6034 0, 0, 0, 0,
6035 0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B
6036 ',', '}', 0, 0 // , = 2C, ] = 5D, } = 7D
6037 );
6038
6039 // We compute whitespace and op separately. If later code only uses one or the
6040 // other, given the fact that all functions are aggressively inlined, we can
6041 // hope that useless computations will be omitted. This is namely case when
6042 // minifying (we only need whitespace).
6043
6044 const uint64_t whitespace = in.eq({
6045 _mm256_shuffle_epi8(whitespace_table, in.chunks[0]),
6046 _mm256_shuffle_epi8(whitespace_table, in.chunks[1])
6047 });
6048 // Turn [ and ] into { and }
6049 const simd8x64<uint8_t> curlified{
6050 in.chunks[0] | 0x20,
6051 in.chunks[1] | 0x20
6052 };
6053 const uint64_t op = curlified.eq({
6054 _mm256_shuffle_epi8(op_table, in.chunks[0]),
6055 _mm256_shuffle_epi8(op_table, in.chunks[1])
6056 });
6057
6058 return { whitespace, op };
6059 }
6060
is_ascii(const simd8x64<uint8_t> & input)6061 simdjson_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
6062 return input.reduce_or().is_ascii();
6063 }
6064
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)6065 simdjson_unused simdjson_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
6066 simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
6067 simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
6068 simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
6069 // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
6070 return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
6071 }
6072
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)6073 simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
6074 simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
6075 simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
6076 // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
6077 return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
6078 }
6079
6080 } // unnamed namespace
6081 } // namespace haswell
6082 } // namespace simdjson
6083
6084 /* begin file src/generic/stage1/utf8_lookup4_algorithm.h */
6085 namespace simdjson {
6086 namespace haswell {
6087 namespace {
6088 namespace utf8_validation {
6089
6090 using namespace simd;
6091
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)6092 simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
6093 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
6094 // Bit 1 = Too Long (ASCII followed by continuation)
6095 // Bit 2 = Overlong 3-byte
6096 // Bit 4 = Surrogate
6097 // Bit 5 = Overlong 2-byte
6098 // Bit 7 = Two Continuations
6099 constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______
6100 // 11______ 11______
6101 constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______
6102 constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____
6103 constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____
6104 constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______
6105 constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______
6106 constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____
6107 // 11110100 101_____
6108 // 11110101 1001____
6109 // 11110101 101_____
6110 // 1111011_ 1001____
6111 // 1111011_ 101_____
6112 // 11111___ 1001____
6113 // 11111___ 101_____
6114 constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
6115 // 11110101 1000____
6116 // 1111011_ 1000____
6117 // 11111___ 1000____
6118 constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____
6119
6120 const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
6121 // 0_______ ________ <ASCII in byte 1>
6122 TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
6123 TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
6124 // 10______ ________ <continuation in byte 1>
6125 TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
6126 // 1100____ ________ <two byte lead in byte 1>
6127 TOO_SHORT | OVERLONG_2,
6128 // 1101____ ________ <two byte lead in byte 1>
6129 TOO_SHORT,
6130 // 1110____ ________ <three byte lead in byte 1>
6131 TOO_SHORT | OVERLONG_3 | SURROGATE,
6132 // 1111____ ________ <four+ byte lead in byte 1>
6133 TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
6134 );
6135 constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
6136 const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
6137 // ____0000 ________
6138 CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
6139 // ____0001 ________
6140 CARRY | OVERLONG_2,
6141 // ____001_ ________
6142 CARRY,
6143 CARRY,
6144
6145 // ____0100 ________
6146 CARRY | TOO_LARGE,
6147 // ____0101 ________
6148 CARRY | TOO_LARGE | TOO_LARGE_1000,
6149 // ____011_ ________
6150 CARRY | TOO_LARGE | TOO_LARGE_1000,
6151 CARRY | TOO_LARGE | TOO_LARGE_1000,
6152
6153 // ____1___ ________
6154 CARRY | TOO_LARGE | TOO_LARGE_1000,
6155 CARRY | TOO_LARGE | TOO_LARGE_1000,
6156 CARRY | TOO_LARGE | TOO_LARGE_1000,
6157 CARRY | TOO_LARGE | TOO_LARGE_1000,
6158 CARRY | TOO_LARGE | TOO_LARGE_1000,
6159 // ____1101 ________
6160 CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
6161 CARRY | TOO_LARGE | TOO_LARGE_1000,
6162 CARRY | TOO_LARGE | TOO_LARGE_1000
6163 );
6164 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
6165 // ________ 0_______ <ASCII in byte 2>
6166 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
6167 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
6168
6169 // ________ 1000____
6170 TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
6171 // ________ 1001____
6172 TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
6173 // ________ 101_____
6174 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
6175 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
6176
6177 // ________ 11______
6178 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
6179 );
6180 return (byte_1_high & byte_1_low & byte_2_high);
6181 }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)6182 simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
6183 const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
6184 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
6185 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
6186 simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
6187 simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
6188 return must23_80 ^ sc;
6189 }
6190
6191 //
6192 // Return nonzero if there are incomplete multibyte characters at the end of the block:
6193 // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
6194 //
is_incomplete(const simd8<uint8_t> input)6195 simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
6196 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
6197 // ... 1111____ 111_____ 11______
6198 static const uint8_t max_array[32] = {
6199 255, 255, 255, 255, 255, 255, 255, 255,
6200 255, 255, 255, 255, 255, 255, 255, 255,
6201 255, 255, 255, 255, 255, 255, 255, 255,
6202 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
6203 };
6204 const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
6205 return input.gt_bits(max_value);
6206 }
6207
6208 struct utf8_checker {
6209 // If this is nonzero, there has been a UTF-8 error.
6210 simd8<uint8_t> error;
6211 // The last input we received
6212 simd8<uint8_t> prev_input_block;
6213 // Whether the last input we received was incomplete (used for ASCII fast path)
6214 simd8<uint8_t> prev_incomplete;
6215
6216 //
6217 // Check whether the current bytes are valid UTF-8.
6218 //
check_utf8_bytessimdjson::haswell::__anon9bb6be6f1811::utf8_validation::utf8_checker6219 simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
6220 // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
6221 // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
6222 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
6223 simd8<uint8_t> sc = check_special_cases(input, prev1);
6224 this->error |= check_multibyte_lengths(input, prev_input, sc);
6225 }
6226
6227 // The only problem that can happen at EOF is that a multibyte character is too short
6228 // or a byte value too large in the last bytes: check_special_cases only checks for bytes
6229 // too large in the first of two bytes.
check_eofsimdjson::haswell::__anon9bb6be6f1811::utf8_validation::utf8_checker6230 simdjson_really_inline void check_eof() {
6231 // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
6232 // possibly finish them.
6233 this->error |= this->prev_incomplete;
6234 }
6235
check_next_inputsimdjson::haswell::__anon9bb6be6f1811::utf8_validation::utf8_checker6236 simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
6237 if(simdjson_likely(is_ascii(input))) {
6238 this->error |= this->prev_incomplete;
6239 } else {
6240 // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
6241 static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
6242 "We support either two or four chunks per 64-byte block.");
6243 if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
6244 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
6245 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
6246 } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
6247 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
6248 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
6249 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
6250 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
6251 }
6252 this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
6253 this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
6254
6255 }
6256 }
6257 // do not forget to call check_eof!
errorssimdjson::haswell::__anon9bb6be6f1811::utf8_validation::utf8_checker6258 simdjson_really_inline error_code errors() {
6259 return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
6260 }
6261
6262 }; // struct utf8_checker
6263 } // namespace utf8_validation
6264
6265 using utf8_validation::utf8_checker;
6266
6267 } // unnamed namespace
6268 } // namespace haswell
6269 } // namespace simdjson
6270 /* end file src/generic/stage1/utf8_lookup4_algorithm.h */
6271 /* begin file src/generic/stage1/json_structural_indexer.h */
6272 // This file contains the common code every implementation uses in stage1
6273 // It is intended to be included multiple times and compiled multiple times
6274 // We assume the file in which it is included already includes
6275 // "simdjson/stage1.h" (this simplifies amalgamation)
6276
6277 /* begin file src/generic/stage1/buf_block_reader.h */
6278 namespace simdjson {
6279 namespace haswell {
6280 namespace {
6281
6282 // Walks through a buffer in block-sized increments, loading the last part with spaces
6283 template<size_t STEP_SIZE>
6284 struct buf_block_reader {
6285 public:
// Wraps the _len bytes at _buf; reading starts at offset 0.
6286 simdjson_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
// Current offset into the buffer, in bytes.
6287 simdjson_really_inline size_t block_index();
// True while the current offset is strictly below len - STEP_SIZE, so the
// final STEP_SIZE (or fewer) bytes are always left for get_remainder().
6288 simdjson_really_inline bool has_full_block() const;
// Pointer into the original buffer at the current offset (nothing is copied).
6289 simdjson_really_inline const uint8_t *full_block() const;
6290 /**
6291 * Get the last block, padded with spaces.
6292 *
6293 * There will always be a last block, with at least 1 byte, unless no bytes remain (e.g. len == 0),
6294 * in which case this function returns 0 and callers should not rely on the contents of dst.
6295 * In particular, if len == STEP_SIZE there
6296 * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
6297 *
6298 * @param dst buffer of at least STEP_SIZE bytes that receives the block.
6299 * @return the number of effective characters in the last block.
6298 */
6299 simdjson_really_inline size_t get_remainder(uint8_t *dst) const;
// Moves the current offset forward by STEP_SIZE bytes.
6300 simdjson_really_inline void advance();
6301 private:
// Start of the input buffer (not owned).
6302 const uint8_t *buf;
// Total number of input bytes.
6303 const size_t len;
// len - STEP_SIZE, or 0 when len < STEP_SIZE; offsets below this have a full block.
6304 const size_t lenminusstep;
// Current offset into buf.
6305 size_t idx;
6306 };
6307
6308 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)6309 simdjson_unused static char * format_input_text_64(const uint8_t *text) {
6310 static char buf[sizeof(simd8x64<uint8_t>) + 1];
6311 for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
6312 buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
6313 }
6314 buf[sizeof(simd8x64<uint8_t>)] = '\0';
6315 return buf;
6316 }
6317
6318 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)6319 simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
6320 static char buf[sizeof(simd8x64<uint8_t>) + 1];
6321 in.store(reinterpret_cast<uint8_t*>(buf));
6322 for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
6323 if (buf[i] < ' ') { buf[i] = '_'; }
6324 }
6325 buf[sizeof(simd8x64<uint8_t>)] = '\0';
6326 return buf;
6327 }
6328
format_mask(uint64_t mask)6329 simdjson_unused static char * format_mask(uint64_t mask) {
6330 static char buf[sizeof(simd8x64<uint8_t>) + 1];
6331 for (size_t i=0; i<64; i++) {
6332 buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
6333 }
6334 buf[64] = '\0';
6335 return buf;
6336 }
6337
6338 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)6339 simdjson_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
6340
6341 template<size_t STEP_SIZE>
block_index()6342 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
6343
6344 template<size_t STEP_SIZE>
has_full_block() const6345 simdjson_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
6346 return idx < lenminusstep;
6347 }
6348
6349 template<size_t STEP_SIZE>
full_block() const6350 simdjson_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
6351 return &buf[idx];
6352 }
6353
6354 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const6355 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
6356 if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
6357 std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
6358 std::memcpy(dst, buf + idx, len - idx);
6359 return len - idx;
6360 }
6361
6362 template<size_t STEP_SIZE>
advance()6363 simdjson_really_inline void buf_block_reader<STEP_SIZE>::advance() {
6364 idx += STEP_SIZE;
6365 }
6366
6367 } // unnamed namespace
6368 } // namespace haswell
6369 } // namespace simdjson
6370 /* end file src/generic/stage1/buf_block_reader.h */
6371 /* begin file src/generic/stage1/json_string_scanner.h */
6372 namespace simdjson {
6373 namespace haswell {
6374 namespace {
6375 namespace stage1 {
6376
6377 struct json_string_block {
6378 // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_string_blocksimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6379 simdjson_really_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) :
6380 _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {}
6381
6382 // Escaped characters (characters following an escape() character)
escapedsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6383 simdjson_really_inline uint64_t escaped() const { return _escaped; }
6384 // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
escapesimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6385 simdjson_really_inline uint64_t escape() const { return _backslash & ~_escaped; }
6386 // Real (non-backslashed) quotes
quotesimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6387 simdjson_really_inline uint64_t quote() const { return _quote; }
6388 // Start quotes of strings
string_startsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6389 simdjson_really_inline uint64_t string_start() const { return _quote & _in_string; }
6390 // End quotes of strings
string_endsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6391 simdjson_really_inline uint64_t string_end() const { return _quote & ~_in_string; }
6392 // Only characters inside the string (not including the quotes)
string_contentsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6393 simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
6394 // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_inside_stringsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6395 simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
6396 // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_outside_stringsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6397 simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
6398 // Tail of string (everything except the start quote)
string_tailsimdjson::haswell::__anon9bb6be6f1a11::stage1::json_string_block6399 simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
6400
6401 // backslash characters
6402 uint64_t _backslash;
6403 // escaped characters (backslashed--does not include the hex characters after \u)
6404 uint64_t _escaped;
6405 // real quotes (non-backslashed ones)
6406 uint64_t _quote;
6407 // string characters (includes start quote but not end quote)
6408 uint64_t _in_string;
6409 };
6410
6411 // Scans blocks for string characters, storing the state necessary to do so
6412 class json_string_scanner {
6413 public:
6414 simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
6415 // Returns either UNCLOSED_STRING or SUCCESS
6416 simdjson_really_inline error_code finish();
6417
6418 private:
6419 // Intended to be defined by the implementation
6420 simdjson_really_inline uint64_t find_escaped(uint64_t escape);
6421 simdjson_really_inline uint64_t find_escaped_branchless(uint64_t escape);
6422
6423 // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
6424 uint64_t prev_in_string = 0ULL;
6425 // Whether the first character of the next iteration is escaped.
6426 uint64_t prev_escaped = 0ULL;
6427 };
6428
6429 //
6430 // Finds escaped characters (characters following \).
6431 //
6432 // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
6433 //
6434 // Does this by:
6435 // - Shift the escape mask to get potentially escaped characters (characters after backslashes).
6436 // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
6437 // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
6438 //
6439 // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
6440 // escape sequences, filters out the ones that start on even bits, and adds that to the mask of
6441 // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
6442 // the start bit causes a carry), and leaves even-bit sequences alone.
6443 //
6444 // Example:
6445 //
6446 // text | \\\ | \\\"\\\" \\\" \\"\\" |
6447 // escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape
6448 // odd_starts | x | x x x | escape & ~even_bits & ~follows_escape
6449 // even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later
6450 // invert_mask | | cxxx c xx c| even_seq << 1
6451 // follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit
6452 // escaped | x | x x x x x x x x |
6453 // desired | x | x x x x x x x x |
6454 // text | \\\ | \\\"\\\" \\\" \\"\\" |
6455 //
find_escaped_branchless(uint64_t backslash)6456 simdjson_really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) {
6457 // If there was overflow, pretend the first character isn't a backslash
6458 backslash &= ~prev_escaped;
6459 uint64_t follows_escape = backslash << 1 | prev_escaped;
6460
6461 // Get sequences starting on even bits by clearing out the odd series using +
6462 const uint64_t even_bits = 0x5555555555555555ULL;
6463 uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
6464 uint64_t sequences_starting_on_even_bits;
6465 prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
6466 uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
6467
6468 // Mask every other backslashed character as an escaped character
6469 // Flip the mask for sequences that start on even bits, to correct them
6470 return (even_bits ^ invert_mask) & follows_escape;
6471 }
6472
6473 //
6474 // Return a mask of all string characters plus end quotes.
6475 //
6476 // prev_escaped is overflow saying whether the next character is escaped.
6477 // prev_in_string is overflow saying whether we're still in a string.
6478 //
6479 // Backslash sequences outside of quotes will be detected in stage 2.
6480 //
next(const simd::simd8x64<uint8_t> & in)6481 simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
6482 const uint64_t backslash = in.eq('\\');
6483 const uint64_t escaped = find_escaped(backslash);
6484 const uint64_t quote = in.eq('"') & ~escaped;
6485
6486 //
6487 // prefix_xor flips on bits inside the string (and flips off the end quote).
6488 //
6489 // Then we xor with prev_in_string: if we were in a string already, its effect is flipped
6490 // (characters inside strings are outside, and characters outside strings are inside).
6491 //
6492 const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
6493
6494 //
6495 // Check if we're still in a string at the end of the box so the next block will know
6496 //
6497 // right shift of a signed value expected to be well-defined and standard
6498 // compliant as of C++20, John Regher from Utah U. says this is fine code
6499 //
6500 prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
6501
6502 // Use ^ to turn the beginning quote off, and the end quote on.
6503
6504 // We are returning a function-local object so either we get a move constructor
6505 // or we get copy elision.
6506 return json_string_block(
6507 backslash,
6508 escaped,
6509 quote,
6510 in_string
6511 );
6512 }
6513
finish()6514 simdjson_really_inline error_code json_string_scanner::finish() {
6515 if (prev_in_string) {
6516 return UNCLOSED_STRING;
6517 }
6518 return SUCCESS;
6519 }
6520
6521 } // namespace stage1
6522 } // unnamed namespace
6523 } // namespace haswell
6524 } // namespace simdjson
6525 /* end file src/generic/stage1/json_string_scanner.h */
6526 /* begin file src/generic/stage1/json_scanner.h */
6527 namespace simdjson {
6528 namespace haswell {
6529 namespace {
6530 namespace stage1 {
6531
6532 /**
6533 * A block of scanned json, with information on operators and scalars.
6534 *
6535 * We seek to identify pseudo-structural characters. Anything that is inside
6536 * a string must be omitted (hence & ~_string.string_tail()).
6537 * Otherwise, pseudo-structural characters come in two forms.
6538 * 1. We have the structural characters ([,],{,},:, comma). The
6539 * term 'structural character' is from the JSON RFC.
6540 * 2. We have the 'scalar pseudo-structural characters'.
6541 * Scalars are quotes, and any character except structural characters and white space.
6542 *
6543 * To identify the scalar pseudo-structural characters, we must look at what comes
6544 * before them: it must be a space, a quote or a structural character.
6545 * Starting with simdjson v0.3, we identify them by
6546 * negation: we identify everything that is followed by a non-quote scalar,
6547 * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
6548 */
6549 struct json_block {
6550 public:
6551 // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_blocksimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6552 simdjson_really_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
6553 _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
json_blocksimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6554 simdjson_really_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
6555 _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
6556
6557 /**
6558 * The start of structurals.
6559 * In simdjson prior to v0.3, these were called the pseudo-structural characters.
6560 **/
structural_startsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6561 simdjson_really_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
6562 /** All JSON whitespace (i.e. not in a string) */
whitespacesimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6563 simdjson_really_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
6564
6565 // Helpers
6566
6567 /** Whether the given characters are inside a string (only works on non-quotes) */
non_quote_inside_stringsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6568 simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
6569 /** Whether the given characters are outside a string (only works on non-quotes) */
non_quote_outside_stringsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6570 simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
6571
6572 // string and escape characters
6573 json_string_block _string;
6574 // whitespace, structural characters ('operators'), scalars
6575 json_character_block _characters;
6576 // whether the previous character was a scalar
6577 uint64_t _follows_potential_nonquote_scalar;
6578 private:
6579 // Potential structurals (i.e. disregarding strings)
6580
6581 /**
6582 * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
6583 * They may reside inside a string.
6584 **/
potential_structural_startsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6585 simdjson_really_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
6586 /**
6587 * The start of non-operator runs, like 123, true and "abc".
6588 * It main reside inside a string.
6589 **/
potential_scalar_startsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6590 simdjson_really_inline uint64_t potential_scalar_start() const noexcept {
6591 // The term "scalar" refers to anything except structural characters and white space
6592 // (so letters, numbers, quotes).
6593 // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
6594 // then we know that it is irrelevant structurally.
6595 return _characters.scalar() & ~follows_potential_scalar();
6596 }
6597 /**
6598 * Whether the given character is immediately after a non-operator like 123, true.
6599 * The characters following a quote are not included.
6600 */
follows_potential_scalarsimdjson::haswell::__anon9bb6be6f1b11::stage1::json_block6601 simdjson_really_inline uint64_t follows_potential_scalar() const noexcept {
6602 // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
6603 // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
6604 // white space.
6605 // It is understood that within quoted region, anything at all could be marked (irrelevant).
6606 return _follows_potential_nonquote_scalar;
6607 }
6608 };
6609
6610 /**
6611 * Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
6612 *
6613 * The scanner starts by calculating two distinct things:
6614 * - string characters (taking \" into account)
6615 * - structural characters or 'operators' ([]{},:, comma)
6616 * and scalars (runs of non-operators like 123, true and "abc")
6617 *
6618 * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
6619 * in particular, the operator/scalar bit will find plenty of things that are actually part of
6620 * strings. When we're done, json_block will fuse the two together by masking out tokens that are
6621 * part of a string.
6622 */
6623 class json_scanner {
6624 public:
json_scanner()6625 json_scanner() {}
6626 simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
6627 // Returns either UNCLOSED_STRING or SUCCESS
6628 simdjson_really_inline error_code finish();
6629
6630 private:
6631 // Whether the last character of the previous iteration is part of a scalar token
6632 // (anything except whitespace or a structural character/'operator').
6633 uint64_t prev_scalar = 0ULL;
6634 json_string_scanner string_scanner{};
6635 };
6636
6637
6638 //
6639 // Check if the current character immediately follows a matching character.
6640 //
6641 // For example, this checks for quotes with backslashes in front of them:
6642 //
6643 // const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
6644 //
follows(const uint64_t match,uint64_t & overflow)6645 simdjson_really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
6646 const uint64_t result = match << 1 | overflow;
6647 overflow = match >> 63;
6648 return result;
6649 }
6650
next(const simd::simd8x64<uint8_t> & in)6651 simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
6652 json_string_block strings = string_scanner.next(in);
6653 // identifies the white-space and the structurat characters
6654 json_character_block characters = json_character_block::classify(in);
6655 // The term "scalar" refers to anything except structural characters and white space
6656 // (so letters, numbers, quotes).
6657 // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
6658 //
6659 // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
6660 // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
6661 // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we
6662 // may need to add an extra check when parsing strings.
6663 //
6664 // Performance: there are many ways to skin this cat.
6665 const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote();
6666 uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar);
6667 // We are returning a function-local object so either we get a move constructor
6668 // or we get copy elision.
6669 return json_block(
6670 strings,// strings is a function-local object so either it moves or the copy is elided.
6671 characters,
6672 follows_nonquote_scalar
6673 );
6674 }
6675
finish()6676 simdjson_really_inline error_code json_scanner::finish() {
6677 return string_scanner.finish();
6678 }
6679
6680 } // namespace stage1
6681 } // unnamed namespace
6682 } // namespace haswell
6683 } // namespace simdjson
6684 /* end file src/generic/stage1/json_scanner.h */
6685 /* begin file src/generic/stage1/json_minifier.h */
6686 // This file contains the common code every implementation uses in stage1
6687 // It is intended to be included multiple times and compiled multiple times
6688 // We assume the file in which it is included already includes
6689 // "simdjson/stage1.h" (this simplifies amalgamation)
6690
6691 namespace simdjson {
6692 namespace haswell {
6693 namespace {
6694 namespace stage1 {
6695
6696 class json_minifier {
6697 public:
6698 template<size_t STEP_SIZE>
6699 static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
6700
6701 private:
json_minifier(uint8_t * _dst)6702 simdjson_really_inline json_minifier(uint8_t *_dst)
6703 : dst{_dst}
6704 {}
6705 template<size_t STEP_SIZE>
6706 simdjson_really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
6707 simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
6708 simdjson_really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
6709 json_scanner scanner{};
6710 uint8_t *dst;
6711 };
6712
next(const simd::simd8x64<uint8_t> & in,const json_block & block)6713 simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
6714 uint64_t mask = block.whitespace();
6715 in.compress(mask, dst);
6716 dst += 64 - count_ones(mask);
6717 }
6718
finish(uint8_t * dst_start,size_t & dst_len)6719 simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
6720 error_code error = scanner.finish();
6721 if (error) { dst_len = 0; return error; }
6722 dst_len = dst - dst_start;
6723 return SUCCESS;
6724 }
6725
6726 template<>
step(const uint8_t * block_buf,buf_block_reader<128> & reader)6727 simdjson_really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
6728 simd::simd8x64<uint8_t> in_1(block_buf);
6729 simd::simd8x64<uint8_t> in_2(block_buf+64);
6730 json_block block_1 = scanner.next(in_1);
6731 json_block block_2 = scanner.next(in_2);
6732 this->next(in_1, block_1);
6733 this->next(in_2, block_2);
6734 reader.advance();
6735 }
6736
6737 template<>
step(const uint8_t * block_buf,buf_block_reader<64> & reader)6738 simdjson_really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
6739 simd::simd8x64<uint8_t> in_1(block_buf);
6740 json_block block_1 = scanner.next(in_1);
6741 this->next(block_buf, block_1);
6742 reader.advance();
6743 }
6744
6745 template<size_t STEP_SIZE>
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len)6746 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
6747 buf_block_reader<STEP_SIZE> reader(buf, len);
6748 json_minifier minifier(dst);
6749
6750 // Index the first n-1 blocks
6751 while (reader.has_full_block()) {
6752 minifier.step<STEP_SIZE>(reader.full_block(), reader);
6753 }
6754
6755 // Index the last (remainder) block, padded with spaces
6756 uint8_t block[STEP_SIZE];
6757 size_t remaining_bytes = reader.get_remainder(block);
6758 if (remaining_bytes > 0) {
6759 // We do not want to write directly to the output stream. Rather, we write
6760 // to a local buffer (for safety).
6761 uint8_t out_block[STEP_SIZE];
6762 uint8_t * const guarded_dst{minifier.dst};
6763 minifier.dst = out_block;
6764 minifier.step<STEP_SIZE>(block, reader);
6765 size_t to_write = minifier.dst - out_block;
6766 // In some cases, we could be enticed to consider the padded spaces
6767 // as part of the string. This is fine as long as we do not write more
6768 // than we consumed.
6769 if(to_write > remaining_bytes) { to_write = remaining_bytes; }
6770 memcpy(guarded_dst, out_block, to_write);
6771 minifier.dst = guarded_dst + to_write;
6772 }
6773 return minifier.finish(dst, dst_len);
6774 }
6775
6776 } // namespace stage1
6777 } // unnamed namespace
6778 } // namespace haswell
6779 } // namespace simdjson
6780 /* end file src/generic/stage1/json_minifier.h */
6781 /* begin file src/generic/stage1/find_next_document_index.h */
6782 namespace simdjson {
6783 namespace haswell {
6784 namespace {
6785
6786 /**
6787 * This algorithm is used to quickly identify the last structural position that
6788 * makes up a complete document.
6789 *
6790 * It does this by going backwards and finding the last *document boundary* (a
6791 * place where one value follows another without a comma between them). If the
6792 * last document (the characters after the boundary) has an equal number of
6793 * start and end brackets, it is considered complete.
6794 *
6795 * Simply put, we iterate over the structural characters, starting from
6796 * the end. We consider that we found the end of a JSON document when the
6797 * first element of the pair is NOT one of these characters: '{' '[' ';' ','
6798 * and when the second element is NOT one of these characters: '}' '}' ';' ','.
6799 *
6800 * This simple comparison works most of the time, but it does not cover cases
6801 * where the batch's structural indexes contain a perfect amount of documents.
6802 * In such a case, we do not have access to the structural index which follows
6803 * the last document, therefore, we do not have access to the second element in
6804 * the pair, and that means we cannot identify the last document. To fix this
6805 * issue, we keep a count of the open and closed curly/square braces we found
6806 * while searching for the pair. When we find a pair AND the count of open and
6807 * closed curly/square braces is the same, we know that we just passed a
6808 * complete document, therefore the last json buffer location is the end of the
6809 * batch.
6810 */
find_next_document_index(dom_parser_implementation & parser)6811 simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
6812 // TODO don't count separately, just figure out depth
6813 auto arr_cnt = 0;
6814 auto obj_cnt = 0;
6815 for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
6816 auto idxb = parser.structural_indexes[i];
6817 switch (parser.buf[idxb]) {
6818 case ':':
6819 case ',':
6820 continue;
6821 case '}':
6822 obj_cnt--;
6823 continue;
6824 case ']':
6825 arr_cnt--;
6826 continue;
6827 case '{':
6828 obj_cnt++;
6829 break;
6830 case '[':
6831 arr_cnt++;
6832 break;
6833 }
6834 auto idxa = parser.structural_indexes[i - 1];
6835 switch (parser.buf[idxa]) {
6836 case '{':
6837 case '[':
6838 case ':':
6839 case ',':
6840 continue;
6841 }
6842 // Last document is complete, so the next document will appear after!
6843 if (!arr_cnt && !obj_cnt) {
6844 return parser.n_structural_indexes;
6845 }
6846 // Last document is incomplete; mark the document at i + 1 as the next one
6847 return i;
6848 }
6849 return 0;
6850 }
6851
6852 } // unnamed namespace
6853 } // namespace haswell
6854 } // namespace simdjson
6855 /* end file src/generic/stage1/find_next_document_index.h */
6856
6857 namespace simdjson {
6858 namespace haswell {
6859 namespace {
6860 namespace stage1 {
6861
6862 class bit_indexer {
6863 public:
6864 uint32_t *tail;
6865
bit_indexer(uint32_t * index_buf)6866 simdjson_really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
6867
6868 // flatten out values in 'bits' assuming that they are are to have values of idx
6869 // plus their position in the bitvector, and store these indexes at
6870 // base_ptr[base] incrementing base as we go
6871 // will potentially store extra values beyond end of valid bits, so base_ptr
6872 // needs to be large enough to handle this
write(uint32_t idx,uint64_t bits)6873 simdjson_really_inline void write(uint32_t idx, uint64_t bits) {
6874 // In some instances, the next branch is expensive because it is mispredicted.
6875 // Unfortunately, in other cases,
6876 // it helps tremendously.
6877 if (bits == 0)
6878 return;
6879 int cnt = static_cast<int>(count_ones(bits));
6880
6881 // Do the first 8 all together
6882 for (int i=0; i<8; i++) {
6883 this->tail[i] = idx + trailing_zeroes(bits);
6884 bits = clear_lowest_bit(bits);
6885 }
6886
6887 // Do the next 8 all together (we hope in most cases it won't happen at all
6888 // and the branch is easily predicted).
6889 if (simdjson_unlikely(cnt > 8)) {
6890 for (int i=8; i<16; i++) {
6891 this->tail[i] = idx + trailing_zeroes(bits);
6892 bits = clear_lowest_bit(bits);
6893 }
6894
6895 // Most files don't have 16+ structurals per block, so we take several basically guaranteed
6896 // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
6897 // or the start of a value ("abc" true 123) every four characters.
6898 if (simdjson_unlikely(cnt > 16)) {
6899 int i = 16;
6900 do {
6901 this->tail[i] = idx + trailing_zeroes(bits);
6902 bits = clear_lowest_bit(bits);
6903 i++;
6904 } while (i < cnt);
6905 }
6906 }
6907
6908 this->tail += cnt;
6909 }
6910 };
6911
6912 class json_structural_indexer {
6913 public:
6914 /**
6915 * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
6916 *
6917 * @param partial Setting the partial parameter to true allows the find_structural_bits to
6918 * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
6919 * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
6920 */
6921 template<size_t STEP_SIZE>
6922 static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
6923
6924 private:
6925 simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes);
6926 template<size_t STEP_SIZE>
6927 simdjson_really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
6928 simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
6929 simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
6930
6931 json_scanner scanner{};
6932 utf8_checker checker{};
6933 bit_indexer indexer;
6934 uint64_t prev_structurals = 0;
6935 uint64_t unescaped_chars_error = 0;
6936 };
6937
json_structural_indexer(uint32_t * structural_indexes)6938 simdjson_really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
6939
6940 // Skip the last character if it is partial
trim_partial_utf8(const uint8_t * buf,size_t len)6941 simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
6942 if (simdjson_unlikely(len < 3)) {
6943 switch (len) {
6944 case 2:
6945 if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
6946 if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
6947 return len;
6948 case 1:
6949 if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
6950 return len;
6951 case 0:
6952 return len;
6953 }
6954 }
6955 if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
6956 if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
6957 if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
6958 return len;
6959 }
6960
6961 //
6962 // PERF NOTES:
6963 // We pipe 2 inputs through these stages:
6964 // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
6965 // 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
6966 // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
6967 // The output of step 1 depends entirely on this information. These functions don't quite use
6968 // up enough CPU: the second half of the functions is highly serial, only using 1 execution core
6969 // at a time. The second input's scans has some dependency on the first ones finishing it, but
6970 // they can make a lot of progress before they need that information.
6971 // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
6972 // to finish: utf-8 checks and generating the output from the last iteration.
6973 //
6974 // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
6975 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
6976 // workout.
6977 //
6978 template<size_t STEP_SIZE>
index(const uint8_t * buf,size_t len,dom_parser_implementation & parser,bool partial)6979 error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
6980 if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
6981 if (partial) { len = trim_partial_utf8(buf, len); }
6982
6983 buf_block_reader<STEP_SIZE> reader(buf, len);
6984 json_structural_indexer indexer(parser.structural_indexes.get());
6985
6986 // Read all but the last block
6987 while (reader.has_full_block()) {
6988 indexer.step<STEP_SIZE>(reader.full_block(), reader);
6989 }
6990
6991 // Take care of the last block (will always be there unless file is empty)
6992 uint8_t block[STEP_SIZE];
6993 if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
6994 indexer.step<STEP_SIZE>(block, reader);
6995
6996 return indexer.finish(parser, reader.block_index(), len, partial);
6997 }
6998
6999 template<>
step(const uint8_t * block,buf_block_reader<128> & reader)7000 simdjson_really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
7001 simd::simd8x64<uint8_t> in_1(block);
7002 simd::simd8x64<uint8_t> in_2(block+64);
7003 json_block block_1 = scanner.next(in_1);
7004 json_block block_2 = scanner.next(in_2);
7005 this->next(in_1, block_1, reader.block_index());
7006 this->next(in_2, block_2, reader.block_index()+64);
7007 reader.advance();
7008 }
7009
7010 template<>
step(const uint8_t * block,buf_block_reader<64> & reader)7011 simdjson_really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
7012 simd::simd8x64<uint8_t> in_1(block);
7013 json_block block_1 = scanner.next(in_1);
7014 this->next(in_1, block_1, reader.block_index());
7015 reader.advance();
7016 }
7017
next(const simd::simd8x64<uint8_t> & in,const json_block & block,size_t idx)7018 simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
7019 uint64_t unescaped = in.lteq(0x1F);
7020 checker.check_next_input(in);
7021 indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
7022 prev_structurals = block.structural_start();
7023 unescaped_chars_error |= block.non_quote_inside_string(unescaped);
7024 }
7025
/**
 * Completes stage 1: flushes the pending structurals, checks the accumulated
 * errors and publishes the structural index count on the parser.
 *
 * @param parser  parser whose structural_indexes / n_structural_indexes /
 *                next_structural_index fields are filled in.
 * @param idx     block index one past the last processed block (the pending
 *                structurals are written at idx-64).
 * @param len     length of the input; stored in the sentinel index slots.
 * @param partial when true, an UNCLOSED_STRING scanner error is tolerated and
 *                the index list is trimmed via find_next_document_index().
 * @return the scanner error (if not tolerated), UNESCAPED_CHARS, EMPTY,
 *         UNEXPECTED_ERROR, CAPACITY, or otherwise whatever checker.errors()
 *         reports after check_eof().
 */
simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
  // Write out the final iteration's structurals
  indexer.write(uint32_t(idx-64), prev_structurals);

  error_code error = scanner.finish();
  // We deliberately break down the next expression so that it is
  // human readable.
  const bool should_we_exit = partial ?
    ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
    : (error != SUCCESS); // if partial is false, we must have SUCCESS
  const bool have_unclosed_string = (error == UNCLOSED_STRING);
  if (simdjson_unlikely(should_we_exit)) { return error; }

  // Set by next() whenever a block reported a non-quote character inside a string.
  if (unescaped_chars_error) {
    return UNESCAPED_CHARS;
  }

  // Number of indexes written so far = distance from the start of the index array to the write cursor.
  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
  /***
   * This is related to https://github.com/simdjson/simdjson/issues/906
   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
   * structural character, it quickly stops.
   * Only three structural characters can be repeated without triggering an error in JSON: [, ] and }.
   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
   * continues, then it must be [, ] or }.
   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
   * It can't be a comma, a colon or any simple value. So the only way we could continue is
   * if the repeated character is [. But if so, the document must start with [. But if the document
   * starts with [, it should end with ]. If we enforce that rule, then we would get
   * ][[ which is invalid.
   **/
  // Three sentinel slots are written past the last real index: len, len, 0.
  parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
  parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
  parser.next_structural_index = 0;
  // a valid JSON file cannot have zero structural indexes - we should have found something
  if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
    return EMPTY;
  }
  // The last real structural index must not point past the end of the input.
  if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
    return UNEXPECTED_ERROR;
  }
  if (partial) {
    // If we have an unclosed string, then the last structural
    // will be the quote and we want to make sure to omit it.
    if(have_unclosed_string) {
      parser.n_structural_indexes--;
      // If dropping the quote leaves no structural indexes at all, report CAPACITY.
      if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
    }
    auto new_structural_indexes = find_next_document_index(parser);
    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
    }
    parser.n_structural_indexes = new_structural_indexes;
  }
  checker.check_eof();
  return checker.errors();
}
7086
7087 } // namespace stage1
7088 } // unnamed namespace
7089 } // namespace haswell
7090 } // namespace simdjson
7091 /* end file src/generic/stage1/json_structural_indexer.h */
7092 /* begin file src/generic/stage1/utf8_validator.h */
7093 namespace simdjson {
7094 namespace haswell {
7095 namespace {
7096 namespace stage1 {
7097
7098 /**
7099 * Validates that the string is actual UTF-8.
7100 */
7101 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)7102 bool generic_validate_utf8(const uint8_t * input, size_t length) {
7103 checker c{};
7104 buf_block_reader<64> reader(input, length);
7105 while (reader.has_full_block()) {
7106 simd::simd8x64<uint8_t> in(reader.full_block());
7107 c.check_next_input(in);
7108 reader.advance();
7109 }
7110 uint8_t block[64]{};
7111 reader.get_remainder(block);
7112 simd::simd8x64<uint8_t> in(block);
7113 c.check_next_input(in);
7114 reader.advance();
7115 c.check_eof();
7116 return c.errors() == error_code::SUCCESS;
7117 }
7118
generic_validate_utf8(const char * input,size_t length)7119 bool generic_validate_utf8(const char * input, size_t length) {
7120 return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
7121 }
7122
7123 } // namespace stage1
7124 } // unnamed namespace
7125 } // namespace haswell
7126 } // namespace simdjson
7127 /* end file src/generic/stage1/utf8_validator.h */
7128
7129 //
7130 // Stage 2
7131 //
7132 /* begin file src/generic/stage2/tape_builder.h */
7133 /* begin file src/generic/stage2/json_iterator.h */
7134 /* begin file src/generic/stage2/logger.h */
// This is an internal-only logger specific to stage 2.
// Build with SIMDJSON_VERBOSE_LOGGING defined to a nonzero value (which makes LOG_ENABLED true) to log what stage 2 is doing!
7137 namespace simdjson {
7138 namespace haswell {
7139 namespace {
7140 namespace logger {
7141
7142 static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
7143
// Compile-time switch for all stage 2 logging: every logger function below
// wraps its body in `if (LOG_ENABLED)`.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr const bool LOG_ENABLED = true;
#else
static constexpr const bool LOG_ENABLED = false;
#endif
// Column widths (in characters) of the log table.
static constexpr const int LOG_EVENT_LEN = 20;        // "Event" column
static constexpr const int LOG_BUFFER_LEN = 30;       // "Buffer" column
static constexpr const int LOG_SMALL_BUFFER_LEN = 10; // "Next" column
static constexpr const int LOG_INDEX_LEN = 5;         // "Next#" column

// Current nesting depth; log_line() indents titles by two columns per level.
static int log_depth; // Not threadsafe. Log only.
7155
7156 // Helper to turn unprintable or newline characters into spaces
printable_char(char c)7157 static simdjson_really_inline char printable_char(char c) {
7158 if (c >= 0x20) {
7159 return c;
7160 } else {
7161 return ' ';
7162 }
7163 }
7164
7165 // Print the header and set up log_start
log_start()7166 static simdjson_really_inline void log_start() {
7167 if (LOG_ENABLED) {
7168 log_depth = 0;
7169 printf("\n");
7170 printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
7171 printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
7172 }
7173 }
7174
// Prints `message` followed by a newline when LOG_ENABLED is true; otherwise does nothing.
simdjson_unused static simdjson_really_inline void log_string(const char *message) {
  if (LOG_ENABLED) {
    printf("%s\n", message);
  }
}
7180
7181 // Logs a single line from the stage 2 DOM parser
7182 template<typename S>
log_line(S & structurals,const char * title_prefix,const char * title,const char * detail)7183 static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
7184 if (LOG_ENABLED) {
7185 printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
7186 auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
7187 auto next_index = structurals.next_structural;
7188 auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
7189 auto next = &structurals.buf[*next_index];
7190 {
7191 // Print the next N characters in the buffer.
7192 printf("| ");
7193 // Otherwise, print the characters starting from the buffer position.
7194 // Print spaces for unprintable or newline characters.
7195 for (int i=0;i<LOG_BUFFER_LEN;i++) {
7196 printf("%c", printable_char(current[i]));
7197 }
7198 printf(" ");
7199 // Print the next N characters in the buffer.
7200 printf("| ");
7201 // Otherwise, print the characters starting from the buffer position.
7202 // Print spaces for unprintable or newline characters.
7203 for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
7204 printf("%c", printable_char(next[i]));
7205 }
7206 printf(" ");
7207 }
7208 if (current_index) {
7209 printf("| %*u ", LOG_INDEX_LEN, *current_index);
7210 } else {
7211 printf("| %-*s ", LOG_INDEX_LEN, "");
7212 }
7213 // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
7214 printf("| %-s ", detail);
7215 printf("|\n");
7216 }
7217 }
7218
7219 } // namespace logger
7220 } // unnamed namespace
7221 } // namespace haswell
7222 } // namespace simdjson
7223 /* end file src/generic/stage2/logger.h */
7224
7225 namespace simdjson {
7226 namespace haswell {
7227 namespace {
7228 namespace stage2 {
7229
/**
 * Stage 2 cursor over the structural indexes produced by stage 1.
 *
 * walk_document() drives a visitor over the tokens; peek()/advance() expose
 * the token at the cursor as a pointer into buf.
 */
class json_iterator {
public:
  /** The JSON input buffer; structural indexes are offsets into it. */
  const uint8_t* const buf;
  /** Cursor into the parser's structural index array: the next token to be returned by advance(). */
  uint32_t *next_structural;
  /** The parser that owns the structural indexes and the is_array scope stack. */
  dom_parser_implementation &dom_parser;
  /** Current container nesting depth; 0 at the document root. */
  uint32_t depth{0};

  /**
   * Walk the JSON document.
   *
   * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
   * the first parameter; some callbacks have other parameters as well:
   *
   * - visit_document_start() - at the beginning.
   * - visit_document_end() - at the end (if things were successful).
   *
   * - visit_array_start() - at the start `[` of a non-empty array.
   * - visit_array_end() - at the end `]` of a non-empty array.
   * - visit_empty_array() - when an empty array is encountered.
   *
   * - visit_object_start() - at the start `{` of a non-empty object.
   * - visit_object_end() - at the end `}` of a non-empty object.
   * - visit_empty_object() - when an empty object is encountered.
   * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
   *                                   guaranteed to point at the first quote of the string (`"key"`).
   * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
   * - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
   *
   * - increment_count(iter) - each time a value is found in an array or object.
   */
  template<bool STREAMING, typename V>
  simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;

  /**
   * Create an iterator capable of walking a JSON document.
   *
   * The document must have already passed through stage 1.
   */
  simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);

  /**
   * Look at the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *peek() const noexcept;
  /**
   * Advance to the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *advance() noexcept;
  /**
   * Get the remaining length of the document, from the start of the current token.
   */
  simdjson_really_inline size_t remaining_len() const noexcept;
  /**
   * Check if we are at the end of the document.
   *
   * If this is true, there are no more tokens.
   */
  simdjson_really_inline bool at_eof() const noexcept;
  /**
   * Check if we are at the beginning of the document.
   */
  simdjson_really_inline bool at_beginning() const noexcept;
  /**
   * Get the byte in buf at the last structural index recorded by stage 1.
   */
  simdjson_really_inline uint8_t last_structural() const noexcept;

  /**
   * Log that a value has been found.
   *
   * Build with SIMDJSON_VERBOSE_LOGGING (logger::LOG_ENABLED) to see logging.
   */
  simdjson_really_inline void log_value(const char *type) const noexcept;
  /**
   * Log the start of a multipart value.
   *
   * Build with SIMDJSON_VERBOSE_LOGGING (logger::LOG_ENABLED) to see logging.
   */
  simdjson_really_inline void log_start_value(const char *type) const noexcept;
  /**
   * Log the end of a multipart value.
   *
   * Build with SIMDJSON_VERBOSE_LOGGING (logger::LOG_ENABLED) to see logging.
   */
  simdjson_really_inline void log_end_value(const char *type) const noexcept;
  /**
   * Log an error.
   *
   * Build with SIMDJSON_VERBOSE_LOGGING (logger::LOG_ENABLED) to see logging.
   */
  simdjson_really_inline void log_error(const char *error) const noexcept;

  /** Dispatch a top-level scalar to the matching visitor.visit_root_*() callback based on its first byte. */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
  /** Dispatch a scalar inside an array or object to the matching visitor.visit_*() callback based on its first byte. */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
};
7332
/**
 * Walks the structural indexes and calls back into `visitor` for every value.
 *
 * Implemented as a goto-driven state machine (object_begin, object_field,
 * object_continue, scope_end, array_begin, array_value, array_continue,
 * document_end). The kind of each enclosing container is remembered in
 * dom_parser.is_array[depth] so that scope_end knows which "continue" state to
 * resume.
 *
 * @tparam STREAMING when false, the outermost container must be closed by the
 *         last structural, and every structural index must be consumed.
 * @return EMPTY when there are no tokens, DEPTH_ERROR when nesting reaches
 *         dom_parser.max_depth(), TAPE_ERROR on malformed structure, any error
 *         returned by a visitor callback, or SUCCESS.
 */
template<bool STREAMING, typename V>
simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
  logger::log_start();

  //
  // Start the document
  //
  if (at_eof()) { return EMPTY; }
  log_start_value("document");
  SIMDJSON_TRY( visitor.visit_document_start(*this) );

  //
  // Read first value
  //
  {
    auto value = advance();

    // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
    // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
    if (!STREAMING) {
      switch (*value) {
        case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
        case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
      }
    }

    // Empty containers are handled inline; non-empty ones jump into the state machine.
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
    }
  }
  goto document_end;

//
// Object parser states
//
object_begin:
  log_start_value("object");
  depth++;
  if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
  dom_parser.is_array[depth] = false;
  SIMDJSON_TRY( visitor.visit_object_start(*this) );

  {
    // A non-empty object must start with a string key.
    auto key = advance();
    if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
    SIMDJSON_TRY( visitor.increment_count(*this) );
    SIMDJSON_TRY( visitor.visit_key(*this, key) );
  }

object_field:
  if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
  {
    auto value = advance();
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
    }
  }

object_continue:
  switch (*advance()) {
    case ',':
      SIMDJSON_TRY( visitor.increment_count(*this) );
      {
        auto key = advance();
        if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
        SIMDJSON_TRY( visitor.visit_key(*this, key) );
      }
      goto object_field;
    case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
    default: log_error("No comma between object fields"); return TAPE_ERROR;
  }

scope_end:
  // Pop one nesting level and resume the enclosing container (or finish at the root).
  depth--;
  if (depth == 0) { goto document_end; }
  if (dom_parser.is_array[depth]) { goto array_continue; }
  goto object_continue;

//
// Array parser states
//
array_begin:
  log_start_value("array");
  depth++;
  if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
  dom_parser.is_array[depth] = true;
  SIMDJSON_TRY( visitor.visit_array_start(*this) );
  SIMDJSON_TRY( visitor.increment_count(*this) );

array_value:
  {
    auto value = advance();
    switch (*value) {
      case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
      case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
      default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
    }
  }

array_continue:
  switch (*advance()) {
    case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
    case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
    default: log_error("Missing comma between array values"); return TAPE_ERROR;
  }

document_end:
  log_end_value("document");
  SIMDJSON_TRY( visitor.visit_document_end(*this) );

  // Record how far we got, as an index into the structural index array.
  dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);

  // If we didn't make it to the end, it's an error
  if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
    log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
    return TAPE_ERROR;
  }

  return SUCCESS;

} // walk_document()
7458
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)7459 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
7460 : buf{_dom_parser.buf},
7461 next_structural{&_dom_parser.structural_indexes[start_structural_index]},
7462 dom_parser{_dom_parser} {
7463 }
7464
peek() const7465 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
7466 return &buf[*(next_structural)];
7467 }
advance()7468 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
7469 return &buf[*(next_structural++)];
7470 }
remaining_len() const7471 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
7472 return dom_parser.len - *(next_structural-1);
7473 }
7474
at_eof() const7475 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
7476 return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
7477 }
at_beginning() const7478 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
7479 return next_structural == dom_parser.structural_indexes.get();
7480 }
last_structural() const7481 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
7482 return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
7483 }
7484
// Logs one line titled `type`, with an empty prefix and empty detail.
simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
  logger::log_line(*this, "", type, "");
}
7488
log_start_value(const char * type) const7489 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
7490 logger::log_line(*this, "+", type, "");
7491 if (logger::LOG_ENABLED) { logger::log_depth++; }
7492 }
7493
log_end_value(const char * type) const7494 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
7495 if (logger::LOG_ENABLED) { logger::log_depth--; }
7496 logger::log_line(*this, "-", type, "");
7497 }
7498
// Logs one line titled "ERROR" with `error` in the detail column.
simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
  logger::log_line(*this, "", "ERROR", error);
}
7502
7503 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)7504 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
7505 switch (*value) {
7506 case '"': return visitor.visit_root_string(*this, value);
7507 case 't': return visitor.visit_root_true_atom(*this, value);
7508 case 'f': return visitor.visit_root_false_atom(*this, value);
7509 case 'n': return visitor.visit_root_null_atom(*this, value);
7510 case '-':
7511 case '0': case '1': case '2': case '3': case '4':
7512 case '5': case '6': case '7': case '8': case '9':
7513 return visitor.visit_root_number(*this, value);
7514 default:
7515 log_error("Document starts with a non-value character");
7516 return TAPE_ERROR;
7517 }
7518 }
7519 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)7520 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
7521 switch (*value) {
7522 case '"': return visitor.visit_string(*this, value);
7523 case 't': return visitor.visit_true_atom(*this, value);
7524 case 'f': return visitor.visit_false_atom(*this, value);
7525 case 'n': return visitor.visit_null_atom(*this, value);
7526 case '-':
7527 case '0': case '1': case '2': case '3': case '4':
7528 case '5': case '6': case '7': case '8': case '9':
7529 return visitor.visit_number(*this, value);
7530 default:
7531 log_error("Non-value found when value was expected!");
7532 return TAPE_ERROR;
7533 }
7534 }
7535
7536 } // namespace stage2
7537 } // unnamed namespace
7538 } // namespace haswell
7539 } // namespace simdjson
7540 /* end file src/generic/stage2/json_iterator.h */
7541 /* begin file src/generic/stage2/tape_writer.h */
7542 namespace simdjson {
7543 namespace haswell {
7544 namespace {
7545 namespace stage2 {
7546
/**
 * Thin cursor over the tape: every append/skip operation moves next_tape_loc
 * forward by the number of 64-bit entries it covers.
 */
struct tape_writer {
  /** The next place to write to tape */
  uint64_t *next_tape_loc;

  /** Write a signed 64-bit value to tape. */
  simdjson_really_inline void append_s64(int64_t value) noexcept;

  /** Write an unsigned 64-bit value to tape. */
  simdjson_really_inline void append_u64(uint64_t value) noexcept;

  /** Write a double value to tape. */
  simdjson_really_inline void append_double(double value) noexcept;

  /**
   * Append a tape entry (an 8-bit type, and 56 bits worth of value).
   */
  simdjson_really_inline void append(uint64_t val, internal::tape_type t) noexcept;

  /**
   * Skip the current tape entry without writing.
   *
   * Used to skip the start of the container, since we'll come back later to fill it in when the
   * container ends.
   */
  simdjson_really_inline void skip() noexcept;

  /**
   * Skip the number of tape entries necessary to write a large u64 or i64.
   */
  simdjson_really_inline void skip_large_integer() noexcept;

  /**
   * Skip the number of tape entries necessary to write a double.
   */
  simdjson_really_inline void skip_double() noexcept;

  /**
   * Write a value to a known location on tape.
   *
   * Used to go back and write out the start of a container after the container ends.
   */
  simdjson_really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;

private:
  /**
   * Append both the tape entry, and a supplementary value following it. Used for types that need
   * all 64 bits, such as double and uint64_t.
   */
  template<typename T>
  simdjson_really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
}; // struct tape_writer
7598
/** Write a signed 64-bit value to tape: an INT64 entry followed by the raw 64-bit value (two tape slots). */
simdjson_really_inline void tape_writer::append_s64(int64_t value) noexcept {
  append2(0, value, internal::tape_type::INT64);
}
7602
append_u64(uint64_t value)7603 simdjson_really_inline void tape_writer::append_u64(uint64_t value) noexcept {
7604 append(0, internal::tape_type::UINT64);
7605 *next_tape_loc = value;
7606 next_tape_loc++;
7607 }
7608
/** Write a double value to tape: a DOUBLE entry followed by the raw 64 bits of the value (two tape slots). */
simdjson_really_inline void tape_writer::append_double(double value) noexcept {
  append2(0, value, internal::tape_type::DOUBLE);
}
7613
/** Advance the write cursor by one tape slot without writing anything. */
simdjson_really_inline void tape_writer::skip() noexcept {
  next_tape_loc++;
}
7617
/** Advance the write cursor by two tape slots (the size of an append_s64/append_u64 record) without writing. */
simdjson_really_inline void tape_writer::skip_large_integer() noexcept {
  next_tape_loc += 2;
}
7621
/** Advance the write cursor by two tape slots (the size of an append_double record) without writing. */
simdjson_really_inline void tape_writer::skip_double() noexcept {
  next_tape_loc += 2;
}
7625
append(uint64_t val,internal::tape_type t)7626 simdjson_really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
7627 *next_tape_loc = val | ((uint64_t(char(t))) << 56);
7628 next_tape_loc++;
7629 }
7630
7631 template<typename T>
append2(uint64_t val,T val2,internal::tape_type t)7632 simdjson_really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
7633 append(val, t);
7634 static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
7635 memcpy(next_tape_loc, &val2, sizeof(val2));
7636 next_tape_loc++;
7637 }
7638
write(uint64_t & tape_loc,uint64_t val,internal::tape_type t)7639 simdjson_really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
7640 tape_loc = val | ((uint64_t(char(t))) << 56);
7641 }
7642
7643 } // namespace stage2
7644 } // unnamed namespace
7645 } // namespace haswell
7646 } // namespace simdjson
7647 /* end file src/generic/stage2/tape_writer.h */
7648
7649 namespace simdjson {
7650 namespace haswell {
7651 namespace {
7652 namespace stage2 {
7653
/**
 * Visitor passed to json_iterator::walk_document() by parse_document(); its
 * callbacks write the document to tape through the `tape` writer.
 */
struct tape_builder {
  /** Parse the document indexed in dom_parser into doc by walking it with a tape_builder visitor. */
  template<bool STREAMING>
  simdjson_warn_unused static simdjson_really_inline error_code parse_document(
    dom_parser_implementation &dom_parser,
    dom::document &doc) noexcept;

  /** Called when a non-empty document starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
  /** Called when a non-empty document ends without error. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;

  /** Called when a non-empty array starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
  /** Called when a non-empty array ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
  /** Called when an empty array is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;

  /** Called when a non-empty object starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
  /**
   * Called when a key in a field is encountered.
   *
   * visit_primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
   * will be called after this with the field value.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
  /** Called when a non-empty object ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
  /** Called when an empty object is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;

  /**
   * Called when a string, number, boolean or null is found.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
  /**
   * Called when a string, number, boolean or null is found at the top level of a document (i.e.
   * when there is no array or object and the entire document is a single string, number, boolean or
   * null).
   *
   * This is separate from visit_primitive() because simdjson's normal primitive parsing routines assume
   * there is at least one more token after the value, which is only true in an array or object.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;

  /** Per-type callbacks for values inside an array or object. visit_string is also used for keys (key = true). */
  simdjson_warn_unused simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /** Per-type callbacks for a value that is the whole document. */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /** Called each time a new field or element in an array or object is found. */
  simdjson_warn_unused simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;

  /** Next location to write to tape */
  tape_writer tape;
private:
  /** Next write location in the string buf for stage 2 parsing */
  uint8_t *current_string_buf_loc;

  /** Private: instances are created by parse_document(). */
  simdjson_really_inline tape_builder(dom::document &doc) noexcept;

  simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
  simdjson_really_inline void start_container(json_iterator &iter) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
  simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
}; // struct tape_builder
7730
7731 template<bool STREAMING>
parse_document(dom_parser_implementation & dom_parser,dom::document & doc)7732 simdjson_warn_unused simdjson_really_inline error_code tape_builder::parse_document(
7733 dom_parser_implementation &dom_parser,
7734 dom::document &doc) noexcept {
7735 dom_parser.doc = &doc;
7736 json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
7737 tape_builder builder(doc);
7738 return iter.walk_document<STREAMING>(builder);
7739 }
7740
// Hands a top-level scalar back to the iterator, which dispatches on its first
// byte to the matching visit_root_*() callback of this builder.
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  return iter.visit_root_primitive(*this, value);
}
// Hands a nested scalar back to the iterator, which dispatches on its first
// byte to the matching visit_*() callback of this builder.
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  return iter.visit_primitive(*this, value);
}
// Records an empty object via empty_container() with the START_OBJECT / END_OBJECT tape types.
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
  return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
}
// Records an empty array via empty_container() with the START_ARRAY / END_ARRAY tape types.
simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
  return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
}
7753
visit_document_start(json_iterator & iter)7754 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
7755 start_container(iter);
7756 return SUCCESS;
7757 }
visit_object_start(json_iterator & iter)7758 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
7759 start_container(iter);
7760 return SUCCESS;
7761 }
visit_array_start(json_iterator & iter)7762 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
7763 start_container(iter);
7764 return SUCCESS;
7765 }
7766
visit_object_end(json_iterator & iter)7767 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
7768 return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
7769 }
visit_array_end(json_iterator & iter)7770 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
7771 return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
7772 }
visit_document_end(json_iterator & iter)7773 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
7774 constexpr uint32_t start_tape_index = 0;
7775 tape.append(start_tape_index, internal::tape_type::ROOT);
7776 tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
7777 return SUCCESS;
7778 }
visit_key(json_iterator & iter,const uint8_t * key)7779 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
7780 return visit_string(iter, key, true);
7781 }
7782
increment_count(json_iterator & iter)7783 simdjson_warn_unused simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
7784 iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
7785 return SUCCESS;
7786 }
7787
tape_builder(dom::document & doc)7788 simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
7789
visit_string(json_iterator & iter,const uint8_t * value,bool key)7790 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
7791 iter.log_value(key ? "key" : "string");
7792 uint8_t *dst = on_start_string(iter);
7793 dst = stringparsing::parse_string(value+1, dst);
7794 if (dst == nullptr) {
7795 iter.log_error("Invalid escape in string");
7796 return STRING_ERROR;
7797 }
7798 on_end_string(dst);
7799 return SUCCESS;
7800 }
7801
visit_root_string(json_iterator & iter,const uint8_t * value)7802 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
7803 return visit_string(iter, value);
7804 }
7805
visit_number(json_iterator & iter,const uint8_t * value)7806 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
7807 iter.log_value("number");
7808 return numberparsing::parse_number(value, tape);
7809 }
7810
visit_root_number(json_iterator & iter,const uint8_t * value)7811 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
7812 //
7813 // We need to make a copy to make sure that the string is space terminated.
7814 // This is not about padding the input, which should already padded up
7815 // to len + SIMDJSON_PADDING. However, we have no control at this stage
7816 // on how the padding was done. What if the input string was padded with nulls?
7817 // It is quite common for an input string to have an extra null character (C string).
7818 // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
7819 // document, but the string "9\0" by itself is fine. So we make a copy and
7820 // pad the input with spaces when we know that there is just one input element.
7821 // This copy is relatively expensive, but it will almost never be called in
7822 // practice unless you are in the strange scenario where you have many JSON
7823 // documents made of single atoms.
7824 //
7825 std::unique_ptr<uint8_t[]>copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
7826 if (copy.get() == nullptr) { return MEMALLOC; }
7827 std::memcpy(copy.get(), value, iter.remaining_len());
7828 std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
7829 error_code error = visit_number(iter, copy.get());
7830 return error;
7831 }
7832
visit_true_atom(json_iterator & iter,const uint8_t * value)7833 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
7834 iter.log_value("true");
7835 if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
7836 tape.append(0, internal::tape_type::TRUE_VALUE);
7837 return SUCCESS;
7838 }
7839
visit_root_true_atom(json_iterator & iter,const uint8_t * value)7840 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
7841 iter.log_value("true");
7842 if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
7843 tape.append(0, internal::tape_type::TRUE_VALUE);
7844 return SUCCESS;
7845 }
7846
visit_false_atom(json_iterator & iter,const uint8_t * value)7847 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
7848 iter.log_value("false");
7849 if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
7850 tape.append(0, internal::tape_type::FALSE_VALUE);
7851 return SUCCESS;
7852 }
7853
visit_root_false_atom(json_iterator & iter,const uint8_t * value)7854 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
7855 iter.log_value("false");
7856 if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
7857 tape.append(0, internal::tape_type::FALSE_VALUE);
7858 return SUCCESS;
7859 }
7860
visit_null_atom(json_iterator & iter,const uint8_t * value)7861 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
7862 iter.log_value("null");
7863 if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
7864 tape.append(0, internal::tape_type::NULL_VALUE);
7865 return SUCCESS;
7866 }
7867
visit_root_null_atom(json_iterator & iter,const uint8_t * value)7868 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
7869 iter.log_value("null");
7870 if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
7871 tape.append(0, internal::tape_type::NULL_VALUE);
7872 return SUCCESS;
7873 }
7874
7875 // private:
7876
next_tape_index(json_iterator & iter) const7877 simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
7878 return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
7879 }
7880
empty_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)7881 simdjson_warn_unused simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
7882 auto start_index = next_tape_index(iter);
7883 tape.append(start_index+2, start);
7884 tape.append(start_index, end);
7885 return SUCCESS;
7886 }
7887
start_container(json_iterator & iter)7888 simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
7889 iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
7890 iter.dom_parser.open_containers[iter.depth].count = 0;
7891 tape.skip(); // We don't actually *write* the start element until the end.
7892 }
7893
end_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)7894 simdjson_warn_unused simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
7895 // Write the ending tape element, pointing at the start location
7896 const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
7897 tape.append(start_tape_index, end);
7898 // Write the start tape element, pointing at the end location (and including count)
7899 // count can overflow if it exceeds 24 bits... so we saturate
7900 // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
7901 const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
7902 const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
7903 tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
7904 return SUCCESS;
7905 }
7906
on_start_string(json_iterator & iter)7907 simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
7908 // we advance the point, accounting for the fact that we have a NULL termination
7909 tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
7910 return current_string_buf_loc + sizeof(uint32_t);
7911 }
7912
on_end_string(uint8_t * dst)7913 simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
7914 uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
7915 // TODO check for overflow in case someone has a crazy string (>=4GB?)
7916 // But only add the overflow check when the document itself exceeds 4GB
7917 // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
7918 memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
7919 // NULL termination is still handy if you expect all your strings to
7920 // be NULL terminated? It comes at a small cost
7921 *dst = 0;
7922 current_string_buf_loc = dst + 1;
7923 }
7924
7925 } // namespace stage2
7926 } // unnamed namespace
7927 } // namespace haswell
7928 } // namespace simdjson
7929 /* end file src/generic/stage2/tape_builder.h */
7930
7931 //
7932 // Implementation-specific overrides
7933 //
7934 namespace simdjson {
7935 namespace haswell {
7936 namespace {
7937 namespace stage1 {
7938
find_escaped(uint64_t backslash)7939 simdjson_really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
7940 if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
7941 return find_escaped_branchless(backslash);
7942 }
7943
7944 } // namespace stage1
7945 } // unnamed namespace
7946
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const7947 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
7948 return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
7949 }
7950
stage1(const uint8_t * _buf,size_t _len,bool streaming)7951 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
7952 this->buf = _buf;
7953 this->len = _len;
7954 return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
7955 }
7956
validate_utf8(const char * buf,size_t len) const7957 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
7958 return haswell::stage1::generic_validate_utf8(buf,len);
7959 }
7960
stage2(dom::document & _doc)7961 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
7962 return stage2::tape_builder::parse_document<false>(*this, _doc);
7963 }
7964
stage2_next(dom::document & _doc)7965 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
7966 return stage2::tape_builder::parse_document<true>(*this, _doc);
7967 }
7968
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)7969 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
7970 auto error = stage1(_buf, _len, false);
7971 if (error) { return error; }
7972 return stage2(_doc);
7973 }
7974
7975 } // namespace haswell
7976 } // namespace simdjson
7977
7978 /* begin file include/simdjson/haswell/end.h */
7979 SIMDJSON_UNTARGET_HASWELL
7980 /* end file include/simdjson/haswell/end.h */
7981 /* end file src/haswell/dom_parser_implementation.cpp */
7982 #endif
7983 #if SIMDJSON_IMPLEMENTATION_PPC64
7984 /* begin file src/ppc64/implementation.cpp */
7985 /* begin file include/simdjson/ppc64/begin.h */
7986 // redefining SIMDJSON_IMPLEMENTATION to "ppc64"
7987 // #define SIMDJSON_IMPLEMENTATION ppc64
7988 /* end file include/simdjson/ppc64/begin.h */
7989
7990 namespace simdjson {
7991 namespace ppc64 {
7992
create_dom_parser_implementation(size_t capacity,size_t max_depth,std::unique_ptr<internal::dom_parser_implementation> & dst) const7993 simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
7994 size_t capacity,
7995 size_t max_depth,
7996 std::unique_ptr<internal::dom_parser_implementation>& dst
7997 ) const noexcept {
7998 dst.reset( new (std::nothrow) dom_parser_implementation() );
7999 if (!dst) { return MEMALLOC; }
8000 dst->set_capacity(capacity);
8001 dst->set_max_depth(max_depth);
8002 return SUCCESS;
8003 }
8004
8005 } // namespace ppc64
8006 } // namespace simdjson
8007
8008 /* begin file include/simdjson/ppc64/end.h */
8009 /* end file include/simdjson/ppc64/end.h */
8010 /* end file src/ppc64/implementation.cpp */
8011 /* begin file src/ppc64/dom_parser_implementation.cpp */
8012 /* begin file include/simdjson/ppc64/begin.h */
8013 // redefining SIMDJSON_IMPLEMENTATION to "ppc64"
8014 // #define SIMDJSON_IMPLEMENTATION ppc64
8015 /* end file include/simdjson/ppc64/begin.h */
8016
8017 //
8018 // Stage 1
8019 //
8020 namespace simdjson {
8021 namespace ppc64 {
8022 namespace {
8023
8024 using namespace simd;
8025
8026 struct json_character_block {
8027 static simdjson_really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
8028
whitespacesimdjson::ppc64::__anon9bb6be6f2511::json_character_block8029 simdjson_really_inline uint64_t whitespace() const noexcept { return _whitespace; }
opsimdjson::ppc64::__anon9bb6be6f2511::json_character_block8030 simdjson_really_inline uint64_t op() const noexcept { return _op; }
scalarsimdjson::ppc64::__anon9bb6be6f2511::json_character_block8031 simdjson_really_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
8032
8033 uint64_t _whitespace;
8034 uint64_t _op;
8035 };
8036
classify(const simd::simd8x64<uint8_t> & in)8037 simdjson_really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
8038 const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
8039 const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
8040
8041 simd8x64<uint8_t> v(
8042 (in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
8043 (in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
8044 (in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
8045 (in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2)
8046 );
8047
8048 uint64_t op = simd8x64<bool>(
8049 v.chunks[0].any_bits_set(0x7),
8050 v.chunks[1].any_bits_set(0x7),
8051 v.chunks[2].any_bits_set(0x7),
8052 v.chunks[3].any_bits_set(0x7)
8053 ).to_bitmask();
8054
8055 uint64_t whitespace = simd8x64<bool>(
8056 v.chunks[0].any_bits_set(0x18),
8057 v.chunks[1].any_bits_set(0x18),
8058 v.chunks[2].any_bits_set(0x18),
8059 v.chunks[3].any_bits_set(0x18)
8060 ).to_bitmask();
8061
8062 return { whitespace, op };
8063 }
8064
is_ascii(const simd8x64<uint8_t> & input)8065 simdjson_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
8066 // careful: 0x80 is not ascii.
8067 return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
8068 }
8069
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)8070 simdjson_unused simdjson_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
8071 simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
8072 simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
8073 simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
8074 // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
8075 return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
8076 }
8077
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)8078 simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
8079 simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
8080 simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
8081 // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
8082 return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
8083 }
8084
8085 } // unnamed namespace
8086 } // namespace ppc64
8087 } // namespace simdjson
8088
8089 /* begin file src/generic/stage1/utf8_lookup4_algorithm.h */
8090 namespace simdjson {
8091 namespace ppc64 {
8092 namespace {
8093 namespace utf8_validation {
8094
8095 using namespace simd;
8096
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)8097 simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
8098 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
8099 // Bit 1 = Too Long (ASCII followed by continuation)
8100 // Bit 2 = Overlong 3-byte
8101 // Bit 4 = Surrogate
8102 // Bit 5 = Overlong 2-byte
8103 // Bit 7 = Two Continuations
8104 constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______
8105 // 11______ 11______
8106 constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______
8107 constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____
8108 constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____
8109 constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______
8110 constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______
8111 constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____
8112 // 11110100 101_____
8113 // 11110101 1001____
8114 // 11110101 101_____
8115 // 1111011_ 1001____
8116 // 1111011_ 101_____
8117 // 11111___ 1001____
8118 // 11111___ 101_____
8119 constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
8120 // 11110101 1000____
8121 // 1111011_ 1000____
8122 // 11111___ 1000____
8123 constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____
8124
8125 const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
8126 // 0_______ ________ <ASCII in byte 1>
8127 TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
8128 TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
8129 // 10______ ________ <continuation in byte 1>
8130 TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
8131 // 1100____ ________ <two byte lead in byte 1>
8132 TOO_SHORT | OVERLONG_2,
8133 // 1101____ ________ <two byte lead in byte 1>
8134 TOO_SHORT,
8135 // 1110____ ________ <three byte lead in byte 1>
8136 TOO_SHORT | OVERLONG_3 | SURROGATE,
8137 // 1111____ ________ <four+ byte lead in byte 1>
8138 TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
8139 );
8140 constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
8141 const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
8142 // ____0000 ________
8143 CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
8144 // ____0001 ________
8145 CARRY | OVERLONG_2,
8146 // ____001_ ________
8147 CARRY,
8148 CARRY,
8149
8150 // ____0100 ________
8151 CARRY | TOO_LARGE,
8152 // ____0101 ________
8153 CARRY | TOO_LARGE | TOO_LARGE_1000,
8154 // ____011_ ________
8155 CARRY | TOO_LARGE | TOO_LARGE_1000,
8156 CARRY | TOO_LARGE | TOO_LARGE_1000,
8157
8158 // ____1___ ________
8159 CARRY | TOO_LARGE | TOO_LARGE_1000,
8160 CARRY | TOO_LARGE | TOO_LARGE_1000,
8161 CARRY | TOO_LARGE | TOO_LARGE_1000,
8162 CARRY | TOO_LARGE | TOO_LARGE_1000,
8163 CARRY | TOO_LARGE | TOO_LARGE_1000,
8164 // ____1101 ________
8165 CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
8166 CARRY | TOO_LARGE | TOO_LARGE_1000,
8167 CARRY | TOO_LARGE | TOO_LARGE_1000
8168 );
8169 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
8170 // ________ 0_______ <ASCII in byte 2>
8171 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
8172 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
8173
8174 // ________ 1000____
8175 TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
8176 // ________ 1001____
8177 TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
8178 // ________ 101_____
8179 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
8180 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
8181
8182 // ________ 11______
8183 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
8184 );
8185 return (byte_1_high & byte_1_low & byte_2_high);
8186 }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)8187 simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
8188 const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
8189 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
8190 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
8191 simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
8192 simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
8193 return must23_80 ^ sc;
8194 }
8195
8196 //
8197 // Return nonzero if there are incomplete multibyte characters at the end of the block:
8198 // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
8199 //
is_incomplete(const simd8<uint8_t> input)8200 simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
8201 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
8202 // ... 1111____ 111_____ 11______
8203 static const uint8_t max_array[32] = {
8204 255, 255, 255, 255, 255, 255, 255, 255,
8205 255, 255, 255, 255, 255, 255, 255, 255,
8206 255, 255, 255, 255, 255, 255, 255, 255,
8207 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
8208 };
8209 const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
8210 return input.gt_bits(max_value);
8211 }
8212
8213 struct utf8_checker {
8214 // If this is nonzero, there has been a UTF-8 error.
8215 simd8<uint8_t> error;
8216 // The last input we received
8217 simd8<uint8_t> prev_input_block;
8218 // Whether the last input we received was incomplete (used for ASCII fast path)
8219 simd8<uint8_t> prev_incomplete;
8220
8221 //
8222 // Check whether the current bytes are valid UTF-8.
8223 //
check_utf8_bytessimdjson::ppc64::__anon9bb6be6f2611::utf8_validation::utf8_checker8224 simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
8225 // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
8226 // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
8227 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
8228 simd8<uint8_t> sc = check_special_cases(input, prev1);
8229 this->error |= check_multibyte_lengths(input, prev_input, sc);
8230 }
8231
8232 // The only problem that can happen at EOF is that a multibyte character is too short
8233 // or a byte value too large in the last bytes: check_special_cases only checks for bytes
8234 // too large in the first of two bytes.
check_eofsimdjson::ppc64::__anon9bb6be6f2611::utf8_validation::utf8_checker8235 simdjson_really_inline void check_eof() {
8236 // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
8237 // possibly finish them.
8238 this->error |= this->prev_incomplete;
8239 }
8240
check_next_inputsimdjson::ppc64::__anon9bb6be6f2611::utf8_validation::utf8_checker8241 simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
8242 if(simdjson_likely(is_ascii(input))) {
8243 this->error |= this->prev_incomplete;
8244 } else {
8245 // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
8246 static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
8247 "We support either two or four chunks per 64-byte block.");
8248 if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
8249 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
8250 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
8251 } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
8252 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
8253 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
8254 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
8255 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
8256 }
8257 this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
8258 this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
8259
8260 }
8261 }
8262 // do not forget to call check_eof!
errorssimdjson::ppc64::__anon9bb6be6f2611::utf8_validation::utf8_checker8263 simdjson_really_inline error_code errors() {
8264 return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
8265 }
8266
8267 }; // struct utf8_checker
8268 } // namespace utf8_validation
8269
8270 using utf8_validation::utf8_checker;
8271
8272 } // unnamed namespace
8273 } // namespace ppc64
8274 } // namespace simdjson
8275 /* end file src/generic/stage1/utf8_lookup4_algorithm.h */
8276 /* begin file src/generic/stage1/json_structural_indexer.h */
8277 // This file contains the common code every implementation uses in stage1
8278 // It is intended to be included multiple times and compiled multiple times
8279 // We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
8281
8282 /* begin file src/generic/stage1/buf_block_reader.h */
8283 namespace simdjson {
8284 namespace ppc64 {
8285 namespace {
8286
8287 // Walks through a buffer in block-sized increments, loading the last part with spaces
8288 template<size_t STEP_SIZE>
8289 struct buf_block_reader {
8290 public:
8291 simdjson_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
8292 simdjson_really_inline size_t block_index();
8293 simdjson_really_inline bool has_full_block() const;
8294 simdjson_really_inline const uint8_t *full_block() const;
8295 /**
8296 * Get the last block, padded with spaces.
8297 *
8298 * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
8299 * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
8300 * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
8301 *
8302 * @return the number of effective characters in the last block.
8303 */
8304 simdjson_really_inline size_t get_remainder(uint8_t *dst) const;
8305 simdjson_really_inline void advance();
8306 private:
8307 const uint8_t *buf;
8308 const size_t len;
8309 const size_t lenminusstep;
8310 size_t idx;
8311 };
8312
8313 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)8314 simdjson_unused static char * format_input_text_64(const uint8_t *text) {
8315 static char buf[sizeof(simd8x64<uint8_t>) + 1];
8316 for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
8317 buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
8318 }
8319 buf[sizeof(simd8x64<uint8_t>)] = '\0';
8320 return buf;
8321 }
8322
8323 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)8324 simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
8325 static char buf[sizeof(simd8x64<uint8_t>) + 1];
8326 in.store(reinterpret_cast<uint8_t*>(buf));
8327 for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
8328 if (buf[i] < ' ') { buf[i] = '_'; }
8329 }
8330 buf[sizeof(simd8x64<uint8_t>)] = '\0';
8331 return buf;
8332 }
8333
format_mask(uint64_t mask)8334 simdjson_unused static char * format_mask(uint64_t mask) {
8335 static char buf[sizeof(simd8x64<uint8_t>) + 1];
8336 for (size_t i=0; i<64; i++) {
8337 buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
8338 }
8339 buf[64] = '\0';
8340 return buf;
8341 }
8342
8343 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)8344 simdjson_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
8345
8346 template<size_t STEP_SIZE>
block_index()8347 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
8348
8349 template<size_t STEP_SIZE>
has_full_block() const8350 simdjson_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
8351 return idx < lenminusstep;
8352 }
8353
8354 template<size_t STEP_SIZE>
full_block() const8355 simdjson_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
8356 return &buf[idx];
8357 }
8358
8359 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const8360 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
8361 if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
8362 std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
8363 std::memcpy(dst, buf + idx, len - idx);
8364 return len - idx;
8365 }
8366
8367 template<size_t STEP_SIZE>
advance()8368 simdjson_really_inline void buf_block_reader<STEP_SIZE>::advance() {
8369 idx += STEP_SIZE;
8370 }
8371
8372 } // unnamed namespace
8373 } // namespace ppc64
8374 } // namespace simdjson
8375 /* end file src/generic/stage1/buf_block_reader.h */
8376 /* begin file src/generic/stage1/json_string_scanner.h */
8377 namespace simdjson {
8378 namespace ppc64 {
8379 namespace {
8380 namespace stage1 {
8381
8382 struct json_string_block {
8383 // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_string_blocksimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8384 simdjson_really_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) :
8385 _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {}
8386
8387 // Escaped characters (characters following an escape() character)
escapedsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8388 simdjson_really_inline uint64_t escaped() const { return _escaped; }
8389 // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
escapesimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8390 simdjson_really_inline uint64_t escape() const { return _backslash & ~_escaped; }
8391 // Real (non-backslashed) quotes
quotesimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8392 simdjson_really_inline uint64_t quote() const { return _quote; }
8393 // Start quotes of strings
string_startsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8394 simdjson_really_inline uint64_t string_start() const { return _quote & _in_string; }
8395 // End quotes of strings
string_endsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8396 simdjson_really_inline uint64_t string_end() const { return _quote & ~_in_string; }
8397 // Only characters inside the string (not including the quotes)
string_contentsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8398 simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
8399 // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_inside_stringsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8400 simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
8401 // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_outside_stringsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8402 simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
8403 // Tail of string (everything except the start quote)
string_tailsimdjson::ppc64::__anon9bb6be6f2811::stage1::json_string_block8404 simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
8405
8406 // backslash characters
8407 uint64_t _backslash;
8408 // escaped characters (backslashed--does not include the hex characters after \u)
8409 uint64_t _escaped;
8410 // real quotes (non-backslashed ones)
8411 uint64_t _quote;
8412 // string characters (includes start quote but not end quote)
8413 uint64_t _in_string;
8414 };
8415
8416 // Scans blocks for string characters, storing the state necessary to do so
8417 class json_string_scanner {
8418 public:
8419 simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
8420 // Returns either UNCLOSED_STRING or SUCCESS
8421 simdjson_really_inline error_code finish();
8422
8423 private:
8424 // Intended to be defined by the implementation
8425 simdjson_really_inline uint64_t find_escaped(uint64_t escape);
8426 simdjson_really_inline uint64_t find_escaped_branchless(uint64_t escape);
8427
8428 // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
8429 uint64_t prev_in_string = 0ULL;
8430 // Whether the first character of the next iteration is escaped.
8431 uint64_t prev_escaped = 0ULL;
8432 };
8433
8434 //
8435 // Finds escaped characters (characters following \).
8436 //
8437 // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
8438 //
8439 // Does this by:
8440 // - Shift the escape mask to get potentially escaped characters (characters after backslashes).
8441 // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
8442 // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
8443 //
8444 // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
8445 // escape sequences, filters out the ones that start on even bits, and adds that to the mask of
8446 // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
8447 // the start bit causes a carry), and leaves even-bit sequences alone.
8448 //
8449 // Example:
8450 //
8451 // text | \\\ | \\\"\\\" \\\" \\"\\" |
8452 // escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape
8453 // odd_starts | x | x x x | escape & ~even_bits & ~follows_escape
8454 // even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later
8455 // invert_mask | | cxxx c xx c| even_seq << 1
8456 // follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit
8457 // escaped | x | x x x x x x x x |
8458 // desired | x | x x x x x x x x |
8459 // text | \\\ | \\\"\\\" \\\" \\"\\" |
8460 //
find_escaped_branchless(uint64_t backslash)8461 simdjson_really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) {
8462 // If there was overflow, pretend the first character isn't a backslash
8463 backslash &= ~prev_escaped;
8464 uint64_t follows_escape = backslash << 1 | prev_escaped;
8465
8466 // Get sequences starting on even bits by clearing out the odd series using +
8467 const uint64_t even_bits = 0x5555555555555555ULL;
8468 uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
8469 uint64_t sequences_starting_on_even_bits;
8470 prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
8471 uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
8472
8473 // Mask every other backslashed character as an escaped character
8474 // Flip the mask for sequences that start on even bits, to correct them
8475 return (even_bits ^ invert_mask) & follows_escape;
8476 }
8477
8478 //
8479 // Return a mask of all string characters plus end quotes.
8480 //
8481 // prev_escaped is overflow saying whether the next character is escaped.
8482 // prev_in_string is overflow saying whether we're still in a string.
8483 //
8484 // Backslash sequences outside of quotes will be detected in stage 2.
8485 //
next(const simd::simd8x64<uint8_t> & in)8486 simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
8487 const uint64_t backslash = in.eq('\\');
8488 const uint64_t escaped = find_escaped(backslash);
8489 const uint64_t quote = in.eq('"') & ~escaped;
8490
8491 //
8492 // prefix_xor flips on bits inside the string (and flips off the end quote).
8493 //
8494 // Then we xor with prev_in_string: if we were in a string already, its effect is flipped
8495 // (characters inside strings are outside, and characters outside strings are inside).
8496 //
8497 const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
8498
8499 //
8500 // Check if we're still in a string at the end of the box so the next block will know
8501 //
8502 // right shift of a signed value expected to be well-defined and standard
8503 // compliant as of C++20, John Regher from Utah U. says this is fine code
8504 //
8505 prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
8506
8507 // Use ^ to turn the beginning quote off, and the end quote on.
8508
8509 // We are returning a function-local object so either we get a move constructor
8510 // or we get copy elision.
8511 return json_string_block(
8512 backslash,
8513 escaped,
8514 quote,
8515 in_string
8516 );
8517 }
8518
finish()8519 simdjson_really_inline error_code json_string_scanner::finish() {
8520 if (prev_in_string) {
8521 return UNCLOSED_STRING;
8522 }
8523 return SUCCESS;
8524 }
8525
8526 } // namespace stage1
8527 } // unnamed namespace
8528 } // namespace ppc64
8529 } // namespace simdjson
8530 /* end file src/generic/stage1/json_string_scanner.h */
8531 /* begin file src/generic/stage1/json_scanner.h */
8532 namespace simdjson {
8533 namespace ppc64 {
8534 namespace {
8535 namespace stage1 {
8536
8537 /**
8538 * A block of scanned json, with information on operators and scalars.
8539 *
8540 * We seek to identify pseudo-structural characters. Anything that is inside
8541 * a string must be omitted (hence & ~_string.string_tail()).
8542 * Otherwise, pseudo-structural characters come in two forms.
8543 * 1. We have the structural characters ([,],{,},:, comma). The
8544 * term 'structural character' is from the JSON RFC.
8545 * 2. We have the 'scalar pseudo-structural characters'.
8546 * Scalars are quotes, and any character except structural characters and white space.
8547 *
8548 * To identify the scalar pseudo-structural characters, we must look at what comes
8549 * before them: it must be a space, a quote or a structural character.
8550 * Starting with simdjson v0.3, we identify them by
8551 * negation: we identify everything that is followed by a non-quote scalar,
8552 * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
8553 */
8554 struct json_block {
8555 public:
8556 // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_blocksimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8557 simdjson_really_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
8558 _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
json_blocksimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8559 simdjson_really_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
8560 _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
8561
8562 /**
8563 * The start of structurals.
8564 * In simdjson prior to v0.3, these were called the pseudo-structural characters.
8565 **/
structural_startsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8566 simdjson_really_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
8567 /** All JSON whitespace (i.e. not in a string) */
whitespacesimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8568 simdjson_really_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
8569
8570 // Helpers
8571
8572 /** Whether the given characters are inside a string (only works on non-quotes) */
non_quote_inside_stringsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8573 simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
8574 /** Whether the given characters are outside a string (only works on non-quotes) */
non_quote_outside_stringsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8575 simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
8576
8577 // string and escape characters
8578 json_string_block _string;
8579 // whitespace, structural characters ('operators'), scalars
8580 json_character_block _characters;
8581 // whether the previous character was a scalar
8582 uint64_t _follows_potential_nonquote_scalar;
8583 private:
8584 // Potential structurals (i.e. disregarding strings)
8585
8586 /**
8587 * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
8588 * They may reside inside a string.
8589 **/
potential_structural_startsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8590 simdjson_really_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
8591 /**
8592 * The start of non-operator runs, like 123, true and "abc".
8593 * It main reside inside a string.
8594 **/
potential_scalar_startsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8595 simdjson_really_inline uint64_t potential_scalar_start() const noexcept {
8596 // The term "scalar" refers to anything except structural characters and white space
8597 // (so letters, numbers, quotes).
8598 // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
8599 // then we know that it is irrelevant structurally.
8600 return _characters.scalar() & ~follows_potential_scalar();
8601 }
8602 /**
8603 * Whether the given character is immediately after a non-operator like 123, true.
8604 * The characters following a quote are not included.
8605 */
follows_potential_scalarsimdjson::ppc64::__anon9bb6be6f2911::stage1::json_block8606 simdjson_really_inline uint64_t follows_potential_scalar() const noexcept {
8607 // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
8608 // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
8609 // white space.
8610 // It is understood that within quoted region, anything at all could be marked (irrelevant).
8611 return _follows_potential_nonquote_scalar;
8612 }
8613 };
8614
8615 /**
8616 * Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
8617 *
8618 * The scanner starts by calculating two distinct things:
8619 * - string characters (taking \" into account)
8620 * - structural characters or 'operators' ([]{},:, comma)
8621 * and scalars (runs of non-operators like 123, true and "abc")
8622 *
8623 * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
8624 * in particular, the operator/scalar bit will find plenty of things that are actually part of
8625 * strings. When we're done, json_block will fuse the two together by masking out tokens that are
8626 * part of a string.
8627 */
8628 class json_scanner {
8629 public:
json_scanner()8630 json_scanner() {}
8631 simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
8632 // Returns either UNCLOSED_STRING or SUCCESS
8633 simdjson_really_inline error_code finish();
8634
8635 private:
8636 // Whether the last character of the previous iteration is part of a scalar token
8637 // (anything except whitespace or a structural character/'operator').
8638 uint64_t prev_scalar = 0ULL;
8639 json_string_scanner string_scanner{};
8640 };
8641
8642
8643 //
8644 // Check if the current character immediately follows a matching character.
8645 //
8646 // For example, this checks for quotes with backslashes in front of them:
8647 //
8648 // const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
8649 //
follows(const uint64_t match,uint64_t & overflow)8650 simdjson_really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
8651 const uint64_t result = match << 1 | overflow;
8652 overflow = match >> 63;
8653 return result;
8654 }
8655
next(const simd::simd8x64<uint8_t> & in)8656 simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
8657 json_string_block strings = string_scanner.next(in);
8658 // identifies the white-space and the structurat characters
8659 json_character_block characters = json_character_block::classify(in);
8660 // The term "scalar" refers to anything except structural characters and white space
8661 // (so letters, numbers, quotes).
8662 // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
8663 //
8664 // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
8665 // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
8666 // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we
8667 // may need to add an extra check when parsing strings.
8668 //
8669 // Performance: there are many ways to skin this cat.
8670 const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote();
8671 uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar);
8672 // We are returning a function-local object so either we get a move constructor
8673 // or we get copy elision.
8674 return json_block(
8675 strings,// strings is a function-local object so either it moves or the copy is elided.
8676 characters,
8677 follows_nonquote_scalar
8678 );
8679 }
8680
finish()8681 simdjson_really_inline error_code json_scanner::finish() {
8682 return string_scanner.finish();
8683 }
8684
8685 } // namespace stage1
8686 } // unnamed namespace
8687 } // namespace ppc64
8688 } // namespace simdjson
8689 /* end file src/generic/stage1/json_scanner.h */
8690 /* begin file src/generic/stage1/json_minifier.h */
8691 // This file contains the common code every implementation uses in stage1
8692 // It is intended to be included multiple times and compiled multiple times
8693 // We assume the file in which it is included already includes
8694 // "simdjson/stage1.h" (this simplifies amalgamation)
8695
8696 namespace simdjson {
8697 namespace ppc64 {
8698 namespace {
8699 namespace stage1 {
8700
8701 class json_minifier {
8702 public:
8703 template<size_t STEP_SIZE>
8704 static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
8705
8706 private:
json_minifier(uint8_t * _dst)8707 simdjson_really_inline json_minifier(uint8_t *_dst)
8708 : dst{_dst}
8709 {}
8710 template<size_t STEP_SIZE>
8711 simdjson_really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
8712 simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
8713 simdjson_really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
8714 json_scanner scanner{};
8715 uint8_t *dst;
8716 };
8717
next(const simd::simd8x64<uint8_t> & in,const json_block & block)8718 simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
8719 uint64_t mask = block.whitespace();
8720 in.compress(mask, dst);
8721 dst += 64 - count_ones(mask);
8722 }
8723
finish(uint8_t * dst_start,size_t & dst_len)8724 simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
8725 error_code error = scanner.finish();
8726 if (error) { dst_len = 0; return error; }
8727 dst_len = dst - dst_start;
8728 return SUCCESS;
8729 }
8730
8731 template<>
step(const uint8_t * block_buf,buf_block_reader<128> & reader)8732 simdjson_really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
8733 simd::simd8x64<uint8_t> in_1(block_buf);
8734 simd::simd8x64<uint8_t> in_2(block_buf+64);
8735 json_block block_1 = scanner.next(in_1);
8736 json_block block_2 = scanner.next(in_2);
8737 this->next(in_1, block_1);
8738 this->next(in_2, block_2);
8739 reader.advance();
8740 }
8741
8742 template<>
step(const uint8_t * block_buf,buf_block_reader<64> & reader)8743 simdjson_really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
8744 simd::simd8x64<uint8_t> in_1(block_buf);
8745 json_block block_1 = scanner.next(in_1);
8746 this->next(block_buf, block_1);
8747 reader.advance();
8748 }
8749
8750 template<size_t STEP_SIZE>
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len)8751 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
8752 buf_block_reader<STEP_SIZE> reader(buf, len);
8753 json_minifier minifier(dst);
8754
8755 // Index the first n-1 blocks
8756 while (reader.has_full_block()) {
8757 minifier.step<STEP_SIZE>(reader.full_block(), reader);
8758 }
8759
8760 // Index the last (remainder) block, padded with spaces
8761 uint8_t block[STEP_SIZE];
8762 size_t remaining_bytes = reader.get_remainder(block);
8763 if (remaining_bytes > 0) {
8764 // We do not want to write directly to the output stream. Rather, we write
8765 // to a local buffer (for safety).
8766 uint8_t out_block[STEP_SIZE];
8767 uint8_t * const guarded_dst{minifier.dst};
8768 minifier.dst = out_block;
8769 minifier.step<STEP_SIZE>(block, reader);
8770 size_t to_write = minifier.dst - out_block;
8771 // In some cases, we could be enticed to consider the padded spaces
8772 // as part of the string. This is fine as long as we do not write more
8773 // than we consumed.
8774 if(to_write > remaining_bytes) { to_write = remaining_bytes; }
8775 memcpy(guarded_dst, out_block, to_write);
8776 minifier.dst = guarded_dst + to_write;
8777 }
8778 return minifier.finish(dst, dst_len);
8779 }
8780
8781 } // namespace stage1
8782 } // unnamed namespace
8783 } // namespace ppc64
8784 } // namespace simdjson
8785 /* end file src/generic/stage1/json_minifier.h */
8786 /* begin file src/generic/stage1/find_next_document_index.h */
8787 namespace simdjson {
8788 namespace ppc64 {
8789 namespace {
8790
8791 /**
8792 * This algorithm is used to quickly identify the last structural position that
8793 * makes up a complete document.
8794 *
8795 * It does this by going backwards and finding the last *document boundary* (a
8796 * place where one value follows another without a comma between them). If the
8797 * last document (the characters after the boundary) has an equal number of
8798 * start and end brackets, it is considered complete.
8799 *
8800 * Simply put, we iterate over the structural characters, starting from
8801 * the end. We consider that we found the end of a JSON document when the
8802 * first element of the pair is NOT one of these characters: '{' '[' ';' ','
8803 * and when the second element is NOT one of these characters: '}' '}' ';' ','.
8804 *
8805 * This simple comparison works most of the time, but it does not cover cases
8806 * where the batch's structural indexes contain a perfect amount of documents.
8807 * In such a case, we do not have access to the structural index which follows
8808 * the last document, therefore, we do not have access to the second element in
8809 * the pair, and that means we cannot identify the last document. To fix this
8810 * issue, we keep a count of the open and closed curly/square braces we found
8811 * while searching for the pair. When we find a pair AND the count of open and
8812 * closed curly/square braces is the same, we know that we just passed a
8813 * complete document, therefore the last json buffer location is the end of the
8814 * batch.
8815 */
find_next_document_index(dom_parser_implementation & parser)8816 simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
8817 // TODO don't count separately, just figure out depth
8818 auto arr_cnt = 0;
8819 auto obj_cnt = 0;
8820 for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
8821 auto idxb = parser.structural_indexes[i];
8822 switch (parser.buf[idxb]) {
8823 case ':':
8824 case ',':
8825 continue;
8826 case '}':
8827 obj_cnt--;
8828 continue;
8829 case ']':
8830 arr_cnt--;
8831 continue;
8832 case '{':
8833 obj_cnt++;
8834 break;
8835 case '[':
8836 arr_cnt++;
8837 break;
8838 }
8839 auto idxa = parser.structural_indexes[i - 1];
8840 switch (parser.buf[idxa]) {
8841 case '{':
8842 case '[':
8843 case ':':
8844 case ',':
8845 continue;
8846 }
8847 // Last document is complete, so the next document will appear after!
8848 if (!arr_cnt && !obj_cnt) {
8849 return parser.n_structural_indexes;
8850 }
8851 // Last document is incomplete; mark the document at i + 1 as the next one
8852 return i;
8853 }
8854 return 0;
8855 }
8856
8857 } // unnamed namespace
8858 } // namespace ppc64
8859 } // namespace simdjson
8860 /* end file src/generic/stage1/find_next_document_index.h */
8861
8862 namespace simdjson {
8863 namespace ppc64 {
8864 namespace {
8865 namespace stage1 {
8866
8867 class bit_indexer {
8868 public:
8869 uint32_t *tail;
8870
bit_indexer(uint32_t * index_buf)8871 simdjson_really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
8872
8873 // flatten out values in 'bits' assuming that they are are to have values of idx
8874 // plus their position in the bitvector, and store these indexes at
8875 // base_ptr[base] incrementing base as we go
8876 // will potentially store extra values beyond end of valid bits, so base_ptr
8877 // needs to be large enough to handle this
write(uint32_t idx,uint64_t bits)8878 simdjson_really_inline void write(uint32_t idx, uint64_t bits) {
8879 // In some instances, the next branch is expensive because it is mispredicted.
8880 // Unfortunately, in other cases,
8881 // it helps tremendously.
8882 if (bits == 0)
8883 return;
8884 int cnt = static_cast<int>(count_ones(bits));
8885
8886 // Do the first 8 all together
8887 for (int i=0; i<8; i++) {
8888 this->tail[i] = idx + trailing_zeroes(bits);
8889 bits = clear_lowest_bit(bits);
8890 }
8891
8892 // Do the next 8 all together (we hope in most cases it won't happen at all
8893 // and the branch is easily predicted).
8894 if (simdjson_unlikely(cnt > 8)) {
8895 for (int i=8; i<16; i++) {
8896 this->tail[i] = idx + trailing_zeroes(bits);
8897 bits = clear_lowest_bit(bits);
8898 }
8899
8900 // Most files don't have 16+ structurals per block, so we take several basically guaranteed
8901 // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
8902 // or the start of a value ("abc" true 123) every four characters.
8903 if (simdjson_unlikely(cnt > 16)) {
8904 int i = 16;
8905 do {
8906 this->tail[i] = idx + trailing_zeroes(bits);
8907 bits = clear_lowest_bit(bits);
8908 i++;
8909 } while (i < cnt);
8910 }
8911 }
8912
8913 this->tail += cnt;
8914 }
8915 };
8916
8917 class json_structural_indexer {
8918 public:
8919 /**
8920 * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
8921 *
8922 * @param partial Setting the partial parameter to true allows the find_structural_bits to
8923 * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
8924 * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
8925 */
8926 template<size_t STEP_SIZE>
8927 static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
8928
8929 private:
8930 simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes);
8931 template<size_t STEP_SIZE>
8932 simdjson_really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
8933 simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
8934 simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
8935
8936 json_scanner scanner{};
8937 utf8_checker checker{};
8938 bit_indexer indexer;
8939 uint64_t prev_structurals = 0;
8940 uint64_t unescaped_chars_error = 0;
8941 };
8942
json_structural_indexer(uint32_t * structural_indexes)8943 simdjson_really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
8944
8945 // Skip the last character if it is partial
trim_partial_utf8(const uint8_t * buf,size_t len)8946 simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
8947 if (simdjson_unlikely(len < 3)) {
8948 switch (len) {
8949 case 2:
8950 if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
8951 if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
8952 return len;
8953 case 1:
8954 if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
8955 return len;
8956 case 0:
8957 return len;
8958 }
8959 }
8960 if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
8961 if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
8962 if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
8963 return len;
8964 }
8965
8966 //
8967 // PERF NOTES:
8968 // We pipe 2 inputs through these stages:
8969 // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
8970 // 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
8971 // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
8972 // The output of step 1 depends entirely on this information. These functions don't quite use
8973 // up enough CPU: the second half of the functions is highly serial, only using 1 execution core
8974 // at a time. The second input's scans has some dependency on the first ones finishing it, but
8975 // they can make a lot of progress before they need that information.
8976 // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
8977 // to finish: utf-8 checks and generating the output from the last iteration.
8978 //
8979 // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
8980 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
8981 // workout.
8982 //
8983 template<size_t STEP_SIZE>
index(const uint8_t * buf,size_t len,dom_parser_implementation & parser,bool partial)8984 error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
8985 if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
8986 if (partial) { len = trim_partial_utf8(buf, len); }
8987
8988 buf_block_reader<STEP_SIZE> reader(buf, len);
8989 json_structural_indexer indexer(parser.structural_indexes.get());
8990
8991 // Read all but the last block
8992 while (reader.has_full_block()) {
8993 indexer.step<STEP_SIZE>(reader.full_block(), reader);
8994 }
8995
8996 // Take care of the last block (will always be there unless file is empty)
8997 uint8_t block[STEP_SIZE];
8998 if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
8999 indexer.step<STEP_SIZE>(block, reader);
9000
9001 return indexer.finish(parser, reader.block_index(), len, partial);
9002 }
9003
9004 template<>
step(const uint8_t * block,buf_block_reader<128> & reader)9005 simdjson_really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
9006 simd::simd8x64<uint8_t> in_1(block);
9007 simd::simd8x64<uint8_t> in_2(block+64);
9008 json_block block_1 = scanner.next(in_1);
9009 json_block block_2 = scanner.next(in_2);
9010 this->next(in_1, block_1, reader.block_index());
9011 this->next(in_2, block_2, reader.block_index()+64);
9012 reader.advance();
9013 }
9014
9015 template<>
step(const uint8_t * block,buf_block_reader<64> & reader)9016 simdjson_really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
9017 simd::simd8x64<uint8_t> in_1(block);
9018 json_block block_1 = scanner.next(in_1);
9019 this->next(in_1, block_1, reader.block_index());
9020 reader.advance();
9021 }
9022
next(const simd::simd8x64<uint8_t> & in,const json_block & block,size_t idx)9023 simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
9024 uint64_t unescaped = in.lteq(0x1F);
9025 checker.check_next_input(in);
9026 indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
9027 prev_structurals = block.structural_start();
9028 unescaped_chars_error |= block.non_quote_inside_string(unescaped);
9029 }
9030
finish(dom_parser_implementation & parser,size_t idx,size_t len,bool partial)9031 simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
9032 // Write out the final iteration's structurals
9033 indexer.write(uint32_t(idx-64), prev_structurals);
9034
9035 error_code error = scanner.finish();
9036 // We deliberately break down the next expression so that it is
9037 // human readable.
9038 const bool should_we_exit = partial ?
9039 ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
9040 : (error != SUCCESS); // if partial is false, we must have SUCCESS
9041 const bool have_unclosed_string = (error == UNCLOSED_STRING);
9042 if (simdjson_unlikely(should_we_exit)) { return error; }
9043
9044 if (unescaped_chars_error) {
9045 return UNESCAPED_CHARS;
9046 }
9047
9048 parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
9049 /***
9050 * This is related to https://github.com/simdjson/simdjson/issues/906
9051 * Basically, we want to make sure that if the parsing continues beyond the last (valid)
9052 * structural character, it quickly stops.
9053 * Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
9054 * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
9055 * continues, then it must be [,] or }.
9056 * Suppose it is ] or }. We backtrack to the first character, what could it be that would
9057 * not trigger an error? It could be ] or } but no, because you can't start a document that way.
9058 * It can't be a comma, a colon or any simple value. So the only way we could continue is
9059 * if the repeated character is [. But if so, the document must start with [. But if the document
9060 * starts with [, it should end with ]. If we enforce that rule, then we would get
9061 * ][[ which is invalid.
9062 **/
9063 parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
9064 parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
9065 parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
9066 parser.next_structural_index = 0;
9067 // a valid JSON file cannot have zero structural indexes - we should have found something
9068 if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
9069 return EMPTY;
9070 }
9071 if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
9072 return UNEXPECTED_ERROR;
9073 }
9074 if (partial) {
9075 // If we have an unclosed string, then the last structural
9076 // will be the quote and we want to make sure to omit it.
9077 if(have_unclosed_string) {
9078 parser.n_structural_indexes--;
9079 // a valid JSON file cannot have zero structural indexes - we should have found something
9080 if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
9081 }
9082 auto new_structural_indexes = find_next_document_index(parser);
9083 if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
9084 return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
9085 }
9086 parser.n_structural_indexes = new_structural_indexes;
9087 }
9088 checker.check_eof();
9089 return checker.errors();
9090 }
9091
9092 } // namespace stage1
9093 } // unnamed namespace
9094 } // namespace ppc64
9095 } // namespace simdjson
9096 /* end file src/generic/stage1/json_structural_indexer.h */
9097 /* begin file src/generic/stage1/utf8_validator.h */
9098 namespace simdjson {
9099 namespace ppc64 {
9100 namespace {
9101 namespace stage1 {
9102
9103 /**
9104 * Validates that the string is actual UTF-8.
9105 */
9106 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)9107 bool generic_validate_utf8(const uint8_t * input, size_t length) {
9108 checker c{};
9109 buf_block_reader<64> reader(input, length);
9110 while (reader.has_full_block()) {
9111 simd::simd8x64<uint8_t> in(reader.full_block());
9112 c.check_next_input(in);
9113 reader.advance();
9114 }
9115 uint8_t block[64]{};
9116 reader.get_remainder(block);
9117 simd::simd8x64<uint8_t> in(block);
9118 c.check_next_input(in);
9119 reader.advance();
9120 c.check_eof();
9121 return c.errors() == error_code::SUCCESS;
9122 }
9123
generic_validate_utf8(const char * input,size_t length)9124 bool generic_validate_utf8(const char * input, size_t length) {
9125 return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
9126 }
9127
9128 } // namespace stage1
9129 } // unnamed namespace
9130 } // namespace ppc64
9131 } // namespace simdjson
9132 /* end file src/generic/stage1/utf8_validator.h */
9133
9134 //
9135 // Stage 2
9136 //
9137
9138 /* begin file src/generic/stage2/tape_builder.h */
9139 /* begin file src/generic/stage2/json_iterator.h */
9140 /* begin file src/generic/stage2/logger.h */
// This is for an internal-only stage 2 specific logger.
// Build with SIMDJSON_VERBOSE_LOGGING set to a non-zero value (this makes LOG_ENABLED true) to log what stage 2 is doing!
9143 namespace simdjson {
9144 namespace ppc64 {
9145 namespace {
9146 namespace logger {
9147
9148 static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
9149
// Master switch for the stage 2 logger, driven by SIMDJSON_VERBOSE_LOGGING.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr bool LOG_ENABLED = true;
#else
static constexpr bool LOG_ENABLED = false;
#endif

// Column widths (in characters) of the log table.
static constexpr int LOG_EVENT_LEN = 20;        // event title, including indentation
static constexpr int LOG_BUFFER_LEN = 30;       // bytes shown from the current token
static constexpr int LOG_SMALL_BUFFER_LEN = 10; // bytes shown from the next token
static constexpr int LOG_INDEX_LEN = 5;         // structural index

// Current nesting depth used to indent event titles. Not threadsafe. Log only.
static int log_depth = 0;
9161
// Helper to turn unprintable or newline characters into spaces.
//
// Only printable ASCII (0x20 through 0x7E) is passed through unchanged. The
// test is made on the unsigned value of the byte so the result does not depend
// on whether plain `char` is signed on the target platform: control
// characters, DEL (0x7F) and every byte >= 0x80 all come back as ' '.
static simdjson_really_inline char printable_char(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  const bool printable = (byte >= 0x20) && (byte < 0x7F);
  return printable ? c : ' ';
}
9170
9171 // Print the header and set up log_start
log_start()9172 static simdjson_really_inline void log_start() {
9173 if (LOG_ENABLED) {
9174 log_depth = 0;
9175 printf("\n");
9176 printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
9177 printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
9178 }
9179 }
9180
log_string(const char * message)9181 simdjson_unused static simdjson_really_inline void log_string(const char *message) {
9182 if (LOG_ENABLED) {
9183 printf("%s\n", message);
9184 }
9185 }
9186
9187 // Logs a single line from the stage 2 DOM parser
9188 template<typename S>
log_line(S & structurals,const char * title_prefix,const char * title,const char * detail)9189 static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
9190 if (LOG_ENABLED) {
9191 printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
9192 auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
9193 auto next_index = structurals.next_structural;
9194 auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
9195 auto next = &structurals.buf[*next_index];
9196 {
9197 // Print the next N characters in the buffer.
9198 printf("| ");
9199 // Otherwise, print the characters starting from the buffer position.
9200 // Print spaces for unprintable or newline characters.
9201 for (int i=0;i<LOG_BUFFER_LEN;i++) {
9202 printf("%c", printable_char(current[i]));
9203 }
9204 printf(" ");
9205 // Print the next N characters in the buffer.
9206 printf("| ");
9207 // Otherwise, print the characters starting from the buffer position.
9208 // Print spaces for unprintable or newline characters.
9209 for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
9210 printf("%c", printable_char(next[i]));
9211 }
9212 printf(" ");
9213 }
9214 if (current_index) {
9215 printf("| %*u ", LOG_INDEX_LEN, *current_index);
9216 } else {
9217 printf("| %-*s ", LOG_INDEX_LEN, "");
9218 }
9219 // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
9220 printf("| %-s ", detail);
9221 printf("|\n");
9222 }
9223 }
9224
9225 } // namespace logger
9226 } // unnamed namespace
9227 } // namespace ppc64
9228 } // namespace simdjson
9229 /* end file src/generic/stage2/logger.h */
9230
9231 namespace simdjson {
9232 namespace ppc64 {
9233 namespace {
9234 namespace stage2 {
9235
9236 class json_iterator {
9237 public:
9238 const uint8_t* const buf;
9239 uint32_t *next_structural;
9240 dom_parser_implementation &dom_parser;
9241 uint32_t depth{0};
9242
9243 /**
9244 * Walk the JSON document.
9245 *
9246 * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
9247 * the first parameter; some callbacks have other parameters as well:
9248 *
9249 * - visit_document_start() - at the beginning.
9250 * - visit_document_end() - at the end (if things were successful).
9251 *
9252 * - visit_array_start() - at the start `[` of a non-empty array.
9253 * - visit_array_end() - at the end `]` of a non-empty array.
9254 * - visit_empty_array() - when an empty array is encountered.
9255 *
9256 * - visit_object_end() - at the start `]` of a non-empty object.
9257 * - visit_object_start() - at the end `]` of a non-empty object.
9258 * - visit_empty_object() - when an empty object is encountered.
9259 * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
9260 * guaranteed to point at the first quote of the string (`"key"`).
9261 * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
9262 * - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
9263 *
9264 * - increment_count(iter) - each time a value is found in an array or object.
9265 */
9266 template<bool STREAMING, typename V>
9267 simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;
9268
9269 /**
9270 * Create an iterator capable of walking a JSON document.
9271 *
9272 * The document must have already passed through stage 1.
9273 */
9274 simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
9275
9276 /**
9277 * Look at the next token.
9278 *
9279 * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
9280 *
9281 * They may include invalid JSON as well (such as `1.2.3` or `ture`).
9282 */
9283 simdjson_really_inline const uint8_t *peek() const noexcept;
9284 /**
9285 * Advance to the next token.
9286 *
9287 * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
9288 *
9289 * They may include invalid JSON as well (such as `1.2.3` or `ture`).
9290 */
9291 simdjson_really_inline const uint8_t *advance() noexcept;
9292 /**
9293 * Get the remaining length of the document, from the start of the current token.
9294 */
9295 simdjson_really_inline size_t remaining_len() const noexcept;
9296 /**
9297 * Check if we are at the end of the document.
9298 *
9299 * If this is true, there are no more tokens.
9300 */
9301 simdjson_really_inline bool at_eof() const noexcept;
9302 /**
9303 * Check if we are at the beginning of the document.
9304 */
9305 simdjson_really_inline bool at_beginning() const noexcept;
9306 simdjson_really_inline uint8_t last_structural() const noexcept;
9307
9308 /**
9309 * Log that a value has been found.
9310 *
9311 * Set ENABLE_LOGGING=true in logger.h to see logging.
9312 */
9313 simdjson_really_inline void log_value(const char *type) const noexcept;
9314 /**
9315 * Log the start of a multipart value.
9316 *
9317 * Set ENABLE_LOGGING=true in logger.h to see logging.
9318 */
9319 simdjson_really_inline void log_start_value(const char *type) const noexcept;
9320 /**
9321 * Log the end of a multipart value.
9322 *
9323 * Set ENABLE_LOGGING=true in logger.h to see logging.
9324 */
9325 simdjson_really_inline void log_end_value(const char *type) const noexcept;
9326 /**
9327 * Log an error.
9328 *
9329 * Set ENABLE_LOGGING=true in logger.h to see logging.
9330 */
9331 simdjson_really_inline void log_error(const char *error) const noexcept;
9332
9333 template<typename V>
9334 simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
9335 template<typename V>
9336 simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
9337 };
9338
9339 template<bool STREAMING, typename V>
walk_document(V & visitor)9340 simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
9341 logger::log_start();
9342
9343 //
9344 // Start the document
9345 //
9346 if (at_eof()) { return EMPTY; }
9347 log_start_value("document");
9348 SIMDJSON_TRY( visitor.visit_document_start(*this) );
9349
9350 //
9351 // Read first value
9352 //
9353 {
9354 auto value = advance();
9355
9356 // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
9357 // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
9358 if (!STREAMING) {
9359 switch (*value) {
9360 case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
9361 case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
9362 }
9363 }
9364
9365 switch (*value) {
9366 case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
9367 case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
9368 default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
9369 }
9370 }
9371 goto document_end;
9372
9373 //
9374 // Object parser states
9375 //
9376 object_begin:
9377 log_start_value("object");
9378 depth++;
9379 if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
9380 dom_parser.is_array[depth] = false;
9381 SIMDJSON_TRY( visitor.visit_object_start(*this) );
9382
9383 {
9384 auto key = advance();
9385 if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
9386 SIMDJSON_TRY( visitor.increment_count(*this) );
9387 SIMDJSON_TRY( visitor.visit_key(*this, key) );
9388 }
9389
9390 object_field:
9391 if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
9392 {
9393 auto value = advance();
9394 switch (*value) {
9395 case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
9396 case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
9397 default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
9398 }
9399 }
9400
9401 object_continue:
9402 switch (*advance()) {
9403 case ',':
9404 SIMDJSON_TRY( visitor.increment_count(*this) );
9405 {
9406 auto key = advance();
9407 if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
9408 SIMDJSON_TRY( visitor.visit_key(*this, key) );
9409 }
9410 goto object_field;
9411 case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
9412 default: log_error("No comma between object fields"); return TAPE_ERROR;
9413 }
9414
9415 scope_end:
9416 depth--;
9417 if (depth == 0) { goto document_end; }
9418 if (dom_parser.is_array[depth]) { goto array_continue; }
9419 goto object_continue;
9420
9421 //
9422 // Array parser states
9423 //
9424 array_begin:
9425 log_start_value("array");
9426 depth++;
9427 if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
9428 dom_parser.is_array[depth] = true;
9429 SIMDJSON_TRY( visitor.visit_array_start(*this) );
9430 SIMDJSON_TRY( visitor.increment_count(*this) );
9431
9432 array_value:
9433 {
9434 auto value = advance();
9435 switch (*value) {
9436 case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
9437 case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
9438 default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
9439 }
9440 }
9441
9442 array_continue:
9443 switch (*advance()) {
9444 case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
9445 case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
9446 default: log_error("Missing comma between array values"); return TAPE_ERROR;
9447 }
9448
9449 document_end:
9450 log_end_value("document");
9451 SIMDJSON_TRY( visitor.visit_document_end(*this) );
9452
9453 dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
9454
9455 // If we didn't make it to the end, it's an error
9456 if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
9457 log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
9458 return TAPE_ERROR;
9459 }
9460
9461 return SUCCESS;
9462
9463 } // walk_document()
9464
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)9465 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
9466 : buf{_dom_parser.buf},
9467 next_structural{&_dom_parser.structural_indexes[start_structural_index]},
9468 dom_parser{_dom_parser} {
9469 }
9470
peek() const9471 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
9472 return &buf[*(next_structural)];
9473 }
advance()9474 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
9475 return &buf[*(next_structural++)];
9476 }
remaining_len() const9477 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
9478 return dom_parser.len - *(next_structural-1);
9479 }
9480
at_eof() const9481 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
9482 return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
9483 }
at_beginning() const9484 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
9485 return next_structural == dom_parser.structural_indexes.get();
9486 }
last_structural() const9487 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
9488 return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
9489 }
9490
log_value(const char * type) const9491 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
9492 logger::log_line(*this, "", type, "");
9493 }
9494
log_start_value(const char * type) const9495 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
9496 logger::log_line(*this, "+", type, "");
9497 if (logger::LOG_ENABLED) { logger::log_depth++; }
9498 }
9499
log_end_value(const char * type) const9500 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
9501 if (logger::LOG_ENABLED) { logger::log_depth--; }
9502 logger::log_line(*this, "-", type, "");
9503 }
9504
log_error(const char * error) const9505 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
9506 logger::log_line(*this, "", "ERROR", error);
9507 }
9508
9509 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)9510 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
9511 switch (*value) {
9512 case '"': return visitor.visit_root_string(*this, value);
9513 case 't': return visitor.visit_root_true_atom(*this, value);
9514 case 'f': return visitor.visit_root_false_atom(*this, value);
9515 case 'n': return visitor.visit_root_null_atom(*this, value);
9516 case '-':
9517 case '0': case '1': case '2': case '3': case '4':
9518 case '5': case '6': case '7': case '8': case '9':
9519 return visitor.visit_root_number(*this, value);
9520 default:
9521 log_error("Document starts with a non-value character");
9522 return TAPE_ERROR;
9523 }
9524 }
9525 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)9526 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
9527 switch (*value) {
9528 case '"': return visitor.visit_string(*this, value);
9529 case 't': return visitor.visit_true_atom(*this, value);
9530 case 'f': return visitor.visit_false_atom(*this, value);
9531 case 'n': return visitor.visit_null_atom(*this, value);
9532 case '-':
9533 case '0': case '1': case '2': case '3': case '4':
9534 case '5': case '6': case '7': case '8': case '9':
9535 return visitor.visit_number(*this, value);
9536 default:
9537 log_error("Non-value found when value was expected!");
9538 return TAPE_ERROR;
9539 }
9540 }
9541
9542 } // namespace stage2
9543 } // unnamed namespace
9544 } // namespace ppc64
9545 } // namespace simdjson
9546 /* end file src/generic/stage2/json_iterator.h */
9547 /* begin file src/generic/stage2/tape_writer.h */
9548 namespace simdjson {
9549 namespace ppc64 {
9550 namespace {
9551 namespace stage2 {
9552
9553 struct tape_writer {
9554 /** The next place to write to tape */
9555 uint64_t *next_tape_loc;
9556
9557 /** Write a signed 64-bit value to tape. */
9558 simdjson_really_inline void append_s64(int64_t value) noexcept;
9559
9560 /** Write an unsigned 64-bit value to tape. */
9561 simdjson_really_inline void append_u64(uint64_t value) noexcept;
9562
9563 /** Write a double value to tape. */
9564 simdjson_really_inline void append_double(double value) noexcept;
9565
9566 /**
9567 * Append a tape entry (an 8-bit type,and 56 bits worth of value).
9568 */
9569 simdjson_really_inline void append(uint64_t val, internal::tape_type t) noexcept;
9570
9571 /**
9572 * Skip the current tape entry without writing.
9573 *
9574 * Used to skip the start of the container, since we'll come back later to fill it in when the
9575 * container ends.
9576 */
9577 simdjson_really_inline void skip() noexcept;
9578
9579 /**
9580 * Skip the number of tape entries necessary to write a large u64 or i64.
9581 */
9582 simdjson_really_inline void skip_large_integer() noexcept;
9583
9584 /**
9585 * Skip the number of tape entries necessary to write a double.
9586 */
9587 simdjson_really_inline void skip_double() noexcept;
9588
9589 /**
9590 * Write a value to a known location on tape.
9591 *
9592 * Used to go back and write out the start of a container after the container ends.
9593 */
9594 simdjson_really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
9595
9596 private:
9597 /**
9598 * Append both the tape entry, and a supplementary value following it. Used for types that need
9599 * all 64 bits, such as double and uint64_t.
9600 */
9601 template<typename T>
9602 simdjson_really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
9603 }; // struct number_writer
9604
append_s64(int64_t value)9605 simdjson_really_inline void tape_writer::append_s64(int64_t value) noexcept {
9606 append2(0, value, internal::tape_type::INT64);
9607 }
9608
append_u64(uint64_t value)9609 simdjson_really_inline void tape_writer::append_u64(uint64_t value) noexcept {
9610 append(0, internal::tape_type::UINT64);
9611 *next_tape_loc = value;
9612 next_tape_loc++;
9613 }
9614
9615 /** Write a double value to tape. */
append_double(double value)9616 simdjson_really_inline void tape_writer::append_double(double value) noexcept {
9617 append2(0, value, internal::tape_type::DOUBLE);
9618 }
9619
skip()9620 simdjson_really_inline void tape_writer::skip() noexcept {
9621 next_tape_loc++;
9622 }
9623
skip_large_integer()9624 simdjson_really_inline void tape_writer::skip_large_integer() noexcept {
9625 next_tape_loc += 2;
9626 }
9627
skip_double()9628 simdjson_really_inline void tape_writer::skip_double() noexcept {
9629 next_tape_loc += 2;
9630 }
9631
append(uint64_t val,internal::tape_type t)9632 simdjson_really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
9633 *next_tape_loc = val | ((uint64_t(char(t))) << 56);
9634 next_tape_loc++;
9635 }
9636
9637 template<typename T>
append2(uint64_t val,T val2,internal::tape_type t)9638 simdjson_really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
9639 append(val, t);
9640 static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
9641 memcpy(next_tape_loc, &val2, sizeof(val2));
9642 next_tape_loc++;
9643 }
9644
write(uint64_t & tape_loc,uint64_t val,internal::tape_type t)9645 simdjson_really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
9646 tape_loc = val | ((uint64_t(char(t))) << 56);
9647 }
9648
9649 } // namespace stage2
9650 } // unnamed namespace
9651 } // namespace ppc64
9652 } // namespace simdjson
9653 /* end file src/generic/stage2/tape_writer.h */
9654
9655 namespace simdjson {
9656 namespace ppc64 {
9657 namespace {
9658 namespace stage2 {
9659
9660 struct tape_builder {
9661 template<bool STREAMING>
9662 simdjson_warn_unused static simdjson_really_inline error_code parse_document(
9663 dom_parser_implementation &dom_parser,
9664 dom::document &doc) noexcept;
9665
9666 /** Called when a non-empty document starts. */
9667 simdjson_warn_unused simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
9668 /** Called when a non-empty document ends without error. */
9669 simdjson_warn_unused simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;
9670
9671 /** Called when a non-empty array starts. */
9672 simdjson_warn_unused simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
9673 /** Called when a non-empty array ends. */
9674 simdjson_warn_unused simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
9675 /** Called when an empty array is found. */
9676 simdjson_warn_unused simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;
9677
9678 /** Called when a non-empty object starts. */
9679 simdjson_warn_unused simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
9680 /**
9681 * Called when a key in a field is encountered.
9682 *
9683 * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
9684 * will be called after this with the field value.
9685 */
9686 simdjson_warn_unused simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
9687 /** Called when a non-empty object ends. */
9688 simdjson_warn_unused simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
9689 /** Called when an empty object is found. */
9690 simdjson_warn_unused simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;
9691
9692 /**
9693 * Called when a string, number, boolean or null is found.
9694 */
9695 simdjson_warn_unused simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
9696 /**
9697 * Called when a string, number, boolean or null is found at the top level of a document (i.e.
9698 * when there is no array or object and the entire document is a single string, number, boolean or
9699 * null.
9700 *
9701 * This is separate from primitive() because simdjson's normal primitive parsing routines assume
9702 * there is at least one more token after the value, which is only true in an array or object.
9703 */
9704 simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
9705
9706 simdjson_warn_unused simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
9707 simdjson_warn_unused simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
9708 simdjson_warn_unused simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
9709 simdjson_warn_unused simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
9710 simdjson_warn_unused simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
9711
9712 simdjson_warn_unused simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
9713 simdjson_warn_unused simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
9714 simdjson_warn_unused simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
9715 simdjson_warn_unused simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
9716 simdjson_warn_unused simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
9717
9718 /** Called each time a new field or element in an array or object is found. */
9719 simdjson_warn_unused simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;
9720
9721 /** Next location to write to tape */
9722 tape_writer tape;
9723 private:
9724 /** Next write location in the string buf for stage 2 parsing */
9725 uint8_t *current_string_buf_loc;
9726
9727 simdjson_really_inline tape_builder(dom::document &doc) noexcept;
9728
9729 simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
9730 simdjson_really_inline void start_container(json_iterator &iter) noexcept;
9731 simdjson_warn_unused simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
9732 simdjson_warn_unused simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
9733 simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
9734 simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
9735 }; // class tape_builder
9736
9737 template<bool STREAMING>
parse_document(dom_parser_implementation & dom_parser,dom::document & doc)9738 simdjson_warn_unused simdjson_really_inline error_code tape_builder::parse_document(
9739 dom_parser_implementation &dom_parser,
9740 dom::document &doc) noexcept {
9741 dom_parser.doc = &doc;
9742 json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
9743 tape_builder builder(doc);
9744 return iter.walk_document<STREAMING>(builder);
9745 }
9746
visit_root_primitive(json_iterator & iter,const uint8_t * value)9747 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
9748 return iter.visit_root_primitive(*this, value);
9749 }
visit_primitive(json_iterator & iter,const uint8_t * value)9750 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
9751 return iter.visit_primitive(*this, value);
9752 }
visit_empty_object(json_iterator & iter)9753 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
9754 return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
9755 }
visit_empty_array(json_iterator & iter)9756 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
9757 return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
9758 }
9759
visit_document_start(json_iterator & iter)9760 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
9761 start_container(iter);
9762 return SUCCESS;
9763 }
visit_object_start(json_iterator & iter)9764 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
9765 start_container(iter);
9766 return SUCCESS;
9767 }
visit_array_start(json_iterator & iter)9768 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
9769 start_container(iter);
9770 return SUCCESS;
9771 }
9772
visit_object_end(json_iterator & iter)9773 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
9774 return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
9775 }
visit_array_end(json_iterator & iter)9776 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
9777 return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
9778 }
visit_document_end(json_iterator & iter)9779 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
9780 constexpr uint32_t start_tape_index = 0;
9781 tape.append(start_tape_index, internal::tape_type::ROOT);
9782 tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
9783 return SUCCESS;
9784 }
visit_key(json_iterator & iter,const uint8_t * key)9785 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
9786 return visit_string(iter, key, true);
9787 }
9788
increment_count(json_iterator & iter)9789 simdjson_warn_unused simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
9790 iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
9791 return SUCCESS;
9792 }
9793
tape_builder(dom::document & doc)9794 simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
9795
visit_string(json_iterator & iter,const uint8_t * value,bool key)9796 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
9797 iter.log_value(key ? "key" : "string");
9798 uint8_t *dst = on_start_string(iter);
9799 dst = stringparsing::parse_string(value+1, dst);
9800 if (dst == nullptr) {
9801 iter.log_error("Invalid escape in string");
9802 return STRING_ERROR;
9803 }
9804 on_end_string(dst);
9805 return SUCCESS;
9806 }
9807
visit_root_string(json_iterator & iter,const uint8_t * value)9808 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
9809 return visit_string(iter, value);
9810 }
9811
visit_number(json_iterator & iter,const uint8_t * value)9812 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
9813 iter.log_value("number");
9814 return numberparsing::parse_number(value, tape);
9815 }
9816
visit_root_number(json_iterator & iter,const uint8_t * value)9817 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
9818 //
9819 // We need to make a copy to make sure that the string is space terminated.
9820 // This is not about padding the input, which should already padded up
9821 // to len + SIMDJSON_PADDING. However, we have no control at this stage
9822 // on how the padding was done. What if the input string was padded with nulls?
9823 // It is quite common for an input string to have an extra null character (C string).
9824 // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
9825 // document, but the string "9\0" by itself is fine. So we make a copy and
9826 // pad the input with spaces when we know that there is just one input element.
9827 // This copy is relatively expensive, but it will almost never be called in
9828 // practice unless you are in the strange scenario where you have many JSON
9829 // documents made of single atoms.
9830 //
9831 std::unique_ptr<uint8_t[]>copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
9832 if (copy.get() == nullptr) { return MEMALLOC; }
9833 std::memcpy(copy.get(), value, iter.remaining_len());
9834 std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
9835 error_code error = visit_number(iter, copy.get());
9836 return error;
9837 }
9838
visit_true_atom(json_iterator & iter,const uint8_t * value)9839 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
9840 iter.log_value("true");
9841 if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
9842 tape.append(0, internal::tape_type::TRUE_VALUE);
9843 return SUCCESS;
9844 }
9845
visit_root_true_atom(json_iterator & iter,const uint8_t * value)9846 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
9847 iter.log_value("true");
9848 if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
9849 tape.append(0, internal::tape_type::TRUE_VALUE);
9850 return SUCCESS;
9851 }
9852
visit_false_atom(json_iterator & iter,const uint8_t * value)9853 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
9854 iter.log_value("false");
9855 if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
9856 tape.append(0, internal::tape_type::FALSE_VALUE);
9857 return SUCCESS;
9858 }
9859
visit_root_false_atom(json_iterator & iter,const uint8_t * value)9860 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
9861 iter.log_value("false");
9862 if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
9863 tape.append(0, internal::tape_type::FALSE_VALUE);
9864 return SUCCESS;
9865 }
9866
visit_null_atom(json_iterator & iter,const uint8_t * value)9867 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
9868 iter.log_value("null");
9869 if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
9870 tape.append(0, internal::tape_type::NULL_VALUE);
9871 return SUCCESS;
9872 }
9873
visit_root_null_atom(json_iterator & iter,const uint8_t * value)9874 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
9875 iter.log_value("null");
9876 if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
9877 tape.append(0, internal::tape_type::NULL_VALUE);
9878 return SUCCESS;
9879 }
9880
9881 // private:
9882
next_tape_index(json_iterator & iter) const9883 simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
9884 return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
9885 }
9886
empty_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)9887 simdjson_warn_unused simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
9888 auto start_index = next_tape_index(iter);
9889 tape.append(start_index+2, start);
9890 tape.append(start_index, end);
9891 return SUCCESS;
9892 }
9893
start_container(json_iterator & iter)9894 simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
9895 iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
9896 iter.dom_parser.open_containers[iter.depth].count = 0;
9897 tape.skip(); // We don't actually *write* the start element until the end.
9898 }
9899
end_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)9900 simdjson_warn_unused simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
9901 // Write the ending tape element, pointing at the start location
9902 const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
9903 tape.append(start_tape_index, end);
9904 // Write the start tape element, pointing at the end location (and including count)
9905 // count can overflow if it exceeds 24 bits... so we saturate
9906 // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
9907 const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
9908 const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
9909 tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
9910 return SUCCESS;
9911 }
9912
on_start_string(json_iterator & iter)9913 simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
9914 // we advance the point, accounting for the fact that we have a NULL termination
9915 tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
9916 return current_string_buf_loc + sizeof(uint32_t);
9917 }
9918
on_end_string(uint8_t * dst)9919 simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
9920 uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
9921 // TODO check for overflow in case someone has a crazy string (>=4GB?)
9922 // But only add the overflow check when the document itself exceeds 4GB
9923 // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
9924 memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
9925 // NULL termination is still handy if you expect all your strings to
9926 // be NULL terminated? It comes at a small cost
9927 *dst = 0;
9928 current_string_buf_loc = dst + 1;
9929 }
9930
9931 } // namespace stage2
9932 } // unnamed namespace
9933 } // namespace ppc64
9934 } // namespace simdjson
9935 /* end file src/generic/stage2/tape_builder.h */
9936
9937 //
9938 // Implementation-specific overrides
9939 //
9940 namespace simdjson {
9941 namespace ppc64 {
9942 namespace {
9943 namespace stage1 {
9944
find_escaped(uint64_t backslash)9945 simdjson_really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
9946 // On PPC, we don't short-circuit this if there are no backslashes, because the branch gives us no
9947 // benefit and therefore makes things worse.
9948 // if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
9949 return find_escaped_branchless(backslash);
9950 }
9951
9952 } // namespace stage1
9953 } // unnamed namespace
9954
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const9955 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
9956 return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
9957 }
9958
stage1(const uint8_t * _buf,size_t _len,bool streaming)9959 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
9960 this->buf = _buf;
9961 this->len = _len;
9962 return ppc64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
9963 }
9964
validate_utf8(const char * buf,size_t len) const9965 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
9966 return ppc64::stage1::generic_validate_utf8(buf,len);
9967 }
9968
stage2(dom::document & _doc)9969 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
9970 return stage2::tape_builder::parse_document<false>(*this, _doc);
9971 }
9972
stage2_next(dom::document & _doc)9973 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
9974 return stage2::tape_builder::parse_document<true>(*this, _doc);
9975 }
9976
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)9977 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
9978 auto error = stage1(_buf, _len, false);
9979 if (error) { return error; }
9980 return stage2(_doc);
9981 }
9982
9983 } // namespace ppc64
9984 } // namespace simdjson
9985
9986 /* begin file include/simdjson/ppc64/end.h */
9987 /* end file include/simdjson/ppc64/end.h */
9988 /* end file src/ppc64/dom_parser_implementation.cpp */
9989 #endif
9990 #if SIMDJSON_IMPLEMENTATION_WESTMERE
9991 /* begin file src/westmere/implementation.cpp */
9992 /* begin file include/simdjson/westmere/begin.h */
9993 // redefining SIMDJSON_IMPLEMENTATION to "westmere"
9994 // #define SIMDJSON_IMPLEMENTATION westmere
9995 SIMDJSON_TARGET_WESTMERE
9996 /* end file include/simdjson/westmere/begin.h */
9997
9998 namespace simdjson {
9999 namespace westmere {
10000
create_dom_parser_implementation(size_t capacity,size_t max_depth,std::unique_ptr<internal::dom_parser_implementation> & dst) const10001 simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
10002 size_t capacity,
10003 size_t max_depth,
10004 std::unique_ptr<internal::dom_parser_implementation>& dst
10005 ) const noexcept {
10006 dst.reset( new (std::nothrow) dom_parser_implementation() );
10007 if (!dst) { return MEMALLOC; }
10008 dst->set_capacity(capacity);
10009 dst->set_max_depth(max_depth);
10010 return SUCCESS;
10011 }
10012
10013 } // namespace westmere
10014 } // namespace simdjson
10015
10016 /* begin file include/simdjson/westmere/end.h */
10017 SIMDJSON_UNTARGET_WESTMERE
10018 /* end file include/simdjson/westmere/end.h */
10019 /* end file src/westmere/implementation.cpp */
10020 /* begin file src/westmere/dom_parser_implementation.cpp */
10021 /* begin file include/simdjson/westmere/begin.h */
10022 // redefining SIMDJSON_IMPLEMENTATION to "westmere"
10023 // #define SIMDJSON_IMPLEMENTATION westmere
10024 SIMDJSON_TARGET_WESTMERE
10025 /* end file include/simdjson/westmere/begin.h */
10026
10027 //
10028 // Stage 1
10029 //
10030
10031 namespace simdjson {
10032 namespace westmere {
10033 namespace {
10034
10035 using namespace simd;
10036
10037 struct json_character_block {
10038 static simdjson_really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
10039
whitespacesimdjson::westmere::__anon9bb6be6f3311::json_character_block10040 simdjson_really_inline uint64_t whitespace() const noexcept { return _whitespace; }
opsimdjson::westmere::__anon9bb6be6f3311::json_character_block10041 simdjson_really_inline uint64_t op() const noexcept { return _op; }
scalarsimdjson::westmere::__anon9bb6be6f3311::json_character_block10042 simdjson_really_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
10043
10044 uint64_t _whitespace;
10045 uint64_t _op;
10046 };
10047
classify(const simd::simd8x64<uint8_t> & in)10048 simdjson_really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
10049 // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
10050 // we can't use the generic lookup_16.
10051 auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
10052
10053 // The 6 operators (:,[]{}) have these values:
10054 //
10055 // , 2C
10056 // : 3A
10057 // [ 5B
10058 // { 7B
10059 // ] 5D
10060 // } 7D
10061 //
10062 // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique.
10063 // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then
10064 // match it (against | 0x20).
10065 //
10066 // To prevent recognizing other characters, everything else gets compared with 0, which cannot
10067 // match due to the | 0x20.
10068 //
10069 // NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A) like ,
10070 // and :. This gets caught in stage 2, which checks the actual character to ensure the right
10071 // operators are in the right places.
10072 const auto op_table = simd8<uint8_t>::repeat_16(
10073 0, 0, 0, 0,
10074 0, 0, 0, 0,
10075 0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B
10076 ',', '}', 0, 0 // , = 2C, ] = 5D, } = 7D
10077 );
10078
10079 // We compute whitespace and op separately. If the code later only use one or the
10080 // other, given the fact that all functions are aggressively inlined, we can
10081 // hope that useless computations will be omitted. This is namely case when
10082 // minifying (we only need whitespace).
10083
10084
10085 const uint64_t whitespace = in.eq({
10086 _mm_shuffle_epi8(whitespace_table, in.chunks[0]),
10087 _mm_shuffle_epi8(whitespace_table, in.chunks[1]),
10088 _mm_shuffle_epi8(whitespace_table, in.chunks[2]),
10089 _mm_shuffle_epi8(whitespace_table, in.chunks[3])
10090 });
10091 // Turn [ and ] into { and }
10092 const simd8x64<uint8_t> curlified{
10093 in.chunks[0] | 0x20,
10094 in.chunks[1] | 0x20,
10095 in.chunks[2] | 0x20,
10096 in.chunks[3] | 0x20
10097 };
10098 const uint64_t op = curlified.eq({
10099 _mm_shuffle_epi8(op_table, in.chunks[0]),
10100 _mm_shuffle_epi8(op_table, in.chunks[1]),
10101 _mm_shuffle_epi8(op_table, in.chunks[2]),
10102 _mm_shuffle_epi8(op_table, in.chunks[3])
10103 });
10104 return { whitespace, op };
10105 }
10106
is_ascii(const simd8x64<uint8_t> & input)10107 simdjson_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
10108 return input.reduce_or().is_ascii();
10109 }
10110
must_be_continuation(const simd8<uint8_t> prev1,const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)10111 simdjson_unused simdjson_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
10112 simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
10113 simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
10114 simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
10115 // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
10116 return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
10117 }
10118
must_be_2_3_continuation(const simd8<uint8_t> prev2,const simd8<uint8_t> prev3)10119 simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
10120 simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
10121 simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
10122 // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
10123 return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
10124 }
10125
10126 } // unnamed namespace
10127 } // namespace westmere
10128 } // namespace simdjson
10129
10130 /* begin file src/generic/stage1/utf8_lookup4_algorithm.h */
10131 namespace simdjson {
10132 namespace westmere {
10133 namespace {
10134 namespace utf8_validation {
10135
10136 using namespace simd;
10137
check_special_cases(const simd8<uint8_t> input,const simd8<uint8_t> prev1)10138 simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
10139 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
10140 // Bit 1 = Too Long (ASCII followed by continuation)
10141 // Bit 2 = Overlong 3-byte
10142 // Bit 4 = Surrogate
10143 // Bit 5 = Overlong 2-byte
10144 // Bit 7 = Two Continuations
10145 constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______
10146 // 11______ 11______
10147 constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______
10148 constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____
10149 constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____
10150 constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______
10151 constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______
10152 constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____
10153 // 11110100 101_____
10154 // 11110101 1001____
10155 // 11110101 101_____
10156 // 1111011_ 1001____
10157 // 1111011_ 101_____
10158 // 11111___ 1001____
10159 // 11111___ 101_____
10160 constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
10161 // 11110101 1000____
10162 // 1111011_ 1000____
10163 // 11111___ 1000____
10164 constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____
10165
10166 const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
10167 // 0_______ ________ <ASCII in byte 1>
10168 TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
10169 TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
10170 // 10______ ________ <continuation in byte 1>
10171 TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
10172 // 1100____ ________ <two byte lead in byte 1>
10173 TOO_SHORT | OVERLONG_2,
10174 // 1101____ ________ <two byte lead in byte 1>
10175 TOO_SHORT,
10176 // 1110____ ________ <three byte lead in byte 1>
10177 TOO_SHORT | OVERLONG_3 | SURROGATE,
10178 // 1111____ ________ <four+ byte lead in byte 1>
10179 TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
10180 );
10181 constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
10182 const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
10183 // ____0000 ________
10184 CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
10185 // ____0001 ________
10186 CARRY | OVERLONG_2,
10187 // ____001_ ________
10188 CARRY,
10189 CARRY,
10190
10191 // ____0100 ________
10192 CARRY | TOO_LARGE,
10193 // ____0101 ________
10194 CARRY | TOO_LARGE | TOO_LARGE_1000,
10195 // ____011_ ________
10196 CARRY | TOO_LARGE | TOO_LARGE_1000,
10197 CARRY | TOO_LARGE | TOO_LARGE_1000,
10198
10199 // ____1___ ________
10200 CARRY | TOO_LARGE | TOO_LARGE_1000,
10201 CARRY | TOO_LARGE | TOO_LARGE_1000,
10202 CARRY | TOO_LARGE | TOO_LARGE_1000,
10203 CARRY | TOO_LARGE | TOO_LARGE_1000,
10204 CARRY | TOO_LARGE | TOO_LARGE_1000,
10205 // ____1101 ________
10206 CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
10207 CARRY | TOO_LARGE | TOO_LARGE_1000,
10208 CARRY | TOO_LARGE | TOO_LARGE_1000
10209 );
10210 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
10211 // ________ 0_______ <ASCII in byte 2>
10212 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
10213 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
10214
10215 // ________ 1000____
10216 TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
10217 // ________ 1001____
10218 TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
10219 // ________ 101_____
10220 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
10221 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
10222
10223 // ________ 11______
10224 TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
10225 );
10226 return (byte_1_high & byte_1_low & byte_2_high);
10227 }
check_multibyte_lengths(const simd8<uint8_t> input,const simd8<uint8_t> prev_input,const simd8<uint8_t> sc)10228 simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
10229 const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
10230 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
10231 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
10232 simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
10233 simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
10234 return must23_80 ^ sc;
10235 }
10236
10237 //
10238 // Return nonzero if there are incomplete multibyte characters at the end of the block:
10239 // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
10240 //
is_incomplete(const simd8<uint8_t> input)10241 simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
10242 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
10243 // ... 1111____ 111_____ 11______
10244 static const uint8_t max_array[32] = {
10245 255, 255, 255, 255, 255, 255, 255, 255,
10246 255, 255, 255, 255, 255, 255, 255, 255,
10247 255, 255, 255, 255, 255, 255, 255, 255,
10248 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
10249 };
10250 const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
10251 return input.gt_bits(max_value);
10252 }
10253
10254 struct utf8_checker {
10255 // If this is nonzero, there has been a UTF-8 error.
10256 simd8<uint8_t> error;
10257 // The last input we received
10258 simd8<uint8_t> prev_input_block;
10259 // Whether the last input we received was incomplete (used for ASCII fast path)
10260 simd8<uint8_t> prev_incomplete;
10261
10262 //
10263 // Check whether the current bytes are valid UTF-8.
10264 //
check_utf8_bytessimdjson::westmere::__anon9bb6be6f3411::utf8_validation::utf8_checker10265 simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
10266 // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
10267 // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
10268 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
10269 simd8<uint8_t> sc = check_special_cases(input, prev1);
10270 this->error |= check_multibyte_lengths(input, prev_input, sc);
10271 }
10272
10273 // The only problem that can happen at EOF is that a multibyte character is too short
10274 // or a byte value too large in the last bytes: check_special_cases only checks for bytes
10275 // too large in the first of two bytes.
check_eofsimdjson::westmere::__anon9bb6be6f3411::utf8_validation::utf8_checker10276 simdjson_really_inline void check_eof() {
10277 // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
10278 // possibly finish them.
10279 this->error |= this->prev_incomplete;
10280 }
10281
check_next_inputsimdjson::westmere::__anon9bb6be6f3411::utf8_validation::utf8_checker10282 simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
10283 if(simdjson_likely(is_ascii(input))) {
10284 this->error |= this->prev_incomplete;
10285 } else {
10286 // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
10287 static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
10288 "We support either two or four chunks per 64-byte block.");
10289 if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
10290 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
10291 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
10292 } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
10293 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
10294 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
10295 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
10296 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
10297 }
10298 this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
10299 this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
10300
10301 }
10302 }
10303 // do not forget to call check_eof!
errorssimdjson::westmere::__anon9bb6be6f3411::utf8_validation::utf8_checker10304 simdjson_really_inline error_code errors() {
10305 return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
10306 }
10307
10308 }; // struct utf8_checker
10309 } // namespace utf8_validation
10310
10311 using utf8_validation::utf8_checker;
10312
10313 } // unnamed namespace
10314 } // namespace westmere
10315 } // namespace simdjson
10316 /* end file src/generic/stage1/utf8_lookup4_algorithm.h */
10317 /* begin file src/generic/stage1/json_structural_indexer.h */
// This file contains the common code every implementation uses in stage1.
// It is intended to be included multiple times and compiled multiple times.
// We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation).
10322
10323 /* begin file src/generic/stage1/buf_block_reader.h */
10324 namespace simdjson {
10325 namespace westmere {
10326 namespace {
10327
10328 // Walks through a buffer in block-sized increments, loading the last part with spaces
10329 template<size_t STEP_SIZE>
10330 struct buf_block_reader {
10331 public:
10332 simdjson_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
10333 simdjson_really_inline size_t block_index();
10334 simdjson_really_inline bool has_full_block() const;
10335 simdjson_really_inline const uint8_t *full_block() const;
10336 /**
10337 * Get the last block, padded with spaces.
10338 *
10339 * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
10340 * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
10341 * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
10342 *
10343 * @return the number of effective characters in the last block.
10344 */
10345 simdjson_really_inline size_t get_remainder(uint8_t *dst) const;
10346 simdjson_really_inline void advance();
10347 private:
10348 const uint8_t *buf;
10349 const size_t len;
10350 const size_t lenminusstep;
10351 size_t idx;
10352 };
10353
10354 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t * text)10355 simdjson_unused static char * format_input_text_64(const uint8_t *text) {
10356 static char buf[sizeof(simd8x64<uint8_t>) + 1];
10357 for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
10358 buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
10359 }
10360 buf[sizeof(simd8x64<uint8_t>)] = '\0';
10361 return buf;
10362 }
10363
10364 // Routines to print masks and text for debugging bitmask operations
format_input_text(const simd8x64<uint8_t> & in)10365 simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
10366 static char buf[sizeof(simd8x64<uint8_t>) + 1];
10367 in.store(reinterpret_cast<uint8_t*>(buf));
10368 for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
10369 if (buf[i] < ' ') { buf[i] = '_'; }
10370 }
10371 buf[sizeof(simd8x64<uint8_t>)] = '\0';
10372 return buf;
10373 }
10374
format_mask(uint64_t mask)10375 simdjson_unused static char * format_mask(uint64_t mask) {
10376 static char buf[sizeof(simd8x64<uint8_t>) + 1];
10377 for (size_t i=0; i<64; i++) {
10378 buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
10379 }
10380 buf[64] = '\0';
10381 return buf;
10382 }
10383
10384 template<size_t STEP_SIZE>
buf_block_reader(const uint8_t * _buf,size_t _len)10385 simdjson_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
10386
10387 template<size_t STEP_SIZE>
block_index()10388 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
10389
10390 template<size_t STEP_SIZE>
has_full_block() const10391 simdjson_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
10392 return idx < lenminusstep;
10393 }
10394
10395 template<size_t STEP_SIZE>
full_block() const10396 simdjson_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
10397 return &buf[idx];
10398 }
10399
10400 template<size_t STEP_SIZE>
get_remainder(uint8_t * dst) const10401 simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
10402 if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
10403 std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
10404 std::memcpy(dst, buf + idx, len - idx);
10405 return len - idx;
10406 }
10407
10408 template<size_t STEP_SIZE>
advance()10409 simdjson_really_inline void buf_block_reader<STEP_SIZE>::advance() {
10410 idx += STEP_SIZE;
10411 }
10412
10413 } // unnamed namespace
10414 } // namespace westmere
10415 } // namespace simdjson
10416 /* end file src/generic/stage1/buf_block_reader.h */
10417 /* begin file src/generic/stage1/json_string_scanner.h */
10418 namespace simdjson {
10419 namespace westmere {
10420 namespace {
10421 namespace stage1 {
10422
10423 struct json_string_block {
10424 // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_string_blocksimdjson::westmere::__anon9bb6be6f3611::stage1::json_string_block10425 simdjson_really_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) :
10426 _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {}
10427
10428 // Escaped characters (characters following an escape() character)
escapedsimdjson::westmere::__anon9bb6be6f3611::stage1::json_string_block10429 simdjson_really_inline uint64_t escaped() const { return _escaped; }
10430 // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
escapesimdjson::westmere::__anon9bb6be6f3611::stage1::json_string_block10431 simdjson_really_inline uint64_t escape() const { return _backslash & ~_escaped; }
10432 // Real (non-backslashed) quotes
quotesimdjson::westmere::__anon9bb6be6f3611::stage1::json_string_block10433 simdjson_really_inline uint64_t quote() const { return _quote; }
10434 // Start quotes of strings
string_startsimdjson::westmere::__anon9bb6be6f3611::stage1::json_string_block10435 simdjson_really_inline uint64_t string_start() const { return _quote & _in_string; }
10436 // End quotes of strings
string_endsimdjson::westmere::__anon9bb6be6f3611::stage1::json_string_block10437 simdjson_really_inline uint64_t string_end() const { return _quote & ~_in_string; }
10438 // Only characters inside the string (not including the quotes)
string_contentsimdjson::westmere::__anon9bb6be6f3611::stage1::json_string_block10439 simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
10440 // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_inside_stringsimdjson::westmere::__anon9bb6be6f3611::stage1::json_string_block10441 simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
10442 // Return a mask of whether the given characters are inside a string (only works on non-quotes)
non_quote_outside_stringsimdjson::westmere::__anon9bb6be6f3611::stage1::json_string_block10443 simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
10444 // Tail of string (everything except the start quote)
string_tailsimdjson::westmere::__anon9bb6be6f3611::stage1::json_string_block10445 simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
10446
10447 // backslash characters
10448 uint64_t _backslash;
10449 // escaped characters (backslashed--does not include the hex characters after \u)
10450 uint64_t _escaped;
10451 // real quotes (non-backslashed ones)
10452 uint64_t _quote;
10453 // string characters (includes start quote but not end quote)
10454 uint64_t _in_string;
10455 };
10456
10457 // Scans blocks for string characters, storing the state necessary to do so
10458 class json_string_scanner {
10459 public:
10460 simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
10461 // Returns either UNCLOSED_STRING or SUCCESS
10462 simdjson_really_inline error_code finish();
10463
10464 private:
10465 // Intended to be defined by the implementation
10466 simdjson_really_inline uint64_t find_escaped(uint64_t escape);
10467 simdjson_really_inline uint64_t find_escaped_branchless(uint64_t escape);
10468
10469 // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
10470 uint64_t prev_in_string = 0ULL;
10471 // Whether the first character of the next iteration is escaped.
10472 uint64_t prev_escaped = 0ULL;
10473 };
10474
10475 //
10476 // Finds escaped characters (characters following \).
10477 //
10478 // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
10479 //
10480 // Does this by:
10481 // - Shift the escape mask to get potentially escaped characters (characters after backslashes).
10482 // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
10483 // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
10484 //
10485 // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
10486 // escape sequences, filters out the ones that start on even bits, and adds that to the mask of
10487 // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
10488 // the start bit causes a carry), and leaves even-bit sequences alone.
10489 //
10490 // Example:
10491 //
10492 // text | \\\ | \\\"\\\" \\\" \\"\\" |
10493 // escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape
10494 // odd_starts | x | x x x | escape & ~even_bits & ~follows_escape
10495 // even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later
10496 // invert_mask | | cxxx c xx c| even_seq << 1
10497 // follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit
10498 // escaped | x | x x x x x x x x |
10499 // desired | x | x x x x x x x x |
10500 // text | \\\ | \\\"\\\" \\\" \\"\\" |
10501 //
find_escaped_branchless(uint64_t backslash)10502 simdjson_really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) {
10503 // If there was overflow, pretend the first character isn't a backslash
10504 backslash &= ~prev_escaped;
10505 uint64_t follows_escape = backslash << 1 | prev_escaped;
10506
10507 // Get sequences starting on even bits by clearing out the odd series using +
10508 const uint64_t even_bits = 0x5555555555555555ULL;
10509 uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
10510 uint64_t sequences_starting_on_even_bits;
10511 prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
10512 uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
10513
10514 // Mask every other backslashed character as an escaped character
10515 // Flip the mask for sequences that start on even bits, to correct them
10516 return (even_bits ^ invert_mask) & follows_escape;
10517 }
10518
10519 //
10520 // Return a mask of all string characters plus end quotes.
10521 //
10522 // prev_escaped is overflow saying whether the next character is escaped.
10523 // prev_in_string is overflow saying whether we're still in a string.
10524 //
10525 // Backslash sequences outside of quotes will be detected in stage 2.
10526 //
next(const simd::simd8x64<uint8_t> & in)10527 simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
10528 const uint64_t backslash = in.eq('\\');
10529 const uint64_t escaped = find_escaped(backslash);
10530 const uint64_t quote = in.eq('"') & ~escaped;
10531
10532 //
10533 // prefix_xor flips on bits inside the string (and flips off the end quote).
10534 //
10535 // Then we xor with prev_in_string: if we were in a string already, its effect is flipped
10536 // (characters inside strings are outside, and characters outside strings are inside).
10537 //
10538 const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
10539
10540 //
10541 // Check if we're still in a string at the end of the box so the next block will know
10542 //
10543 // right shift of a signed value expected to be well-defined and standard
10544 // compliant as of C++20, John Regher from Utah U. says this is fine code
10545 //
10546 prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
10547
10548 // Use ^ to turn the beginning quote off, and the end quote on.
10549
10550 // We are returning a function-local object so either we get a move constructor
10551 // or we get copy elision.
10552 return json_string_block(
10553 backslash,
10554 escaped,
10555 quote,
10556 in_string
10557 );
10558 }
10559
finish()10560 simdjson_really_inline error_code json_string_scanner::finish() {
10561 if (prev_in_string) {
10562 return UNCLOSED_STRING;
10563 }
10564 return SUCCESS;
10565 }
10566
10567 } // namespace stage1
10568 } // unnamed namespace
10569 } // namespace westmere
10570 } // namespace simdjson
10571 /* end file src/generic/stage1/json_string_scanner.h */
10572 /* begin file src/generic/stage1/json_scanner.h */
10573 namespace simdjson {
10574 namespace westmere {
10575 namespace {
10576 namespace stage1 {
10577
10578 /**
10579 * A block of scanned json, with information on operators and scalars.
10580 *
10581 * We seek to identify pseudo-structural characters. Anything that is inside
10582 * a string must be omitted (hence & ~_string.string_tail()).
10583 * Otherwise, pseudo-structural characters come in two forms.
10584 * 1. We have the structural characters ([,],{,},:, comma). The
10585 * term 'structural character' is from the JSON RFC.
10586 * 2. We have the 'scalar pseudo-structural characters'.
10587 * Scalars are quotes, and any character except structural characters and white space.
10588 *
 * To identify the scalar pseudo-structural characters, we must look at what comes
 * before them: it must be a space, a quote or a structural character.
10591 * Starting with simdjson v0.3, we identify them by
10592 * negation: we identify everything that is followed by a non-quote scalar,
10593 * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
10594 */
10595 struct json_block {
10596 public:
10597 // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
json_blocksimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10598 simdjson_really_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
10599 _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
json_blocksimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10600 simdjson_really_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
10601 _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
10602
10603 /**
10604 * The start of structurals.
10605 * In simdjson prior to v0.3, these were called the pseudo-structural characters.
10606 **/
structural_startsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10607 simdjson_really_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
10608 /** All JSON whitespace (i.e. not in a string) */
whitespacesimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10609 simdjson_really_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
10610
10611 // Helpers
10612
10613 /** Whether the given characters are inside a string (only works on non-quotes) */
non_quote_inside_stringsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10614 simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
10615 /** Whether the given characters are outside a string (only works on non-quotes) */
non_quote_outside_stringsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10616 simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
10617
10618 // string and escape characters
10619 json_string_block _string;
10620 // whitespace, structural characters ('operators'), scalars
10621 json_character_block _characters;
10622 // whether the previous character was a scalar
10623 uint64_t _follows_potential_nonquote_scalar;
10624 private:
10625 // Potential structurals (i.e. disregarding strings)
10626
10627 /**
10628 * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
10629 * They may reside inside a string.
10630 **/
potential_structural_startsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10631 simdjson_really_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
10632 /**
10633 * The start of non-operator runs, like 123, true and "abc".
10634 * It main reside inside a string.
10635 **/
potential_scalar_startsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10636 simdjson_really_inline uint64_t potential_scalar_start() const noexcept {
10637 // The term "scalar" refers to anything except structural characters and white space
10638 // (so letters, numbers, quotes).
10639 // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
10640 // then we know that it is irrelevant structurally.
10641 return _characters.scalar() & ~follows_potential_scalar();
10642 }
10643 /**
10644 * Whether the given character is immediately after a non-operator like 123, true.
10645 * The characters following a quote are not included.
10646 */
follows_potential_scalarsimdjson::westmere::__anon9bb6be6f3711::stage1::json_block10647 simdjson_really_inline uint64_t follows_potential_scalar() const noexcept {
10648 // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
10649 // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
10650 // white space.
10651 // It is understood that within quoted region, anything at all could be marked (irrelevant).
10652 return _follows_potential_nonquote_scalar;
10653 }
10654 };
10655
10656 /**
10657 * Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
10658 *
10659 * The scanner starts by calculating two distinct things:
10660 * - string characters (taking \" into account)
10661 * - structural characters or 'operators' ([]{},:, comma)
10662 * and scalars (runs of non-operators like 123, true and "abc")
10663 *
10664 * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
10665 * in particular, the operator/scalar bit will find plenty of things that are actually part of
10666 * strings. When we're done, json_block will fuse the two together by masking out tokens that are
10667 * part of a string.
10668 */
10669 class json_scanner {
10670 public:
json_scanner()10671 json_scanner() {}
10672 simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
10673 // Returns either UNCLOSED_STRING or SUCCESS
10674 simdjson_really_inline error_code finish();
10675
10676 private:
10677 // Whether the last character of the previous iteration is part of a scalar token
10678 // (anything except whitespace or a structural character/'operator').
10679 uint64_t prev_scalar = 0ULL;
10680 json_string_scanner string_scanner{};
10681 };
10682
10683
10684 //
10685 // Check if the current character immediately follows a matching character.
10686 //
10687 // For example, this checks for quotes with backslashes in front of them:
10688 //
10689 // const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
10690 //
follows(const uint64_t match,uint64_t & overflow)10691 simdjson_really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
10692 const uint64_t result = match << 1 | overflow;
10693 overflow = match >> 63;
10694 return result;
10695 }
10696
next(const simd::simd8x64<uint8_t> & in)10697 simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
10698 json_string_block strings = string_scanner.next(in);
10699 // identifies the white-space and the structurat characters
10700 json_character_block characters = json_character_block::classify(in);
10701 // The term "scalar" refers to anything except structural characters and white space
10702 // (so letters, numbers, quotes).
10703 // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
10704 //
10705 // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
10706 // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
10707 // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we
10708 // may need to add an extra check when parsing strings.
10709 //
10710 // Performance: there are many ways to skin this cat.
10711 const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote();
10712 uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar);
10713 // We are returning a function-local object so either we get a move constructor
10714 // or we get copy elision.
10715 return json_block(
10716 strings,// strings is a function-local object so either it moves or the copy is elided.
10717 characters,
10718 follows_nonquote_scalar
10719 );
10720 }
10721
finish()10722 simdjson_really_inline error_code json_scanner::finish() {
10723 return string_scanner.finish();
10724 }
10725
10726 } // namespace stage1
10727 } // unnamed namespace
10728 } // namespace westmere
10729 } // namespace simdjson
10730 /* end file src/generic/stage1/json_scanner.h */
10731 /* begin file src/generic/stage1/json_minifier.h */
10732 // This file contains the common code every implementation uses in stage1
10733 // It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
10736
10737 namespace simdjson {
10738 namespace westmere {
10739 namespace {
10740 namespace stage1 {
10741
10742 class json_minifier {
10743 public:
10744 template<size_t STEP_SIZE>
10745 static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
10746
10747 private:
json_minifier(uint8_t * _dst)10748 simdjson_really_inline json_minifier(uint8_t *_dst)
10749 : dst{_dst}
10750 {}
10751 template<size_t STEP_SIZE>
10752 simdjson_really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
10753 simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
10754 simdjson_really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
10755 json_scanner scanner{};
10756 uint8_t *dst;
10757 };
10758
next(const simd::simd8x64<uint8_t> & in,const json_block & block)10759 simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
10760 uint64_t mask = block.whitespace();
10761 in.compress(mask, dst);
10762 dst += 64 - count_ones(mask);
10763 }
10764
finish(uint8_t * dst_start,size_t & dst_len)10765 simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
10766 error_code error = scanner.finish();
10767 if (error) { dst_len = 0; return error; }
10768 dst_len = dst - dst_start;
10769 return SUCCESS;
10770 }
10771
10772 template<>
step(const uint8_t * block_buf,buf_block_reader<128> & reader)10773 simdjson_really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
10774 simd::simd8x64<uint8_t> in_1(block_buf);
10775 simd::simd8x64<uint8_t> in_2(block_buf+64);
10776 json_block block_1 = scanner.next(in_1);
10777 json_block block_2 = scanner.next(in_2);
10778 this->next(in_1, block_1);
10779 this->next(in_2, block_2);
10780 reader.advance();
10781 }
10782
10783 template<>
step(const uint8_t * block_buf,buf_block_reader<64> & reader)10784 simdjson_really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
10785 simd::simd8x64<uint8_t> in_1(block_buf);
10786 json_block block_1 = scanner.next(in_1);
10787 this->next(block_buf, block_1);
10788 reader.advance();
10789 }
10790
10791 template<size_t STEP_SIZE>
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len)10792 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
10793 buf_block_reader<STEP_SIZE> reader(buf, len);
10794 json_minifier minifier(dst);
10795
10796 // Index the first n-1 blocks
10797 while (reader.has_full_block()) {
10798 minifier.step<STEP_SIZE>(reader.full_block(), reader);
10799 }
10800
10801 // Index the last (remainder) block, padded with spaces
10802 uint8_t block[STEP_SIZE];
10803 size_t remaining_bytes = reader.get_remainder(block);
10804 if (remaining_bytes > 0) {
10805 // We do not want to write directly to the output stream. Rather, we write
10806 // to a local buffer (for safety).
10807 uint8_t out_block[STEP_SIZE];
10808 uint8_t * const guarded_dst{minifier.dst};
10809 minifier.dst = out_block;
10810 minifier.step<STEP_SIZE>(block, reader);
10811 size_t to_write = minifier.dst - out_block;
10812 // In some cases, we could be enticed to consider the padded spaces
10813 // as part of the string. This is fine as long as we do not write more
10814 // than we consumed.
10815 if(to_write > remaining_bytes) { to_write = remaining_bytes; }
10816 memcpy(guarded_dst, out_block, to_write);
10817 minifier.dst = guarded_dst + to_write;
10818 }
10819 return minifier.finish(dst, dst_len);
10820 }
10821
10822 } // namespace stage1
10823 } // unnamed namespace
10824 } // namespace westmere
10825 } // namespace simdjson
10826 /* end file src/generic/stage1/json_minifier.h */
10827 /* begin file src/generic/stage1/find_next_document_index.h */
10828 namespace simdjson {
10829 namespace westmere {
10830 namespace {
10831
10832 /**
10833 * This algorithm is used to quickly identify the last structural position that
10834 * makes up a complete document.
10835 *
10836 * It does this by going backwards and finding the last *document boundary* (a
10837 * place where one value follows another without a comma between them). If the
10838 * last document (the characters after the boundary) has an equal number of
10839 * start and end brackets, it is considered complete.
10840 *
10841 * Simply put, we iterate over the structural characters, starting from
10842 * the end. We consider that we found the end of a JSON document when the
10843 * first element of the pair is NOT one of these characters: '{' '[' ';' ','
10844 * and when the second element is NOT one of these characters: '}' '}' ';' ','.
10845 *
10846 * This simple comparison works most of the time, but it does not cover cases
10847 * where the batch's structural indexes contain a perfect amount of documents.
10848 * In such a case, we do not have access to the structural index which follows
10849 * the last document, therefore, we do not have access to the second element in
10850 * the pair, and that means we cannot identify the last document. To fix this
10851 * issue, we keep a count of the open and closed curly/square braces we found
10852 * while searching for the pair. When we find a pair AND the count of open and
10853 * closed curly/square braces is the same, we know that we just passed a
10854 * complete document, therefore the last json buffer location is the end of the
10855 * batch.
10856 */
find_next_document_index(dom_parser_implementation & parser)10857 simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
10858 // TODO don't count separately, just figure out depth
10859 auto arr_cnt = 0;
10860 auto obj_cnt = 0;
10861 for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
10862 auto idxb = parser.structural_indexes[i];
10863 switch (parser.buf[idxb]) {
10864 case ':':
10865 case ',':
10866 continue;
10867 case '}':
10868 obj_cnt--;
10869 continue;
10870 case ']':
10871 arr_cnt--;
10872 continue;
10873 case '{':
10874 obj_cnt++;
10875 break;
10876 case '[':
10877 arr_cnt++;
10878 break;
10879 }
10880 auto idxa = parser.structural_indexes[i - 1];
10881 switch (parser.buf[idxa]) {
10882 case '{':
10883 case '[':
10884 case ':':
10885 case ',':
10886 continue;
10887 }
10888 // Last document is complete, so the next document will appear after!
10889 if (!arr_cnt && !obj_cnt) {
10890 return parser.n_structural_indexes;
10891 }
10892 // Last document is incomplete; mark the document at i + 1 as the next one
10893 return i;
10894 }
10895 return 0;
10896 }
10897
10898 } // unnamed namespace
10899 } // namespace westmere
10900 } // namespace simdjson
10901 /* end file src/generic/stage1/find_next_document_index.h */
10902
10903 namespace simdjson {
10904 namespace westmere {
10905 namespace {
10906 namespace stage1 {
10907
10908 class bit_indexer {
10909 public:
10910 uint32_t *tail;
10911
bit_indexer(uint32_t * index_buf)10912 simdjson_really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
10913
10914 // flatten out values in 'bits' assuming that they are are to have values of idx
10915 // plus their position in the bitvector, and store these indexes at
10916 // base_ptr[base] incrementing base as we go
10917 // will potentially store extra values beyond end of valid bits, so base_ptr
10918 // needs to be large enough to handle this
write(uint32_t idx,uint64_t bits)10919 simdjson_really_inline void write(uint32_t idx, uint64_t bits) {
10920 // In some instances, the next branch is expensive because it is mispredicted.
10921 // Unfortunately, in other cases,
10922 // it helps tremendously.
10923 if (bits == 0)
10924 return;
10925 int cnt = static_cast<int>(count_ones(bits));
10926
10927 // Do the first 8 all together
10928 for (int i=0; i<8; i++) {
10929 this->tail[i] = idx + trailing_zeroes(bits);
10930 bits = clear_lowest_bit(bits);
10931 }
10932
10933 // Do the next 8 all together (we hope in most cases it won't happen at all
10934 // and the branch is easily predicted).
10935 if (simdjson_unlikely(cnt > 8)) {
10936 for (int i=8; i<16; i++) {
10937 this->tail[i] = idx + trailing_zeroes(bits);
10938 bits = clear_lowest_bit(bits);
10939 }
10940
10941 // Most files don't have 16+ structurals per block, so we take several basically guaranteed
10942 // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
10943 // or the start of a value ("abc" true 123) every four characters.
10944 if (simdjson_unlikely(cnt > 16)) {
10945 int i = 16;
10946 do {
10947 this->tail[i] = idx + trailing_zeroes(bits);
10948 bits = clear_lowest_bit(bits);
10949 i++;
10950 } while (i < cnt);
10951 }
10952 }
10953
10954 this->tail += cnt;
10955 }
10956 };
10957
10958 class json_structural_indexer {
10959 public:
10960 /**
10961 * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
10962 *
10963 * @param partial Setting the partial parameter to true allows the find_structural_bits to
10964 * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
10965 * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
10966 */
10967 template<size_t STEP_SIZE>
10968 static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
10969
10970 private:
10971 simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes);
10972 template<size_t STEP_SIZE>
10973 simdjson_really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
10974 simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
10975 simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
10976
10977 json_scanner scanner{};
10978 utf8_checker checker{};
10979 bit_indexer indexer;
10980 uint64_t prev_structurals = 0;
10981 uint64_t unescaped_chars_error = 0;
10982 };
10983
json_structural_indexer(uint32_t * structural_indexes)10984 simdjson_really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
10985
10986 // Skip the last character if it is partial
trim_partial_utf8(const uint8_t * buf,size_t len)10987 simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
10988 if (simdjson_unlikely(len < 3)) {
10989 switch (len) {
10990 case 2:
10991 if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
10992 if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
10993 return len;
10994 case 1:
10995 if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
10996 return len;
10997 case 0:
10998 return len;
10999 }
11000 }
11001 if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
11002 if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
11003 if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
11004 return len;
11005 }
11006
11007 //
11008 // PERF NOTES:
11009 // We pipe 2 inputs through these stages:
//   1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
//      2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
11012 // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
11013 // The output of step 1 depends entirely on this information. These functions don't quite use
11014 // up enough CPU: the second half of the functions is highly serial, only using 1 execution core
11015 // at a time. The second input's scans has some dependency on the first ones finishing it, but
11016 // they can make a lot of progress before they need that information.
11017 // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
11018 // to finish: utf-8 checks and generating the output from the last iteration.
11019 //
11020 // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
11021 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
11022 // workout.
11023 //
11024 template<size_t STEP_SIZE>
index(const uint8_t * buf,size_t len,dom_parser_implementation & parser,bool partial)11025 error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
11026 if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
11027 if (partial) { len = trim_partial_utf8(buf, len); }
11028
11029 buf_block_reader<STEP_SIZE> reader(buf, len);
11030 json_structural_indexer indexer(parser.structural_indexes.get());
11031
11032 // Read all but the last block
11033 while (reader.has_full_block()) {
11034 indexer.step<STEP_SIZE>(reader.full_block(), reader);
11035 }
11036
11037 // Take care of the last block (will always be there unless file is empty)
11038 uint8_t block[STEP_SIZE];
11039 if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
11040 indexer.step<STEP_SIZE>(block, reader);
11041
11042 return indexer.finish(parser, reader.block_index(), len, partial);
11043 }
11044
11045 template<>
step(const uint8_t * block,buf_block_reader<128> & reader)11046 simdjson_really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
11047 simd::simd8x64<uint8_t> in_1(block);
11048 simd::simd8x64<uint8_t> in_2(block+64);
11049 json_block block_1 = scanner.next(in_1);
11050 json_block block_2 = scanner.next(in_2);
11051 this->next(in_1, block_1, reader.block_index());
11052 this->next(in_2, block_2, reader.block_index()+64);
11053 reader.advance();
11054 }
11055
11056 template<>
step(const uint8_t * block,buf_block_reader<64> & reader)11057 simdjson_really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
11058 simd::simd8x64<uint8_t> in_1(block);
11059 json_block block_1 = scanner.next(in_1);
11060 this->next(in_1, block_1, reader.block_index());
11061 reader.advance();
11062 }
11063
next(const simd::simd8x64<uint8_t> & in,const json_block & block,size_t idx)11064 simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
11065 uint64_t unescaped = in.lteq(0x1F);
11066 checker.check_next_input(in);
11067 indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
11068 prev_structurals = block.structural_start();
11069 unescaped_chars_error |= block.non_quote_inside_string(unescaped);
11070 }
11071
finish(dom_parser_implementation & parser,size_t idx,size_t len,bool partial)11072 simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
11073 // Write out the final iteration's structurals
11074 indexer.write(uint32_t(idx-64), prev_structurals);
11075
11076 error_code error = scanner.finish();
11077 // We deliberately break down the next expression so that it is
11078 // human readable.
11079 const bool should_we_exit = partial ?
11080 ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
11081 : (error != SUCCESS); // if partial is false, we must have SUCCESS
11082 const bool have_unclosed_string = (error == UNCLOSED_STRING);
11083 if (simdjson_unlikely(should_we_exit)) { return error; }
11084
11085 if (unescaped_chars_error) {
11086 return UNESCAPED_CHARS;
11087 }
11088
11089 parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
11090 /***
11091 * This is related to https://github.com/simdjson/simdjson/issues/906
11092 * Basically, we want to make sure that if the parsing continues beyond the last (valid)
11093 * structural character, it quickly stops.
11094 * Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
11095 * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
11096 * continues, then it must be [,] or }.
11097 * Suppose it is ] or }. We backtrack to the first character, what could it be that would
11098 * not trigger an error? It could be ] or } but no, because you can't start a document that way.
11099 * It can't be a comma, a colon or any simple value. So the only way we could continue is
11100 * if the repeated character is [. But if so, the document must start with [. But if the document
11101 * starts with [, it should end with ]. If we enforce that rule, then we would get
11102 * ][[ which is invalid.
11103 **/
11104 parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
11105 parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
11106 parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
11107 parser.next_structural_index = 0;
11108 // a valid JSON file cannot have zero structural indexes - we should have found something
11109 if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
11110 return EMPTY;
11111 }
11112 if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
11113 return UNEXPECTED_ERROR;
11114 }
11115 if (partial) {
11116 // If we have an unclosed string, then the last structural
11117 // will be the quote and we want to make sure to omit it.
11118 if(have_unclosed_string) {
11119 parser.n_structural_indexes--;
11120 // a valid JSON file cannot have zero structural indexes - we should have found something
11121 if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
11122 }
11123 auto new_structural_indexes = find_next_document_index(parser);
11124 if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
11125 return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
11126 }
11127 parser.n_structural_indexes = new_structural_indexes;
11128 }
11129 checker.check_eof();
11130 return checker.errors();
11131 }
11132
11133 } // namespace stage1
11134 } // unnamed namespace
11135 } // namespace westmere
11136 } // namespace simdjson
11137 /* end file src/generic/stage1/json_structural_indexer.h */
11138 /* begin file src/generic/stage1/utf8_validator.h */
11139 namespace simdjson {
11140 namespace westmere {
11141 namespace {
11142 namespace stage1 {
11143
11144 /**
11145 * Validates that the string is actual UTF-8.
11146 */
11147 template<class checker>
generic_validate_utf8(const uint8_t * input,size_t length)11148 bool generic_validate_utf8(const uint8_t * input, size_t length) {
11149 checker c{};
11150 buf_block_reader<64> reader(input, length);
11151 while (reader.has_full_block()) {
11152 simd::simd8x64<uint8_t> in(reader.full_block());
11153 c.check_next_input(in);
11154 reader.advance();
11155 }
11156 uint8_t block[64]{};
11157 reader.get_remainder(block);
11158 simd::simd8x64<uint8_t> in(block);
11159 c.check_next_input(in);
11160 reader.advance();
11161 c.check_eof();
11162 return c.errors() == error_code::SUCCESS;
11163 }
11164
generic_validate_utf8(const char * input,size_t length)11165 bool generic_validate_utf8(const char * input, size_t length) {
11166 return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
11167 }
11168
11169 } // namespace stage1
11170 } // unnamed namespace
11171 } // namespace westmere
11172 } // namespace simdjson
11173 /* end file src/generic/stage1/utf8_validator.h */
11174
11175 //
11176 // Stage 2
11177 //
11178 /* begin file src/generic/stage2/tape_builder.h */
11179 /* begin file src/generic/stage2/json_iterator.h */
11180 /* begin file src/generic/stage2/logger.h */
// This is for an internal-only stage 2 specific logger.
// Build with SIMDJSON_VERBOSE_LOGGING set to a nonzero value (which makes LOG_ENABLED true)
// to log what stage 2 is doing!
11183 namespace simdjson {
11184 namespace westmere {
11185 namespace {
11186 namespace logger {
11187
// Run of dashes; log_start() prints a prefix of it (via "%.*s") to draw the header separator.
static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";

// Logging is compiled in only when SIMDJSON_VERBOSE_LOGGING is nonzero.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr const bool LOG_ENABLED = true;
#else
static constexpr const bool LOG_ENABLED = false;
#endif
// Column widths of the log table.
static constexpr const int LOG_EVENT_LEN = 20;        // event title column
static constexpr const int LOG_BUFFER_LEN = 30;       // bytes shown from the current position
static constexpr const int LOG_SMALL_BUFFER_LEN = 10; // bytes shown from the next position
static constexpr const int LOG_INDEX_LEN = 5;         // structural index column

// Current nesting level; each level indents the event title by two spaces.
static int log_depth; // Not threadsafe. Log only.
11201
11202 // Helper to turn unprintable or newline characters into spaces
printable_char(char c)11203 static simdjson_really_inline char printable_char(char c) {
11204 if (c >= 0x20) {
11205 return c;
11206 } else {
11207 return ' ';
11208 }
11209 }
11210
11211 // Print the header and set up log_start
log_start()11212 static simdjson_really_inline void log_start() {
11213 if (LOG_ENABLED) {
11214 log_depth = 0;
11215 printf("\n");
11216 printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
11217 printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
11218 }
11219 }
11220
log_string(const char * message)11221 simdjson_unused static simdjson_really_inline void log_string(const char *message) {
11222 if (LOG_ENABLED) {
11223 printf("%s\n", message);
11224 }
11225 }
11226
11227 // Logs a single line from the stage 2 DOM parser
11228 template<typename S>
log_line(S & structurals,const char * title_prefix,const char * title,const char * detail)11229 static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
11230 if (LOG_ENABLED) {
11231 printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
11232 auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
11233 auto next_index = structurals.next_structural;
11234 auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
11235 auto next = &structurals.buf[*next_index];
11236 {
11237 // Print the next N characters in the buffer.
11238 printf("| ");
11239 // Otherwise, print the characters starting from the buffer position.
11240 // Print spaces for unprintable or newline characters.
11241 for (int i=0;i<LOG_BUFFER_LEN;i++) {
11242 printf("%c", printable_char(current[i]));
11243 }
11244 printf(" ");
11245 // Print the next N characters in the buffer.
11246 printf("| ");
11247 // Otherwise, print the characters starting from the buffer position.
11248 // Print spaces for unprintable or newline characters.
11249 for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
11250 printf("%c", printable_char(next[i]));
11251 }
11252 printf(" ");
11253 }
11254 if (current_index) {
11255 printf("| %*u ", LOG_INDEX_LEN, *current_index);
11256 } else {
11257 printf("| %-*s ", LOG_INDEX_LEN, "");
11258 }
11259 // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
11260 printf("| %-s ", detail);
11261 printf("|\n");
11262 }
11263 }
11264
11265 } // namespace logger
11266 } // unnamed namespace
11267 } // namespace westmere
11268 } // namespace simdjson
11269 /* end file src/generic/stage2/logger.h */
11270
11271 namespace simdjson {
11272 namespace westmere {
11273 namespace {
11274 namespace stage2 {
11275
/**
 * Walks the structural indexes produced by stage 1 and reports what it finds to a visitor.
 */
class json_iterator {
public:
  /** The JSON input buffer. */
  const uint8_t* const buf;
  /** Points at the structural index of the next token to be read. */
  uint32_t *next_structural;
  dom_parser_implementation &dom_parser;
  /** Current container nesting depth (0 at the document root). */
  uint32_t depth{0};

  /**
   * Walk the JSON document.
   *
   * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
   * the first parameter; some callbacks have other parameters as well:
   *
   * - visit_document_start() - at the beginning.
   * - visit_document_end() - at the end (if things were successful).
   *
   * - visit_array_start() - at the start `[` of a non-empty array.
   * - visit_array_end() - at the end `]` of a non-empty array.
   * - visit_empty_array() - when an empty array is encountered.
   *
   * - visit_object_start() - at the start `{` of a non-empty object.
   * - visit_object_end() - at the end `}` of a non-empty object.
   * - visit_empty_object() - when an empty object is encountered.
   * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
   *                                   guaranteed to point at the first quote of the string (`"key"`).
   * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
   * - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
   *
   * - increment_count(iter) - each time a value is found in an array or object.
   */
  template<bool STREAMING, typename V>
  simdjson_warn_unused simdjson_really_inline error_code walk_document(V &visitor) noexcept;

  /**
   * Create an iterator capable of walking a JSON document.
   *
   * The document must have already passed through stage 1.
   */
  simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);

  /**
   * Look at the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *peek() const noexcept;
  /**
   * Advance to the next token.
   *
   * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
   *
   * They may include invalid JSON as well (such as `1.2.3` or `ture`).
   */
  simdjson_really_inline const uint8_t *advance() noexcept;
  /**
   * Get the remaining length of the document, from the start of the current token.
   */
  simdjson_really_inline size_t remaining_len() const noexcept;
  /**
   * Check if we are at the end of the document.
   *
   * If this is true, there are no more tokens.
   */
  simdjson_really_inline bool at_eof() const noexcept;
  /**
   * Check if we are at the beginning of the document.
   */
  simdjson_really_inline bool at_beginning() const noexcept;
  /**
   * Get the character at the last structural index of the document.
   */
  simdjson_really_inline uint8_t last_structural() const noexcept;

  /**
   * Log that a value has been found.
   *
   * Logging is compiled in when logger::LOG_ENABLED is true (see the stage 2 logger).
   */
  simdjson_really_inline void log_value(const char *type) const noexcept;
  /**
   * Log the start of a multipart value.
   *
   * Logging is compiled in when logger::LOG_ENABLED is true (see the stage 2 logger).
   */
  simdjson_really_inline void log_start_value(const char *type) const noexcept;
  /**
   * Log the end of a multipart value.
   *
   * Logging is compiled in when logger::LOG_ENABLED is true (see the stage 2 logger).
   */
  simdjson_really_inline void log_end_value(const char *type) const noexcept;
  /**
   * Log an error.
   *
   * Logging is compiled in when logger::LOG_ENABLED is true (see the stage 2 logger).
   */
  simdjson_really_inline void log_error(const char *error) const noexcept;

  /**
   * Dispatch a top-level scalar to the matching visit_root_* callback based on its first byte.
   */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
  /**
   * Dispatch a scalar inside an array or object to the matching visit_* callback based on
   * its first byte.
   */
  template<typename V>
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
};
11378
11379 template<bool STREAMING, typename V>
walk_document(V & visitor)11380 simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
11381 logger::log_start();
11382
11383 //
11384 // Start the document
11385 //
11386 if (at_eof()) { return EMPTY; }
11387 log_start_value("document");
11388 SIMDJSON_TRY( visitor.visit_document_start(*this) );
11389
11390 //
11391 // Read first value
11392 //
11393 {
11394 auto value = advance();
11395
11396 // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
11397 // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
11398 if (!STREAMING) {
11399 switch (*value) {
11400 case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
11401 case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
11402 }
11403 }
11404
11405 switch (*value) {
11406 case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
11407 case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
11408 default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
11409 }
11410 }
11411 goto document_end;
11412
11413 //
11414 // Object parser states
11415 //
11416 object_begin:
11417 log_start_value("object");
11418 depth++;
11419 if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
11420 dom_parser.is_array[depth] = false;
11421 SIMDJSON_TRY( visitor.visit_object_start(*this) );
11422
11423 {
11424 auto key = advance();
11425 if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
11426 SIMDJSON_TRY( visitor.increment_count(*this) );
11427 SIMDJSON_TRY( visitor.visit_key(*this, key) );
11428 }
11429
11430 object_field:
11431 if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
11432 {
11433 auto value = advance();
11434 switch (*value) {
11435 case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
11436 case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
11437 default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
11438 }
11439 }
11440
11441 object_continue:
11442 switch (*advance()) {
11443 case ',':
11444 SIMDJSON_TRY( visitor.increment_count(*this) );
11445 {
11446 auto key = advance();
11447 if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
11448 SIMDJSON_TRY( visitor.visit_key(*this, key) );
11449 }
11450 goto object_field;
11451 case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
11452 default: log_error("No comma between object fields"); return TAPE_ERROR;
11453 }
11454
11455 scope_end:
11456 depth--;
11457 if (depth == 0) { goto document_end; }
11458 if (dom_parser.is_array[depth]) { goto array_continue; }
11459 goto object_continue;
11460
11461 //
11462 // Array parser states
11463 //
11464 array_begin:
11465 log_start_value("array");
11466 depth++;
11467 if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
11468 dom_parser.is_array[depth] = true;
11469 SIMDJSON_TRY( visitor.visit_array_start(*this) );
11470 SIMDJSON_TRY( visitor.increment_count(*this) );
11471
11472 array_value:
11473 {
11474 auto value = advance();
11475 switch (*value) {
11476 case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
11477 case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
11478 default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
11479 }
11480 }
11481
11482 array_continue:
11483 switch (*advance()) {
11484 case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
11485 case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
11486 default: log_error("Missing comma between array values"); return TAPE_ERROR;
11487 }
11488
11489 document_end:
11490 log_end_value("document");
11491 SIMDJSON_TRY( visitor.visit_document_end(*this) );
11492
11493 dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
11494
11495 // If we didn't make it to the end, it's an error
11496 if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
11497 log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
11498 return TAPE_ERROR;
11499 }
11500
11501 return SUCCESS;
11502
11503 } // walk_document()
11504
json_iterator(dom_parser_implementation & _dom_parser,size_t start_structural_index)11505 simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
11506 : buf{_dom_parser.buf},
11507 next_structural{&_dom_parser.structural_indexes[start_structural_index]},
11508 dom_parser{_dom_parser} {
11509 }
11510
peek() const11511 simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
11512 return &buf[*(next_structural)];
11513 }
advance()11514 simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
11515 return &buf[*(next_structural++)];
11516 }
remaining_len() const11517 simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
11518 return dom_parser.len - *(next_structural-1);
11519 }
11520
at_eof() const11521 simdjson_really_inline bool json_iterator::at_eof() const noexcept {
11522 return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
11523 }
at_beginning() const11524 simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
11525 return next_structural == dom_parser.structural_indexes.get();
11526 }
last_structural() const11527 simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
11528 return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
11529 }
11530
log_value(const char * type) const11531 simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
11532 logger::log_line(*this, "", type, "");
11533 }
11534
log_start_value(const char * type) const11535 simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
11536 logger::log_line(*this, "+", type, "");
11537 if (logger::LOG_ENABLED) { logger::log_depth++; }
11538 }
11539
log_end_value(const char * type) const11540 simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
11541 if (logger::LOG_ENABLED) { logger::log_depth--; }
11542 logger::log_line(*this, "-", type, "");
11543 }
11544
log_error(const char * error) const11545 simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
11546 logger::log_line(*this, "", "ERROR", error);
11547 }
11548
11549 template<typename V>
visit_root_primitive(V & visitor,const uint8_t * value)11550 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
11551 switch (*value) {
11552 case '"': return visitor.visit_root_string(*this, value);
11553 case 't': return visitor.visit_root_true_atom(*this, value);
11554 case 'f': return visitor.visit_root_false_atom(*this, value);
11555 case 'n': return visitor.visit_root_null_atom(*this, value);
11556 case '-':
11557 case '0': case '1': case '2': case '3': case '4':
11558 case '5': case '6': case '7': case '8': case '9':
11559 return visitor.visit_root_number(*this, value);
11560 default:
11561 log_error("Document starts with a non-value character");
11562 return TAPE_ERROR;
11563 }
11564 }
11565 template<typename V>
visit_primitive(V & visitor,const uint8_t * value)11566 simdjson_warn_unused simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
11567 switch (*value) {
11568 case '"': return visitor.visit_string(*this, value);
11569 case 't': return visitor.visit_true_atom(*this, value);
11570 case 'f': return visitor.visit_false_atom(*this, value);
11571 case 'n': return visitor.visit_null_atom(*this, value);
11572 case '-':
11573 case '0': case '1': case '2': case '3': case '4':
11574 case '5': case '6': case '7': case '8': case '9':
11575 return visitor.visit_number(*this, value);
11576 default:
11577 log_error("Non-value found when value was expected!");
11578 return TAPE_ERROR;
11579 }
11580 }
11581
11582 } // namespace stage2
11583 } // unnamed namespace
11584 } // namespace westmere
11585 } // namespace simdjson
11586 /* end file src/generic/stage2/json_iterator.h */
11587 /* begin file src/generic/stage2/tape_writer.h */
11588 namespace simdjson {
11589 namespace westmere {
11590 namespace {
11591 namespace stage2 {
11592
/**
 * Sequential writer over the 64-bit tape: every append stores at next_tape_loc and moves
 * it forward.
 */
struct tape_writer {
  /** The next place to write to tape */
  uint64_t *next_tape_loc;

  /** Write a signed 64-bit value to tape. */
  simdjson_really_inline void append_s64(int64_t value) noexcept;

  /** Write an unsigned 64-bit value to tape. */
  simdjson_really_inline void append_u64(uint64_t value) noexcept;

  /** Write a double value to tape. */
  simdjson_really_inline void append_double(double value) noexcept;

  /**
   * Append a tape entry (an 8-bit type, and 56 bits worth of value).
   */
  simdjson_really_inline void append(uint64_t val, internal::tape_type t) noexcept;

  /**
   * Skip the current tape entry without writing.
   *
   * Used to skip the start of the container, since we'll come back later to fill it in when the
   * container ends.
   */
  simdjson_really_inline void skip() noexcept;

  /**
   * Skip the number of tape entries necessary to write a large u64 or i64.
   */
  simdjson_really_inline void skip_large_integer() noexcept;

  /**
   * Skip the number of tape entries necessary to write a double.
   */
  simdjson_really_inline void skip_double() noexcept;

  /**
   * Write a value to a known location on tape.
   *
   * Used to go back and write out the start of a container after the container ends.
   */
  simdjson_really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;

private:
  /**
   * Append both the tape entry, and a supplementary value following it. Used for types that need
   * all 64 bits, such as double and uint64_t.
   */
  template<typename T>
  simdjson_really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
}; // struct tape_writer
11644
append_s64(int64_t value)11645 simdjson_really_inline void tape_writer::append_s64(int64_t value) noexcept {
11646 append2(0, value, internal::tape_type::INT64);
11647 }
11648
append_u64(uint64_t value)11649 simdjson_really_inline void tape_writer::append_u64(uint64_t value) noexcept {
11650 append(0, internal::tape_type::UINT64);
11651 *next_tape_loc = value;
11652 next_tape_loc++;
11653 }
11654
11655 /** Write a double value to tape. */
append_double(double value)11656 simdjson_really_inline void tape_writer::append_double(double value) noexcept {
11657 append2(0, value, internal::tape_type::DOUBLE);
11658 }
11659
skip()11660 simdjson_really_inline void tape_writer::skip() noexcept {
11661 next_tape_loc++;
11662 }
11663
skip_large_integer()11664 simdjson_really_inline void tape_writer::skip_large_integer() noexcept {
11665 next_tape_loc += 2;
11666 }
11667
skip_double()11668 simdjson_really_inline void tape_writer::skip_double() noexcept {
11669 next_tape_loc += 2;
11670 }
11671
append(uint64_t val,internal::tape_type t)11672 simdjson_really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
11673 *next_tape_loc = val | ((uint64_t(char(t))) << 56);
11674 next_tape_loc++;
11675 }
11676
11677 template<typename T>
append2(uint64_t val,T val2,internal::tape_type t)11678 simdjson_really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
11679 append(val, t);
11680 static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
11681 memcpy(next_tape_loc, &val2, sizeof(val2));
11682 next_tape_loc++;
11683 }
11684
write(uint64_t & tape_loc,uint64_t val,internal::tape_type t)11685 simdjson_really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
11686 tape_loc = val | ((uint64_t(char(t))) << 56);
11687 }
11688
11689 } // namespace stage2
11690 } // unnamed namespace
11691 } // namespace westmere
11692 } // namespace simdjson
11693 /* end file src/generic/stage2/tape_writer.h */
11694
11695 namespace simdjson {
11696 namespace westmere {
11697 namespace {
11698 namespace stage2 {
11699
/**
 * Visitor used with json_iterator::walk_document(); declares the callbacks the iterator
 * invokes plus the tape and string-buffer write positions.
 */
struct tape_builder {
  /** Parse a document from dom_parser's structural indexes into doc. */
  template<bool STREAMING>
  simdjson_warn_unused static simdjson_really_inline error_code parse_document(
    dom_parser_implementation &dom_parser,
    dom::document &doc) noexcept;

  /** Called when a non-empty document starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
  /** Called when a non-empty document ends without error. */
  simdjson_warn_unused simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;

  /** Called when a non-empty array starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
  /** Called when a non-empty array ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
  /** Called when an empty array is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;

  /** Called when a non-empty object starts. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
  /**
   * Called when a key in a field is encountered.
   *
   * visit_primitive, visit_object_start, visit_empty_object, visit_array_start, or
   * visit_empty_array will be called after this with the field value.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
  /** Called when a non-empty object ends. */
  simdjson_warn_unused simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
  /** Called when an empty object is found. */
  simdjson_warn_unused simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;

  /**
   * Called when a string, number, boolean or null is found.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
  /**
   * Called when a string, number, boolean or null is found at the top level of a document (i.e.
   * when there is no array or object and the entire document is a single string, number, boolean or
   * null).
   *
   * This is separate from visit_primitive() because simdjson's normal primitive parsing routines
   * assume there is at least one more token after the value, which is only true in an array or
   * object.
   */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;

  /** Per-type callbacks for values found inside an array or object. */
  simdjson_warn_unused simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /** Per-type callbacks for a value that is the whole document. */
  simdjson_warn_unused simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;

  /** Called each time a new field or element in an array or object is found. */
  simdjson_warn_unused simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;

  /** Next location to write to tape */
  tape_writer tape;
private:
  /** Next write location in the string buf for stage 2 parsing */
  uint8_t *current_string_buf_loc;

  /** Private: builders are created by parse_document(). */
  simdjson_really_inline tape_builder(dom::document &doc) noexcept;

  simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
  simdjson_really_inline void start_container(json_iterator &iter) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  simdjson_warn_unused simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
  simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
  simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
}; // struct tape_builder
11776
11777 template<bool STREAMING>
parse_document(dom_parser_implementation & dom_parser,dom::document & doc)11778 simdjson_warn_unused simdjson_really_inline error_code tape_builder::parse_document(
11779 dom_parser_implementation &dom_parser,
11780 dom::document &doc) noexcept {
11781 dom_parser.doc = &doc;
11782 json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
11783 tape_builder builder(doc);
11784 return iter.walk_document<STREAMING>(builder);
11785 }
11786
visit_root_primitive(json_iterator & iter,const uint8_t * value)11787 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
11788 return iter.visit_root_primitive(*this, value);
11789 }
visit_primitive(json_iterator & iter,const uint8_t * value)11790 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
11791 return iter.visit_primitive(*this, value);
11792 }
visit_empty_object(json_iterator & iter)11793 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
11794 return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
11795 }
visit_empty_array(json_iterator & iter)11796 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
11797 return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
11798 }
11799
visit_document_start(json_iterator & iter)11800 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
11801 start_container(iter);
11802 return SUCCESS;
11803 }
visit_object_start(json_iterator & iter)11804 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
11805 start_container(iter);
11806 return SUCCESS;
11807 }
visit_array_start(json_iterator & iter)11808 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
11809 start_container(iter);
11810 return SUCCESS;
11811 }
11812
visit_object_end(json_iterator & iter)11813 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
11814 return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
11815 }
visit_array_end(json_iterator & iter)11816 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
11817 return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
11818 }
visit_document_end(json_iterator & iter)11819 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
11820 constexpr uint32_t start_tape_index = 0;
11821 tape.append(start_tape_index, internal::tape_type::ROOT);
11822 tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
11823 return SUCCESS;
11824 }
visit_key(json_iterator & iter,const uint8_t * key)11825 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
11826 return visit_string(iter, key, true);
11827 }
11828
increment_count(json_iterator & iter)11829 simdjson_warn_unused simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
11830 iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
11831 return SUCCESS;
11832 }
11833
tape_builder(dom::document & doc)11834 simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
11835
visit_string(json_iterator & iter,const uint8_t * value,bool key)11836 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
11837 iter.log_value(key ? "key" : "string");
11838 uint8_t *dst = on_start_string(iter);
11839 dst = stringparsing::parse_string(value+1, dst);
11840 if (dst == nullptr) {
11841 iter.log_error("Invalid escape in string");
11842 return STRING_ERROR;
11843 }
11844 on_end_string(dst);
11845 return SUCCESS;
11846 }
11847
visit_root_string(json_iterator & iter,const uint8_t * value)11848 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
11849 return visit_string(iter, value);
11850 }
11851
visit_number(json_iterator & iter,const uint8_t * value)11852 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
11853 iter.log_value("number");
11854 return numberparsing::parse_number(value, tape);
11855 }
11856
visit_root_number(json_iterator & iter,const uint8_t * value)11857 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
11858 //
11859 // We need to make a copy to make sure that the string is space terminated.
11860 // This is not about padding the input, which should already padded up
11861 // to len + SIMDJSON_PADDING. However, we have no control at this stage
11862 // on how the padding was done. What if the input string was padded with nulls?
11863 // It is quite common for an input string to have an extra null character (C string).
11864 // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
11865 // document, but the string "9\0" by itself is fine. So we make a copy and
11866 // pad the input with spaces when we know that there is just one input element.
11867 // This copy is relatively expensive, but it will almost never be called in
11868 // practice unless you are in the strange scenario where you have many JSON
11869 // documents made of single atoms.
11870 //
11871 std::unique_ptr<uint8_t[]>copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
11872 if (copy.get() == nullptr) { return MEMALLOC; }
11873 std::memcpy(copy.get(), value, iter.remaining_len());
11874 std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
11875 error_code error = visit_number(iter, copy.get());
11876 return error;
11877 }
11878
visit_true_atom(json_iterator & iter,const uint8_t * value)11879 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
11880 iter.log_value("true");
11881 if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
11882 tape.append(0, internal::tape_type::TRUE_VALUE);
11883 return SUCCESS;
11884 }
11885
visit_root_true_atom(json_iterator & iter,const uint8_t * value)11886 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
11887 iter.log_value("true");
11888 if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
11889 tape.append(0, internal::tape_type::TRUE_VALUE);
11890 return SUCCESS;
11891 }
11892
visit_false_atom(json_iterator & iter,const uint8_t * value)11893 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
11894 iter.log_value("false");
11895 if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
11896 tape.append(0, internal::tape_type::FALSE_VALUE);
11897 return SUCCESS;
11898 }
11899
visit_root_false_atom(json_iterator & iter,const uint8_t * value)11900 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
11901 iter.log_value("false");
11902 if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
11903 tape.append(0, internal::tape_type::FALSE_VALUE);
11904 return SUCCESS;
11905 }
11906
visit_null_atom(json_iterator & iter,const uint8_t * value)11907 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
11908 iter.log_value("null");
11909 if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
11910 tape.append(0, internal::tape_type::NULL_VALUE);
11911 return SUCCESS;
11912 }
11913
visit_root_null_atom(json_iterator & iter,const uint8_t * value)11914 simdjson_warn_unused simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
11915 iter.log_value("null");
11916 if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
11917 tape.append(0, internal::tape_type::NULL_VALUE);
11918 return SUCCESS;
11919 }
11920
11921 // private:
11922
next_tape_index(json_iterator & iter) const11923 simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
11924 return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
11925 }
11926
empty_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)11927 simdjson_warn_unused simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
11928 auto start_index = next_tape_index(iter);
11929 tape.append(start_index+2, start);
11930 tape.append(start_index, end);
11931 return SUCCESS;
11932 }
11933
start_container(json_iterator & iter)11934 simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
11935 iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
11936 iter.dom_parser.open_containers[iter.depth].count = 0;
11937 tape.skip(); // We don't actually *write* the start element until the end.
11938 }
11939
end_container(json_iterator & iter,internal::tape_type start,internal::tape_type end)11940 simdjson_warn_unused simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
11941 // Write the ending tape element, pointing at the start location
11942 const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
11943 tape.append(start_tape_index, end);
11944 // Write the start tape element, pointing at the end location (and including count)
11945 // count can overflow if it exceeds 24 bits... so we saturate
11946 // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
11947 const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
11948 const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
11949 tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
11950 return SUCCESS;
11951 }
11952
on_start_string(json_iterator & iter)11953 simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
11954 // we advance the point, accounting for the fact that we have a NULL termination
11955 tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
11956 return current_string_buf_loc + sizeof(uint32_t);
11957 }
11958
on_end_string(uint8_t * dst)11959 simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
11960 uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
11961 // TODO check for overflow in case someone has a crazy string (>=4GB?)
11962 // But only add the overflow check when the document itself exceeds 4GB
11963 // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
11964 memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
11965 // NULL termination is still handy if you expect all your strings to
11966 // be NULL terminated? It comes at a small cost
11967 *dst = 0;
11968 current_string_buf_loc = dst + 1;
11969 }
11970
11971 } // namespace stage2
11972 } // unnamed namespace
11973 } // namespace westmere
11974 } // namespace simdjson
11975 /* end file src/generic/stage2/tape_builder.h */
11976
11977 //
11978 // Implementation-specific overrides
11979 //
11980
11981 namespace simdjson {
11982 namespace westmere {
11983 namespace {
11984 namespace stage1 {
11985
find_escaped(uint64_t backslash)11986 simdjson_really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
11987 if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
11988 return find_escaped_branchless(backslash);
11989 }
11990
11991 } // namespace stage1
11992 } // unnamed namespace
11993
minify(const uint8_t * buf,size_t len,uint8_t * dst,size_t & dst_len) const11994 simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
11995 return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
11996 }
11997
stage1(const uint8_t * _buf,size_t _len,bool streaming)11998 simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
11999 this->buf = _buf;
12000 this->len = _len;
12001 return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
12002 }
12003
validate_utf8(const char * buf,size_t len) const12004 simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
12005 return westmere::stage1::generic_validate_utf8(buf,len);
12006 }
12007
stage2(dom::document & _doc)12008 simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
12009 return stage2::tape_builder::parse_document<false>(*this, _doc);
12010 }
12011
stage2_next(dom::document & _doc)12012 simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
12013 return stage2::tape_builder::parse_document<true>(*this, _doc);
12014 }
12015
parse(const uint8_t * _buf,size_t _len,dom::document & _doc)12016 simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
12017 auto error = stage1(_buf, _len, false);
12018 if (error) { return error; }
12019 return stage2(_doc);
12020 }
12021
12022 } // namespace westmere
12023 } // namespace simdjson
12024
12025 /* begin file include/simdjson/westmere/end.h */
12026 SIMDJSON_UNTARGET_WESTMERE
12027 /* end file include/simdjson/westmere/end.h */
12028 /* end file src/westmere/dom_parser_implementation.cpp */
12029 #endif
12030
12031 SIMDJSON_POP_DISABLE_WARNINGS
12032 /* end file src/simdjson.cpp */
12033