1 #include "simdjson/internal/numberparsing_tables.h"
2 #include <limits>
3 
4 namespace simdjson {
5 namespace SIMDJSON_IMPLEMENTATION {
6 namespace {
7 /// @private
8 namespace numberparsing {
9 
10 
11 
12 #ifdef JSON_TEST_NUMBERS
13 #define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
14 #define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
15 #define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
16 #define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
17 #else
18 #define INVALID_NUMBER(SRC) (NUMBER_ERROR)
19 #define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
20 #define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
21 #define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
22 #endif
23 
24 namespace {
25 // Convert a mantissa, an exponent and a sign bit into an ieee64 double.
26 // The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
27 // The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed.
to_double(uint64_t mantissa,uint64_t real_exponent,bool negative)28 simdjson_really_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
29     double d;
30     mantissa &= ~(1ULL << 52);
31     mantissa |= real_exponent << 52;
32     mantissa |= ((static_cast<uint64_t>(negative)) << 63);
33     std::memcpy(&d, &mantissa, sizeof(d));
34     return d;
35 }
36 }
37 // Attempts to compute i * 10^(power) exactly; and if "negative" is
38 // true, negate the result.
39 // This function will only work in some cases, when it does not work, success is
40 // set to false. This should work *most of the time* (like 99% of the time).
41 // We assume that power is in the [smallest_power,
42 // largest_power] interval: the caller is responsible for this check.
compute_float_64(int64_t power,uint64_t i,bool negative,double & d)43 simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
44   // we start with a fast path
45   // It was described in
46   // Clinger WD. How to read floating point numbers accurately.
47   // ACM SIGPLAN Notices. 1990
48 #ifndef FLT_EVAL_METHOD
49 #error "FLT_EVAL_METHOD should be defined, please include cfloat."
50 #endif
51 #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
52   // We cannot be certain that x/y is rounded to nearest.
53   if (0 <= power && power <= 22 && i <= 9007199254740991) {
54 #else
55   if (-22 <= power && power <= 22 && i <= 9007199254740991) {
56 #endif
57     // convert the integer into a double. This is lossless since
58     // 0 <= i <= 2^53 - 1.
59     d = double(i);
60     //
61     // The general idea is as follows.
62     // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
63     // 1) Both s and p can be represented exactly as 64-bit floating-point
64     // values
65     // (binary64).
66     // 2) Because s and p can be represented exactly as floating-point values,
67     // then s * p
68     // and s / p will produce correctly rounded values.
69     //
70     if (power < 0) {
71       d = d / simdjson::internal::power_of_ten[-power];
72     } else {
73       d = d * simdjson::internal::power_of_ten[power];
74     }
75     if (negative) {
76       d = -d;
77     }
78     return true;
79   }
80   // When 22 < power && power <  22 + 16, we could
81   // hope for another, secondary fast path.  It was
82   // described by David M. Gay in  "Correctly rounded
83   // binary-decimal and decimal-binary conversions." (1990)
84   // If you need to compute i * 10^(22 + x) for x < 16,
85   // first compute i * 10^x, if you know that result is exact
86   // (e.g., when i * 10^x < 2^53),
87   // then you can still proceed and do (i * 10^x) * 10^22.
88   // Is this worth your time?
89   // You need  22 < power *and* power <  22 + 16 *and* (i * 10^(x-22) < 2^53)
90   // for this second fast path to work.
91   // If you you have 22 < power *and* power <  22 + 16, and then you
92   // optimistically compute "i * 10^(x-22)", there is still a chance that you
93   // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
94   // this optimization maybe less common than we would like. Source:
95   // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
96   // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
97 
98   // The fast path has now failed, so we are failing back on the slower path.
99 
100   // In the slow path, we need to adjust i so that it is > 1<<63 which is always
101   // possible, except if i == 0, so we handle i == 0 separately.
102   if(i == 0) {
103     d = 0.0;
104     return true;
105   }
106 
107 
108   // The exponent is 1024 + 63 + power
109   //     + floor(log(5**power)/log(2)).
110   // The 1024 comes from the ieee64 standard.
111   // The 63 comes from the fact that we use a 64-bit word.
112   //
113   // Computing floor(log(5**power)/log(2)) could be
114   // slow. Instead we use a fast function.
115   //
116   // For power in (-400,350), we have that
117   // (((152170 + 65536) * power ) >> 16);
118   // is equal to
119   //  floor(log(5**power)/log(2)) + power when power >= 0
120   // and it is equal to
121   //  ceil(log(5**-power)/log(2)) + power when power < 0
122   //
123   // The 65536 is (1<<16) and corresponds to
124   // (65536 * power) >> 16 ---> power
125   //
126   // ((152170 * power ) >> 16) is equal to
127   // floor(log(5**power)/log(2))
128   //
129   // Note that this is not magic: 152170/(1<<16) is
130   // approximatively equal to log(5)/log(2).
131   // The 1<<16 value is a power of two; we could use a
132   // larger power of 2 if we wanted to.
133   //
134   int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
135 
136 
137   // We want the most significant bit of i to be 1. Shift if needed.
138   int lz = leading_zeroes(i);
139   i <<= lz;
140 
141 
142   // We are going to need to do some 64-bit arithmetic to get a precise product.
143   // We use a table lookup approach.
144   // It is safe because
145   // power >= smallest_power
146   // and power <= largest_power
147   // We recover the mantissa of the power, it has a leading 1. It is always
148   // rounded down.
149   //
150   // We want the most significant 64 bits of the product. We know
151   // this will be non-zero because the most significant bit of i is
152   // 1.
153   const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
154   // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
155   //
156   // The full_multiplication function computes the 128-bit product of two 64-bit words
157   // with a returned value of type value128 with a "low component" corresponding to the
158   // 64-bit least significant bits of the product and with a "high component" corresponding
159   // to the 64-bit most significant bits of the product.
160   simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index]);
161   // Both i and power_of_five_128[index] have their most significant bit set to 1 which
162   // implies that the either the most or the second most significant bit of the product
163   // is 1. We pack values in this manner for efficiency reasons: it maximizes the use
164   // we make of the product. It also makes it easy to reason aboutthe product: there
165   // 0 or 1 leading zero in the product.
166 
167   // Unless the least significant 9 bits of the high (64-bit) part of the full
168   // product are all 1s, then we know that the most significant 55 bits are
169   // exact and no further work is needed. Having 55 bits is necessary because
170   // we need 53 bits for the mantissa but we have to have one rounding bit and
171   // we can waste a bit if the most significant bit of the product is zero.
172   if((firstproduct.high & 0x1FF) == 0x1FF) {
173     // We want to compute i * 5^q, but only care about the top 55 bits at most.
174     // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
175     // the full computation is wasteful. So we do what is called a "truncated
176     // multiplication".
177     // We take the most significant 64-bits, and we put them in
178     // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
179     // to the desired approximation using one multiplication. Sometimes it does not suffice.
180     // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
181     // then we get a better approximation to i * 5^q. In very rare cases, even that
182     // will not suffice, though it is seemingly very hard to find such a scenario.
183     //
184     // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
185     // more complicated.
186     //
187     // There is an extra layer of complexity in that we need more than 55 bits of
188     // accuracy in the round-to-even scenario.
189     //
190     // The full_multiplication function computes the 128-bit product of two 64-bit words
191     // with a returned value of type value128 with a "low component" corresponding to the
192     // 64-bit least significant bits of the product and with a "high component" corresponding
193     // to the 64-bit most significant bits of the product.
194     simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
195     firstproduct.low += secondproduct.high;
196     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
197     // At this point, we might need to add at most one to firstproduct, but this
198     // can only change the value of firstproduct.high if firstproduct.low is maximal.
199     if(simdjson_unlikely(firstproduct.low  == 0xFFFFFFFFFFFFFFFF)) {
200       // This is very unlikely, but if so, we need to do much more work!
201       return false;
202     }
203   }
204   uint64_t lower = firstproduct.low;
205   uint64_t upper = firstproduct.high;
206   // The final mantissa should be 53 bits with a leading 1.
207   // We shift it so that it occupies 54 bits with a leading 1.
208   ///////
209   uint64_t upperbit = upper >> 63;
210   uint64_t mantissa = upper >> (upperbit + 9);
211   lz += int(1 ^ upperbit);
212 
213   // Here we have mantissa < (1<<54).
214   int64_t real_exponent = exponent - lz;
215   if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
216     // Here have that real_exponent <= 0 so -real_exponent >= 0
217     if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
218       d = 0.0;
219       return true;
220     }
221     // next line is safe because -real_exponent + 1 < 0
222     mantissa >>= -real_exponent + 1;
223     // Thankfully, we can't have both "round-to-even" and subnormals because
224     // "round-to-even" only occurs for powers close to 0.
225     mantissa += (mantissa & 1); // round up
226     mantissa >>= 1;
227     // There is a weird scenario where we don't have a subnormal but just.
228     // Suppose we start with 2.2250738585072013e-308, we end up
229     // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
230     // whereas 0x40000000000000 x 2^-1023-53  is normal. Now, we need to round
231     // up 0x3fffffffffffff x 2^-1023-53  and once we do, we are no longer
232     // subnormal, but we can only know this after rounding.
233     // So we only declare a subnormal if we are smaller than the threshold.
234     real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
235     d = to_double(mantissa, real_exponent, negative);
236     return true;
237   }
238   // We have to round to even. The "to even" part
239   // is only a problem when we are right in between two floats
240   // which we guard against.
241   // If we have lots of trailing zeros, we may fall right between two
242   // floating-point values.
243   //
244   // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
245   // times a power of two. That is, it is right between a number with binary significand
246   // m and another number with binary significand m+1; and it must be the case
247   // that it cannot be represented by a float itself.
248   //
249   // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
250   // Recall that 10^q = 5^q * 2^q.
251   // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that
252   //  5^23 <=  2^54 and it is the last power of five to qualify, so q <= 23.
253   // When q<0, we have  w  >=  (2m+1) x 5^{-q}.  We must have that w<2^{64} so
254   // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
255   // 2^{53} x 5^{-q} < 2^{64}.
256   // Hence we have 5^{-q} < 2^{11}$ or q>= -4.
257   //
258   // We require lower <= 1 and not lower == 0 because we could not prove that
259   // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
260   if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
261     if((mantissa  << (upperbit + 64 - 53 - 2)) ==  upper) {
262       mantissa &= ~1;             // flip it so that we do not round up
263     }
264   }
265 
266   mantissa += mantissa & 1;
267   mantissa >>= 1;
268 
269   // Here we have mantissa < (1<<53), unless there was an overflow
270   if (mantissa >= (1ULL << 53)) {
271     //////////
272     // This will happen when parsing values such as 7.2057594037927933e+16
273     ////////
274     mantissa = (1ULL << 52);
275     real_exponent++;
276   }
277   mantissa &= ~(1ULL << 52);
278   // we have to check that real_exponent is in range, otherwise we bail out
279   if (simdjson_unlikely(real_exponent > 2046)) {
280     // We have an infinte value!!! We could actually throw an error here if we could.
281     return false;
282   }
283   d = to_double(mantissa, real_exponent, negative);
284   return true;
285 }
286 
287 // We call a fallback floating-point parser that might be slow. Note
288 // it will accept JSON numbers, but the JSON spec. is more restrictive so
289 // before you call parse_float_fallback, you need to have validated the input
290 // string with the JSON grammar.
291 // It will return an error (false) if the parsed number is infinite.
292 // The string parsing itself always succeeds. We know that there is at least
293 // one digit.
294 static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
295   *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
296   // We do not accept infinite values.
297 
298   // Detecting finite values in a portable manner is ridiculously hard, ideally
299   // we would want to do:
300   // return !std::isfinite(*outDouble);
301   // but that mysteriously fails under legacy/old libc++ libraries, see
302   // https://github.com/simdjson/simdjson/issues/1286
303   //
304   // Therefore, fall back to this solution (the extra parens are there
305   // to handle that max may be a macro on windows).
306   return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
307 }
308 
309 // check quickly whether the next 8 chars are made of digits
310 // at a glance, it looks better than Mula's
311 // http://0x80.pl/articles/swar-digits-validate.html
312 simdjson_really_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
313   uint64_t val;
314   // this can read up to 7 bytes beyond the buffer size, but we require
315   // SIMDJSON_PADDING of padding
316   static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
317   std::memcpy(&val, chars, 8);
318   // a branchy method might be faster:
319   // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
320   //  && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
321   //  0x3030303030303030);
322   return (((val & 0xF0F0F0F0F0F0F0F0) |
323            (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
324           0x3333333333333333);
325 }
326 
327 template<typename W>
328 error_code slow_float_parsing(simdjson_unused const uint8_t * src, W writer) {
329   double d;
330   if (parse_float_fallback(src, &d)) {
331     writer.append_double(d);
332     return SUCCESS;
333   }
334   return INVALID_NUMBER(src);
335 }
336 
337 template<typename I>
338 NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
339 simdjson_really_inline bool parse_digit(const uint8_t c, I &i) {
340   const uint8_t digit = static_cast<uint8_t>(c - '0');
341   if (digit > 9) {
342     return false;
343   }
344   // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
345   i = 10 * i + digit; // might overflow, we will handle the overflow later
346   return true;
347 }
348 
349 simdjson_really_inline error_code parse_decimal(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
350   // we continue with the fiction that we have an integer. If the
351   // floating point number is representable as x * 10^z for some integer
352   // z that fits in 53 bits, then we will be able to convert back the
353   // the integer into a float in a lossless manner.
354   const uint8_t *const first_after_period = p;
355 
356 #ifdef SWAR_NUMBER_PARSING
357   // this helps if we have lots of decimals!
358   // this turns out to be frequent enough.
359   if (is_made_of_eight_digits_fast(p)) {
360     i = i * 100000000 + parse_eight_digits_unrolled(p);
361     p += 8;
362   }
363 #endif
364   // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
365   if (parse_digit(*p, i)) { ++p; }
366   while (parse_digit(*p, i)) { p++; }
367   exponent = first_after_period - p;
368   // Decimal without digits (123.) is illegal
369   if (exponent == 0) {
370     return INVALID_NUMBER(src);
371   }
372   return SUCCESS;
373 }
374 
375 simdjson_really_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
376   // Exp Sign: -123.456e[-]78
377   bool neg_exp = ('-' == *p);
378   if (neg_exp || '+' == *p) { p++; } // Skip + as well
379 
380   // Exponent: -123.456e-[78]
381   auto start_exp = p;
382   int64_t exp_number = 0;
383   while (parse_digit(*p, exp_number)) { ++p; }
384   // It is possible for parse_digit to overflow.
385   // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
386   // Thus we *must* check for possible overflow before we negate exp_number.
387 
388   // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
389   // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
390   // not oblige and may, in fact, generate two distinct paths in any case. It might be
391   // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
392   // instructions for a simdjson_likely branch, an unconclusive gain.
393 
394   // If there were no digits, it's an error.
395   if (simdjson_unlikely(p == start_exp)) {
396     return INVALID_NUMBER(src);
397   }
398   // We have a valid positive exponent in exp_number at this point, except that
399   // it may have overflowed.
400 
401   // If there were more than 18 digits, we may have overflowed the integer. We have to do
402   // something!!!!
403   if (simdjson_unlikely(p > start_exp+18)) {
404     // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow
405     while (*start_exp == '0') { start_exp++; }
406     // 19 digits could overflow int64_t and is kind of absurd anyway. We don't
407     // support exponents smaller than -999,999,999,999,999,999 and bigger
408     // than 999,999,999,999,999,999.
409     // We can truncate.
410     // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before
411     // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could
412     // truncate at 324.
413     // Note that there is no reason to fail per se at this point in time.
414     // E.g., 0e999999999999999999999 is a fine number.
415     if (p > start_exp+18) { exp_number = 999999999999999999; }
416   }
417   // At this point, we know that exp_number is a sane, positive, signed integer.
418   // It is <= 999,999,999,999,999,999. As long as 'exponent' is in
419   // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent'
420   // is bounded in magnitude by the size of the JSON input, we are fine in this universe.
421   // To sum it up: the next line should never overflow.
422   exponent += (neg_exp ? -exp_number : exp_number);
423   return SUCCESS;
424 }
425 
426 simdjson_really_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
427   // It is possible that the integer had an overflow.
428   // We have to handle the case where we have 0.0000somenumber.
429   const uint8_t *start = start_digits;
430   while ((*start == '0') || (*start == '.')) {
431     start++;
432   }
433   // we over-decrement by one when there is a '.'
434   return digit_count - size_t(start - start_digits);
435 }
436 
437 template<typename W>
438 simdjson_really_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
439   // If we frequently had to deal with long strings of digits,
440   // we could extend our code by using a 128-bit integer instead
441   // of a 64-bit integer. However, this is uncommon in practice.
442   //
443   // 9999999999999999999 < 2**64 so we can accomodate 19 digits.
444   // If we have a decimal separator, then digit_count - 1 is the number of digits, but we
445   // may not have a decimal separator!
446   if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
447     // Ok, chances are good that we had an overflow!
448     // this is almost never going to get called!!!
449     // we start anew, going slowly!!!
450     // This will happen in the following examples:
451     // 10000000000000000000000000000000000000000000e+308
452     // 3.1415926535897932384626433832795028841971693993751
453     //
454     // NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens
455     // because slow_float_parsing is a non-inlined function. If we passed our writer reference to
456     // it, it would force it to be stored in memory, preventing the compiler from picking it apart
457     // and putting into registers. i.e. if we pass it as reference, it gets slow.
458     // This is what forces the skip_double, as well.
459     error_code error = slow_float_parsing(src, writer);
460     writer.skip_double();
461     return error;
462   }
463   // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
464   // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
465   // To future reader: we'd love if someone found a better way, or at least could explain this result!
466   if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
467     //
468     // Important: smallest_power is such that it leads to a zero value.
469     // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
470     // so something x 10^-343 goes to zero, but not so with  something x 10^-342.
471     static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
472     //
473     if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
474       WRITE_DOUBLE(0, src, writer);
475       return SUCCESS;
476     } else { // (exponent > largest_power) and (i != 0)
477       // We have, for sure, an infinite value and simdjson refuses to parse infinite values.
478       return INVALID_NUMBER(src);
479     }
480   }
481   double d;
482   if (!compute_float_64(exponent, i, negative, d)) {
483     // we are almost never going to get here.
484     if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
485   }
486   WRITE_DOUBLE(d, src, writer);
487   return SUCCESS;
488 }
489 
490 // for performance analysis, it is sometimes  useful to skip parsing
491 #ifdef SIMDJSON_SKIPNUMBERPARSING
492 
493 template<typename W>
494 simdjson_really_inline error_code parse_number(const uint8_t *const, W &writer) {
495   writer.append_s64(0);        // always write zero
496   return SUCCESS;              // always succeeds
497 }
498 
499 simdjson_unused simdjson_really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept { return 0; }
500 simdjson_unused simdjson_really_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept { return 0; }
501 simdjson_unused simdjson_really_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept { return 0; }
502 
503 #else
504 
505 // parse the number at src
506 // define JSON_TEST_NUMBERS for unit testing
507 //
508 // It is assumed that the number is followed by a structural ({,},],[) character
509 // or a white space character. If that is not the case (e.g., when the JSON
510 // document is made of a single number), then it is necessary to copy the
511 // content and append a space before calling this function.
512 //
513 // Our objective is accurate parsing (ULP of 0) at high speed.
514 template<typename W>
515 simdjson_really_inline error_code parse_number(const uint8_t *const src, W &writer) {
516 
517   //
518   // Check for minus sign
519   //
520   bool negative = (*src == '-');
521   const uint8_t *p = src + negative;
522 
523   //
524   // Parse the integer part.
525   //
526   // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
527   const uint8_t *const start_digits = p;
528   uint64_t i = 0;
529   while (parse_digit(*p, i)) { p++; }
530 
531   // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
532   // Optimization note: size_t is expected to be unsigned.
533   size_t digit_count = size_t(p - start_digits);
534   if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
535 
536   //
537   // Handle floats if there is a . or e (or both)
538   //
539   int64_t exponent = 0;
540   bool is_float = false;
541   if ('.' == *p) {
542     is_float = true;
543     ++p;
544     SIMDJSON_TRY( parse_decimal(src, p, i, exponent) );
545     digit_count = int(p - start_digits); // used later to guard against overflows
546   }
547   if (('e' == *p) || ('E' == *p)) {
548     is_float = true;
549     ++p;
550     SIMDJSON_TRY( parse_exponent(src, p, exponent) );
551   }
552   if (is_float) {
553     const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
554     SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
555     if (dirty_end) { return INVALID_NUMBER(src); }
556     return SUCCESS;
557   }
558 
559   // The longest negative 64-bit number is 19 digits.
560   // The longest positive 64-bit number is 20 digits.
561   // We do it this way so we don't trigger this branch unless we must.
562   size_t longest_digit_count = negative ? 19 : 20;
563   if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); }
564   if (digit_count == longest_digit_count) {
565     if (negative) {
566       // Anything negative above INT64_MAX+1 is invalid
567       if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src);  }
568       WRITE_INTEGER(~i+1, src, writer);
569       if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
570       return SUCCESS;
571     // Positive overflow check:
572     // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
573     //   biggest uint64_t.
574     // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
575     //   If we got here, it's a 20 digit number starting with the digit "1".
576     // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
577     //   than 1,553,255,926,290,448,384.
578     // - That is smaller than the smallest possible 20-digit number the user could write:
579     //   10,000,000,000,000,000,000.
580     // - Therefore, if the number is positive and lower than that, it's overflow.
581     // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
582     //
583     }  else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
584   }
585 
586   // Write unsigned if it doesn't fit in a signed integer.
587   if (i > uint64_t(INT64_MAX)) {
588     WRITE_UNSIGNED(i, src, writer);
589   } else {
590     WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
591   }
592   if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
593   return SUCCESS;
594 }
595 
596 // Inlineable functions
597 namespace {
598 
599 // This table can be used to characterize the final character of an integer
600 // string. For JSON structural character and allowable white space characters,
601 // we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
602 // we return NUMBER_ERROR.
603 // Optimization note: we could easily reduce the size of the table by half (to 128)
604 // at the cost of an extra branch.
605 // Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
606 static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
607 static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
608 static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
609 
610 const uint8_t integer_string_finisher[256] = {
611     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
612     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
613     SUCCESS,      NUMBER_ERROR,   NUMBER_ERROR, SUCCESS,      NUMBER_ERROR,
614     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
615     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
616     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
617     NUMBER_ERROR, NUMBER_ERROR,   SUCCESS,      NUMBER_ERROR, NUMBER_ERROR,
618     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
619     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
620     NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
621     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
622     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, SUCCESS,      NUMBER_ERROR,
623     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
624     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE,
625     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
626     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
627     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
628     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
629     NUMBER_ERROR, SUCCESS,        NUMBER_ERROR, SUCCESS,      NUMBER_ERROR,
630     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
631     NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
632     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
633     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
634     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
635     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, SUCCESS,      NUMBER_ERROR,
636     SUCCESS,      NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
637     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
638     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
639     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
640     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
641     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
642     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
643     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
644     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
645     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
646     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
647     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
648     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
649     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
650     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
651     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
652     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
653     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
654     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
655     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
656     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
657     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
658     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
659     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
660     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
661     NUMBER_ERROR, NUMBER_ERROR,   NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
662     NUMBER_ERROR};
663 
664 // Parse any number from 0 to 18,446,744,073,709,551,615
665 simdjson_unused simdjson_really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
666   const uint8_t *p = src;
667   //
668   // Parse the integer part.
669   //
670   // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
671   const uint8_t *const start_digits = p;
672   uint64_t i = 0;
673   while (parse_digit(*p, i)) { p++; }
674 
675   // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
676   // Optimization note: size_t is expected to be unsigned.
677   size_t digit_count = size_t(p - start_digits);
678   // The longest positive 64-bit number is 20 digits.
679   // We do it this way so we don't trigger this branch unless we must.
680   // Optimization note: the compiler can probably merge
681   // ((digit_count == 0) || (digit_count > 20))
682   // into a single  branch since digit_count is unsigned.
683   if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
684   // Here digit_count > 0.
685   if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
686   // We can do the following...
687   // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
688   //  return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
689   // }
690   // as a single table lookup:
691   if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
692 
693   if (digit_count == 20) {
694     // Positive overflow check:
695     // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
696     //   biggest uint64_t.
697     // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
698     //   If we got here, it's a 20 digit number starting with the digit "1".
699     // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
700     //   than 1,553,255,926,290,448,384.
701     // - That is smaller than the smallest possible 20-digit number the user could write:
702     //   10,000,000,000,000,000,000.
703     // - Therefore, if the number is positive and lower than that, it's overflow.
704     // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
705     //
706     if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
707   }
708 
709   return i;
710 }
711 
712 // Parse any number from  -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
713 simdjson_unused simdjson_really_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
714   //
715   // Check for minus sign
716   //
717   bool negative = (*src == '-');
718   const uint8_t *p = src + negative;
719 
720   //
721   // Parse the integer part.
722   //
723   // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
724   const uint8_t *const start_digits = p;
725   uint64_t i = 0;
726   while (parse_digit(*p, i)) { p++; }
727 
728   // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
729   // Optimization note: size_t is expected to be unsigned.
730   size_t digit_count = size_t(p - start_digits);
731   // The longest negative 64-bit number is 19 digits.
732   // The longest positive 64-bit number is 20 digits.
733   // We do it this way so we don't trigger this branch unless we must.
734   size_t longest_digit_count = negative ? 19 : 20;
735   // Optimization note: the compiler can probably merge
736   // ((digit_count == 0) || (digit_count > longest_digit_count))
737   // into a single  branch since digit_count is unsigned.
738   if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
739   // Here digit_count > 0.
740   if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
741   // We can do the following...
742   // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
743   //  return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
744   // }
745   // as a single table lookup:
746   if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
747   if (digit_count == longest_digit_count) {
748     if (negative) {
749       // Anything negative above INT64_MAX+1 is invalid
750       if (i > uint64_t(INT64_MAX)+1) { return INCORRECT_TYPE; }
751       return ~i+1;
752 
753     // Positive overflow check:
754     // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
755     //   biggest uint64_t.
756     // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
757     //   If we got here, it's a 20 digit number starting with the digit "1".
758     // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
759     //   than 1,553,255,926,290,448,384.
760     // - That is smaller than the smallest possible 20-digit number the user could write:
761     //   10,000,000,000,000,000,000.
762     // - Therefore, if the number is positive and lower than that, it's overflow.
763     // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
764     //
765     } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
766   }
767 
768   return negative ? (~i+1) : i;
769 }
770 
771 simdjson_unused simdjson_really_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
772   //
773   // Check for minus sign
774   //
775   bool negative = (*src == '-');
776   src += negative;
777 
778   //
779   // Parse the integer part.
780   //
781   uint64_t i = 0;
782   const uint8_t *p = src;
783   p += parse_digit(*p, i);
784   bool leading_zero = (i == 0);
785   while (parse_digit(*p, i)) { p++; }
786   // no integer digits, or 0123 (zero must be solo)
787   if ( p == src ) { return INCORRECT_TYPE; }
788   if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
789 
790   //
791   // Parse the decimal part.
792   //
793   int64_t exponent = 0;
794   bool overflow;
795   if (simdjson_likely(*p == '.')) {
796     p++;
797     const uint8_t *start_decimal_digits = p;
798     if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
799     p++;
800     while (parse_digit(*p, i)) { p++; }
801     exponent = -(p - start_decimal_digits);
802 
803     // Overflow check. More than 19 digits (minus the decimal) may be overflow.
804     overflow = p-src-1 > 19;
805     if (simdjson_unlikely(overflow && leading_zero)) {
806       // Skip leading 0.00000 and see if it still overflows
807       const uint8_t *start_digits = src + 2;
808       while (*start_digits == '0') { start_digits++; }
809       overflow = start_digits-src > 19;
810     }
811   } else {
812     overflow = p-src > 19;
813   }
814 
815   //
816   // Parse the exponent
817   //
818   if (*p == 'e' || *p == 'E') {
819     p++;
820     bool exp_neg = *p == '-';
821     p += exp_neg || *p == '+';
822 
823     uint64_t exp = 0;
824     const uint8_t *start_exp_digits = p;
825     while (parse_digit(*p, exp)) { p++; }
826     // no exp digits, or 20+ exp digits
827     if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
828 
829     exponent += exp_neg ? 0-exp : exp;
830   }
831 
832   if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
833 
834   overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
835 
836   //
837   // Assemble (or slow-parse) the float
838   //
839   double d;
840   if (simdjson_likely(!overflow)) {
841     if (compute_float_64(exponent, i, negative, d)) { return d; }
842   }
843   if (!parse_float_fallback(src-negative, &d)) {
844     return NUMBER_ERROR;
845   }
846   return d;
847 }
848 } //namespace {}
849 #endif // SIMDJSON_SKIPNUMBERPARSING
850 
851 } // namespace numberparsing
852 } // unnamed namespace
853 } // namespace SIMDJSON_IMPLEMENTATION
854 } // namespace simdjson
855