1 #include "simdjson/internal/numberparsing_tables.h"
2 #include <limits>
3
4 namespace simdjson {
5 namespace SIMDJSON_IMPLEMENTATION {
6 namespace {
7 /// @private
8 namespace numberparsing {
9
10
11
12 #ifdef JSON_TEST_NUMBERS
13 #define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
14 #define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
15 #define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
16 #define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
17 #else
18 #define INVALID_NUMBER(SRC) (NUMBER_ERROR)
19 #define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
20 #define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
21 #define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
22 #endif
23
24 namespace {
25 // Convert a mantissa, an exponent and a sign bit into an ieee64 double.
26 // The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
27 // The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed.
to_double(uint64_t mantissa,uint64_t real_exponent,bool negative)28 simdjson_really_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
29 double d;
30 mantissa &= ~(1ULL << 52);
31 mantissa |= real_exponent << 52;
32 mantissa |= ((static_cast<uint64_t>(negative)) << 63);
33 std::memcpy(&d, &mantissa, sizeof(d));
34 return d;
35 }
36 }
37 // Attempts to compute i * 10^(power) exactly; and if "negative" is
38 // true, negate the result.
39 // This function will only work in some cases, when it does not work, success is
40 // set to false. This should work *most of the time* (like 99% of the time).
41 // We assume that power is in the [smallest_power,
42 // largest_power] interval: the caller is responsible for this check.
compute_float_64(int64_t power,uint64_t i,bool negative,double & d)43 simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
44 // we start with a fast path
45 // It was described in
46 // Clinger WD. How to read floating point numbers accurately.
47 // ACM SIGPLAN Notices. 1990
48 #ifndef FLT_EVAL_METHOD
49 #error "FLT_EVAL_METHOD should be defined, please include cfloat."
50 #endif
51 #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
52 // We cannot be certain that x/y is rounded to nearest.
53 if (0 <= power && power <= 22 && i <= 9007199254740991) {
54 #else
55 if (-22 <= power && power <= 22 && i <= 9007199254740991) {
56 #endif
57 // convert the integer into a double. This is lossless since
58 // 0 <= i <= 2^53 - 1.
59 d = double(i);
60 //
61 // The general idea is as follows.
62 // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
63 // 1) Both s and p can be represented exactly as 64-bit floating-point
64 // values
65 // (binary64).
66 // 2) Because s and p can be represented exactly as floating-point values,
67 // then s * p
68 // and s / p will produce correctly rounded values.
69 //
70 if (power < 0) {
71 d = d / simdjson::internal::power_of_ten[-power];
72 } else {
73 d = d * simdjson::internal::power_of_ten[power];
74 }
75 if (negative) {
76 d = -d;
77 }
78 return true;
79 }
80 // When 22 < power && power < 22 + 16, we could
81 // hope for another, secondary fast path. It was
82 // described by David M. Gay in "Correctly rounded
83 // binary-decimal and decimal-binary conversions." (1990)
84 // If you need to compute i * 10^(22 + x) for x < 16,
85 // first compute i * 10^x, if you know that result is exact
86 // (e.g., when i * 10^x < 2^53),
87 // then you can still proceed and do (i * 10^x) * 10^22.
88 // Is this worth your time?
89 // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
90 // for this second fast path to work.
91 // If you you have 22 < power *and* power < 22 + 16, and then you
92 // optimistically compute "i * 10^(x-22)", there is still a chance that you
93 // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
94 // this optimization maybe less common than we would like. Source:
95 // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
96 // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
97
98 // The fast path has now failed, so we are failing back on the slower path.
99
100 // In the slow path, we need to adjust i so that it is > 1<<63 which is always
101 // possible, except if i == 0, so we handle i == 0 separately.
102 if(i == 0) {
103 d = 0.0;
104 return true;
105 }
106
107
108 // The exponent is 1024 + 63 + power
109 // + floor(log(5**power)/log(2)).
110 // The 1024 comes from the ieee64 standard.
111 // The 63 comes from the fact that we use a 64-bit word.
112 //
113 // Computing floor(log(5**power)/log(2)) could be
114 // slow. Instead we use a fast function.
115 //
116 // For power in (-400,350), we have that
117 // (((152170 + 65536) * power ) >> 16);
118 // is equal to
119 // floor(log(5**power)/log(2)) + power when power >= 0
120 // and it is equal to
121 // ceil(log(5**-power)/log(2)) + power when power < 0
122 //
123 // The 65536 is (1<<16) and corresponds to
124 // (65536 * power) >> 16 ---> power
125 //
126 // ((152170 * power ) >> 16) is equal to
127 // floor(log(5**power)/log(2))
128 //
129 // Note that this is not magic: 152170/(1<<16) is
130 // approximatively equal to log(5)/log(2).
131 // The 1<<16 value is a power of two; we could use a
132 // larger power of 2 if we wanted to.
133 //
134 int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
135
136
137 // We want the most significant bit of i to be 1. Shift if needed.
138 int lz = leading_zeroes(i);
139 i <<= lz;
140
141
142 // We are going to need to do some 64-bit arithmetic to get a precise product.
143 // We use a table lookup approach.
144 // It is safe because
145 // power >= smallest_power
146 // and power <= largest_power
147 // We recover the mantissa of the power, it has a leading 1. It is always
148 // rounded down.
149 //
150 // We want the most significant 64 bits of the product. We know
151 // this will be non-zero because the most significant bit of i is
152 // 1.
153 const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
154 // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
155 //
156 // The full_multiplication function computes the 128-bit product of two 64-bit words
157 // with a returned value of type value128 with a "low component" corresponding to the
158 // 64-bit least significant bits of the product and with a "high component" corresponding
159 // to the 64-bit most significant bits of the product.
160 simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index]);
161 // Both i and power_of_five_128[index] have their most significant bit set to 1 which
162 // implies that the either the most or the second most significant bit of the product
163 // is 1. We pack values in this manner for efficiency reasons: it maximizes the use
164 // we make of the product. It also makes it easy to reason aboutthe product: there
165 // 0 or 1 leading zero in the product.
166
167 // Unless the least significant 9 bits of the high (64-bit) part of the full
168 // product are all 1s, then we know that the most significant 55 bits are
169 // exact and no further work is needed. Having 55 bits is necessary because
170 // we need 53 bits for the mantissa but we have to have one rounding bit and
171 // we can waste a bit if the most significant bit of the product is zero.
172 if((firstproduct.high & 0x1FF) == 0x1FF) {
173 // We want to compute i * 5^q, but only care about the top 55 bits at most.
174 // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
175 // the full computation is wasteful. So we do what is called a "truncated
176 // multiplication".
177 // We take the most significant 64-bits, and we put them in
178 // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
179 // to the desired approximation using one multiplication. Sometimes it does not suffice.
180 // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
181 // then we get a better approximation to i * 5^q. In very rare cases, even that
182 // will not suffice, though it is seemingly very hard to find such a scenario.
183 //
184 // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
185 // more complicated.
186 //
187 // There is an extra layer of complexity in that we need more than 55 bits of
188 // accuracy in the round-to-even scenario.
189 //
190 // The full_multiplication function computes the 128-bit product of two 64-bit words
191 // with a returned value of type value128 with a "low component" corresponding to the
192 // 64-bit least significant bits of the product and with a "high component" corresponding
193 // to the 64-bit most significant bits of the product.
194 simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
195 firstproduct.low += secondproduct.high;
196 if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
197 // At this point, we might need to add at most one to firstproduct, but this
198 // can only change the value of firstproduct.high if firstproduct.low is maximal.
199 if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
200 // This is very unlikely, but if so, we need to do much more work!
201 return false;
202 }
203 }
204 uint64_t lower = firstproduct.low;
205 uint64_t upper = firstproduct.high;
206 // The final mantissa should be 53 bits with a leading 1.
207 // We shift it so that it occupies 54 bits with a leading 1.
208 ///////
209 uint64_t upperbit = upper >> 63;
210 uint64_t mantissa = upper >> (upperbit + 9);
211 lz += int(1 ^ upperbit);
212
213 // Here we have mantissa < (1<<54).
214 int64_t real_exponent = exponent - lz;
215 if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
216 // Here have that real_exponent <= 0 so -real_exponent >= 0
217 if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
218 d = 0.0;
219 return true;
220 }
221 // next line is safe because -real_exponent + 1 < 0
222 mantissa >>= -real_exponent + 1;
223 // Thankfully, we can't have both "round-to-even" and subnormals because
224 // "round-to-even" only occurs for powers close to 0.
225 mantissa += (mantissa & 1); // round up
226 mantissa >>= 1;
227 // There is a weird scenario where we don't have a subnormal but just.
228 // Suppose we start with 2.2250738585072013e-308, we end up
229 // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
230 // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
231 // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
232 // subnormal, but we can only know this after rounding.
233 // So we only declare a subnormal if we are smaller than the threshold.
234 real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
235 d = to_double(mantissa, real_exponent, negative);
236 return true;
237 }
238 // We have to round to even. The "to even" part
239 // is only a problem when we are right in between two floats
240 // which we guard against.
241 // If we have lots of trailing zeros, we may fall right between two
242 // floating-point values.
243 //
244 // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
245 // times a power of two. That is, it is right between a number with binary significand
246 // m and another number with binary significand m+1; and it must be the case
247 // that it cannot be represented by a float itself.
248 //
249 // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
250 // Recall that 10^q = 5^q * 2^q.
251 // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that
252 // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
253 // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
254 // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
255 // 2^{53} x 5^{-q} < 2^{64}.
256 // Hence we have 5^{-q} < 2^{11}$ or q>= -4.
257 //
258 // We require lower <= 1 and not lower == 0 because we could not prove that
259 // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
260 if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
261 if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
262 mantissa &= ~1; // flip it so that we do not round up
263 }
264 }
265
266 mantissa += mantissa & 1;
267 mantissa >>= 1;
268
269 // Here we have mantissa < (1<<53), unless there was an overflow
270 if (mantissa >= (1ULL << 53)) {
271 //////////
272 // This will happen when parsing values such as 7.2057594037927933e+16
273 ////////
274 mantissa = (1ULL << 52);
275 real_exponent++;
276 }
277 mantissa &= ~(1ULL << 52);
278 // we have to check that real_exponent is in range, otherwise we bail out
279 if (simdjson_unlikely(real_exponent > 2046)) {
280 // We have an infinte value!!! We could actually throw an error here if we could.
281 return false;
282 }
283 d = to_double(mantissa, real_exponent, negative);
284 return true;
285 }
286
287 // We call a fallback floating-point parser that might be slow. Note
288 // it will accept JSON numbers, but the JSON spec. is more restrictive so
289 // before you call parse_float_fallback, you need to have validated the input
290 // string with the JSON grammar.
291 // It will return an error (false) if the parsed number is infinite.
292 // The string parsing itself always succeeds. We know that there is at least
293 // one digit.
294 static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
295 *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
296 // We do not accept infinite values.
297
298 // Detecting finite values in a portable manner is ridiculously hard, ideally
299 // we would want to do:
300 // return !std::isfinite(*outDouble);
301 // but that mysteriously fails under legacy/old libc++ libraries, see
302 // https://github.com/simdjson/simdjson/issues/1286
303 //
304 // Therefore, fall back to this solution (the extra parens are there
305 // to handle that max may be a macro on windows).
306 return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
307 }
308
309 // check quickly whether the next 8 chars are made of digits
310 // at a glance, it looks better than Mula's
311 // http://0x80.pl/articles/swar-digits-validate.html
312 simdjson_really_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
313 uint64_t val;
314 // this can read up to 7 bytes beyond the buffer size, but we require
315 // SIMDJSON_PADDING of padding
316 static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
317 std::memcpy(&val, chars, 8);
318 // a branchy method might be faster:
319 // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
320 // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
321 // 0x3030303030303030);
322 return (((val & 0xF0F0F0F0F0F0F0F0) |
323 (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
324 0x3333333333333333);
325 }
326
327 template<typename W>
328 error_code slow_float_parsing(simdjson_unused const uint8_t * src, W writer) {
329 double d;
330 if (parse_float_fallback(src, &d)) {
331 writer.append_double(d);
332 return SUCCESS;
333 }
334 return INVALID_NUMBER(src);
335 }
336
337 template<typename I>
338 NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
339 simdjson_really_inline bool parse_digit(const uint8_t c, I &i) {
340 const uint8_t digit = static_cast<uint8_t>(c - '0');
341 if (digit > 9) {
342 return false;
343 }
344 // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
345 i = 10 * i + digit; // might overflow, we will handle the overflow later
346 return true;
347 }
348
349 simdjson_really_inline error_code parse_decimal(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
350 // we continue with the fiction that we have an integer. If the
351 // floating point number is representable as x * 10^z for some integer
352 // z that fits in 53 bits, then we will be able to convert back the
353 // the integer into a float in a lossless manner.
354 const uint8_t *const first_after_period = p;
355
356 #ifdef SWAR_NUMBER_PARSING
357 // this helps if we have lots of decimals!
358 // this turns out to be frequent enough.
359 if (is_made_of_eight_digits_fast(p)) {
360 i = i * 100000000 + parse_eight_digits_unrolled(p);
361 p += 8;
362 }
363 #endif
364 // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
365 if (parse_digit(*p, i)) { ++p; }
366 while (parse_digit(*p, i)) { p++; }
367 exponent = first_after_period - p;
368 // Decimal without digits (123.) is illegal
369 if (exponent == 0) {
370 return INVALID_NUMBER(src);
371 }
372 return SUCCESS;
373 }
374
375 simdjson_really_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
376 // Exp Sign: -123.456e[-]78
377 bool neg_exp = ('-' == *p);
378 if (neg_exp || '+' == *p) { p++; } // Skip + as well
379
380 // Exponent: -123.456e-[78]
381 auto start_exp = p;
382 int64_t exp_number = 0;
383 while (parse_digit(*p, exp_number)) { ++p; }
384 // It is possible for parse_digit to overflow.
385 // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
386 // Thus we *must* check for possible overflow before we negate exp_number.
387
388 // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
389 // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
390 // not oblige and may, in fact, generate two distinct paths in any case. It might be
391 // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
392 // instructions for a simdjson_likely branch, an unconclusive gain.
393
394 // If there were no digits, it's an error.
395 if (simdjson_unlikely(p == start_exp)) {
396 return INVALID_NUMBER(src);
397 }
398 // We have a valid positive exponent in exp_number at this point, except that
399 // it may have overflowed.
400
401 // If there were more than 18 digits, we may have overflowed the integer. We have to do
402 // something!!!!
403 if (simdjson_unlikely(p > start_exp+18)) {
404 // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow
405 while (*start_exp == '0') { start_exp++; }
406 // 19 digits could overflow int64_t and is kind of absurd anyway. We don't
407 // support exponents smaller than -999,999,999,999,999,999 and bigger
408 // than 999,999,999,999,999,999.
409 // We can truncate.
410 // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before
411 // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could
412 // truncate at 324.
413 // Note that there is no reason to fail per se at this point in time.
414 // E.g., 0e999999999999999999999 is a fine number.
415 if (p > start_exp+18) { exp_number = 999999999999999999; }
416 }
417 // At this point, we know that exp_number is a sane, positive, signed integer.
418 // It is <= 999,999,999,999,999,999. As long as 'exponent' is in
419 // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent'
420 // is bounded in magnitude by the size of the JSON input, we are fine in this universe.
421 // To sum it up: the next line should never overflow.
422 exponent += (neg_exp ? -exp_number : exp_number);
423 return SUCCESS;
424 }
425
426 simdjson_really_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
427 // It is possible that the integer had an overflow.
428 // We have to handle the case where we have 0.0000somenumber.
429 const uint8_t *start = start_digits;
430 while ((*start == '0') || (*start == '.')) {
431 start++;
432 }
433 // we over-decrement by one when there is a '.'
434 return digit_count - size_t(start - start_digits);
435 }
436
437 template<typename W>
438 simdjson_really_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
439 // If we frequently had to deal with long strings of digits,
440 // we could extend our code by using a 128-bit integer instead
441 // of a 64-bit integer. However, this is uncommon in practice.
442 //
443 // 9999999999999999999 < 2**64 so we can accomodate 19 digits.
444 // If we have a decimal separator, then digit_count - 1 is the number of digits, but we
445 // may not have a decimal separator!
446 if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
447 // Ok, chances are good that we had an overflow!
448 // this is almost never going to get called!!!
449 // we start anew, going slowly!!!
450 // This will happen in the following examples:
451 // 10000000000000000000000000000000000000000000e+308
452 // 3.1415926535897932384626433832795028841971693993751
453 //
454 // NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens
455 // because slow_float_parsing is a non-inlined function. If we passed our writer reference to
456 // it, it would force it to be stored in memory, preventing the compiler from picking it apart
457 // and putting into registers. i.e. if we pass it as reference, it gets slow.
458 // This is what forces the skip_double, as well.
459 error_code error = slow_float_parsing(src, writer);
460 writer.skip_double();
461 return error;
462 }
463 // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
464 // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
465 // To future reader: we'd love if someone found a better way, or at least could explain this result!
466 if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
467 //
468 // Important: smallest_power is such that it leads to a zero value.
469 // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
470 // so something x 10^-343 goes to zero, but not so with something x 10^-342.
471 static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
472 //
473 if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
474 WRITE_DOUBLE(0, src, writer);
475 return SUCCESS;
476 } else { // (exponent > largest_power) and (i != 0)
477 // We have, for sure, an infinite value and simdjson refuses to parse infinite values.
478 return INVALID_NUMBER(src);
479 }
480 }
481 double d;
482 if (!compute_float_64(exponent, i, negative, d)) {
483 // we are almost never going to get here.
484 if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
485 }
486 WRITE_DOUBLE(d, src, writer);
487 return SUCCESS;
488 }
489
490 // for performance analysis, it is sometimes useful to skip parsing
491 #ifdef SIMDJSON_SKIPNUMBERPARSING
492
493 template<typename W>
494 simdjson_really_inline error_code parse_number(const uint8_t *const, W &writer) {
495 writer.append_s64(0); // always write zero
496 return SUCCESS; // always succeeds
497 }
498
499 simdjson_unused simdjson_really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept { return 0; }
500 simdjson_unused simdjson_really_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept { return 0; }
501 simdjson_unused simdjson_really_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept { return 0; }
502
503 #else
504
505 // parse the number at src
506 // define JSON_TEST_NUMBERS for unit testing
507 //
508 // It is assumed that the number is followed by a structural ({,},],[) character
509 // or a white space character. If that is not the case (e.g., when the JSON
510 // document is made of a single number), then it is necessary to copy the
511 // content and append a space before calling this function.
512 //
513 // Our objective is accurate parsing (ULP of 0) at high speed.
514 template<typename W>
515 simdjson_really_inline error_code parse_number(const uint8_t *const src, W &writer) {
516
517 //
518 // Check for minus sign
519 //
520 bool negative = (*src == '-');
521 const uint8_t *p = src + negative;
522
523 //
524 // Parse the integer part.
525 //
526 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
527 const uint8_t *const start_digits = p;
528 uint64_t i = 0;
529 while (parse_digit(*p, i)) { p++; }
530
531 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
532 // Optimization note: size_t is expected to be unsigned.
533 size_t digit_count = size_t(p - start_digits);
534 if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
535
536 //
537 // Handle floats if there is a . or e (or both)
538 //
539 int64_t exponent = 0;
540 bool is_float = false;
541 if ('.' == *p) {
542 is_float = true;
543 ++p;
544 SIMDJSON_TRY( parse_decimal(src, p, i, exponent) );
545 digit_count = int(p - start_digits); // used later to guard against overflows
546 }
547 if (('e' == *p) || ('E' == *p)) {
548 is_float = true;
549 ++p;
550 SIMDJSON_TRY( parse_exponent(src, p, exponent) );
551 }
552 if (is_float) {
553 const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
554 SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
555 if (dirty_end) { return INVALID_NUMBER(src); }
556 return SUCCESS;
557 }
558
559 // The longest negative 64-bit number is 19 digits.
560 // The longest positive 64-bit number is 20 digits.
561 // We do it this way so we don't trigger this branch unless we must.
562 size_t longest_digit_count = negative ? 19 : 20;
563 if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); }
564 if (digit_count == longest_digit_count) {
565 if (negative) {
566 // Anything negative above INT64_MAX+1 is invalid
567 if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); }
568 WRITE_INTEGER(~i+1, src, writer);
569 if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
570 return SUCCESS;
571 // Positive overflow check:
572 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
573 // biggest uint64_t.
574 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
575 // If we got here, it's a 20 digit number starting with the digit "1".
576 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
577 // than 1,553,255,926,290,448,384.
578 // - That is smaller than the smallest possible 20-digit number the user could write:
579 // 10,000,000,000,000,000,000.
580 // - Therefore, if the number is positive and lower than that, it's overflow.
581 // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
582 //
583 } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
584 }
585
586 // Write unsigned if it doesn't fit in a signed integer.
587 if (i > uint64_t(INT64_MAX)) {
588 WRITE_UNSIGNED(i, src, writer);
589 } else {
590 WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
591 }
592 if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
593 return SUCCESS;
594 }
595
596 // Inlineable functions
597 namespace {
598
599 // This table can be used to characterize the final character of an integer
600 // string. For JSON structural character and allowable white space characters,
601 // we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
602 // we return NUMBER_ERROR.
603 // Optimization note: we could easily reduce the size of the table by half (to 128)
604 // at the cost of an extra branch.
605 // Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
606 static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
607 static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
608 static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
609
610 const uint8_t integer_string_finisher[256] = {
611 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
612 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
613 SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
614 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
615 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
616 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
617 NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
618 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
619 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
620 NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
621 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
622 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
623 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
624 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE,
625 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
626 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
627 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
628 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
629 NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
630 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
631 NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
632 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
633 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
634 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
635 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
636 SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
637 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
638 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
639 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
640 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
641 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
642 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
643 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
644 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
645 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
646 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
647 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
648 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
649 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
650 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
651 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
652 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
653 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
654 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
655 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
656 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
657 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
658 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
659 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
660 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
661 NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
662 NUMBER_ERROR};
663
664 // Parse any number from 0 to 18,446,744,073,709,551,615
665 simdjson_unused simdjson_really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
666 const uint8_t *p = src;
667 //
668 // Parse the integer part.
669 //
670 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
671 const uint8_t *const start_digits = p;
672 uint64_t i = 0;
673 while (parse_digit(*p, i)) { p++; }
674
675 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
676 // Optimization note: size_t is expected to be unsigned.
677 size_t digit_count = size_t(p - start_digits);
678 // The longest positive 64-bit number is 20 digits.
679 // We do it this way so we don't trigger this branch unless we must.
680 // Optimization note: the compiler can probably merge
681 // ((digit_count == 0) || (digit_count > 20))
682 // into a single branch since digit_count is unsigned.
683 if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
684 // Here digit_count > 0.
685 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
686 // We can do the following...
687 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
688 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
689 // }
690 // as a single table lookup:
691 if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
692
693 if (digit_count == 20) {
694 // Positive overflow check:
695 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
696 // biggest uint64_t.
697 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
698 // If we got here, it's a 20 digit number starting with the digit "1".
699 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
700 // than 1,553,255,926,290,448,384.
701 // - That is smaller than the smallest possible 20-digit number the user could write:
702 // 10,000,000,000,000,000,000.
703 // - Therefore, if the number is positive and lower than that, it's overflow.
704 // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
705 //
706 if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
707 }
708
709 return i;
710 }
711
712 // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
713 simdjson_unused simdjson_really_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
714 //
715 // Check for minus sign
716 //
717 bool negative = (*src == '-');
718 const uint8_t *p = src + negative;
719
720 //
721 // Parse the integer part.
722 //
723 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
724 const uint8_t *const start_digits = p;
725 uint64_t i = 0;
726 while (parse_digit(*p, i)) { p++; }
727
728 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
729 // Optimization note: size_t is expected to be unsigned.
730 size_t digit_count = size_t(p - start_digits);
731 // The longest negative 64-bit number is 19 digits.
732 // The longest positive 64-bit number is 20 digits.
733 // We do it this way so we don't trigger this branch unless we must.
734 size_t longest_digit_count = negative ? 19 : 20;
735 // Optimization note: the compiler can probably merge
736 // ((digit_count == 0) || (digit_count > longest_digit_count))
737 // into a single branch since digit_count is unsigned.
738 if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
739 // Here digit_count > 0.
740 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
741 // We can do the following...
742 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
743 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
744 // }
745 // as a single table lookup:
746 if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
747 if (digit_count == longest_digit_count) {
748 if (negative) {
749 // Anything negative above INT64_MAX+1 is invalid
750 if (i > uint64_t(INT64_MAX)+1) { return INCORRECT_TYPE; }
751 return ~i+1;
752
753 // Positive overflow check:
754 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
755 // biggest uint64_t.
756 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
757 // If we got here, it's a 20 digit number starting with the digit "1".
758 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
759 // than 1,553,255,926,290,448,384.
760 // - That is smaller than the smallest possible 20-digit number the user could write:
761 // 10,000,000,000,000,000,000.
762 // - Therefore, if the number is positive and lower than that, it's overflow.
763 // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
764 //
765 } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
766 }
767
768 return negative ? (~i+1) : i;
769 }
770
771 simdjson_unused simdjson_really_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
772 //
773 // Check for minus sign
774 //
775 bool negative = (*src == '-');
776 src += negative;
777
778 //
779 // Parse the integer part.
780 //
781 uint64_t i = 0;
782 const uint8_t *p = src;
783 p += parse_digit(*p, i);
784 bool leading_zero = (i == 0);
785 while (parse_digit(*p, i)) { p++; }
786 // no integer digits, or 0123 (zero must be solo)
787 if ( p == src ) { return INCORRECT_TYPE; }
788 if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
789
790 //
791 // Parse the decimal part.
792 //
793 int64_t exponent = 0;
794 bool overflow;
795 if (simdjson_likely(*p == '.')) {
796 p++;
797 const uint8_t *start_decimal_digits = p;
798 if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
799 p++;
800 while (parse_digit(*p, i)) { p++; }
801 exponent = -(p - start_decimal_digits);
802
803 // Overflow check. More than 19 digits (minus the decimal) may be overflow.
804 overflow = p-src-1 > 19;
805 if (simdjson_unlikely(overflow && leading_zero)) {
806 // Skip leading 0.00000 and see if it still overflows
807 const uint8_t *start_digits = src + 2;
808 while (*start_digits == '0') { start_digits++; }
809 overflow = start_digits-src > 19;
810 }
811 } else {
812 overflow = p-src > 19;
813 }
814
815 //
816 // Parse the exponent
817 //
818 if (*p == 'e' || *p == 'E') {
819 p++;
820 bool exp_neg = *p == '-';
821 p += exp_neg || *p == '+';
822
823 uint64_t exp = 0;
824 const uint8_t *start_exp_digits = p;
825 while (parse_digit(*p, exp)) { p++; }
826 // no exp digits, or 20+ exp digits
827 if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
828
829 exponent += exp_neg ? 0-exp : exp;
830 }
831
832 if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
833
834 overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
835
836 //
837 // Assemble (or slow-parse) the float
838 //
839 double d;
840 if (simdjson_likely(!overflow)) {
841 if (compute_float_64(exponent, i, negative, d)) { return d; }
842 }
843 if (!parse_float_fallback(src-negative, &d)) {
844 return NUMBER_ERROR;
845 }
846 return d;
847 }
848 } //namespace {}
849 #endif // SIMDJSON_SKIPNUMBERPARSING
850
851 } // namespace numberparsing
852 } // unnamed namespace
853 } // namespace SIMDJSON_IMPLEMENTATION
854 } // namespace simdjson
855