1 /*
2 * Copyright (c) 2021 Calvin Rose
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to
6 * deal in the Software without restriction, including without limitation the
7 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 * sell copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20 * IN THE SOFTWARE.
21 */
22
23 /* Use a custom double parser instead of libc's strtod for better portability
24 * and control.
25 *
26 * This version has been modified for much greater flexibility in parsing, such
27 * as choosing the radix and supporting scientific notation with any radix.
28 *
29 * Numbers are of the form [-+]R[rR]I.F[eE&][-+]X in pseudo-regex form, where R
30 * is the radix, I is the integer part, F is the fractional part, and X is the
31 * exponent. All signs, radix, decimal point, fractional part, and exponent can
32 * be omitted. The radix is assumed to be 10 if omitted, and the E or e
33 * separator for the exponent can only be used when the radix is 10. This is
34 * because E is a valid digit in bases 15 or greater. For bases greater than
35 * 10, the letters are used as digits. A through Z correspond to the digits 10
36 * through 35, and the lowercase letters have the same values. The radix number
37 * is always in base 10. For example, a hexidecimal number could be written
38 * '16rdeadbeef'. janet_scan_number also supports some c style syntax for
39 * hexidecimal literals. The previous number could also be written
40 * '0xdeadbeef'.
41 */
42
43 #ifndef JANET_AMALG
44 #include "features.h"
45 #include <janet.h>
46 #include "util.h"
47 #endif
48
49 #include <math.h>
50 #include <string.h>
51
52 /* Lookup table for getting values of characters when parsing numbers. Handles
53 * digits 0-9 and a-z (and A-Z). A-Z have values of 10 to 35. */
54 static uint8_t digit_lookup[128] = {
55 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
56 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
57 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
58 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
59 0xff, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
60 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 0xff, 0xff, 0xff, 0xff, 0xff,
61 0xff, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
62 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 0xff, 0xff, 0xff, 0xff, 0xff
63 };
64
65 #define BIGNAT_NBIT 31
66 #define BIGNAT_BASE 0x80000000U
67
68 /* Allow for large mantissa. BigNat is a natural number. */
69 struct BigNat {
70 uint32_t first_digit; /* First digit so we don't need to allocate when not needed. */
71 int32_t n; /* n digits */
72 int32_t cap; /* allocated digit capacity */
73 uint32_t *digits; /* Each digit is base (2 ^ 31). Digits are least significant first. */
74 };
75
76 /* Initialize a bignat to 0 */
bignat_zero(struct BigNat * x)77 static void bignat_zero(struct BigNat *x) {
78 x->first_digit = 0;
79 x->n = 0;
80 x->cap = 0;
81 x->digits = NULL;
82 }
83
84 /* Allocate n more digits for mant. Return a pointer to these digits. */
bignat_extra(struct BigNat * mant,int32_t n)85 static uint32_t *bignat_extra(struct BigNat *mant, int32_t n) {
86 int32_t oldn = mant->n;
87 int32_t newn = oldn + n;
88 if (mant->cap < newn) {
89 int32_t newcap = 2 * newn;
90 uint32_t *mem = janet_realloc(mant->digits, (size_t) newcap * sizeof(uint32_t));
91 if (NULL == mem) {
92 JANET_OUT_OF_MEMORY;
93 }
94 mant->cap = newcap;
95 mant->digits = mem;
96 }
97 mant->n = newn;
98 return mant->digits + oldn;
99 }
100
101 /* Append a digit */
bignat_append(struct BigNat * mant,uint32_t dig)102 static void bignat_append(struct BigNat *mant, uint32_t dig) {
103 bignat_extra(mant, 1)[0] = dig;
104 }
105
106 /* Multiply the mantissa mant by a factor and the add a term
107 * in one operation. factor will be between 2 and 36^4,
108 * term will be between 0 and 36. */
bignat_muladd(struct BigNat * mant,uint32_t factor,uint32_t term)109 static void bignat_muladd(struct BigNat *mant, uint32_t factor, uint32_t term) {
110 int32_t i;
111 uint64_t carry = ((uint64_t) mant->first_digit) * factor + term;
112 mant->first_digit = carry % BIGNAT_BASE;
113 carry /= BIGNAT_BASE;
114 for (i = 0; i < mant->n; i++) {
115 carry += ((uint64_t) mant->digits[i]) * factor;
116 mant->digits[i] = carry % BIGNAT_BASE;
117 carry /= BIGNAT_BASE;
118 }
119 if (carry) bignat_append(mant, (uint32_t) carry);
120 }
121
122 /* Divide the mantissa mant by a factor. Drop the remainder. */
bignat_div(struct BigNat * mant,uint32_t divisor)123 static void bignat_div(struct BigNat *mant, uint32_t divisor) {
124 int32_t i;
125 uint32_t quotient, remainder;
126 uint64_t dividend;
127 remainder = 0, quotient = 0;
128 for (i = mant->n - 1; i >= 0; i--) {
129 dividend = ((uint64_t)remainder * BIGNAT_BASE) + mant->digits[i];
130 if (i < mant->n - 1) mant->digits[i + 1] = quotient;
131 quotient = (uint32_t)(dividend / divisor);
132 remainder = (uint32_t)(dividend % divisor);
133 mant->digits[i] = remainder;
134 }
135 dividend = ((uint64_t)remainder * BIGNAT_BASE) + mant->first_digit;
136 if (mant->n && mant->digits[mant->n - 1] == 0) mant->n--;
137 mant->first_digit = (uint32_t)(dividend / divisor);
138 }
139
140 /* Shift left by a multiple of BIGNAT_NBIT */
bignat_lshift_n(struct BigNat * mant,int n)141 static void bignat_lshift_n(struct BigNat *mant, int n) {
142 if (!n) return;
143 int32_t oldn = mant->n;
144 bignat_extra(mant, n);
145 memmove(mant->digits + n, mant->digits, sizeof(uint32_t) * oldn);
146 memset(mant->digits, 0, sizeof(uint32_t) * (n - 1));
147 mant->digits[n - 1] = mant->first_digit;
148 mant->first_digit = 0;
149 }
150
151 #ifdef __GNUC__
152 #define clz(x) __builtin_clz(x)
153 #else
clz(uint32_t x)154 static int clz(uint32_t x) {
155 int n = 0;
156 if (x <= 0x0000ffff) n += 16, x <<= 16;
157 if (x <= 0x00ffffff) n += 8, x <<= 8;
158 if (x <= 0x0fffffff) n += 4, x <<= 4;
159 if (x <= 0x3fffffff) n += 2, x <<= 2;
160 if (x <= 0x7fffffff) n ++;
161 return n;
162 }
163 #endif
164
165 /* Extract double value from mantissa */
bignat_extract(struct BigNat * mant,int32_t exponent2)166 static double bignat_extract(struct BigNat *mant, int32_t exponent2) {
167 uint64_t top53;
168 int32_t n = mant->n;
169 /* Get most significant 53 bits from mant. Bit 52 (0 indexed) should
170 * always be 1. This is essentially a large right shift on mant.*/
171 if (n) {
172 /* Two or more digits */
173 uint64_t d1 = mant->digits[n - 1]; /* MSD (non-zero) */
174 uint64_t d2 = (n == 1) ? mant->first_digit : mant->digits[n - 2];
175 uint64_t d3 = (n > 2) ? mant->digits[n - 3] : (n == 2) ? mant->first_digit : 0;
176 int lz = clz((uint32_t) d1);
177 int nbits = 32 - lz;
178 /* First get 54 bits */
179 top53 = (d2 << (54 - BIGNAT_NBIT)) + (d3 >> (2 * BIGNAT_NBIT - 54));
180 top53 >>= nbits;
181 top53 |= (d1 << (54 - nbits));
182 /* Rounding based on lowest bit of 54 */
183 if (top53 & 1) top53++;
184 top53 >>= 1;
185 if (top53 > 0x1FffffFFFFffffUL) {
186 top53 >>= 1;
187 exponent2++;
188 }
189 /* Correct exponent - to correct for large right shift to mantissa. */
190 exponent2 += (nbits - 53) + BIGNAT_NBIT * n;
191 } else {
192 /* One digit */
193 top53 = mant->first_digit;
194 }
195 return ldexp((double)top53, exponent2);
196 }
197
198 /* Read in a mantissa and exponent of a certain base, and give
199 * back the double value. Should properly handle 0s, infinities, and
200 * denormalized numbers. (When the exponent values are too large or small) */
convert(int negative,struct BigNat * mant,int32_t base,int32_t exponent)201 static double convert(
202 int negative,
203 struct BigNat *mant,
204 int32_t base,
205 int32_t exponent) {
206
207 int32_t exponent2 = 0;
208
209 /* Approximate exponent in base 2 of mant and exponent. This should get us a good estimate of the final size of the
210 * number, within * 2^32 or so. */
211 int64_t mant_exp2_approx = mant->n * 32 + 16;
212 int64_t exp_exp2_approx = (int64_t)(floor(log2(base) * exponent));
213 int64_t exp2_approx = mant_exp2_approx + exp_exp2_approx;
214
215 /* Short circuit zero, huge, and small numbers. We use the exponent range of valid IEEE754 doubles (-1022, 1023)
216 * with a healthy buffer to allow for inaccuracies in the approximation and denormailzed numbers. */
217 if (mant->n == 0 && mant->first_digit == 0)
218 return negative ? -0.0 : 0.0;
219 if (exp2_approx > 1176)
220 return negative ? -INFINITY : INFINITY;
221 if (exp2_approx < -1175)
222 return negative ? -0.0 : 0.0;
223
224 /* Final value is X = mant * base ^ exponent * 2 ^ exponent2
225 * Get exponent to zero while holding X constant. */
226
227 /* Positive exponents are simple */
228 for (; exponent > 3; exponent -= 4) bignat_muladd(mant, base * base * base * base, 0);
229 for (; exponent > 1; exponent -= 2) bignat_muladd(mant, base * base, 0);
230 for (; exponent > 0; exponent -= 1) bignat_muladd(mant, base, 0);
231
232 /* Negative exponents are tricky - we don't want to loose bits
233 * from integer division, so we need to premultiply. */
234 if (exponent < 0) {
235 int32_t shamt = 5 - exponent / 4;
236 bignat_lshift_n(mant, shamt);
237 exponent2 -= shamt * BIGNAT_NBIT;
238 for (; exponent < -3; exponent += 4) bignat_div(mant, base * base * base * base);
239 for (; exponent < -1; exponent += 2) bignat_div(mant, base * base);
240 for (; exponent < 0; exponent += 1) bignat_div(mant, base);
241 }
242
243 return negative
244 ? -bignat_extract(mant, exponent2)
245 : bignat_extract(mant, exponent2);
246 }
247
248 /* Scan a real (double) from a string. If the string cannot be converted into
249 * and integer, return 0. */
janet_scan_number_base(const uint8_t * str,int32_t len,int32_t base,double * out)250 int janet_scan_number_base(
251 const uint8_t *str,
252 int32_t len,
253 int32_t base,
254 double *out) {
255 const uint8_t *end = str + len;
256 int seenadigit = 0;
257 int ex = 0;
258 int seenpoint = 0;
259 int foundexp = 0;
260 int neg = 0;
261 struct BigNat mant;
262 bignat_zero(&mant);
263
264 /* Prevent some kinds of overflow bugs relating to the exponent
265 * overflowing. For example, if a string was passed 2GB worth of 0s after
266 * the decimal point, exponent could wrap around and become positive. It's
267 * easier to reject ridiculously large inputs than to check for overflows.
268 * */
269 if (len > INT32_MAX / 40) goto error;
270
271 /* Get sign */
272 if (str >= end) goto error;
273 if (*str == '-') {
274 neg = 1;
275 str++;
276 } else if (*str == '+') {
277 str++;
278 }
279
280 /* Check for leading 0x or digit digit r */
281 if (base == 0) {
282 if (str + 1 < end && str[0] == '0' && str[1] == 'x') {
283 base = 16;
284 str += 2;
285 } else if (str + 1 < end &&
286 str[0] >= '0' && str[0] <= '9' &&
287 str[1] == 'r') {
288 base = str[0] - '0';
289 str += 2;
290 } else if (str + 2 < end &&
291 str[0] >= '0' && str[0] <= '9' &&
292 str[1] >= '0' && str[1] <= '9' &&
293 str[2] == 'r') {
294 base = 10 * (str[0] - '0') + (str[1] - '0');
295 if (base < 2 || base > 36) goto error;
296 str += 3;
297 }
298 }
299
300 /* If still base is 0, set to default (10) */
301 if (base == 0) {
302 base = 10;
303 }
304
305 /* Skip leading zeros */
306 while (str < end && (*str == '0' || *str == '.')) {
307 if (seenpoint) ex--;
308 if (*str == '.') {
309 if (seenpoint) goto error;
310 seenpoint = 1;
311 } else {
312 seenadigit = 1;
313 }
314 str++;
315 }
316
317 /* Parse significant digits */
318 while (str < end) {
319 if (*str == '.') {
320 if (seenpoint) goto error;
321 seenpoint = 1;
322 } else if (*str == '&') {
323 foundexp = 1;
324 break;
325 } else if (base == 10 && (*str == 'E' || *str == 'e')) {
326 foundexp = 1;
327 break;
328 } else if (*str == '_') {
329 if (!seenadigit) goto error;
330 } else {
331 int digit = digit_lookup[*str & 0x7F];
332 if (*str > 127 || digit >= base) goto error;
333 if (seenpoint) ex--;
334 bignat_muladd(&mant, base, digit);
335 seenadigit = 1;
336 }
337 str++;
338 }
339
340 if (!seenadigit)
341 goto error;
342
343 /* Read exponent */
344 if (str < end && foundexp) {
345 int eneg = 0;
346 int32_t ee = 0;
347 seenadigit = 0;
348 str++;
349 if (str >= end) goto error;
350 if (*str == '-') {
351 eneg = 1;
352 str++;
353 } else if (*str == '+') {
354 str++;
355 }
356 /* Skip leading 0s in exponent */
357 while (str < end && *str == '0') {
358 str++;
359 seenadigit = 1;
360 }
361 while (str < end) {
362 int digit = digit_lookup[*str & 0x7F];
363 if (*str > 127 || digit >= base) goto error;
364 if (ee < (INT32_MAX / 40)) {
365 ee = base * ee + digit;
366 }
367 str++;
368 seenadigit = 1;
369 }
370 if (eneg) ex -= ee;
371 else ex += ee;
372 }
373
374 if (!seenadigit)
375 goto error;
376
377 *out = convert(neg, &mant, base, ex);
378 janet_free(mant.digits);
379 return 0;
380
381 error:
382 janet_free(mant.digits);
383 return 1;
384 }
385
janet_scan_number(const uint8_t * str,int32_t len,double * out)386 int janet_scan_number(
387 const uint8_t *str,
388 int32_t len,
389 double *out) {
390 return janet_scan_number_base(str, len, 0, out);
391 }
392
393 #ifdef JANET_INT_TYPES
394
scan_uint64(const uint8_t * str,int32_t len,uint64_t * out,int * neg)395 static int scan_uint64(
396 const uint8_t *str,
397 int32_t len,
398 uint64_t *out,
399 int *neg) {
400 const uint8_t *end = str + len;
401 int seenadigit = 0;
402 int base = 10;
403 *neg = 0;
404 *out = 0;
405 uint64_t accum = 0;
406 /* len max is INT64_MAX in base 2 with _ between each bits */
407 /* '2r' + 64 bits + 63 _ + sign = 130 => 150 for some leading */
408 /* zeros */
409 if (len > 150) return 0;
410 /* Get sign */
411 if (str >= end) return 0;
412 if (*str == '-') {
413 *neg = 1;
414 str++;
415 } else if (*str == '+') {
416 str++;
417 }
418 /* Check for leading 0x or digit digit r */
419 if (str + 1 < end && str[0] == '0' && str[1] == 'x') {
420 base = 16;
421 str += 2;
422 } else if (str + 1 < end &&
423 str[0] >= '0' && str[0] <= '9' &&
424 str[1] == 'r') {
425 base = str[0] - '0';
426 str += 2;
427 } else if (str + 2 < end &&
428 str[0] >= '0' && str[0] <= '9' &&
429 str[1] >= '0' && str[1] <= '9' &&
430 str[2] == 'r') {
431 base = 10 * (str[0] - '0') + (str[1] - '0');
432 if (base < 2 || base > 36) return 0;
433 str += 3;
434 }
435
436 /* Skip leading zeros */
437 while (str < end && *str == '0') {
438 seenadigit = 1;
439 str++;
440 }
441 /* Parse significant digits */
442 while (str < end) {
443 if (*str == '_') {
444 if (!seenadigit) return 0;
445 } else {
446 int digit = digit_lookup[*str & 0x7F];
447 if (*str > 127 || digit >= base) return 0;
448 if (accum > (UINT64_MAX - digit) / base) return 0;
449 accum = accum * base + digit;
450 seenadigit = 1;
451 }
452 str++;
453 }
454
455 if (!seenadigit) return 0;
456 *out = accum;
457 return 1;
458 }
459
janet_scan_int64(const uint8_t * str,int32_t len,int64_t * out)460 int janet_scan_int64(const uint8_t *str, int32_t len, int64_t *out) {
461 int neg;
462 uint64_t bi;
463 if (scan_uint64(str, len, &bi, &neg)) {
464 if (neg && bi <= ((UINT64_MAX / 2) + 1)) {
465 if (bi > INT64_MAX) {
466 *out = INT64_MIN;
467 } else {
468 *out = -((int64_t) bi);
469 }
470 return 1;
471 }
472 if (!neg && bi <= INT64_MAX) {
473 *out = (int64_t) bi;
474 return 1;
475 }
476 }
477 return 0;
478 }
479
janet_scan_uint64(const uint8_t * str,int32_t len,uint64_t * out)480 int janet_scan_uint64(const uint8_t *str, int32_t len, uint64_t *out) {
481 int neg;
482 uint64_t bi;
483 if (scan_uint64(str, len, &bi, &neg)) {
484 if (!neg) {
485 *out = bi;
486 return 1;
487 }
488 }
489 return 0;
490 }
491
492 #endif
493