1 /*
2 * Copyright (c) 2021 Calvin Rose
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to
6 * deal in the Software without restriction, including without limitation the
7 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 * sell copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20 * IN THE SOFTWARE.
21 */
22 
23 /* Use a custom double parser instead of libc's strtod for better portability
24  * and control.
25  *
26  * This version has been modified for much greater flexibility in parsing, such
27  * as choosing the radix and supporting scientific notation with any radix.
28  *
29  * Numbers are of the form [-+]R[rR]I.F[eE&][-+]X in pseudo-regex form, where R
30  * is the radix, I is the integer part, F is the fractional part, and X is the
31  * exponent. All signs, radix, decimal point, fractional part, and exponent can
32  * be omitted.  The radix is assumed to be 10 if omitted, and the E or e
33  * separator for the exponent can only be used when the radix is 10. This is
34  * because E is a valid digit in bases 15 or greater. For bases greater than
35  * 10, the letters are used as digits. A through Z correspond to the digits 10
36  * through 35, and the lowercase letters have the same values. The radix number
37  * is always in base 10. For example, a hexidecimal number could be written
38  * '16rdeadbeef'. janet_scan_number also supports some c style syntax for
39  * hexidecimal literals. The previous number could also be written
40  * '0xdeadbeef'.
41  */
42 
43 #ifndef JANET_AMALG
44 #include "features.h"
45 #include <janet.h>
46 #include "util.h"
47 #endif
48 
49 #include <math.h>
50 #include <string.h>
51 
52 /* Lookup table for getting values of characters when parsing numbers. Handles
53  * digits 0-9 and a-z (and A-Z). A-Z have values of 10 to 35. */
54 static uint8_t digit_lookup[128] = {
55     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
56     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
57     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
58     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
59     0xff, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
60     25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 0xff, 0xff, 0xff, 0xff, 0xff,
61     0xff, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
62     25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 0xff, 0xff, 0xff, 0xff, 0xff
63 };
64 
65 #define BIGNAT_NBIT 31
66 #define BIGNAT_BASE 0x80000000U
67 
68 /* Allow for large mantissa. BigNat is a natural number. */
69 struct BigNat {
70     uint32_t first_digit; /* First digit so we don't need to allocate when not needed. */
71     int32_t n; /* n digits */
72     int32_t cap; /* allocated digit capacity */
73     uint32_t *digits; /* Each digit is base (2 ^ 31). Digits are least significant first. */
74 };
75 
76 /* Initialize a bignat to 0 */
bignat_zero(struct BigNat * x)77 static void bignat_zero(struct BigNat *x) {
78     x->first_digit = 0;
79     x->n = 0;
80     x->cap = 0;
81     x->digits = NULL;
82 }
83 
84 /* Allocate n more digits for mant. Return a pointer to these digits. */
bignat_extra(struct BigNat * mant,int32_t n)85 static uint32_t *bignat_extra(struct BigNat *mant, int32_t n) {
86     int32_t oldn = mant->n;
87     int32_t newn = oldn + n;
88     if (mant->cap < newn) {
89         int32_t newcap = 2 * newn;
90         uint32_t *mem = janet_realloc(mant->digits, (size_t) newcap * sizeof(uint32_t));
91         if (NULL == mem) {
92             JANET_OUT_OF_MEMORY;
93         }
94         mant->cap = newcap;
95         mant->digits = mem;
96     }
97     mant->n = newn;
98     return mant->digits + oldn;
99 }
100 
101 /* Append a digit */
bignat_append(struct BigNat * mant,uint32_t dig)102 static void bignat_append(struct BigNat *mant, uint32_t dig) {
103     bignat_extra(mant, 1)[0] = dig;
104 }
105 
106 /* Multiply the mantissa mant by a factor and the add a term
107  * in one operation. factor will be between 2 and 36^4,
108  * term will be between 0 and 36. */
bignat_muladd(struct BigNat * mant,uint32_t factor,uint32_t term)109 static void bignat_muladd(struct BigNat *mant, uint32_t factor, uint32_t term) {
110     int32_t i;
111     uint64_t carry = ((uint64_t) mant->first_digit) * factor + term;
112     mant->first_digit = carry % BIGNAT_BASE;
113     carry /= BIGNAT_BASE;
114     for (i = 0; i < mant->n; i++) {
115         carry += ((uint64_t) mant->digits[i]) * factor;
116         mant->digits[i] = carry % BIGNAT_BASE;
117         carry /= BIGNAT_BASE;
118     }
119     if (carry) bignat_append(mant, (uint32_t) carry);
120 }
121 
122 /* Divide the mantissa mant by a factor. Drop the remainder. */
bignat_div(struct BigNat * mant,uint32_t divisor)123 static void bignat_div(struct BigNat *mant, uint32_t divisor) {
124     int32_t i;
125     uint32_t quotient, remainder;
126     uint64_t dividend;
127     remainder = 0, quotient = 0;
128     for (i = mant->n - 1; i >= 0; i--) {
129         dividend = ((uint64_t)remainder * BIGNAT_BASE) + mant->digits[i];
130         if (i < mant->n - 1) mant->digits[i + 1] = quotient;
131         quotient = (uint32_t)(dividend / divisor);
132         remainder = (uint32_t)(dividend % divisor);
133         mant->digits[i] = remainder;
134     }
135     dividend = ((uint64_t)remainder * BIGNAT_BASE) + mant->first_digit;
136     if (mant->n && mant->digits[mant->n - 1] == 0) mant->n--;
137     mant->first_digit = (uint32_t)(dividend / divisor);
138 }
139 
140 /* Shift left by a multiple of BIGNAT_NBIT */
bignat_lshift_n(struct BigNat * mant,int n)141 static void bignat_lshift_n(struct BigNat *mant, int n) {
142     if (!n) return;
143     int32_t oldn = mant->n;
144     bignat_extra(mant, n);
145     memmove(mant->digits + n, mant->digits, sizeof(uint32_t) * oldn);
146     memset(mant->digits, 0, sizeof(uint32_t) * (n - 1));
147     mant->digits[n - 1] = mant->first_digit;
148     mant->first_digit = 0;
149 }
150 
151 #ifdef __GNUC__
152 #define clz(x) __builtin_clz(x)
153 #else
clz(uint32_t x)154 static int clz(uint32_t x) {
155     int n = 0;
156     if (x <= 0x0000ffff) n += 16, x <<= 16;
157     if (x <= 0x00ffffff) n += 8, x <<= 8;
158     if (x <= 0x0fffffff) n += 4, x <<= 4;
159     if (x <= 0x3fffffff) n += 2, x <<= 2;
160     if (x <= 0x7fffffff) n ++;
161     return n;
162 }
163 #endif
164 
165 /* Extract double value from mantissa */
bignat_extract(struct BigNat * mant,int32_t exponent2)166 static double bignat_extract(struct BigNat *mant, int32_t exponent2) {
167     uint64_t top53;
168     int32_t n = mant->n;
169     /* Get most significant 53 bits from mant. Bit 52 (0 indexed) should
170      * always be 1. This is essentially a large right shift on mant.*/
171     if (n) {
172         /* Two or more digits */
173         uint64_t d1 = mant->digits[n - 1]; /* MSD (non-zero) */
174         uint64_t d2 = (n == 1) ? mant->first_digit : mant->digits[n - 2];
175         uint64_t d3 = (n > 2) ? mant->digits[n - 3] : (n == 2) ? mant->first_digit : 0;
176         int lz = clz((uint32_t) d1);
177         int nbits = 32 - lz;
178         /* First get 54 bits */
179         top53 = (d2 << (54 - BIGNAT_NBIT)) + (d3 >> (2 * BIGNAT_NBIT - 54));
180         top53 >>= nbits;
181         top53 |= (d1 << (54 - nbits));
182         /* Rounding based on lowest bit of 54 */
183         if (top53 & 1) top53++;
184         top53 >>= 1;
185         if (top53 > 0x1FffffFFFFffffUL) {
186             top53 >>= 1;
187             exponent2++;
188         }
189         /* Correct exponent - to correct for large right shift to mantissa. */
190         exponent2 += (nbits - 53) + BIGNAT_NBIT * n;
191     } else {
192         /* One digit */
193         top53 = mant->first_digit;
194     }
195     return ldexp((double)top53, exponent2);
196 }
197 
198 /* Read in a mantissa and exponent of a certain base, and give
199  * back the double value. Should properly handle 0s, infinities, and
200  * denormalized numbers. (When the exponent values are too large or small) */
convert(int negative,struct BigNat * mant,int32_t base,int32_t exponent)201 static double convert(
202     int negative,
203     struct BigNat *mant,
204     int32_t base,
205     int32_t exponent) {
206 
207     int32_t exponent2 = 0;
208 
209     /* Approximate exponent in base 2 of mant and exponent. This should get us a good estimate of the final size of the
210      * number, within * 2^32 or so. */
211     int64_t mant_exp2_approx = mant->n * 32 + 16;
212     int64_t exp_exp2_approx = (int64_t)(floor(log2(base) * exponent));
213     int64_t exp2_approx = mant_exp2_approx + exp_exp2_approx;
214 
215     /* Short circuit zero, huge, and small numbers. We use the exponent range of valid IEEE754 doubles (-1022, 1023)
216      * with a healthy buffer to allow for inaccuracies in the approximation and denormailzed numbers. */
217     if (mant->n == 0 && mant->first_digit == 0)
218         return negative ? -0.0 : 0.0;
219     if (exp2_approx > 1176)
220         return negative ? -INFINITY : INFINITY;
221     if (exp2_approx < -1175)
222         return negative ? -0.0 : 0.0;
223 
224     /* Final value is X = mant * base ^ exponent * 2 ^ exponent2
225      * Get exponent to zero while holding X constant. */
226 
227     /* Positive exponents are simple */
228     for (; exponent > 3; exponent -= 4) bignat_muladd(mant, base * base * base * base, 0);
229     for (; exponent > 1; exponent -= 2) bignat_muladd(mant, base * base, 0);
230     for (; exponent > 0; exponent -= 1) bignat_muladd(mant, base, 0);
231 
232     /* Negative exponents are tricky - we don't want to loose bits
233      * from integer division, so we need to premultiply. */
234     if (exponent < 0) {
235         int32_t shamt = 5 - exponent / 4;
236         bignat_lshift_n(mant, shamt);
237         exponent2 -= shamt * BIGNAT_NBIT;
238         for (; exponent < -3; exponent += 4) bignat_div(mant, base * base * base * base);
239         for (; exponent < -1; exponent += 2) bignat_div(mant, base * base);
240         for (; exponent <  0; exponent += 1) bignat_div(mant, base);
241     }
242 
243     return negative
244            ? -bignat_extract(mant, exponent2)
245            : bignat_extract(mant, exponent2);
246 }
247 
248 /* Scan a real (double) from a string. If the string cannot be converted into
249  * and integer, return 0. */
janet_scan_number_base(const uint8_t * str,int32_t len,int32_t base,double * out)250 int janet_scan_number_base(
251     const uint8_t *str,
252     int32_t len,
253     int32_t base,
254     double *out) {
255     const uint8_t *end = str + len;
256     int seenadigit = 0;
257     int ex = 0;
258     int seenpoint = 0;
259     int foundexp = 0;
260     int neg = 0;
261     struct BigNat mant;
262     bignat_zero(&mant);
263 
264     /* Prevent some kinds of overflow bugs relating to the exponent
265      * overflowing.  For example, if a string was passed 2GB worth of 0s after
266      * the decimal point, exponent could wrap around and become positive. It's
267      * easier to reject ridiculously large inputs than to check for overflows.
268      * */
269     if (len > INT32_MAX / 40) goto error;
270 
271     /* Get sign */
272     if (str >= end) goto error;
273     if (*str == '-') {
274         neg = 1;
275         str++;
276     } else if (*str == '+') {
277         str++;
278     }
279 
280     /* Check for leading 0x or digit digit r */
281     if (base == 0) {
282         if (str + 1 < end && str[0] == '0' && str[1] == 'x') {
283             base = 16;
284             str += 2;
285         } else if (str + 1 < end  &&
286                    str[0] >= '0' && str[0] <= '9' &&
287                    str[1] == 'r') {
288             base = str[0] - '0';
289             str += 2;
290         } else if (str + 2 < end  &&
291                    str[0] >= '0' && str[0] <= '9' &&
292                    str[1] >= '0' && str[1] <= '9' &&
293                    str[2] == 'r') {
294             base = 10 * (str[0] - '0') + (str[1] - '0');
295             if (base < 2 || base > 36) goto error;
296             str += 3;
297         }
298     }
299 
300     /* If still base is 0, set to default (10) */
301     if (base == 0) {
302         base = 10;
303     }
304 
305     /* Skip leading zeros */
306     while (str < end && (*str == '0' || *str == '.')) {
307         if (seenpoint) ex--;
308         if (*str == '.') {
309             if (seenpoint) goto error;
310             seenpoint = 1;
311         } else {
312             seenadigit = 1;
313         }
314         str++;
315     }
316 
317     /* Parse significant digits */
318     while (str < end) {
319         if (*str == '.') {
320             if (seenpoint) goto error;
321             seenpoint = 1;
322         } else if (*str == '&') {
323             foundexp = 1;
324             break;
325         } else if (base == 10 && (*str == 'E' || *str == 'e')) {
326             foundexp = 1;
327             break;
328         } else if (*str == '_') {
329             if (!seenadigit) goto error;
330         } else {
331             int digit = digit_lookup[*str & 0x7F];
332             if (*str > 127 || digit >= base) goto error;
333             if (seenpoint) ex--;
334             bignat_muladd(&mant, base, digit);
335             seenadigit = 1;
336         }
337         str++;
338     }
339 
340     if (!seenadigit)
341         goto error;
342 
343     /* Read exponent */
344     if (str < end && foundexp) {
345         int eneg = 0;
346         int32_t ee = 0;
347         seenadigit = 0;
348         str++;
349         if (str >= end) goto error;
350         if (*str == '-') {
351             eneg = 1;
352             str++;
353         } else if (*str == '+') {
354             str++;
355         }
356         /* Skip leading 0s in exponent */
357         while (str < end && *str == '0') {
358             str++;
359             seenadigit = 1;
360         }
361         while (str < end) {
362             int digit = digit_lookup[*str & 0x7F];
363             if (*str > 127 || digit >= base) goto error;
364             if (ee < (INT32_MAX / 40)) {
365                 ee = base * ee + digit;
366             }
367             str++;
368             seenadigit = 1;
369         }
370         if (eneg) ex -= ee;
371         else ex += ee;
372     }
373 
374     if (!seenadigit)
375         goto error;
376 
377     *out = convert(neg, &mant, base, ex);
378     janet_free(mant.digits);
379     return 0;
380 
381 error:
382     janet_free(mant.digits);
383     return 1;
384 }
385 
janet_scan_number(const uint8_t * str,int32_t len,double * out)386 int janet_scan_number(
387     const uint8_t *str,
388     int32_t len,
389     double *out) {
390     return janet_scan_number_base(str, len, 0, out);
391 }
392 
393 #ifdef JANET_INT_TYPES
394 
scan_uint64(const uint8_t * str,int32_t len,uint64_t * out,int * neg)395 static int scan_uint64(
396     const uint8_t *str,
397     int32_t len,
398     uint64_t *out,
399     int *neg) {
400     const uint8_t *end = str + len;
401     int seenadigit = 0;
402     int base = 10;
403     *neg = 0;
404     *out = 0;
405     uint64_t accum = 0;
406     /* len max is INT64_MAX in base 2 with _ between each bits */
407     /* '2r' + 64 bits + 63 _  + sign = 130 => 150 for some leading  */
408     /* zeros */
409     if (len > 150) return 0;
410     /* Get sign */
411     if (str >= end) return 0;
412     if (*str == '-') {
413         *neg = 1;
414         str++;
415     } else if (*str == '+') {
416         str++;
417     }
418     /* Check for leading 0x or digit digit r */
419     if (str + 1 < end && str[0] == '0' && str[1] == 'x') {
420         base = 16;
421         str += 2;
422     } else if (str + 1 < end  &&
423                str[0] >= '0' && str[0] <= '9' &&
424                str[1] == 'r') {
425         base = str[0] - '0';
426         str += 2;
427     } else if (str + 2 < end  &&
428                str[0] >= '0' && str[0] <= '9' &&
429                str[1] >= '0' && str[1] <= '9' &&
430                str[2] == 'r') {
431         base = 10 * (str[0] - '0') + (str[1] - '0');
432         if (base < 2 || base > 36) return 0;
433         str += 3;
434     }
435 
436     /* Skip leading zeros */
437     while (str < end && *str == '0') {
438         seenadigit = 1;
439         str++;
440     }
441     /* Parse significant digits */
442     while (str < end) {
443         if (*str == '_') {
444             if (!seenadigit) return 0;
445         } else {
446             int digit = digit_lookup[*str & 0x7F];
447             if (*str > 127 || digit >= base) return 0;
448             if (accum > (UINT64_MAX - digit) / base) return 0;
449             accum = accum * base + digit;
450             seenadigit = 1;
451         }
452         str++;
453     }
454 
455     if (!seenadigit) return 0;
456     *out = accum;
457     return 1;
458 }
459 
janet_scan_int64(const uint8_t * str,int32_t len,int64_t * out)460 int janet_scan_int64(const uint8_t *str, int32_t len, int64_t *out) {
461     int neg;
462     uint64_t bi;
463     if (scan_uint64(str, len, &bi, &neg)) {
464         if (neg && bi <= ((UINT64_MAX / 2) + 1)) {
465             if (bi > INT64_MAX) {
466                 *out = INT64_MIN;
467             } else {
468                 *out = -((int64_t) bi);
469             }
470             return 1;
471         }
472         if (!neg && bi <= INT64_MAX) {
473             *out = (int64_t) bi;
474             return 1;
475         }
476     }
477     return 0;
478 }
479 
janet_scan_uint64(const uint8_t * str,int32_t len,uint64_t * out)480 int janet_scan_uint64(const uint8_t *str, int32_t len, uint64_t *out) {
481     int neg;
482     uint64_t bi;
483     if (scan_uint64(str, len, &bi, &neg)) {
484         if (!neg) {
485             *out = bi;
486             return 1;
487         }
488     }
489     return 0;
490 }
491 
492 #endif
493