1 /* __gmp_doscan -- formatted input internals.
2
3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 FUTURE GNU MP RELEASES.
6
7 Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
8
9 This file is part of the GNU MP Library.
10
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of the GNU Lesser General Public License as published by
13 the Free Software Foundation; either version 3 of the License, or (at your
14 option) any later version.
15
16 The GNU MP Library is distributed in the hope that it will be useful, but
17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
19 License for more details.
20
21 You should have received a copy of the GNU Lesser General Public License
22 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
23
24 #define _GNU_SOURCE /* for DECIMAL_POINT in langinfo.h */
25
26 #include "config.h"
27
28 #if HAVE_STDARG
29 #include <stdarg.h>
30 #else
31 #include <varargs.h>
32 #endif
33
34 #include <ctype.h>
35 #include <stddef.h> /* for ptrdiff_t */
36 #include <stdio.h>
37 #include <stdlib.h> /* for strtol */
38 #include <string.h>
39
40 #if HAVE_LANGINFO_H
41 #include <langinfo.h> /* for nl_langinfo */
42 #endif
43
44 #if HAVE_LOCALE_H
45 #include <locale.h> /* for localeconv */
46 #endif
47
48 #if HAVE_INTTYPES_H
49 # include <inttypes.h> /* for intmax_t */
50 #else
51 # if HAVE_STDINT_H
52 # include <stdint.h>
53 # endif
54 #endif
55
56 #if HAVE_SYS_TYPES_H
57 #include <sys/types.h> /* for quad_t */
58 #endif
59
60 #include "gmp.h"
61 #include "gmp-impl.h"
62
63
64 /* Change this to "#define TRACE(x) x" for some traces. */
65 #define TRACE(x)
66
67
68 /* General:
69
70 It's necessary to parse up the format string to recognise the GMP
71 extra types F, Q and Z. Other types and conversions are passed
72 across to the standard sscanf or fscanf via funs->scan, for ease of
73 implementation. This is essential in the case of something like glibc
74 %p where the pointer format isn't actually documented.
75
76 Because funs->scan doesn't get the whole input it can't put the right
77 values in for %n, so that's handled in __gmp_doscan. Neither sscanf
78 nor fscanf directly indicate how many characters were read, so an
79 extra %n is appended to each run for that. For fscanf this merely
80 supports our %n output, but for sscanf it lets funs->step move us
81 along the input string.
82
83 Whitespace and literal matches in the format string, including %%,
84 are handled directly within __gmp_doscan. This is reasonably
85 efficient, and avoids some suspicious behaviour observed in various
86 system libc's. GLIBC 2.2.4 for instance returns 0 on
87
88 sscanf(" ", " x")
89 or
90 sscanf(" ", " x%d",&n)
91
92 whereas we think they should return EOF, since end-of-string is
93 reached when a match of "x" is required.
94
95 For standard % conversions, funs->scan is called once for each
96 conversion. If we had vfscanf and vsscanf and could rely on their
97 fixed text matching behaviour then we could call them with multiple
98 consecutive standard conversions. But plain fscanf and sscanf work
99 fine, and parsing one field at a time shouldn't be too much of a
100 slowdown.
101
102 gmpscan:
103
104 gmpscan reads a gmp type. It's only used from one place, but is a
105 separate subroutine to avoid a big chunk of complicated code in the
106 middle of __gmp_doscan. Within gmpscan a couple of loopbacks make it
107 possible to share code for parsing integers, rationals and floats.
108
109 In gmpscan normally one char of lookahead is maintained, but when width
110 is reached that stops, on the principle that an fgetc/ungetc of a char
111 past where we're told to stop would be undesirable. "chars" is how many
112 characters have been read so far, including the current c. When
113 chars==width and another character is desired then a jump is done to the
114 "convert" stage. c is invalid and mustn't be unget'ed in this case;
115 chars is set to width+1 to indicate that.
116
117 gmpscan normally returns the number of characters read. -1 means an
118 invalid field, -2 means EOF reached before any matching characters
119 were read.
120
For hex floats, the mantissa part is passed to mpf_set_str, then the
exponent is applied with mpf_mul_2exp or mpf_div_2exp. This is easier
than teaching mpf_set_str about an exponent factor (ie. 2) differing
from the mantissa radix point factor (ie. 16). mpf_mul_2exp and
mpf_div_2exp will preserve the application requested precision, so
nothing in that respect is lost by making this a two-step process.
127
128 Matching and errors:
129
130 C99 7.19.6.2 paras 9 and 10 say an input item is read as the longest
131 string which is a match for the appropriate type, or a prefix of a
132 match. With that done, if it's only a prefix then the result is a
133 matching failure, ie. invalid input.
134
135 This rule seems fairly clear, but doesn't seem to be universally
136 applied in system C libraries. Even GLIBC doesn't seem to get it
137 right, insofar as it seems to accept some apparently invalid forms.
138 Eg. glibc 2.3.1 accepts "0x" for a "%i", where a reading of the
139 standard would suggest a non-empty sequence of digits should be
140 required after an "0x".
141
142 A footnote to 7.19.6.2 para 17 notes how this input item reading can
143 mean inputs acceptable to strtol are not acceptable to fscanf. We
144 think this confirms our reading of "0x" as invalid.
145
146 Clearly gmp_sscanf could backtrack to a longest input which was a
147 valid match for a given item, but this is not done, since C99 says
148 sscanf is identical to fscanf, so we make gmp_sscanf identical to
149 gmp_fscanf.
150
151 Types:
152
153 C99 says "ll" is for long long, and "L" is for long double floats.
154 Unfortunately in GMP 4.1.1 we documented the two as equivalent. This
155 doesn't affect us directly, since both are passed through to plain
156 scanf. It seems wisest not to try to enforce the C99 rule. This is
157 consistent with what we said before, though whether it actually
158 worked was always up to the C library.
159
160 Alternatives:
161
162 Consideration was given to using separate code for gmp_fscanf and
163 gmp_sscanf. The sscanf case could zip across a string doing literal
164 matches or recognising digits in gmpscan, rather than making a
165 function call fun->get per character. The fscanf could use getc
166 rather than fgetc too, which might help those systems where getc is a
167 macro or otherwise inlined. But none of this scanning and converting
168 will be particularly fast, so the two are done together to keep it a
169 little simpler for now.
170
171 Various multibyte string issues are not addressed, for a start C99
172 scanf says the format string is multibyte. Since we pass %c, %s and
173 %[ to the system scanf, they might do multibyte reads already, but
174 it's another matter whether or not that can be used, since our digit
175 and whitespace parsing is only unibyte. The plan is to quietly
176 ignore multibyte locales for now. This is not as bad as it sounds,
177 since GMP is presumably used mostly on numbers, which can be
178 perfectly adequately treated in plain ASCII.
179
180 */
181
182
/* Parsed form of one "%" conversion specification.  __gmp_doscan fills
   this in while walking the format string, and gmpscan reads it when
   converting one of the GMP types.  */
struct gmp_doscan_params_t {
  int   base;    /* 10, 8 or 16 as selected by the conversion character, or
                    0 to have gmpscan recognise a leading "0" or "0x" */
  int   ignore;  /* non-zero when "*" was given: parse but don't store */
  char  type;    /* type modifier character, '\0' when there's none; "hh"
                    is held as 'H' and "ll" as 'L' */
  int   width;   /* maximum field width, 0 when none was given */
};
189
190
/* Fetch the next input character into "c", for use only within gmpscan.
   Relies on gmpscan's locals "chars", "width", "funs" and "data", and on
   its "convert" label: when the field width is already used up, chars is
   left at width+1, nothing is read, and control jumps to "convert".  */
#define GET(c)                                  \
  do {                                          \
    ASSERT (chars <= width);                    \
    chars++;                                    \
    if (chars > width)                          \
      goto convert;                             \
    (c) = (*funs->get) (data);                  \
  } while (0)

/* store into "s", extending if necessary */
/* Relies on gmpscan's locals "s", "s_upto" and "s_alloc"; the buffer is
   grown by S_ALLOC_STEP bytes each time it fills.  */
#define STORE(c)                                                        \
  do {                                                                  \
    ASSERT (s_upto <= s_alloc);                                         \
    if (s_upto >= s_alloc)                                              \
      {                                                                 \
        size_t  s_alloc_new = s_alloc + S_ALLOC_STEP;                   \
        s = __GMP_REALLOCATE_FUNC_TYPE (s, s_alloc, s_alloc_new, char); \
        s_alloc = s_alloc_new;                                          \
      }                                                                 \
    s[s_upto++] = c;                                                    \
  } while (0)

/* Initial size of, and growth increment for, the gmpscan digit buffer. */
#define S_ALLOC_STEP  512
214
215 static int
gmpscan(const struct gmp_doscan_funs_t * funs,void * data,const struct gmp_doscan_params_t * p,void * dst)216 gmpscan (const struct gmp_doscan_funs_t *funs, void *data,
217 const struct gmp_doscan_params_t *p, void *dst)
218 {
219 int chars, c, base, first, width, seen_point, seen_digit, hexfloat;
220 size_t s_upto, s_alloc, hexexp;
221 char *s;
222 int invalid = 0;
223
224 TRACE (printf ("gmpscan\n"));
225
226 ASSERT (p->type == 'F' || p->type == 'Q' || p->type == 'Z');
227
228 c = (*funs->get) (data);
229 if (c == EOF)
230 return -2;
231
232 chars = 1;
233 first = 1;
234 seen_point = 0;
235 width = (p->width == 0 ? INT_MAX-1 : p->width);
236 base = p->base;
237 s_alloc = S_ALLOC_STEP;
238 s = __GMP_ALLOCATE_FUNC_TYPE (s_alloc, char);
239 s_upto = 0;
240 hexfloat = 0;
241 hexexp = 0;
242
243 another:
244 seen_digit = 0;
245 if (c == '-')
246 {
247 STORE (c);
248 goto get_for_sign;
249 }
250 else if (c == '+')
251 {
252 /* don't store '+', it's not accepted by mpz_set_str etc */
253 get_for_sign:
254 GET (c);
255 }
256
257 if (base == 0)
258 {
259 base = 10; /* decimal if no base indicator */
260 if (c == '0')
261 {
262 seen_digit = 1; /* 0 alone is a valid number */
263 if (p->type != 'F')
264 base = 8; /* leading 0 is octal, for non-floats */
265 STORE (c);
266 GET (c);
267 if (c == 'x' || c == 'X')
268 {
269 base = 16;
270 seen_digit = 0; /* must have digits after an 0x */
271 if (p->type == 'F') /* don't pass 'x' to mpf_set_str_point */
272 hexfloat = 1;
273 else
274 STORE (c);
275 GET (c);
276 }
277 }
278 }
279
280 digits:
281 for (;;)
282 {
283 if (base == 16)
284 {
285 if (! isxdigit (c))
286 break;
287 }
288 else
289 {
290 if (! isdigit (c))
291 break;
292 if (base == 8 && (c == '8' || c == '9'))
293 break;
294 }
295
296 seen_digit = 1;
297 STORE (c);
298 GET (c);
299 }
300
301 if (first)
302 {
303 /* decimal point */
304 if (p->type == 'F' && ! seen_point)
305 {
306 /* For a multi-character decimal point, if the first character is
307 present then all of it must be, otherwise the input is
308 considered invalid. */
309 const char *point = GMP_DECIMAL_POINT;
310 int pc = (unsigned char) *point++;
311 if (c == pc)
312 {
313 for (;;)
314 {
315 STORE (c);
316 GET (c);
317 pc = (unsigned char) *point++;
318 if (pc == '\0')
319 break;
320 if (c != pc)
321 goto set_invalid;
322 }
323 seen_point = 1;
324 goto digits;
325 }
326 }
327
328 /* exponent */
329 if (p->type == 'F')
330 {
331 if (hexfloat && (c == 'p' || c == 'P'))
332 {
333 hexexp = s_upto; /* exponent location */
334 base = 10; /* exponent in decimal */
335 goto exponent;
336 }
337 else if (! hexfloat && (c == 'e' || c == 'E'))
338 {
339 exponent:
340 /* must have at least one digit in the mantissa, just an exponent
341 is not good enough */
342 if (! seen_digit)
343 goto set_invalid;
344
345 do_second:
346 first = 0;
347 STORE (c);
348 GET (c);
349 goto another;
350 }
351 }
352
353 /* denominator */
354 if (p->type == 'Q' && c == '/')
355 {
356 /* must have at least one digit in the numerator */
357 if (! seen_digit)
358 goto set_invalid;
359
360 /* now look for at least one digit in the denominator */
361 seen_digit = 0;
362
363 /* allow the base to be redetermined for "%i" */
364 base = p->base;
365 goto do_second;
366 }
367 }
368
369 convert:
370 if (! seen_digit)
371 {
372 set_invalid:
373 invalid = 1;
374 goto done;
375 }
376
377 if (! p->ignore)
378 {
379 STORE ('\0');
380 TRACE (printf (" convert \"%s\"\n", s));
381
382 /* We ought to have parsed out a valid string above, so just test
383 mpz_set_str etc with an ASSERT. */
384 switch (p->type) {
385 case 'F':
386 {
387 mpf_ptr f = (mpf_ptr) dst;
388 if (hexexp != 0)
389 s[hexexp] = '\0';
390 ASSERT_NOCARRY (mpf_set_str (f, s, hexfloat ? 16 : 10));
391 if (hexexp != 0)
392 {
393 char *dummy;
394 long exp;
395 exp = strtol (s + hexexp + 1, &dummy, 10);
396 if (exp >= 0)
397 mpf_mul_2exp (f, f, (unsigned long) exp);
398 else
399 mpf_div_2exp (f, f, - (unsigned long) exp);
400 }
401 }
402 break;
403 case 'Q':
404 ASSERT_NOCARRY (mpq_set_str ((mpq_ptr) dst, s, p->base));
405 break;
406 case 'Z':
407 ASSERT_NOCARRY (mpz_set_str ((mpz_ptr) dst, s, p->base));
408 break;
409 default:
410 ASSERT (0);
411 /*FALLTHRU*/
412 break;
413 }
414 }
415
416 done:
417 ASSERT (chars <= width+1);
418 if (chars != width+1)
419 {
420 (*funs->unget) (c, data);
421 TRACE (printf (" ungetc %d, to give %d chars\n", c, chars-1));
422 }
423 chars--;
424
425 (*__gmp_free_func) (s, s_alloc);
426
427 if (invalid)
428 {
429 TRACE (printf (" invalid\n"));
430 return -1;
431 }
432
433 TRACE (printf (" return %d chars (cf width %d)\n", chars, width));
434 return chars;
435 }
436
437
/* Read and discard whitespace, if any. Return number of chars skipped.
Whitespace skipping never provokes the EOF return from __gmp_doscan, so
it's not necessary to watch for EOF from funs->get. */
441 static int
skip_white(const struct gmp_doscan_funs_t * funs,void * data)442 skip_white (const struct gmp_doscan_funs_t *funs, void *data)
443 {
444 int c;
445 int ret = 0;
446
447 do
448 {
449 c = (funs->get) (data);
450 ret++;
451 }
452 while (isspace (c));
453
454 (funs->unget) (c, data);
455 ret--;
456
457 TRACE (printf (" skip white %d\n", ret));
458 return ret;
459 }
460
461
462 int
__gmp_doscan(const struct gmp_doscan_funs_t * funs,void * data,const char * orig_fmt,va_list orig_ap)463 __gmp_doscan (const struct gmp_doscan_funs_t *funs, void *data,
464 const char *orig_fmt, va_list orig_ap)
465 {
466 struct gmp_doscan_params_t param;
467 va_list ap;
468 char *alloc_fmt;
469 const char *fmt, *this_fmt, *end_fmt;
470 size_t orig_fmt_len, alloc_fmt_size, len;
471 int new_fields, new_chars;
472 char fchar;
473 int fields = 0;
474 int chars = 0;
475
476 TRACE (printf ("__gmp_doscan \"%s\"\n", orig_fmt);
477 if (funs->scan == (gmp_doscan_scan_t) sscanf)
478 printf (" s=\"%s\"\n", * (const char **) data));
479
480 /* Don't modify orig_ap, if va_list is actually an array and hence call by
481 reference. It could be argued that it'd be more efficient to leave
482 callers to make a copy if they care, but doing so here is going to be a
483 very small part of the total work, and we may as well keep applications
484 out of trouble. */
485 va_copy (ap, orig_ap);
486
487 /* Parts of the format string are going to be copied so that a " %n" can
488 be appended. alloc_fmt is some space for that. orig_fmt_len+4 will be
489 needed if fmt consists of a single "%" specifier, but otherwise is an
490 overestimate. We're not going to be very fast here, so use
491 __gmp_allocate_func rather than TMP_ALLOC. */
492 orig_fmt_len = strlen (orig_fmt);
493 alloc_fmt_size = orig_fmt_len + 4;
494 alloc_fmt = __GMP_ALLOCATE_FUNC_TYPE (alloc_fmt_size, char);
495
496 fmt = orig_fmt;
497 end_fmt = orig_fmt + orig_fmt_len;
498
499 for (;;)
500 {
501 next:
502 fchar = *fmt++;
503
504 if (fchar == '\0')
505 break;
506
507 if (isspace (fchar))
508 {
509 chars += skip_white (funs, data);
510 continue;
511 }
512
513 if (fchar != '%')
514 {
515 int c;
516 literal:
517 c = (funs->get) (data);
518 if (c != fchar)
519 {
520 (funs->unget) (c, data);
521 if (c == EOF)
522 {
523 eof_no_match:
524 if (fields == 0)
525 fields = EOF;
526 }
527 goto done;
528 }
529 chars++;
530 continue;
531 }
532
533 param.type = '\0';
534 param.base = 0; /* for e,f,g,i */
535 param.ignore = 0;
536 param.width = 0;
537
538 this_fmt = fmt-1;
539 TRACE (printf (" this_fmt \"%s\"\n", this_fmt));
540
541 for (;;)
542 {
543 ASSERT (fmt <= end_fmt);
544
545 fchar = *fmt++;
546 switch (fchar) {
547
548 case '\0': /* unterminated % sequence */
549 ASSERT (0);
550 goto done;
551
552 case '%': /* literal % */
553 goto literal;
554
555 case '[': /* character range */
556 fchar = *fmt++;
557 if (fchar == '^')
558 fchar = *fmt++;
559 /* ']' allowed as the first char (possibly after '^') */
560 if (fchar == ']')
561 fchar = *fmt++;
562 for (;;)
563 {
564 ASSERT (fmt <= end_fmt);
565 if (fchar == '\0')
566 {
567 /* unterminated % sequence */
568 ASSERT (0);
569 goto done;
570 }
571 if (fchar == ']')
572 break;
573 fchar = *fmt++;
574 }
575 /*FALLTHRU*/
576 case 'c': /* characters */
577 case 's': /* string of non-whitespace */
578 case 'p': /* pointer */
579 libc_type:
580 len = fmt - this_fmt;
581 memcpy (alloc_fmt, this_fmt, len);
582 alloc_fmt[len++] = '%';
583 alloc_fmt[len++] = 'n';
584 alloc_fmt[len] = '\0';
585
586 TRACE (printf (" scan \"%s\"\n", alloc_fmt);
587 if (funs->scan == (gmp_doscan_scan_t) sscanf)
588 printf (" s=\"%s\"\n", * (const char **) data));
589
590 new_chars = -1;
591 if (param.ignore)
592 {
593 new_fields = (*funs->scan) (data, alloc_fmt, &new_chars, NULL);
594 ASSERT (new_fields == 0 || new_fields == EOF);
595 }
596 else
597 {
598 void *arg = va_arg (ap, void *);
599 new_fields = (*funs->scan) (data, alloc_fmt, arg, &new_chars);
600 ASSERT (new_fields==0 || new_fields==1 || new_fields==EOF);
601
602 if (new_fields == 0)
603 goto done; /* invalid input */
604
605 if (new_fields == 1)
606 ASSERT (new_chars != -1);
607 }
608 TRACE (printf (" new_fields %d new_chars %d\n",
609 new_fields, new_chars));
610
611 if (new_fields == -1)
612 goto eof_no_match; /* EOF before anything matched */
613
614 /* Under param.ignore, when new_fields==0 we don't know if
615 it's a successful match or an invalid field. new_chars
616 won't have been assigned if it was an invalid field. */
617 if (new_chars == -1)
618 goto done; /* invalid input */
619
620 chars += new_chars;
621 (*funs->step) (data, new_chars);
622
623 increment_fields:
624 if (! param.ignore)
625 fields++;
626 goto next;
627
628 case 'd': /* decimal */
629 case 'u': /* decimal */
630 param.base = 10;
631 goto numeric;
632
633 case 'e': /* float */
634 case 'E': /* float */
635 case 'f': /* float */
636 case 'g': /* float */
637 case 'G': /* float */
638 case 'i': /* integer with base marker */
639 numeric:
640 if (param.type != 'F' && param.type != 'Q' && param.type != 'Z')
641 goto libc_type;
642
643 chars += skip_white (funs, data);
644
645 new_chars = gmpscan (funs, data, ¶m,
646 param.ignore ? NULL : va_arg (ap, void*));
647 if (new_chars == -2)
648 goto eof_no_match;
649 if (new_chars == -1)
650 goto done;
651
652 ASSERT (new_chars >= 0);
653 chars += new_chars;
654 goto increment_fields;
655
656 case 'a': /* glibc allocate string */
657 case '\'': /* glibc digit groupings */
658 break;
659
660 case 'F': /* mpf_t */
661 case 'j': /* intmax_t */
662 case 'L': /* long long */
663 case 'q': /* quad_t */
664 case 'Q': /* mpq_t */
665 case 't': /* ptrdiff_t */
666 case 'z': /* size_t */
667 case 'Z': /* mpz_t */
668 set_type:
669 param.type = fchar;
670 break;
671
672 case 'h': /* short or char */
673 if (param.type != 'h')
674 goto set_type;
675 param.type = 'H'; /* internal code for "hh" */
676 break;
677
678 goto numeric;
679
680 case 'l': /* long, long long, double or long double */
681 if (param.type != 'l')
682 goto set_type;
683 param.type = 'L'; /* "ll" means "L" */
684 break;
685
686 case 'n':
687 if (! param.ignore)
688 {
689 void *p;
690 p = va_arg (ap, void *);
691 TRACE (printf (" store %%n to %p\n", p));
692 switch (param.type) {
693 case '\0': * (int *) p = chars; break;
694 case 'F': mpf_set_si ((mpf_ptr) p, (long) chars); break;
695 case 'H': * (char *) p = chars; break;
696 case 'h': * (short *) p = chars; break;
697 #if HAVE_INTMAX_T
698 case 'j': * (intmax_t *) p = chars; break;
699 #else
700 case 'j': ASSERT_FAIL (intmax_t not available); break;
701 #endif
702 case 'l': * (long *) p = chars; break;
703 #if HAVE_QUAD_T && HAVE_LONG_LONG
704 case 'q':
705 ASSERT_ALWAYS (sizeof (quad_t) == sizeof (long long));
706 /*FALLTHRU*/
707 #else
708 case 'q': ASSERT_FAIL (quad_t not available); break;
709 #endif
710 #if HAVE_LONG_LONG
711 case 'L': * (long long *) p = chars; break;
712 #else
713 case 'L': ASSERT_FAIL (long long not available); break;
714 #endif
715 case 'Q': mpq_set_si ((mpq_ptr) p, (long) chars, 1L); break;
716 #if HAVE_PTRDIFF_T
717 case 't': * (ptrdiff_t *) p = chars; break;
718 #else
719 case 't': ASSERT_FAIL (ptrdiff_t not available); break;
720 #endif
721 case 'z': * (size_t *) p = chars; break;
722 case 'Z': mpz_set_si ((mpz_ptr) p, (long) chars); break;
723 default: ASSERT (0); break;
724 }
725 }
726 goto next;
727
728 case 'o':
729 param.base = 8;
730 goto numeric;
731
732 case 'x':
733 case 'X':
734 param.base = 16;
735 goto numeric;
736
737 case '0': case '1': case '2': case '3': case '4':
738 case '5': case '6': case '7': case '8': case '9':
739 param.width = 0;
740 do {
741 param.width = param.width * 10 + (fchar-'0');
742 fchar = *fmt++;
743 } while (isdigit (fchar));
744 fmt--; /* unget the non-digit */
745 break;
746
747 case '*':
748 param.ignore = 1;
749 break;
750
751 default:
752 /* something invalid in a % sequence */
753 ASSERT (0);
754 goto next;
755 }
756 }
757 }
758
759 done:
760 (*__gmp_free_func) (alloc_fmt, alloc_fmt_size);
761 return fields;
762 }
763