1 /* __gmp_doscan -- formatted input internals. 2 3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST 4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN 5 FUTURE GNU MP RELEASES. 6 7 Copyright 2001, 2002, 2003 Free Software Foundation, Inc. 8 9 This file is part of the GNU MP Library. 10 11 The GNU MP Library is free software; you can redistribute it and/or modify 12 it under the terms of the GNU Lesser General Public License as published by 13 the Free Software Foundation; either version 3 of the License, or (at your 14 option) any later version. 15 16 The GNU MP Library is distributed in the hope that it will be useful, but 17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 18 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 19 License for more details. 20 21 You should have received a copy of the GNU Lesser General Public License 22 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 23 24 #define _GNU_SOURCE /* for DECIMAL_POINT in langinfo.h */ 25 26 #include "config.h" 27 28 #if HAVE_STDARG 29 #include <stdarg.h> 30 #else 31 #include <varargs.h> 32 #endif 33 34 #include <ctype.h> 35 #include <stddef.h> /* for ptrdiff_t */ 36 #include <stdio.h> 37 #include <stdlib.h> /* for strtol */ 38 #include <string.h> 39 40 #if HAVE_LANGINFO_H 41 #include <langinfo.h> /* for nl_langinfo */ 42 #endif 43 44 #if HAVE_LOCALE_H 45 #include <locale.h> /* for localeconv */ 46 #endif 47 48 #if HAVE_INTTYPES_H 49 # include <inttypes.h> /* for intmax_t */ 50 #else 51 # if HAVE_STDINT_H 52 # include <stdint.h> 53 # endif 54 #endif 55 56 #if HAVE_SYS_TYPES_H 57 #include <sys/types.h> /* for quad_t */ 58 #endif 59 60 #include "gmp.h" 61 #include "gmp-impl.h" 62 63 64 /* Change this to "#define TRACE(x) x" for some traces. */ 65 #define TRACE(x) 66 67 68 /* General: 69 70 It's necessary to parse up the format string to recognise the GMP 71 extra types F, Q and Z. 
Other types and conversions are passed 72 across to the standard sscanf or fscanf via funs->scan, for ease of 73 implementation. This is essential in the case of something like glibc 74 %p where the pointer format isn't actually documented. 75 76 Because funs->scan doesn't get the whole input it can't put the right 77 values in for %n, so that's handled in __gmp_doscan. Neither sscanf 78 nor fscanf directly indicate how many characters were read, so an 79 extra %n is appended to each run for that. For fscanf this merely 80 supports our %n output, but for sscanf it lets funs->step move us 81 along the input string. 82 83 Whitespace and literal matches in the format string, including %%, 84 are handled directly within __gmp_doscan. This is reasonably 85 efficient, and avoids some suspicious behaviour observed in various 86 system libc's. GLIBC 2.2.4 for instance returns 0 on 87 88 sscanf(" ", " x") 89 or 90 sscanf(" ", " x%d",&n) 91 92 whereas we think they should return EOF, since end-of-string is 93 reached when a match of "x" is required. 94 95 For standard % conversions, funs->scan is called once for each 96 conversion. If we had vfscanf and vsscanf and could rely on their 97 fixed text matching behaviour then we could call them with multiple 98 consecutive standard conversions. But plain fscanf and sscanf work 99 fine, and parsing one field at a time shouldn't be too much of a 100 slowdown. 101 102 gmpscan: 103 104 gmpscan reads a gmp type. It's only used from one place, but is a 105 separate subroutine to avoid a big chunk of complicated code in the 106 middle of __gmp_doscan. Within gmpscan a couple of loopbacks make it 107 possible to share code for parsing integers, rationals and floats. 108 109 In gmpscan normally one char of lookahead is maintained, but when width 110 is reached that stops, on the principle that an fgetc/ungetc of a char 111 past where we're told to stop would be undesirable. 
"chars" is how many
   characters have been read so far, including the current c.  When
   chars==width and another character is desired then a jump is done to the
   "convert" stage.  c is invalid and mustn't be unget'ed in this case;
   chars is set to width+1 to indicate that.

   gmpscan normally returns the number of characters read.  -1 means an
   invalid field, -2 means EOF reached before any matching characters
   were read.

   For hex floats, the mantissa part is passed to mpf_set_str, then the
   exponent is applied with mpf_mul_2exp or mpf_div_2exp.  This is easier
   than teaching mpf_set_str about an exponent factor (ie. 2) differing
   from the mantissa radix point factor (ie. 16).  mpf_mul_2exp and
   mpf_div_2exp will preserve the application requested precision, so
   nothing in that respect is lost by making this a two-step process.

   Matching and errors:

   C99 7.19.6.2 paras 9 and 10 say an input item is read as the longest
   string which is a match for the appropriate type, or a prefix of a
   match.  With that done, if it's only a prefix then the result is a
   matching failure, ie. invalid input.

   This rule seems fairly clear, but doesn't seem to be universally
   applied in system C libraries.  Even GLIBC doesn't seem to get it
   right, insofar as it seems to accept some apparently invalid forms.
   Eg. glibc 2.3.1 accepts "0x" for a "%i", where a reading of the
   standard would suggest a non-empty sequence of digits should be
   required after an "0x".

   A footnote to 7.19.6.2 para 17 notes how this input item reading can
   mean inputs acceptable to strtol are not acceptable to fscanf.  We
   think this confirms our reading of "0x" as invalid.

   Clearly gmp_sscanf could backtrack to a longest input which was a
   valid match for a given item, but this is not done, since C99 says
   sscanf is identical to fscanf, so we make gmp_sscanf identical to
   gmp_fscanf.

   Types:

   C99 says "ll" is for long long, and "L" is for long double floats.
   Unfortunately in GMP 4.1.1 we documented the two as equivalent.  This
   doesn't affect us directly, since both are passed through to plain
   scanf.  It seems wisest not to try to enforce the C99 rule.  This is
   consistent with what we said before, though whether it actually
   worked was always up to the C library.

   Alternatives:

   Consideration was given to using separate code for gmp_fscanf and
   gmp_sscanf.  The sscanf case could zip across a string doing literal
   matches or recognising digits in gmpscan, rather than making a
   function call funs->get per character.  The fscanf could use getc
   rather than fgetc too, which might help those systems where getc is a
   macro or otherwise inlined.  But none of this scanning and converting
   will be particularly fast, so the two are done together to keep it a
   little simpler for now.

   Various multibyte string issues are not addressed, for a start C99
   scanf says the format string is multibyte.  Since we pass %c, %s and
   %[ to the system scanf, they might do multibyte reads already, but
   it's another matter whether or not that can be used, since our digit
   and whitespace parsing is only unibyte.  The plan is to quietly
   ignore multibyte locales for now.  This is not as bad as it sounds,
   since GMP is presumably used mostly on numbers, which can be
   perfectly adequately treated in plain ASCII.
179 180 */ 181 182 183 struct gmp_doscan_params_t { 184 int base; 185 int ignore; 186 char type; 187 int width; 188 }; 189 190 191 #define GET(c) \ 192 do { \ 193 ASSERT (chars <= width); \ 194 chars++; \ 195 if (chars > width) \ 196 goto convert; \ 197 (c) = (*funs->get) (data); \ 198 } while (0) 199 200 /* store into "s", extending if necessary */ 201 #define STORE(c) \ 202 do { \ 203 ASSERT (s_upto <= s_alloc); \ 204 if (s_upto >= s_alloc) \ 205 { \ 206 size_t s_alloc_new = s_alloc + S_ALLOC_STEP; \ 207 s = __GMP_REALLOCATE_FUNC_TYPE (s, s_alloc, s_alloc_new, char); \ 208 s_alloc = s_alloc_new; \ 209 } \ 210 s[s_upto++] = c; \ 211 } while (0) 212 213 #define S_ALLOC_STEP 512 214 215 static int 216 gmpscan (const struct gmp_doscan_funs_t *funs, void *data, 217 const struct gmp_doscan_params_t *p, void *dst) 218 { 219 int chars, c, base, first, width, seen_point, seen_digit, hexfloat; 220 size_t s_upto, s_alloc, hexexp; 221 char *s; 222 int invalid = 0; 223 224 TRACE (printf ("gmpscan\n")); 225 226 ASSERT (p->type == 'F' || p->type == 'Q' || p->type == 'Z'); 227 228 c = (*funs->get) (data); 229 if (c == EOF) 230 return -2; 231 232 chars = 1; 233 first = 1; 234 seen_point = 0; 235 width = (p->width == 0 ? 
INT_MAX-1 : p->width); 236 base = p->base; 237 s_alloc = S_ALLOC_STEP; 238 s = __GMP_ALLOCATE_FUNC_TYPE (s_alloc, char); 239 s_upto = 0; 240 hexfloat = 0; 241 hexexp = 0; 242 243 another: 244 seen_digit = 0; 245 if (c == '-') 246 { 247 STORE (c); 248 goto get_for_sign; 249 } 250 else if (c == '+') 251 { 252 /* don't store '+', it's not accepted by mpz_set_str etc */ 253 get_for_sign: 254 GET (c); 255 } 256 257 if (base == 0) 258 { 259 base = 10; /* decimal if no base indicator */ 260 if (c == '0') 261 { 262 seen_digit = 1; /* 0 alone is a valid number */ 263 if (p->type != 'F') 264 base = 8; /* leading 0 is octal, for non-floats */ 265 STORE (c); 266 GET (c); 267 if (c == 'x' || c == 'X') 268 { 269 base = 16; 270 seen_digit = 0; /* must have digits after an 0x */ 271 if (p->type == 'F') /* don't pass 'x' to mpf_set_str_point */ 272 hexfloat = 1; 273 else 274 STORE (c); 275 GET (c); 276 } 277 } 278 } 279 280 digits: 281 for (;;) 282 { 283 if (base == 16) 284 { 285 if (! isxdigit (c)) 286 break; 287 } 288 else 289 { 290 if (! isdigit (c)) 291 break; 292 if (base == 8 && (c == '8' || c == '9')) 293 break; 294 } 295 296 seen_digit = 1; 297 STORE (c); 298 GET (c); 299 } 300 301 if (first) 302 { 303 /* decimal point */ 304 if (p->type == 'F' && ! seen_point) 305 { 306 /* For a multi-character decimal point, if the first character is 307 present then all of it must be, otherwise the input is 308 considered invalid. */ 309 const char *point = GMP_DECIMAL_POINT; 310 int pc = (unsigned char) *point++; 311 if (c == pc) 312 { 313 for (;;) 314 { 315 STORE (c); 316 GET (c); 317 pc = (unsigned char) *point++; 318 if (pc == '\0') 319 break; 320 if (c != pc) 321 goto set_invalid; 322 } 323 seen_point = 1; 324 goto digits; 325 } 326 } 327 328 /* exponent */ 329 if (p->type == 'F') 330 { 331 if (hexfloat && (c == 'p' || c == 'P')) 332 { 333 hexexp = s_upto; /* exponent location */ 334 base = 10; /* exponent in decimal */ 335 goto exponent; 336 } 337 else if (! 
hexfloat && (c == 'e' || c == 'E')) 338 { 339 exponent: 340 /* must have at least one digit in the mantissa, just an exponent 341 is not good enough */ 342 if (! seen_digit) 343 goto set_invalid; 344 345 do_second: 346 first = 0; 347 STORE (c); 348 GET (c); 349 goto another; 350 } 351 } 352 353 /* denominator */ 354 if (p->type == 'Q' && c == '/') 355 { 356 /* must have at least one digit in the numerator */ 357 if (! seen_digit) 358 goto set_invalid; 359 360 /* now look for at least one digit in the denominator */ 361 seen_digit = 0; 362 363 /* allow the base to be redetermined for "%i" */ 364 base = p->base; 365 goto do_second; 366 } 367 } 368 369 convert: 370 if (! seen_digit) 371 { 372 set_invalid: 373 invalid = 1; 374 goto done; 375 } 376 377 if (! p->ignore) 378 { 379 STORE ('\0'); 380 TRACE (printf (" convert \"%s\"\n", s)); 381 382 /* We ought to have parsed out a valid string above, so just test 383 mpz_set_str etc with an ASSERT. */ 384 switch (p->type) { 385 case 'F': 386 { 387 mpf_ptr f = (mpf_ptr) dst; 388 if (hexexp != 0) 389 s[hexexp] = '\0'; 390 ASSERT_NOCARRY (mpf_set_str (f, s, hexfloat ? 
16 : 10)); 391 if (hexexp != 0) 392 { 393 char *dummy; 394 long exp; 395 exp = strtol (s + hexexp + 1, &dummy, 10); 396 if (exp >= 0) 397 mpf_mul_2exp (f, f, (unsigned long) exp); 398 else 399 mpf_div_2exp (f, f, - (unsigned long) exp); 400 } 401 } 402 break; 403 case 'Q': 404 ASSERT_NOCARRY (mpq_set_str ((mpq_ptr) dst, s, p->base)); 405 break; 406 case 'Z': 407 ASSERT_NOCARRY (mpz_set_str ((mpz_ptr) dst, s, p->base)); 408 break; 409 default: 410 ASSERT (0); 411 /*FALLTHRU*/ 412 break; 413 } 414 } 415 416 done: 417 ASSERT (chars <= width+1); 418 if (chars != width+1) 419 { 420 (*funs->unget) (c, data); 421 TRACE (printf (" ungetc %d, to give %d chars\n", c, chars-1)); 422 } 423 chars--; 424 425 (*__gmp_free_func) (s, s_alloc); 426 427 if (invalid) 428 { 429 TRACE (printf (" invalid\n")); 430 return -1; 431 } 432 433 TRACE (printf (" return %d chars (cf width %d)\n", chars, width)); 434 return chars; 435 } 436 437 438 /* Read and discard whitespace, if any. Return number of chars skipped. 
439 Whitespace skipping never provokes the EOF return from __gmp_doscan, so 440 it's not necessary to watch for EOF from funs->get, */ 441 static int 442 skip_white (const struct gmp_doscan_funs_t *funs, void *data) 443 { 444 int c; 445 int ret = 0; 446 447 do 448 { 449 c = (funs->get) (data); 450 ret++; 451 } 452 while (isspace (c)); 453 454 (funs->unget) (c, data); 455 ret--; 456 457 TRACE (printf (" skip white %d\n", ret)); 458 return ret; 459 } 460 461 462 int 463 __gmp_doscan (const struct gmp_doscan_funs_t *funs, void *data, 464 const char *orig_fmt, va_list orig_ap) 465 { 466 struct gmp_doscan_params_t param; 467 va_list ap; 468 char *alloc_fmt; 469 const char *fmt, *this_fmt, *end_fmt; 470 size_t orig_fmt_len, alloc_fmt_size, len; 471 int new_fields, new_chars; 472 char fchar; 473 int fields = 0; 474 int chars = 0; 475 476 TRACE (printf ("__gmp_doscan \"%s\"\n", orig_fmt); 477 if (funs->scan == (gmp_doscan_scan_t) sscanf) 478 printf (" s=\"%s\"\n", * (const char **) data)); 479 480 /* Don't modify orig_ap, if va_list is actually an array and hence call by 481 reference. It could be argued that it'd be more efficient to leave 482 callers to make a copy if they care, but doing so here is going to be a 483 very small part of the total work, and we may as well keep applications 484 out of trouble. */ 485 va_copy (ap, orig_ap); 486 487 /* Parts of the format string are going to be copied so that a " %n" can 488 be appended. alloc_fmt is some space for that. orig_fmt_len+4 will be 489 needed if fmt consists of a single "%" specifier, but otherwise is an 490 overestimate. We're not going to be very fast here, so use 491 __gmp_allocate_func rather than TMP_ALLOC. 
*/ 492 orig_fmt_len = strlen (orig_fmt); 493 alloc_fmt_size = orig_fmt_len + 4; 494 alloc_fmt = __GMP_ALLOCATE_FUNC_TYPE (alloc_fmt_size, char); 495 496 fmt = orig_fmt; 497 end_fmt = orig_fmt + orig_fmt_len; 498 499 for (;;) 500 { 501 next: 502 fchar = *fmt++; 503 504 if (fchar == '\0') 505 break; 506 507 if (isspace (fchar)) 508 { 509 chars += skip_white (funs, data); 510 continue; 511 } 512 513 if (fchar != '%') 514 { 515 int c; 516 literal: 517 c = (funs->get) (data); 518 if (c != fchar) 519 { 520 (funs->unget) (c, data); 521 if (c == EOF) 522 { 523 eof_no_match: 524 if (fields == 0) 525 fields = EOF; 526 } 527 goto done; 528 } 529 chars++; 530 continue; 531 } 532 533 param.type = '\0'; 534 param.base = 0; /* for e,f,g,i */ 535 param.ignore = 0; 536 param.width = 0; 537 538 this_fmt = fmt-1; 539 TRACE (printf (" this_fmt \"%s\"\n", this_fmt)); 540 541 for (;;) 542 { 543 ASSERT (fmt <= end_fmt); 544 545 fchar = *fmt++; 546 switch (fchar) { 547 548 case '\0': /* unterminated % sequence */ 549 ASSERT (0); 550 goto done; 551 552 case '%': /* literal % */ 553 goto literal; 554 555 case '[': /* character range */ 556 fchar = *fmt++; 557 if (fchar == '^') 558 fchar = *fmt++; 559 /* ']' allowed as the first char (possibly after '^') */ 560 if (fchar == ']') 561 fchar = *fmt++; 562 for (;;) 563 { 564 ASSERT (fmt <= end_fmt); 565 if (fchar == '\0') 566 { 567 /* unterminated % sequence */ 568 ASSERT (0); 569 goto done; 570 } 571 if (fchar == ']') 572 break; 573 fchar = *fmt++; 574 } 575 /*FALLTHRU*/ 576 case 'c': /* characters */ 577 case 's': /* string of non-whitespace */ 578 case 'p': /* pointer */ 579 libc_type: 580 len = fmt - this_fmt; 581 memcpy (alloc_fmt, this_fmt, len); 582 alloc_fmt[len++] = '%'; 583 alloc_fmt[len++] = 'n'; 584 alloc_fmt[len] = '\0'; 585 586 TRACE (printf (" scan \"%s\"\n", alloc_fmt); 587 if (funs->scan == (gmp_doscan_scan_t) sscanf) 588 printf (" s=\"%s\"\n", * (const char **) data)); 589 590 new_chars = -1; 591 if (param.ignore) 592 { 593 
new_fields = (*funs->scan) (data, alloc_fmt, &new_chars, NULL); 594 ASSERT (new_fields == 0 || new_fields == EOF); 595 } 596 else 597 { 598 void *arg = va_arg (ap, void *); 599 new_fields = (*funs->scan) (data, alloc_fmt, arg, &new_chars); 600 ASSERT (new_fields==0 || new_fields==1 || new_fields==EOF); 601 602 if (new_fields == 0) 603 goto done; /* invalid input */ 604 605 if (new_fields == 1) 606 ASSERT (new_chars != -1); 607 } 608 TRACE (printf (" new_fields %d new_chars %d\n", 609 new_fields, new_chars)); 610 611 if (new_fields == -1) 612 goto eof_no_match; /* EOF before anything matched */ 613 614 /* Under param.ignore, when new_fields==0 we don't know if 615 it's a successful match or an invalid field. new_chars 616 won't have been assigned if it was an invalid field. */ 617 if (new_chars == -1) 618 goto done; /* invalid input */ 619 620 chars += new_chars; 621 (*funs->step) (data, new_chars); 622 623 increment_fields: 624 if (! param.ignore) 625 fields++; 626 goto next; 627 628 case 'd': /* decimal */ 629 case 'u': /* decimal */ 630 param.base = 10; 631 goto numeric; 632 633 case 'e': /* float */ 634 case 'E': /* float */ 635 case 'f': /* float */ 636 case 'g': /* float */ 637 case 'G': /* float */ 638 case 'i': /* integer with base marker */ 639 numeric: 640 if (param.type != 'F' && param.type != 'Q' && param.type != 'Z') 641 goto libc_type; 642 643 chars += skip_white (funs, data); 644 645 new_chars = gmpscan (funs, data, ¶m, 646 param.ignore ? 
NULL : va_arg (ap, void*)); 647 if (new_chars == -2) 648 goto eof_no_match; 649 if (new_chars == -1) 650 goto done; 651 652 ASSERT (new_chars >= 0); 653 chars += new_chars; 654 goto increment_fields; 655 656 case 'a': /* glibc allocate string */ 657 case '\'': /* glibc digit groupings */ 658 break; 659 660 case 'F': /* mpf_t */ 661 case 'j': /* intmax_t */ 662 case 'L': /* long long */ 663 case 'q': /* quad_t */ 664 case 'Q': /* mpq_t */ 665 case 't': /* ptrdiff_t */ 666 case 'z': /* size_t */ 667 case 'Z': /* mpz_t */ 668 set_type: 669 param.type = fchar; 670 break; 671 672 case 'h': /* short or char */ 673 if (param.type != 'h') 674 goto set_type; 675 param.type = 'H'; /* internal code for "hh" */ 676 break; 677 678 goto numeric; 679 680 case 'l': /* long, long long, double or long double */ 681 if (param.type != 'l') 682 goto set_type; 683 param.type = 'L'; /* "ll" means "L" */ 684 break; 685 686 case 'n': 687 if (! param.ignore) 688 { 689 void *p; 690 p = va_arg (ap, void *); 691 TRACE (printf (" store %%n to %p\n", p)); 692 switch (param.type) { 693 case '\0': * (int *) p = chars; break; 694 case 'F': mpf_set_si ((mpf_ptr) p, (long) chars); break; 695 case 'H': * (char *) p = chars; break; 696 case 'h': * (short *) p = chars; break; 697 #if HAVE_INTMAX_T 698 case 'j': * (intmax_t *) p = chars; break; 699 #else 700 case 'j': ASSERT_FAIL (intmax_t not available); break; 701 #endif 702 case 'l': * (long *) p = chars; break; 703 #if HAVE_QUAD_T && HAVE_LONG_LONG 704 case 'q': 705 ASSERT_ALWAYS (sizeof (quad_t) == sizeof (long long)); 706 /*FALLTHRU*/ 707 #else 708 case 'q': ASSERT_FAIL (quad_t not available); break; 709 #endif 710 #if HAVE_LONG_LONG 711 case 'L': * (long long *) p = chars; break; 712 #else 713 case 'L': ASSERT_FAIL (long long not available); break; 714 #endif 715 case 'Q': mpq_set_si ((mpq_ptr) p, (long) chars, 1L); break; 716 #if HAVE_PTRDIFF_T 717 case 't': * (ptrdiff_t *) p = chars; break; 718 #else 719 case 't': ASSERT_FAIL (ptrdiff_t not 
available); break; 720 #endif 721 case 'z': * (size_t *) p = chars; break; 722 case 'Z': mpz_set_si ((mpz_ptr) p, (long) chars); break; 723 default: ASSERT (0); break; 724 } 725 } 726 goto next; 727 728 case 'o': 729 param.base = 8; 730 goto numeric; 731 732 case 'x': 733 case 'X': 734 param.base = 16; 735 goto numeric; 736 737 case '0': case '1': case '2': case '3': case '4': 738 case '5': case '6': case '7': case '8': case '9': 739 param.width = 0; 740 do { 741 param.width = param.width * 10 + (fchar-'0'); 742 fchar = *fmt++; 743 } while (isdigit (fchar)); 744 fmt--; /* unget the non-digit */ 745 break; 746 747 case '*': 748 param.ignore = 1; 749 break; 750 751 default: 752 /* something invalid in a % sequence */ 753 ASSERT (0); 754 goto next; 755 } 756 } 757 } 758 759 done: 760 (*__gmp_free_func) (alloc_fmt, alloc_fmt_size); 761 return fields; 762 } 763