1 /* $OpenBSD: read.c,v 1.191 2021/06/27 17:57:13 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2010-2020 Ingo Schwarze <schwarze@openbsd.org> 4 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 * 19 * Top-level functions of the mandoc(3) parser: 20 * Parser and input encoding selection, decompression, 21 * handling of input bytes, characters, lines, and files, 22 * handling of roff(7) loops and file inclusion, 23 * and steering of the various parsers. 24 */ 25 #include <sys/types.h> 26 #include <sys/mman.h> 27 #include <sys/stat.h> 28 29 #include <assert.h> 30 #include <ctype.h> 31 #include <errno.h> 32 #include <fcntl.h> 33 #include <stdarg.h> 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include <string.h> 37 #include <unistd.h> 38 #include <zlib.h> 39 40 #include "mandoc_aux.h" 41 #include "mandoc.h" 42 #include "roff.h" 43 #include "mdoc.h" 44 #include "man.h" 45 #include "mandoc_parse.h" 46 #include "libmandoc.h" 47 #include "roff_int.h" 48 #include "tag.h" 49 50 #define REPARSE_LIMIT 1000 51 52 struct mparse { 53 struct roff *roff; /* roff parser (!NULL) */ 54 struct roff_man *man; /* man parser */ 55 struct buf *primary; /* buffer currently being parsed */ 56 struct buf *secondary; /* copy of top level input */ 57 struct buf *loop; /* open .while request line */ 58 const char *os_s; /* default operating system */ 59 int options; /* parser options */ 60 int gzip; /* current input file is gzipped */ 61 int filenc; /* encoding of the current file */ 62 int reparse_count; /* finite interp. stack */ 63 int line; /* line number in the file */ 64 }; 65 66 static void choose_parser(struct mparse *); 67 static void free_buf_list(struct buf *); 68 static void resize_buf(struct buf *, size_t); 69 static int mparse_buf_r(struct mparse *, struct buf, size_t, int); 70 static int read_whole_file(struct mparse *, int, struct buf *, int *); 71 static void mparse_end(struct mparse *); 72 73 74 static void 75 resize_buf(struct buf *buf, size_t initial) 76 { 77 78 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 79 buf->buf = mandoc_realloc(buf->buf, buf->sz); 80 } 81 82 static void 83 free_buf_list(struct buf *buf) 84 { 85 struct buf *tmp; 86 87 while (buf != NULL) { 88 tmp = buf; 89 buf = tmp->next; 90 free(tmp->buf); 91 free(tmp); 92 } 93 } 94 95 static void 96 choose_parser(struct mparse *curp) 97 { 98 char *cp, *ep; 99 int format; 100 101 /* 102 * If neither command line arguments -mdoc or -man select 103 * a parser nor the roff parser found a .Dd or .TH macro 104 * yet, look ahead in the main input buffer. 105 */ 106 107 if ((format = roff_getformat(curp->roff)) == 0) { 108 cp = curp->primary->buf; 109 ep = cp + curp->primary->sz; 110 while (cp < ep) { 111 if (*cp == '.' || *cp == '\'') { 112 cp++; 113 if (cp[0] == 'D' && cp[1] == 'd') { 114 format = MPARSE_MDOC; 115 break; 116 } 117 if (cp[0] == 'T' && cp[1] == 'H') { 118 format = MPARSE_MAN; 119 break; 120 } 121 } 122 cp = memchr(cp, '\n', ep - cp); 123 if (cp == NULL) 124 break; 125 cp++; 126 } 127 } 128 129 if (format == MPARSE_MDOC) { 130 curp->man->meta.macroset = MACROSET_MDOC; 131 if (curp->man->mdocmac == NULL) 132 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 133 } else { 134 curp->man->meta.macroset = MACROSET_MAN; 135 if (curp->man->manmac == NULL) 136 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 137 } 138 curp->man->meta.first->tok = TOKEN_NONE; 139 } 140 141 /* 142 * Main parse routine for a buffer. 143 * It assumes encoding and line numbering are already set up. 144 * It can recurse directly (for invocations of user-defined 145 * macros, inline equations, and input line traps) 146 * and indirectly (for .so file inclusion). 147 */ 148 static int 149 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start) 150 { 151 struct buf ln; 152 struct buf *firstln, *lastln, *thisln, *loop; 153 char *cp; 154 size_t pos; /* byte number in the ln buffer */ 155 size_t spos; /* at the start of the current line parse */ 156 int line_result, result; 157 int of; 158 int lnn; /* line number in the real file */ 159 int fd; 160 int inloop; /* Saw .while on this level. */ 161 unsigned char c; 162 163 ln.sz = 256; 164 ln.buf = mandoc_malloc(ln.sz); 165 ln.next = NULL; 166 firstln = lastln = loop = NULL; 167 lnn = curp->line; 168 pos = 0; 169 inloop = 0; 170 result = ROFF_CONT; 171 172 while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) { 173 if (start) { 174 curp->line = lnn; 175 curp->reparse_count = 0; 176 177 if (lnn < 3 && 178 curp->filenc & MPARSE_UTF8 && 179 curp->filenc & MPARSE_LATIN1) 180 curp->filenc = preconv_cue(&blk, i); 181 } 182 spos = pos; 183 184 while (i < blk.sz && (start || blk.buf[i] != '\0')) { 185 186 /* 187 * When finding an unescaped newline character, 188 * leave the character loop to process the line. 189 * Skip a preceding carriage return, if any. 190 */ 191 192 if ('\r' == blk.buf[i] && i + 1 < blk.sz && 193 '\n' == blk.buf[i + 1]) 194 ++i; 195 if ('\n' == blk.buf[i]) { 196 ++i; 197 ++lnn; 198 break; 199 } 200 201 /* 202 * Make sure we have space for the worst 203 * case of 12 bytes: "\\[u10ffff]\n\0" 204 */ 205 206 if (pos + 12 > ln.sz) 207 resize_buf(&ln, 256); 208 209 /* 210 * Encode 8-bit input. 211 */ 212 213 c = blk.buf[i]; 214 if (c & 0x80) { 215 if ( ! (curp->filenc && preconv_encode( 216 &blk, &i, &ln, &pos, &curp->filenc))) { 217 mandoc_msg(MANDOCERR_CHAR_BAD, 218 curp->line, pos, "0x%x", c); 219 ln.buf[pos++] = '?'; 220 i++; 221 } 222 continue; 223 } 224 225 /* 226 * Exclude control characters. 227 */ 228 229 if (c == 0x7f || (c < 0x20 && c != 0x09)) { 230 mandoc_msg(c == 0x00 || c == 0x04 || 231 c > 0x0a ? MANDOCERR_CHAR_BAD : 232 MANDOCERR_CHAR_UNSUPP, 233 curp->line, pos, "0x%x", c); 234 i++; 235 if (c != '\r') 236 ln.buf[pos++] = '?'; 237 continue; 238 } 239 240 ln.buf[pos++] = blk.buf[i++]; 241 } 242 ln.buf[pos] = '\0'; 243 244 /* 245 * Maintain a lookaside buffer of all lines. 246 * parsed from this input source. 247 */ 248 249 thisln = mandoc_malloc(sizeof(*thisln)); 250 thisln->buf = mandoc_strdup(ln.buf); 251 thisln->sz = strlen(ln.buf) + 1; 252 thisln->next = NULL; 253 if (firstln == NULL) { 254 firstln = lastln = thisln; 255 if (curp->secondary == NULL) 256 curp->secondary = firstln; 257 } else { 258 lastln->next = thisln; 259 lastln = thisln; 260 } 261 262 /* XXX Ugly hack to mark the end of the input. */ 263 264 if (i == blk.sz || blk.buf[i] == '\0') { 265 if (pos + 2 > ln.sz) 266 resize_buf(&ln, 256); 267 ln.buf[pos++] = '\n'; 268 ln.buf[pos] = '\0'; 269 } 270 271 /* 272 * A significant amount of complexity is contained by 273 * the roff preprocessor. It's line-oriented but can be 274 * expressed on one line, so we need at times to 275 * readjust our starting point and re-run it. The roff 276 * preprocessor can also readjust the buffers with new 277 * data, so we pass them in wholesale. 278 */ 279 280 of = 0; 281 rerun: 282 line_result = roff_parseln(curp->roff, curp->line, 283 &ln, &of, start && spos == 0 ? pos : 0); 284 285 /* Process options. */ 286 287 if (line_result & ROFF_APPEND) 288 assert(line_result == (ROFF_IGN | ROFF_APPEND)); 289 290 if (line_result & ROFF_USERCALL) 291 assert((line_result & ROFF_MASK) == ROFF_REPARSE); 292 293 if (line_result & ROFF_USERRET) { 294 assert(line_result == (ROFF_IGN | ROFF_USERRET)); 295 if (start == 0) { 296 /* Return from the current macro. */ 297 result = ROFF_USERRET; 298 goto out; 299 } 300 } 301 302 switch (line_result & ROFF_LOOPMASK) { 303 case ROFF_IGN: 304 break; 305 case ROFF_WHILE: 306 if (curp->loop != NULL) { 307 if (loop == curp->loop) 308 break; 309 mandoc_msg(MANDOCERR_WHILE_NEST, 310 curp->line, pos, NULL); 311 } 312 curp->loop = thisln; 313 loop = NULL; 314 inloop = 1; 315 break; 316 case ROFF_LOOPCONT: 317 case ROFF_LOOPEXIT: 318 if (curp->loop == NULL) { 319 mandoc_msg(MANDOCERR_WHILE_FAIL, 320 curp->line, pos, NULL); 321 break; 322 } 323 if (inloop == 0) { 324 mandoc_msg(MANDOCERR_WHILE_INTO, 325 curp->line, pos, NULL); 326 curp->loop = loop = NULL; 327 break; 328 } 329 if (line_result & ROFF_LOOPCONT) 330 loop = curp->loop; 331 else { 332 curp->loop = loop = NULL; 333 inloop = 0; 334 } 335 break; 336 default: 337 abort(); 338 } 339 340 /* Process the main instruction from the roff parser. */ 341 342 switch (line_result & ROFF_MASK) { 343 case ROFF_IGN: 344 break; 345 case ROFF_CONT: 346 if (curp->man->meta.macroset == MACROSET_NONE) 347 choose_parser(curp); 348 if ((curp->man->meta.macroset == MACROSET_MDOC ? 349 mdoc_parseln(curp->man, curp->line, ln.buf, of) : 350 man_parseln(curp->man, curp->line, ln.buf, of) 351 ) == 2) 352 goto out; 353 break; 354 case ROFF_RERUN: 355 goto rerun; 356 case ROFF_REPARSE: 357 if (++curp->reparse_count > REPARSE_LIMIT) { 358 /* Abort and return to the top level. */ 359 result = ROFF_IGN; 360 mandoc_msg(MANDOCERR_ROFFLOOP, 361 curp->line, pos, NULL); 362 goto out; 363 } 364 result = mparse_buf_r(curp, ln, of, 0); 365 if (line_result & ROFF_USERCALL) { 366 roff_userret(curp->roff); 367 /* Continue normally. */ 368 if (result & ROFF_USERRET) 369 result = ROFF_CONT; 370 } 371 if (start == 0 && result != ROFF_CONT) 372 goto out; 373 break; 374 case ROFF_SO: 375 if ( ! (curp->options & MPARSE_SO) && 376 (i >= blk.sz || blk.buf[i] == '\0')) { 377 curp->man->meta.sodest = 378 mandoc_strdup(ln.buf + of); 379 goto out; 380 } 381 if ((fd = mparse_open(curp, ln.buf + of)) != -1) { 382 mparse_readfd(curp, fd, ln.buf + of); 383 close(fd); 384 } else { 385 mandoc_msg(MANDOCERR_SO_FAIL, 386 curp->line, of, ".so %s: %s", 387 ln.buf + of, strerror(errno)); 388 ln.sz = mandoc_asprintf(&cp, 389 ".sp\nSee the file %s.\n.sp", 390 ln.buf + of); 391 free(ln.buf); 392 ln.buf = cp; 393 of = 0; 394 mparse_buf_r(curp, ln, of, 0); 395 } 396 break; 397 default: 398 abort(); 399 } 400 401 /* Start the next input line. */ 402 403 if (loop != NULL && 404 (line_result & ROFF_LOOPMASK) == ROFF_IGN) 405 loop = loop->next; 406 407 if (loop != NULL) { 408 if ((line_result & ROFF_APPEND) == 0) 409 *ln.buf = '\0'; 410 if (ln.sz < loop->sz) 411 resize_buf(&ln, loop->sz); 412 (void)strlcat(ln.buf, loop->buf, ln.sz); 413 of = 0; 414 goto rerun; 415 } 416 417 pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0; 418 } 419 out: 420 if (inloop) { 421 if (result != ROFF_USERRET) 422 mandoc_msg(MANDOCERR_WHILE_OUTOF, 423 curp->line, pos, NULL); 424 curp->loop = NULL; 425 } 426 free(ln.buf); 427 if (firstln != curp->secondary) 428 free_buf_list(firstln); 429 return result; 430 } 431 432 static int 433 read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap) 434 { 435 struct stat st; 436 gzFile gz; 437 size_t off; 438 ssize_t ssz; 439 int gzerrnum, retval; 440 441 if (fstat(fd, &st) == -1) { 442 mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno)); 443 return -1; 444 } 445 446 /* 447 * If we're a regular file, try just reading in the whole entry 448 * via mmap(). This is faster than reading it into blocks, and 449 * since each file is only a few bytes to begin with, I'm not 450 * concerned that this is going to tank any machines. 451 */ 452 453 if (curp->gzip == 0 && S_ISREG(st.st_mode)) { 454 if (st.st_size > 0x7fffffff) { 455 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 456 return -1; 457 } 458 *with_mmap = 1; 459 fb->sz = (size_t)st.st_size; 460 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 461 if (fb->buf != MAP_FAILED) 462 return 0; 463 } 464 465 if (curp->gzip) { 466 /* 467 * Duplicating the file descriptor is required 468 * because we will have to call gzclose(3) 469 * to free memory used internally by zlib, 470 * but that will also close the file descriptor, 471 * which this function must not do. 472 */ 473 if ((fd = dup(fd)) == -1) { 474 mandoc_msg(MANDOCERR_DUP, 0, 0, 475 "%s", strerror(errno)); 476 return -1; 477 } 478 if ((gz = gzdopen(fd, "rb")) == NULL) { 479 mandoc_msg(MANDOCERR_GZDOPEN, 0, 0, 480 "%s", strerror(errno)); 481 close(fd); 482 return -1; 483 } 484 } else 485 gz = NULL; 486 487 /* 488 * If this isn't a regular file (like, say, stdin), then we must 489 * go the old way and just read things in bit by bit. 490 */ 491 492 *with_mmap = 0; 493 off = 0; 494 retval = -1; 495 fb->sz = 0; 496 fb->buf = NULL; 497 for (;;) { 498 if (off == fb->sz) { 499 if (fb->sz == (1U << 31)) { 500 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 501 break; 502 } 503 resize_buf(fb, 65536); 504 } 505 ssz = curp->gzip ? 506 gzread(gz, fb->buf + (int)off, fb->sz - off) : 507 read(fd, fb->buf + (int)off, fb->sz - off); 508 if (ssz == 0) { 509 fb->sz = off; 510 retval = 0; 511 break; 512 } 513 if (ssz == -1) { 514 if (curp->gzip) 515 (void)gzerror(gz, &gzerrnum); 516 mandoc_msg(MANDOCERR_READ, 0, 0, "%s", 517 curp->gzip && gzerrnum != Z_ERRNO ? 518 zError(gzerrnum) : strerror(errno)); 519 break; 520 } 521 off += (size_t)ssz; 522 } 523 524 if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK) 525 mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s", 526 gzerrnum == Z_ERRNO ? strerror(errno) : 527 zError(gzerrnum)); 528 if (retval == -1) { 529 free(fb->buf); 530 fb->buf = NULL; 531 } 532 return retval; 533 } 534 535 static void 536 mparse_end(struct mparse *curp) 537 { 538 if (curp->man->meta.macroset == MACROSET_NONE) 539 curp->man->meta.macroset = MACROSET_MAN; 540 if (curp->man->meta.macroset == MACROSET_MDOC) 541 mdoc_endparse(curp->man); 542 else 543 man_endparse(curp->man); 544 roff_endparse(curp->roff); 545 } 546 547 /* 548 * Read the whole file into memory and call the parsers. 549 * Called recursively when an .so request is encountered. 550 */ 551 void 552 mparse_readfd(struct mparse *curp, int fd, const char *filename) 553 { 554 static int recursion_depth; 555 556 struct buf blk; 557 struct buf *save_primary; 558 const char *save_filename, *cp; 559 size_t offset; 560 int save_filenc, save_lineno; 561 int with_mmap; 562 563 if (recursion_depth > 64) { 564 mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL); 565 return; 566 } else if (recursion_depth == 0 && 567 (cp = strrchr(filename, '.')) != NULL && 568 cp[1] >= '1' && cp[1] <= '9') 569 curp->man->filesec = cp[1]; 570 else 571 curp->man->filesec = '\0'; 572 573 if (read_whole_file(curp, fd, &blk, &with_mmap) == -1) 574 return; 575 576 /* 577 * Save some properties of the parent file. 578 */ 579 580 save_primary = curp->primary; 581 save_filenc = curp->filenc; 582 save_lineno = curp->line; 583 save_filename = mandoc_msg_getinfilename(); 584 585 curp->primary = &blk; 586 curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1); 587 curp->line = 1; 588 mandoc_msg_setinfilename(filename); 589 590 /* Skip an UTF-8 byte order mark. */ 591 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && 592 (unsigned char)blk.buf[0] == 0xef && 593 (unsigned char)blk.buf[1] == 0xbb && 594 (unsigned char)blk.buf[2] == 0xbf) { 595 offset = 3; 596 curp->filenc &= ~MPARSE_LATIN1; 597 } else 598 offset = 0; 599 600 recursion_depth++; 601 mparse_buf_r(curp, blk, offset, 1); 602 if (--recursion_depth == 0) 603 mparse_end(curp); 604 605 /* 606 * Clean up and restore saved parent properties. 607 */ 608 609 if (with_mmap) 610 munmap(blk.buf, blk.sz); 611 else 612 free(blk.buf); 613 614 curp->primary = save_primary; 615 curp->filenc = save_filenc; 616 curp->line = save_lineno; 617 if (save_filename != NULL) 618 mandoc_msg_setinfilename(save_filename); 619 } 620 621 int 622 mparse_open(struct mparse *curp, const char *file) 623 { 624 char *cp; 625 int fd, save_errno; 626 627 cp = strrchr(file, '.'); 628 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz")); 629 630 /* First try to use the filename as it is. */ 631 632 if ((fd = open(file, O_RDONLY)) != -1) 633 return fd; 634 635 /* 636 * If that doesn't work and the filename doesn't 637 * already end in .gz, try appending .gz. 638 */ 639 640 if ( ! curp->gzip) { 641 save_errno = errno; 642 mandoc_asprintf(&cp, "%s.gz", file); 643 fd = open(cp, O_RDONLY); 644 free(cp); 645 errno = save_errno; 646 if (fd != -1) { 647 curp->gzip = 1; 648 return fd; 649 } 650 } 651 652 /* Neither worked, give up. */ 653 654 return -1; 655 } 656 657 struct mparse * 658 mparse_alloc(int options, enum mandoc_os os_e, const char *os_s) 659 { 660 struct mparse *curp; 661 662 curp = mandoc_calloc(1, sizeof(struct mparse)); 663 664 curp->options = options; 665 curp->os_s = os_s; 666 667 curp->roff = roff_alloc(options); 668 curp->man = roff_man_alloc(curp->roff, curp->os_s, 669 curp->options & MPARSE_QUICK ? 1 : 0); 670 if (curp->options & MPARSE_MDOC) { 671 curp->man->meta.macroset = MACROSET_MDOC; 672 if (curp->man->mdocmac == NULL) 673 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 674 } else if (curp->options & MPARSE_MAN) { 675 curp->man->meta.macroset = MACROSET_MAN; 676 if (curp->man->manmac == NULL) 677 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 678 } 679 curp->man->meta.first->tok = TOKEN_NONE; 680 curp->man->meta.os_e = os_e; 681 tag_alloc(); 682 return curp; 683 } 684 685 void 686 mparse_reset(struct mparse *curp) 687 { 688 tag_free(); 689 roff_reset(curp->roff); 690 roff_man_reset(curp->man); 691 free_buf_list(curp->secondary); 692 curp->secondary = NULL; 693 curp->gzip = 0; 694 tag_alloc(); 695 } 696 697 void 698 mparse_free(struct mparse *curp) 699 { 700 tag_free(); 701 roffhash_free(curp->man->mdocmac); 702 roffhash_free(curp->man->manmac); 703 roff_man_free(curp->man); 704 roff_free(curp->roff); 705 free_buf_list(curp->secondary); 706 free(curp); 707 } 708 709 struct roff_meta * 710 mparse_result(struct mparse *curp) 711 { 712 roff_state_reset(curp->man); 713 if (curp->options & MPARSE_VALIDATE) { 714 if (curp->man->meta.macroset == MACROSET_MDOC) 715 mdoc_validate(curp->man); 716 else 717 man_validate(curp->man); 718 tag_postprocess(curp->man, curp->man->meta.first); 719 } 720 return &curp->man->meta; 721 } 722 723 void 724 mparse_copy(const struct mparse *p) 725 { 726 struct buf *buf; 727 728 for (buf = p->secondary; buf != NULL; buf = buf->next) 729 puts(buf->buf); 730 } 731