1 /* $Id: mandoc.c,v 1.53 2011/05/24 21:31:23 kristaps Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <errno.h> 27 #include <limits.h> 28 #include <stdlib.h> 29 #include <stdio.h> 30 #include <string.h> 31 #include <time.h> 32 33 #include "mandoc.h" 34 #include "libmandoc.h" 35 36 #define DATESIZE 32 37 38 static int a2time(time_t *, const char *, const char *); 39 static char *time2a(time_t); 40 static int numescape(const char *); 41 42 /* 43 * Pass over recursive numerical expressions. This context of this 44 * function is important: it's only called within character-terminating 45 * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial 46 * recursion: we don't care about what's in these blocks. 47 * This returns the number of characters skipped or -1 if an error 48 * occurs (the caller should bail). 49 */ 50 static int 51 numescape(const char *start) 52 { 53 int i; 54 size_t sz; 55 const char *cp; 56 57 i = 0; 58 59 /* The expression consists of a subexpression. */ 60 61 if ('\\' == start[i]) { 62 cp = &start[++i]; 63 /* 64 * Read past the end of the subexpression. 65 * Bail immediately on errors. 66 */ 67 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) 68 return(-1); 69 return(i + cp - &start[i]); 70 } 71 72 if ('(' != start[i++]) 73 return(0); 74 75 /* 76 * A parenthesised subexpression. Read until the closing 77 * parenthesis, making sure to handle any nested subexpressions 78 * that might ruin our parse. 79 */ 80 81 while (')' != start[i]) { 82 sz = strcspn(&start[i], ")\\"); 83 i += (int)sz; 84 85 if ('\0' == start[i]) 86 return(-1); 87 else if ('\\' != start[i]) 88 continue; 89 90 cp = &start[++i]; 91 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) 92 return(-1); 93 i += cp - &start[i]; 94 } 95 96 /* Read past the terminating ')'. */ 97 return(++i); 98 } 99 100 enum mandoc_esc 101 mandoc_escape(const char **end, const char **start, int *sz) 102 { 103 char c, term, numeric; 104 int i, lim, ssz, rlim; 105 const char *cp, *rstart; 106 enum mandoc_esc gly; 107 108 cp = *end; 109 rstart = cp; 110 if (start) 111 *start = rstart; 112 i = lim = 0; 113 gly = ESCAPE_ERROR; 114 term = numeric = '\0'; 115 116 switch ((c = cp[i++])) { 117 /* 118 * First the glyphs. There are several different forms of 119 * these, but each eventually returns a substring of the glyph 120 * name. 121 */ 122 case ('('): 123 gly = ESCAPE_SPECIAL; 124 lim = 2; 125 break; 126 case ('['): 127 gly = ESCAPE_SPECIAL; 128 /* 129 * Unicode escapes are defined in groff as \[uXXXX] to 130 * \[u10FFFF], where the contained value must be a valid 131 * Unicode codepoint. Here, however, only check whether 132 * it's not a zero-width escape. 133 */ 134 if ('u' == cp[i] && ']' != cp[i + 1]) 135 gly = ESCAPE_UNICODE; 136 term = ']'; 137 break; 138 case ('C'): 139 if ('\'' != cp[i]) 140 return(ESCAPE_ERROR); 141 gly = ESCAPE_SPECIAL; 142 term = '\''; 143 break; 144 145 /* 146 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 147 * 'X' is the trigger. These have opaque sub-strings. 148 */ 149 case ('F'): 150 /* FALLTHROUGH */ 151 case ('g'): 152 /* FALLTHROUGH */ 153 case ('k'): 154 /* FALLTHROUGH */ 155 case ('M'): 156 /* FALLTHROUGH */ 157 case ('m'): 158 /* FALLTHROUGH */ 159 case ('n'): 160 /* FALLTHROUGH */ 161 case ('V'): 162 /* FALLTHROUGH */ 163 case ('Y'): 164 if (ESCAPE_ERROR == gly) 165 gly = ESCAPE_IGNORE; 166 /* FALLTHROUGH */ 167 case ('f'): 168 if (ESCAPE_ERROR == gly) 169 gly = ESCAPE_FONT; 170 171 rstart= &cp[i]; 172 if (start) 173 *start = rstart; 174 175 switch (cp[i++]) { 176 case ('('): 177 lim = 2; 178 break; 179 case ('['): 180 term = ']'; 181 break; 182 default: 183 lim = 1; 184 i--; 185 break; 186 } 187 break; 188 189 /* 190 * These escapes are of the form \X'Y', where 'X' is the trigger 191 * and 'Y' is any string. These have opaque sub-strings. 192 */ 193 case ('A'): 194 /* FALLTHROUGH */ 195 case ('b'): 196 /* FALLTHROUGH */ 197 case ('D'): 198 /* FALLTHROUGH */ 199 case ('o'): 200 /* FALLTHROUGH */ 201 case ('R'): 202 /* FALLTHROUGH */ 203 case ('X'): 204 /* FALLTHROUGH */ 205 case ('Z'): 206 if ('\'' != cp[i++]) 207 return(ESCAPE_ERROR); 208 gly = ESCAPE_IGNORE; 209 term = '\''; 210 break; 211 212 /* 213 * These escapes are of the form \X'N', where 'X' is the trigger 214 * and 'N' resolves to a numerical expression. 215 */ 216 case ('B'): 217 /* FALLTHROUGH */ 218 case ('h'): 219 /* FALLTHROUGH */ 220 case ('H'): 221 /* FALLTHROUGH */ 222 case ('L'): 223 /* FALLTHROUGH */ 224 case ('l'): 225 /* FALLTHROUGH */ 226 case ('N'): 227 if (ESCAPE_ERROR == gly) 228 gly = ESCAPE_NUMBERED; 229 /* FALLTHROUGH */ 230 case ('S'): 231 /* FALLTHROUGH */ 232 case ('v'): 233 /* FALLTHROUGH */ 234 case ('w'): 235 /* FALLTHROUGH */ 236 case ('x'): 237 if (ESCAPE_ERROR == gly) 238 gly = ESCAPE_IGNORE; 239 if ('\'' != cp[i++]) 240 return(ESCAPE_ERROR); 241 term = numeric = '\''; 242 break; 243 244 /* 245 * Sizes get a special category of their own. 246 */ 247 case ('s'): 248 gly = ESCAPE_IGNORE; 249 250 rstart = &cp[i]; 251 if (start) 252 *start = rstart; 253 254 /* See +/- counts as a sign. */ 255 c = cp[i]; 256 if ('+' == c || '-' == c || ASCII_HYPH == c) 257 ++i; 258 259 switch (cp[i++]) { 260 case ('('): 261 lim = 2; 262 break; 263 case ('['): 264 term = numeric = ']'; 265 break; 266 case ('\''): 267 term = numeric = '\''; 268 break; 269 default: 270 lim = 1; 271 i--; 272 break; 273 } 274 275 /* See +/- counts as a sign. */ 276 c = cp[i]; 277 if ('+' == c || '-' == c || ASCII_HYPH == c) 278 ++i; 279 280 break; 281 282 /* 283 * Anything else is assumed to be a glyph. 284 */ 285 default: 286 gly = ESCAPE_SPECIAL; 287 lim = 1; 288 i--; 289 break; 290 } 291 292 assert(ESCAPE_ERROR != gly); 293 294 rstart = &cp[i]; 295 if (start) 296 *start = rstart; 297 298 /* 299 * If a terminating block has been specified, we need to 300 * handle the case of recursion, which could have their 301 * own terminating blocks that mess up our parse. This, by the 302 * way, means that the "start" and "size" values will be 303 * effectively meaningless. 304 */ 305 306 ssz = 0; 307 if (numeric && -1 == (ssz = numescape(&cp[i]))) 308 return(ESCAPE_ERROR); 309 310 i += ssz; 311 rlim = -1; 312 313 /* 314 * We have a character terminator. Try to read up to that 315 * character. If we can't (i.e., we hit the nil), then return 316 * an error; if we can, calculate our length, read past the 317 * terminating character, and exit. 318 */ 319 320 if ('\0' != term) { 321 *end = strchr(&cp[i], term); 322 if ('\0' == *end) 323 return(ESCAPE_ERROR); 324 325 rlim = *end - &cp[i]; 326 if (sz) 327 *sz = rlim; 328 (*end)++; 329 goto out; 330 } 331 332 assert(lim > 0); 333 334 /* 335 * We have a numeric limit. If the string is shorter than that, 336 * stop and return an error. Else adjust our endpoint, length, 337 * and return the current glyph. 338 */ 339 340 if ((size_t)lim > strlen(&cp[i])) 341 return(ESCAPE_ERROR); 342 343 rlim = lim; 344 if (sz) 345 *sz = rlim; 346 347 *end = &cp[i] + lim; 348 349 out: 350 assert(rlim >= 0 && rstart); 351 352 /* Run post-processors. */ 353 354 switch (gly) { 355 case (ESCAPE_FONT): 356 if (1 != rlim) 357 break; 358 switch (*rstart) { 359 case ('3'): 360 /* FALLTHROUGH */ 361 case ('B'): 362 gly = ESCAPE_FONTBOLD; 363 break; 364 case ('2'): 365 /* FALLTHROUGH */ 366 case ('I'): 367 gly = ESCAPE_FONTITALIC; 368 break; 369 case ('P'): 370 gly = ESCAPE_FONTPREV; 371 break; 372 case ('1'): 373 /* FALLTHROUGH */ 374 case ('R'): 375 gly = ESCAPE_FONTROMAN; 376 break; 377 } 378 break; 379 case (ESCAPE_SPECIAL): 380 if (1 != rlim) 381 break; 382 if ('c' == *rstart) 383 gly = ESCAPE_NOSPACE; 384 break; 385 default: 386 break; 387 } 388 389 return(gly); 390 } 391 392 void * 393 mandoc_calloc(size_t num, size_t size) 394 { 395 void *ptr; 396 397 ptr = calloc(num, size); 398 if (NULL == ptr) { 399 perror(NULL); 400 exit((int)MANDOCLEVEL_SYSERR); 401 } 402 403 return(ptr); 404 } 405 406 407 void * 408 mandoc_malloc(size_t size) 409 { 410 void *ptr; 411 412 ptr = malloc(size); 413 if (NULL == ptr) { 414 perror(NULL); 415 exit((int)MANDOCLEVEL_SYSERR); 416 } 417 418 return(ptr); 419 } 420 421 422 void * 423 mandoc_realloc(void *ptr, size_t size) 424 { 425 426 ptr = realloc(ptr, size); 427 if (NULL == ptr) { 428 perror(NULL); 429 exit((int)MANDOCLEVEL_SYSERR); 430 } 431 432 return(ptr); 433 } 434 435 436 char * 437 mandoc_strdup(const char *ptr) 438 { 439 char *p; 440 441 p = strdup(ptr); 442 if (NULL == p) { 443 perror(NULL); 444 exit((int)MANDOCLEVEL_SYSERR); 445 } 446 447 return(p); 448 } 449 450 /* 451 * Parse a quoted or unquoted roff-style request or macro argument. 452 * Return a pointer to the parsed argument, which is either the original 453 * pointer or advanced by one byte in case the argument is quoted. 454 * Null-terminate the argument in place. 455 * Collapse pairs of quotes inside quoted arguments. 456 * Advance the argument pointer to the next argument, 457 * or to the null byte terminating the argument line. 458 */ 459 char * 460 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 461 { 462 char *start, *cp; 463 int quoted, pairs, white; 464 465 /* Quoting can only start with a new word. */ 466 start = *cpp; 467 quoted = 0; 468 if ('"' == *start) { 469 quoted = 1; 470 start++; 471 } 472 473 pairs = 0; 474 white = 0; 475 for (cp = start; '\0' != *cp; cp++) { 476 /* Move left after quoted quotes and escaped backslashes. */ 477 if (pairs) 478 cp[-pairs] = cp[0]; 479 if ('\\' == cp[0]) { 480 if ('\\' == cp[1]) { 481 /* Poor man's copy mode. */ 482 pairs++; 483 cp++; 484 } else if (0 == quoted && ' ' == cp[1]) 485 /* Skip escaped blanks. */ 486 cp++; 487 } else if (0 == quoted) { 488 if (' ' == cp[0]) { 489 /* Unescaped blanks end unquoted args. */ 490 white = 1; 491 break; 492 } 493 } else if ('"' == cp[0]) { 494 if ('"' == cp[1]) { 495 /* Quoted quotes collapse. */ 496 pairs++; 497 cp++; 498 } else { 499 /* Unquoted quotes end quoted args. */ 500 quoted = 2; 501 break; 502 } 503 } 504 } 505 506 /* Quoted argument without a closing quote. */ 507 if (1 == quoted) 508 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); 509 510 /* Null-terminate this argument and move to the next one. */ 511 if (pairs) 512 cp[-pairs] = '\0'; 513 if ('\0' != *cp) { 514 *cp++ = '\0'; 515 while (' ' == *cp) 516 cp++; 517 } 518 *pos += (int)(cp - start) + (quoted ? 1 : 0); 519 *cpp = cp; 520 521 if ('\0' == *cp && (white || ' ' == cp[-1])) 522 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL); 523 524 return(start); 525 } 526 527 static int 528 a2time(time_t *t, const char *fmt, const char *p) 529 { 530 struct tm tm; 531 char *pp; 532 533 memset(&tm, 0, sizeof(struct tm)); 534 535 pp = strptime(p, fmt, &tm); 536 if (NULL != pp && '\0' == *pp) { 537 *t = mktime(&tm); 538 return(1); 539 } 540 541 return(0); 542 } 543 544 static char * 545 time2a(time_t t) 546 { 547 struct tm tm; 548 char *buf, *p; 549 size_t ssz; 550 int isz; 551 552 localtime_r(&t, &tm); 553 554 /* 555 * Reserve space: 556 * up to 9 characters for the month (September) + blank 557 * up to 2 characters for the day + comma + blank 558 * 4 characters for the year and a terminating '\0' 559 */ 560 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 561 562 if (0 == (ssz = strftime(p, 10 + 1, "%B ", &tm))) 563 goto fail; 564 p += (int)ssz; 565 566 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm.tm_mday))) 567 goto fail; 568 p += isz; 569 570 if (0 == strftime(p, 4 + 1, "%Y", &tm)) 571 goto fail; 572 return(buf); 573 574 fail: 575 free(buf); 576 return(NULL); 577 } 578 579 char * 580 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 581 { 582 char *out; 583 time_t t; 584 585 if (NULL == in || '\0' == *in || 586 0 == strcmp(in, "$" "Mdocdate$")) { 587 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL); 588 time(&t); 589 } 590 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 591 !a2time(&t, "%b %d, %Y", in) && 592 !a2time(&t, "%Y-%m-%d", in)) { 593 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL); 594 t = 0; 595 } 596 out = t ? time2a(t) : NULL; 597 return(out ? out : mandoc_strdup(in)); 598 } 599 600 int 601 mandoc_eos(const char *p, size_t sz, int enclosed) 602 { 603 const char *q; 604 int found; 605 606 if (0 == sz) 607 return(0); 608 609 /* 610 * End-of-sentence recognition must include situations where 611 * some symbols, such as `)', allow prior EOS punctuation to 612 * propagate outward. 613 */ 614 615 found = 0; 616 for (q = p + (int)sz - 1; q >= p; q--) { 617 switch (*q) { 618 case ('\"'): 619 /* FALLTHROUGH */ 620 case ('\''): 621 /* FALLTHROUGH */ 622 case (']'): 623 /* FALLTHROUGH */ 624 case (')'): 625 if (0 == found) 626 enclosed = 1; 627 break; 628 case ('.'): 629 /* FALLTHROUGH */ 630 case ('!'): 631 /* FALLTHROUGH */ 632 case ('?'): 633 found = 1; 634 break; 635 default: 636 return(found && (!enclosed || isalnum((unsigned char)*q))); 637 } 638 } 639 640 return(found && !enclosed); 641 } 642 643 int 644 mandoc_hyph(const char *start, const char *c) 645 { 646 647 /* 648 * Choose whether to break at a hyphenated character. We only 649 * do this if it's free-standing within a word. 650 */ 651 652 /* Skip first/last character of buffer. */ 653 if (c == start || '\0' == *(c + 1)) 654 return(0); 655 /* Skip first/last character of word. */ 656 if ('\t' == *(c + 1) || '\t' == *(c - 1)) 657 return(0); 658 if (' ' == *(c + 1) || ' ' == *(c - 1)) 659 return(0); 660 /* Skip double invocations. */ 661 if ('-' == *(c + 1) || '-' == *(c - 1)) 662 return(0); 663 /* Skip escapes. */ 664 if ('\\' == *(c - 1)) 665 return(0); 666 667 return(1); 668 } 669 670 /* 671 * Find out whether a line is a macro line or not. If it is, adjust the 672 * current position and return one; if it isn't, return zero and don't 673 * change the current position. 674 */ 675 int 676 mandoc_getcontrol(const char *cp, int *ppos) 677 { 678 int pos; 679 680 pos = *ppos; 681 682 if ('\\' == cp[pos] && '.' == cp[pos + 1]) 683 pos += 2; 684 else if ('.' == cp[pos] || '\'' == cp[pos]) 685 pos++; 686 else 687 return(0); 688 689 while (' ' == cp[pos] || '\t' == cp[pos]) 690 pos++; 691 692 *ppos = pos; 693 return(1); 694 } 695 696 /* 697 * Convert a string to a long that may not be <0. 698 * If the string is invalid, or is less than 0, return -1. 699 */ 700 int 701 mandoc_strntou(const char *p, size_t sz, int base) 702 { 703 char buf[32]; 704 char *ep; 705 long v; 706 707 if (sz > 31) 708 return(-1); 709 710 memcpy(buf, p, sz); 711 buf[(int)sz] = '\0'; 712 713 errno = 0; 714 v = strtol(buf, &ep, base); 715 716 if (buf[0] == '\0' || *ep != '\0') 717 return(-1); 718 719 if ((errno == ERANGE && 720 (v == LONG_MAX || v == LONG_MIN)) || 721 (v > INT_MAX || v < 0)) 722 return(-1); 723 724 return((int)v); 725 } 726 727