1 /* Id: mandoc.c,v 1.75 2013/12/31 23:23:10 schwarze Exp */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <errno.h> 27 #include <limits.h> 28 #include <stdlib.h> 29 #include <stdio.h> 30 #include <string.h> 31 #include <time.h> 32 33 #include "mandoc.h" 34 #include "libmandoc.h" 35 36 #define DATESIZE 32 37 38 static int a2time(time_t *, const char *, const char *); 39 static char *time2a(time_t); 40 41 42 enum mandoc_esc 43 mandoc_escape(const char **end, const char **start, int *sz) 44 { 45 const char *local_start; 46 int local_sz; 47 char term; 48 enum mandoc_esc gly; 49 50 /* 51 * When the caller doesn't provide return storage, 52 * use local storage. 53 */ 54 55 if (NULL == start) 56 start = &local_start; 57 if (NULL == sz) 58 sz = &local_sz; 59 60 /* 61 * Beyond the backslash, at least one input character 62 * is part of the escape sequence. With one exception 63 * (see below), that character won't be returned. 64 */ 65 66 gly = ESCAPE_ERROR; 67 *start = ++*end; 68 *sz = 0; 69 term = '\0'; 70 71 switch ((*start)[-1]) { 72 /* 73 * First the glyphs. There are several different forms of 74 * these, but each eventually returns a substring of the glyph 75 * name. 76 */ 77 case ('('): 78 gly = ESCAPE_SPECIAL; 79 *sz = 2; 80 break; 81 case ('['): 82 gly = ESCAPE_SPECIAL; 83 /* 84 * Unicode escapes are defined in groff as \[uXXXX] to 85 * \[u10FFFF], where the contained value must be a valid 86 * Unicode codepoint. Here, however, only check whether 87 * it's not a zero-width escape. 88 */ 89 if ('u' == (*start)[0] && ']' != (*start)[1]) 90 gly = ESCAPE_UNICODE; 91 term = ']'; 92 break; 93 case ('C'): 94 if ('\'' != **start) 95 return(ESCAPE_ERROR); 96 *start = ++*end; 97 if ('u' == (*start)[0] && '\'' != (*start)[1]) 98 gly = ESCAPE_UNICODE; 99 else 100 gly = ESCAPE_SPECIAL; 101 term = '\''; 102 break; 103 104 /* 105 * Escapes taking no arguments at all. 106 */ 107 case ('d'): 108 /* FALLTHROUGH */ 109 case ('u'): 110 return(ESCAPE_IGNORE); 111 112 /* 113 * The \z escape is supposed to output the following 114 * character without advancing the cursor position. 115 * Since we are mostly dealing with terminal mode, 116 * let us just skip the next character. 117 */ 118 case ('z'): 119 return(ESCAPE_SKIPCHAR); 120 121 /* 122 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 123 * 'X' is the trigger. These have opaque sub-strings. 124 */ 125 case ('F'): 126 /* FALLTHROUGH */ 127 case ('g'): 128 /* FALLTHROUGH */ 129 case ('k'): 130 /* FALLTHROUGH */ 131 case ('M'): 132 /* FALLTHROUGH */ 133 case ('m'): 134 /* FALLTHROUGH */ 135 case ('n'): 136 /* FALLTHROUGH */ 137 case ('V'): 138 /* FALLTHROUGH */ 139 case ('Y'): 140 gly = ESCAPE_IGNORE; 141 /* FALLTHROUGH */ 142 case ('f'): 143 if (ESCAPE_ERROR == gly) 144 gly = ESCAPE_FONT; 145 switch (**start) { 146 case ('('): 147 *start = ++*end; 148 *sz = 2; 149 break; 150 case ('['): 151 *start = ++*end; 152 term = ']'; 153 break; 154 default: 155 *sz = 1; 156 break; 157 } 158 break; 159 160 /* 161 * These escapes are of the form \X'Y', where 'X' is the trigger 162 * and 'Y' is any string. These have opaque sub-strings. 163 */ 164 case ('A'): 165 /* FALLTHROUGH */ 166 case ('b'): 167 /* FALLTHROUGH */ 168 case ('B'): 169 /* FALLTHROUGH */ 170 case ('D'): 171 /* FALLTHROUGH */ 172 case ('o'): 173 /* FALLTHROUGH */ 174 case ('R'): 175 /* FALLTHROUGH */ 176 case ('w'): 177 /* FALLTHROUGH */ 178 case ('X'): 179 /* FALLTHROUGH */ 180 case ('Z'): 181 if ('\'' != **start) 182 return(ESCAPE_ERROR); 183 gly = ESCAPE_IGNORE; 184 *start = ++*end; 185 term = '\''; 186 break; 187 188 /* 189 * These escapes are of the form \X'N', where 'X' is the trigger 190 * and 'N' resolves to a numerical expression. 191 */ 192 case ('h'): 193 /* FALLTHROUGH */ 194 case ('H'): 195 /* FALLTHROUGH */ 196 case ('L'): 197 /* FALLTHROUGH */ 198 case ('l'): 199 /* FALLTHROUGH */ 200 case ('S'): 201 /* FALLTHROUGH */ 202 case ('v'): 203 /* FALLTHROUGH */ 204 case ('x'): 205 if ('\'' != **start) 206 return(ESCAPE_ERROR); 207 gly = ESCAPE_IGNORE; 208 *start = ++*end; 209 term = '\''; 210 break; 211 212 /* 213 * Special handling for the numbered character escape. 214 * XXX Do any other escapes need similar handling? 215 */ 216 case ('N'): 217 if ('\0' == **start) 218 return(ESCAPE_ERROR); 219 (*end)++; 220 if (isdigit((unsigned char)**start)) { 221 *sz = 1; 222 return(ESCAPE_IGNORE); 223 } 224 (*start)++; 225 while (isdigit((unsigned char)**end)) 226 (*end)++; 227 *sz = *end - *start; 228 if ('\0' != **end) 229 (*end)++; 230 return(ESCAPE_NUMBERED); 231 232 /* 233 * Sizes get a special category of their own. 234 */ 235 case ('s'): 236 gly = ESCAPE_IGNORE; 237 238 /* See +/- counts as a sign. */ 239 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 240 (*end)++; 241 242 switch (**end) { 243 case ('('): 244 *start = ++*end; 245 *sz = 2; 246 break; 247 case ('['): 248 *start = ++*end; 249 term = ']'; 250 break; 251 case ('\''): 252 *start = ++*end; 253 term = '\''; 254 break; 255 default: 256 *sz = 1; 257 break; 258 } 259 260 break; 261 262 /* 263 * Anything else is assumed to be a glyph. 264 * In this case, pass back the character after the backslash. 265 */ 266 default: 267 gly = ESCAPE_SPECIAL; 268 *start = --*end; 269 *sz = 1; 270 break; 271 } 272 273 assert(ESCAPE_ERROR != gly); 274 275 /* 276 * Read up to the terminating character, 277 * paying attention to nested escapes. 278 */ 279 280 if ('\0' != term) { 281 while (**end != term) { 282 switch (**end) { 283 case ('\0'): 284 return(ESCAPE_ERROR); 285 case ('\\'): 286 (*end)++; 287 if (ESCAPE_ERROR == 288 mandoc_escape(end, NULL, NULL)) 289 return(ESCAPE_ERROR); 290 break; 291 default: 292 (*end)++; 293 break; 294 } 295 } 296 *sz = (*end)++ - *start; 297 } else { 298 assert(*sz > 0); 299 if ((size_t)*sz > strlen(*start)) 300 return(ESCAPE_ERROR); 301 *end += *sz; 302 } 303 304 /* Run post-processors. */ 305 306 switch (gly) { 307 case (ESCAPE_FONT): 308 if (2 == *sz) { 309 if ('C' == **start) { 310 /* 311 * Treat constant-width font modes 312 * just like regular font modes. 313 */ 314 (*start)++; 315 (*sz)--; 316 } else { 317 if ('B' == (*start)[0] && 'I' == (*start)[1]) 318 gly = ESCAPE_FONTBI; 319 break; 320 } 321 } else if (1 != *sz) 322 break; 323 324 switch (**start) { 325 case ('3'): 326 /* FALLTHROUGH */ 327 case ('B'): 328 gly = ESCAPE_FONTBOLD; 329 break; 330 case ('2'): 331 /* FALLTHROUGH */ 332 case ('I'): 333 gly = ESCAPE_FONTITALIC; 334 break; 335 case ('P'): 336 gly = ESCAPE_FONTPREV; 337 break; 338 case ('1'): 339 /* FALLTHROUGH */ 340 case ('R'): 341 gly = ESCAPE_FONTROMAN; 342 break; 343 } 344 break; 345 case (ESCAPE_SPECIAL): 346 if (1 == *sz && 'c' == **start) 347 gly = ESCAPE_NOSPACE; 348 break; 349 default: 350 break; 351 } 352 353 return(gly); 354 } 355 356 void * 357 mandoc_calloc(size_t num, size_t size) 358 { 359 void *ptr; 360 361 ptr = calloc(num, size); 362 if (NULL == ptr) { 363 perror(NULL); 364 exit((int)MANDOCLEVEL_SYSERR); 365 } 366 367 return(ptr); 368 } 369 370 371 void * 372 mandoc_malloc(size_t size) 373 { 374 void *ptr; 375 376 ptr = malloc(size); 377 if (NULL == ptr) { 378 perror(NULL); 379 exit((int)MANDOCLEVEL_SYSERR); 380 } 381 382 return(ptr); 383 } 384 385 386 void * 387 mandoc_realloc(void *ptr, size_t size) 388 { 389 390 ptr = realloc(ptr, size); 391 if (NULL == ptr) { 392 perror(NULL); 393 exit((int)MANDOCLEVEL_SYSERR); 394 } 395 396 return(ptr); 397 } 398 399 char * 400 mandoc_strndup(const char *ptr, size_t sz) 401 { 402 char *p; 403 404 p = mandoc_malloc(sz + 1); 405 memcpy(p, ptr, sz); 406 p[(int)sz] = '\0'; 407 return(p); 408 } 409 410 char * 411 mandoc_strdup(const char *ptr) 412 { 413 char *p; 414 415 p = strdup(ptr); 416 if (NULL == p) { 417 perror(NULL); 418 exit((int)MANDOCLEVEL_SYSERR); 419 } 420 421 return(p); 422 } 423 424 /* 425 * Parse a quoted or unquoted roff-style request or macro argument. 426 * Return a pointer to the parsed argument, which is either the original 427 * pointer or advanced by one byte in case the argument is quoted. 428 * NUL-terminate the argument in place. 429 * Collapse pairs of quotes inside quoted arguments. 430 * Advance the argument pointer to the next argument, 431 * or to the NUL byte terminating the argument line. 432 */ 433 char * 434 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 435 { 436 char *start, *cp; 437 int quoted, pairs, white; 438 439 /* Quoting can only start with a new word. */ 440 start = *cpp; 441 quoted = 0; 442 if ('"' == *start) { 443 quoted = 1; 444 start++; 445 } 446 447 pairs = 0; 448 white = 0; 449 for (cp = start; '\0' != *cp; cp++) { 450 451 /* 452 * Move the following text left 453 * after quoted quotes and after "\\" and "\t". 454 */ 455 if (pairs) 456 cp[-pairs] = cp[0]; 457 458 if ('\\' == cp[0]) { 459 /* 460 * In copy mode, translate double to single 461 * backslashes and backslash-t to literal tabs. 462 */ 463 switch (cp[1]) { 464 case ('t'): 465 cp[0] = '\t'; 466 /* FALLTHROUGH */ 467 case ('\\'): 468 pairs++; 469 cp++; 470 break; 471 case (' '): 472 /* Skip escaped blanks. */ 473 if (0 == quoted) 474 cp++; 475 break; 476 default: 477 break; 478 } 479 } else if (0 == quoted) { 480 if (' ' == cp[0]) { 481 /* Unescaped blanks end unquoted args. */ 482 white = 1; 483 break; 484 } 485 } else if ('"' == cp[0]) { 486 if ('"' == cp[1]) { 487 /* Quoted quotes collapse. */ 488 pairs++; 489 cp++; 490 } else { 491 /* Unquoted quotes end quoted args. */ 492 quoted = 2; 493 break; 494 } 495 } 496 } 497 498 /* Quoted argument without a closing quote. */ 499 if (1 == quoted) 500 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); 501 502 /* NUL-terminate this argument and move to the next one. */ 503 if (pairs) 504 cp[-pairs] = '\0'; 505 if ('\0' != *cp) { 506 *cp++ = '\0'; 507 while (' ' == *cp) 508 cp++; 509 } 510 *pos += (int)(cp - start) + (quoted ? 1 : 0); 511 *cpp = cp; 512 513 if ('\0' == *cp && (white || ' ' == cp[-1])) 514 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL); 515 516 return(start); 517 } 518 519 static int 520 a2time(time_t *t, const char *fmt, const char *p) 521 { 522 struct tm tm; 523 char *pp; 524 525 memset(&tm, 0, sizeof(struct tm)); 526 527 pp = NULL; 528 #ifdef HAVE_STRPTIME 529 pp = strptime(p, fmt, &tm); 530 #endif 531 if (NULL != pp && '\0' == *pp) { 532 *t = mktime(&tm); 533 return(1); 534 } 535 536 return(0); 537 } 538 539 static char * 540 time2a(time_t t) 541 { 542 struct tm *tm; 543 char *buf, *p; 544 size_t ssz; 545 int isz; 546 547 tm = localtime(&t); 548 549 /* 550 * Reserve space: 551 * up to 9 characters for the month (September) + blank 552 * up to 2 characters for the day + comma + blank 553 * 4 characters for the year and a terminating '\0' 554 */ 555 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 556 557 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 558 goto fail; 559 p += (int)ssz; 560 561 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 562 goto fail; 563 p += isz; 564 565 if (0 == strftime(p, 4 + 1, "%Y", tm)) 566 goto fail; 567 return(buf); 568 569 fail: 570 free(buf); 571 return(NULL); 572 } 573 574 char * 575 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 576 { 577 char *out; 578 time_t t; 579 580 if (NULL == in || '\0' == *in || 581 0 == strcmp(in, "$" "Mdocdate$")) { 582 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL); 583 time(&t); 584 } 585 else if (a2time(&t, "%Y-%m-%d", in)) 586 t = 0; 587 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 588 !a2time(&t, "%b %d, %Y", in)) { 589 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL); 590 t = 0; 591 } 592 out = t ? time2a(t) : NULL; 593 return(out ? out : mandoc_strdup(in)); 594 } 595 596 int 597 mandoc_eos(const char *p, size_t sz) 598 { 599 const char *q; 600 int enclosed, found; 601 602 if (0 == sz) 603 return(0); 604 605 /* 606 * End-of-sentence recognition must include situations where 607 * some symbols, such as `)', allow prior EOS punctuation to 608 * propagate outward. 609 */ 610 611 enclosed = found = 0; 612 for (q = p + (int)sz - 1; q >= p; q--) { 613 switch (*q) { 614 case ('\"'): 615 /* FALLTHROUGH */ 616 case ('\''): 617 /* FALLTHROUGH */ 618 case (']'): 619 /* FALLTHROUGH */ 620 case (')'): 621 if (0 == found) 622 enclosed = 1; 623 break; 624 case ('.'): 625 /* FALLTHROUGH */ 626 case ('!'): 627 /* FALLTHROUGH */ 628 case ('?'): 629 found = 1; 630 break; 631 default: 632 return(found && (!enclosed || isalnum((unsigned char)*q))); 633 } 634 } 635 636 return(found && !enclosed); 637 } 638 639 /* 640 * Convert a string to a long that may not be <0. 641 * If the string is invalid, or is less than 0, return -1. 642 */ 643 int 644 mandoc_strntoi(const char *p, size_t sz, int base) 645 { 646 char buf[32]; 647 char *ep; 648 long v; 649 650 if (sz > 31) 651 return(-1); 652 653 memcpy(buf, p, sz); 654 buf[(int)sz] = '\0'; 655 656 errno = 0; 657 v = strtol(buf, &ep, base); 658 659 if (buf[0] == '\0' || *ep != '\0') 660 return(-1); 661 662 if (v > INT_MAX) 663 v = INT_MAX; 664 if (v < INT_MIN) 665 v = INT_MIN; 666 667 return((int)v); 668 } 669