1 /* $Id: mandoc.c,v 1.70 2013/11/10 21:34:04 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <errno.h> 27 #include <limits.h> 28 #include <stdlib.h> 29 #include <stdio.h> 30 #include <string.h> 31 #include <time.h> 32 33 #include "mandoc.h" 34 #include "libmandoc.h" 35 36 #define DATESIZE 32 37 38 static int a2time(time_t *, const char *, const char *); 39 static char *time2a(time_t); 40 41 42 enum mandoc_esc 43 mandoc_escape(const char const **end, const char const **start, int *sz) 44 { 45 const char *local_start; 46 int local_sz; 47 char term; 48 enum mandoc_esc gly; 49 50 /* 51 * When the caller doesn't provide return storage, 52 * use local storage. 53 */ 54 55 if (NULL == start) 56 start = &local_start; 57 if (NULL == sz) 58 sz = &local_sz; 59 60 /* 61 * Beyond the backslash, at least one input character 62 * is part of the escape sequence. With one exception 63 * (see below), that character won't be returned. 64 */ 65 66 gly = ESCAPE_ERROR; 67 *start = ++*end; 68 *sz = 0; 69 term = '\0'; 70 71 switch ((*start)[-1]) { 72 /* 73 * First the glyphs. There are several different forms of 74 * these, but each eventually returns a substring of the glyph 75 * name. 76 */ 77 case ('('): 78 gly = ESCAPE_SPECIAL; 79 *sz = 2; 80 break; 81 case ('['): 82 gly = ESCAPE_SPECIAL; 83 /* 84 * Unicode escapes are defined in groff as \[uXXXX] to 85 * \[u10FFFF], where the contained value must be a valid 86 * Unicode codepoint. Here, however, only check whether 87 * it's not a zero-width escape. 88 */ 89 if ('u' == (*start)[0] && ']' != (*start)[1]) 90 gly = ESCAPE_UNICODE; 91 term = ']'; 92 break; 93 case ('C'): 94 if ('\'' != **start) 95 return(ESCAPE_ERROR); 96 *start = ++*end; 97 if ('u' == (*start)[0] && '\'' != (*start)[1]) 98 gly = ESCAPE_UNICODE; 99 else 100 gly = ESCAPE_SPECIAL; 101 term = '\''; 102 break; 103 104 /* 105 * The \z escape is supposed to output the following 106 * character without advancing the cursor position. 107 * Since we are mostly dealing with terminal mode, 108 * let us just skip the next character. 109 */ 110 case ('z'): 111 return(ESCAPE_SKIPCHAR); 112 113 /* 114 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 115 * 'X' is the trigger. These have opaque sub-strings. 116 */ 117 case ('F'): 118 /* FALLTHROUGH */ 119 case ('g'): 120 /* FALLTHROUGH */ 121 case ('k'): 122 /* FALLTHROUGH */ 123 case ('M'): 124 /* FALLTHROUGH */ 125 case ('m'): 126 /* FALLTHROUGH */ 127 case ('n'): 128 /* FALLTHROUGH */ 129 case ('V'): 130 /* FALLTHROUGH */ 131 case ('Y'): 132 gly = ESCAPE_IGNORE; 133 /* FALLTHROUGH */ 134 case ('f'): 135 if (ESCAPE_ERROR == gly) 136 gly = ESCAPE_FONT; 137 switch (**start) { 138 case ('('): 139 *start = ++*end; 140 *sz = 2; 141 break; 142 case ('['): 143 *start = ++*end; 144 term = ']'; 145 break; 146 default: 147 *sz = 1; 148 break; 149 } 150 break; 151 152 /* 153 * These escapes are of the form \X'Y', where 'X' is the trigger 154 * and 'Y' is any string. These have opaque sub-strings. 155 */ 156 case ('A'): 157 /* FALLTHROUGH */ 158 case ('b'): 159 /* FALLTHROUGH */ 160 case ('D'): 161 /* FALLTHROUGH */ 162 case ('o'): 163 /* FALLTHROUGH */ 164 case ('R'): 165 /* FALLTHROUGH */ 166 case ('X'): 167 /* FALLTHROUGH */ 168 case ('Z'): 169 if ('\'' != **start) 170 return(ESCAPE_ERROR); 171 gly = ESCAPE_IGNORE; 172 *start = ++*end; 173 term = '\''; 174 break; 175 176 /* 177 * These escapes are of the form \X'N', where 'X' is the trigger 178 * and 'N' resolves to a numerical expression. 179 */ 180 case ('B'): 181 /* FALLTHROUGH */ 182 case ('h'): 183 /* FALLTHROUGH */ 184 case ('H'): 185 /* FALLTHROUGH */ 186 case ('L'): 187 /* FALLTHROUGH */ 188 case ('l'): 189 gly = ESCAPE_NUMBERED; 190 /* FALLTHROUGH */ 191 case ('S'): 192 /* FALLTHROUGH */ 193 case ('v'): 194 /* FALLTHROUGH */ 195 case ('w'): 196 /* FALLTHROUGH */ 197 case ('x'): 198 if ('\'' != **start) 199 return(ESCAPE_ERROR); 200 if (ESCAPE_ERROR == gly) 201 gly = ESCAPE_IGNORE; 202 *start = ++*end; 203 term = '\''; 204 break; 205 206 /* 207 * Special handling for the numbered character escape. 208 * XXX Do any other escapes need similar handling? 209 */ 210 case ('N'): 211 if ('\0' == **start) 212 return(ESCAPE_ERROR); 213 (*end)++; 214 if (isdigit((unsigned char)**start)) { 215 *sz = 1; 216 return(ESCAPE_IGNORE); 217 } 218 (*start)++; 219 while (isdigit((unsigned char)**end)) 220 (*end)++; 221 *sz = *end - *start; 222 if ('\0' != **end) 223 (*end)++; 224 return(ESCAPE_NUMBERED); 225 226 /* 227 * Sizes get a special category of their own. 228 */ 229 case ('s'): 230 gly = ESCAPE_IGNORE; 231 232 /* See +/- counts as a sign. */ 233 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 234 (*end)++; 235 236 switch (**end) { 237 case ('('): 238 *start = ++*end; 239 *sz = 2; 240 break; 241 case ('['): 242 *start = ++*end; 243 term = ']'; 244 break; 245 case ('\''): 246 *start = ++*end; 247 term = '\''; 248 break; 249 default: 250 *sz = 1; 251 break; 252 } 253 254 break; 255 256 /* 257 * Anything else is assumed to be a glyph. 258 * In this case, pass back the character after the backslash. 259 */ 260 default: 261 gly = ESCAPE_SPECIAL; 262 *start = --*end; 263 *sz = 1; 264 break; 265 } 266 267 assert(ESCAPE_ERROR != gly); 268 269 /* 270 * Read up to the terminating character, 271 * paying attention to nested escapes. 272 */ 273 274 if ('\0' != term) { 275 while (**end != term) { 276 switch (**end) { 277 case ('\0'): 278 return(ESCAPE_ERROR); 279 case ('\\'): 280 (*end)++; 281 if (ESCAPE_ERROR == 282 mandoc_escape(end, NULL, NULL)) 283 return(ESCAPE_ERROR); 284 break; 285 default: 286 (*end)++; 287 break; 288 } 289 } 290 *sz = (*end)++ - *start; 291 } else { 292 assert(*sz > 0); 293 if ((size_t)*sz > strlen(*start)) 294 return(ESCAPE_ERROR); 295 *end += *sz; 296 } 297 298 /* Run post-processors. */ 299 300 switch (gly) { 301 case (ESCAPE_FONT): 302 if (2 == *sz) { 303 if ('C' == **start) { 304 /* 305 * Treat constant-width font modes 306 * just like regular font modes. 307 */ 308 (*start)++; 309 (*sz)--; 310 } else { 311 if ('B' == (*start)[0] && 'I' == (*start)[1]) 312 gly = ESCAPE_FONTBI; 313 break; 314 } 315 } else if (1 != *sz) 316 break; 317 318 switch (**start) { 319 case ('3'): 320 /* FALLTHROUGH */ 321 case ('B'): 322 gly = ESCAPE_FONTBOLD; 323 break; 324 case ('2'): 325 /* FALLTHROUGH */ 326 case ('I'): 327 gly = ESCAPE_FONTITALIC; 328 break; 329 case ('P'): 330 gly = ESCAPE_FONTPREV; 331 break; 332 case ('1'): 333 /* FALLTHROUGH */ 334 case ('R'): 335 gly = ESCAPE_FONTROMAN; 336 break; 337 } 338 break; 339 case (ESCAPE_SPECIAL): 340 if (1 == *sz && 'c' == **start) 341 gly = ESCAPE_NOSPACE; 342 break; 343 default: 344 break; 345 } 346 347 return(gly); 348 } 349 350 void * 351 mandoc_calloc(size_t num, size_t size) 352 { 353 void *ptr; 354 355 ptr = calloc(num, size); 356 if (NULL == ptr) { 357 perror(NULL); 358 exit((int)MANDOCLEVEL_SYSERR); 359 } 360 361 return(ptr); 362 } 363 364 365 void * 366 mandoc_malloc(size_t size) 367 { 368 void *ptr; 369 370 ptr = malloc(size); 371 if (NULL == ptr) { 372 perror(NULL); 373 exit((int)MANDOCLEVEL_SYSERR); 374 } 375 376 return(ptr); 377 } 378 379 380 void * 381 mandoc_realloc(void *ptr, size_t size) 382 { 383 384 ptr = realloc(ptr, size); 385 if (NULL == ptr) { 386 perror(NULL); 387 exit((int)MANDOCLEVEL_SYSERR); 388 } 389 390 return(ptr); 391 } 392 393 char * 394 mandoc_strndup(const char *ptr, size_t sz) 395 { 396 char *p; 397 398 p = mandoc_malloc(sz + 1); 399 memcpy(p, ptr, sz); 400 p[(int)sz] = '\0'; 401 return(p); 402 } 403 404 char * 405 mandoc_strdup(const char *ptr) 406 { 407 char *p; 408 409 p = strdup(ptr); 410 if (NULL == p) { 411 perror(NULL); 412 exit((int)MANDOCLEVEL_SYSERR); 413 } 414 415 return(p); 416 } 417 418 /* 419 * Parse a quoted or unquoted roff-style request or macro argument. 420 * Return a pointer to the parsed argument, which is either the original 421 * pointer or advanced by one byte in case the argument is quoted. 422 * Null-terminate the argument in place. 423 * Collapse pairs of quotes inside quoted arguments. 424 * Advance the argument pointer to the next argument, 425 * or to the null byte terminating the argument line. 426 */ 427 char * 428 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 429 { 430 char *start, *cp; 431 int quoted, pairs, white; 432 433 /* Quoting can only start with a new word. */ 434 start = *cpp; 435 quoted = 0; 436 if ('"' == *start) { 437 quoted = 1; 438 start++; 439 } 440 441 pairs = 0; 442 white = 0; 443 for (cp = start; '\0' != *cp; cp++) { 444 445 /* 446 * Move the following text left 447 * after quoted quotes and after "\\" and "\t". 448 */ 449 if (pairs) 450 cp[-pairs] = cp[0]; 451 452 if ('\\' == cp[0]) { 453 /* 454 * In copy mode, translate double to single 455 * backslashes and backslash-t to literal tabs. 456 */ 457 switch (cp[1]) { 458 case ('t'): 459 cp[0] = '\t'; 460 /* FALLTHROUGH */ 461 case ('\\'): 462 pairs++; 463 cp++; 464 break; 465 case (' '): 466 /* Skip escaped blanks. */ 467 if (0 == quoted) 468 cp++; 469 break; 470 default: 471 break; 472 } 473 } else if (0 == quoted) { 474 if (' ' == cp[0]) { 475 /* Unescaped blanks end unquoted args. */ 476 white = 1; 477 break; 478 } 479 } else if ('"' == cp[0]) { 480 if ('"' == cp[1]) { 481 /* Quoted quotes collapse. */ 482 pairs++; 483 cp++; 484 } else { 485 /* Unquoted quotes end quoted args. */ 486 quoted = 2; 487 break; 488 } 489 } 490 } 491 492 /* Quoted argument without a closing quote. */ 493 if (1 == quoted) 494 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); 495 496 /* Null-terminate this argument and move to the next one. */ 497 if (pairs) 498 cp[-pairs] = '\0'; 499 if ('\0' != *cp) { 500 *cp++ = '\0'; 501 while (' ' == *cp) 502 cp++; 503 } 504 *pos += (int)(cp - start) + (quoted ? 1 : 0); 505 *cpp = cp; 506 507 if ('\0' == *cp && (white || ' ' == cp[-1])) 508 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL); 509 510 return(start); 511 } 512 513 static int 514 a2time(time_t *t, const char *fmt, const char *p) 515 { 516 struct tm tm; 517 char *pp; 518 519 memset(&tm, 0, sizeof(struct tm)); 520 521 pp = NULL; 522 #ifdef HAVE_STRPTIME 523 pp = strptime(p, fmt, &tm); 524 #endif 525 if (NULL != pp && '\0' == *pp) { 526 *t = mktime(&tm); 527 return(1); 528 } 529 530 return(0); 531 } 532 533 static char * 534 time2a(time_t t) 535 { 536 struct tm *tm; 537 char *buf, *p; 538 size_t ssz; 539 int isz; 540 541 tm = localtime(&t); 542 543 /* 544 * Reserve space: 545 * up to 9 characters for the month (September) + blank 546 * up to 2 characters for the day + comma + blank 547 * 4 characters for the year and a terminating '\0' 548 */ 549 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 550 551 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 552 goto fail; 553 p += (int)ssz; 554 555 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 556 goto fail; 557 p += isz; 558 559 if (0 == strftime(p, 4 + 1, "%Y", tm)) 560 goto fail; 561 return(buf); 562 563 fail: 564 free(buf); 565 return(NULL); 566 } 567 568 char * 569 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 570 { 571 char *out; 572 time_t t; 573 574 if (NULL == in || '\0' == *in || 575 0 == strcmp(in, "$" "Mdocdate$")) { 576 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL); 577 time(&t); 578 } 579 else if (a2time(&t, "%Y-%m-%d", in)) 580 t = 0; 581 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 582 !a2time(&t, "%b %d, %Y", in)) { 583 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL); 584 t = 0; 585 } 586 out = t ? time2a(t) : NULL; 587 return(out ? out : mandoc_strdup(in)); 588 } 589 590 int 591 mandoc_eos(const char *p, size_t sz, int enclosed) 592 { 593 const char *q; 594 int found; 595 596 if (0 == sz) 597 return(0); 598 599 /* 600 * End-of-sentence recognition must include situations where 601 * some symbols, such as `)', allow prior EOS punctuation to 602 * propagate outward. 603 */ 604 605 found = 0; 606 for (q = p + (int)sz - 1; q >= p; q--) { 607 switch (*q) { 608 case ('\"'): 609 /* FALLTHROUGH */ 610 case ('\''): 611 /* FALLTHROUGH */ 612 case (']'): 613 /* FALLTHROUGH */ 614 case (')'): 615 if (0 == found) 616 enclosed = 1; 617 break; 618 case ('.'): 619 /* FALLTHROUGH */ 620 case ('!'): 621 /* FALLTHROUGH */ 622 case ('?'): 623 found = 1; 624 break; 625 default: 626 return(found && (!enclosed || isalnum((unsigned char)*q))); 627 } 628 } 629 630 return(found && !enclosed); 631 } 632 633 /* 634 * Convert a string to a long that may not be <0. 635 * If the string is invalid, or is less than 0, return -1. 636 */ 637 int 638 mandoc_strntoi(const char *p, size_t sz, int base) 639 { 640 char buf[32]; 641 char *ep; 642 long v; 643 644 if (sz > 31) 645 return(-1); 646 647 memcpy(buf, p, sz); 648 buf[(int)sz] = '\0'; 649 650 errno = 0; 651 v = strtol(buf, &ep, base); 652 653 if (buf[0] == '\0' || *ep != '\0') 654 return(-1); 655 656 if (v > INT_MAX) 657 v = INT_MAX; 658 if (v < INT_MIN) 659 v = INT_MIN; 660 661 return((int)v); 662 } 663