1 /* $Id: mandoc.c,v 1.68 2013/08/08 20:07:47 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <errno.h> 27 #include <limits.h> 28 #include <stdlib.h> 29 #include <stdio.h> 30 #include <string.h> 31 #include <time.h> 32 33 #include "mandoc.h" 34 #include "libmandoc.h" 35 36 #define DATESIZE 32 37 38 static int a2time(time_t *, const char *, const char *); 39 static char *time2a(time_t); 40 41 42 enum mandoc_esc 43 mandoc_escape(const char **end, const char **start, int *sz) 44 { 45 const char *local_start; 46 int local_sz; 47 char term; 48 enum mandoc_esc gly; 49 50 /* 51 * When the caller doesn't provide return storage, 52 * use local storage. 53 */ 54 55 if (NULL == start) 56 start = &local_start; 57 if (NULL == sz) 58 sz = &local_sz; 59 60 /* 61 * Beyond the backslash, at least one input character 62 * is part of the escape sequence. With one exception 63 * (see below), that character won't be returned. 64 */ 65 66 gly = ESCAPE_ERROR; 67 *start = ++*end; 68 *sz = 0; 69 term = '\0'; 70 71 switch ((*start)[-1]) { 72 /* 73 * First the glyphs. There are several different forms of 74 * these, but each eventually returns a substring of the glyph 75 * name. 76 */ 77 case ('('): 78 gly = ESCAPE_SPECIAL; 79 *sz = 2; 80 break; 81 case ('['): 82 gly = ESCAPE_SPECIAL; 83 /* 84 * Unicode escapes are defined in groff as \[uXXXX] to 85 * \[u10FFFF], where the contained value must be a valid 86 * Unicode codepoint. Here, however, only check whether 87 * it's not a zero-width escape. 88 */ 89 if ('u' == (*start)[0] && ']' != (*start)[1]) 90 gly = ESCAPE_UNICODE; 91 term = ']'; 92 break; 93 case ('C'): 94 if ('\'' != **start) 95 return(ESCAPE_ERROR); 96 gly = ESCAPE_SPECIAL; 97 *start = ++*end; 98 term = '\''; 99 break; 100 101 /* 102 * The \z escape is supposed to output the following 103 * character without advancing the cursor position. 104 * Since we are mostly dealing with terminal mode, 105 * let us just skip the next character. 106 */ 107 case ('z'): 108 return(ESCAPE_SKIPCHAR); 109 110 /* 111 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 112 * 'X' is the trigger. These have opaque sub-strings. 113 */ 114 case ('F'): 115 /* FALLTHROUGH */ 116 case ('g'): 117 /* FALLTHROUGH */ 118 case ('k'): 119 /* FALLTHROUGH */ 120 case ('M'): 121 /* FALLTHROUGH */ 122 case ('m'): 123 /* FALLTHROUGH */ 124 case ('n'): 125 /* FALLTHROUGH */ 126 case ('V'): 127 /* FALLTHROUGH */ 128 case ('Y'): 129 gly = ESCAPE_IGNORE; 130 /* FALLTHROUGH */ 131 case ('f'): 132 if (ESCAPE_ERROR == gly) 133 gly = ESCAPE_FONT; 134 switch (**start) { 135 case ('('): 136 *start = ++*end; 137 *sz = 2; 138 break; 139 case ('['): 140 *start = ++*end; 141 term = ']'; 142 break; 143 default: 144 *sz = 1; 145 break; 146 } 147 break; 148 149 /* 150 * These escapes are of the form \X'Y', where 'X' is the trigger 151 * and 'Y' is any string. These have opaque sub-strings. 152 */ 153 case ('A'): 154 /* FALLTHROUGH */ 155 case ('b'): 156 /* FALLTHROUGH */ 157 case ('D'): 158 /* FALLTHROUGH */ 159 case ('o'): 160 /* FALLTHROUGH */ 161 case ('R'): 162 /* FALLTHROUGH */ 163 case ('X'): 164 /* FALLTHROUGH */ 165 case ('Z'): 166 if ('\'' != **start) 167 return(ESCAPE_ERROR); 168 gly = ESCAPE_IGNORE; 169 *start = ++*end; 170 term = '\''; 171 break; 172 173 /* 174 * These escapes are of the form \X'N', where 'X' is the trigger 175 * and 'N' resolves to a numerical expression. 176 */ 177 case ('B'): 178 /* FALLTHROUGH */ 179 case ('h'): 180 /* FALLTHROUGH */ 181 case ('H'): 182 /* FALLTHROUGH */ 183 case ('L'): 184 /* FALLTHROUGH */ 185 case ('l'): 186 gly = ESCAPE_NUMBERED; 187 /* FALLTHROUGH */ 188 case ('S'): 189 /* FALLTHROUGH */ 190 case ('v'): 191 /* FALLTHROUGH */ 192 case ('w'): 193 /* FALLTHROUGH */ 194 case ('x'): 195 if ('\'' != **start) 196 return(ESCAPE_ERROR); 197 if (ESCAPE_ERROR == gly) 198 gly = ESCAPE_IGNORE; 199 *start = ++*end; 200 term = '\''; 201 break; 202 203 /* 204 * Special handling for the numbered character escape. 205 * XXX Do any other escapes need similar handling? 206 */ 207 case ('N'): 208 if ('\0' == **start) 209 return(ESCAPE_ERROR); 210 (*end)++; 211 if (isdigit((unsigned char)**start)) { 212 *sz = 1; 213 return(ESCAPE_IGNORE); 214 } 215 (*start)++; 216 while (isdigit((unsigned char)**end)) 217 (*end)++; 218 *sz = *end - *start; 219 if ('\0' != **end) 220 (*end)++; 221 return(ESCAPE_NUMBERED); 222 223 /* 224 * Sizes get a special category of their own. 225 */ 226 case ('s'): 227 gly = ESCAPE_IGNORE; 228 229 /* See +/- counts as a sign. */ 230 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 231 (*end)++; 232 233 switch (**end) { 234 case ('('): 235 *start = ++*end; 236 *sz = 2; 237 break; 238 case ('['): 239 *start = ++*end; 240 term = ']'; 241 break; 242 case ('\''): 243 *start = ++*end; 244 term = '\''; 245 break; 246 default: 247 *sz = 1; 248 break; 249 } 250 251 break; 252 253 /* 254 * Anything else is assumed to be a glyph. 255 * In this case, pass back the character after the backslash. 256 */ 257 default: 258 gly = ESCAPE_SPECIAL; 259 *start = --*end; 260 *sz = 1; 261 break; 262 } 263 264 assert(ESCAPE_ERROR != gly); 265 266 /* 267 * Read up to the terminating character, 268 * paying attention to nested escapes. 269 */ 270 271 if ('\0' != term) { 272 while (**end != term) { 273 switch (**end) { 274 case ('\0'): 275 return(ESCAPE_ERROR); 276 case ('\\'): 277 (*end)++; 278 if (ESCAPE_ERROR == 279 mandoc_escape(end, NULL, NULL)) 280 return(ESCAPE_ERROR); 281 break; 282 default: 283 (*end)++; 284 break; 285 } 286 } 287 *sz = (*end)++ - *start; 288 } else { 289 assert(*sz > 0); 290 if ((size_t)*sz > strlen(*start)) 291 return(ESCAPE_ERROR); 292 *end += *sz; 293 } 294 295 /* Run post-processors. */ 296 297 switch (gly) { 298 case (ESCAPE_FONT): 299 if (2 == *sz) { 300 if ('C' == **start) { 301 /* 302 * Treat constant-width font modes 303 * just like regular font modes. 304 */ 305 (*start)++; 306 (*sz)--; 307 } else { 308 if ('B' == (*start)[0] && 'I' == (*start)[1]) 309 gly = ESCAPE_FONTBI; 310 break; 311 } 312 } else if (1 != *sz) 313 break; 314 315 switch (**start) { 316 case ('3'): 317 /* FALLTHROUGH */ 318 case ('B'): 319 gly = ESCAPE_FONTBOLD; 320 break; 321 case ('2'): 322 /* FALLTHROUGH */ 323 case ('I'): 324 gly = ESCAPE_FONTITALIC; 325 break; 326 case ('P'): 327 gly = ESCAPE_FONTPREV; 328 break; 329 case ('1'): 330 /* FALLTHROUGH */ 331 case ('R'): 332 gly = ESCAPE_FONTROMAN; 333 break; 334 } 335 break; 336 case (ESCAPE_SPECIAL): 337 if (1 == *sz && 'c' == **start) 338 gly = ESCAPE_NOSPACE; 339 break; 340 default: 341 break; 342 } 343 344 return(gly); 345 } 346 347 void * 348 mandoc_calloc(size_t num, size_t size) 349 { 350 void *ptr; 351 352 ptr = calloc(num, size); 353 if (NULL == ptr) { 354 perror(NULL); 355 exit((int)MANDOCLEVEL_SYSERR); 356 } 357 358 return(ptr); 359 } 360 361 362 void * 363 mandoc_malloc(size_t size) 364 { 365 void *ptr; 366 367 ptr = malloc(size); 368 if (NULL == ptr) { 369 perror(NULL); 370 exit((int)MANDOCLEVEL_SYSERR); 371 } 372 373 return(ptr); 374 } 375 376 377 void * 378 mandoc_realloc(void *ptr, size_t size) 379 { 380 381 ptr = realloc(ptr, size); 382 if (NULL == ptr) { 383 perror(NULL); 384 exit((int)MANDOCLEVEL_SYSERR); 385 } 386 387 return(ptr); 388 } 389 390 char * 391 mandoc_strndup(const char *ptr, size_t sz) 392 { 393 char *p; 394 395 p = mandoc_malloc(sz + 1); 396 memcpy(p, ptr, sz); 397 p[(int)sz] = '\0'; 398 return(p); 399 } 400 401 char * 402 mandoc_strdup(const char *ptr) 403 { 404 char *p; 405 406 p = strdup(ptr); 407 if (NULL == p) { 408 perror(NULL); 409 exit((int)MANDOCLEVEL_SYSERR); 410 } 411 412 return(p); 413 } 414 415 /* 416 * Parse a quoted or unquoted roff-style request or macro argument. 417 * Return a pointer to the parsed argument, which is either the original 418 * pointer or advanced by one byte in case the argument is quoted. 419 * Null-terminate the argument in place. 420 * Collapse pairs of quotes inside quoted arguments. 421 * Advance the argument pointer to the next argument, 422 * or to the null byte terminating the argument line. 423 */ 424 char * 425 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 426 { 427 char *start, *cp; 428 int quoted, pairs, white; 429 430 /* Quoting can only start with a new word. */ 431 start = *cpp; 432 quoted = 0; 433 if ('"' == *start) { 434 quoted = 1; 435 start++; 436 } 437 438 pairs = 0; 439 white = 0; 440 for (cp = start; '\0' != *cp; cp++) { 441 442 /* 443 * Move the following text left 444 * after quoted quotes and after "\\" and "\t". 445 */ 446 if (pairs) 447 cp[-pairs] = cp[0]; 448 449 if ('\\' == cp[0]) { 450 /* 451 * In copy mode, translate double to single 452 * backslashes and backslash-t to literal tabs. 453 */ 454 switch (cp[1]) { 455 case ('t'): 456 cp[0] = '\t'; 457 /* FALLTHROUGH */ 458 case ('\\'): 459 pairs++; 460 cp++; 461 break; 462 case (' '): 463 /* Skip escaped blanks. */ 464 if (0 == quoted) 465 cp++; 466 break; 467 default: 468 break; 469 } 470 } else if (0 == quoted) { 471 if (' ' == cp[0]) { 472 /* Unescaped blanks end unquoted args. */ 473 white = 1; 474 break; 475 } 476 } else if ('"' == cp[0]) { 477 if ('"' == cp[1]) { 478 /* Quoted quotes collapse. */ 479 pairs++; 480 cp++; 481 } else { 482 /* Unquoted quotes end quoted args. */ 483 quoted = 2; 484 break; 485 } 486 } 487 } 488 489 /* Quoted argument without a closing quote. */ 490 if (1 == quoted) 491 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); 492 493 /* Null-terminate this argument and move to the next one. */ 494 if (pairs) 495 cp[-pairs] = '\0'; 496 if ('\0' != *cp) { 497 *cp++ = '\0'; 498 while (' ' == *cp) 499 cp++; 500 } 501 *pos += (int)(cp - start) + (quoted ? 1 : 0); 502 *cpp = cp; 503 504 if ('\0' == *cp && (white || ' ' == cp[-1])) 505 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL); 506 507 return(start); 508 } 509 510 static int 511 a2time(time_t *t, const char *fmt, const char *p) 512 { 513 struct tm tm; 514 char *pp; 515 516 memset(&tm, 0, sizeof(struct tm)); 517 518 pp = NULL; 519 #ifdef HAVE_STRPTIME 520 pp = strptime(p, fmt, &tm); 521 #endif 522 if (NULL != pp && '\0' == *pp) { 523 *t = mktime(&tm); 524 return(1); 525 } 526 527 return(0); 528 } 529 530 static char * 531 time2a(time_t t) 532 { 533 struct tm *tm; 534 char *buf, *p; 535 size_t ssz; 536 int isz; 537 538 tm = localtime(&t); 539 540 /* 541 * Reserve space: 542 * up to 9 characters for the month (September) + blank 543 * up to 2 characters for the day + comma + blank 544 * 4 characters for the year and a terminating '\0' 545 */ 546 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 547 548 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 549 goto fail; 550 p += (int)ssz; 551 552 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 553 goto fail; 554 p += isz; 555 556 if (0 == strftime(p, 4 + 1, "%Y", tm)) 557 goto fail; 558 return(buf); 559 560 fail: 561 free(buf); 562 return(NULL); 563 } 564 565 char * 566 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 567 { 568 char *out; 569 time_t t; 570 571 if (NULL == in || '\0' == *in || 572 0 == strcmp(in, "$" "Mdocdate$")) { 573 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL); 574 time(&t); 575 } 576 else if (a2time(&t, "%Y-%m-%d", in)) 577 t = 0; 578 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 579 !a2time(&t, "%b %d, %Y", in)) { 580 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL); 581 t = 0; 582 } 583 out = t ? time2a(t) : NULL; 584 return(out ? out : mandoc_strdup(in)); 585 } 586 587 int 588 mandoc_eos(const char *p, size_t sz, int enclosed) 589 { 590 const char *q; 591 int found; 592 593 if (0 == sz) 594 return(0); 595 596 /* 597 * End-of-sentence recognition must include situations where 598 * some symbols, such as `)', allow prior EOS punctuation to 599 * propagate outward. 600 */ 601 602 found = 0; 603 for (q = p + (int)sz - 1; q >= p; q--) { 604 switch (*q) { 605 case ('\"'): 606 /* FALLTHROUGH */ 607 case ('\''): 608 /* FALLTHROUGH */ 609 case (']'): 610 /* FALLTHROUGH */ 611 case (')'): 612 if (0 == found) 613 enclosed = 1; 614 break; 615 case ('.'): 616 /* FALLTHROUGH */ 617 case ('!'): 618 /* FALLTHROUGH */ 619 case ('?'): 620 found = 1; 621 break; 622 default: 623 return(found && (!enclosed || isalnum((unsigned char)*q))); 624 } 625 } 626 627 return(found && !enclosed); 628 } 629 630 /* 631 * Convert a string to a long that may not be <0. 632 * If the string is invalid, or is less than 0, return -1. 633 */ 634 int 635 mandoc_strntoi(const char *p, size_t sz, int base) 636 { 637 char buf[32]; 638 char *ep; 639 long v; 640 641 if (sz > 31) 642 return(-1); 643 644 memcpy(buf, p, sz); 645 buf[(int)sz] = '\0'; 646 647 errno = 0; 648 v = strtol(buf, &ep, base); 649 650 if (buf[0] == '\0' || *ep != '\0') 651 return(-1); 652 653 if (v > INT_MAX) 654 v = INT_MAX; 655 if (v < INT_MIN) 656 v = INT_MIN; 657 658 return((int)v); 659 } 660