1 /* $OpenBSD: mandoc.c,v 1.60 2015/02/20 23:51:54 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc.h" 30 #include "mandoc_aux.h" 31 #include "libmandoc.h" 32 33 #define DATESIZE 32 34 35 static int a2time(time_t *, const char *, const char *); 36 static char *time2a(time_t); 37 38 39 enum mandoc_esc 40 mandoc_escape(const char **end, const char **start, int *sz) 41 { 42 const char *local_start; 43 int local_sz; 44 char term; 45 enum mandoc_esc gly; 46 47 /* 48 * When the caller doesn't provide return storage, 49 * use local storage. 50 */ 51 52 if (NULL == start) 53 start = &local_start; 54 if (NULL == sz) 55 sz = &local_sz; 56 57 /* 58 * Beyond the backslash, at least one input character 59 * is part of the escape sequence. With one exception 60 * (see below), that character won't be returned. 61 */ 62 63 gly = ESCAPE_ERROR; 64 *start = ++*end; 65 *sz = 0; 66 term = '\0'; 67 68 switch ((*start)[-1]) { 69 /* 70 * First the glyphs. There are several different forms of 71 * these, but each eventually returns a substring of the glyph 72 * name. 73 */ 74 case '(': 75 gly = ESCAPE_SPECIAL; 76 *sz = 2; 77 break; 78 case '[': 79 gly = ESCAPE_SPECIAL; 80 term = ']'; 81 break; 82 case 'C': 83 if ('\'' != **start) 84 return(ESCAPE_ERROR); 85 *start = ++*end; 86 gly = ESCAPE_SPECIAL; 87 term = '\''; 88 break; 89 90 /* 91 * Escapes taking no arguments at all. 92 */ 93 case 'd': 94 /* FALLTHROUGH */ 95 case 'u': 96 return(ESCAPE_IGNORE); 97 98 /* 99 * The \z escape is supposed to output the following 100 * character without advancing the cursor position. 101 * Since we are mostly dealing with terminal mode, 102 * let us just skip the next character. 103 */ 104 case 'z': 105 return(ESCAPE_SKIPCHAR); 106 107 /* 108 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 109 * 'X' is the trigger. These have opaque sub-strings. 110 */ 111 case 'F': 112 /* FALLTHROUGH */ 113 case 'g': 114 /* FALLTHROUGH */ 115 case 'k': 116 /* FALLTHROUGH */ 117 case 'M': 118 /* FALLTHROUGH */ 119 case 'm': 120 /* FALLTHROUGH */ 121 case 'n': 122 /* FALLTHROUGH */ 123 case 'V': 124 /* FALLTHROUGH */ 125 case 'Y': 126 gly = ESCAPE_IGNORE; 127 /* FALLTHROUGH */ 128 case 'f': 129 if (ESCAPE_ERROR == gly) 130 gly = ESCAPE_FONT; 131 switch (**start) { 132 case '(': 133 *start = ++*end; 134 *sz = 2; 135 break; 136 case '[': 137 *start = ++*end; 138 term = ']'; 139 break; 140 default: 141 *sz = 1; 142 break; 143 } 144 break; 145 146 /* 147 * These escapes are of the form \X'Y', where 'X' is the trigger 148 * and 'Y' is any string. These have opaque sub-strings. 149 * The \B and \w escapes are handled in roff.c, roff_res(). 150 */ 151 case 'A': 152 /* FALLTHROUGH */ 153 case 'b': 154 /* FALLTHROUGH */ 155 case 'D': 156 /* FALLTHROUGH */ 157 case 'R': 158 /* FALLTHROUGH */ 159 case 'X': 160 /* FALLTHROUGH */ 161 case 'Z': 162 gly = ESCAPE_IGNORE; 163 /* FALLTHROUGH */ 164 case 'o': 165 if (**start == '\0') 166 return(ESCAPE_ERROR); 167 if (gly == ESCAPE_ERROR) 168 gly = ESCAPE_OVERSTRIKE; 169 term = **start; 170 *start = ++*end; 171 break; 172 173 /* 174 * These escapes are of the form \X'N', where 'X' is the trigger 175 * and 'N' resolves to a numerical expression. 176 */ 177 case 'h': 178 /* FALLTHROUGH */ 179 case 'H': 180 /* FALLTHROUGH */ 181 case 'L': 182 /* FALLTHROUGH */ 183 case 'l': 184 /* FALLTHROUGH */ 185 case 'S': 186 /* FALLTHROUGH */ 187 case 'v': 188 /* FALLTHROUGH */ 189 case 'x': 190 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 191 if ('\0' != **start) 192 ++*end; 193 return(ESCAPE_ERROR); 194 } 195 gly = ESCAPE_IGNORE; 196 term = **start; 197 *start = ++*end; 198 break; 199 200 /* 201 * Special handling for the numbered character escape. 202 * XXX Do any other escapes need similar handling? 203 */ 204 case 'N': 205 if ('\0' == **start) 206 return(ESCAPE_ERROR); 207 (*end)++; 208 if (isdigit((unsigned char)**start)) { 209 *sz = 1; 210 return(ESCAPE_IGNORE); 211 } 212 (*start)++; 213 while (isdigit((unsigned char)**end)) 214 (*end)++; 215 *sz = *end - *start; 216 if ('\0' != **end) 217 (*end)++; 218 return(ESCAPE_NUMBERED); 219 220 /* 221 * Sizes get a special category of their own. 222 */ 223 case 's': 224 gly = ESCAPE_IGNORE; 225 226 /* See +/- counts as a sign. */ 227 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 228 *start = ++*end; 229 230 switch (**end) { 231 case '(': 232 *start = ++*end; 233 *sz = 2; 234 break; 235 case '[': 236 *start = ++*end; 237 term = ']'; 238 break; 239 case '\'': 240 *start = ++*end; 241 term = '\''; 242 break; 243 case '3': 244 /* FALLTHROUGH */ 245 case '2': 246 /* FALLTHROUGH */ 247 case '1': 248 *sz = (*end)[-1] == 's' && 249 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 250 break; 251 default: 252 *sz = 1; 253 break; 254 } 255 256 break; 257 258 /* 259 * Anything else is assumed to be a glyph. 260 * In this case, pass back the character after the backslash. 261 */ 262 default: 263 gly = ESCAPE_SPECIAL; 264 *start = --*end; 265 *sz = 1; 266 break; 267 } 268 269 assert(ESCAPE_ERROR != gly); 270 271 /* 272 * Read up to the terminating character, 273 * paying attention to nested escapes. 274 */ 275 276 if ('\0' != term) { 277 while (**end != term) { 278 switch (**end) { 279 case '\0': 280 return(ESCAPE_ERROR); 281 case '\\': 282 (*end)++; 283 if (ESCAPE_ERROR == 284 mandoc_escape(end, NULL, NULL)) 285 return(ESCAPE_ERROR); 286 break; 287 default: 288 (*end)++; 289 break; 290 } 291 } 292 *sz = (*end)++ - *start; 293 } else { 294 assert(*sz > 0); 295 if ((size_t)*sz > strlen(*start)) 296 return(ESCAPE_ERROR); 297 *end += *sz; 298 } 299 300 /* Run post-processors. */ 301 302 switch (gly) { 303 case ESCAPE_FONT: 304 if (2 == *sz) { 305 if ('C' == **start) { 306 /* 307 * Treat constant-width font modes 308 * just like regular font modes. 309 */ 310 (*start)++; 311 (*sz)--; 312 } else { 313 if ('B' == (*start)[0] && 'I' == (*start)[1]) 314 gly = ESCAPE_FONTBI; 315 break; 316 } 317 } else if (1 != *sz) 318 break; 319 320 switch (**start) { 321 case '3': 322 /* FALLTHROUGH */ 323 case 'B': 324 gly = ESCAPE_FONTBOLD; 325 break; 326 case '2': 327 /* FALLTHROUGH */ 328 case 'I': 329 gly = ESCAPE_FONTITALIC; 330 break; 331 case 'P': 332 gly = ESCAPE_FONTPREV; 333 break; 334 case '1': 335 /* FALLTHROUGH */ 336 case 'R': 337 gly = ESCAPE_FONTROMAN; 338 break; 339 } 340 break; 341 case ESCAPE_SPECIAL: 342 if (1 == *sz && 'c' == **start) 343 gly = ESCAPE_NOSPACE; 344 /* 345 * Unicode escapes are defined in groff as \[u0000] 346 * to \[u10FFFF], where the contained value must be 347 * a valid Unicode codepoint. Here, however, only 348 * check the length and range. 349 */ 350 if (**start != 'u' || *sz < 5 || *sz > 7) 351 break; 352 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 353 break; 354 if (*sz == 6 && (*start)[1] == '0') 355 break; 356 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 357 + 1 == *sz) 358 gly = ESCAPE_UNICODE; 359 break; 360 default: 361 break; 362 } 363 364 return(gly); 365 } 366 367 /* 368 * Parse a quoted or unquoted roff-style request or macro argument. 369 * Return a pointer to the parsed argument, which is either the original 370 * pointer or advanced by one byte in case the argument is quoted. 371 * NUL-terminate the argument in place. 372 * Collapse pairs of quotes inside quoted arguments. 373 * Advance the argument pointer to the next argument, 374 * or to the NUL byte terminating the argument line. 375 */ 376 char * 377 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 378 { 379 char *start, *cp; 380 int quoted, pairs, white; 381 382 /* Quoting can only start with a new word. */ 383 start = *cpp; 384 quoted = 0; 385 if ('"' == *start) { 386 quoted = 1; 387 start++; 388 } 389 390 pairs = 0; 391 white = 0; 392 for (cp = start; '\0' != *cp; cp++) { 393 394 /* 395 * Move the following text left 396 * after quoted quotes and after "\\" and "\t". 397 */ 398 if (pairs) 399 cp[-pairs] = cp[0]; 400 401 if ('\\' == cp[0]) { 402 /* 403 * In copy mode, translate double to single 404 * backslashes and backslash-t to literal tabs. 405 */ 406 switch (cp[1]) { 407 case 't': 408 cp[0] = '\t'; 409 /* FALLTHROUGH */ 410 case '\\': 411 pairs++; 412 cp++; 413 break; 414 case ' ': 415 /* Skip escaped blanks. */ 416 if (0 == quoted) 417 cp++; 418 break; 419 default: 420 break; 421 } 422 } else if (0 == quoted) { 423 if (' ' == cp[0]) { 424 /* Unescaped blanks end unquoted args. */ 425 white = 1; 426 break; 427 } 428 } else if ('"' == cp[0]) { 429 if ('"' == cp[1]) { 430 /* Quoted quotes collapse. */ 431 pairs++; 432 cp++; 433 } else { 434 /* Unquoted quotes end quoted args. */ 435 quoted = 2; 436 break; 437 } 438 } 439 } 440 441 /* Quoted argument without a closing quote. */ 442 if (1 == quoted) 443 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL); 444 445 /* NUL-terminate this argument and move to the next one. */ 446 if (pairs) 447 cp[-pairs] = '\0'; 448 if ('\0' != *cp) { 449 *cp++ = '\0'; 450 while (' ' == *cp) 451 cp++; 452 } 453 *pos += (int)(cp - start) + (quoted ? 1 : 0); 454 *cpp = cp; 455 456 if ('\0' == *cp && (white || ' ' == cp[-1])) 457 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL); 458 459 return(start); 460 } 461 462 static int 463 a2time(time_t *t, const char *fmt, const char *p) 464 { 465 struct tm tm; 466 char *pp; 467 468 memset(&tm, 0, sizeof(struct tm)); 469 470 pp = strptime(p, fmt, &tm); 471 if (NULL != pp && '\0' == *pp) { 472 *t = mktime(&tm); 473 return(1); 474 } 475 476 return(0); 477 } 478 479 static char * 480 time2a(time_t t) 481 { 482 struct tm *tm; 483 char *buf, *p; 484 size_t ssz; 485 int isz; 486 487 tm = localtime(&t); 488 if (tm == NULL) 489 return(NULL); 490 491 /* 492 * Reserve space: 493 * up to 9 characters for the month (September) + blank 494 * up to 2 characters for the day + comma + blank 495 * 4 characters for the year and a terminating '\0' 496 */ 497 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 498 499 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 500 goto fail; 501 p += (int)ssz; 502 503 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 504 goto fail; 505 p += isz; 506 507 if (0 == strftime(p, 4 + 1, "%Y", tm)) 508 goto fail; 509 return(buf); 510 511 fail: 512 free(buf); 513 return(NULL); 514 } 515 516 char * 517 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 518 { 519 char *out; 520 time_t t; 521 522 if (NULL == in || '\0' == *in || 523 0 == strcmp(in, "$" "Mdocdate$")) { 524 mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL); 525 time(&t); 526 } 527 else if (a2time(&t, "%Y-%m-%d", in)) 528 t = 0; 529 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 530 !a2time(&t, "%b %d, %Y", in)) { 531 mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in); 532 t = 0; 533 } 534 out = t ? time2a(t) : NULL; 535 return(out ? out : mandoc_strdup(in)); 536 } 537 538 int 539 mandoc_eos(const char *p, size_t sz) 540 { 541 const char *q; 542 int enclosed, found; 543 544 if (0 == sz) 545 return(0); 546 547 /* 548 * End-of-sentence recognition must include situations where 549 * some symbols, such as `)', allow prior EOS punctuation to 550 * propagate outward. 551 */ 552 553 enclosed = found = 0; 554 for (q = p + (int)sz - 1; q >= p; q--) { 555 switch (*q) { 556 case '\"': 557 /* FALLTHROUGH */ 558 case '\'': 559 /* FALLTHROUGH */ 560 case ']': 561 /* FALLTHROUGH */ 562 case ')': 563 if (0 == found) 564 enclosed = 1; 565 break; 566 case '.': 567 /* FALLTHROUGH */ 568 case '!': 569 /* FALLTHROUGH */ 570 case '?': 571 found = 1; 572 break; 573 default: 574 return(found && (!enclosed || isalnum((unsigned char)*q))); 575 } 576 } 577 578 return(found && !enclosed); 579 } 580 581 /* 582 * Convert a string to a long that may not be <0. 583 * If the string is invalid, or is less than 0, return -1. 584 */ 585 int 586 mandoc_strntoi(const char *p, size_t sz, int base) 587 { 588 char buf[32]; 589 char *ep; 590 long v; 591 592 if (sz > 31) 593 return(-1); 594 595 memcpy(buf, p, sz); 596 buf[(int)sz] = '\0'; 597 598 errno = 0; 599 v = strtol(buf, &ep, base); 600 601 if (buf[0] == '\0' || *ep != '\0') 602 return(-1); 603 604 if (v > INT_MAX) 605 v = INT_MAX; 606 if (v < INT_MIN) 607 v = INT_MIN; 608 609 return((int)v); 610 } 611