1 /* $OpenBSD: mandoc.c,v 1.84 2019/06/27 15:05:14 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc_aux.h" 30 #include "mandoc.h" 31 #include "roff.h" 32 #include "libmandoc.h" 33 #include "roff_int.h" 34 35 static int a2time(time_t *, const char *, const char *); 36 static char *time2a(time_t); 37 38 39 enum mandoc_esc 40 mandoc_font(const char *cp, int sz) 41 { 42 switch (sz) { 43 case 0: 44 return ESCAPE_FONTPREV; 45 case 1: 46 switch (cp[0]) { 47 case 'B': 48 case '3': 49 return ESCAPE_FONTBOLD; 50 case 'I': 51 case '2': 52 return ESCAPE_FONTITALIC; 53 case 'P': 54 return ESCAPE_FONTPREV; 55 case 'R': 56 case '1': 57 return ESCAPE_FONTROMAN; 58 case '4': 59 return ESCAPE_FONTBI; 60 default: 61 return ESCAPE_ERROR; 62 } 63 case 2: 64 switch (cp[0]) { 65 case 'B': 66 switch (cp[1]) { 67 case 'I': 68 return ESCAPE_FONTBI; 69 default: 70 return ESCAPE_ERROR; 71 } 72 case 'C': 73 switch (cp[1]) { 74 case 'B': 75 return ESCAPE_FONTBOLD; 76 case 'I': 77 return ESCAPE_FONTITALIC; 78 case 'R': 79 case 'W': 80 return ESCAPE_FONTCW; 81 default: 82 return ESCAPE_ERROR; 83 } 84 default: 85 return ESCAPE_ERROR; 86 } 87 default: 88 return ESCAPE_ERROR; 89 } 90 } 91 92 enum mandoc_esc 93 mandoc_escape(const char **end, const char **start, int *sz) 94 { 95 const char *local_start; 96 int local_sz, c, i; 97 char term; 98 enum mandoc_esc gly; 99 100 /* 101 * When the caller doesn't provide return storage, 102 * use local storage. 103 */ 104 105 if (NULL == start) 106 start = &local_start; 107 if (NULL == sz) 108 sz = &local_sz; 109 110 /* 111 * Treat "\E" just like "\"; 112 * it only makes a difference in copy mode. 113 */ 114 115 if (**end == 'E') 116 ++*end; 117 118 /* 119 * Beyond the backslash, at least one input character 120 * is part of the escape sequence. With one exception 121 * (see below), that character won't be returned. 122 */ 123 124 gly = ESCAPE_ERROR; 125 *start = ++*end; 126 *sz = 0; 127 term = '\0'; 128 129 switch ((*start)[-1]) { 130 /* 131 * First the glyphs. There are several different forms of 132 * these, but each eventually returns a substring of the glyph 133 * name. 134 */ 135 case '(': 136 gly = ESCAPE_SPECIAL; 137 *sz = 2; 138 break; 139 case '[': 140 if (**start == ' ') { 141 ++*end; 142 return ESCAPE_ERROR; 143 } 144 gly = ESCAPE_SPECIAL; 145 term = ']'; 146 break; 147 case 'C': 148 if ('\'' != **start) 149 return ESCAPE_ERROR; 150 *start = ++*end; 151 gly = ESCAPE_SPECIAL; 152 term = '\''; 153 break; 154 155 /* 156 * Escapes taking no arguments at all. 157 */ 158 case '!': 159 case '?': 160 return ESCAPE_UNSUPP; 161 case '%': 162 case '&': 163 case ')': 164 case ',': 165 case '/': 166 case '^': 167 case 'a': 168 case 'd': 169 case 'r': 170 case 't': 171 case 'u': 172 case '{': 173 case '|': 174 case '}': 175 return ESCAPE_IGNORE; 176 case 'c': 177 return ESCAPE_NOSPACE; 178 case 'p': 179 return ESCAPE_BREAK; 180 181 /* 182 * The \z escape is supposed to output the following 183 * character without advancing the cursor position. 184 * Since we are mostly dealing with terminal mode, 185 * let us just skip the next character. 186 */ 187 case 'z': 188 return ESCAPE_SKIPCHAR; 189 190 /* 191 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 192 * 'X' is the trigger. These have opaque sub-strings. 193 */ 194 case 'F': 195 case 'f': 196 case 'g': 197 case 'k': 198 case 'M': 199 case 'm': 200 case 'n': 201 case 'O': 202 case 'V': 203 case 'Y': 204 gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE; 205 switch (**start) { 206 case '(': 207 if ((*start)[-1] == 'O') 208 gly = ESCAPE_ERROR; 209 *start = ++*end; 210 *sz = 2; 211 break; 212 case '[': 213 if ((*start)[-1] == 'O') 214 gly = (*start)[1] == '5' ? 215 ESCAPE_UNSUPP : ESCAPE_ERROR; 216 *start = ++*end; 217 term = ']'; 218 break; 219 default: 220 if ((*start)[-1] == 'O') { 221 switch (**start) { 222 case '0': 223 gly = ESCAPE_UNSUPP; 224 break; 225 case '1': 226 case '2': 227 case '3': 228 case '4': 229 break; 230 default: 231 gly = ESCAPE_ERROR; 232 break; 233 } 234 } 235 *sz = 1; 236 break; 237 } 238 break; 239 case '*': 240 if (strncmp(*start, "(.T", 3) != 0) 241 abort(); 242 gly = ESCAPE_DEVICE; 243 *start = ++*end; 244 *sz = 2; 245 break; 246 247 /* 248 * These escapes are of the form \X'Y', where 'X' is the trigger 249 * and 'Y' is any string. These have opaque sub-strings. 250 * The \B and \w escapes are handled in roff.c, roff_res(). 251 */ 252 case 'A': 253 case 'b': 254 case 'D': 255 case 'R': 256 case 'X': 257 case 'Z': 258 gly = ESCAPE_IGNORE; 259 /* FALLTHROUGH */ 260 case 'o': 261 if (**start == '\0') 262 return ESCAPE_ERROR; 263 if (gly == ESCAPE_ERROR) 264 gly = ESCAPE_OVERSTRIKE; 265 term = **start; 266 *start = ++*end; 267 break; 268 269 /* 270 * These escapes are of the form \X'N', where 'X' is the trigger 271 * and 'N' resolves to a numerical expression. 272 */ 273 case 'h': 274 case 'H': 275 case 'L': 276 case 'l': 277 case 'S': 278 case 'v': 279 case 'x': 280 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 281 if ('\0' != **start) 282 ++*end; 283 return ESCAPE_ERROR; 284 } 285 switch ((*start)[-1]) { 286 case 'h': 287 gly = ESCAPE_HORIZ; 288 break; 289 case 'l': 290 gly = ESCAPE_HLINE; 291 break; 292 default: 293 gly = ESCAPE_IGNORE; 294 break; 295 } 296 term = **start; 297 *start = ++*end; 298 break; 299 300 /* 301 * Special handling for the numbered character escape. 302 * XXX Do any other escapes need similar handling? 303 */ 304 case 'N': 305 if ('\0' == **start) 306 return ESCAPE_ERROR; 307 (*end)++; 308 if (isdigit((unsigned char)**start)) { 309 *sz = 1; 310 return ESCAPE_IGNORE; 311 } 312 (*start)++; 313 while (isdigit((unsigned char)**end)) 314 (*end)++; 315 *sz = *end - *start; 316 if ('\0' != **end) 317 (*end)++; 318 return ESCAPE_NUMBERED; 319 320 /* 321 * Sizes get a special category of their own. 322 */ 323 case 's': 324 gly = ESCAPE_IGNORE; 325 326 /* See +/- counts as a sign. */ 327 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 328 *start = ++*end; 329 330 switch (**end) { 331 case '(': 332 *start = ++*end; 333 *sz = 2; 334 break; 335 case '[': 336 *start = ++*end; 337 term = ']'; 338 break; 339 case '\'': 340 *start = ++*end; 341 term = '\''; 342 break; 343 case '3': 344 case '2': 345 case '1': 346 *sz = (*end)[-1] == 's' && 347 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 348 break; 349 default: 350 *sz = 1; 351 break; 352 } 353 354 break; 355 356 /* 357 * Several special characters can be encoded as 358 * one-byte escape sequences without using \[]. 359 */ 360 case ' ': 361 case '\'': 362 case '-': 363 case '.': 364 case '0': 365 case ':': 366 case '_': 367 case '`': 368 case 'e': 369 case '~': 370 gly = ESCAPE_SPECIAL; 371 /* FALLTHROUGH */ 372 default: 373 if (gly == ESCAPE_ERROR) 374 gly = ESCAPE_UNDEF; 375 *start = --*end; 376 *sz = 1; 377 break; 378 } 379 380 /* 381 * Read up to the terminating character, 382 * paying attention to nested escapes. 383 */ 384 385 if ('\0' != term) { 386 while (**end != term) { 387 switch (**end) { 388 case '\0': 389 return ESCAPE_ERROR; 390 case '\\': 391 (*end)++; 392 if (ESCAPE_ERROR == 393 mandoc_escape(end, NULL, NULL)) 394 return ESCAPE_ERROR; 395 break; 396 default: 397 (*end)++; 398 break; 399 } 400 } 401 *sz = (*end)++ - *start; 402 403 /* 404 * The file chars.c only provides one common list 405 * of character names, but \[-] == \- is the only 406 * one of the characters with one-byte names that 407 * allows enclosing the name in brackets. 408 */ 409 if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-') 410 return ESCAPE_ERROR; 411 } else { 412 assert(*sz > 0); 413 if ((size_t)*sz > strlen(*start)) 414 return ESCAPE_ERROR; 415 *end += *sz; 416 } 417 418 /* Run post-processors. */ 419 420 switch (gly) { 421 case ESCAPE_FONT: 422 gly = mandoc_font(*start, *sz); 423 break; 424 case ESCAPE_SPECIAL: 425 if (**start == 'c') { 426 if (*sz < 6 || *sz > 7 || 427 strncmp(*start, "char", 4) != 0 || 428 (int)strspn(*start + 4, "0123456789") + 4 < *sz) 429 break; 430 c = 0; 431 for (i = 4; i < *sz; i++) 432 c = 10 * c + ((*start)[i] - '0'); 433 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) 434 break; 435 *start += 4; 436 *sz -= 4; 437 gly = ESCAPE_NUMBERED; 438 break; 439 } 440 441 /* 442 * Unicode escapes are defined in groff as \[u0000] 443 * to \[u10FFFF], where the contained value must be 444 * a valid Unicode codepoint. Here, however, only 445 * check the length and range. 446 */ 447 if (**start != 'u' || *sz < 5 || *sz > 7) 448 break; 449 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 450 break; 451 if (*sz == 6 && (*start)[1] == '0') 452 break; 453 if (*sz == 5 && (*start)[1] == 'D' && 454 strchr("89ABCDEF", (*start)[2]) != NULL) 455 break; 456 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 457 + 1 == *sz) 458 gly = ESCAPE_UNICODE; 459 break; 460 default: 461 break; 462 } 463 464 return gly; 465 } 466 467 static int 468 a2time(time_t *t, const char *fmt, const char *p) 469 { 470 struct tm tm; 471 char *pp; 472 473 memset(&tm, 0, sizeof(struct tm)); 474 475 pp = strptime(p, fmt, &tm); 476 if (NULL != pp && '\0' == *pp) { 477 *t = mktime(&tm); 478 return 1; 479 } 480 481 return 0; 482 } 483 484 static char * 485 time2a(time_t t) 486 { 487 struct tm *tm; 488 char *buf, *p; 489 size_t ssz; 490 int isz; 491 492 buf = NULL; 493 tm = localtime(&t); 494 if (tm == NULL) 495 goto fail; 496 497 /* 498 * Reserve space: 499 * up to 9 characters for the month (September) + blank 500 * up to 2 characters for the day + comma + blank 501 * 4 characters for the year and a terminating '\0' 502 */ 503 504 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 505 506 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 507 goto fail; 508 p += (int)ssz; 509 510 /* 511 * The output format is just "%d" here, not "%2d" or "%02d". 512 * That's also the reason why we can't just format the 513 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 514 * Besides, the present approach is less prone to buffer 515 * overflows, in case anybody should ever introduce the bug 516 * of looking at LC_TIME. 517 */ 518 519 isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday); 520 if (isz < 0 || isz > 4) 521 goto fail; 522 p += isz; 523 524 if (strftime(p, 4 + 1, "%Y", tm) == 0) 525 goto fail; 526 return buf; 527 528 fail: 529 free(buf); 530 return mandoc_strdup(""); 531 } 532 533 char * 534 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos) 535 { 536 char *cp; 537 time_t t; 538 539 if (man->quick) 540 return mandoc_strdup(in == NULL ? "" : in); 541 542 /* No date specified: use today's date. */ 543 544 if (in == NULL || *in == '\0') 545 mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL); 546 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) 547 return time2a(time(NULL)); 548 549 /* Valid mdoc(7) date format. */ 550 551 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) || 552 a2time(&t, "%b %d, %Y", in)) { 553 cp = time2a(t); 554 if (t > time(NULL) + 86400) 555 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp); 556 else if (*in != '$' && strcmp(in, cp) != 0) 557 mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp); 558 return cp; 559 } 560 561 /* In man(7), do not warn about the legacy format. */ 562 563 if (a2time(&t, "%Y-%m-%d", in) == 0) 564 mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in); 565 else if (t > time(NULL) + 86400) 566 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in); 567 else if (man->meta.macroset == MACROSET_MDOC) 568 mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in); 569 570 /* Use any non-mdoc(7) date verbatim. */ 571 572 return mandoc_strdup(in); 573 } 574 575 int 576 mandoc_eos(const char *p, size_t sz) 577 { 578 const char *q; 579 int enclosed, found; 580 581 if (0 == sz) 582 return 0; 583 584 /* 585 * End-of-sentence recognition must include situations where 586 * some symbols, such as `)', allow prior EOS punctuation to 587 * propagate outward. 588 */ 589 590 enclosed = found = 0; 591 for (q = p + (int)sz - 1; q >= p; q--) { 592 switch (*q) { 593 case '\"': 594 case '\'': 595 case ']': 596 case ')': 597 if (0 == found) 598 enclosed = 1; 599 break; 600 case '.': 601 case '!': 602 case '?': 603 found = 1; 604 break; 605 default: 606 return found && 607 (!enclosed || isalnum((unsigned char)*q)); 608 } 609 } 610 611 return found && !enclosed; 612 } 613 614 /* 615 * Convert a string to a long that may not be <0. 616 * If the string is invalid, or is less than 0, return -1. 617 */ 618 int 619 mandoc_strntoi(const char *p, size_t sz, int base) 620 { 621 char buf[32]; 622 char *ep; 623 long v; 624 625 if (sz > 31) 626 return -1; 627 628 memcpy(buf, p, sz); 629 buf[(int)sz] = '\0'; 630 631 errno = 0; 632 v = strtol(buf, &ep, base); 633 634 if (buf[0] == '\0' || *ep != '\0') 635 return -1; 636 637 if (v > INT_MAX) 638 v = INT_MAX; 639 if (v < INT_MIN) 640 v = INT_MIN; 641 642 return (int)v; 643 } 644