1 /* $Id: mandoc.c,v 1.83 2014/07/06 19:09:00 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <errno.h> 27 #include <limits.h> 28 #include <stdlib.h> 29 #include <stdio.h> 30 #include <string.h> 31 #include <time.h> 32 33 #include "mandoc.h" 34 #include "mandoc_aux.h" 35 #include "libmandoc.h" 36 37 #define DATESIZE 32 38 39 static int a2time(time_t *, const char *, const char *); 40 static char *time2a(time_t); 41 42 43 enum mandoc_esc 44 mandoc_escape(const char **end, const char **start, int *sz) 45 { 46 const char *local_start; 47 int local_sz; 48 char term; 49 enum mandoc_esc gly; 50 51 /* 52 * When the caller doesn't provide return storage, 53 * use local storage. 54 */ 55 56 if (NULL == start) 57 start = &local_start; 58 if (NULL == sz) 59 sz = &local_sz; 60 61 /* 62 * Beyond the backslash, at least one input character 63 * is part of the escape sequence. With one exception 64 * (see below), that character won't be returned. 65 */ 66 67 gly = ESCAPE_ERROR; 68 *start = ++*end; 69 *sz = 0; 70 term = '\0'; 71 72 switch ((*start)[-1]) { 73 /* 74 * First the glyphs. There are several different forms of 75 * these, but each eventually returns a substring of the glyph 76 * name. 77 */ 78 case '(': 79 gly = ESCAPE_SPECIAL; 80 *sz = 2; 81 break; 82 case '[': 83 gly = ESCAPE_SPECIAL; 84 /* 85 * Unicode escapes are defined in groff as \[uXXXX] to 86 * \[u10FFFF], where the contained value must be a valid 87 * Unicode codepoint. Here, however, only check whether 88 * it's not a zero-width escape. 89 */ 90 if ('u' == (*start)[0] && ']' != (*start)[1]) 91 gly = ESCAPE_UNICODE; 92 term = ']'; 93 break; 94 case 'C': 95 if ('\'' != **start) 96 return(ESCAPE_ERROR); 97 *start = ++*end; 98 if ('u' == (*start)[0] && '\'' != (*start)[1]) 99 gly = ESCAPE_UNICODE; 100 else 101 gly = ESCAPE_SPECIAL; 102 term = '\''; 103 break; 104 105 /* 106 * Escapes taking no arguments at all. 107 */ 108 case 'd': 109 /* FALLTHROUGH */ 110 case 'u': 111 return(ESCAPE_IGNORE); 112 113 /* 114 * The \z escape is supposed to output the following 115 * character without advancing the cursor position. 116 * Since we are mostly dealing with terminal mode, 117 * let us just skip the next character. 118 */ 119 case 'z': 120 return(ESCAPE_SKIPCHAR); 121 122 /* 123 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 124 * 'X' is the trigger. These have opaque sub-strings. 125 */ 126 case 'F': 127 /* FALLTHROUGH */ 128 case 'g': 129 /* FALLTHROUGH */ 130 case 'k': 131 /* FALLTHROUGH */ 132 case 'M': 133 /* FALLTHROUGH */ 134 case 'm': 135 /* FALLTHROUGH */ 136 case 'n': 137 /* FALLTHROUGH */ 138 case 'V': 139 /* FALLTHROUGH */ 140 case 'Y': 141 gly = ESCAPE_IGNORE; 142 /* FALLTHROUGH */ 143 case 'f': 144 if (ESCAPE_ERROR == gly) 145 gly = ESCAPE_FONT; 146 switch (**start) { 147 case '(': 148 *start = ++*end; 149 *sz = 2; 150 break; 151 case '[': 152 *start = ++*end; 153 term = ']'; 154 break; 155 default: 156 *sz = 1; 157 break; 158 } 159 break; 160 161 /* 162 * These escapes are of the form \X'Y', where 'X' is the trigger 163 * and 'Y' is any string. These have opaque sub-strings. 164 * The \B and \w escapes are handled in roff.c, roff_res(). 165 */ 166 case 'A': 167 /* FALLTHROUGH */ 168 case 'b': 169 /* FALLTHROUGH */ 170 case 'D': 171 /* FALLTHROUGH */ 172 case 'o': 173 /* FALLTHROUGH */ 174 case 'R': 175 /* FALLTHROUGH */ 176 case 'X': 177 /* FALLTHROUGH */ 178 case 'Z': 179 if ('\0' == **start) 180 return(ESCAPE_ERROR); 181 gly = ESCAPE_IGNORE; 182 term = **start; 183 *start = ++*end; 184 break; 185 186 /* 187 * These escapes are of the form \X'N', where 'X' is the trigger 188 * and 'N' resolves to a numerical expression. 189 */ 190 case 'h': 191 /* FALLTHROUGH */ 192 case 'H': 193 /* FALLTHROUGH */ 194 case 'L': 195 /* FALLTHROUGH */ 196 case 'l': 197 /* FALLTHROUGH */ 198 case 'S': 199 /* FALLTHROUGH */ 200 case 'v': 201 /* FALLTHROUGH */ 202 case 'x': 203 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 204 ++*end; 205 return(ESCAPE_ERROR); 206 } 207 gly = ESCAPE_IGNORE; 208 term = **start; 209 *start = ++*end; 210 break; 211 212 /* 213 * Special handling for the numbered character escape. 214 * XXX Do any other escapes need similar handling? 215 */ 216 case 'N': 217 if ('\0' == **start) 218 return(ESCAPE_ERROR); 219 (*end)++; 220 if (isdigit((unsigned char)**start)) { 221 *sz = 1; 222 return(ESCAPE_IGNORE); 223 } 224 (*start)++; 225 while (isdigit((unsigned char)**end)) 226 (*end)++; 227 *sz = *end - *start; 228 if ('\0' != **end) 229 (*end)++; 230 return(ESCAPE_NUMBERED); 231 232 /* 233 * Sizes get a special category of their own. 234 */ 235 case 's': 236 gly = ESCAPE_IGNORE; 237 238 /* See +/- counts as a sign. */ 239 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 240 (*end)++; 241 242 switch (**end) { 243 case '(': 244 *start = ++*end; 245 *sz = 2; 246 break; 247 case '[': 248 *start = ++*end; 249 term = ']'; 250 break; 251 case '\'': 252 *start = ++*end; 253 term = '\''; 254 break; 255 default: 256 *sz = 1; 257 break; 258 } 259 260 break; 261 262 /* 263 * Anything else is assumed to be a glyph. 264 * In this case, pass back the character after the backslash. 265 */ 266 default: 267 gly = ESCAPE_SPECIAL; 268 *start = --*end; 269 *sz = 1; 270 break; 271 } 272 273 assert(ESCAPE_ERROR != gly); 274 275 /* 276 * Read up to the terminating character, 277 * paying attention to nested escapes. 278 */ 279 280 if ('\0' != term) { 281 while (**end != term) { 282 switch (**end) { 283 case '\0': 284 return(ESCAPE_ERROR); 285 case '\\': 286 (*end)++; 287 if (ESCAPE_ERROR == 288 mandoc_escape(end, NULL, NULL)) 289 return(ESCAPE_ERROR); 290 break; 291 default: 292 (*end)++; 293 break; 294 } 295 } 296 *sz = (*end)++ - *start; 297 } else { 298 assert(*sz > 0); 299 if ((size_t)*sz > strlen(*start)) 300 return(ESCAPE_ERROR); 301 *end += *sz; 302 } 303 304 /* Run post-processors. */ 305 306 switch (gly) { 307 case ESCAPE_FONT: 308 if (2 == *sz) { 309 if ('C' == **start) { 310 /* 311 * Treat constant-width font modes 312 * just like regular font modes. 313 */ 314 (*start)++; 315 (*sz)--; 316 } else { 317 if ('B' == (*start)[0] && 'I' == (*start)[1]) 318 gly = ESCAPE_FONTBI; 319 break; 320 } 321 } else if (1 != *sz) 322 break; 323 324 switch (**start) { 325 case '3': 326 /* FALLTHROUGH */ 327 case 'B': 328 gly = ESCAPE_FONTBOLD; 329 break; 330 case '2': 331 /* FALLTHROUGH */ 332 case 'I': 333 gly = ESCAPE_FONTITALIC; 334 break; 335 case 'P': 336 gly = ESCAPE_FONTPREV; 337 break; 338 case '1': 339 /* FALLTHROUGH */ 340 case 'R': 341 gly = ESCAPE_FONTROMAN; 342 break; 343 } 344 break; 345 case ESCAPE_SPECIAL: 346 if (1 == *sz && 'c' == **start) 347 gly = ESCAPE_NOSPACE; 348 break; 349 default: 350 break; 351 } 352 353 return(gly); 354 } 355 356 /* 357 * Parse a quoted or unquoted roff-style request or macro argument. 358 * Return a pointer to the parsed argument, which is either the original 359 * pointer or advanced by one byte in case the argument is quoted. 360 * NUL-terminate the argument in place. 361 * Collapse pairs of quotes inside quoted arguments. 362 * Advance the argument pointer to the next argument, 363 * or to the NUL byte terminating the argument line. 364 */ 365 char * 366 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 367 { 368 char *start, *cp; 369 int quoted, pairs, white; 370 371 /* Quoting can only start with a new word. */ 372 start = *cpp; 373 quoted = 0; 374 if ('"' == *start) { 375 quoted = 1; 376 start++; 377 } 378 379 pairs = 0; 380 white = 0; 381 for (cp = start; '\0' != *cp; cp++) { 382 383 /* 384 * Move the following text left 385 * after quoted quotes and after "\\" and "\t". 386 */ 387 if (pairs) 388 cp[-pairs] = cp[0]; 389 390 if ('\\' == cp[0]) { 391 /* 392 * In copy mode, translate double to single 393 * backslashes and backslash-t to literal tabs. 394 */ 395 switch (cp[1]) { 396 case 't': 397 cp[0] = '\t'; 398 /* FALLTHROUGH */ 399 case '\\': 400 pairs++; 401 cp++; 402 break; 403 case ' ': 404 /* Skip escaped blanks. */ 405 if (0 == quoted) 406 cp++; 407 break; 408 default: 409 break; 410 } 411 } else if (0 == quoted) { 412 if (' ' == cp[0]) { 413 /* Unescaped blanks end unquoted args. */ 414 white = 1; 415 break; 416 } 417 } else if ('"' == cp[0]) { 418 if ('"' == cp[1]) { 419 /* Quoted quotes collapse. */ 420 pairs++; 421 cp++; 422 } else { 423 /* Unquoted quotes end quoted args. */ 424 quoted = 2; 425 break; 426 } 427 } 428 } 429 430 /* Quoted argument without a closing quote. */ 431 if (1 == quoted) 432 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL); 433 434 /* NUL-terminate this argument and move to the next one. */ 435 if (pairs) 436 cp[-pairs] = '\0'; 437 if ('\0' != *cp) { 438 *cp++ = '\0'; 439 while (' ' == *cp) 440 cp++; 441 } 442 *pos += (int)(cp - start) + (quoted ? 1 : 0); 443 *cpp = cp; 444 445 if ('\0' == *cp && (white || ' ' == cp[-1])) 446 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL); 447 448 return(start); 449 } 450 451 static int 452 a2time(time_t *t, const char *fmt, const char *p) 453 { 454 struct tm tm; 455 char *pp; 456 457 memset(&tm, 0, sizeof(struct tm)); 458 459 pp = NULL; 460 #ifdef HAVE_STRPTIME 461 pp = strptime(p, fmt, &tm); 462 #endif 463 if (NULL != pp && '\0' == *pp) { 464 *t = mktime(&tm); 465 return(1); 466 } 467 468 return(0); 469 } 470 471 static char * 472 time2a(time_t t) 473 { 474 struct tm *tm; 475 char *buf, *p; 476 size_t ssz; 477 int isz; 478 479 tm = localtime(&t); 480 481 /* 482 * Reserve space: 483 * up to 9 characters for the month (September) + blank 484 * up to 2 characters for the day + comma + blank 485 * 4 characters for the year and a terminating '\0' 486 */ 487 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 488 489 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 490 goto fail; 491 p += (int)ssz; 492 493 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 494 goto fail; 495 p += isz; 496 497 if (0 == strftime(p, 4 + 1, "%Y", tm)) 498 goto fail; 499 return(buf); 500 501 fail: 502 free(buf); 503 return(NULL); 504 } 505 506 char * 507 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 508 { 509 char *out; 510 time_t t; 511 512 if (NULL == in || '\0' == *in || 513 0 == strcmp(in, "$" "Mdocdate$")) { 514 mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL); 515 time(&t); 516 } 517 else if (a2time(&t, "%Y-%m-%d", in)) 518 t = 0; 519 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 520 !a2time(&t, "%b %d, %Y", in)) { 521 mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in); 522 t = 0; 523 } 524 out = t ? time2a(t) : NULL; 525 return(out ? out : mandoc_strdup(in)); 526 } 527 528 int 529 mandoc_eos(const char *p, size_t sz) 530 { 531 const char *q; 532 int enclosed, found; 533 534 if (0 == sz) 535 return(0); 536 537 /* 538 * End-of-sentence recognition must include situations where 539 * some symbols, such as `)', allow prior EOS punctuation to 540 * propagate outward. 541 */ 542 543 enclosed = found = 0; 544 for (q = p + (int)sz - 1; q >= p; q--) { 545 switch (*q) { 546 case '\"': 547 /* FALLTHROUGH */ 548 case '\'': 549 /* FALLTHROUGH */ 550 case ']': 551 /* FALLTHROUGH */ 552 case ')': 553 if (0 == found) 554 enclosed = 1; 555 break; 556 case '.': 557 /* FALLTHROUGH */ 558 case '!': 559 /* FALLTHROUGH */ 560 case '?': 561 found = 1; 562 break; 563 default: 564 return(found && (!enclosed || isalnum((unsigned char)*q))); 565 } 566 } 567 568 return(found && !enclosed); 569 } 570 571 /* 572 * Convert a string to a long that may not be <0. 573 * If the string is invalid, or is less than 0, return -1. 574 */ 575 int 576 mandoc_strntoi(const char *p, size_t sz, int base) 577 { 578 char buf[32]; 579 char *ep; 580 long v; 581 582 if (sz > 31) 583 return(-1); 584 585 memcpy(buf, p, sz); 586 buf[(int)sz] = '\0'; 587 588 errno = 0; 589 v = strtol(buf, &ep, base); 590 591 if (buf[0] == '\0' || *ep != '\0') 592 return(-1); 593 594 if (v > INT_MAX) 595 v = INT_MAX; 596 if (v < INT_MIN) 597 v = INT_MIN; 598 599 return((int)v); 600 } 601