xref: /openbsd/usr.bin/mandoc/mandoc.c (revision 274d7c50)
1 /*	$OpenBSD: mandoc.c,v 1.84 2019/06/27 15:05:14 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include <sys/types.h>
19 
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <time.h>
28 
29 #include "mandoc_aux.h"
30 #include "mandoc.h"
31 #include "roff.h"
32 #include "libmandoc.h"
33 #include "roff_int.h"
34 
35 static	int	 a2time(time_t *, const char *, const char *);
36 static	char	*time2a(time_t);
37 
38 
39 enum mandoc_esc
40 mandoc_font(const char *cp, int sz)
41 {
42 	switch (sz) {
43 	case 0:
44 		return ESCAPE_FONTPREV;
45 	case 1:
46 		switch (cp[0]) {
47 		case 'B':
48 		case '3':
49 			return ESCAPE_FONTBOLD;
50 		case 'I':
51 		case '2':
52 			return ESCAPE_FONTITALIC;
53 		case 'P':
54 			return ESCAPE_FONTPREV;
55 		case 'R':
56 		case '1':
57 			return ESCAPE_FONTROMAN;
58 		case '4':
59 			return ESCAPE_FONTBI;
60 		default:
61 			return ESCAPE_ERROR;
62 		}
63 	case 2:
64 		switch (cp[0]) {
65 		case 'B':
66 			switch (cp[1]) {
67 			case 'I':
68 				return ESCAPE_FONTBI;
69 			default:
70 				return ESCAPE_ERROR;
71 			}
72 		case 'C':
73 			switch (cp[1]) {
74 			case 'B':
75 				return ESCAPE_FONTBOLD;
76 			case 'I':
77 				return ESCAPE_FONTITALIC;
78 			case 'R':
79 			case 'W':
80 				return ESCAPE_FONTCW;
81 			default:
82 				return ESCAPE_ERROR;
83 			}
84 		default:
85 			return ESCAPE_ERROR;
86 		}
87 	default:
88 		return ESCAPE_ERROR;
89 	}
90 }
91 
92 enum mandoc_esc
93 mandoc_escape(const char **end, const char **start, int *sz)
94 {
95 	const char	*local_start;
96 	int		 local_sz, c, i;
97 	char		 term;
98 	enum mandoc_esc	 gly;
99 
100 	/*
101 	 * When the caller doesn't provide return storage,
102 	 * use local storage.
103 	 */
104 
105 	if (NULL == start)
106 		start = &local_start;
107 	if (NULL == sz)
108 		sz = &local_sz;
109 
110 	/*
111 	 * Treat "\E" just like "\";
112 	 * it only makes a difference in copy mode.
113 	 */
114 
115 	if (**end == 'E')
116 		++*end;
117 
118 	/*
119 	 * Beyond the backslash, at least one input character
120 	 * is part of the escape sequence.  With one exception
121 	 * (see below), that character won't be returned.
122 	 */
123 
124 	gly = ESCAPE_ERROR;
125 	*start = ++*end;
126 	*sz = 0;
127 	term = '\0';
128 
129 	switch ((*start)[-1]) {
130 	/*
131 	 * First the glyphs.  There are several different forms of
132 	 * these, but each eventually returns a substring of the glyph
133 	 * name.
134 	 */
135 	case '(':
136 		gly = ESCAPE_SPECIAL;
137 		*sz = 2;
138 		break;
139 	case '[':
140 		if (**start == ' ') {
141 			++*end;
142 			return ESCAPE_ERROR;
143 		}
144 		gly = ESCAPE_SPECIAL;
145 		term = ']';
146 		break;
147 	case 'C':
148 		if ('\'' != **start)
149 			return ESCAPE_ERROR;
150 		*start = ++*end;
151 		gly = ESCAPE_SPECIAL;
152 		term = '\'';
153 		break;
154 
155 	/*
156 	 * Escapes taking no arguments at all.
157 	 */
158 	case '!':
159 	case '?':
160 		return ESCAPE_UNSUPP;
161 	case '%':
162 	case '&':
163 	case ')':
164 	case ',':
165 	case '/':
166 	case '^':
167 	case 'a':
168 	case 'd':
169 	case 'r':
170 	case 't':
171 	case 'u':
172 	case '{':
173 	case '|':
174 	case '}':
175 		return ESCAPE_IGNORE;
176 	case 'c':
177 		return ESCAPE_NOSPACE;
178 	case 'p':
179 		return ESCAPE_BREAK;
180 
181 	/*
182 	 * The \z escape is supposed to output the following
183 	 * character without advancing the cursor position.
184 	 * Since we are mostly dealing with terminal mode,
185 	 * let us just skip the next character.
186 	 */
187 	case 'z':
188 		return ESCAPE_SKIPCHAR;
189 
190 	/*
191 	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
192 	 * 'X' is the trigger.  These have opaque sub-strings.
193 	 */
194 	case 'F':
195 	case 'f':
196 	case 'g':
197 	case 'k':
198 	case 'M':
199 	case 'm':
200 	case 'n':
201 	case 'O':
202 	case 'V':
203 	case 'Y':
204 		gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE;
205 		switch (**start) {
206 		case '(':
207 			if ((*start)[-1] == 'O')
208 				gly = ESCAPE_ERROR;
209 			*start = ++*end;
210 			*sz = 2;
211 			break;
212 		case '[':
213 			if ((*start)[-1] == 'O')
214 				gly = (*start)[1] == '5' ?
215 				    ESCAPE_UNSUPP : ESCAPE_ERROR;
216 			*start = ++*end;
217 			term = ']';
218 			break;
219 		default:
220 			if ((*start)[-1] == 'O') {
221 				switch (**start) {
222 				case '0':
223 					gly = ESCAPE_UNSUPP;
224 					break;
225 				case '1':
226 				case '2':
227 				case '3':
228 				case '4':
229 					break;
230 				default:
231 					gly = ESCAPE_ERROR;
232 					break;
233 				}
234 			}
235 			*sz = 1;
236 			break;
237 		}
238 		break;
239 	case '*':
240 		if (strncmp(*start, "(.T", 3) != 0)
241 			abort();
242 		gly = ESCAPE_DEVICE;
243 		*start = ++*end;
244 		*sz = 2;
245 		break;
246 
247 	/*
248 	 * These escapes are of the form \X'Y', where 'X' is the trigger
249 	 * and 'Y' is any string.  These have opaque sub-strings.
250 	 * The \B and \w escapes are handled in roff.c, roff_res().
251 	 */
252 	case 'A':
253 	case 'b':
254 	case 'D':
255 	case 'R':
256 	case 'X':
257 	case 'Z':
258 		gly = ESCAPE_IGNORE;
259 		/* FALLTHROUGH */
260 	case 'o':
261 		if (**start == '\0')
262 			return ESCAPE_ERROR;
263 		if (gly == ESCAPE_ERROR)
264 			gly = ESCAPE_OVERSTRIKE;
265 		term = **start;
266 		*start = ++*end;
267 		break;
268 
269 	/*
270 	 * These escapes are of the form \X'N', where 'X' is the trigger
271 	 * and 'N' resolves to a numerical expression.
272 	 */
273 	case 'h':
274 	case 'H':
275 	case 'L':
276 	case 'l':
277 	case 'S':
278 	case 'v':
279 	case 'x':
280 		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
281 			if ('\0' != **start)
282 				++*end;
283 			return ESCAPE_ERROR;
284 		}
285 		switch ((*start)[-1]) {
286 		case 'h':
287 			gly = ESCAPE_HORIZ;
288 			break;
289 		case 'l':
290 			gly = ESCAPE_HLINE;
291 			break;
292 		default:
293 			gly = ESCAPE_IGNORE;
294 			break;
295 		}
296 		term = **start;
297 		*start = ++*end;
298 		break;
299 
300 	/*
301 	 * Special handling for the numbered character escape.
302 	 * XXX Do any other escapes need similar handling?
303 	 */
304 	case 'N':
305 		if ('\0' == **start)
306 			return ESCAPE_ERROR;
307 		(*end)++;
308 		if (isdigit((unsigned char)**start)) {
309 			*sz = 1;
310 			return ESCAPE_IGNORE;
311 		}
312 		(*start)++;
313 		while (isdigit((unsigned char)**end))
314 			(*end)++;
315 		*sz = *end - *start;
316 		if ('\0' != **end)
317 			(*end)++;
318 		return ESCAPE_NUMBERED;
319 
320 	/*
321 	 * Sizes get a special category of their own.
322 	 */
323 	case 's':
324 		gly = ESCAPE_IGNORE;
325 
326 		/* See +/- counts as a sign. */
327 		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
328 			*start = ++*end;
329 
330 		switch (**end) {
331 		case '(':
332 			*start = ++*end;
333 			*sz = 2;
334 			break;
335 		case '[':
336 			*start = ++*end;
337 			term = ']';
338 			break;
339 		case '\'':
340 			*start = ++*end;
341 			term = '\'';
342 			break;
343 		case '3':
344 		case '2':
345 		case '1':
346 			*sz = (*end)[-1] == 's' &&
347 			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
348 			break;
349 		default:
350 			*sz = 1;
351 			break;
352 		}
353 
354 		break;
355 
356 	/*
357 	 * Several special characters can be encoded as
358 	 * one-byte escape sequences without using \[].
359 	 */
360 	case ' ':
361 	case '\'':
362 	case '-':
363 	case '.':
364 	case '0':
365 	case ':':
366 	case '_':
367 	case '`':
368 	case 'e':
369 	case '~':
370 		gly = ESCAPE_SPECIAL;
371 		/* FALLTHROUGH */
372 	default:
373 		if (gly == ESCAPE_ERROR)
374 			gly = ESCAPE_UNDEF;
375 		*start = --*end;
376 		*sz = 1;
377 		break;
378 	}
379 
380 	/*
381 	 * Read up to the terminating character,
382 	 * paying attention to nested escapes.
383 	 */
384 
385 	if ('\0' != term) {
386 		while (**end != term) {
387 			switch (**end) {
388 			case '\0':
389 				return ESCAPE_ERROR;
390 			case '\\':
391 				(*end)++;
392 				if (ESCAPE_ERROR ==
393 				    mandoc_escape(end, NULL, NULL))
394 					return ESCAPE_ERROR;
395 				break;
396 			default:
397 				(*end)++;
398 				break;
399 			}
400 		}
401 		*sz = (*end)++ - *start;
402 
403 		/*
404 		 * The file chars.c only provides one common list
405 		 * of character names, but \[-] == \- is the only
406 		 * one of the characters with one-byte names that
407 		 * allows enclosing the name in brackets.
408 		 */
409 		if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
410 			return ESCAPE_ERROR;
411 	} else {
412 		assert(*sz > 0);
413 		if ((size_t)*sz > strlen(*start))
414 			return ESCAPE_ERROR;
415 		*end += *sz;
416 	}
417 
418 	/* Run post-processors. */
419 
420 	switch (gly) {
421 	case ESCAPE_FONT:
422 		gly = mandoc_font(*start, *sz);
423 		break;
424 	case ESCAPE_SPECIAL:
425 		if (**start == 'c') {
426 			if (*sz < 6 || *sz > 7 ||
427 			    strncmp(*start, "char", 4) != 0 ||
428 			    (int)strspn(*start + 4, "0123456789") + 4 < *sz)
429 				break;
430 			c = 0;
431 			for (i = 4; i < *sz; i++)
432 				c = 10 * c + ((*start)[i] - '0');
433 			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
434 				break;
435 			*start += 4;
436 			*sz -= 4;
437 			gly = ESCAPE_NUMBERED;
438 			break;
439 		}
440 
441 		/*
442 		 * Unicode escapes are defined in groff as \[u0000]
443 		 * to \[u10FFFF], where the contained value must be
444 		 * a valid Unicode codepoint.  Here, however, only
445 		 * check the length and range.
446 		 */
447 		if (**start != 'u' || *sz < 5 || *sz > 7)
448 			break;
449 		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
450 			break;
451 		if (*sz == 6 && (*start)[1] == '0')
452 			break;
453 		if (*sz == 5 && (*start)[1] == 'D' &&
454 		    strchr("89ABCDEF", (*start)[2]) != NULL)
455 			break;
456 		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
457 		    + 1 == *sz)
458 			gly = ESCAPE_UNICODE;
459 		break;
460 	default:
461 		break;
462 	}
463 
464 	return gly;
465 }
466 
/*
 * Parse the string p according to the strptime(3) format fmt.
 * If the complete string matches the format, store the result
 * of mktime(3) in *t and return 1.
 * Otherwise, leave *t alone and return 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/*
	 * Start from an all-zero structure such that fields
	 * not set by strptime(3) have a defined value.
	 */

	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);

	/* Reject parse failures and trailing garbage alike. */

	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
483 
/*
 * Format the time t, interpreted as local time, as a string of
 * the form "Month day, year", with the day of the month printed
 * without padding.
 * Returns a string allocated with mandoc_malloc() or
 * mandoc_strdup() that the caller is expected to free; if the
 * time cannot be converted or formatted, the string is empty.
 */
static char *
time2a(time_t t)
{
	struct tm	*when;
	char		*out, *cur;
	size_t		 nmonth;
	int		 nday;

	when = localtime(&t);
	if (when == NULL)
		return mandoc_strdup("");

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */

	out = mandoc_malloc(10 + 4 + 4 + 1);
	cur = out;

	nmonth = strftime(cur, 10 + 1, "%B ", when);
	if (nmonth != 0) {
		cur += nmonth;

		/*
		 * The day is printed with plain "%d", neither "%2d"
		 * nor "%02d", which is why the date is not formatted
		 * by a single strftime(3) call using "%B %e, %Y" or
		 * "%B %d, %Y".  Besides, writing each piece with its
		 * own size limit is less prone to buffer overflows,
		 * in case anybody should ever introduce the bug
		 * of looking at LC_TIME.
		 */

		nday = snprintf(cur, 4 + 1, "%d, ", when->tm_mday);
		if (nday >= 0 && nday <= 4) {
			cur += nday;
			if (strftime(cur, 4 + 1, "%Y", when) != 0)
				return out;
		}
	}

	/* One of the pieces did not fit or could not be formatted. */

	free(out);
	return mandoc_strdup("");
}
532 
533 char *
534 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
535 {
536 	char		*cp;
537 	time_t		 t;
538 
539 	if (man->quick)
540 		return mandoc_strdup(in == NULL ? "" : in);
541 
542 	/* No date specified: use today's date. */
543 
544 	if (in == NULL || *in == '\0')
545 		mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL);
546 	if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0)
547 		return time2a(time(NULL));
548 
549 	/* Valid mdoc(7) date format. */
550 
551 	if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
552 	    a2time(&t, "%b %d, %Y", in)) {
553 		cp = time2a(t);
554 		if (t > time(NULL) + 86400)
555 			mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp);
556 		else if (*in != '$' && strcmp(in, cp) != 0)
557 			mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp);
558 		return cp;
559 	}
560 
561 	/* In man(7), do not warn about the legacy format. */
562 
563 	if (a2time(&t, "%Y-%m-%d", in) == 0)
564 		mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in);
565 	else if (t > time(NULL) + 86400)
566 		mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in);
567 	else if (man->meta.macroset == MACROSET_MDOC)
568 		mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in);
569 
570 	/* Use any non-mdoc(7) date verbatim. */
571 
572 	return mandoc_strdup(in);
573 }
574 
/*
 * Decide whether the sz bytes starting at p end a sentence.
 *
 * The text is scanned backwards from its last byte.  Closing
 * delimiters (quotes, ']' and ')') and sentence punctuation
 * ('.', '!' and '?') are skipped until some other byte is found.
 *
 * Returns 1 if sentence punctuation was seen and either no closing
 * delimiter followed it or the byte before the punctuation and
 * delimiters is alphanumeric.  Returns 0 otherwise, in particular
 * for empty input.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Walk backwards using an index counting down to zero rather
	 * than a pointer: stepping a pointer to one before the start
	 * of the buffer is undefined behaviour, and converting sz to
	 * int would truncate very large sizes.
	 */

	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found &&
			    (!enclosed || isalnum((unsigned char)p[i - 1]));
		}
	}

	/* Nothing but punctuation and closing delimiters. */

	return found && !enclosed;
}
613 
/*
 * Convert the first sz bytes of p to an int by means of strtol(3)
 * with the given base.
 * Return -1 if sz exceeds 31 bytes, if the string is empty, or if
 * strtol(3) does not consume all of it.
 * Results outside the range of int are clamped to INT_MIN or INT_MAX.
 * Negative numbers are converted like any others, so the return
 * value -1 can also result from the valid input "-1".
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 num[32];
	char		*stop;
	long		 val;

	/* The input has to fit into the buffer including the '\0'. */

	if (sz >= sizeof(num))
		return -1;

	/* Make a terminated copy because p need not end after sz bytes. */

	memcpy(num, p, sz);
	num[sz] = '\0';

	errno = 0;
	val = strtol(num, &stop, base);

	/* Reject empty input and trailing garbage. */

	if (num[0] == '\0' || *stop != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}
644