xref: /openbsd/usr.bin/mandoc/mandoc.c (revision 8529ddd3)
1 /*	$OpenBSD: mandoc.c,v 1.60 2015/02/20 23:51:54 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011-2015 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include <sys/types.h>
19 
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <time.h>
28 
29 #include "mandoc.h"
30 #include "mandoc_aux.h"
31 #include "libmandoc.h"
32 
33 #define DATESIZE 32
34 
35 static	int	 a2time(time_t *, const char *, const char *);
36 static	char	*time2a(time_t);
37 
38 
39 enum mandoc_esc
40 mandoc_escape(const char **end, const char **start, int *sz)
41 {
42 	const char	*local_start;
43 	int		 local_sz;
44 	char		 term;
45 	enum mandoc_esc	 gly;
46 
47 	/*
48 	 * When the caller doesn't provide return storage,
49 	 * use local storage.
50 	 */
51 
52 	if (NULL == start)
53 		start = &local_start;
54 	if (NULL == sz)
55 		sz = &local_sz;
56 
57 	/*
58 	 * Beyond the backslash, at least one input character
59 	 * is part of the escape sequence.  With one exception
60 	 * (see below), that character won't be returned.
61 	 */
62 
63 	gly = ESCAPE_ERROR;
64 	*start = ++*end;
65 	*sz = 0;
66 	term = '\0';
67 
68 	switch ((*start)[-1]) {
69 	/*
70 	 * First the glyphs.  There are several different forms of
71 	 * these, but each eventually returns a substring of the glyph
72 	 * name.
73 	 */
74 	case '(':
75 		gly = ESCAPE_SPECIAL;
76 		*sz = 2;
77 		break;
78 	case '[':
79 		gly = ESCAPE_SPECIAL;
80 		term = ']';
81 		break;
82 	case 'C':
83 		if ('\'' != **start)
84 			return(ESCAPE_ERROR);
85 		*start = ++*end;
86 		gly = ESCAPE_SPECIAL;
87 		term = '\'';
88 		break;
89 
90 	/*
91 	 * Escapes taking no arguments at all.
92 	 */
93 	case 'd':
94 		/* FALLTHROUGH */
95 	case 'u':
96 		return(ESCAPE_IGNORE);
97 
98 	/*
99 	 * The \z escape is supposed to output the following
100 	 * character without advancing the cursor position.
101 	 * Since we are mostly dealing with terminal mode,
102 	 * let us just skip the next character.
103 	 */
104 	case 'z':
105 		return(ESCAPE_SKIPCHAR);
106 
107 	/*
108 	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
109 	 * 'X' is the trigger.  These have opaque sub-strings.
110 	 */
111 	case 'F':
112 		/* FALLTHROUGH */
113 	case 'g':
114 		/* FALLTHROUGH */
115 	case 'k':
116 		/* FALLTHROUGH */
117 	case 'M':
118 		/* FALLTHROUGH */
119 	case 'm':
120 		/* FALLTHROUGH */
121 	case 'n':
122 		/* FALLTHROUGH */
123 	case 'V':
124 		/* FALLTHROUGH */
125 	case 'Y':
126 		gly = ESCAPE_IGNORE;
127 		/* FALLTHROUGH */
128 	case 'f':
129 		if (ESCAPE_ERROR == gly)
130 			gly = ESCAPE_FONT;
131 		switch (**start) {
132 		case '(':
133 			*start = ++*end;
134 			*sz = 2;
135 			break;
136 		case '[':
137 			*start = ++*end;
138 			term = ']';
139 			break;
140 		default:
141 			*sz = 1;
142 			break;
143 		}
144 		break;
145 
146 	/*
147 	 * These escapes are of the form \X'Y', where 'X' is the trigger
148 	 * and 'Y' is any string.  These have opaque sub-strings.
149 	 * The \B and \w escapes are handled in roff.c, roff_res().
150 	 */
151 	case 'A':
152 		/* FALLTHROUGH */
153 	case 'b':
154 		/* FALLTHROUGH */
155 	case 'D':
156 		/* FALLTHROUGH */
157 	case 'R':
158 		/* FALLTHROUGH */
159 	case 'X':
160 		/* FALLTHROUGH */
161 	case 'Z':
162 		gly = ESCAPE_IGNORE;
163 		/* FALLTHROUGH */
164 	case 'o':
165 		if (**start == '\0')
166 			return(ESCAPE_ERROR);
167 		if (gly == ESCAPE_ERROR)
168 			gly = ESCAPE_OVERSTRIKE;
169 		term = **start;
170 		*start = ++*end;
171 		break;
172 
173 	/*
174 	 * These escapes are of the form \X'N', where 'X' is the trigger
175 	 * and 'N' resolves to a numerical expression.
176 	 */
177 	case 'h':
178 		/* FALLTHROUGH */
179 	case 'H':
180 		/* FALLTHROUGH */
181 	case 'L':
182 		/* FALLTHROUGH */
183 	case 'l':
184 		/* FALLTHROUGH */
185 	case 'S':
186 		/* FALLTHROUGH */
187 	case 'v':
188 		/* FALLTHROUGH */
189 	case 'x':
190 		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
191 			if ('\0' != **start)
192 				++*end;
193 			return(ESCAPE_ERROR);
194 		}
195 		gly = ESCAPE_IGNORE;
196 		term = **start;
197 		*start = ++*end;
198 		break;
199 
200 	/*
201 	 * Special handling for the numbered character escape.
202 	 * XXX Do any other escapes need similar handling?
203 	 */
204 	case 'N':
205 		if ('\0' == **start)
206 			return(ESCAPE_ERROR);
207 		(*end)++;
208 		if (isdigit((unsigned char)**start)) {
209 			*sz = 1;
210 			return(ESCAPE_IGNORE);
211 		}
212 		(*start)++;
213 		while (isdigit((unsigned char)**end))
214 			(*end)++;
215 		*sz = *end - *start;
216 		if ('\0' != **end)
217 			(*end)++;
218 		return(ESCAPE_NUMBERED);
219 
220 	/*
221 	 * Sizes get a special category of their own.
222 	 */
223 	case 's':
224 		gly = ESCAPE_IGNORE;
225 
226 		/* See +/- counts as a sign. */
227 		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
228 			*start = ++*end;
229 
230 		switch (**end) {
231 		case '(':
232 			*start = ++*end;
233 			*sz = 2;
234 			break;
235 		case '[':
236 			*start = ++*end;
237 			term = ']';
238 			break;
239 		case '\'':
240 			*start = ++*end;
241 			term = '\'';
242 			break;
243 		case '3':
244 			/* FALLTHROUGH */
245 		case '2':
246 			/* FALLTHROUGH */
247 		case '1':
248 			*sz = (*end)[-1] == 's' &&
249 			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
250 			break;
251 		default:
252 			*sz = 1;
253 			break;
254 		}
255 
256 		break;
257 
258 	/*
259 	 * Anything else is assumed to be a glyph.
260 	 * In this case, pass back the character after the backslash.
261 	 */
262 	default:
263 		gly = ESCAPE_SPECIAL;
264 		*start = --*end;
265 		*sz = 1;
266 		break;
267 	}
268 
269 	assert(ESCAPE_ERROR != gly);
270 
271 	/*
272 	 * Read up to the terminating character,
273 	 * paying attention to nested escapes.
274 	 */
275 
276 	if ('\0' != term) {
277 		while (**end != term) {
278 			switch (**end) {
279 			case '\0':
280 				return(ESCAPE_ERROR);
281 			case '\\':
282 				(*end)++;
283 				if (ESCAPE_ERROR ==
284 				    mandoc_escape(end, NULL, NULL))
285 					return(ESCAPE_ERROR);
286 				break;
287 			default:
288 				(*end)++;
289 				break;
290 			}
291 		}
292 		*sz = (*end)++ - *start;
293 	} else {
294 		assert(*sz > 0);
295 		if ((size_t)*sz > strlen(*start))
296 			return(ESCAPE_ERROR);
297 		*end += *sz;
298 	}
299 
300 	/* Run post-processors. */
301 
302 	switch (gly) {
303 	case ESCAPE_FONT:
304 		if (2 == *sz) {
305 			if ('C' == **start) {
306 				/*
307 				 * Treat constant-width font modes
308 				 * just like regular font modes.
309 				 */
310 				(*start)++;
311 				(*sz)--;
312 			} else {
313 				if ('B' == (*start)[0] && 'I' == (*start)[1])
314 					gly = ESCAPE_FONTBI;
315 				break;
316 			}
317 		} else if (1 != *sz)
318 			break;
319 
320 		switch (**start) {
321 		case '3':
322 			/* FALLTHROUGH */
323 		case 'B':
324 			gly = ESCAPE_FONTBOLD;
325 			break;
326 		case '2':
327 			/* FALLTHROUGH */
328 		case 'I':
329 			gly = ESCAPE_FONTITALIC;
330 			break;
331 		case 'P':
332 			gly = ESCAPE_FONTPREV;
333 			break;
334 		case '1':
335 			/* FALLTHROUGH */
336 		case 'R':
337 			gly = ESCAPE_FONTROMAN;
338 			break;
339 		}
340 		break;
341 	case ESCAPE_SPECIAL:
342 		if (1 == *sz && 'c' == **start)
343 			gly = ESCAPE_NOSPACE;
344 		/*
345 		 * Unicode escapes are defined in groff as \[u0000]
346 		 * to \[u10FFFF], where the contained value must be
347 		 * a valid Unicode codepoint.  Here, however, only
348 		 * check the length and range.
349 		 */
350 		if (**start != 'u' || *sz < 5 || *sz > 7)
351 			break;
352 		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
353 			break;
354 		if (*sz == 6 && (*start)[1] == '0')
355 			break;
356 		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
357 		    + 1 == *sz)
358 			gly = ESCAPE_UNICODE;
359 		break;
360 	default:
361 		break;
362 	}
363 
364 	return(gly);
365 }
366 
367 /*
368  * Parse a quoted or unquoted roff-style request or macro argument.
369  * Return a pointer to the parsed argument, which is either the original
370  * pointer or advanced by one byte in case the argument is quoted.
371  * NUL-terminate the argument in place.
372  * Collapse pairs of quotes inside quoted arguments.
373  * Advance the argument pointer to the next argument,
374  * or to the NUL byte terminating the argument line.
375  */
376 char *
377 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
378 {
379 	char	 *start, *cp;
380 	int	  quoted, pairs, white;
381 
382 	/* Quoting can only start with a new word. */
383 	start = *cpp;
384 	quoted = 0;
385 	if ('"' == *start) {
386 		quoted = 1;
387 		start++;
388 	}
389 
390 	pairs = 0;
391 	white = 0;
392 	for (cp = start; '\0' != *cp; cp++) {
393 
394 		/*
395 		 * Move the following text left
396 		 * after quoted quotes and after "\\" and "\t".
397 		 */
398 		if (pairs)
399 			cp[-pairs] = cp[0];
400 
401 		if ('\\' == cp[0]) {
402 			/*
403 			 * In copy mode, translate double to single
404 			 * backslashes and backslash-t to literal tabs.
405 			 */
406 			switch (cp[1]) {
407 			case 't':
408 				cp[0] = '\t';
409 				/* FALLTHROUGH */
410 			case '\\':
411 				pairs++;
412 				cp++;
413 				break;
414 			case ' ':
415 				/* Skip escaped blanks. */
416 				if (0 == quoted)
417 					cp++;
418 				break;
419 			default:
420 				break;
421 			}
422 		} else if (0 == quoted) {
423 			if (' ' == cp[0]) {
424 				/* Unescaped blanks end unquoted args. */
425 				white = 1;
426 				break;
427 			}
428 		} else if ('"' == cp[0]) {
429 			if ('"' == cp[1]) {
430 				/* Quoted quotes collapse. */
431 				pairs++;
432 				cp++;
433 			} else {
434 				/* Unquoted quotes end quoted args. */
435 				quoted = 2;
436 				break;
437 			}
438 		}
439 	}
440 
441 	/* Quoted argument without a closing quote. */
442 	if (1 == quoted)
443 		mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
444 
445 	/* NUL-terminate this argument and move to the next one. */
446 	if (pairs)
447 		cp[-pairs] = '\0';
448 	if ('\0' != *cp) {
449 		*cp++ = '\0';
450 		while (' ' == *cp)
451 			cp++;
452 	}
453 	*pos += (int)(cp - start) + (quoted ? 1 : 0);
454 	*cpp = cp;
455 
456 	if ('\0' == *cp && (white || ' ' == cp[-1]))
457 		mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
458 
459 	return(start);
460 }
461 
/*
 * Parse the string p according to the strptime(3) format fmt.
 * If the format matches the complete string, store the result of
 * mktime(3) in *t and return 1.  Otherwise, return 0 and leave
 * *t untouched.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/* Fields not set by the format must not contain garbage. */
	memset(&parsed, 0, sizeof(parsed));

	/* Reject both a mismatch and trailing, unparsed input. */
	rest = strptime(p, fmt, &parsed);
	if (rest == NULL || *rest != '\0')
		return(0);

	*t = mktime(&parsed);
	return(1);
}
478 
/*
 * Format the time t in local time as "Month day, year".
 * Return a buffer obtained from mandoc_malloc(),
 * or NULL if localtime(3) or one of the formatting steps fails.
 */
static char *
time2a(time_t t)
{
	struct tm	*when;
	char		*out, *cur;
	size_t		 mlen;
	int		 dlen;

	if ((when = localtime(&t)) == NULL)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	out = mandoc_malloc(10 + 4 + 4 + 1);
	cur = out;

	/* Month name followed by a blank. */
	mlen = strftime(cur, 10 + 1, "%B ", when);
	if (mlen == 0) {
		free(out);
		return(NULL);
	}
	cur += (int)mlen;

	/* Day of the month followed by a comma and a blank. */
	dlen = snprintf(cur, 4 + 1, "%d, ", when->tm_mday);
	if (dlen == -1) {
		free(out);
		return(NULL);
	}
	cur += dlen;

	/* Finally, the year. */
	if (strftime(cur, 4 + 1, "%Y", when) == 0) {
		free(out);
		return(NULL);
	}

	return(out);
}
515 
516 char *
517 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
518 {
519 	char		*out;
520 	time_t		 t;
521 
522 	if (NULL == in || '\0' == *in ||
523 	    0 == strcmp(in, "$" "Mdocdate$")) {
524 		mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL);
525 		time(&t);
526 	}
527 	else if (a2time(&t, "%Y-%m-%d", in))
528 		t = 0;
529 	else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
530 	    !a2time(&t, "%b %d, %Y", in)) {
531 		mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in);
532 		t = 0;
533 	}
534 	out = t ? time2a(t) : NULL;
535 	return(out ? out : mandoc_strdup(in));
536 }
537 
/*
 * Decide whether the sz bytes starting at p end a sentence.
 *
 * The buffer is scanned backwards from its last byte.
 * Closing delimiters (double quote, single quote, ']' and ')')
 * and end-of-sentence punctuation ('.', '!' and '?') are skipped;
 * the scan stops at the first byte that is neither.
 *
 * Return 1 if punctuation was found and either no closing delimiter
 * follows it, or the byte stopping the scan is alphanumeric.
 * Return 0 otherwise, in particular for an empty buffer.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	if (0 == sz)
		return(0);

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 *
	 * Iterate with an index rather than with a pointer running
	 * down to p - 1: computing a pointer in front of the buffer
	 * is undefined, and sz need not fit into an int.
	 */

	enclosed = found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
			/* FALLTHROUGH */
		case '\'':
			/* FALLTHROUGH */
		case ']':
			/* FALLTHROUGH */
		case ')':
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
			/* FALLTHROUGH */
		case '!':
			/* FALLTHROUGH */
		case '?':
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	/* The whole buffer consists of delimiters and punctuation. */
	return(found && !enclosed);
}
580 
/*
 * Convert the first sz bytes of p to an int using strtol(3)
 * with the given base.
 * Return -1 if sz exceeds 31, if the string is empty,
 * or if it is not completely consumed by the conversion.
 * Results that do not fit into an int are clamped
 * to INT_MAX or INT_MIN.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 copy[32];
	char	*stop;
	long	 num;

	/* One byte of the buffer is needed for the terminator. */
	if (sz >= sizeof(copy))
		return(-1);

	/* The input need not be NUL-terminated, so work on a copy. */
	memcpy(copy, p, sz);
	copy[sz] = '\0';

	errno = 0;
	num = strtol(copy, &stop, base);

	if (copy[0] == '\0' || *stop != '\0')
		return(-1);

	if (num > INT_MAX)
		return(INT_MAX);
	if (num < INT_MIN)
		return(INT_MIN);

	return((int)num);
}
611