xref: /illumos-gate/usr/src/cmd/mandoc/read.c (revision d2a70789)
1 /*	$Id: read.c,v 1.149 2016/07/10 13:34:30 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2010-2016 Ingo Schwarze <schwarze@openbsd.org>
5  * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 #include "config.h"
20 
21 #include <sys/types.h>
22 #if HAVE_MMAP
23 #include <sys/mman.h>
24 #include <sys/stat.h>
25 #endif
26 
27 #include <assert.h>
28 #include <ctype.h>
29 #if HAVE_ERR
30 #include <err.h>
31 #endif
32 #include <errno.h>
33 #include <fcntl.h>
34 #include <stdarg.h>
35 #include <stdint.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <unistd.h>
40 #include <zlib.h>
41 
42 #include "mandoc_aux.h"
43 #include "mandoc.h"
44 #include "roff.h"
45 #include "mdoc.h"
46 #include "man.h"
47 #include "libmandoc.h"
48 #include "roff_int.h"
49 
/*
 * Maximum number of ROFF_REPARSE rounds mparse_buf_r() performs for
 * one input line before it reports MANDOCERR_ROFFLOOP instead.
 */
50 #define	REPARSE_LIMIT	1000
51 
/*
 * State of one parser instance.
 * Allocated by mparse_alloc(), cleared between files by
 * mparse_reset(), and released by mparse_free().
 * The secondary buffer only exists after mparse_keep() was called.
 */
52 struct	mparse {
53 	struct roff_man	 *man; /* man parser */
54 	struct roff	 *roff; /* roff parser (!NULL) */
55 	char		 *sodest; /* filename pointed to by .so */
56 	const char	 *file; /* filename of current input file */
57 	struct buf	 *primary; /* buffer currently being parsed */
58 	struct buf	 *secondary; /* preprocessed copy of input */
59 	const char	 *defos; /* default operating system */
60 	mandocmsg	  mmsg; /* warning/error message handler */
61 	enum mandoclevel  file_status; /* status of current parse */
62 	enum mandoclevel  wlevel; /* ignore messages below this */
63 	int		  options; /* parser options */
64 	int		  gzip; /* current input file is gzipped */
65 	int		  filenc; /* encoding of the current file */
66 	int		  reparse_count; /* finite interp. stack */
67 	int		  line; /* line number in the file */
68 };
69 
/* Forward declarations of the helpers private to this file. */
70 static	void	  choose_parser(struct mparse *);
71 static	void	  resize_buf(struct buf *, size_t);
72 static	void	  mparse_buf_r(struct mparse *, struct buf, size_t, int);
73 static	int	  read_whole_file(struct mparse *, const char *, int,
74 				struct buf *, int *);
75 static	void	  mparse_end(struct mparse *);
76 static	void	  mparse_parse_buffer(struct mparse *, struct buf,
77 			const char *);
78 
/*
 * Lower bound of the error codes belonging to each message level,
 * indexed by enum mandoclevel.  mandoc_msg() starts at
 * MANDOCLEVEL_UNSUPP and walks downward until the error code is
 * no longer smaller than the table entry.
 * NOTE(review): the index order presumably matches the names in
 * mandoclevels[] below - confirm against mandoc.h.
 */
79 static	const enum mandocerr	mandoclimits[MANDOCLEVEL_MAX] = {
80 	MANDOCERR_OK,
81 	MANDOCERR_WARNING,
82 	MANDOCERR_WARNING,
83 	MANDOCERR_ERROR,
84 	MANDOCERR_UNSUPP,
85 	MANDOCERR_MAX,
86 	MANDOCERR_MAX
87 };
88 
/*
 * Message text for each error code, indexed by enum mandocerr and
 * handed out by mparse_strerror().
 * NOTE(review): the entries presumably have to stay in the same
 * order as the enum in mandoc.h - verify before adding or moving
 * a string.
 */
89 static	const char * const	mandocerrs[MANDOCERR_MAX] = {
90 	"ok",
91 
92 	"generic warning",
93 
94 	/* related to the prologue */
95 	"missing manual title, using UNTITLED",
96 	"missing manual title, using \"\"",
97 	"lower case character in document title",
98 	"missing manual section, using \"\"",
99 	"unknown manual section",
100 	"missing date, using today's date",
101 	"cannot parse date, using it verbatim",
102 	"missing Os macro, using \"\"",
103 	"duplicate prologue macro",
104 	"late prologue macro",
105 	"skipping late title macro",
106 	"prologue macros out of order",
107 
108 	/* related to document structure */
109 	".so is fragile, better use ln(1)",
110 	"no document body",
111 	"content before first section header",
112 	"first section is not \"NAME\"",
113 	"NAME section without name",
114 	"NAME section without description",
115 	"description not at the end of NAME",
116 	"bad NAME section content",
117 	"missing description line, using \"\"",
118 	"sections out of conventional order",
119 	"duplicate section title",
120 	"unexpected section",
121 	"unusual Xr order",
122 	"unusual Xr punctuation",
123 	"AUTHORS section without An macro",
124 
125 	/* related to macros and nesting */
126 	"obsolete macro",
127 	"macro neither callable nor escaped",
128 	"skipping paragraph macro",
129 	"moving paragraph macro out of list",
130 	"skipping no-space macro",
131 	"blocks badly nested",
132 	"nested displays are not portable",
133 	"moving content out of list",
134 	"fill mode already enabled, skipping",
135 	"fill mode already disabled, skipping",
136 	"line scope broken",
137 
138 	/* related to missing macro arguments */
139 	"skipping empty request",
140 	"conditional request controls empty scope",
141 	"skipping empty macro",
142 	"empty block",
143 	"empty argument, using 0n",
144 	"missing display type, using -ragged",
145 	"list type is not the first argument",
146 	"missing -width in -tag list, using 8n",
147 	"missing utility name, using \"\"",
148 	"missing function name, using \"\"",
149 	"empty head in list item",
150 	"empty list item",
151 	"missing font type, using \\fR",
152 	"unknown font type, using \\fR",
153 	"nothing follows prefix",
154 	"empty reference block",
155 	"missing -std argument, adding it",
156 	"missing option string, using \"\"",
157 	"missing resource identifier, using \"\"",
158 	"missing eqn box, using \"\"",
159 
160 	/* related to bad macro arguments */
161 	"unterminated quoted argument",
162 	"duplicate argument",
163 	"skipping duplicate argument",
164 	"skipping duplicate display type",
165 	"skipping duplicate list type",
166 	"skipping -width argument",
167 	"wrong number of cells",
168 	"unknown AT&T UNIX version",
169 	"comma in function argument",
170 	"parenthesis in function name",
171 	"invalid content in Rs block",
172 	"invalid Boolean argument",
173 	"unknown font, skipping request",
174 	"odd number of characters in request",
175 
176 	/* related to plain text */
177 	"blank line in fill mode, using .sp",
178 	"tab in filled text",
179 	"whitespace at end of input line",
180 	"bad comment style",
181 	"invalid escape sequence",
182 	"undefined string, using \"\"",
183 
184 	/* related to tables */
185 	"tbl line starts with span",
186 	"tbl column starts with span",
187 	"skipping vertical bar in tbl layout",
188 
189 	"generic error",
190 
191 	/* related to tables */
192 	"non-alphabetic character in tbl options",
193 	"skipping unknown tbl option",
194 	"missing tbl option argument",
195 	"wrong tbl option argument size",
196 	"empty tbl layout",
197 	"invalid character in tbl layout",
198 	"unmatched parenthesis in tbl layout",
199 	"tbl without any data cells",
200 	"ignoring data in spanned tbl cell",
201 	"ignoring extra tbl data cells",
202 	"data block open at end of tbl",
203 
204 	/* related to document structure and macros */
	/*
	 * The following slot has no fixed text.
	 * NOTE(review): presumably this is MANDOCERR_FILE, for which
	 * mparse_open() passes the strerror(3) text as the message -
	 * confirm against the enum in mandoc.h.
	 */
205 	NULL,
206 	"input stack limit exceeded, infinite loop?",
207 	"skipping bad character",
208 	"skipping unknown macro",
209 	"skipping insecure request",
210 	"skipping item outside list",
211 	"skipping column outside column list",
212 	"skipping end of block that is not open",
213 	"fewer RS blocks open, skipping",
214 	"inserting missing end of block",
215 	"appending missing end of block",
216 
217 	/* related to request and macro arguments */
218 	"escaped character not allowed in a name",
219 	"NOT IMPLEMENTED: Bd -file",
220 	"skipping display without arguments",
221 	"missing list type, using -item",
222 	"missing manual name, using \"\"",
223 	"uname(3) system call failed, using UNKNOWN",
224 	"unknown standard specifier",
225 	"skipping request without numeric argument",
226 	"NOT IMPLEMENTED: .so with absolute path or \"..\"",
227 	".so request failed",
228 	"skipping all arguments",
229 	"skipping excess arguments",
230 	"divide by zero",
231 
232 	"unsupported feature",
233 	"input too large",
234 	"unsupported control character",
235 	"unsupported roff request",
236 	"eqn delim option in tbl",
237 	"unsupported tbl layout modifier",
238 	"ignoring macro in table",
239 };
240 
/*
 * Printable name of each message level, indexed by
 * enum mandoclevel and handed out by mparse_strlevel().
 */
241 static	const char * const	mandoclevels[MANDOCLEVEL_MAX] = {
242 	"SUCCESS",
243 	"RESERVED",
244 	"WARNING",
245 	"ERROR",
246 	"UNSUPP",
247 	"BADARG",
248 	"SYSERR"
249 };
250 
251 
252 static void
253 resize_buf(struct buf *buf, size_t initial)
254 {
255 
256 	buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
257 	buf->buf = mandoc_realloc(buf->buf, buf->sz);
258 }
259 
260 static void
261 choose_parser(struct mparse *curp)
262 {
263 	char		*cp, *ep;
264 	int		 format;
265 
266 	/*
267 	 * If neither command line arguments -mdoc or -man select
268 	 * a parser nor the roff parser found a .Dd or .TH macro
269 	 * yet, look ahead in the main input buffer.
270 	 */
271 
272 	if ((format = roff_getformat(curp->roff)) == 0) {
273 		cp = curp->primary->buf;
274 		ep = cp + curp->primary->sz;
275 		while (cp < ep) {
276 			if (*cp == '.' || *cp == '\'') {
277 				cp++;
278 				if (cp[0] == 'D' && cp[1] == 'd') {
279 					format = MPARSE_MDOC;
280 					break;
281 				}
282 				if (cp[0] == 'T' && cp[1] == 'H') {
283 					format = MPARSE_MAN;
284 					break;
285 				}
286 			}
287 			cp = memchr(cp, '\n', ep - cp);
288 			if (cp == NULL)
289 				break;
290 			cp++;
291 		}
292 	}
293 
294 	if (curp->man == NULL) {
295 		curp->man = roff_man_alloc(curp->roff, curp, curp->defos,
296 		    curp->options & MPARSE_QUICK ? 1 : 0);
297 		curp->man->macroset = MACROSET_MAN;
298 		curp->man->first->tok = TOKEN_NONE;
299 	}
300 
301 	if (format == MPARSE_MDOC) {
302 		mdoc_hash_init();
303 		curp->man->macroset = MACROSET_MDOC;
304 		curp->man->first->tok = TOKEN_NONE;
305 	} else {
306 		man_hash_init();
307 		curp->man->macroset = MACROSET_MAN;
308 		curp->man->first->tok = TOKEN_NONE;
309 	}
310 }
311 
312 /*
313  * Main parse routine for a buffer.
314  * It assumes encoding and line numbering are already set up.
315  * It can recurse directly (for invocations of user-defined
316  * macros, inline equations, and input line traps)
317  * and indirectly (for .so file inclusion).
 *
 * curp:  parser state; curp->line supplies the first line number.
 * blk:   the input to parse, passed by value.
 * i:     byte offset into blk.buf where parsing begins.
 * start: non-zero when called by mparse_parse_buffer() on the
 *        content of an input file; zero for the recursive calls
 *        on a temporary buffer, where a NUL byte ends the input.
 *
 * Each loop iteration copies one logical input line into the
 * local buffer ln, hands it to roff_parseln(), and then acts
 * on the result code.
318  */
319 static void
320 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
321 {
322 	const struct tbl_span	*span;
323 	struct buf	 ln;
324 	const char	*save_file;
325 	char		*cp;
326 	size_t		 pos; /* byte number in the ln buffer */
327 	enum rofferr	 rr;
328 	int		 of;
329 	int		 lnn; /* line number in the real file */
330 	int		 fd;
331 	unsigned char	 c;
332 
	/* The line buffer starts out empty; resize_buf() grows it. */
333 	memset(&ln, 0, sizeof(ln));
334 
335 	lnn = curp->line;
336 	pos = 0;
337 
338 	while (i < blk.sz) {
		/* A NUL byte at the start of a fresh line ends the input. */
339 		if (0 == pos && '\0' == blk.buf[i])
340 			break;
341 
		/*
		 * Only file content advances the line number used in
		 * messages and resets the per-line reparse counter.
		 * While both encodings are still candidates, the first
		 * two lines are passed to preconv_cue(), whose result
		 * replaces the encoding flags.
		 */
342 		if (start) {
343 			curp->line = lnn;
344 			curp->reparse_count = 0;
345 
346 			if (lnn < 3 &&
347 			    curp->filenc & MPARSE_UTF8 &&
348 			    curp->filenc & MPARSE_LATIN1)
349 				curp->filenc = preconv_cue(&blk, i);
350 		}
351 
		/*
		 * Character loop: copy bytes from blk to ln until the
		 * end of the logical line.  In a temporary buffer
		 * (start == 0), a NUL byte also ends the loop.
		 */
352 		while (i < blk.sz && (start || blk.buf[i] != '\0')) {
353 
354 			/*
355 			 * When finding an unescaped newline character,
356 			 * leave the character loop to process the line.
357 			 * Skip a preceding carriage return, if any.
358 			 */
359 
360 			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
361 			    '\n' == blk.buf[i + 1])
362 				++i;
363 			if ('\n' == blk.buf[i]) {
364 				++i;
365 				++lnn;
366 				break;
367 			}
368 
369 			/*
370 			 * Make sure we have space for the worst
371 			 * case of 11 bytes: "\\[u10ffff]\0"
372 			 */
373 
374 			if (pos + 11 > ln.sz)
375 				resize_buf(&ln, 256);
376 
377 			/*
378 			 * Encode 8-bit input.
			 * If no encoding is active or preconv_encode()
			 * returns zero, report the byte, substitute a
			 * question mark, and skip it.
379 			 */
380 
381 			c = blk.buf[i];
382 			if (c & 0x80) {
383 				if ( ! (curp->filenc && preconv_encode(
384 				    &blk, &i, &ln, &pos, &curp->filenc))) {
385 					mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
386 					    curp->line, pos, "0x%x", c);
387 					ln.buf[pos++] = '?';
388 					i++;
389 				}
390 				continue;
391 			}
392 
393 			/*
394 			 * Exclude control characters.
			 * Everything below 0x20 except the tab, and 0x7f,
			 * is reported and replaced by a question mark;
			 * a lone carriage return is dropped instead.
395 			 */
396 
397 			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
398 				mandoc_vmsg(c == 0x00 || c == 0x04 ||
399 				    c > 0x0a ? MANDOCERR_CHAR_BAD :
400 				    MANDOCERR_CHAR_UNSUPP,
401 				    curp, curp->line, pos, "0x%x", c);
402 				i++;
403 				if (c != '\r')
404 					ln.buf[pos++] = '?';
405 				continue;
406 			}
407 
408 			/* Trailing backslash = a plain char. */
409 
410 			if (blk.buf[i] != '\\' || i + 1 == blk.sz) {
411 				ln.buf[pos++] = blk.buf[i++];
412 				continue;
413 			}
414 
415 			/*
416 			 * Found escape and at least one other character.
417 			 * When it's a newline character, skip it.
418 			 * When there is a carriage return in between,
419 			 * skip that one as well.
420 			 */
421 
422 			if ('\r' == blk.buf[i + 1] && i + 2 < blk.sz &&
423 			    '\n' == blk.buf[i + 2])
424 				++i;
425 			if ('\n' == blk.buf[i + 1]) {
426 				i += 2;
427 				++lnn;
428 				continue;
429 			}
430 
			/* The escapes \" and \# start a comment. */
431 			if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
432 				i += 2;
433 				/* Comment, skip to end of line */
434 				for (; i < blk.sz; ++i) {
435 					if ('\n' == blk.buf[i]) {
436 						++i;
437 						++lnn;
438 						break;
439 					}
440 				}
441 
442 				/* Backout trailing whitespaces */
				/* A blank preceded by a backslash is kept. */
443 				for (; pos > 0; --pos) {
444 					if (ln.buf[pos - 1] != ' ')
445 						break;
446 					if (pos > 2 && ln.buf[pos - 2] == '\\')
447 						break;
448 				}
449 				break;
450 			}
451 
452 			/* Catch escaped bogus characters. */
453 
454 			c = (unsigned char) blk.buf[i+1];
455 
456 			if ( ! (isascii(c) &&
457 			    (isgraph(c) || isblank(c)))) {
458 				mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
459 				    curp->line, pos, "0x%x", c);
460 				i += 2;
461 				ln.buf[pos++] = '?';
462 				continue;
463 			}
464 
465 			/* Some other escape sequence, copy & cont. */
466 
467 			ln.buf[pos++] = blk.buf[i++];
468 			ln.buf[pos++] = blk.buf[i++];
469 		}
470 
		/* Make room for the terminating NUL, also for empty lines. */
471 		if (pos >= ln.sz)
472 			resize_buf(&ln, 256);
473 
474 		ln.buf[pos] = '\0';
475 
476 		/*
477 		 * A significant amount of complexity is contained by
478 		 * the roff preprocessor.  It's line-oriented but can be
479 		 * expressed on one line, so we need at times to
480 		 * readjust our starting point and re-run it.  The roff
481 		 * preprocessor can also readjust the buffers with new
482 		 * data, so we pass them in wholesale.
483 		 */
484 
485 		of = 0;
486 
487 		/*
488 		 * Maintain a lookaside buffer of all parsed lines.  We
489 		 * only do this if mparse_keep() has been invoked (the
490 		 * buffer may be accessed with mparse_getkeep()).
		 * The line is appended followed by a newline, and the
		 * copy is kept NUL-terminated.
491 		 */
492 
493 		if (curp->secondary) {
494 			curp->secondary->buf = mandoc_realloc(
495 			    curp->secondary->buf,
496 			    curp->secondary->sz + pos + 2);
497 			memcpy(curp->secondary->buf +
498 			    curp->secondary->sz,
499 			    ln.buf, pos);
500 			curp->secondary->sz += pos;
501 			curp->secondary->buf
502 				[curp->secondary->sz] = '\n';
503 			curp->secondary->sz++;
504 			curp->secondary->buf
505 				[curp->secondary->sz] = '\0';
506 		}
507 rerun:
508 		rr = roff_parseln(curp->roff, curp->line, &ln, &of);
509 
510 		switch (rr) {
511 		case ROFF_REPARSE:
			/*
			 * Parse the resulting line buffer recursively,
			 * beginning at offset of, unless the reparse
			 * budget of this input line is used up.
			 */
512 			if (REPARSE_LIMIT >= ++curp->reparse_count)
513 				mparse_buf_r(curp, ln, of, 0);
514 			else
515 				mandoc_msg(MANDOCERR_ROFFLOOP, curp,
516 				    curp->line, pos, NULL);
517 			pos = 0;
518 			continue;
519 		case ROFF_APPEND:
			/* Keep the text in ln and append the next line. */
520 			pos = strlen(ln.buf);
521 			continue;
522 		case ROFF_RERUN:
			/* Hand the same buffer to roff_parseln() again. */
523 			goto rerun;
524 		case ROFF_IGN:
			/* Nothing is passed on; begin the next line. */
525 			pos = 0;
526 			continue;
527 		case ROFF_SO:
			/*
			 * If following .so is disabled and no input
			 * remains after the request, only record the
			 * target in curp->sodest and stop parsing.
			 */
528 			if ( ! (curp->options & MPARSE_SO) &&
529 			    (i >= blk.sz || blk.buf[i] == '\0')) {
530 				curp->sodest = mandoc_strdup(ln.buf + of);
531 				free(ln.buf);
532 				return;
533 			}
534 			/*
535 			 * We remove `so' clauses from our lookaside
536 			 * buffer because we're going to descend into
537 			 * the file recursively.
538 			 */
539 			if (curp->secondary)
540 				curp->secondary->sz -= pos + 1;
			/*
			 * mparse_open() overwrites curp->file, so the
			 * name of the including file is saved here and
			 * restored on both paths.
			 */
541 			save_file = curp->file;
542 			if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
543 				mparse_readfd(curp, fd, ln.buf + of);
544 				close(fd);
545 				curp->file = save_file;
546 			} else {
				/*
				 * The file cannot be opened: report it and
				 * parse a replacement text naming the file.
				 * The new string replaces ln, and ln.sz is
				 * set from the value mandoc_asprintf() returns.
				 */
547 				curp->file = save_file;
548 				mandoc_vmsg(MANDOCERR_SO_FAIL,
549 				    curp, curp->line, pos,
550 				    ".so %s", ln.buf + of);
551 				ln.sz = mandoc_asprintf(&cp,
552 				    ".sp\nSee the file %s.\n.sp",
553 				    ln.buf + of);
554 				free(ln.buf);
555 				ln.buf = cp;
556 				of = 0;
557 				mparse_buf_r(curp, ln, of, 0);
558 			}
559 			pos = 0;
560 			continue;
561 		default:
562 			break;
563 		}
564 
565 		/*
566 		 * If input parsers have not been allocated, do so now.
567 		 * We keep these instanced between parsers, but set them
568 		 * locally per parse routine since we can use different
569 		 * parsers with each one.
570 		 */
571 
572 		if (curp->man == NULL ||
573 		    curp->man->macroset == MACROSET_NONE)
574 			choose_parser(curp);
575 
576 		/*
577 		 * Lastly, push down into the parsers themselves.
578 		 * If libroff returns ROFF_TBL, then add it to the
579 		 * currently open parse.  Since we only get here if
580 		 * there does exist data (see tbl_data.c), we're
581 		 * guaranteed that something's been allocated.
582 		 * Do the same for ROFF_EQN.
		 * A return value of 2 from the macro parser ends the
		 * processing of this buffer.
		 * NOTE(review): the meaning of that value is defined
		 * by mdoc_parseln() and man_parseln() - confirm there.
583 		 */
584 
585 		if (rr == ROFF_TBL)
586 			while ((span = roff_span(curp->roff)) != NULL)
587 				roff_addtbl(curp->man, span);
588 		else if (rr == ROFF_EQN)
589 			roff_addeqn(curp->man, roff_eqn(curp->roff));
590 		else if ((curp->man->macroset == MACROSET_MDOC ?
591 		    mdoc_parseln(curp->man, curp->line, ln.buf, of) :
592 		    man_parseln(curp->man, curp->line, ln.buf, of)) == 2)
593 				break;
594 
595 		/* Temporary buffers typically are not full. */
596 
597 		if (0 == start && '\0' == blk.buf[i])
598 			break;
599 
600 		/* Start the next input line. */
601 
602 		pos = 0;
603 	}
604 
605 	free(ln.buf);
606 }
607 
608 static int
609 read_whole_file(struct mparse *curp, const char *file, int fd,
610 		struct buf *fb, int *with_mmap)
611 {
612 	gzFile		 gz;
613 	size_t		 off;
614 	ssize_t		 ssz;
615 
616 #if HAVE_MMAP
617 	struct stat	 st;
618 
619 	if (fstat(fd, &st) == -1)
620 		err((int)MANDOCLEVEL_SYSERR, "%s", file);
621 
622 	/*
623 	 * If we're a regular file, try just reading in the whole entry
624 	 * via mmap().  This is faster than reading it into blocks, and
625 	 * since each file is only a few bytes to begin with, I'm not
626 	 * concerned that this is going to tank any machines.
627 	 */
628 
629 	if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
630 		if (st.st_size > 0x7fffffff) {
631 			mandoc_msg(MANDOCERR_TOOLARGE, curp, 0, 0, NULL);
632 			return 0;
633 		}
634 		*with_mmap = 1;
635 		fb->sz = (size_t)st.st_size;
636 		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
637 		if (fb->buf != MAP_FAILED)
638 			return 1;
639 	}
640 #endif
641 
642 	if (curp->gzip) {
643 		if ((gz = gzdopen(fd, "rb")) == NULL)
644 			err((int)MANDOCLEVEL_SYSERR, "%s", file);
645 	} else
646 		gz = NULL;
647 
648 	/*
649 	 * If this isn't a regular file (like, say, stdin), then we must
650 	 * go the old way and just read things in bit by bit.
651 	 */
652 
653 	*with_mmap = 0;
654 	off = 0;
655 	fb->sz = 0;
656 	fb->buf = NULL;
657 	for (;;) {
658 		if (off == fb->sz) {
659 			if (fb->sz == (1U << 31)) {
660 				mandoc_msg(MANDOCERR_TOOLARGE, curp,
661 				    0, 0, NULL);
662 				break;
663 			}
664 			resize_buf(fb, 65536);
665 		}
666 		ssz = curp->gzip ?
667 		    gzread(gz, fb->buf + (int)off, fb->sz - off) :
668 		    read(fd, fb->buf + (int)off, fb->sz - off);
669 		if (ssz == 0) {
670 			fb->sz = off;
671 			return 1;
672 		}
673 		if (ssz == -1)
674 			err((int)MANDOCLEVEL_SYSERR, "%s", file);
675 		off += (size_t)ssz;
676 	}
677 
678 	free(fb->buf);
679 	fb->buf = NULL;
680 	return 0;
681 }
682 
683 static void
684 mparse_end(struct mparse *curp)
685 {
686 
687 	if (curp->man == NULL && curp->sodest == NULL)
688 		curp->man = roff_man_alloc(curp->roff, curp, curp->defos,
689 		    curp->options & MPARSE_QUICK ? 1 : 0);
690 	if (curp->man->macroset == MACROSET_NONE)
691 		curp->man->macroset = MACROSET_MAN;
692 	if (curp->man->macroset == MACROSET_MDOC)
693 		mdoc_endparse(curp->man);
694 	else
695 		man_endparse(curp->man);
696 	roff_endparse(curp->roff);
697 }
698 
699 static void
700 mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
701 {
702 	struct buf	*svprimary;
703 	const char	*svfile;
704 	size_t		 offset;
705 	static int	 recursion_depth;
706 
707 	if (64 < recursion_depth) {
708 		mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, 0, NULL);
709 		return;
710 	}
711 
712 	/* Line number is per-file. */
713 	svfile = curp->file;
714 	curp->file = file;
715 	svprimary = curp->primary;
716 	curp->primary = &blk;
717 	curp->line = 1;
718 	recursion_depth++;
719 
720 	/* Skip an UTF-8 byte order mark. */
721 	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
722 	    (unsigned char)blk.buf[0] == 0xef &&
723 	    (unsigned char)blk.buf[1] == 0xbb &&
724 	    (unsigned char)blk.buf[2] == 0xbf) {
725 		offset = 3;
726 		curp->filenc &= ~MPARSE_LATIN1;
727 	} else
728 		offset = 0;
729 
730 	mparse_buf_r(curp, blk, offset, 1);
731 
732 	if (--recursion_depth == 0)
733 		mparse_end(curp);
734 
735 	curp->primary = svprimary;
736 	curp->file = svfile;
737 }
738 
739 enum mandoclevel
740 mparse_readmem(struct mparse *curp, void *buf, size_t len,
741 		const char *file)
742 {
743 	struct buf blk;
744 
745 	blk.buf = buf;
746 	blk.sz = len;
747 
748 	mparse_parse_buffer(curp, blk, file);
749 	return curp->file_status;
750 }
751 
752 /*
753  * Read the whole file into memory and call the parsers.
754  * Called recursively when an .so request is encountered.
755  */
756 enum mandoclevel
757 mparse_readfd(struct mparse *curp, int fd, const char *file)
758 {
759 	struct buf	 blk;
760 	int		 with_mmap;
761 	int		 save_filenc;
762 
763 	if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
764 		save_filenc = curp->filenc;
765 		curp->filenc = curp->options &
766 		    (MPARSE_UTF8 | MPARSE_LATIN1);
767 		mparse_parse_buffer(curp, blk, file);
768 		curp->filenc = save_filenc;
769 #if HAVE_MMAP
770 		if (with_mmap)
771 			munmap(blk.buf, blk.sz);
772 		else
773 #endif
774 			free(blk.buf);
775 	}
776 	return curp->file_status;
777 }
778 
779 int
780 mparse_open(struct mparse *curp, const char *file)
781 {
782 	char		 *cp;
783 	int		  fd;
784 
785 	curp->file = file;
786 	cp = strrchr(file, '.');
787 	curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
788 
789 	/* First try to use the filename as it is. */
790 
791 	if ((fd = open(file, O_RDONLY)) != -1)
792 		return fd;
793 
794 	/*
795 	 * If that doesn't work and the filename doesn't
796 	 * already  end in .gz, try appending .gz.
797 	 */
798 
799 	if ( ! curp->gzip) {
800 		mandoc_asprintf(&cp, "%s.gz", file);
801 		fd = open(cp, O_RDONLY);
802 		free(cp);
803 		if (fd != -1) {
804 			curp->gzip = 1;
805 			return fd;
806 		}
807 	}
808 
809 	/* Neither worked, give up. */
810 
811 	mandoc_msg(MANDOCERR_FILE, curp, 0, 0, strerror(errno));
812 	return -1;
813 }
814 
815 struct mparse *
816 mparse_alloc(int options, enum mandoclevel wlevel, mandocmsg mmsg,
817     const char *defos)
818 {
819 	struct mparse	*curp;
820 
821 	curp = mandoc_calloc(1, sizeof(struct mparse));
822 
823 	curp->options = options;
824 	curp->wlevel = wlevel;
825 	curp->mmsg = mmsg;
826 	curp->defos = defos;
827 
828 	curp->roff = roff_alloc(curp, options);
829 	curp->man = roff_man_alloc( curp->roff, curp, curp->defos,
830 		curp->options & MPARSE_QUICK ? 1 : 0);
831 	if (curp->options & MPARSE_MDOC) {
832 		mdoc_hash_init();
833 		curp->man->macroset = MACROSET_MDOC;
834 	} else if (curp->options & MPARSE_MAN) {
835 		man_hash_init();
836 		curp->man->macroset = MACROSET_MAN;
837 	}
838 	curp->man->first->tok = TOKEN_NONE;
839 	return curp;
840 }
841 
842 void
843 mparse_reset(struct mparse *curp)
844 {
845 
846 	roff_reset(curp->roff);
847 
848 	if (curp->man != NULL)
849 		roff_man_reset(curp->man);
850 	if (curp->secondary)
851 		curp->secondary->sz = 0;
852 
853 	curp->file_status = MANDOCLEVEL_OK;
854 
855 	free(curp->sodest);
856 	curp->sodest = NULL;
857 }
858 
859 void
860 mparse_free(struct mparse *curp)
861 {
862 
863 	roff_man_free(curp->man);
864 	if (curp->roff)
865 		roff_free(curp->roff);
866 	if (curp->secondary)
867 		free(curp->secondary->buf);
868 
869 	free(curp->secondary);
870 	free(curp->sodest);
871 	free(curp);
872 }
873 
874 void
875 mparse_result(struct mparse *curp, struct roff_man **man,
876 	char **sodest)
877 {
878 
879 	if (sodest && NULL != (*sodest = curp->sodest)) {
880 		*man = NULL;
881 		return;
882 	}
883 	if (man)
884 		*man = curp->man;
885 }
886 
887 void
888 mandoc_vmsg(enum mandocerr t, struct mparse *m,
889 		int ln, int pos, const char *fmt, ...)
890 {
891 	char		 buf[256];
892 	va_list		 ap;
893 
894 	va_start(ap, fmt);
895 	(void)vsnprintf(buf, sizeof(buf), fmt, ap);
896 	va_end(ap);
897 
898 	mandoc_msg(t, m, ln, pos, buf);
899 }
900 
901 void
902 mandoc_msg(enum mandocerr er, struct mparse *m,
903 		int ln, int col, const char *msg)
904 {
905 	enum mandoclevel level;
906 
907 	level = MANDOCLEVEL_UNSUPP;
908 	while (er < mandoclimits[level])
909 		level--;
910 
911 	if (level < m->wlevel && er != MANDOCERR_FILE)
912 		return;
913 
914 	if (m->mmsg)
915 		(*m->mmsg)(er, level, m->file, ln, col, msg);
916 
917 	if (m->file_status < level)
918 		m->file_status = level;
919 }
920 
921 const char *
922 mparse_strerror(enum mandocerr er)
923 {
924 
925 	return mandocerrs[er];
926 }
927 
928 const char *
929 mparse_strlevel(enum mandoclevel lvl)
930 {
931 	return mandoclevels[lvl];
932 }
933 
934 void
935 mparse_keep(struct mparse *p)
936 {
937 
938 	assert(NULL == p->secondary);
939 	p->secondary = mandoc_calloc(1, sizeof(struct buf));
940 }
941 
942 const char *
943 mparse_getkeep(const struct mparse *p)
944 {
945 
946 	assert(p->secondary);
947 	return p->secondary->sz ? p->secondary->buf : NULL;
948 }
949