xref: /illumos-gate/usr/src/cmd/mandoc/read.c (revision bec2e3ff)
1 /*	$Id: read.c,v 1.161 2017/02/18 17:29:28 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2010-2017 Ingo Schwarze <schwarze@openbsd.org>
5  * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 #include "config.h"
20 
21 #include <sys/types.h>
22 #include <sys/mman.h>
23 #include <sys/stat.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #if HAVE_ERR
28 #include <err.h>
29 #endif
30 #include <errno.h>
31 #include <fcntl.h>
32 #include <stdarg.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37 #include <zlib.h>
38 
39 #include "mandoc_aux.h"
40 #include "mandoc.h"
41 #include "roff.h"
42 #include "mdoc.h"
43 #include "man.h"
44 #include "libmandoc.h"
45 #include "roff_int.h"
46 
47 #define	REPARSE_LIMIT	1000
48 
/*
 * Top-level parse state used by the functions in this file.
 * It is allocated by mparse_alloc(), holds the roff preprocessor and
 * macro parser handles, and carries the per-file bookkeeping
 * (file name, encoding, line number, status) that is updated while
 * input is being read.
 */
struct	mparse {
	struct roff	 *roff; /* roff parser (!NULL) */
	struct roff_man	 *man; /* man parser */
	char		 *sodest; /* filename pointed to by .so */
	const char	 *file; /* filename of current input file */
	struct buf	 *primary; /* buffer currently being parsed */
	struct buf	 *secondary; /* preprocessed copy of input */
	const char	 *defos; /* default operating system */
	mandocmsg	  mmsg; /* warning/error message handler */
	enum mandoclevel  file_status; /* status of current parse */
	enum mandoclevel  wlevel; /* ignore messages below this */
	int		  options; /* parser options */
	int		  gzip; /* current input file is gzipped */
	int		  filenc; /* encoding of the current file */
	int		  reparse_count; /* finite interp. stack */
	int		  line; /* line number in the file */
};
66 
67 static	void	  choose_parser(struct mparse *);
68 static	void	  resize_buf(struct buf *, size_t);
69 static	void	  mparse_buf_r(struct mparse *, struct buf, size_t, int);
70 static	int	  read_whole_file(struct mparse *, const char *, int,
71 				struct buf *, int *);
72 static	void	  mparse_end(struct mparse *);
73 static	void	  mparse_parse_buffer(struct mparse *, struct buf,
74 			const char *);
75 
/*
 * Indexed by enum mandoclevel.  mandoc_msg() starts at
 * MANDOCLEVEL_UNSUPP and steps down while the error code is smaller
 * than the entry for the current level, so each entry acts as the
 * lowest error code that is reported at that level.
 */
static	const enum mandocerr	mandoclimits[MANDOCLEVEL_MAX] = {
	MANDOCERR_OK,
	MANDOCERR_WARNING,
	MANDOCERR_WARNING,
	MANDOCERR_ERROR,
	MANDOCERR_UNSUPP,
	MANDOCERR_MAX,
	MANDOCERR_MAX
};
85 
/*
 * Human-readable message text, indexed by enum mandocerr and
 * returned by mparse_strerror().  The order of the entries must
 * therefore stay in step with the enum declaration.
 * NOTE(review): the single NULL slot presumably belongs to
 * MANDOCERR_FILE, whose text is supplied by the caller (see
 * mparse_open()) -- confirm against the enum in mandoc.h.
 */
static	const char * const	mandocerrs[MANDOCERR_MAX] = {
	"ok",

	"generic warning",

	/* related to the prologue */
	"missing manual title, using UNTITLED",
	"missing manual title, using \"\"",
	"lower case character in document title",
	"missing manual section, using \"\"",
	"unknown manual section",
	"missing date, using today's date",
	"cannot parse date, using it verbatim",
	"missing Os macro, using \"\"",
	"duplicate prologue macro",
	"late prologue macro",
	"skipping late title macro",
	"prologue macros out of order",

	/* related to document structure */
	".so is fragile, better use ln(1)",
	"no document body",
	"content before first section header",
	"first section is not \"NAME\"",
	"NAME section without Nm before Nd",
	"NAME section without description",
	"description not at the end of NAME",
	"bad NAME section content",
	"missing comma before name",
	"missing description line, using \"\"",
	"sections out of conventional order",
	"duplicate section title",
	"unexpected section",
	"unusual Xr order",
	"unusual Xr punctuation",
	"AUTHORS section without An macro",

	/* related to macros and nesting */
	"obsolete macro",
	"macro neither callable nor escaped",
	"skipping paragraph macro",
	"moving paragraph macro out of list",
	"skipping no-space macro",
	"blocks badly nested",
	"nested displays are not portable",
	"moving content out of list",
	"fill mode already enabled, skipping",
	"fill mode already disabled, skipping",
	"line scope broken",

	/* related to missing macro arguments */
	"skipping empty request",
	"conditional request controls empty scope",
	"skipping empty macro",
	"empty block",
	"empty argument, using 0n",
	"missing display type, using -ragged",
	"list type is not the first argument",
	"missing -width in -tag list, using 6n",
	"missing utility name, using \"\"",
	"missing function name, using \"\"",
	"empty head in list item",
	"empty list item",
	"missing font type, using \\fR",
	"unknown font type, using \\fR",
	"nothing follows prefix",
	"empty reference block",
	"missing section argument",
	"missing -std argument, adding it",
	"missing option string, using \"\"",
	"missing resource identifier, using \"\"",
	"missing eqn box, using \"\"",

	/* related to bad macro arguments */
	"unterminated quoted argument",
	"duplicate argument",
	"skipping duplicate argument",
	"skipping duplicate display type",
	"skipping duplicate list type",
	"skipping -width argument",
	"wrong number of cells",
	"unknown AT&T UNIX version",
	"comma in function argument",
	"parenthesis in function name",
	"invalid content in Rs block",
	"invalid Boolean argument",
	"unknown font, skipping request",
	"odd number of characters in request",

	/* related to plain text */
	"blank line in fill mode, using .sp",
	"tab in filled text",
	"whitespace at end of input line",
	"new sentence, new line",
	"bad comment style",
	"invalid escape sequence",
	"undefined string, using \"\"",

	/* related to tables */
	"tbl line starts with span",
	"tbl column starts with span",
	"skipping vertical bar in tbl layout",

	"generic error",

	/* related to tables */
	"non-alphabetic character in tbl options",
	"skipping unknown tbl option",
	"missing tbl option argument",
	"wrong tbl option argument size",
	"empty tbl layout",
	"invalid character in tbl layout",
	"unmatched parenthesis in tbl layout",
	"tbl without any data cells",
	"ignoring data in spanned tbl cell",
	"ignoring extra tbl data cells",
	"data block open at end of tbl",

	/* related to document structure and macros */
	NULL,
	"input stack limit exceeded, infinite loop?",
	"skipping bad character",
	"skipping unknown macro",
	"skipping insecure request",
	"skipping item outside list",
	"skipping column outside column list",
	"skipping end of block that is not open",
	"fewer RS blocks open, skipping",
	"inserting missing end of block",
	"appending missing end of block",

	/* related to request and macro arguments */
	"escaped character not allowed in a name",
	"NOT IMPLEMENTED: Bd -file",
	"skipping display without arguments",
	"missing list type, using -item",
	"missing manual name, using \"\"",
	"uname(3) system call failed, using UNKNOWN",
	"unknown standard specifier",
	"skipping request without numeric argument",
	"NOT IMPLEMENTED: .so with absolute path or \"..\"",
	".so request failed",
	"skipping all arguments",
	"skipping excess arguments",
	"divide by zero",

	"unsupported feature",
	"input too large",
	"unsupported control character",
	"unsupported roff request",
	"eqn delim option in tbl",
	"unsupported tbl layout modifier",
	"ignoring macro in table",
};
240 
/*
 * Printable names of the message levels, indexed by
 * enum mandoclevel and returned by mparse_strlevel().
 */
static	const char * const	mandoclevels[MANDOCLEVEL_MAX] = {
	"SUCCESS",
	"RESERVED",
	"WARNING",
	"ERROR",
	"UNSUPP",
	"BADARG",
	"SYSERR"
};
250 
251 
252 static void
253 resize_buf(struct buf *buf, size_t initial)
254 {
255 
256 	buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
257 	buf->buf = mandoc_realloc(buf->buf, buf->sz);
258 }
259 
260 static void
261 choose_parser(struct mparse *curp)
262 {
263 	char		*cp, *ep;
264 	int		 format;
265 
266 	/*
267 	 * If neither command line arguments -mdoc or -man select
268 	 * a parser nor the roff parser found a .Dd or .TH macro
269 	 * yet, look ahead in the main input buffer.
270 	 */
271 
272 	if ((format = roff_getformat(curp->roff)) == 0) {
273 		cp = curp->primary->buf;
274 		ep = cp + curp->primary->sz;
275 		while (cp < ep) {
276 			if (*cp == '.' || *cp == '\'') {
277 				cp++;
278 				if (cp[0] == 'D' && cp[1] == 'd') {
279 					format = MPARSE_MDOC;
280 					break;
281 				}
282 				if (cp[0] == 'T' && cp[1] == 'H') {
283 					format = MPARSE_MAN;
284 					break;
285 				}
286 			}
287 			cp = memchr(cp, '\n', ep - cp);
288 			if (cp == NULL)
289 				break;
290 			cp++;
291 		}
292 	}
293 
294 	if (format == MPARSE_MDOC) {
295 		mdoc_hash_init();
296 		curp->man->macroset = MACROSET_MDOC;
297 		curp->man->first->tok = TOKEN_NONE;
298 	} else {
299 		man_hash_init();
300 		curp->man->macroset = MACROSET_MAN;
301 		curp->man->first->tok = TOKEN_NONE;
302 	}
303 }
304 
/*
 * Main parse routine for a buffer.
 * It assumes encoding and line numbering are already set up.
 * It can recurse directly (for invocations of user-defined
 * macros, inline equations, and input line traps)
 * and indirectly (for .so file inclusion).
 *
 * Arguments:
 *   curp   parser state; curp->line supplies the starting line number
 *   blk    input buffer, passed by value
 *   i      byte offset in blk at which to start reading
 *   start  non-zero when called by mparse_parse_buffer() on the
 *          buffer of an input file; zero for the recursive calls
 *          on temporary NUL-terminated buffers made below
 *
 * Each iteration of the outer loop assembles one logical input line
 * in the local buffer "ln", hands it to roff_parseln(), and acts on
 * the result code.
 */
static void
mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
{
	const struct tbl_span	*span;
	struct buf	 ln;
	const char	*save_file;
	char		*cp;
	size_t		 pos; /* byte number in the ln buffer */
	size_t		 j;  /* auxiliary byte number in the blk buffer */
	enum rofferr	 rr;
	int		 of;
	int		 lnn; /* line number in the real file */
	int		 fd;
	unsigned char	 c;

	memset(&ln, 0, sizeof(ln));

	lnn = curp->line;
	pos = 0;

	while (i < blk.sz) {
		/* A NUL byte at the start of a line ends the input. */
		if (0 == pos && '\0' == blk.buf[i])
			break;

		if (start) {
			/*
			 * At the start of each line of a real input file,
			 * publish the line number and reset the reparse
			 * counter.  While still below line 3 and while
			 * both encodings are enabled in filenc, let
			 * preconv_cue() set filenc.
			 */
			curp->line = lnn;
			curp->reparse_count = 0;

			if (lnn < 3 &&
			    curp->filenc & MPARSE_UTF8 &&
			    curp->filenc & MPARSE_LATIN1)
				curp->filenc = preconv_cue(&blk, i);
		}

		/*
		 * Copy one logical line from blk into ln.
		 * Temporary buffers (start == 0) also end at a NUL byte.
		 */
		while (i < blk.sz && (start || blk.buf[i] != '\0')) {

			/*
			 * When finding an unescaped newline character,
			 * leave the character loop to process the line.
			 * Skip a preceding carriage return, if any.
			 */

			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
			    '\n' == blk.buf[i + 1])
				++i;
			if ('\n' == blk.buf[i]) {
				++i;
				++lnn;
				break;
			}

			/*
			 * Make sure we have space for the worst
			 * case of 11 bytes: "\\[u10ffff]\0"
			 */

			if (pos + 11 > ln.sz)
				resize_buf(&ln, 256);

			/*
			 * Encode 8-bit input.
			 */

			c = blk.buf[i];
			if (c & 0x80) {
				/*
				 * If no encoding is enabled or
				 * preconv_encode() returns zero, report
				 * the byte and substitute '?'.
				 */
				if ( ! (curp->filenc && preconv_encode(
				    &blk, &i, &ln, &pos, &curp->filenc))) {
					mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
					    curp->line, pos, "0x%x", c);
					ln.buf[pos++] = '?';
					i++;
				}
				continue;
			}

			/*
			 * Exclude control characters.
			 */

			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
				mandoc_vmsg(c == 0x00 || c == 0x04 ||
				    c > 0x0a ? MANDOCERR_CHAR_BAD :
				    MANDOCERR_CHAR_UNSUPP,
				    curp, curp->line, pos, "0x%x", c);
				i++;
				/* A stray carriage return is dropped. */
				if (c != '\r')
					ln.buf[pos++] = '?';
				continue;
			}

			/* Trailing backslash = a plain char. */

			if (blk.buf[i] != '\\' || i + 1 == blk.sz) {
				ln.buf[pos++] = blk.buf[i++];
				continue;
			}

			/*
			 * Found escape and at least one other character.
			 * When it's a newline character, skip it.
			 * When there is a carriage return in between,
			 * skip that one as well.
			 */

			if ('\r' == blk.buf[i + 1] && i + 2 < blk.sz &&
			    '\n' == blk.buf[i + 2])
				++i;
			if ('\n' == blk.buf[i + 1]) {
				i += 2;
				++lnn;
				continue;
			}

			if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
				/* Remember where the comment started. */
				j = i;
				i += 2;
				/* Comment, skip to end of line */
				for (; i < blk.sz; ++i) {
					if (blk.buf[i] != '\n')
						continue;
					if (blk.buf[i - 1] == ' ' ||
					    blk.buf[i - 1] == '\t')
						mandoc_msg(
						    MANDOCERR_SPACE_EOL,
						    curp, curp->line,
						    pos + i-1 - j, NULL);
					++i;
					++lnn;
					break;
				}

				/* Backout trailing whitespaces */
				for (; pos > 0; --pos) {
					if (ln.buf[pos - 1] != ' ')
						break;
					if (pos > 2 && ln.buf[pos - 2] == '\\')
						break;
				}
				break;
			}

			/* Catch escaped bogus characters. */

			c = (unsigned char) blk.buf[i+1];

			if ( ! (isascii(c) &&
			    (isgraph(c) || isblank(c)))) {
				mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
				    curp->line, pos, "0x%x", c);
				i += 2;
				ln.buf[pos++] = '?';
				continue;
			}

			/* Some other escape sequence, copy & cont. */

			ln.buf[pos++] = blk.buf[i++];
			ln.buf[pos++] = blk.buf[i++];
		}

		/* Make room for, and write, the terminating NUL. */

		if (pos >= ln.sz)
			resize_buf(&ln, 256);

		ln.buf[pos] = '\0';

		/*
		 * A significant amount of complexity is contained by
		 * the roff preprocessor.  It's line-oriented but can be
		 * expressed on one line, so we need at times to
		 * readjust our starting point and re-run it.  The roff
		 * preprocessor can also readjust the buffers with new
		 * data, so we pass them in wholesale.
		 */

		of = 0;

		/*
		 * Maintain a lookaside buffer of all parsed lines.  We
		 * only do this if mparse_keep() has been invoked (the
		 * buffer may be accessed with mparse_getkeep()).
		 */

		if (curp->secondary) {
			/* Two extra bytes: the newline and the NUL. */
			curp->secondary->buf = mandoc_realloc(
			    curp->secondary->buf,
			    curp->secondary->sz + pos + 2);
			memcpy(curp->secondary->buf +
			    curp->secondary->sz,
			    ln.buf, pos);
			curp->secondary->sz += pos;
			curp->secondary->buf
				[curp->secondary->sz] = '\n';
			curp->secondary->sz++;
			curp->secondary->buf
				[curp->secondary->sz] = '\0';
		}
rerun:
		rr = roff_parseln(curp->roff, curp->line, &ln, &of);

		switch (rr) {
		case ROFF_REPARSE:
			/*
			 * Parse the line buffer again through a recursive
			 * call, at most REPARSE_LIMIT times per input line.
			 */
			if (REPARSE_LIMIT >= ++curp->reparse_count)
				mparse_buf_r(curp, ln, of, 0);
			else
				mandoc_msg(MANDOCERR_ROFFLOOP, curp,
				    curp->line, pos, NULL);
			pos = 0;
			continue;
		case ROFF_APPEND:
			/* Keep ln and append the next input line to it. */
			pos = strlen(ln.buf);
			continue;
		case ROFF_RERUN:
			goto rerun;
		case ROFF_IGN:
			/* Discard the line and start a fresh one. */
			pos = 0;
			continue;
		case ROFF_SO:
			/*
			 * Without MPARSE_SO, a request that is the last
			 * thing in the input is not followed: the target
			 * is only recorded in sodest and parsing stops.
			 */
			if ( ! (curp->options & MPARSE_SO) &&
			    (i >= blk.sz || blk.buf[i] == '\0')) {
				curp->sodest = mandoc_strdup(ln.buf + of);
				free(ln.buf);
				return;
			}
			/*
			 * We remove `so' clauses from our lookaside
			 * buffer because we're going to descend into
			 * the file recursively.
			 */
			if (curp->secondary)
				curp->secondary->sz -= pos + 1;
			/* mparse_open() overwrites curp->file. */
			save_file = curp->file;
			if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
				mparse_readfd(curp, fd, ln.buf + of);
				close(fd);
				curp->file = save_file;
			} else {
				/*
				 * The file cannot be opened: report it
				 * and parse a short replacement text
				 * instead.
				 */
				curp->file = save_file;
				mandoc_vmsg(MANDOCERR_SO_FAIL,
				    curp, curp->line, pos,
				    ".so %s", ln.buf + of);
				ln.sz = mandoc_asprintf(&cp,
				    ".sp\nSee the file %s.\n.sp",
				    ln.buf + of);
				free(ln.buf);
				ln.buf = cp;
				of = 0;
				mparse_buf_r(curp, ln, of, 0);
			}
			pos = 0;
			continue;
		default:
			break;
		}

		/* Select mdoc or man before the first line is passed on. */

		if (curp->man->macroset == MACROSET_NONE)
			choose_parser(curp);

		/*
		 * Lastly, push down into the parsers themselves.
		 * If libroff returns ROFF_TBL, then add it to the
		 * currently open parse.  Since we only get here if
		 * there does exist data (see tbl_data.c), we're
		 * guaranteed that something's been allocated.
		 * Do the same for ROFF_EQN.
		 */

		if (rr == ROFF_TBL)
			while ((span = roff_span(curp->roff)) != NULL)
				roff_addtbl(curp->man, span);
		else if (rr == ROFF_EQN)
			roff_addeqn(curp->man, roff_eqn(curp->roff));
		else if ((curp->man->macroset == MACROSET_MDOC ?
		    mdoc_parseln(curp->man, curp->line, ln.buf, of) :
		    man_parseln(curp->man, curp->line, ln.buf, of)) == 2)
				break;	/* a return value of 2 ends the parse */

		/* Temporary buffers typically are not full. */

		if (0 == start && '\0' == blk.buf[i])
			break;

		/* Start the next input line. */

		pos = 0;
	}

	free(ln.buf);
}
600 
/*
 * Load the complete contents of an open file descriptor into fb.
 *
 * Regular files that are not gzipped are mapped with mmap();
 * everything else, and any file for which mmap() fails, is read in
 * chunks into a heap buffer, through zlib when curp->gzip is set.
 *
 * On success, returns 1 with fb->buf and fb->sz describing the data
 * and *with_mmap telling the caller whether to release it with
 * munmap() (1) or free() (0).
 * Returns 0 after reporting MANDOCERR_TOOLARGE if the input is too
 * large; fb->buf is NULL in that case when the read loop was used.
 * System call failures terminate the program via err().
 *
 * The descriptor fd itself is never closed here.
 */
static int
read_whole_file(struct mparse *curp, const char *file, int fd,
		struct buf *fb, int *with_mmap)
{
	struct stat	 st;
	gzFile		 gz;
	size_t		 off;
	ssize_t		 ssz;
	int		 gzfd;
	int		 rc;

	if (fstat(fd, &st) == -1)
		err((int)MANDOCLEVEL_SYSERR, "%s", file);

	/*
	 * If we're a regular file, try just reading in the whole entry
	 * via mmap().  This is faster than reading it into blocks, and
	 * since each file is only a few bytes to begin with, I'm not
	 * concerned that this is going to tank any machines.
	 */

	if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
		if (st.st_size > 0x7fffffff) {
			mandoc_msg(MANDOCERR_TOOLARGE, curp, 0, 0, NULL);
			return 0;
		}
		*with_mmap = 1;
		fb->sz = (size_t)st.st_size;
		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
		if (fb->buf != MAP_FAILED)
			return 1;
	}

	if (curp->gzip) {
		/*
		 * gzclose() is required to release the memory zlib
		 * allocates for the stream, but it also closes the
		 * underlying descriptor, which belongs to the caller.
		 * Hand zlib a duplicate instead.
		 */
		if ((gzfd = dup(fd)) == -1)
			err((int)MANDOCLEVEL_SYSERR, "%s", file);
		if ((gz = gzdopen(gzfd, "rb")) == NULL)
			err((int)MANDOCLEVEL_SYSERR, "%s", file);
	} else
		gz = NULL;

	/*
	 * If this isn't a regular file (like, say, stdin), then we must
	 * go the old way and just read things in bit by bit.
	 */

	*with_mmap = 0;
	rc = 0;
	off = 0;
	fb->sz = 0;
	fb->buf = NULL;
	for (;;) {
		if (off == fb->sz) {
			/* Refuse to grow beyond 2 GiB. */
			if (fb->sz == (1U << 31)) {
				mandoc_msg(MANDOCERR_TOOLARGE, curp,
				    0, 0, NULL);
				break;
			}
			resize_buf(fb, 65536);
		}
		ssz = gz != NULL ?
		    gzread(gz, fb->buf + off, fb->sz - off) :
		    read(fd, fb->buf + off, fb->sz - off);
		if (ssz == 0) {
			/* End of input: trim the size to what was read. */
			fb->sz = off;
			rc = 1;
			break;
		}
		if (ssz == -1)
			err((int)MANDOCLEVEL_SYSERR, "%s", file);
		off += (size_t)ssz;
	}

	/* Release zlib's state and the duplicated descriptor. */

	if (gz != NULL)
		(void)gzclose(gz);

	if (rc == 0) {
		free(fb->buf);
		fb->buf = NULL;
	}
	return rc;
}
672 
673 static void
674 mparse_end(struct mparse *curp)
675 {
676 	if (curp->man->macroset == MACROSET_NONE)
677 		curp->man->macroset = MACROSET_MAN;
678 	if (curp->man->macroset == MACROSET_MDOC)
679 		mdoc_endparse(curp->man);
680 	else
681 		man_endparse(curp->man);
682 	roff_endparse(curp->roff);
683 }
684 
685 static void
686 mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
687 {
688 	struct buf	*svprimary;
689 	const char	*svfile;
690 	size_t		 offset;
691 	static int	 recursion_depth;
692 
693 	if (64 < recursion_depth) {
694 		mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, 0, NULL);
695 		return;
696 	}
697 
698 	/* Line number is per-file. */
699 	svfile = curp->file;
700 	curp->file = file;
701 	svprimary = curp->primary;
702 	curp->primary = &blk;
703 	curp->line = 1;
704 	recursion_depth++;
705 
706 	/* Skip an UTF-8 byte order mark. */
707 	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
708 	    (unsigned char)blk.buf[0] == 0xef &&
709 	    (unsigned char)blk.buf[1] == 0xbb &&
710 	    (unsigned char)blk.buf[2] == 0xbf) {
711 		offset = 3;
712 		curp->filenc &= ~MPARSE_LATIN1;
713 	} else
714 		offset = 0;
715 
716 	mparse_buf_r(curp, blk, offset, 1);
717 
718 	if (--recursion_depth == 0)
719 		mparse_end(curp);
720 
721 	curp->primary = svprimary;
722 	curp->file = svfile;
723 }
724 
725 enum mandoclevel
726 mparse_readmem(struct mparse *curp, void *buf, size_t len,
727 		const char *file)
728 {
729 	struct buf blk;
730 
731 	blk.buf = buf;
732 	blk.sz = len;
733 
734 	mparse_parse_buffer(curp, blk, file);
735 	return curp->file_status;
736 }
737 
738 /*
739  * Read the whole file into memory and call the parsers.
740  * Called recursively when an .so request is encountered.
741  */
742 enum mandoclevel
743 mparse_readfd(struct mparse *curp, int fd, const char *file)
744 {
745 	struct buf	 blk;
746 	int		 with_mmap;
747 	int		 save_filenc;
748 
749 	if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
750 		save_filenc = curp->filenc;
751 		curp->filenc = curp->options &
752 		    (MPARSE_UTF8 | MPARSE_LATIN1);
753 		mparse_parse_buffer(curp, blk, file);
754 		curp->filenc = save_filenc;
755 		if (with_mmap)
756 			munmap(blk.buf, blk.sz);
757 		else
758 			free(blk.buf);
759 	}
760 	return curp->file_status;
761 }
762 
763 int
764 mparse_open(struct mparse *curp, const char *file)
765 {
766 	char		 *cp;
767 	int		  fd;
768 
769 	curp->file = file;
770 	cp = strrchr(file, '.');
771 	curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
772 
773 	/* First try to use the filename as it is. */
774 
775 	if ((fd = open(file, O_RDONLY)) != -1)
776 		return fd;
777 
778 	/*
779 	 * If that doesn't work and the filename doesn't
780 	 * already  end in .gz, try appending .gz.
781 	 */
782 
783 	if ( ! curp->gzip) {
784 		mandoc_asprintf(&cp, "%s.gz", file);
785 		fd = open(cp, O_RDONLY);
786 		free(cp);
787 		if (fd != -1) {
788 			curp->gzip = 1;
789 			return fd;
790 		}
791 	}
792 
793 	/* Neither worked, give up. */
794 
795 	mandoc_msg(MANDOCERR_FILE, curp, 0, 0, strerror(errno));
796 	return -1;
797 }
798 
799 struct mparse *
800 mparse_alloc(int options, enum mandoclevel wlevel, mandocmsg mmsg,
801     const char *defos)
802 {
803 	struct mparse	*curp;
804 
805 	curp = mandoc_calloc(1, sizeof(struct mparse));
806 
807 	curp->options = options;
808 	curp->wlevel = wlevel;
809 	curp->mmsg = mmsg;
810 	curp->defos = defos;
811 
812 	curp->roff = roff_alloc(curp, options);
813 	curp->man = roff_man_alloc( curp->roff, curp, curp->defos,
814 		curp->options & MPARSE_QUICK ? 1 : 0);
815 	if (curp->options & MPARSE_MDOC) {
816 		mdoc_hash_init();
817 		curp->man->macroset = MACROSET_MDOC;
818 	} else if (curp->options & MPARSE_MAN) {
819 		man_hash_init();
820 		curp->man->macroset = MACROSET_MAN;
821 	}
822 	curp->man->first->tok = TOKEN_NONE;
823 	return curp;
824 }
825 
826 void
827 mparse_reset(struct mparse *curp)
828 {
829 	roff_reset(curp->roff);
830 	roff_man_reset(curp->man);
831 
832 	free(curp->sodest);
833 	curp->sodest = NULL;
834 
835 	if (curp->secondary)
836 		curp->secondary->sz = 0;
837 
838 	curp->file_status = MANDOCLEVEL_OK;
839 	curp->gzip = 0;
840 }
841 
842 void
843 mparse_free(struct mparse *curp)
844 {
845 
846 	roff_man_free(curp->man);
847 	roff_free(curp->roff);
848 	if (curp->secondary)
849 		free(curp->secondary->buf);
850 
851 	free(curp->secondary);
852 	free(curp->sodest);
853 	free(curp);
854 }
855 
856 void
857 mparse_result(struct mparse *curp, struct roff_man **man,
858 	char **sodest)
859 {
860 
861 	if (sodest && NULL != (*sodest = curp->sodest)) {
862 		*man = NULL;
863 		return;
864 	}
865 	if (man)
866 		*man = curp->man;
867 }
868 
869 void
870 mparse_updaterc(struct mparse *curp, enum mandoclevel *rc)
871 {
872 	if (curp->file_status > *rc)
873 		*rc = curp->file_status;
874 }
875 
876 void
877 mandoc_vmsg(enum mandocerr t, struct mparse *m,
878 		int ln, int pos, const char *fmt, ...)
879 {
880 	char		 buf[256];
881 	va_list		 ap;
882 
883 	va_start(ap, fmt);
884 	(void)vsnprintf(buf, sizeof(buf), fmt, ap);
885 	va_end(ap);
886 
887 	mandoc_msg(t, m, ln, pos, buf);
888 }
889 
890 void
891 mandoc_msg(enum mandocerr er, struct mparse *m,
892 		int ln, int col, const char *msg)
893 {
894 	enum mandoclevel level;
895 
896 	level = MANDOCLEVEL_UNSUPP;
897 	while (er < mandoclimits[level])
898 		level--;
899 
900 	if (level < m->wlevel && er != MANDOCERR_FILE)
901 		return;
902 
903 	if (m->mmsg)
904 		(*m->mmsg)(er, level, m->file, ln, col, msg);
905 
906 	if (m->file_status < level)
907 		m->file_status = level;
908 }
909 
910 const char *
911 mparse_strerror(enum mandocerr er)
912 {
913 
914 	return mandocerrs[er];
915 }
916 
917 const char *
918 mparse_strlevel(enum mandoclevel lvl)
919 {
920 	return mandoclevels[lvl];
921 }
922 
923 void
924 mparse_keep(struct mparse *p)
925 {
926 
927 	assert(NULL == p->secondary);
928 	p->secondary = mandoc_calloc(1, sizeof(struct buf));
929 }
930 
931 const char *
932 mparse_getkeep(const struct mparse *p)
933 {
934 
935 	assert(p->secondary);
936 	return p->secondary->sz ? p->secondary->buf : NULL;
937 }
938