1 /*	$Id: util.c,v 1.15 2015/02/25 14:49:14 kristaps Exp $ */
2 /*
3  * Copyright (c) 2015 Kristaps Dzonsons <kristaps@bsd.lv>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 #include <sys/mman.h>
18 #include <sys/stat.h>
19 
20 #include <assert.h>
21 #include <ctype.h>
22 #include <fcntl.h>
23 #include <getopt.h>
24 #include <libgen.h>
25 #include <limits.h>
26 #include <stdarg.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <time.h>
31 #include <unistd.h>
32 
33 #include "extern.h"
34 
35 /*
36  * Unmap the top-most file in the stack of files currently opened (that
37  * is, nested calls to parsefile()).
38  */
39 void
texifilepop(struct texi * p)40 texifilepop(struct texi *p)
41 {
42 	struct texifile	*f;
43 
44 	assert(p->filepos > 0);
45 	f = &p->files[--p->filepos];
46 	free(f->map);
47 }
48 
49 static void
teximacrofree(struct teximacro * p)50 teximacrofree(struct teximacro *p)
51 {
52 	size_t	 i;
53 
54 	for (i = 0; i < p->argsz; i++)
55 		free(p->args[i]);
56 
57 	free(p->args);
58 	free(p->key);
59 	free(p->value);
60 }
61 
62 static void
texivaluefree(struct texivalue * p)63 texivaluefree(struct texivalue *p)
64 {
65 
66 	free(p->key);
67 	free(p->value);
68 }
69 
70 /*
71  * Unmap all files that we're currently using and free all resources
72  * that we've allocated during the parse.
73  * The utility should exit(...) after this is called.
74  */
75 void
texiexit(struct texi * p)76 texiexit(struct texi *p)
77 {
78 	size_t	 i;
79 
80 	/* Make sure we're newline-terminated. */
81 	if (p->outcol)
82 		putchar('\n');
83 
84 	/* Unmap all files. */
85 	while (p->filepos > 0)
86 		texifilepop(p);
87 
88 	for (i = 0; i < p->macrosz; i++)
89 		teximacrofree(&p->macros[i]);
90 	for (i = 0; i < p->dirsz; i++)
91 		free(p->dirs[i]);
92 	for (i = 0; i < p->indexsz; i++)
93 		free(p->indexs[i]);
94 	for (i = 0; i < p->valsz; i++)
95 		texivaluefree(&p->vals[i]);
96 
97 	free(p->macros);
98 	free(p->vals);
99 	free(p->indexs);
100 	free(p->dirs);
101 	free(p->subtitle);
102 	free(p->title);
103 }
104 
/*
 * Fatal system error.
 * Report "errstring" via perror(3) (so errno must still describe the
 * failure), release all parser resources with texiexit(), and
 * terminate the process with EXIT_FAILURE.
 * This function does not return.
 */
void
texiabort(struct texi *p, const char *errstring)
{
	perror(errstring);
	texiexit(p);
	exit(EXIT_FAILURE);
}
117 
118 /*
119  * Print a generic warning message (to stderr) tied to our current
120  * location in the parse sequence.
121  */
122 void
texiwarn(const struct texi * p,const char * fmt,...)123 texiwarn(const struct texi *p, const char *fmt, ...)
124 {
125 	va_list	 	 	 ap;
126 	const struct texifile	*f;
127 
128 	f = &p->files[p->filepos - 1];
129 
130 	if (f->insplice)
131 		fprintf(stderr, "%s:%zu:%zu (%zuB left in splice): "
132 			"warning: ", f->name, f->line + 1,
133 			f->col + 1, f->insplice);
134 	else
135 		fprintf(stderr, "%s:%zu:%zu: warning: ",
136 			f->name, f->line + 1, f->col + 1);
137 
138 	va_start(ap, fmt);
139 	vfprintf(stderr, fmt, ap);
140 	va_end(ap);
141 	fputc('\n', stderr);
142 }
143 
144 /*
145  * Print an error message (to stderr) tied to our current location in
146  * the parse sequence, invoke texiexit(), then die.
147  */
148 void
texierr(struct texi * p,const char * fmt,...)149 texierr(struct texi *p, const char *fmt, ...)
150 {
151 	va_list	 	 ap;
152 	struct texifile	*f;
153 
154 	f = &p->files[p->filepos - 1];
155 
156 	if (f->insplice)
157 		fprintf(stderr, "%s:%zu:%zu: (%zuB left in splice): "
158 			"error: ", f->name, f->line + 1,
159 			f->col + 1, f->insplice);
160 	else
161 		fprintf(stderr, "%s:%zu:%zu: error: ",
162 			f->name, f->line + 1, f->col + 1);
163 
164 	va_start(ap, fmt);
165 	vfprintf(stderr, fmt, ap);
166 	va_end(ap);
167 	fputc('\n', stderr);
168 	texiexit(p);
169 	exit(EXIT_FAILURE);
170 }
171 
172 /*
173  * Put a single data character to the output if we're not ignoring.
174  * Escape starting a line with a control character and slashes.
175  */
176 void
texiputchar(struct texi * p,char c)177 texiputchar(struct texi *p, char c)
178 {
179 
180 	if (p->ign)
181 		return;
182 	if ('.' == c && 0 == p->outcol)
183 		fputs("\\&", stdout);
184 	if ('\'' == c && 0 == p->outcol)
185 		fputs("\\&", stdout);
186 
187 	putchar(c);
188 	if ('\\' == c)
189 		putchar('e');
190 	p->seenvs = 0;
191 	if ('\n' == c) {
192 		p->outcol = 0;
193 		p->seenws = 0;
194 	} else
195 		p->outcol++;
196 }
197 
198 /*
199  * Put an opaque series of characters.
200  * Characters starting a line with a control character are escaped, but
201  * that's it, so don't use this for non-controlled sequences of text.
202  */
203 void
texiputchars(struct texi * p,const char * s)204 texiputchars(struct texi *p, const char *s)
205 {
206 
207 	if (p->ign)
208 		return;
209 	if ('.' == *s && 0 == p->outcol)
210 		fputs("\\&", stdout);
211 	if ('\'' == *s && 0 == p->outcol)
212 		fputs("\\&", stdout);
213 	p->outcol += fputs(s, stdout);
214 	p->seenvs = 0;
215 }
216 
217 /*
218  * This puts all characters onto the output stream but makes sure to
219  * escape mdoc(7) slashes.
220  * FIXME: useless.
221  */
222 void
texiputbuf(struct texi * p,size_t start,size_t end)223 texiputbuf(struct texi *p, size_t start, size_t end)
224 {
225 
226 	for ( ; start < end; start++)
227 		texiputchar(p, BUF(p)[start]);
228 }
229 
230 /*
231  * Close an mdoc(7) macro opened with teximacroopen().
232  * If there are no more macros on the line, prints a newline.
233  */
234 void
teximacroclose(struct texi * p)235 teximacroclose(struct texi *p)
236 {
237 
238 	if (p->ign)
239 		return;
240 
241 	if (0 == --p->outmacro) {
242 		putchar('\n');
243 		p->outcol = p->seenws = 0;
244 	}
245 }
246 
247 /*
248  * Open a mdoc(7) macro.
249  * This is used for line macros, e.g., Qq [foo bar baz].
250  * It can be invoked for nested macros, e.g., Qq Li foo .
251  * TODO: flush-right punctuation (e.g., parenthesis).
252  */
253 void
teximacroopen(struct texi * p,const char * s)254 teximacroopen(struct texi *p, const char *s)
255 {
256 	int	 rc;
257 
258 	if (p->ign)
259 		return;
260 
261 	if (p->outcol && 0 == p->outmacro) {
262 		putchar('\n');
263 		p->outcol = 0;
264 	}
265 
266 	if (0 == p->outmacro)
267 		putchar('.');
268 	else
269 		putchar(' ');
270 
271 	if (EOF != (rc = fputs(s, stdout)))
272 		p->outcol += rc;
273 
274 	putchar(' ');
275 	p->outcol++;
276 	p->outmacro++;
277 	p->seenws = 0;
278 }
279 
280 /*
281  * Put a stadnalone mdoc(7) command with the trailing newline.
282  */
283 void
teximacro(struct texi * p,const char * s)284 teximacro(struct texi *p, const char *s)
285 {
286 
287 	if (p->ign)
288 		return;
289 
290 	if (p->outmacro)
291 		texierr(p, "\"%s\" in open line scope!?", s);
292 	if (p->literal)
293 		texierr(p, "\"%s\" in a literal scope!?", s);
294 
295 	if (p->outcol)
296 		putchar('\n');
297 
298 	putchar('.');
299 	puts(s);
300 	p->outcol = p->seenws = 0;
301 }
302 
303 /*
304  * Introduce vertical space during normal (non-macro) input.
305  */
306 void
texivspace(struct texi * p)307 texivspace(struct texi *p)
308 {
309 
310 	if (p->seenvs || TEXILIST_TABLE == p->list)
311 		return;
312 	teximacro(p, "Pp");
313 	p->seenvs = 1;
314 }
315 
316 /*
317  * Advance by a single byte in the input stream, adjusting our location
318  * in the current input file.
319  */
320 void
advance(struct texi * p,size_t * pos)321 advance(struct texi *p, size_t *pos)
322 {
323 	struct texifile	*f;
324 
325 	f = &p->files[p->filepos - 1];
326 
327 	if (0 == f->insplice) {
328 		if ('\n' == BUF(p)[*pos]) {
329 			f->line++;
330 			f->col = 0;
331 		} else
332 			f->col++;
333 	} else
334 		--f->insplice;
335 
336 	(*pos)++;
337 }
338 
339 /*
340  * It's common to wait punctuation to float on the right side of macro
341  * lines in mdoc(7), e.g., ".Em hello ) ."
342  * This function does so, and should be called before teximacroclose().
343  * It will detect that it's the last in the nested macros and
344  * appropriately flush-left punctuation alongside the macro.
345  */
346 void
texipunctuate(struct texi * p,size_t * pos)347 texipunctuate(struct texi *p, size_t *pos)
348 {
349 	size_t	 start, end;
350 
351 	if (1 != p->outmacro)
352 		return;
353 
354 	for (start = end = *pos; end < BUFSZ(p); end++) {
355 		switch (BUF(p)[end]) {
356 		case (','):
357 		case (')'):
358 		case ('.'):
359 		case ('"'):
360 		case (':'):
361 		case ('!'):
362 		case ('?'):
363 			continue;
364 		default:
365 			break;
366 		}
367 		break;
368 	}
369 	if (end == *pos)
370 		return;
371 	if (end + 1 == BUFSZ(p) || ' ' == BUF(p)[end] ||
372 		'\n' == BUF(p)[end]) {
373 		for ( ; start < end; start++) {
374 			texiputchar(p, ' ');
375 			texiputchar(p, BUF(p)[start]);
376 			advance(p, pos);
377 		}
378 	}
379 }
380 
381 /*
382  * Advance to the next non-whitespace word in the input stream.
383  * If we're in literal mode, then print all of the whitespace as we're
384  * doing so.
385  */
386 static size_t
advancenext(struct texi * p,size_t * pos)387 advancenext(struct texi *p, size_t *pos)
388 {
389 
390 	if (p->literal) {
391 		while (*pos < BUFSZ(p) && ismspace(BUF(p)[*pos])) {
392 			texiputchar(p, BUF(p)[*pos]);
393 			advance(p, pos);
394 		}
395 		return(*pos);
396 	}
397 
398 	while (*pos < BUFSZ(p) && ismspace(BUF(p)[*pos])) {
399 		p->seenws = 1;
400 		/*
401 		 * If it looks like we've printed a double-line, then
402 		 * output a paragraph.
403 		 * FIXME: this is stupid.
404 		 */
405 		if (*pos && '\n' == BUF(p)[*pos] && '\n' == BUF(p)[*pos - 1])
406 			texivspace(p);
407 		advance(p, pos);
408 	}
409 	return(*pos);
410 }
411 
412 /*
413  * Advance to the EOLN in the input stream.
414  * NOTE: THIS SHOULD NOT BE CALLED ON BLANK TEXT, as it will read up to
415  * the @\n.
416  */
417 size_t
advanceeoln(struct texi * p,size_t * pos,int consumenl)418 advanceeoln(struct texi *p, size_t *pos, int consumenl)
419 {
420 
421 	while (*pos < BUFSZ(p) && '\n' != BUF(p)[*pos])
422 		advance(p, pos);
423 	if (*pos < BUFSZ(p) && consumenl)
424 		advance(p, pos);
425 	return(*pos);
426 }
427 
/*
 * Advance byte by byte (so that location bookkeeping stays correct)
 * until "pos" reaches "end", an absolute position in the current
 * buffer that must not lie before the current position.
 */
void
advanceto(struct texi *p, size_t *pos, size_t end)
{
	assert(*pos <= end);

	for ( ; *pos < end; advance(p, pos))
		continue;
}
440 
441 static void
texiexecmacro(struct texi * p,struct teximacro * m,size_t * pos)442 texiexecmacro(struct texi *p, struct teximacro *m, size_t *pos)
443 {
444 	size_t		  valsz, realsz, aasz, asz,
445 			   ssz, i, j, k, start, end;
446 	char		 *val;
447 	char		**args;
448 	const char	 *cp;
449 
450 	args = argparse(p, pos, &asz, m->argsz);
451 	if (asz != m->argsz)
452 		texiwarn(p, "invalid macro argument length");
453 	aasz = asz < m->argsz ? asz : m->argsz;
454 
455 	if (0 == aasz) {
456 		texisplice(p, m->value, strlen(m->value), pos);
457 		return;
458 	}
459 
460 	valsz = realsz = strlen(m->value);
461 	val = strdup(m->value);
462 
463 	for (i = j = 0; i < realsz; i++) {
464 		/* Parse blindly til the backslash delimiter. */
465 		if ('\\' != m->value[i]) {
466 			val[j++] = m->value[i];
467 			val[j] = '\0';
468 			continue;
469 		} else if (i == realsz - 1)
470 			texierr(p, "trailing argument name delimiter");
471 
472 		/* Double-backslash is escaped. */
473 		if ('\\' == m->value[i + 1]) {
474 			val[j++] = m->value[i++];
475 			val[j] = '\0';
476 			continue;
477 		}
478 
479 		assert('\\' == m->value[i] && i < realsz - 1);
480 
481 		/* Parse to terminating delimiter. */
482 		/* FIXME: embedded, escaped delimiters? */
483 		for (start = end = i + 1; end < realsz; end++)
484 			if ('\\' == m->value[end])
485 				break;
486 		if (end == realsz)
487 			texierr(p, "unterminated argument name");
488 
489 		for (k = 0; k < aasz; k++) {
490 			if ((ssz = strlen(m->args[k])) != (end - start))
491 				continue;
492 			if (strncmp(&m->value[start], m->args[k], ssz))
493 				continue;
494 			break;
495 		}
496 
497 		/*
498 		 * Argument didn't exist in argument table.
499 		 * Just ignore it.
500 		 */
501 		if (k == aasz) {
502 			i = end;
503 			continue;
504 		}
505 
506 		if (strlen(args[k]) > ssz) {
507 			valsz += strlen(args[k]);
508 			val = realloc(val, valsz + 1);
509 			if (NULL == val)
510 				texiabort(p, NULL);
511 		}
512 
513 		for (cp = args[k]; '\0' != *cp; cp++)
514 			val[j++] = *cp;
515 
516 		val[j] = '\0';
517 		i = end;
518 	}
519 
520 	texisplice(p, val, strlen(val), pos);
521 
522 	for (i = 0; i < asz; i++)
523 		free(args[i]);
524 	free(args);
525 	free(val);
526 }
527 
528 /*
529  * Output a free-form word in the input stream, progressing to the next
530  * command or white-space.
531  * This also will advance the input stream.
532  */
533 static void
parseword(struct texi * p,size_t * pos,char extra)534 parseword(struct texi *p, size_t *pos, char extra)
535 {
536 
537 	if (p->seenws && 0 == p->outmacro &&
538 		 p->outcol > 72 && 0 == p->literal)
539 		texiputchar(p, '\n');
540 	/* FIXME: abstract this: we use it elsewhere. */
541 	if (p->seenws && p->outcol && 0 == p->literal)
542 		texiputchar(p, ' ');
543 
544 	p->seenws = 0;
545 
546 	while (*pos < BUFSZ(p) && ! ismspace(BUF(p)[*pos])) {
547 		switch (BUF(p)[*pos]) {
548 		case ('@'):
549 		case ('}'):
550 		case ('{'):
551 			return;
552 		}
553 		if ('\0' != extra && BUF(p)[*pos] == extra)
554 			return;
555 		if (*pos < BUFSZ(p) - 1 &&
556 			 '`' == BUF(p)[*pos] &&
557 			 '`' == BUF(p)[*pos + 1]) {
558 			texiputchars(p, "\\(lq");
559 			advance(p, pos);
560 		} else if (*pos < BUFSZ(p) - 1 &&
561 			 '\'' == BUF(p)[*pos] &&
562 			 '\'' == BUF(p)[*pos + 1]) {
563 			texiputchars(p, "\\(rq");
564 			advance(p, pos);
565 		} else
566 			texiputchar(p, BUF(p)[*pos]);
567 		advance(p, pos);
568 	}
569 }
570 
571 /*
572  * Look up the command at position "pos" in the buffer, returning it (or
573  * TEXICMD__MAX if none found) and setting "end" to be the absolute
574  * index after the command name.
575  */
576 enum texicmd
texicmd(struct texi * p,size_t pos,size_t * end,struct teximacro ** macro)577 texicmd(struct texi *p, size_t pos, size_t *end, struct teximacro **macro)
578 {
579 	size_t	 i, len, toksz;
580 
581 	assert('@' == BUF(p)[pos]);
582 
583 	if (NULL != macro)
584 		*macro = NULL;
585 
586 	if ((*end = pos) == BUFSZ(p))
587 		return(TEXICMD__MAX);
588 	else if ((*end = ++pos) == BUFSZ(p))
589 		return(TEXICMD__MAX);
590 
591 	/* Alphabetic commands are special. */
592 	if ( ! isalpha(BUF(p)[pos])) {
593 		if ((*end = pos + 1) == BUFSZ(p))
594 			return(TEXICMD__MAX);
595 		for (i = 0; i < TEXICMD__MAX; i++) {
596 			if (1 != texitoks[i].len)
597 				continue;
598 			if (0 == strncmp(texitoks[i].tok, &BUF(p)[pos], 1))
599 				return(i);
600 		}
601 		texiwarn(p, "bad command: @%c", BUF(p)[pos]);
602 		return(TEXICMD__MAX);
603 	}
604 
605 	/* Scan to the end of the possible command name. */
606 	for (*end = pos; *end < BUFSZ(p) && ! ismspace(BUF(p)[*end]); (*end)++)
607 		if ((*end > pos && ('@' == BUF(p)[*end] ||
608 			  '{' == BUF(p)[*end] || '}' == BUF(p)[*end])))
609 			break;
610 
611 	/* Look for the command. */
612 	len = *end - pos;
613 	for (i = 0; i < TEXICMD__MAX; i++) {
614 		if (len != texitoks[i].len)
615 			continue;
616 		if (0 == strncmp(texitoks[i].tok, &BUF(p)[pos], len))
617 			return(i);
618 	}
619 
620 	/* Look for it in our indices. */
621 	for (i = 0; i < p->indexsz; i++) {
622 		toksz = strlen(p->indexs[i]);
623 		if (len != 5 + toksz)
624 			continue;
625 		if (strncmp(&BUF(p)[pos], p->indexs[i], toksz))
626 			continue;
627 		if (0 == strncmp(&BUF(p)[pos + toksz], "index", 5))
628 			return(TEXICMD_USER_INDEX);
629 	}
630 
631 	for (i = 0; i < p->macrosz; i++) {
632 		if (len != strlen(p->macros[i].key))
633 			continue;
634 		if (strncmp(&BUF(p)[pos], p->macros[i].key, len))
635 			continue;
636 		if (NULL != macro)
637 			*macro = &p->macros[i];
638 		return(TEXICMD__MAX);
639 	}
640 
641 	texiwarn(p, "bad command: @%.*s", (int)len, &BUF(p)[pos]);
642 	return(TEXICMD__MAX);
643 }
644 
645 /*
646  * Parse an argument from a bracketed command, e.g., @url{foo, baz}.
647  * Num should be set to the argument we're currently parsing, although
648  * it suffixes for it to be zero or non-zero.
649  * This will return 1 if there are more arguments, 0 otherwise.
650  * This will stop (returning 0) in the event of EOF or if we're not at a
651  * bracket for the zeroth parse.
652  */
653 int
parsearg(struct texi * p,size_t * pos,size_t num)654 parsearg(struct texi *p, size_t *pos, size_t num)
655 {
656 	size_t		  end;
657 	enum texicmd	  cmd;
658 	struct teximacro *macro;
659 
660 	while (*pos < BUFSZ(p) && ismspace(BUF(p)[*pos]))
661 		advance(p, pos);
662 	if (*pos == BUFSZ(p) || (0 == num && '{' != BUF(p)[*pos]))
663 		return(0);
664 	if (0 == num)
665 		advance(p, pos);
666 
667 	while ((*pos = advancenext(p, pos)) < BUFSZ(p)) {
668 		switch (BUF(p)[*pos]) {
669 		case (','):
670 			advance(p, pos);
671 			return(1);
672 		case ('}'):
673 			advance(p, pos);
674 			return(0);
675 		case ('{'):
676 			if (0 == p->ign)
677 				texiwarn(p, "unexpected \"{\"");
678 			advance(p, pos);
679 			continue;
680 		case ('@'):
681 			break;
682 		default:
683 			parseword(p, pos, ',');
684 			continue;
685 		}
686 
687 		cmd = texicmd(p, *pos, &end, &macro);
688 		advanceto(p, pos, end);
689 		if (NULL != macro)
690 			texiexecmacro(p, macro, pos);
691 		if (TEXICMD__MAX == cmd)
692 			continue;
693 		if (NULL != texitoks[cmd].fp)
694 			(*texitoks[cmd].fp)(p, cmd, pos);
695 	}
696 	return(0);
697 }
698 
699 /*
700  * Parse until the end of a bracketed statement, e.g., @foo{bar baz}.
701  * This will stop in the event of EOF or if we're not at a bracket.
702  */
703 void
parsebracket(struct texi * p,size_t * pos)704 parsebracket(struct texi *p, size_t *pos)
705 {
706 	size_t		  end;
707 	enum texicmd	  cmd;
708 	struct teximacro *macro;
709 
710 	while (*pos < BUFSZ(p) && ismspace(BUF(p)[*pos]))
711 		advance(p, pos);
712 
713 	if (*pos == BUFSZ(p) || '{' != BUF(p)[*pos])
714 		return;
715 	advance(p, pos);
716 
717 	while ((*pos = advancenext(p, pos)) < BUFSZ(p)) {
718 		switch (BUF(p)[*pos]) {
719 		case ('}'):
720 			advance(p, pos);
721 			return;
722 		case ('{'):
723 			if (0 == p->ign)
724 				texiwarn(p, "unexpected \"{\"");
725 			advance(p, pos);
726 			continue;
727 		case ('@'):
728 			break;
729 		default:
730 			parseword(p, pos, '\0');
731 			continue;
732 		}
733 
734 		cmd = texicmd(p, *pos, &end, &macro);
735 		advanceto(p, pos, end);
736 		if (NULL != macro)
737 			texiexecmacro(p, macro, pos);
738 		if (TEXICMD__MAX == cmd)
739 			continue;
740 		if (NULL != texitoks[cmd].fp)
741 			(*texitoks[cmd].fp)(p, cmd, pos);
742 	}
743 }
744 
745 /*
746  * This should be invoked when we're on a macro line and want to process
747  * to the end of the current input line, doing all of our macros along
748  * the way.
749  */
750 void
parseeoln(struct texi * p,size_t * pos)751 parseeoln(struct texi *p, size_t *pos)
752 {
753 	size_t		  end;
754 	enum texicmd	  cmd;
755 	struct teximacro *macro;
756 
757 	while (*pos < BUFSZ(p) && '\n' != BUF(p)[*pos]) {
758 		while (*pos < BUFSZ(p) && isws(BUF(p)[*pos])) {
759 			p->seenws = 1;
760 			if (p->literal)
761 				texiputchar(p, BUF(p)[*pos]);
762 			advance(p, pos);
763 		}
764 		switch (BUF(p)[*pos]) {
765 		case ('}'):
766 			if (0 == p->ign)
767 				texiwarn(p, "unexpected \"}\"");
768 			advance(p, pos);
769 			continue;
770 		case ('{'):
771 			if (0 == p->ign)
772 				texiwarn(p, "unexpected \"{\"");
773 			advance(p, pos);
774 			continue;
775 		case ('@'):
776 			break;
777 		default:
778 			parseword(p, pos, '\0');
779 			continue;
780 		}
781 
782 		cmd = texicmd(p, *pos, &end, &macro);
783 		advanceto(p, pos, end);
784 		if (NULL != macro)
785 			texiexecmacro(p, macro, pos);
786 		if (TEXICMD__MAX == cmd)
787 			continue;
788 		if (NULL != texitoks[cmd].fp)
789 			(*texitoks[cmd].fp)(p, cmd, pos);
790 	}
791 
792 	if (*pos < BUFSZ(p) && '\n' == BUF(p)[*pos])
793 		advance(p, pos);
794 }
795 
796 /*
797  * Parse a single word or command.
798  * This will return immediately at the EOF.
799  */
800 static void
parsesingle(struct texi * p,size_t * pos)801 parsesingle(struct texi *p, size_t *pos)
802 {
803 	size_t		  end;
804 	enum texicmd	  cmd;
805 	struct teximacro *macro;
806 
807 	if ((*pos = advancenext(p, pos)) >= BUFSZ(p))
808 		return;
809 
810 	switch (BUF(p)[*pos]) {
811 	case ('}'):
812 		if (0 == p->ign)
813 			texiwarn(p, "unexpected \"}\"");
814 		advance(p, pos);
815 		return;
816 	case ('{'):
817 		if (0 == p->ign)
818 			texiwarn(p, "unexpected \"{\"");
819 		advance(p, pos);
820 		return;
821 	case ('@'):
822 		break;
823 	default:
824 		parseword(p, pos, '\0');
825 		return;
826 	}
827 
828 	cmd = texicmd(p, *pos, &end, &macro);
829 	advanceto(p, pos, end);
830 	if (NULL != macro)
831 		texiexecmacro(p, macro, pos);
832 	if (TEXICMD__MAX == cmd)
833 		return;
834 	if (NULL != texitoks[cmd].fp)
835 		(*texitoks[cmd].fp)(p, cmd, pos);
836 }
837 
838 /*
839  * This is used in the @deffn type of command.
840  * These have an arbitrary number of line arguments; however, these
841  * arguments may or may not be surrounded by brackets.
842  * In this function, we parse each one as either a bracketed or
843  * non-bracketed argument, returning 0 when we've reached the end of
844  * line or 1 otherwise.
845  */
846 int
parselinearg(struct texi * p,size_t * pos)847 parselinearg(struct texi *p, size_t *pos)
848 {
849 
850 	while (*pos < BUFSZ(p) && isws(BUF(p)[*pos])) {
851 		p->seenws = 1;
852 		advance(p, pos);
853 	}
854 
855 	if (*pos < BUFSZ(p) && '{' == BUF(p)[*pos])
856 		parsebracket(p, pos);
857 	else if (*pos < BUFSZ(p) && '\n' != BUF(p)[*pos])
858 		parsesingle(p, pos);
859 	else
860 		return(0);
861 
862 	return(1);
863 }
864 
/*
 * Parse the current buffer from its first byte until its end, one word
 * or command at a time.
 */
static void
parseeof(struct texi *p)
{
	size_t	 pos;

	pos = 0;
	while (pos < BUFSZ(p))
		parsesingle(p, &pos);
}
876 
877 void
texisplice(struct texi * p,const char * buf,size_t sz,size_t * pos)878 texisplice(struct texi *p, const char *buf, size_t sz, size_t *pos)
879 {
880 	char		*cp;
881 	struct texifile	*f;
882 
883 	assert(p->filepos > 0);
884 	f = &p->files[p->filepos - 1];
885 
886 	if (f->mapsz + sz > f->mapmaxsz) {
887 		f->mapmaxsz = f->mapsz + sz + 1024;
888 		cp = realloc(f->map, f->mapmaxsz);
889 		if (NULL == cp)
890 			texiabort(p, NULL);
891 		f->map = cp;
892 	}
893 
894 	f->insplice += sz;
895 	memmove(f->map + *pos + sz, f->map + *pos, f->mapsz - *pos);
896 	memcpy(f->map + *pos, buf, sz);
897 	f->mapsz += sz;
898 }
899 
900 /*
901  * Parse a block sequence until we have the "@end endtoken" command
902  * invocation.
903  * This will return immediately at EOF.
904  */
905 void
parseto(struct texi * p,size_t * pos,const char * endtoken)906 parseto(struct texi *p, size_t *pos, const char *endtoken)
907 {
908 	size_t		  end;
909 	enum texicmd	  cmd;
910 	size_t		  endtoksz;
911 	struct teximacro *macro;
912 
913 	endtoksz = strlen(endtoken);
914 	assert(endtoksz > 0);
915 
916 	while ((*pos = advancenext(p, pos)) < BUFSZ(p)) {
917 		switch (BUF(p)[*pos]) {
918 		case ('}'):
919 			if (0 == p->ign)
920 				texiwarn(p, "unexpected \"}\"");
921 			advance(p, pos);
922 			continue;
923 		case ('{'):
924 			if (0 == p->ign)
925 				texiwarn(p, "unexpected \"{\"");
926 			advance(p, pos);
927 			continue;
928 		case ('@'):
929 			break;
930 		default:
931 			parseword(p, pos, '\0');
932 			continue;
933 		}
934 
935 		cmd = texicmd(p, *pos, &end, &macro);
936 		advanceto(p, pos, end);
937 		if (TEXICMD_END == cmd) {
938 			while (*pos < BUFSZ(p) && isws(BUF(p)[*pos]))
939 				advance(p, pos);
940 			/*
941 			 * FIXME: check the full word, not just its
942 			 * initial substring!
943 			 */
944 			if (BUFSZ(p) - *pos >= endtoksz && 0 == strncmp
945 				 (&BUF(p)[*pos], endtoken, endtoksz)) {
946 				advanceeoln(p, pos, 0);
947 				break;
948 			}
949 			if (0 == p->ign)
950 				texiwarn(p, "unexpected \"end\"");
951 			advanceeoln(p, pos, 0);
952 			continue;
953 		}
954 		if (NULL != macro)
955 			texiexecmacro(p, macro, pos);
956 		if (TEXICMD__MAX == cmd)
957 			continue;
958 		if (NULL != texitoks[cmd].fp)
959 			(*texitoks[cmd].fp)(p, cmd, pos);
960 	}
961 }
962 
963 /*
964  * Like parsefile() but used for reading from stdandard input.
965  * This can only be called for the first file!
966  */
967 void
parsestdin(struct texi * p)968 parsestdin(struct texi *p)
969 {
970 	struct texifile	*f;
971 	ssize_t		 ssz;
972 
973 	assert(0 == p->filepos);
974 	f = &p->files[p->filepos];
975 	memset(f, 0, sizeof(struct texifile));
976 
977 	f->type = TEXISRC_STDIN;
978 	f->name = "<stdin>";
979 
980 	for (f->mapsz = 0; ; f->mapsz += (size_t)ssz) {
981 		if (f->mapsz == f->mapmaxsz) {
982 			if (f->mapmaxsz == (1U << 31))
983 				texierr(p, "stdin buffer too long");
984 			f->mapmaxsz = f->mapmaxsz > 65536 / 2 ?
985 				2 * f->mapmaxsz : 65536;
986 			f->map = realloc(f->map, f->mapmaxsz);
987 			if (NULL == f->map)
988 				texiabort(p, NULL);
989 		}
990 		ssz = read(STDIN_FILENO, f->map +
991 			(int)f->mapsz, f->mapmaxsz - f->mapsz);
992 		if (0 == ssz)
993 			break;
994 		else if (-1 == ssz)
995 			texiabort(p, NULL);
996 	}
997 
998 	p->filepos++;
999 	parseeof(p);
1000 	texifilepop(p);
1001 }
1002 
1003 /*
1004  * Memory-map the file "fname" and begin parsing it unless "parse" is
1005  * zero, in which case we just dump the file to stdout (making sure it
1006  * doesn't trip up mdoc(7) along the way).
1007  * This can be called in a nested context.
1008  */
1009 void
parsefile(struct texi * p,const char * fname,int parse)1010 parsefile(struct texi *p, const char *fname, int parse)
1011 {
1012 	struct texifile	*f;
1013 	int		 fd;
1014 	struct stat	 st;
1015 	size_t		 i;
1016 	char		*map;
1017 
1018 	if (64 == p->filepos)
1019 		texierr(p, "too many open files");
1020 	f = &p->files[p->filepos];
1021 	memset(f, 0, sizeof(struct texifile));
1022 
1023 	f->type = TEXISRC_FILE;
1024 	f->name = fname;
1025 	if (-1 == (fd = open(fname, O_RDONLY, 0))) {
1026 		texiabort(p, fname);
1027 	} else if (-1 == fstat(fd, &st)) {
1028 		close(fd);
1029 		texiabort(p, fname);
1030 	}
1031 
1032 	f->mapsz = f->mapmaxsz = st.st_size;
1033 	map = mmap(NULL, f->mapsz,
1034 		PROT_READ, MAP_SHARED, fd, 0);
1035 	close(fd);
1036 
1037 	if (MAP_FAILED == map)
1038 		texiabort(p, fname);
1039 
1040 	if ( ! parse) {
1041 		for (i = 0; i < f->mapsz; i++)
1042 			texiputchar(p, map[i]);
1043 		if (p->outcol)
1044 			texiputchar(p, '\n');
1045 		munmap(map, f->mapsz);
1046 		return;
1047 	}
1048 
1049 	p->filepos++;
1050 	f->map = malloc(f->mapsz);
1051 	memcpy(f->map, map, f->mapsz);
1052 	munmap(map, f->mapsz);
1053 	parseeof(p);
1054 	texifilepop(p);
1055 }
1056 
1057 /*
1058  * Look up the value to a stored pair's value starting in "buf" from
1059  * start to end.
1060  * Return the pointer to the value memory, which can be NULL if the
1061  * pointer key does not exist.
1062  * The pointer can point to NULL if the value has been unset.
1063  */
1064 static char **
valuequery(const struct texi * p,size_t start,size_t end)1065 valuequery(const struct texi *p, size_t start, size_t end)
1066 {
1067 	size_t	 i, sz, len;
1068 
1069 	assert(end >= start);
1070 	/* Ignore zero-length. */
1071 	if (0 == (len = (end - start)))
1072 		return(NULL);
1073 	for (i = 0; i < p->valsz; i++) {
1074 		sz = strlen(p->vals[i].key);
1075 		if (sz != len)
1076 			continue;
1077 		if (0 == strncmp(p->vals[i].key, &BUF(p)[start], len))
1078 			return(&p->vals[i].value);
1079 	}
1080 	return(NULL);
1081 }
1082 
1083 /*
1084  * Parse a key until the end of line, e.g., @clear foo\n, and return the
1085  * pointer to its value via valuequery().
1086  */
1087 static char **
valuelquery(struct texi * p,size_t * pos)1088 valuelquery(struct texi *p, size_t *pos)
1089 {
1090 	size_t	  start, end;
1091 	char	**ret;
1092 
1093 	while (*pos < BUFSZ(p) && isws(BUF(p)[*pos]))
1094 		advance(p, pos);
1095 	if (*pos == BUFSZ(p))
1096 		return(NULL);
1097 	for (start = end = *pos; end < BUFSZ(p); end++)
1098 		if ('\n' == BUF(p)[end])
1099 			break;
1100 	advanceto(p, pos, end);
1101 	if (*pos < BUFSZ(p)) {
1102 		assert('\n' == BUF(p)[*pos]);
1103 		advance(p, pos);
1104 	}
1105 	if (NULL == (ret = valuequery(p, start, end)))
1106 		return(NULL);
1107 	return(ret);
1108 }
1109 
/*
 * Parse a key until the end of line (see valuelquery()) and, if the
 * key exists, free its value and mark it as unset.
 * The key itself stays in the table.
 */
void
valuelclear(struct texi *p, size_t *pos)
{
	char	**slot;

	slot = valuelquery(p, pos);
	if (NULL != slot) {
		free(*slot);
		*slot = NULL;
	}
}
1120 
/*
 * Parse a key until the end of line (see valuelquery()) and return its
 * value, or NULL if the key does not exist or its value has been
 * unset.
 * The returned string still belongs to the value table.
 */
const char *
valuellookup(struct texi *p, size_t *pos)
{
	char	**slot;

	slot = valuelquery(p, pos);
	return(NULL == slot ? NULL : *slot);
}
1130 
1131 /*
1132  * Parse a key from a bracketed string, e.g., @value{foo}, and return
1133  * the pointer to its value.
1134  * If the returned pointer is NULL, either there was no string within
1135  * the brackets (or no brackets), or the value was not found, or the
1136  * value had previously been unset.
1137  */
1138 const char *
valueblookup(struct texi * p,size_t * pos)1139 valueblookup(struct texi *p, size_t *pos)
1140 {
1141 	size_t	  start, end;
1142 	char	**ret;
1143 
1144 	while (*pos < BUFSZ(p) && isws(BUF(p)[*pos]))
1145 		advance(p, pos);
1146 	if (*pos == BUFSZ(p) || '{' != BUF(p)[*pos])
1147 		return(NULL);
1148 	advance(p, pos);
1149 	for (start = end = *pos; end < BUFSZ(p); end++)
1150 		if ('}' == BUF(p)[end])
1151 			break;
1152 	advanceto(p, pos, end);
1153 	if (*pos < BUFSZ(p)) {
1154 		assert('}' == BUF(p)[*pos]);
1155 		advance(p, pos);
1156 	}
1157 	if (NULL == (ret = valuequery(p, start, end)))
1158 		return(NULL);
1159 	return(*ret);
1160 }
1161 
1162 void
valueadd(struct texi * p,char * key,char * val)1163 valueadd(struct texi *p, char *key, char *val)
1164 {
1165 	size_t	 i;
1166 
1167 	assert(NULL != key);
1168 	assert(NULL != val);
1169 
1170 	for (i = 0; i < p->valsz; i++)
1171 		if (0 == strcmp(p->vals[i].key, key))
1172 			break;
1173 
1174 	if (i < p->valsz) {
1175 		free(key);
1176 		free(p->vals[i].value);
1177 		p->vals[i].value = val;
1178 	} else {
1179 		/* FIXME: reallocarray() */
1180 		p->vals = realloc(p->vals,
1181 			(p->valsz + 1) *
1182 			 sizeof(struct texivalue));
1183 		if (NULL == p->vals)
1184 			texiabort(p, NULL);
1185 		p->vals[p->valsz].key = key;
1186 		p->vals[p->valsz].value = val;
1187 		p->valsz++;
1188 	}
1189 }
1190 
1191 /*
1192  * Take the arguments to a macro, e.g., @foo{bar, baz, xyzzy} (or the
1193  * declaration form, @macro foo {arg1, ...}) and textually convert it to
1194  * an array of arguments of size "argsz".
1195  * These need to be freed individually and as a whole.
1196  * NOTE: this will puke on @, or @} macros, which can trick it into
1197  * stopping argument parsing earlier.
1198  * Ergo, textual: this doesn't interpret the arguments in any way.
1199  */
1200 char **
argparse(struct texi * p,size_t * pos,size_t * argsz,size_t hint)1201 argparse(struct texi *p, size_t *pos, size_t *argsz, size_t hint)
1202 {
1203 	char	**args;
1204 	size_t	  start, end, stack;
1205 
1206 	while (*pos < BUFSZ(p) && isws(BUF(p)[*pos]))
1207 		advance(p, pos);
1208 
1209 	args = NULL;
1210 	*argsz = 0;
1211 
1212 	if ('{' != BUF(p)[*pos] && hint) {
1213 		/*
1214 		 * Special case: if we encounter an unbracketed argument
1215 		 * and we're being invoked with non-zero arguments
1216 		 * (versus being set, i.e., hint>0), then parse until
1217 		 * the end of line.
1218 		 */
1219 		*argsz = 1;
1220 		args = calloc(1, sizeof(char *));
1221 		if (NULL == args)
1222 			texiabort(p, NULL);
1223 		start = *pos;
1224 		while (*pos < BUFSZ(p)) {
1225 			if ('\n' == BUF(p)[*pos])
1226 				break;
1227 			advance(p, pos);
1228 		}
1229 		args[0] = malloc(*pos - start + 1);
1230 		memcpy(args[0], &BUF(p)[start], *pos - start);
1231 		args[0][*pos - start] = '\0';
1232 		if (*pos < BUFSZ(p) && '\n' == BUF(p)[*pos])
1233 			advance(p, pos);
1234 		return(args);
1235 	} else if ('{' != BUF(p)[*pos])
1236 		return(args);
1237 
1238 	/* Parse til the closing '}', putting into the array. */
1239 	advance(p, pos);
1240 	while (*pos < BUFSZ(p)) {
1241 		while (*pos < BUFSZ(p) && isws(BUF(p)[*pos]))
1242 			advance(p, pos);
1243 		start = *pos;
1244 		stack = 0;
1245 		while (*pos < BUFSZ(p)) {
1246 			/*
1247 			 * According to the manual, commas within
1248 			 * embedded commands are escaped.
1249 			 * We keep track of embedded-ness in the "stack"
1250 			 * state anyway, so this is free.
1251 			 */
1252 			if (',' == BUF(p)[*pos] && 0 == stack && 1 != hint)
1253 				break;
1254 			else if (0 == stack && '}' == BUF(p)[*pos])
1255 				break;
1256 			else if (0 != stack && '}' == BUF(p)[*pos])
1257 				stack--;
1258 			else if ('{' == BUF(p)[*pos])
1259 				stack++;
1260 			advance(p, pos);
1261 		}
1262 		if (stack)
1263 			texiwarn(p, "unterminated macro "
1264 				"in macro arguments");
1265 		if ((end = *pos) == BUFSZ(p))
1266 			break;
1267 		/* Test for zero-length '{  }'. */
1268 		if (start == end && '}' == BUF(p)[*pos] && 0 == *argsz)
1269 			break;
1270 		/* FIXME: use reallocarray. */
1271 		args = realloc
1272 			(args, sizeof(char *) *
1273 			 (*argsz + 1));
1274 		if (NULL == args)
1275 			texiabort(p, NULL);
1276 		args[*argsz] = malloc(end - start + 1);
1277 		if (NULL == args[*argsz])
1278 			texiabort(p, NULL);
1279 		memcpy(args[*argsz],
1280 			&BUF(p)[start], end - start);
1281 		args[*argsz][end - start] = '\0';
1282 		(*argsz)++;
1283 		if ('}' == BUF(p)[*pos])
1284 			break;
1285 		advance(p, pos);
1286 	}
1287 
1288 	if (*pos == BUFSZ(p))
1289 		texierr(p, "unterminated arguments");
1290 	assert('}' == BUF(p)[*pos]);
1291 	advance(p, pos);
1292 	return(args);
1293 }
1294