1 /* $Id: util.c,v 1.15 2015/02/25 14:49:14 kristaps Exp $ */
2 /*
3 * Copyright (c) 2015 Kristaps Dzonsons <kristaps@bsd.lv>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17 #include <sys/mman.h>
18 #include <sys/stat.h>
19
20 #include <assert.h>
21 #include <ctype.h>
22 #include <fcntl.h>
23 #include <getopt.h>
24 #include <libgen.h>
25 #include <limits.h>
26 #include <stdarg.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <time.h>
31 #include <unistd.h>
32
33 #include "extern.h"
34
35 /*
36 * Unmap the top-most file in the stack of files currently opened (that
37 * is, nested calls to parsefile()).
38 */
39 void
texifilepop(struct texi * p)40 texifilepop(struct texi *p)
41 {
42 struct texifile *f;
43
44 assert(p->filepos > 0);
45 f = &p->files[--p->filepos];
46 free(f->map);
47 }
48
49 static void
teximacrofree(struct teximacro * p)50 teximacrofree(struct teximacro *p)
51 {
52 size_t i;
53
54 for (i = 0; i < p->argsz; i++)
55 free(p->args[i]);
56
57 free(p->args);
58 free(p->key);
59 free(p->value);
60 }
61
62 static void
texivaluefree(struct texivalue * p)63 texivaluefree(struct texivalue *p)
64 {
65
66 free(p->key);
67 free(p->value);
68 }
69
70 /*
71 * Unmap all files that we're currently using and free all resources
72 * that we've allocated during the parse.
73 * The utility should exit(...) after this is called.
74 */
75 void
texiexit(struct texi * p)76 texiexit(struct texi *p)
77 {
78 size_t i;
79
80 /* Make sure we're newline-terminated. */
81 if (p->outcol)
82 putchar('\n');
83
84 /* Unmap all files. */
85 while (p->filepos > 0)
86 texifilepop(p);
87
88 for (i = 0; i < p->macrosz; i++)
89 teximacrofree(&p->macros[i]);
90 for (i = 0; i < p->dirsz; i++)
91 free(p->dirs[i]);
92 for (i = 0; i < p->indexsz; i++)
93 free(p->indexs[i]);
94 for (i = 0; i < p->valsz; i++)
95 texivaluefree(&p->vals[i]);
96
97 free(p->macros);
98 free(p->vals);
99 free(p->indexs);
100 free(p->dirs);
101 free(p->subtitle);
102 free(p->title);
103 }
104
/*
 * Fatal system error.
 * Reports "errstring" through perror(3) (which may be NULL), releases
 * all parse state with texiexit(), and terminates the process with
 * EXIT_FAILURE.
 * Does not return.
 */
void
texiabort(struct texi *p, const char *errstring)
{
	/* Report first: the cleanup below could disturb errno. */
	perror(errstring);

	texiexit(p);
	exit(EXIT_FAILURE);
}
117
118 /*
119 * Print a generic warning message (to stderr) tied to our current
120 * location in the parse sequence.
121 */
122 void
texiwarn(const struct texi * p,const char * fmt,...)123 texiwarn(const struct texi *p, const char *fmt, ...)
124 {
125 va_list ap;
126 const struct texifile *f;
127
128 f = &p->files[p->filepos - 1];
129
130 if (f->insplice)
131 fprintf(stderr, "%s:%zu:%zu (%zuB left in splice): "
132 "warning: ", f->name, f->line + 1,
133 f->col + 1, f->insplice);
134 else
135 fprintf(stderr, "%s:%zu:%zu: warning: ",
136 f->name, f->line + 1, f->col + 1);
137
138 va_start(ap, fmt);
139 vfprintf(stderr, fmt, ap);
140 va_end(ap);
141 fputc('\n', stderr);
142 }
143
144 /*
145 * Print an error message (to stderr) tied to our current location in
146 * the parse sequence, invoke texiexit(), then die.
147 */
148 void
texierr(struct texi * p,const char * fmt,...)149 texierr(struct texi *p, const char *fmt, ...)
150 {
151 va_list ap;
152 struct texifile *f;
153
154 f = &p->files[p->filepos - 1];
155
156 if (f->insplice)
157 fprintf(stderr, "%s:%zu:%zu: (%zuB left in splice): "
158 "error: ", f->name, f->line + 1,
159 f->col + 1, f->insplice);
160 else
161 fprintf(stderr, "%s:%zu:%zu: error: ",
162 f->name, f->line + 1, f->col + 1);
163
164 va_start(ap, fmt);
165 vfprintf(stderr, fmt, ap);
166 va_end(ap);
167 fputc('\n', stderr);
168 texiexit(p);
169 exit(EXIT_FAILURE);
170 }
171
172 /*
173 * Put a single data character to the output if we're not ignoring.
174 * Escape starting a line with a control character and slashes.
175 */
176 void
texiputchar(struct texi * p,char c)177 texiputchar(struct texi *p, char c)
178 {
179
180 if (p->ign)
181 return;
182 if ('.' == c && 0 == p->outcol)
183 fputs("\\&", stdout);
184 if ('\'' == c && 0 == p->outcol)
185 fputs("\\&", stdout);
186
187 putchar(c);
188 if ('\\' == c)
189 putchar('e');
190 p->seenvs = 0;
191 if ('\n' == c) {
192 p->outcol = 0;
193 p->seenws = 0;
194 } else
195 p->outcol++;
196 }
197
198 /*
199 * Put an opaque series of characters.
200 * Characters starting a line with a control character are escaped, but
201 * that's it, so don't use this for non-controlled sequences of text.
202 */
203 void
texiputchars(struct texi * p,const char * s)204 texiputchars(struct texi *p, const char *s)
205 {
206
207 if (p->ign)
208 return;
209 if ('.' == *s && 0 == p->outcol)
210 fputs("\\&", stdout);
211 if ('\'' == *s && 0 == p->outcol)
212 fputs("\\&", stdout);
213 p->outcol += fputs(s, stdout);
214 p->seenvs = 0;
215 }
216
217 /*
218 * This puts all characters onto the output stream but makes sure to
219 * escape mdoc(7) slashes.
220 * FIXME: useless.
221 */
222 void
texiputbuf(struct texi * p,size_t start,size_t end)223 texiputbuf(struct texi *p, size_t start, size_t end)
224 {
225
226 for ( ; start < end; start++)
227 texiputchar(p, BUF(p)[start]);
228 }
229
230 /*
231 * Close an mdoc(7) macro opened with teximacroopen().
232 * If there are no more macros on the line, prints a newline.
233 */
234 void
teximacroclose(struct texi * p)235 teximacroclose(struct texi *p)
236 {
237
238 if (p->ign)
239 return;
240
241 if (0 == --p->outmacro) {
242 putchar('\n');
243 p->outcol = p->seenws = 0;
244 }
245 }
246
247 /*
248 * Open a mdoc(7) macro.
249 * This is used for line macros, e.g., Qq [foo bar baz].
250 * It can be invoked for nested macros, e.g., Qq Li foo .
251 * TODO: flush-right punctuation (e.g., parenthesis).
252 */
253 void
teximacroopen(struct texi * p,const char * s)254 teximacroopen(struct texi *p, const char *s)
255 {
256 int rc;
257
258 if (p->ign)
259 return;
260
261 if (p->outcol && 0 == p->outmacro) {
262 putchar('\n');
263 p->outcol = 0;
264 }
265
266 if (0 == p->outmacro)
267 putchar('.');
268 else
269 putchar(' ');
270
271 if (EOF != (rc = fputs(s, stdout)))
272 p->outcol += rc;
273
274 putchar(' ');
275 p->outcol++;
276 p->outmacro++;
277 p->seenws = 0;
278 }
279
280 /*
281 * Put a stadnalone mdoc(7) command with the trailing newline.
282 */
283 void
teximacro(struct texi * p,const char * s)284 teximacro(struct texi *p, const char *s)
285 {
286
287 if (p->ign)
288 return;
289
290 if (p->outmacro)
291 texierr(p, "\"%s\" in open line scope!?", s);
292 if (p->literal)
293 texierr(p, "\"%s\" in a literal scope!?", s);
294
295 if (p->outcol)
296 putchar('\n');
297
298 putchar('.');
299 puts(s);
300 p->outcol = p->seenws = 0;
301 }
302
303 /*
304 * Introduce vertical space during normal (non-macro) input.
305 */
306 void
texivspace(struct texi * p)307 texivspace(struct texi *p)
308 {
309
310 if (p->seenvs || TEXILIST_TABLE == p->list)
311 return;
312 teximacro(p, "Pp");
313 p->seenvs = 1;
314 }
315
316 /*
317 * Advance by a single byte in the input stream, adjusting our location
318 * in the current input file.
319 */
320 void
advance(struct texi * p,size_t * pos)321 advance(struct texi *p, size_t *pos)
322 {
323 struct texifile *f;
324
325 f = &p->files[p->filepos - 1];
326
327 if (0 == f->insplice) {
328 if ('\n' == BUF(p)[*pos]) {
329 f->line++;
330 f->col = 0;
331 } else
332 f->col++;
333 } else
334 --f->insplice;
335
336 (*pos)++;
337 }
338
339 /*
340 * It's common to wait punctuation to float on the right side of macro
341 * lines in mdoc(7), e.g., ".Em hello ) ."
342 * This function does so, and should be called before teximacroclose().
343 * It will detect that it's the last in the nested macros and
344 * appropriately flush-left punctuation alongside the macro.
345 */
346 void
texipunctuate(struct texi * p,size_t * pos)347 texipunctuate(struct texi *p, size_t *pos)
348 {
349 size_t start, end;
350
351 if (1 != p->outmacro)
352 return;
353
354 for (start = end = *pos; end < BUFSZ(p); end++) {
355 switch (BUF(p)[end]) {
356 case (','):
357 case (')'):
358 case ('.'):
359 case ('"'):
360 case (':'):
361 case ('!'):
362 case ('?'):
363 continue;
364 default:
365 break;
366 }
367 break;
368 }
369 if (end == *pos)
370 return;
371 if (end + 1 == BUFSZ(p) || ' ' == BUF(p)[end] ||
372 '\n' == BUF(p)[end]) {
373 for ( ; start < end; start++) {
374 texiputchar(p, ' ');
375 texiputchar(p, BUF(p)[start]);
376 advance(p, pos);
377 }
378 }
379 }
380
381 /*
382 * Advance to the next non-whitespace word in the input stream.
383 * If we're in literal mode, then print all of the whitespace as we're
384 * doing so.
385 */
386 static size_t
advancenext(struct texi * p,size_t * pos)387 advancenext(struct texi *p, size_t *pos)
388 {
389
390 if (p->literal) {
391 while (*pos < BUFSZ(p) && ismspace(BUF(p)[*pos])) {
392 texiputchar(p, BUF(p)[*pos]);
393 advance(p, pos);
394 }
395 return(*pos);
396 }
397
398 while (*pos < BUFSZ(p) && ismspace(BUF(p)[*pos])) {
399 p->seenws = 1;
400 /*
401 * If it looks like we've printed a double-line, then
402 * output a paragraph.
403 * FIXME: this is stupid.
404 */
405 if (*pos && '\n' == BUF(p)[*pos] && '\n' == BUF(p)[*pos - 1])
406 texivspace(p);
407 advance(p, pos);
408 }
409 return(*pos);
410 }
411
412 /*
413 * Advance to the EOLN in the input stream.
414 * NOTE: THIS SHOULD NOT BE CALLED ON BLANK TEXT, as it will read up to
415 * the @\n.
416 */
417 size_t
advanceeoln(struct texi * p,size_t * pos,int consumenl)418 advanceeoln(struct texi *p, size_t *pos, int consumenl)
419 {
420
421 while (*pos < BUFSZ(p) && '\n' != BUF(p)[*pos])
422 advance(p, pos);
423 if (*pos < BUFSZ(p) && consumenl)
424 advance(p, pos);
425 return(*pos);
426 }
427
/*
 * Step through the input byte by byte until "pos" reaches the absolute
 * buffer position "end", which must not lie before the current
 * position.
 */
void
advanceto(struct texi *p, size_t *pos, size_t end)
{

	assert(*pos <= end);

	for ( ; *pos < end; )
		advance(p, pos);
}
440
441 static void
texiexecmacro(struct texi * p,struct teximacro * m,size_t * pos)442 texiexecmacro(struct texi *p, struct teximacro *m, size_t *pos)
443 {
444 size_t valsz, realsz, aasz, asz,
445 ssz, i, j, k, start, end;
446 char *val;
447 char **args;
448 const char *cp;
449
450 args = argparse(p, pos, &asz, m->argsz);
451 if (asz != m->argsz)
452 texiwarn(p, "invalid macro argument length");
453 aasz = asz < m->argsz ? asz : m->argsz;
454
455 if (0 == aasz) {
456 texisplice(p, m->value, strlen(m->value), pos);
457 return;
458 }
459
460 valsz = realsz = strlen(m->value);
461 val = strdup(m->value);
462
463 for (i = j = 0; i < realsz; i++) {
464 /* Parse blindly til the backslash delimiter. */
465 if ('\\' != m->value[i]) {
466 val[j++] = m->value[i];
467 val[j] = '\0';
468 continue;
469 } else if (i == realsz - 1)
470 texierr(p, "trailing argument name delimiter");
471
472 /* Double-backslash is escaped. */
473 if ('\\' == m->value[i + 1]) {
474 val[j++] = m->value[i++];
475 val[j] = '\0';
476 continue;
477 }
478
479 assert('\\' == m->value[i] && i < realsz - 1);
480
481 /* Parse to terminating delimiter. */
482 /* FIXME: embedded, escaped delimiters? */
483 for (start = end = i + 1; end < realsz; end++)
484 if ('\\' == m->value[end])
485 break;
486 if (end == realsz)
487 texierr(p, "unterminated argument name");
488
489 for (k = 0; k < aasz; k++) {
490 if ((ssz = strlen(m->args[k])) != (end - start))
491 continue;
492 if (strncmp(&m->value[start], m->args[k], ssz))
493 continue;
494 break;
495 }
496
497 /*
498 * Argument didn't exist in argument table.
499 * Just ignore it.
500 */
501 if (k == aasz) {
502 i = end;
503 continue;
504 }
505
506 if (strlen(args[k]) > ssz) {
507 valsz += strlen(args[k]);
508 val = realloc(val, valsz + 1);
509 if (NULL == val)
510 texiabort(p, NULL);
511 }
512
513 for (cp = args[k]; '\0' != *cp; cp++)
514 val[j++] = *cp;
515
516 val[j] = '\0';
517 i = end;
518 }
519
520 texisplice(p, val, strlen(val), pos);
521
522 for (i = 0; i < asz; i++)
523 free(args[i]);
524 free(args);
525 free(val);
526 }
527
528 /*
529 * Output a free-form word in the input stream, progressing to the next
530 * command or white-space.
531 * This also will advance the input stream.
532 */
533 static void
parseword(struct texi * p,size_t * pos,char extra)534 parseword(struct texi *p, size_t *pos, char extra)
535 {
536
537 if (p->seenws && 0 == p->outmacro &&
538 p->outcol > 72 && 0 == p->literal)
539 texiputchar(p, '\n');
540 /* FIXME: abstract this: we use it elsewhere. */
541 if (p->seenws && p->outcol && 0 == p->literal)
542 texiputchar(p, ' ');
543
544 p->seenws = 0;
545
546 while (*pos < BUFSZ(p) && ! ismspace(BUF(p)[*pos])) {
547 switch (BUF(p)[*pos]) {
548 case ('@'):
549 case ('}'):
550 case ('{'):
551 return;
552 }
553 if ('\0' != extra && BUF(p)[*pos] == extra)
554 return;
555 if (*pos < BUFSZ(p) - 1 &&
556 '`' == BUF(p)[*pos] &&
557 '`' == BUF(p)[*pos + 1]) {
558 texiputchars(p, "\\(lq");
559 advance(p, pos);
560 } else if (*pos < BUFSZ(p) - 1 &&
561 '\'' == BUF(p)[*pos] &&
562 '\'' == BUF(p)[*pos + 1]) {
563 texiputchars(p, "\\(rq");
564 advance(p, pos);
565 } else
566 texiputchar(p, BUF(p)[*pos]);
567 advance(p, pos);
568 }
569 }
570
571 /*
572 * Look up the command at position "pos" in the buffer, returning it (or
573 * TEXICMD__MAX if none found) and setting "end" to be the absolute
574 * index after the command name.
575 */
576 enum texicmd
texicmd(struct texi * p,size_t pos,size_t * end,struct teximacro ** macro)577 texicmd(struct texi *p, size_t pos, size_t *end, struct teximacro **macro)
578 {
579 size_t i, len, toksz;
580
581 assert('@' == BUF(p)[pos]);
582
583 if (NULL != macro)
584 *macro = NULL;
585
586 if ((*end = pos) == BUFSZ(p))
587 return(TEXICMD__MAX);
588 else if ((*end = ++pos) == BUFSZ(p))
589 return(TEXICMD__MAX);
590
591 /* Alphabetic commands are special. */
592 if ( ! isalpha(BUF(p)[pos])) {
593 if ((*end = pos + 1) == BUFSZ(p))
594 return(TEXICMD__MAX);
595 for (i = 0; i < TEXICMD__MAX; i++) {
596 if (1 != texitoks[i].len)
597 continue;
598 if (0 == strncmp(texitoks[i].tok, &BUF(p)[pos], 1))
599 return(i);
600 }
601 texiwarn(p, "bad command: @%c", BUF(p)[pos]);
602 return(TEXICMD__MAX);
603 }
604
605 /* Scan to the end of the possible command name. */
606 for (*end = pos; *end < BUFSZ(p) && ! ismspace(BUF(p)[*end]); (*end)++)
607 if ((*end > pos && ('@' == BUF(p)[*end] ||
608 '{' == BUF(p)[*end] || '}' == BUF(p)[*end])))
609 break;
610
611 /* Look for the command. */
612 len = *end - pos;
613 for (i = 0; i < TEXICMD__MAX; i++) {
614 if (len != texitoks[i].len)
615 continue;
616 if (0 == strncmp(texitoks[i].tok, &BUF(p)[pos], len))
617 return(i);
618 }
619
620 /* Look for it in our indices. */
621 for (i = 0; i < p->indexsz; i++) {
622 toksz = strlen(p->indexs[i]);
623 if (len != 5 + toksz)
624 continue;
625 if (strncmp(&BUF(p)[pos], p->indexs[i], toksz))
626 continue;
627 if (0 == strncmp(&BUF(p)[pos + toksz], "index", 5))
628 return(TEXICMD_USER_INDEX);
629 }
630
631 for (i = 0; i < p->macrosz; i++) {
632 if (len != strlen(p->macros[i].key))
633 continue;
634 if (strncmp(&BUF(p)[pos], p->macros[i].key, len))
635 continue;
636 if (NULL != macro)
637 *macro = &p->macros[i];
638 return(TEXICMD__MAX);
639 }
640
641 texiwarn(p, "bad command: @%.*s", (int)len, &BUF(p)[pos]);
642 return(TEXICMD__MAX);
643 }
644
645 /*
646 * Parse an argument from a bracketed command, e.g., @url{foo, baz}.
647 * Num should be set to the argument we're currently parsing, although
648 * it suffixes for it to be zero or non-zero.
649 * This will return 1 if there are more arguments, 0 otherwise.
650 * This will stop (returning 0) in the event of EOF or if we're not at a
651 * bracket for the zeroth parse.
652 */
653 int
parsearg(struct texi * p,size_t * pos,size_t num)654 parsearg(struct texi *p, size_t *pos, size_t num)
655 {
656 size_t end;
657 enum texicmd cmd;
658 struct teximacro *macro;
659
660 while (*pos < BUFSZ(p) && ismspace(BUF(p)[*pos]))
661 advance(p, pos);
662 if (*pos == BUFSZ(p) || (0 == num && '{' != BUF(p)[*pos]))
663 return(0);
664 if (0 == num)
665 advance(p, pos);
666
667 while ((*pos = advancenext(p, pos)) < BUFSZ(p)) {
668 switch (BUF(p)[*pos]) {
669 case (','):
670 advance(p, pos);
671 return(1);
672 case ('}'):
673 advance(p, pos);
674 return(0);
675 case ('{'):
676 if (0 == p->ign)
677 texiwarn(p, "unexpected \"{\"");
678 advance(p, pos);
679 continue;
680 case ('@'):
681 break;
682 default:
683 parseword(p, pos, ',');
684 continue;
685 }
686
687 cmd = texicmd(p, *pos, &end, ¯o);
688 advanceto(p, pos, end);
689 if (NULL != macro)
690 texiexecmacro(p, macro, pos);
691 if (TEXICMD__MAX == cmd)
692 continue;
693 if (NULL != texitoks[cmd].fp)
694 (*texitoks[cmd].fp)(p, cmd, pos);
695 }
696 return(0);
697 }
698
699 /*
700 * Parse until the end of a bracketed statement, e.g., @foo{bar baz}.
701 * This will stop in the event of EOF or if we're not at a bracket.
702 */
703 void
parsebracket(struct texi * p,size_t * pos)704 parsebracket(struct texi *p, size_t *pos)
705 {
706 size_t end;
707 enum texicmd cmd;
708 struct teximacro *macro;
709
710 while (*pos < BUFSZ(p) && ismspace(BUF(p)[*pos]))
711 advance(p, pos);
712
713 if (*pos == BUFSZ(p) || '{' != BUF(p)[*pos])
714 return;
715 advance(p, pos);
716
717 while ((*pos = advancenext(p, pos)) < BUFSZ(p)) {
718 switch (BUF(p)[*pos]) {
719 case ('}'):
720 advance(p, pos);
721 return;
722 case ('{'):
723 if (0 == p->ign)
724 texiwarn(p, "unexpected \"{\"");
725 advance(p, pos);
726 continue;
727 case ('@'):
728 break;
729 default:
730 parseword(p, pos, '\0');
731 continue;
732 }
733
734 cmd = texicmd(p, *pos, &end, ¯o);
735 advanceto(p, pos, end);
736 if (NULL != macro)
737 texiexecmacro(p, macro, pos);
738 if (TEXICMD__MAX == cmd)
739 continue;
740 if (NULL != texitoks[cmd].fp)
741 (*texitoks[cmd].fp)(p, cmd, pos);
742 }
743 }
744
745 /*
746 * This should be invoked when we're on a macro line and want to process
747 * to the end of the current input line, doing all of our macros along
748 * the way.
749 */
750 void
parseeoln(struct texi * p,size_t * pos)751 parseeoln(struct texi *p, size_t *pos)
752 {
753 size_t end;
754 enum texicmd cmd;
755 struct teximacro *macro;
756
757 while (*pos < BUFSZ(p) && '\n' != BUF(p)[*pos]) {
758 while (*pos < BUFSZ(p) && isws(BUF(p)[*pos])) {
759 p->seenws = 1;
760 if (p->literal)
761 texiputchar(p, BUF(p)[*pos]);
762 advance(p, pos);
763 }
764 switch (BUF(p)[*pos]) {
765 case ('}'):
766 if (0 == p->ign)
767 texiwarn(p, "unexpected \"}\"");
768 advance(p, pos);
769 continue;
770 case ('{'):
771 if (0 == p->ign)
772 texiwarn(p, "unexpected \"{\"");
773 advance(p, pos);
774 continue;
775 case ('@'):
776 break;
777 default:
778 parseword(p, pos, '\0');
779 continue;
780 }
781
782 cmd = texicmd(p, *pos, &end, ¯o);
783 advanceto(p, pos, end);
784 if (NULL != macro)
785 texiexecmacro(p, macro, pos);
786 if (TEXICMD__MAX == cmd)
787 continue;
788 if (NULL != texitoks[cmd].fp)
789 (*texitoks[cmd].fp)(p, cmd, pos);
790 }
791
792 if (*pos < BUFSZ(p) && '\n' == BUF(p)[*pos])
793 advance(p, pos);
794 }
795
796 /*
797 * Parse a single word or command.
798 * This will return immediately at the EOF.
799 */
800 static void
parsesingle(struct texi * p,size_t * pos)801 parsesingle(struct texi *p, size_t *pos)
802 {
803 size_t end;
804 enum texicmd cmd;
805 struct teximacro *macro;
806
807 if ((*pos = advancenext(p, pos)) >= BUFSZ(p))
808 return;
809
810 switch (BUF(p)[*pos]) {
811 case ('}'):
812 if (0 == p->ign)
813 texiwarn(p, "unexpected \"}\"");
814 advance(p, pos);
815 return;
816 case ('{'):
817 if (0 == p->ign)
818 texiwarn(p, "unexpected \"{\"");
819 advance(p, pos);
820 return;
821 case ('@'):
822 break;
823 default:
824 parseword(p, pos, '\0');
825 return;
826 }
827
828 cmd = texicmd(p, *pos, &end, ¯o);
829 advanceto(p, pos, end);
830 if (NULL != macro)
831 texiexecmacro(p, macro, pos);
832 if (TEXICMD__MAX == cmd)
833 return;
834 if (NULL != texitoks[cmd].fp)
835 (*texitoks[cmd].fp)(p, cmd, pos);
836 }
837
838 /*
839 * This is used in the @deffn type of command.
840 * These have an arbitrary number of line arguments; however, these
841 * arguments may or may not be surrounded by brackets.
842 * In this function, we parse each one as either a bracketed or
843 * non-bracketed argument, returning 0 when we've reached the end of
844 * line or 1 otherwise.
845 */
846 int
parselinearg(struct texi * p,size_t * pos)847 parselinearg(struct texi *p, size_t *pos)
848 {
849
850 while (*pos < BUFSZ(p) && isws(BUF(p)[*pos])) {
851 p->seenws = 1;
852 advance(p, pos);
853 }
854
855 if (*pos < BUFSZ(p) && '{' == BUF(p)[*pos])
856 parsebracket(p, pos);
857 else if (*pos < BUFSZ(p) && '\n' != BUF(p)[*pos])
858 parsesingle(p, pos);
859 else
860 return(0);
861
862 return(1);
863 }
864
/*
 * Parse the current buffer from its first byte to its end, one word or
 * command at a time.
 * The buffer size is re-read on every iteration.
 */
static void
parseeof(struct texi *p)
{
	size_t	 pos;

	pos = 0;
	while (pos < BUFSZ(p))
		parsesingle(p, &pos);
}
876
877 void
texisplice(struct texi * p,const char * buf,size_t sz,size_t * pos)878 texisplice(struct texi *p, const char *buf, size_t sz, size_t *pos)
879 {
880 char *cp;
881 struct texifile *f;
882
883 assert(p->filepos > 0);
884 f = &p->files[p->filepos - 1];
885
886 if (f->mapsz + sz > f->mapmaxsz) {
887 f->mapmaxsz = f->mapsz + sz + 1024;
888 cp = realloc(f->map, f->mapmaxsz);
889 if (NULL == cp)
890 texiabort(p, NULL);
891 f->map = cp;
892 }
893
894 f->insplice += sz;
895 memmove(f->map + *pos + sz, f->map + *pos, f->mapsz - *pos);
896 memcpy(f->map + *pos, buf, sz);
897 f->mapsz += sz;
898 }
899
900 /*
901 * Parse a block sequence until we have the "@end endtoken" command
902 * invocation.
903 * This will return immediately at EOF.
904 */
905 void
parseto(struct texi * p,size_t * pos,const char * endtoken)906 parseto(struct texi *p, size_t *pos, const char *endtoken)
907 {
908 size_t end;
909 enum texicmd cmd;
910 size_t endtoksz;
911 struct teximacro *macro;
912
913 endtoksz = strlen(endtoken);
914 assert(endtoksz > 0);
915
916 while ((*pos = advancenext(p, pos)) < BUFSZ(p)) {
917 switch (BUF(p)[*pos]) {
918 case ('}'):
919 if (0 == p->ign)
920 texiwarn(p, "unexpected \"}\"");
921 advance(p, pos);
922 continue;
923 case ('{'):
924 if (0 == p->ign)
925 texiwarn(p, "unexpected \"{\"");
926 advance(p, pos);
927 continue;
928 case ('@'):
929 break;
930 default:
931 parseword(p, pos, '\0');
932 continue;
933 }
934
935 cmd = texicmd(p, *pos, &end, ¯o);
936 advanceto(p, pos, end);
937 if (TEXICMD_END == cmd) {
938 while (*pos < BUFSZ(p) && isws(BUF(p)[*pos]))
939 advance(p, pos);
940 /*
941 * FIXME: check the full word, not just its
942 * initial substring!
943 */
944 if (BUFSZ(p) - *pos >= endtoksz && 0 == strncmp
945 (&BUF(p)[*pos], endtoken, endtoksz)) {
946 advanceeoln(p, pos, 0);
947 break;
948 }
949 if (0 == p->ign)
950 texiwarn(p, "unexpected \"end\"");
951 advanceeoln(p, pos, 0);
952 continue;
953 }
954 if (NULL != macro)
955 texiexecmacro(p, macro, pos);
956 if (TEXICMD__MAX == cmd)
957 continue;
958 if (NULL != texitoks[cmd].fp)
959 (*texitoks[cmd].fp)(p, cmd, pos);
960 }
961 }
962
963 /*
964 * Like parsefile() but used for reading from stdandard input.
965 * This can only be called for the first file!
966 */
967 void
parsestdin(struct texi * p)968 parsestdin(struct texi *p)
969 {
970 struct texifile *f;
971 ssize_t ssz;
972
973 assert(0 == p->filepos);
974 f = &p->files[p->filepos];
975 memset(f, 0, sizeof(struct texifile));
976
977 f->type = TEXISRC_STDIN;
978 f->name = "<stdin>";
979
980 for (f->mapsz = 0; ; f->mapsz += (size_t)ssz) {
981 if (f->mapsz == f->mapmaxsz) {
982 if (f->mapmaxsz == (1U << 31))
983 texierr(p, "stdin buffer too long");
984 f->mapmaxsz = f->mapmaxsz > 65536 / 2 ?
985 2 * f->mapmaxsz : 65536;
986 f->map = realloc(f->map, f->mapmaxsz);
987 if (NULL == f->map)
988 texiabort(p, NULL);
989 }
990 ssz = read(STDIN_FILENO, f->map +
991 (int)f->mapsz, f->mapmaxsz - f->mapsz);
992 if (0 == ssz)
993 break;
994 else if (-1 == ssz)
995 texiabort(p, NULL);
996 }
997
998 p->filepos++;
999 parseeof(p);
1000 texifilepop(p);
1001 }
1002
1003 /*
1004 * Memory-map the file "fname" and begin parsing it unless "parse" is
1005 * zero, in which case we just dump the file to stdout (making sure it
1006 * doesn't trip up mdoc(7) along the way).
1007 * This can be called in a nested context.
1008 */
1009 void
parsefile(struct texi * p,const char * fname,int parse)1010 parsefile(struct texi *p, const char *fname, int parse)
1011 {
1012 struct texifile *f;
1013 int fd;
1014 struct stat st;
1015 size_t i;
1016 char *map;
1017
1018 if (64 == p->filepos)
1019 texierr(p, "too many open files");
1020 f = &p->files[p->filepos];
1021 memset(f, 0, sizeof(struct texifile));
1022
1023 f->type = TEXISRC_FILE;
1024 f->name = fname;
1025 if (-1 == (fd = open(fname, O_RDONLY, 0))) {
1026 texiabort(p, fname);
1027 } else if (-1 == fstat(fd, &st)) {
1028 close(fd);
1029 texiabort(p, fname);
1030 }
1031
1032 f->mapsz = f->mapmaxsz = st.st_size;
1033 map = mmap(NULL, f->mapsz,
1034 PROT_READ, MAP_SHARED, fd, 0);
1035 close(fd);
1036
1037 if (MAP_FAILED == map)
1038 texiabort(p, fname);
1039
1040 if ( ! parse) {
1041 for (i = 0; i < f->mapsz; i++)
1042 texiputchar(p, map[i]);
1043 if (p->outcol)
1044 texiputchar(p, '\n');
1045 munmap(map, f->mapsz);
1046 return;
1047 }
1048
1049 p->filepos++;
1050 f->map = malloc(f->mapsz);
1051 memcpy(f->map, map, f->mapsz);
1052 munmap(map, f->mapsz);
1053 parseeof(p);
1054 texifilepop(p);
1055 }
1056
1057 /*
1058 * Look up the value to a stored pair's value starting in "buf" from
1059 * start to end.
1060 * Return the pointer to the value memory, which can be NULL if the
1061 * pointer key does not exist.
1062 * The pointer can point to NULL if the value has been unset.
1063 */
1064 static char **
valuequery(const struct texi * p,size_t start,size_t end)1065 valuequery(const struct texi *p, size_t start, size_t end)
1066 {
1067 size_t i, sz, len;
1068
1069 assert(end >= start);
1070 /* Ignore zero-length. */
1071 if (0 == (len = (end - start)))
1072 return(NULL);
1073 for (i = 0; i < p->valsz; i++) {
1074 sz = strlen(p->vals[i].key);
1075 if (sz != len)
1076 continue;
1077 if (0 == strncmp(p->vals[i].key, &BUF(p)[start], len))
1078 return(&p->vals[i].value);
1079 }
1080 return(NULL);
1081 }
1082
1083 /*
1084 * Parse a key until the end of line, e.g., @clear foo\n, and return the
1085 * pointer to its value via valuequery().
1086 */
1087 static char **
valuelquery(struct texi * p,size_t * pos)1088 valuelquery(struct texi *p, size_t *pos)
1089 {
1090 size_t start, end;
1091 char **ret;
1092
1093 while (*pos < BUFSZ(p) && isws(BUF(p)[*pos]))
1094 advance(p, pos);
1095 if (*pos == BUFSZ(p))
1096 return(NULL);
1097 for (start = end = *pos; end < BUFSZ(p); end++)
1098 if ('\n' == BUF(p)[end])
1099 break;
1100 advanceto(p, pos, end);
1101 if (*pos < BUFSZ(p)) {
1102 assert('\n' == BUF(p)[*pos]);
1103 advance(p, pos);
1104 }
1105 if (NULL == (ret = valuequery(p, start, end)))
1106 return(NULL);
1107 return(ret);
1108 }
1109
/*
 * Unset the value of the key given on the rest of the line: the value
 * is freed and its slot set to NULL, while the key stays in the table.
 * Unknown keys are silently ignored.
 */
void
valuelclear(struct texi *p, size_t *pos)
{
	char	**slot;

	slot = valuelquery(p, pos);
	if (NULL != slot) {
		free(*slot);
		*slot = NULL;
	}
}
1120
/*
 * Look up the value of the key given on the rest of the line.
 * Returns NULL if the key is not known or if its value has been unset.
 */
const char *
valuellookup(struct texi *p, size_t *pos)
{
	char	**slot;

	slot = valuelquery(p, pos);
	return(NULL == slot ? NULL : *slot);
}
1130
1131 /*
1132 * Parse a key from a bracketed string, e.g., @value{foo}, and return
1133 * the pointer to its value.
1134 * If the returned pointer is NULL, either there was no string within
1135 * the brackets (or no brackets), or the value was not found, or the
1136 * value had previously been unset.
1137 */
1138 const char *
valueblookup(struct texi * p,size_t * pos)1139 valueblookup(struct texi *p, size_t *pos)
1140 {
1141 size_t start, end;
1142 char **ret;
1143
1144 while (*pos < BUFSZ(p) && isws(BUF(p)[*pos]))
1145 advance(p, pos);
1146 if (*pos == BUFSZ(p) || '{' != BUF(p)[*pos])
1147 return(NULL);
1148 advance(p, pos);
1149 for (start = end = *pos; end < BUFSZ(p); end++)
1150 if ('}' == BUF(p)[end])
1151 break;
1152 advanceto(p, pos, end);
1153 if (*pos < BUFSZ(p)) {
1154 assert('}' == BUF(p)[*pos]);
1155 advance(p, pos);
1156 }
1157 if (NULL == (ret = valuequery(p, start, end)))
1158 return(NULL);
1159 return(*ret);
1160 }
1161
1162 void
valueadd(struct texi * p,char * key,char * val)1163 valueadd(struct texi *p, char *key, char *val)
1164 {
1165 size_t i;
1166
1167 assert(NULL != key);
1168 assert(NULL != val);
1169
1170 for (i = 0; i < p->valsz; i++)
1171 if (0 == strcmp(p->vals[i].key, key))
1172 break;
1173
1174 if (i < p->valsz) {
1175 free(key);
1176 free(p->vals[i].value);
1177 p->vals[i].value = val;
1178 } else {
1179 /* FIXME: reallocarray() */
1180 p->vals = realloc(p->vals,
1181 (p->valsz + 1) *
1182 sizeof(struct texivalue));
1183 if (NULL == p->vals)
1184 texiabort(p, NULL);
1185 p->vals[p->valsz].key = key;
1186 p->vals[p->valsz].value = val;
1187 p->valsz++;
1188 }
1189 }
1190
1191 /*
1192 * Take the arguments to a macro, e.g., @foo{bar, baz, xyzzy} (or the
1193 * declaration form, @macro foo {arg1, ...}) and textually convert it to
1194 * an array of arguments of size "argsz".
1195 * These need to be freed individually and as a whole.
1196 * NOTE: this will puke on @, or @} macros, which can trick it into
1197 * stopping argument parsing earlier.
1198 * Ergo, textual: this doesn't interpret the arguments in any way.
1199 */
1200 char **
argparse(struct texi * p,size_t * pos,size_t * argsz,size_t hint)1201 argparse(struct texi *p, size_t *pos, size_t *argsz, size_t hint)
1202 {
1203 char **args;
1204 size_t start, end, stack;
1205
1206 while (*pos < BUFSZ(p) && isws(BUF(p)[*pos]))
1207 advance(p, pos);
1208
1209 args = NULL;
1210 *argsz = 0;
1211
1212 if ('{' != BUF(p)[*pos] && hint) {
1213 /*
1214 * Special case: if we encounter an unbracketed argument
1215 * and we're being invoked with non-zero arguments
1216 * (versus being set, i.e., hint>0), then parse until
1217 * the end of line.
1218 */
1219 *argsz = 1;
1220 args = calloc(1, sizeof(char *));
1221 if (NULL == args)
1222 texiabort(p, NULL);
1223 start = *pos;
1224 while (*pos < BUFSZ(p)) {
1225 if ('\n' == BUF(p)[*pos])
1226 break;
1227 advance(p, pos);
1228 }
1229 args[0] = malloc(*pos - start + 1);
1230 memcpy(args[0], &BUF(p)[start], *pos - start);
1231 args[0][*pos - start] = '\0';
1232 if (*pos < BUFSZ(p) && '\n' == BUF(p)[*pos])
1233 advance(p, pos);
1234 return(args);
1235 } else if ('{' != BUF(p)[*pos])
1236 return(args);
1237
1238 /* Parse til the closing '}', putting into the array. */
1239 advance(p, pos);
1240 while (*pos < BUFSZ(p)) {
1241 while (*pos < BUFSZ(p) && isws(BUF(p)[*pos]))
1242 advance(p, pos);
1243 start = *pos;
1244 stack = 0;
1245 while (*pos < BUFSZ(p)) {
1246 /*
1247 * According to the manual, commas within
1248 * embedded commands are escaped.
1249 * We keep track of embedded-ness in the "stack"
1250 * state anyway, so this is free.
1251 */
1252 if (',' == BUF(p)[*pos] && 0 == stack && 1 != hint)
1253 break;
1254 else if (0 == stack && '}' == BUF(p)[*pos])
1255 break;
1256 else if (0 != stack && '}' == BUF(p)[*pos])
1257 stack--;
1258 else if ('{' == BUF(p)[*pos])
1259 stack++;
1260 advance(p, pos);
1261 }
1262 if (stack)
1263 texiwarn(p, "unterminated macro "
1264 "in macro arguments");
1265 if ((end = *pos) == BUFSZ(p))
1266 break;
1267 /* Test for zero-length '{ }'. */
1268 if (start == end && '}' == BUF(p)[*pos] && 0 == *argsz)
1269 break;
1270 /* FIXME: use reallocarray. */
1271 args = realloc
1272 (args, sizeof(char *) *
1273 (*argsz + 1));
1274 if (NULL == args)
1275 texiabort(p, NULL);
1276 args[*argsz] = malloc(end - start + 1);
1277 if (NULL == args[*argsz])
1278 texiabort(p, NULL);
1279 memcpy(args[*argsz],
1280 &BUF(p)[start], end - start);
1281 args[*argsz][end - start] = '\0';
1282 (*argsz)++;
1283 if ('}' == BUF(p)[*pos])
1284 break;
1285 advance(p, pos);
1286 }
1287
1288 if (*pos == BUFSZ(p))
1289 texierr(p, "unterminated arguments");
1290 assert('}' == BUF(p)[*pos]);
1291 advance(p, pos);
1292 return(args);
1293 }
1294