1 /*
2 * input.c: read the source form
3 */
4
5 #include <stdio.h>
6 #include <assert.h>
7 #include <time.h>
8 #include "halibut.h"
9
10 #define TAB_STOP 8 /* for column number tracking */
11
/*
 * Point the input's current position at line 1 of the file called
 * `fname'. The column starts at 1 when column reporting is enabled
 * (in->reportcols) and is pinned to -1 otherwise.
 */
static void setpos(input *in, char *fname) {
    int startcol = -1;

    if (in->reportcols)
        startcol = 1;

    in->pos.col = startcol;
    in->pos.line = 1;
    in->pos.filename = fname;
}
17
/*
 * Push character `c', together with the position it came from, back
 * onto the input's pushback stack so that the next get() returns it.
 * The stack grows in steps of 16 entries beyond the current depth.
 */
static void unget(input *in, int c, filepos *pos) {
    int slot = in->npushback;

    if (slot >= in->pushbacksize) {
        in->pushbacksize = slot + 16;
        in->pushback = sresize(in->pushback, in->pushbacksize, pushback);
    }

    in->pushback[slot].pos = *pos;     /* structure copy */
    in->pushback[slot].chr = c;
    in->npushback = slot + 1;
}
27
28 /* ---------------------------------------------------------------------- */
29 /*
30 * Macro subsystem
31 */
32 typedef struct macro_Tag macro;
33 struct macro_Tag {
34 wchar_t *name, *text;
35 };
36 struct macrostack_Tag {
37 macrostack *next;
38 wchar_t *text;
39 int ptr, npushback;
40 filepos pos;
41 };
macrocmp(void * av,void * bv)42 static int macrocmp(void *av, void *bv) {
43 macro *a = (macro *)av, *b = (macro *)bv;
44 return ustrcmp(a->name, b->name);
45 }
/*
 * Define a macro called `name' expanding to `text'.
 *
 * Ownership of `name' and `text' passes to this function: on success
 * they are stored in the new tree entry (and released later by
 * macrocleanup()). If a macro of that name already exists, the error
 * is reported at `fpos' and everything allocated for the rejected
 * definition -- name, text and the record itself -- is freed.
 */
static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
                     filepos fpos) {
    macro *m = snew(macro);

    m->name = name;
    m->text = text;

    /* add234 hands back something other than `m' if it did not insert it */
    if (add234(macros, m) != m) {
        err_macroexists(&fpos, name);
        sfree(name);
        sfree(text);
        sfree(m);                      /* was leaked: `m' never entered the tree */
    }
}
/*
 * Look `name' up in the macro tree. If it is defined, push a new
 * expansion frame onto in->stack so that subsequent get() calls read
 * the macro's text, and return TRUE; otherwise leave the input
 * untouched and return FALSE.
 *
 * `pos' is recorded in the frame and is the position get() reports
 * for every character of the expansion.
 */
static int macrolookup(tree234 *macros, input *in, wchar_t *name,
                       filepos *pos) {
    macro key;
    macro *found;
    macrostack *frame;

    key.name = name;                   /* only the name is compared */
    found = find234(macros, &key, NULL);
    if (!found)
        return FALSE;

    frame = snew(macrostack);
    frame->text = found->text;
    frame->ptr = 0;
    frame->pos = *pos;                 /* structure copy */
    frame->npushback = in->npushback;
    frame->next = in->stack;
    in->stack = frame;
    return TRUE;
}
/*
 * Free every macro held in the tree (name, text and the record
 * itself), then free the tree.
 */
static void macrocleanup(tree234 *macros) {
    int idx = 0;
    macro *entry;

    while ((entry = (macro *)index234(macros, idx)) != NULL) {
        sfree(entry->name);
        sfree(entry->text);
        sfree(entry);
        idx++;
    }
    freetree234(macros);
}
84
/*
 * Apply a configuration paragraph to the input state. Only the
 * `input-charset' directive (matched with ustricmp) is acted on here:
 * it replaces in->charset with the result of charset_from_ustr() on
 * uadv(cfg->keyword) (presumably the string following the directive
 * name in the keyword list -- confirm against uadv()).
 */
static void input_configure(input *in, paragraph *cfg) {
    assert(cfg->type == para_Config);

    if (ustricmp(cfg->keyword, L"input-charset") != 0)
        return;                        /* not a directive we handle */

    in->charset = charset_from_ustr(&cfg->fpos, uadv(cfg->keyword));
}
92
93 /*
94 * Can return EOF
95 */
get(input * in,filepos * pos,rdstringc * rsc)96 static int get(input *in, filepos *pos, rdstringc *rsc) {
97 int pushbackpt = in->stack ? in->stack->npushback : 0;
98 if (in->npushback > pushbackpt) {
99 --in->npushback;
100 if (pos)
101 *pos = in->pushback[in->npushback].pos; /* structure copy */
102 return in->pushback[in->npushback].chr;
103 }
104 else if (in->stack) {
105 wchar_t c = in->stack->text[in->stack->ptr];
106 if (pos)
107 *pos = in->stack->pos;
108 if (in->stack->text[++in->stack->ptr] == L'\0') {
109 macrostack *tmp = in->stack;
110 in->stack = tmp->next;
111 sfree(tmp);
112 }
113 return c;
114 }
115 else if (in->currfp) {
116
117 while (in->wcpos >= in->nwc) {
118
119 int c = getc(in->currfp);
120
121 if (c == EOF) {
122 if (in->wantclose)
123 fclose(in->currfp);
124 in->currfp = NULL;
125 return EOF;
126 }
127
128 if (rsc)
129 rdaddc(rsc, c);
130
131 /* Track line numbers, for error reporting */
132 if (pos)
133 *pos = in->pos;
134 if (in->reportcols) {
135 switch (c) {
136 case '\t':
137 in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
138 break;
139 case '\n':
140 in->pos.col = 1;
141 in->pos.line++;
142 break;
143 default:
144 in->pos.col++;
145 break;
146 }
147 } else {
148 in->pos.col = -1;
149 if (c == '\n')
150 in->pos.line++;
151 }
152
153 /*
154 * Do input character set translation, so that we return
155 * Unicode.
156 */
157 {
158 char buf[1];
159 char const *p;
160 int inlen;
161
162 buf[0] = (char)c;
163 p = buf;
164 inlen = 1;
165
166 in->nwc = charset_to_unicode(&p, &inlen,
167 in->wc, lenof(in->wc),
168 in->charset, &in->csstate,
169 NULL, 0);
170 assert(p == buf+1 && inlen == 0);
171
172 in->wcpos = 0;
173 }
174 }
175
176 return in->wc[in->wcpos++];
177
178 } else
179 return EOF;
180 }
181
182 /*
183 * Lexical analysis of source files.
184 */
185 typedef struct token_Tag token;
186 struct token_Tag {
187 int type;
188 int cmd, aux;
189 wchar_t *text;
190 char *origtext;
191 filepos pos;
192 };
/* Token types (token.type). */
enum {
    tok_eof = 0,                       /* end of file */
    tok_eop,                           /* end of paragraph */
    tok_white,                         /* run of whitespace */
    tok_word,                          /* word, or fragment of one */
    tok_cmd,                           /* backslash command */
    tok_lbrace,                        /* open brace */
    tok_rbrace                         /* close brace */
};
202
/* Halibut command keywords (token.cmd when token.type is tok_cmd). */
enum {
    c__invalid = 0,                    /* unrecognised command */
    c__comment,                        /* \# comment */
    c__escaped,                        /* escaped literal character */
    c__nop,                            /* no-op */
    c__nbsp,                           /* nonbreaking space */
    c_A,                               /* appendix heading */
    c_B,                               /* bibliography entry */
    c_BR,                              /* bibliography rewrite */
    c_C,                               /* chapter heading */
    c_H,                               /* heading */
    c_I,                               /* invisible index mark */
    c_IM,                              /* index merge/rewrite */
    c_K,                               /* capitalised cross-reference */
    c_S,                               /* subsection; aux is 0, 1, 2, ... */
    c_U,                               /* unnumbered-chapter heading */
    c_W,                               /* Web hyperlink */
    c_b,                               /* bulleted list */
    c_c,                               /* code */
    c_cfg,                             /* configuration directive */
    c_copyright,                       /* copyright statement */
    c_cq,                              /* quoted code (sugar for \q{\cw{x}}) */
    c_cw,                              /* weak code */
    c_date,                            /* document processing date */
    c_dd,                              /* description list: description */
    c_define,                          /* macro definition */
    c_dt,                              /* description list: described thing */
    c_e,                               /* emphasis */
    c_i,                               /* visible index mark */
    c_ii,                              /* uncapitalised visible index mark */
    c_k,                               /* uncapitalised cross-reference */
    c_lcont,                           /* continuation para(s) for list item */
    c_n,                               /* numbered list */
    c_nocite,                          /* bibliography trickery */
    c_preamble,                        /* (obsolete) preamble text */
    c_q,                               /* quote marks */
    c_quote,                           /* block-quoted paragraphs */
    c_rule,                            /* horizontal rule */
    c_s,                               /* strong */
    c_title,                           /* document title */
    c_u,                               /* Unicode char; aux is the char code */
    c_versionid                        /* document RCS id */
};
247
/*
 * Character-class helpers. They compare against fixed code values, so
 * they classify only the ASCII repertoire; anything else (including
 * EOF) fails every test. Arguments are evaluated more than once.
 *
 * Perhaps whitespace should be defined in a more Unicode-friendly way?
 */
#define iswhite(c)  ( (c)==9 || (c)==10 || (c)==13 || (c)==32 )
#define isnl(c)     ( (c)==10 )
#define isdec(c)    ( (c)>='0' && (c)<='9' )
#define fromdec(c)  ( (c)-'0' )
#define ishex(c)    ( isdec(c) || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f') )
/* Letters are folded to upper case by clearing bit 5 before conversion. */
#define fromhex(c)  ( (c)>'9' ? ((c)&0xDF) - ('A'-10) : (c)-'0' )
#define iscmd(c)    ( isdec(c) || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z') )
256
/*
 * Keyword comparison: strcmp-like, but with a wchar_t string on the
 * left and a plain char string on the right. Returns the difference
 * between the first pair of characters that differ (the terminator
 * counts as a character), or 0 if the strings are equal.
 */
static int kwcmp(wchar_t const *p, char const *q) {
    for (;;) {
        int diff = *p - *q;

        if (diff != 0 || *p == 0 || *q == 0)
            return diff;
        p++;
        q++;
    }
}
268
269 /*
270 * Match a keyword.
271 */
match_kw(token * tok)272 static void match_kw(token *tok) {
273 /*
274 * FIXME. The ids are explicit in here so as to allow long-name
275 * equivalents to the various very short keywords.
276 */
277 static const struct { char const *name; int id; } keywords[] = {
278 {"#", c__comment}, /* comment command (\#) */
279 {"-", c__escaped}, /* nonbreaking hyphen */
280 {".", c__nop}, /* no-op */
281 {"A", c_A}, /* appendix heading */
282 {"B", c_B}, /* bibliography entry */
283 {"BR", c_BR}, /* bibliography rewrite */
284 {"C", c_C}, /* chapter heading */
285 {"H", c_H}, /* heading */
286 {"I", c_I}, /* invisible index mark */
287 {"IM", c_IM}, /* index merge/rewrite */
288 {"K", c_K}, /* capitalised cross-reference */
289 {"U", c_U}, /* unnumbered-chapter heading */
290 {"W", c_W}, /* Web hyperlink */
291 {"\\", c__escaped}, /* escaped backslash (\\) */
292 {"_", c__nbsp}, /* nonbreaking space (\_) */
293 {"b", c_b}, /* bulletted list */
294 {"c", c_c}, /* code */
295 {"cfg", c_cfg}, /* configuration directive */
296 {"copyright", c_copyright}, /* copyright statement */
297 {"cq", c_cq}, /* quoted code (sugar for \q{\cw{x}}) */
298 {"cw", c_cw}, /* weak code */
299 {"date", c_date}, /* document processing date */
300 {"dd", c_dd}, /* description list: description */
301 {"define", c_define}, /* macro definition */
302 {"dt", c_dt}, /* description list: described thing */
303 {"e", c_e}, /* emphasis */
304 {"i", c_i}, /* visible index mark */
305 {"ii", c_ii}, /* uncapitalised visible index mark */
306 {"k", c_k}, /* uncapitalised cross-reference */
307 {"lcont", c_lcont}, /* continuation para(s) for list item */
308 {"n", c_n}, /* numbered list */
309 {"nocite", c_nocite}, /* bibliography trickery */
310 {"preamble", c_preamble}, /* (obsolete) preamble text */
311 {"q", c_q}, /* quote marks */
312 {"quote", c_quote}, /* block-quoted paragraphs */
313 {"rule", c_rule}, /* horizontal rule */
314 {"s", c_s}, /* strong */
315 {"title", c_title}, /* document title */
316 {"versionid", c_versionid}, /* document RCS id */
317 {"{", c__escaped}, /* escaped lbrace (\{) */
318 {"}", c__escaped}, /* escaped rbrace (\}) */
319 };
320 int i, j, k, c;
321
322 /*
323 * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
324 * doesn't match correctly, we just fall through to the
325 * binary-search phase.
326 */
327 if (tok->text[0] == 'S') {
328 /* We expect numeric characters thereafter. */
329 wchar_t *p = tok->text+1;
330 int n;
331 if (!*p)
332 n = 1;
333 else {
334 n = 0;
335 while (*p && isdec(*p)) {
336 n = 10 * n + fromdec(*p);
337 p++;
338 }
339 }
340 if (!*p) {
341 tok->cmd = c_S;
342 tok->aux = n;
343 return;
344 }
345 } else if (tok->text[0] == 'u') {
346 /* We expect hex characters thereafter. */
347 wchar_t *p = tok->text+1;
348 int n = 0;
349 while (*p && ishex(*p)) {
350 n = 16 * n + fromhex(*p);
351 p++;
352 }
353 if (!*p) {
354 tok->cmd = c_u;
355 tok->aux = n;
356 return;
357 }
358 }
359
360 i = -1;
361 j = sizeof(keywords)/sizeof(*keywords);
362 while (j-i > 1) {
363 k = (i+j)/2;
364 c = kwcmp(tok->text, keywords[k].name);
365 if (c < 0)
366 j = k;
367 else if (c > 0)
368 i = k;
369 else /* c == 0 */ {
370 tok->cmd = keywords[k].id;
371 return;
372 }
373 }
374
375 tok->cmd = c__invalid;
376 }
377
378
379 /*
380 * Read a token from the input file, in the normal way (`normal' in
381 * the sense that code paragraphs work a different way).
382 */
get_token(input * in)383 token get_token(input *in) {
384 int c;
385 int nls;
386 int prevpos;
387 token ret;
388 rdstring rs = { 0, 0, NULL };
389 rdstringc rsc = { 0, 0, NULL };
390 filepos cpos;
391
392 ret.text = NULL; /* default */
393 ret.origtext = NULL; /* default */
394 if (in->pushback_chars) {
395 rdaddsc(&rsc, in->pushback_chars);
396 sfree(in->pushback_chars);
397 in->pushback_chars = NULL;
398 }
399 c = get(in, &cpos, &rsc);
400 ret.pos = cpos;
401 if (iswhite(c)) { /* tok_white or tok_eop */
402 nls = 0;
403 prevpos = 0;
404 do {
405 if (isnl(c))
406 nls++;
407 prevpos = rsc.pos;
408 } while ((c = get(in, &cpos, &rsc)) != EOF && iswhite(c));
409 if (c == EOF) {
410 ret.type = tok_eof;
411 sfree(rsc.text);
412 return ret;
413 }
414 if (rsc.text) {
415 in->pushback_chars = dupstr(rsc.text + prevpos);
416 sfree(rsc.text);
417 }
418 unget(in, c, &cpos);
419 ret.type = (nls > 1 ? tok_eop : tok_white);
420 return ret;
421 } else if (c == EOF) { /* tok_eof */
422 ret.type = tok_eof;
423 sfree(rsc.text);
424 return ret;
425 } else if (c == '\\') { /* tok_cmd */
426 rsc.pos = prevpos = 0;
427 c = get(in, &cpos, &rsc);
428 if (c == '-' || c == '\\' || c == '_' ||
429 c == '#' || c == '{' || c == '}' || c == '.') {
430 /* single-char command */
431 rdadd(&rs, c);
432 prevpos = rsc.pos;
433 } else if (c == 'u') {
434 int len = 0;
435 do {
436 rdadd(&rs, c);
437 len++;
438 prevpos = rsc.pos;
439 c = get(in, &cpos, &rsc);
440 } while (ishex(c) && len < 5);
441 unget(in, c, &cpos);
442 } else if (iscmd(c)) {
443 do {
444 rdadd(&rs, c);
445 prevpos = rsc.pos;
446 c = get(in, &cpos, &rsc);
447 } while (iscmd(c));
448 unget(in, c, &cpos);
449 }
450 /*
451 * Now match the command against the list of available
452 * ones.
453 */
454 ret.type = tok_cmd;
455 ret.text = ustrdup(rs.text);
456 if (rsc.text) {
457 in->pushback_chars = dupstr(rsc.text + prevpos);
458 rsc.text[prevpos] = '\0';
459 ret.origtext = dupstr(rsc.text);
460 } else {
461 ret.origtext = dupstr("");
462 }
463 match_kw(&ret);
464 sfree(rs.text);
465 sfree(rsc.text);
466 return ret;
467 } else if (c == '{') { /* tok_lbrace */
468 ret.type = tok_lbrace;
469 sfree(rsc.text);
470 return ret;
471 } else if (c == '}') { /* tok_rbrace */
472 ret.type = tok_rbrace;
473 sfree(rsc.text);
474 return ret;
475 } else { /* tok_word */
476 /*
477 * Read a word: the longest possible contiguous sequence of
478 * things other than whitespace, backslash, braces and
479 * hyphen. A hyphen terminates the word but is returned as
480 * part of it; everything else is pushed back for the next
481 * token. The `aux' field contains TRUE if the word ends in
482 * a hyphen.
483 */
484 ret.aux = FALSE; /* assumed for now */
485 prevpos = 0;
486 while (1) {
487 if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
488 /* Put back the character that caused termination */
489 unget(in, c, &cpos);
490 break;
491 } else {
492 rdadd(&rs, c);
493 if (c == '-') {
494 prevpos = rsc.pos;
495 ret.aux = TRUE;
496 break; /* hyphen terminates word */
497 }
498 }
499 prevpos = rsc.pos;
500 c = get(in, &cpos, &rsc);
501 }
502 ret.type = tok_word;
503 ret.text = ustrdup(rs.text);
504 if (rsc.text) {
505 in->pushback_chars = dupstr(rsc.text + prevpos);
506 rsc.text[prevpos] = '\0';
507 ret.origtext = dupstr(rsc.text);
508 } else {
509 ret.origtext = dupstr("");
510 }
511 sfree(rs.text);
512 sfree(rsc.text);
513 return ret;
514 }
515 }
516
517 /*
518 * Determine whether the next input character is an open brace (for
519 * telling code paragraphs from paragraphs which merely start with
520 * code).
521 */
isbrace(input * in)522 int isbrace(input *in) {
523 int c;
524 filepos cpos;
525
526 c = get(in, &cpos, NULL);
527 unget(in, c, &cpos);
528 return (c == '{');
529 }
530
531 /*
532 * Read the rest of a line that starts `\c'. Including nothing at
533 * all (tok_word with empty text).
534 */
get_codepar_token(input * in)535 token get_codepar_token(input *in) {
536 int c;
537 token ret;
538 rdstring rs = { 0, 0, NULL };
539 filepos cpos;
540
541 ret.type = tok_word;
542 ret.origtext = NULL;
543 c = get(in, &cpos, NULL); /* expect (and discard) one space */
544 ret.pos = cpos;
545 if (c == ' ') {
546 c = get(in, &cpos, NULL);
547 ret.pos = cpos;
548 }
549 while (!isnl(c) && c != EOF) {
550 int c2 = c;
551 c = get(in, &cpos, NULL);
552 /* Discard \r just before \n. */
553 if (c2 != 13 || !isnl(c))
554 rdadd(&rs, c2);
555 }
556 unget(in, c, &cpos);
557 ret.text = ustrdup(rs.text);
558 sfree(rs.text);
559 return ret;
560 }
561
562 /*
563 * Adds a new word to a linked list
564 */
addword(word newword,word *** hptrptr)565 static word *addword(word newword, word ***hptrptr) {
566 word *mnewword;
567 if (!hptrptr)
568 return NULL;
569 mnewword = snew(word);
570 *mnewword = newword; /* structure copy */
571 mnewword->next = NULL;
572 **hptrptr = mnewword;
573 *hptrptr = &mnewword->next;
574 return mnewword;
575 }
576
577 /*
578 * Adds a new paragraph to a linked list
579 */
addpara(paragraph newpara,paragraph *** hptrptr)580 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
581 paragraph *mnewpara = snew(paragraph);
582 *mnewpara = newpara; /* structure copy */
583 mnewpara->next = NULL;
584 **hptrptr = mnewpara;
585 *hptrptr = &mnewpara->next;
586 return mnewpara;
587 }
588
/*
 * Destructor before token is reassigned; should catch most memory
 * leaks. Frees the token's text and origtext; the pointers are not
 * reset, so the token must be reassigned before it is used again.
 *
 * The argument is parenthesised so that any token-valued expression
 * (e.g. `*p') can be passed, not just a plain variable name. It is
 * evaluated twice.
 */
#define dtor(t) ( sfree((t).text), sfree((t).origtext) )
594
595 /*
596 * Reads a single file (ie until get() returns EOF)
597 */
read_file(paragraph *** ret,input * in,indexdata * idx,tree234 * macros)598 static void read_file(paragraph ***ret, input *in, indexdata *idx,
599 tree234 *macros) {
600 token t;
601 paragraph par;
602 word wd, **whptr, **idximplicit;
603 wchar_t utext[2], *wdtext;
604 int style, spcstyle;
605 int already;
606 int iswhite, seenwhite;
607 int type;
608 int prev_para_type;
609 struct stack_item {
610 enum {
611 stack_nop = 0, /* do nothing (for error recovery) */
612 stack_ualt = 1, /* \u alternative */
613 stack_style = 2, /* \e, \c, \cw */
614 stack_idx = 4, /* \I, \i, \ii */
615 stack_hyper = 8, /* \W */
616 stack_quote = 16 /* \q */
617 } type;
618 word **whptr; /* to restore from \u alternatives */
619 word **idximplicit; /* to restore from \u alternatives */
620 filepos fpos;
621 int in_code;
622 } *sitem;
623 stack parsestk;
624 struct crossparaitem {
625 int type; /* currently c_lcont, c_quote or -1 */
626 int seen_lcont, seen_quote;
627 };
628 stack crossparastk;
629 word *indexword, *uword, *iword;
630 word *idxwordlist;
631 rdstring indexstr;
632 int index_downcase, index_visible, indexing;
633 const rdstring nullrs = { 0, 0, NULL };
634 wchar_t uchr;
635
636 t.text = NULL;
637 t.origtext = NULL;
638 already = FALSE;
639
640 crossparastk = stk_new();
641
642 /*
643 * Loop on each paragraph.
644 */
645 while (1) {
646 int start_cmd = c__invalid;
647 par.words = NULL;
648 par.keyword = NULL;
649 par.origkeyword = NULL;
650 whptr = &par.words;
651
652 /*
653 * Get a token.
654 */
655 do {
656 if (!already) {
657 dtor(t), t = get_token(in);
658 }
659 already = FALSE;
660 } while (t.type == tok_eop);
661 if (t.type == tok_eof)
662 break;
663
664 /*
665 * Parse code paragraphs separately.
666 */
667 if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
668 int wtype = word_WeakCode;
669
670 par.type = para_Code;
671 par.fpos = t.pos;
672 while (1) {
673 dtor(t), t = get_codepar_token(in);
674 wd.type = wtype;
675 wd.breaks = FALSE; /* shouldn't need this... */
676 wd.text = ustrdup(t.text);
677 wd.alt = NULL;
678 wd.fpos = t.pos;
679 addword(wd, &whptr);
680 dtor(t), t = get_token(in);
681 if (t.type == tok_white) {
682 /*
683 * The newline after a code-paragraph line
684 */
685 dtor(t), t = get_token(in);
686 }
687 if (t.type == tok_eop || t.type == tok_eof ||
688 t.type == tok_rbrace) { /* might be } terminating \lcont */
689 if (t.type == tok_rbrace)
690 already = TRUE;
691 break;
692 } else if (t.type == tok_cmd && t.cmd == c_c) {
693 wtype = word_WeakCode;
694 } else if (t.type == tok_cmd && t.cmd == c_e &&
695 wtype == word_WeakCode) {
696 wtype = word_Emph;
697 } else if (t.type == tok_cmd && t.cmd == c_s &&
698 wtype == word_WeakCode) {
699 wtype = word_Strong;
700 } else {
701 err_brokencodepara(&t.pos);
702 prev_para_type = par.type;
703 addpara(par, ret);
704 while (t.type != tok_eop) /* error recovery: */
705 dtor(t), t = get_token(in); /* eat rest of paragraph */
706 goto codeparabroken; /* ick, but such is life */
707 }
708 }
709 prev_para_type = par.type;
710 addpara(par, ret);
711 codeparabroken:
712 continue;
713 }
714
715 /*
716 * Spot the special commands that define a grouping of more
717 * than one paragraph, and also the closing braces that
718 * finish them.
719 */
720 if (t.type == tok_cmd &&
721 (t.cmd == c_lcont || t.cmd == c_quote)) {
722 struct crossparaitem *sitem, *stop;
723 int cmd = t.cmd;
724
725 /*
726 * Expect, and swallow, an open brace.
727 */
728 dtor(t), t = get_token(in);
729 if (t.type != tok_lbrace) {
730 err_explbr(&t.pos);
731 continue;
732 }
733
734 /*
735 * Also expect, and swallow, any whitespace after that
736 * (a newline before a code paragraph wouldn't be
737 * surprising).
738 */
739 do {
740 dtor(t), t = get_token(in);
741 } while (t.type == tok_white);
742 already = TRUE;
743
744 if (cmd == c_lcont) {
745 /*
746 * \lcont causes a continuation of a list item into
747 * multiple paragraphs (which may in turn contain
748 * nested lists, code paras etc). Hence, the previous
749 * paragraph must be of a list type.
750 */
751 sitem = snew(struct crossparaitem);
752 stop = (struct crossparaitem *)stk_top(crossparastk);
753 if (stop)
754 *sitem = *stop;
755 else
756 sitem->seen_quote = sitem->seen_lcont = 0;
757
758 if (prev_para_type == para_Bullet ||
759 prev_para_type == para_NumberedList ||
760 prev_para_type == para_Description) {
761 sitem->type = c_lcont;
762 sitem->seen_lcont = 1;
763 par.type = para_LcontPush;
764 prev_para_type = par.type;
765 addpara(par, ret);
766 } else {
767 /*
768 * Push a null item on the cross-para stack so that
769 * when we see the corresponding closing brace we
770 * don't give a cascade error.
771 */
772 sitem->type = -1;
773 err_misplacedlcont(&t.pos);
774 }
775 } else {
776 /*
777 * \quote causes a group of paragraphs to be
778 * block-quoted (typically they will be indented a
779 * bit).
780 */
781 sitem = snew(struct crossparaitem);
782 stop = (struct crossparaitem *)stk_top(crossparastk);
783 if (stop)
784 *sitem = *stop;
785 else
786 sitem->seen_quote = sitem->seen_lcont = 0;
787 sitem->type = c_quote;
788 sitem->seen_quote = 1;
789 par.type = para_QuotePush;
790 prev_para_type = par.type;
791 addpara(par, ret);
792 }
793 stk_push(crossparastk, sitem);
794 continue;
795 } else if (t.type == tok_rbrace) {
796 struct crossparaitem *sitem = stk_pop(crossparastk);
797 if (!sitem)
798 err_unexbrace(&t.pos);
799 else {
800 switch (sitem->type) {
801 case c_lcont:
802 par.type = para_LcontPop;
803 prev_para_type = par.type;
804 addpara(par, ret);
805 break;
806 case c_quote:
807 par.type = para_QuotePop;
808 prev_para_type = par.type;
809 addpara(par, ret);
810 break;
811 }
812 sfree(sitem);
813 }
814 continue;
815 }
816
817 while (t.type == tok_cmd &&
818 macrolookup(macros, in, t.text, &t.pos)) {
819 dtor(t), t = get_token(in);
820 }
821
822 /*
823 * This token begins a paragraph. See if it's one of the
824 * special commands that define a paragraph type.
825 *
826 * (note that \# is special in a way, and \nocite takes no
827 * text)
828 */
829 par.type = para_Normal;
830 if (t.type == tok_cmd) {
831 int needkw;
832 int is_macro = FALSE;
833
834 par.fpos = t.pos;
835 switch (t.cmd) {
836 default:
837 needkw = -1;
838 break;
839 case c__invalid:
840 err_badparatype(t.text, &t.pos);
841 needkw = 4;
842 break;
843 case c__comment:
844 if (isbrace(in)) {
845 needkw = -1;
846 break; /* `\#{': isn't a comment para */
847 }
848 do {
849 dtor(t), t = get_token(in);
850 } while (t.type != tok_eop && t.type != tok_eof);
851 continue; /* next paragraph */
852 /*
853 * `needkw' values:
854 *
855 * 1 -- exactly one keyword
856 * 2 -- at least one keyword
857 * 4 -- any number of keywords including zero
858 * 8 -- at least one keyword and then nothing else
859 * 16 -- nothing at all! no keywords, no body
860 * 32 -- no keywords at all
861 */
862 case c_A: needkw = 2; par.type = para_Appendix; break;
863 case c_B: needkw = 2; par.type = para_Biblio; break;
864 case c_BR: needkw = 1; par.type = para_BR;
865 start_cmd = c_BR; break;
866 case c_C: needkw = 2; par.type = para_Chapter; break;
867 case c_H: needkw = 2; par.type = para_Heading;
868 par.aux = 0;
869 break;
870 case c_IM: needkw = 2; par.type = para_IM;
871 start_cmd = c_IM; break;
872 case c_S: needkw = 2; par.type = para_Subsect;
873 par.aux = t.aux; break;
874 case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
875 /* For \b and \n the keyword is optional */
876 case c_b: needkw = 4; par.type = para_Bullet; break;
877 case c_dt: needkw = 4; par.type = para_DescribedThing; break;
878 case c_dd: needkw = 4; par.type = para_Description; break;
879 case c_n: needkw = 4; par.type = para_NumberedList; break;
880 case c_cfg: needkw = 8; par.type = para_Config;
881 start_cmd = c_cfg; break;
882 case c_copyright: needkw = 32; par.type = para_Copyright; break;
883 case c_define: is_macro = TRUE; needkw = 1; break;
884 /* For \nocite the keyword is _everything_ */
885 case c_nocite: needkw = 8; par.type = para_NoCite; break;
886 case c_preamble: needkw = 32; par.type = para_Normal; break;
887 case c_rule: needkw = 16; par.type = para_Rule; break;
888 case c_title: needkw = 32; par.type = para_Title; break;
889 case c_versionid: needkw = 32; par.type = para_VersionID; break;
890 }
891
892 if (par.type == para_Chapter ||
893 par.type == para_Heading ||
894 par.type == para_Subsect ||
895 par.type == para_Appendix ||
896 par.type == para_UnnumberedChapter) {
897 struct crossparaitem *sitem = stk_top(crossparastk);
898 if (sitem && (sitem->seen_lcont || sitem->seen_quote)) {
899 err_sectmarkerinblock( &t.pos,
900 (sitem->seen_lcont ? "lcont" : "quote"));
901 }
902 }
903
904 if (needkw > 0) {
905 rdstring rs = { 0, 0, NULL };
906 rdstringc rsc = { 0, 0, NULL };
907 int nkeys = 0;
908 filepos fp;
909
910 /* Get keywords. */
911 dtor(t), t = get_token(in);
912 fp = t.pos;
913 while (t.type == tok_lbrace ||
914 (t.type == tok_white && (needkw & 24))) {
915 /*
916 * In paragraph types which can't accept any
917 * body text (such as \cfg), we are lenient
918 * about whitespace between keywords. This is
919 * important for \cfg in particular since it
920 * can often have many keywords which are long
921 * pieces of text, so it's useful to permit the
922 * user to wrap the line between them.
923 */
924 if (t.type == tok_white) {
925 dtor(t), t = get_token(in); /* eat the space */
926 continue;
927 }
928 /* This is a keyword. */
929 nkeys++;
930 /* FIXME: there will be bugs if anyone specifies an
931 * empty keyword (\foo{}), so trap this case. */
932 while (dtor(t), t = get_token(in),
933 t.type == tok_word ||
934 t.type == tok_white ||
935 (t.type == tok_cmd && t.cmd == c__nbsp) ||
936 (t.type == tok_cmd && t.cmd == c__escaped) ||
937 (t.type == tok_cmd && t.cmd == c_u)) {
938 if (t.type == tok_white ||
939 (t.type == tok_cmd && t.cmd == c__nbsp)) {
940 rdadd(&rs, ' ');
941 rdaddc(&rsc, ' ');
942 } else if (t.type == tok_cmd && t.cmd == c_u) {
943 rdadd(&rs, t.aux);
944 rdaddc(&rsc, '\\');
945 rdaddsc(&rsc, t.origtext);
946 } else {
947 rdadds(&rs, t.text);
948 rdaddsc(&rsc, t.origtext);
949 }
950 }
951 if (t.type != tok_rbrace) {
952 err_kwunclosed(&t.pos);
953 continue;
954 }
955 rdadd(&rs, 0); /* add string terminator */
956 rdaddc(&rsc, 0); /* add string terminator */
957 dtor(t), t = get_token(in); /* eat right brace */
958 }
959
960 rdadd(&rs, 0); /* add string terminator */
961 rdaddc(&rsc, 0); /* add string terminator */
962
963 /* See whether we have the right number of keywords. */
964 if ((needkw & 48) && nkeys > 0)
965 err_kwillegal(&fp);
966 if ((needkw & 11) && nkeys == 0)
967 err_kwexpected(&fp);
968 if ((needkw & 5) && nkeys > 1)
969 err_kwtoomany(&fp);
970
971 if (is_macro) {
972 /*
973 * Macro definition. Get the rest of the line
974 * as a code-paragraph token, repeatedly until
975 * there's nothing more left of it. Separate
976 * with newlines.
977 */
978 rdstring macrotext = { 0, 0, NULL };
979 while (1) {
980 dtor(t), t = get_codepar_token(in);
981 if (macrotext.pos > 0)
982 rdadd(¯otext, L'\n');
983 rdadds(¯otext, t.text);
984 dtor(t), t = get_token(in);
985 if (t.type == tok_eop || t.type == tok_eof)
986 break;
987 }
988 macrodef(macros, rs.text, macrotext.text, fp);
989 continue; /* next paragraph */
990 }
991
992 par.keyword = rdtrim(&rs);
993 par.origkeyword = rdtrimc(&rsc);
994
995 /* Move to EOP in case of needkw==8 or 16 (no body) */
996 if (needkw & 24) {
997 /* We allow whitespace even when we expect no para body */
998 while (t.type == tok_white)
999 dtor(t), t = get_token(in);
1000 if (t.type != tok_eop && t.type != tok_eof &&
1001 (start_cmd == c__invalid ||
1002 t.type != tok_cmd || t.cmd != start_cmd)) {
1003 err_bodyillegal(&t.pos);
1004 /* Error recovery: eat the rest of the paragraph */
1005 while (t.type != tok_eop && t.type != tok_eof &&
1006 (start_cmd == c__invalid ||
1007 t.type != tok_cmd || t.cmd != start_cmd))
1008 dtor(t), t = get_token(in);
1009 }
1010 if (t.type == tok_cmd)
1011 already = TRUE;/* inhibit get_token at top of loop */
1012 prev_para_type = par.type;
1013 addpara(par, ret);
1014
1015 if (par.type == para_Config) {
1016 input_configure(in, &par);
1017 }
1018 continue; /* next paragraph */
1019 }
1020 }
1021 }
1022
1023 /*
1024 * Now read the actual paragraph, word by word, adding to
1025 * the paragraph list.
1026 *
1027 * Mid-paragraph commands:
1028 *
1029 * \K \k
1030 * \c \cw \cq
1031 * \e
1032 * \i \ii
1033 * \I
1034 * \q
1035 * \u
1036 * \W
1037 * \date
1038 * \\ \{ \}
1039 */
1040 parsestk = stk_new();
1041 style = word_Normal;
1042 spcstyle = word_WhiteSpace;
1043 indexing = FALSE;
1044 seenwhite = TRUE;
1045 while (t.type != tok_eop && t.type != tok_eof) {
1046 iswhite = FALSE;
1047 already = FALSE;
1048
1049 /* Handle implicit paragraph breaks after \IM, \BR etc */
1050 if (start_cmd != c__invalid &&
1051 t.type == tok_cmd && t.cmd == start_cmd) {
1052 already = TRUE; /* inhibit get_token at top of loop */
1053 break;
1054 }
1055
1056 if (t.type == tok_cmd && t.cmd == c__nop) {
1057 dtor(t), t = get_token(in);
1058 continue; /* do nothing! */
1059 }
1060
1061 if (t.type == tok_cmd && t.cmd == c__escaped) {
1062 t.type = tok_word; /* nice and simple */
1063 t.aux = 0; /* even if `\-' - nonbreaking! */
1064 }
1065 if (t.type == tok_cmd && t.cmd == c__nbsp) {
1066 t.type = tok_word; /* nice and simple */
1067 sfree(t.text);
1068 t.text = ustrdup(L" "); /* text is ` ' not `_' */
1069 t.aux = 0; /* (nonbreaking) */
1070 }
1071 switch (t.type) {
1072 case tok_white:
1073 if (whptr == &par.words)
1074 break; /* strip whitespace at start of para */
1075 wd.text = NULL;
1076 wd.type = spcstyle;
1077 wd.alt = NULL;
1078 wd.aux = 0;
1079 wd.fpos = t.pos;
1080 wd.breaks = FALSE;
1081
1082 /*
1083 * Inhibit use of whitespace if it's (probably the
1084 * newline) before a repeat \IM / \BR type
1085 * directive.
1086 */
1087 if (start_cmd != c__invalid) {
1088 dtor(t), t = get_token(in);
1089 already = TRUE;
1090 if (t.type == tok_cmd && t.cmd == start_cmd)
1091 break;
1092 }
1093
1094 if (indexing)
1095 rdadd(&indexstr, ' ');
1096 if (!indexing || index_visible)
1097 addword(wd, &whptr);
1098 if (indexing)
1099 addword(wd, &idximplicit);
1100 iswhite = TRUE;
1101 break;
1102 case tok_word:
1103 if (indexing)
1104 rdadds(&indexstr, t.text);
1105 wd.type = style;
1106 wd.alt = NULL;
1107 wd.aux = 0;
1108 wd.fpos = t.pos;
1109 wd.breaks = t.aux;
1110 if (!indexing || index_visible) {
1111 wd.text = ustrdup(t.text);
1112 addword(wd, &whptr);
1113 }
1114 if (indexing) {
1115 wd.text = ustrdup(t.text);
1116 addword(wd, &idximplicit);
1117 }
1118 break;
1119 case tok_lbrace:
1120 err_unexbrace(&t.pos);
1121 /* Error recovery: push nop */
1122 sitem = snew(struct stack_item);
1123 sitem->type = stack_nop;
1124 sitem->fpos = t.pos;
1125 stk_push(parsestk, sitem);
1126 break;
1127 case tok_rbrace:
1128 sitem = stk_pop(parsestk);
1129 if (!sitem) {
1130 /*
1131 * This closing brace could have been an
1132 * indication that the cross-paragraph stack
1133 * wants popping. Accordingly, we treat it here
1134 * as an indication that the paragraph is over.
1135 */
1136 already = TRUE;
1137 goto finished_para;
1138 } else {
1139 if (sitem->type & stack_ualt) {
1140 whptr = sitem->whptr;
1141 idximplicit = sitem->idximplicit;
1142 }
1143 if (sitem->type & stack_style) {
1144 style = word_Normal;
1145 spcstyle = word_WhiteSpace;
1146 }
1147 if (sitem->type & stack_idx) {
1148 indexword->text = ustrdup(indexstr.text);
1149 if (index_downcase) {
1150 word *w;
1151
1152 ustrlow(indexword->text);
1153 ustrlow(indexstr.text);
1154
1155 for (w = idxwordlist; w; w = w->next)
1156 if (w->text)
1157 ustrlow(w->text);
1158 }
1159 indexing = FALSE;
1160 rdadd(&indexstr, L'\0');
1161 index_merge(idx, FALSE, indexstr.text,
1162 idxwordlist, &sitem->fpos);
1163 sfree(indexstr.text);
1164 }
1165 if (sitem->type & stack_hyper) {
1166 wd.text = NULL;
1167 wd.type = word_HyperEnd;
1168 wd.alt = NULL;
1169 wd.aux = 0;
1170 wd.fpos = t.pos;
1171 wd.breaks = FALSE;
1172 if (!indexing || index_visible)
1173 addword(wd, &whptr);
1174 if (indexing)
1175 addword(wd, &idximplicit);
1176 }
1177 if (sitem->type & stack_quote) {
1178 wd.text = NULL;
1179 wd.type = toquotestyle(style);
1180 wd.alt = NULL;
1181 wd.aux = quote_Close;
1182 wd.fpos = t.pos;
1183 wd.breaks = FALSE;
1184 if (!indexing || index_visible)
1185 addword(wd, &whptr);
1186 if (indexing) {
1187 rdadd(&indexstr, L'"');
1188 addword(wd, &idximplicit);
1189 }
1190 }
1191 }
1192 sfree(sitem);
1193 break;
1194 case tok_cmd:
1195 switch (t.cmd) {
1196 case c__comment:
1197 /*
1198 * In-paragraph comment: \#{ balanced braces }
1199 *
1200 * Anything goes here; even tok_eop. We should
1201 * eat whitespace after the close brace _if_
1202 * there was whitespace before the \#.
1203 */
1204 dtor(t), t = get_token(in);
1205 if (t.type != tok_lbrace) {
1206 err_explbr(&t.pos);
1207 } else {
1208 int braces = 1;
1209 while (braces > 0) {
1210 dtor(t), t = get_token(in);
1211 if (t.type == tok_lbrace)
1212 braces++;
1213 else if (t.type == tok_rbrace)
1214 braces--;
1215 else if (t.type == tok_eof) {
1216 err_commenteof(&t.pos);
1217 break;
1218 }
1219 }
1220 }
1221 if (seenwhite) {
1222 already = TRUE;
1223 dtor(t), t = get_token(in);
1224 if (t.type == tok_white) {
1225 iswhite = TRUE;
1226 already = FALSE;
1227 }
1228 }
1229 break;
1230 case c_q:
1231 case c_cq:
1232 type = t.cmd;
1233 dtor(t), t = get_token(in);
1234 if (t.type != tok_lbrace) {
1235 err_explbr(&t.pos);
1236 } else {
1237 /*
1238 * Enforce that \q may not be used anywhere
1239 * within \c. (It shouldn't be necessary
1240 * since the whole point of \c should be
1241 * that the user wants to exercise exact
1242 * control over the glyphs used, and
1243 * forbidding it has the useful effect of
1244 * relieving some backends of having to
1245 * make difficult decisions.)
1246 */
1247 int stype;
1248
1249 if (style != word_Code && style != word_WeakCode) {
1250 wd.text = NULL;
1251 wd.type = toquotestyle(style);
1252 wd.alt = NULL;
1253 wd.aux = quote_Open;
1254 wd.fpos = t.pos;
1255 wd.breaks = FALSE;
1256 if (!indexing || index_visible)
1257 addword(wd, &whptr);
1258 if (indexing) {
1259 rdadd(&indexstr, L'"');
1260 addword(wd, &idximplicit);
1261 }
1262 stype = stack_quote;
1263 } else {
1264 err_codequote(&t.pos);
1265 stype = stack_nop;
1266 }
1267 sitem = snew(struct stack_item);
1268 sitem->fpos = t.pos;
1269 sitem->type = stype;
1270 if (type == c_cq) {
1271 if (style != word_Normal) {
1272 err_nestedstyles(&t.pos);
1273 } else {
1274 style = word_WeakCode;
1275 spcstyle = tospacestyle(style);
1276 sitem->type |= stack_style;
1277 }
1278 }
1279 stk_push(parsestk, sitem);
1280 }
1281 break;
1282 case c_K:
1283 case c_k:
1284 case c_W:
1285 case c_date:
1286 /*
1287 * Keyword, hyperlink, or \date. We expect a
1288 * left brace, some text, and then a right
1289 * brace. No nesting; no arguments.
1290 */
1291 wd.fpos = t.pos;
1292 wd.breaks = FALSE;
1293 if (t.cmd == c_K)
1294 wd.type = word_UpperXref;
1295 else if (t.cmd == c_k)
1296 wd.type = word_LowerXref;
1297 else if (t.cmd == c_W)
1298 wd.type = word_HyperLink;
1299 else
1300 wd.type = word_Normal;
1301 dtor(t), t = get_token(in);
1302 if (t.type != tok_lbrace) {
1303 if (wd.type == word_Normal) {
1304 time_t thetime = time(NULL);
1305 struct tm *broken = localtime(&thetime);
1306 already = TRUE;
1307 wdtext = ustrftime(NULL, broken);
1308 wd.type = style;
1309 } else {
1310 err_explbr(&t.pos);
1311 wdtext = NULL;
1312 }
1313 } else {
1314 rdstring rs = { 0, 0, NULL };
1315 while (dtor(t), t = get_token(in),
1316 t.type == tok_word || t.type == tok_white) {
1317 if (t.type == tok_white)
1318 rdadd(&rs, ' ');
1319 else
1320 rdadds(&rs, t.text);
1321 }
1322 if (wd.type == word_Normal) {
1323 time_t thetime = time(NULL);
1324 struct tm *broken = localtime(&thetime);
1325 wdtext = ustrftime(rs.text, broken);
1326 wd.type = style;
1327 } else {
1328 wdtext = ustrdup(rs.text);
1329 }
1330 sfree(rs.text);
1331 if (t.type != tok_rbrace) {
1332 err_kwexprbr(&t.pos);
1333 }
1334 }
1335 wd.alt = NULL;
1336 wd.aux = 0;
1337 if (!indexing || index_visible) {
1338 wd.text = ustrdup(wdtext);
1339 addword(wd, &whptr);
1340 }
1341 if (indexing) {
1342 wd.text = ustrdup(wdtext);
1343 addword(wd, &idximplicit);
1344 }
1345 sfree(wdtext);
1346 if (wd.type == word_HyperLink) {
1347 /*
1348 * Hyperlinks are different: they then
1349 * expect another left brace, to begin
1350 * delimiting the text marked by the link.
1351 */
1352 dtor(t), t = get_token(in);
1353 sitem = snew(struct stack_item);
1354 sitem->fpos = wd.fpos;
1355 sitem->type = stack_hyper;
1356 /*
1357 * Special cases: \W{}\i, \W{}\ii
1358 */
1359 if (t.type == tok_cmd &&
1360 (t.cmd == c_i || t.cmd == c_ii)) {
1361 if (indexing) {
1362 err_nestedindex(&t.pos);
1363 } else {
1364 /* Add an index-reference word with no
1365 * text as yet */
1366 wd.type = word_IndexRef;
1367 wd.text = NULL;
1368 wd.alt = NULL;
1369 wd.aux = 0;
1370 wd.breaks = FALSE;
1371 indexword = addword(wd, &whptr);
1372 /* Set up a rdstring to read the
1373 * index text */
1374 indexstr = nullrs;
1375 /* Flags so that we do the Right
1376 * Things with text */
1377 index_visible = (type != c_I);
1378 index_downcase = (type == c_ii);
1379 indexing = TRUE;
1380 idxwordlist = NULL;
1381 idximplicit = &idxwordlist;
1382
1383 sitem->type |= stack_idx;
1384 }
1385 dtor(t), t = get_token(in);
1386 }
1387 /*
1388 * Special cases: \W{}\c, \W{}\e, \W{}\s, \W{}\cw
1389 */
1390 if (t.type == tok_cmd &&
1391 (t.cmd == c_e || t.cmd == c_s ||
1392 t.cmd == c_c || t.cmd == c_cw)) {
1393 if (style != word_Normal)
1394 err_nestedstyles(&t.pos);
1395 else {
1396 style = (t.cmd == c_c ? word_Code :
1397 t.cmd == c_cw ? word_WeakCode :
1398 t.cmd == c_s ? word_Strong :
1399 word_Emph);
1400 spcstyle = tospacestyle(style);
1401 sitem->type |= stack_style;
1402 }
1403 dtor(t), t = get_token(in);
1404 }
1405 if (t.type != tok_lbrace) {
1406 err_explbr(&t.pos);
1407 sfree(sitem);
1408 } else {
1409 stk_push(parsestk, sitem);
1410 }
1411 }
1412 break;
1413 case c_c:
1414 case c_cw:
1415 case c_e:
1416 case c_s:
1417 type = t.cmd;
1418 if (style != word_Normal) {
1419 err_nestedstyles(&t.pos);
1420 /* Error recovery: eat lbrace, push nop. */
1421 dtor(t), t = get_token(in);
1422 sitem = snew(struct stack_item);
1423 sitem->fpos = t.pos;
1424 sitem->type = stack_nop;
1425 stk_push(parsestk, sitem);
1426 }
1427 dtor(t), t = get_token(in);
1428 if (t.type != tok_lbrace) {
1429 err_explbr(&t.pos);
1430 } else {
1431 style = (type == c_c ? word_Code :
1432 type == c_cw ? word_WeakCode :
1433 type == c_s ? word_Strong :
1434 word_Emph);
1435 spcstyle = tospacestyle(style);
1436 sitem = snew(struct stack_item);
1437 sitem->fpos = t.pos;
1438 sitem->type = stack_style;
1439 stk_push(parsestk, sitem);
1440 }
1441 break;
1442 case c_i:
1443 case c_ii:
1444 case c_I:
1445 type = t.cmd;
1446 if (indexing) {
1447 err_nestedindex(&t.pos);
1448 /* Error recovery: eat lbrace, push nop. */
1449 dtor(t), t = get_token(in);
1450 sitem = snew(struct stack_item);
1451 sitem->fpos = t.pos;
1452 sitem->type = stack_nop;
1453 stk_push(parsestk, sitem);
1454 }
1455 sitem = snew(struct stack_item);
1456 sitem->fpos = t.pos;
1457 sitem->type = stack_idx;
1458 dtor(t), t = get_token(in);
1459 /*
1460 * Special cases: \i\c, \i\e, \i\s, \i\cw
1461 */
1462 wd.fpos = t.pos;
1463 if (t.type == tok_cmd &&
1464 (t.cmd == c_e || t.cmd == c_s ||
1465 t.cmd == c_c || t.cmd == c_cw)) {
1466 if (style != word_Normal)
1467 err_nestedstyles(&t.pos);
1468 else {
1469 style = (t.cmd == c_c ? word_Code :
1470 t.cmd == c_cw ? word_WeakCode :
1471 t.cmd == c_s ? word_Strong :
1472 word_Emph);
1473 spcstyle = tospacestyle(style);
1474 sitem->type |= stack_style;
1475 }
1476 dtor(t), t = get_token(in);
1477 }
1478 if (t.type != tok_lbrace) {
1479 sfree(sitem);
1480 err_explbr(&t.pos);
1481 } else {
1482 /* Add an index-reference word with no text as yet */
1483 wd.type = word_IndexRef;
1484 wd.text = NULL;
1485 wd.alt = NULL;
1486 wd.aux = 0;
1487 wd.breaks = FALSE;
1488 indexword = addword(wd, &whptr);
1489 /* Set up a rdstring to read the index text */
1490 indexstr = nullrs;
1491 /* Flags so that we do the Right Things with text */
1492 index_visible = (type != c_I);
1493 index_downcase = (type == c_ii);
1494 indexing = TRUE;
1495 idxwordlist = NULL;
1496 idximplicit = &idxwordlist;
1497 /* Stack item to close the indexing on exit */
1498 stk_push(parsestk, sitem);
1499 }
1500 break;
1501 case c_u:
1502 uchr = t.aux;
1503 utext[0] = uchr; utext[1] = 0;
1504 wd.type = style;
1505 wd.breaks = FALSE;
1506 wd.alt = NULL;
1507 wd.aux = 0;
1508 wd.fpos = t.pos;
1509 if (!indexing || index_visible) {
1510 wd.text = ustrdup(utext);
1511 uword = addword(wd, &whptr);
1512 } else
1513 uword = NULL;
1514 if (indexing) {
1515 wd.text = ustrdup(utext);
1516 iword = addword(wd, &idximplicit);
1517 } else
1518 iword = NULL;
1519 dtor(t), t = get_token(in);
1520 if (t.type == tok_lbrace) {
1521 /*
1522 * \u with a left brace. Until the brace
1523 * closes, all further words go on a
1524 * sidetrack from the main thread of the
1525 * paragraph.
1526 */
1527 sitem = snew(struct stack_item);
1528 sitem->fpos = t.pos;
1529 sitem->type = stack_ualt;
1530 sitem->whptr = whptr;
1531 sitem->idximplicit = idximplicit;
1532 stk_push(parsestk, sitem);
1533 whptr = uword ? &uword->alt : NULL;
1534 idximplicit = iword ? &iword->alt : NULL;
1535 } else {
1536 if (indexing)
1537 rdadd(&indexstr, uchr);
1538 already = TRUE;
1539 }
1540 break;
1541 default:
1542 if (!macrolookup(macros, in, t.text, &t.pos))
1543 err_badmidcmd(t.text, &t.pos);
1544 break;
1545 }
1546 }
1547 if (!already)
1548 dtor(t), t = get_token(in);
1549 seenwhite = iswhite;
1550 }
1551 finished_para:
1552 /* Check the stack is empty */
1553 if (stk_top(parsestk)) {
1554 while ((sitem = stk_pop(parsestk)))
1555 sfree(sitem);
1556 err_missingrbrace(&t.pos);
1557 }
1558 stk_free(parsestk);
1559 prev_para_type = par.type;
1560 /*
1561 * Before we add the paragraph to the output list, we
1562 * should check that there was any text in it at all; there
1563 * might not be if (for example) the paragraph contained
1564 * nothing but an unrecognised command sequence, and if we
1565 * put an empty paragraph on the list it may confuse the
1566 * back ends later on.
1567 */
1568 if (par.words) {
1569 addpara(par, ret);
1570 }
1571 if (t.type == tok_eof)
1572 already = TRUE;
1573 }
1574
1575 if (stk_top(crossparastk)) {
1576 void *p;
1577
1578 err_missingrbrace2(&t.pos);
1579 while ((p = stk_pop(crossparastk)))
1580 sfree(p);
1581 }
1582
1583 /*
1584 * We break to here rather than returning, because otherwise
1585 * this cleanup doesn't happen.
1586 */
1587 dtor(t);
1588
1589 stk_free(crossparastk);
1590 }
1591
/*
 * Table of file signatures recognised by read_input().
 *
 * read_input() reads the first few bytes of each named input file and
 * walks this table in order; the first entry whose `magic' bytes match
 * the start of the file is used, so the order of entries matters. A
 * file matching no entry is treated as ordinary Halibut source.
 */
struct {
    char const *magic;                 /* bytes expected at start of file */
    size_t nmagic;                     /* number of bytes of `magic' to compare */
    int binary;                        /* TRUE: keep the file open in binary
                                        * mode; FALSE: reopen in text mode */
    void (*reader)(input *);           /* function called to read the file */
} magics[] = {
    { "%!FontType1-", 12, FALSE, &read_pfa_file },
    { "%!PS-AdobeFont-", 15, FALSE, &read_pfa_file },
    { "\x80\x01", 2, TRUE, &read_pfb_file },
    { "StartFontMetrics", 16, FALSE, &read_afm_file },
    /* This signature contains NUL bytes, which is why an explicit
     * length is stored rather than relying on strlen(). */
    { "\x00\x01\x00\x00", 4, TRUE, &read_sfnt_file },
    { "true", 4, TRUE, &read_sfnt_file },
};
1605
read_input(input * in,indexdata * idx)1606 paragraph *read_input(input *in, indexdata *idx) {
1607 paragraph *head = NULL;
1608 paragraph **hptr = &head;
1609 tree234 *macros;
1610 char mag[16];
1611 size_t len, i;
1612 int binary;
1613 void (*reader)(input *);
1614
1615 macros = newtree234(macrocmp);
1616
1617 while (in->currindex < in->nfiles) {
1618 setpos(in, in->filenames[in->currindex]);
1619 in->charset = in->defcharset;
1620 in->csstate = charset_init_state;
1621 in->wcpos = in->nwc = 0;
1622 in->pushback_chars = NULL;
1623
1624 if (!in->filenames[in->currindex]) {
1625 in->currfp = stdin;
1626 in->wantclose = FALSE; /* don't fclose stdin */
1627 /*
1628 * When reading standard input, we always expect to see
1629 * an actual Halibut file and not any of the unusual
1630 * input types like fonts.
1631 */
1632 reader = NULL;
1633 } else {
1634 /*
1635 * Open the file in binary mode to look for magic
1636 * numbers. We'll switch to text mode if we find we're
1637 * looking at a text file type.
1638 */
1639 in->currfp = fopen(in->filenames[in->currindex], "rb");
1640 binary = FALSE; /* default to Halibut source, which is text */
1641 if (in->currfp) {
1642 in->wantclose = TRUE;
1643 reader = NULL;
1644 len = fread(mag, 1, sizeof(mag), in->currfp);
1645 for (i = 0; i < lenof(magics); i++) {
1646 if (len >= magics[i].nmagic &&
1647 memcmp(mag, magics[i].magic, magics[i].nmagic) == 0) {
1648 reader = magics[i].reader;
1649 binary = magics[i].binary;
1650 break;
1651 }
1652 }
1653 rewind(in->currfp);
1654 }
1655 if (!binary) {
1656 if (in->currfp)
1657 fclose(in->currfp);
1658 in->currfp = fopen(in->filenames[in->currindex], "r");
1659 }
1660 }
1661 if (in->currfp) {
1662 if (reader == NULL) {
1663 read_file(&hptr, in, idx, macros);
1664 } else {
1665 (*reader)(in);
1666 }
1667 } else {
1668 err_cantopen(in->filenames[in->currindex]);
1669 }
1670 in->currindex++;
1671 }
1672
1673 macrocleanup(macros);
1674
1675 return head;
1676 }
1677