1% textoken.w
2%
3% Copyright 2006-2011 Taco Hoekwater <taco@@luatex.org>
4%
5% This file is part of LuaTeX.
6%
7% LuaTeX is free software; you can redistribute it and/or modify it under
8% the terms of the GNU General Public License as published by the Free
9% Software Foundation; either version 2 of the License, or (at your
10% option) any later version.
11%
12% LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14% FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15% License for more details.
16%
17% You should have received a copy of the GNU General Public License along
18% with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
19
20@ @c
21
22
23#include "ptexlib.h"
24
25@ @c
/* Shorthands for the integer parameters consulted in this file. */
#define pausing int_par(pausing_code)
#define cat_code_table int_par(cat_code_table_code)
#define tracing_nesting int_par(tracing_nesting_code)
#define suppress_outer_error int_par(suppress_outer_error_code)
#define suppress_mathpar_error int_par(suppress_mathpar_error_code)

/* Shorthands for |eqtb| equivalents. */
#define every_eof equiv(every_eof_loc)
#define box(A) equiv(box_base+(A))

/* Nonzero when the current line's catcode table is |NO_CAT_TABLE|. */
#define detokenized_line() (line_catcode_table==NO_CAT_TABLE)

/* Store in |a| the category code of character |b|.  The table attached to
   the current line is used unless it is |DEFAULT_CAT_TABLE|, in which case
   the table selected by the |cat_code_table| parameter is used instead.
   Both arguments are parenthesized so that expression arguments expand
   safely. */
#define do_get_cat_code(a,b) do {                                       \
    if (line_catcode_table!=DEFAULT_CAT_TABLE)                          \
      (a)=get_cat_code(line_catcode_table,(b));                         \
    else                                                                \
      (a)=get_cat_code(cat_code_table,(b));                             \
  } while (0)
44
45
46@ The \TeX\ system does nearly all of its own memory allocation, so that it
47can readily be transported into environments that do not have automatic
48facilities for strings, garbage collection, etc., and so that it can be in
49control of what error messages the user receives. The dynamic storage
50requirements of \TeX\ are handled by providing two large arrays called
51|fixmem| and |varmem| in which consecutive blocks of words are used as
52nodes by the \TeX\ routines.
53
54Pointer variables are indices into this array, or into another array
55called |eqtb| that will be explained later. A pointer variable might
56also be a special flag that lies outside the bounds of |mem|, so we
57allow pointers to assume any |halfword| value. The minimum halfword
58value represents a null pointer. \TeX\ does not assume that |mem[null]| exists.
59
60
61
62@ Locations in |fixmem| are used for storing one-word records; a conventional
63\.{AVAIL} stack is used for allocation in this array.
64
65@c
66smemory_word *fixmem;           /* the big dynamic storage area */
67unsigned fix_mem_min;           /* the smallest location of one-word memory in use */
68unsigned fix_mem_max;           /* the largest location of one-word memory in use */
69
70
71@ In order to study the memory requirements of particular applications, it
72is possible to prepare a version of \TeX\ that keeps track of current and
73maximum memory usage. When code between the delimiters |@!stat| $\ldots$
74|tats| is not ``commented out,'' \TeX\ will run a bit slower but it will
75report these statistics when |tracing_stats| is sufficiently large.
76
77@c
78int var_used, dyn_used;         /* how much memory is in use */
79
80halfword avail;                 /* head of the list of available one-word nodes */
81unsigned fix_mem_end;           /* the last one-word node used in |mem| */
82
83halfword garbage;               /* head of a junk list, write only */
84halfword temp_token_head;       /* head of a temporary list of some kind */
85halfword hold_token_head;       /* head of a temporary list of another kind */
86halfword omit_template;         /* a constant token list */
87halfword null_list;             /* permanently empty list */
88halfword backup_head;           /* head of token list built by |scan_keyword| */
89
90@ @c
91void initialize_tokens(void)
92{
93    halfword p;
94    avail = null;
95    fix_mem_end = 0;
96    p = get_avail();
97    temp_token_head = p;
98    set_token_info(temp_token_head, 0);
99    p = get_avail();
100    hold_token_head = p;
101    set_token_info(hold_token_head, 0);
102    p = get_avail();
103    omit_template = p;
104    set_token_info(omit_template, 0);
105    p = get_avail();
106    null_list = p;
107    set_token_info(null_list, 0);
108    p = get_avail();
109    backup_head = p;
110    set_token_info(backup_head, 0);
111    p = get_avail();
112    garbage = p;
113    set_token_info(garbage, 0);
114    dyn_used = 0;               /* initialize statistics */
115}
116
117@ The function |get_avail| returns a pointer to a new one-word node whose
118|link| field is null. However, \TeX\ will halt if there is no more room left.
119@^inner loop@>
120
121If the available-space list is empty, i.e., if |avail=null|,
122we try first to increase |fix_mem_end|. If that cannot be done, i.e., if
123|fix_mem_end=fix_mem_max|, we try to reallocate array |fixmem|.
If that doesn't work either, we have to quit.
125
126@c
127halfword get_avail(void)
128{                               /* single-word node allocation */
129    unsigned p;                 /* the new node being got */
130    unsigned t;
131    p = (unsigned) avail;       /* get top location in the |avail| stack */
132    if (p != null) {
133        avail = token_link(avail);      /* and pop it off */
134    } else if (fix_mem_end < fix_mem_max) {     /* or go into virgin territory */
135        incr(fix_mem_end);
136        p = fix_mem_end;
137    } else {
138        smemory_word *new_fixmem;       /* the big dynamic storage area */
139        t = (fix_mem_max / 5);
140        new_fixmem =
141            fixmemcast(realloc
142                       (fixmem, sizeof(smemory_word) * (fix_mem_max + t + 1)));
143        if (new_fixmem == NULL) {
144            runaway();          /* if memory is exhausted, display possible runaway text */
145            overflow("token memory size", fix_mem_max);
146        } else {
147            fixmem = new_fixmem;
148        }
149        memset(voidcast(fixmem + fix_mem_max + 1), 0, t * sizeof(smemory_word));
150        fix_mem_max += t;
151        p = ++fix_mem_end;
152    }
153    token_link(p) = null;       /* provide an oft-desired initialization of the new node */
154    incr(dyn_used);             /* maintain statistics */
155    return (halfword) p;
156}
157
158
159@ The procedure |flush_list(p)| frees an entire linked list of
160one-word nodes that starts at position |p|.
161@^inner loop@>
162
163@c
164void flush_list(halfword p)
165{                               /* makes list of single-word nodes available */
166    halfword q, r;              /* list traversers */
167    if (p != null) {
168        r = p;
169        do {
170            q = r;
171            r = token_link(r);
172            decr(dyn_used);
173        } while (r != null);    /* now |q| is the last node on the list */
174        token_link(q) = avail;
175        avail = p;
176    }
177}
178
179@ A \TeX\ token is either a character or a control sequence, and it is
180@^token@>
181represented internally in one of two ways: (1)~A character whose ASCII
182code number is |c| and whose command code is |m| is represented as the
183number $2^{21}m+c$; the command code is in the range |1<=m<=14|. (2)~A control
184sequence whose |eqtb| address is |p| is represented as the number
185|cs_token_flag+p|. Here |cs_token_flag=@t$2^{25}-1$@>| is larger than
186$2^{21}m+c$, yet it is small enough that |cs_token_flag+p< max_halfword|;
187thus, a token fits comfortably in a halfword.
188
189A token |t| represents a |left_brace| command if and only if
190|t<left_brace_limit|; it represents a |right_brace| command if and only if
191we have |left_brace_limit<=t<right_brace_limit|; and it represents a |match| or
192|end_match| command if and only if |match_token<=t<=end_match_token|.
193The following definitions take care of these token-oriented constants
194and a few others.
195
196@ A token list is a singly linked list of one-word nodes in |mem|, where
197each word contains a token and a link. Macro definitions, output-routine
198definitions, marks, \.{\\write} texts, and a few other things
199are remembered by \TeX\ in the form
200of token lists, usually preceded by a node with a reference count in its
201|token_ref_count| field. The token stored in location |p| is called
202|info(p)|.
203
204Three special commands appear in the token lists of macro definitions.
205When |m=match|, it means that \TeX\ should scan a parameter
206for the current macro; when |m=end_match|, it means that parameter
207matching should end and \TeX\ should start reading the macro text; and
208when |m=out_param|, it means that \TeX\ should insert parameter
209number |c| into the text at this point.
210
211The enclosing \.{\char'173} and \.{\char'175} characters of a macro
212definition are omitted, but the final right brace of an output routine
213is included at the end of its token list.
214
215Here is an example macro definition that illustrates these conventions.
216After \TeX\ processes the text
217$$\.{\\def\\mac a\#1\#2 \\b \{\#1\\-a \#\#1\#2 \#2\}}$$
218the definition of \.{\\mac} is represented as a token list containing
219$$\def\,{\hskip2pt}
220\vbox{\halign{\hfil#\hfil\cr
221(reference count), |letter|\,\.a, |match|\,\#, |match|\,\#, |spacer|\,\.\ ,
222\.{\\b}, |end_match|,\cr
223|out_param|\,1, \.{\\-}, |letter|\,\.a, |spacer|\,\.\ , |mac_param|\,\#,
224|other_char|\,\.1,\cr
225|out_param|\,2, |spacer|\,\.\ , |out_param|\,2.\cr}}$$
226The procedure |scan_toks| builds such token lists, and |macro_call|
227does the parameter matching.
228@^reference counts@>
229
230Examples such as
231$$\.{\\def\\m\{\\def\\m\{a\}\ b\}}$$
232explain why reference counts would be needed even if \TeX\ had no \.{\\let}
233operation: When the token list for \.{\\m} is being read, the redefinition of
234\.{\\m} changes the |eqtb| entry before the token list has been fully
235consumed, so we dare not simply destroy a token list when its
236control sequence is being redefined.
237
238If the parameter-matching part of a definition ends with `\.{\#\{}',
239the corresponding token list will have `\.\{' just before the `|end_match|'
240and also at the very end. The first `\.\{' is used to delimit the parameter; the
241second one keeps the first from disappearing.
242
243The |print_meaning| subroutine displays |cur_cmd| and |cur_chr| in
244symbolic form, including the expansion of a macro or mark.
245
246@c
247void print_meaning(void)
248{
249    print_cmd_chr((quarterword) cur_cmd, cur_chr);
250    if (cur_cmd >= call_cmd) {
251        print_char(':');
252        print_ln();
253        token_show(cur_chr);
254    } else {
255        /* Show the meaning of a mark node */
256        if ((cur_cmd == top_bot_mark_cmd) && (cur_chr < marks_code)) {
257            print_char(':');
258            print_ln();
259            switch (cur_chr) {
260            case first_mark_code:
261                token_show(first_mark(0));
262                break;
263            case bot_mark_code:
264                token_show(bot_mark(0));
265                break;
266            case split_first_mark_code:
267                token_show(split_first_mark(0));
268                break;
269            case split_bot_mark_code:
270                token_show(split_bot_mark(0));
271                break;
272            default:
273                token_show(top_mark(0));
274                break;
275            }
276        }
277    }
278}
279
280
281@ The procedure |show_token_list|, which prints a symbolic form of
282the token list that starts at a given node |p|, illustrates these
283conventions. The token list being displayed should not begin with a reference
284count. However, the procedure is intended to be robust, so that if the
285memory links are awry or if |p| is not really a pointer to a token list,
286nothing catastrophic will happen.
287
288An additional parameter |q| is also given; this parameter is either null
289or it points to a node in the token list where a certain magic computation
290takes place that will be explained later. (Basically, |q| is non-null when
291we are printing the two-line context information at the time of an error
292message; |q| marks the place corresponding to where the second line
293should begin.)
294
295For example, if |p| points to the node containing the first \.a in the
296token list above, then |show_token_list| will print the string
297$$\hbox{`\.{a\#1\#2\ \\b\ ->\#1\\-a\ \#\#1\#2\ \#2}';}$$
298and if |q| points to the node containing the second \.a,
299the magic computation will be performed just before the second \.a is printed.
300
301The generation will stop, and `\.{\\ETC.}' will be printed, if the length
302of printing exceeds a given limit~|l|. Anomalous entries are printed in the
303form of control sequences that are not followed by a blank space, e.g.,
304`\.{\\BAD.}'; this cannot be confused with actual control sequences because
305a real control sequence named \.{BAD} would come out `\.{\\BAD\ }'.
306
307@c
308void show_token_list(int p, int q, int l)
309{
310    int m, c;                   /* pieces of a token */
311    ASCII_code match_chr;       /* character used in a `|match|' */
312    ASCII_code n;               /* the highest parameter number, as an ASCII digit */
313    match_chr = '#';
314    n = '0';
315    tally = 0;
316    if (l < 0)
317        l = 0x3FFFFFFF;
318    while ((p != null) && (tally < l)) {
319        if (p == q) {
320            /* Do magic computation */
321            set_trick_count();
322        }
323        /* Display token |p|, and |return| if there are problems */
324        if ((p < (int) fix_mem_min) || (p > (int) fix_mem_end)) {
325            tprint_esc("CLOBBERED.");
326            return;
327        }
328        if (token_info(p) >= cs_token_flag) {
329            if (!((inhibit_par_tokens) && (token_info(p) == par_token)))
330                print_cs(token_info(p) - cs_token_flag);
331        } else {
332            m = token_cmd(token_info(p));
333            c = token_chr(token_info(p));
334            if (token_info(p) < 0) {
335                tprint_esc("BAD.");
336            } else {
337                /* Display the token $(|m|,|c|)$ */
338                /* The procedure usually ``learns'' the character code used for macro
339                   parameters by seeing one in a |match| command before it runs into any
340                   |out_param| commands. */
341                switch (m) {
342                case left_brace_cmd:
343                case right_brace_cmd:
344                case math_shift_cmd:
345                case tab_mark_cmd:
346                case sup_mark_cmd:
347                case sub_mark_cmd:
348                case spacer_cmd:
349                case letter_cmd:
350                case other_char_cmd:
351                    print(c);
352                    break;
353                case mac_param_cmd:
354                    if (!in_lua_escape)
355                        print(c);
356                    print(c);
357                    break;
358                case out_param_cmd:
359                    print(match_chr);
360                    if (c <= 9) {
361                        print_char(c + '0');
362                    } else {
363                        print_char('!');
364                        return;
365                    }
366                    break;
367                case match_cmd:
368                    match_chr = c;
369                    print(c);
370                    incr(n);
371                    print_char(n);
372                    if (n > '9')
373                        return;
374                    break;
375                case end_match_cmd:
376                    if (c == 0)
377                        tprint("->");
378                    break;
379                default:
380                    tprint_esc("BAD.");
381                    break;
382                }
383            }
384        }
385        p = token_link(p);
386    }
387    if (p != null)
388        tprint_esc("ETC.");
389}
390
391@ @c
/* Decode the character that starts at |buffer+b| into |a| with |str2uni|,
   then advance |b| by |utf8_size(a)|.  |b| must be an lvalue.  Both
   arguments are parenthesized so that expression arguments expand safely. */
#define do_buffer_to_unichar(a,b)  do {                         \
        (a) = (halfword)str2uni(buffer+(b));                    \
        (b) += utf8_size(a);                                    \
    } while (0)
396
397
398@ Here's the way we sometimes want to display a token list, given a pointer
399to its reference count; the pointer may be null.
400
401@c
402void token_show(halfword p)
403{
404    if (p != null)
405        show_token_list(token_link(p), null, 10000000);
406}
407
408
409
@ The procedure |delete_token_ref| is called when
411a pointer to a token list's reference count is being removed. This means
412that the token list should disappear if the reference count was |null|,
413otherwise the count should be decreased by one.
414@^reference counts@>
415
416@c
417void delete_token_ref(halfword p)
418{                               /* |p| points to the reference count
419                                   of a token list that is losing one reference */
420    assert(token_ref_count(p) >= 0);
421    if (token_ref_count(p) == 0)
422        flush_list(p);
423    else
424        decr(token_ref_count(p));
425}
426
427@ @c
428int get_char_cat_code(int curchr)
429{
430    int a;
431    do_get_cat_code(a,curchr);
432    return a;
433}
434
435@ @c
436static void invalid_character_error(void)
437{
438    const char *hlp[] =
439        { "A funny symbol that I can't read has just been input.",
440        "Continue, and I'll forget that it ever happened.",
441        NULL
442    };
443    deletions_allowed = false;
444    tex_error("Text line contains an invalid character", hlp);
445    deletions_allowed = true;
446}
447
448@ @c
449static boolean process_sup_mark(void);  /* below */
450
451static int scan_control_sequence(void); /* below */
452
453typedef enum { next_line_ok, next_line_return,
454    next_line_restart
455} next_line_retval;
456
457static next_line_retval next_line(void);        /* below */
458
459
460@  In case you are getting bored, here is a slightly less trivial routine:
461   Given a string of lowercase letters, like `\.{pt}' or `\.{plus}' or
462   `\.{width}', the |scan_keyword| routine checks to see whether the next
463   tokens of input match this string. The match must be exact, except that
464   uppercase letters will match their lowercase counterparts; uppercase
465   equivalents are determined by subtracting |"a"-"A"|, rather than using the
466   |uc_code| table, since \TeX\ uses this routine only for its own limited
467   set of keywords.
468
469   If a match is found, the characters are effectively removed from the input
470   and |true| is returned. Otherwise |false| is returned, and the input
471   is left essentially unchanged (except for the fact that some macros
472   may have been expanded, etc.).
473   @^inner loop@>
474
475@c
476boolean scan_keyword(const char *s)
477{                               /* look for a given string */
478    halfword p;                 /* tail of the backup list */
479    halfword q;                 /* new node being added to the token list via |store_new_token| */
480    const char *k;              /* index into |str_pool| */
481    halfword save_cur_cs = cur_cs;
482    int saved_align_state = align_state;
483    if (strlen(s) == 0)        /* was assert (strlen(s) > 1); */
484      return false ;           /* but not with newtokenlib  zero keyword simply doesn't match  */
485    p = backup_head;
486    token_link(p) = null;
487    k = s;
488    while (*k) {
489        get_x_token();      /* recursion is possible here */
490        if ((cur_cs == 0) &&
491            ((cur_chr == *k) || (cur_chr == *k - 'a' + 'A'))) {
492            store_new_token(cur_tok);
493            k++;
494        } else if ((cur_cmd != spacer_cmd) || (p != backup_head)) {
495            if (p != backup_head) {
496                q = get_avail();
497                token_info(q) = cur_tok;
498                token_link(q) = null;
499                token_link(p) = q;
500                begin_token_list(token_link(backup_head), backed_up);
501                if (cur_cmd != endv_cmd)
502   	           align_state = saved_align_state;
503            } else {
504                back_input();
505            }
506            cur_cs = save_cur_cs;
507            return false;
508        }
509    }
510    flush_list(token_link(backup_head));
511    cur_cs = save_cur_cs;
512    if (cur_cmd != endv_cmd)
513        align_state = saved_align_state;
514    return true;
515}
516
@ We cannot return |undefined_control_sequence| under some conditions
518 (inside |shift_case|, for example). This needs thinking.
519
520@c
521halfword active_to_cs(int curchr, int force)
522{
523    halfword curcs;
524    char *a, *b;
525    char *utfbytes = xmalloc(10);
526    int nncs = no_new_control_sequence;
527    a = (char *) uni2str(0xFFFF);
528    utfbytes = strcpy(utfbytes, a);
529    if (force)
530        no_new_control_sequence = false;
531    if (curchr > 0) {
532        b = (char *) uni2str((unsigned) curchr);
533        utfbytes = strcat(utfbytes, b);
534        free(b);
535        curcs = string_lookup(utfbytes, strlen(utfbytes));
536    } else {
537        utfbytes[3] = '\0';
538        curcs = string_lookup(utfbytes, 4);
539    }
540    no_new_control_sequence = nncs;
541    free(a);
542    free(utfbytes);
543    return curcs;
544}
545
546@ TODO this function should listen to \.{\\escapechar}
547
548@c
549static char *cs_to_string(halfword p)
550{                               /* prints a control sequence */
551    const char *s;
552    char *sh;
553    int k = 0;
554    static char ret[256] = { 0 };
555    if (p == 0 || p == null_cs) {
556        ret[k++] = '\\';
557        s = "csname";
558        while (*s) {
559            ret[k++] = *s++;
560        }
561        ret[k++] = '\\';
562        s = "endcsname";
563        while (*s) {
564            ret[k++] = *s++;
565        }
566        ret[k] = 0;
567
568    } else {
569        str_number txt = cs_text(p);
570        sh = makecstring(txt);
571        s = sh;
572        if (is_active_cs(txt)) {
573            s = s + 3;
574            while (*s) {
575                ret[k++] = *s++;
576            }
577            ret[k] = 0;
578        } else {
579            ret[k++] = '\\';
580            while (*s) {
581                ret[k++] = *s++;
582            }
583            ret[k] = 0;
584        }
585        free(sh);
586    }
587    return (char *) ret;
588}
589
590@ TODO this is a quick hack, will be solved differently soon
591
592@c
593static char *cmd_chr_to_string(int cmd, int chr)
594{
595    char *s;
596    str_number str;
597    int sel = selector;
598    selector = new_string;
599    print_cmd_chr((quarterword) cmd, chr);
600    str = make_string();
601    s = makecstring(str);
602    selector = sel;
603    flush_str(str);
604    return s;
605}
606
607@ The heart of \TeX's input mechanism is the |get_next| procedure, which
608we shall develop in the next few sections of the program. Perhaps we
609shouldn't actually call it the ``heart,'' however, because it really acts
610as \TeX's eyes and mouth, reading the source files and gobbling them up.
611And it also helps \TeX\ to regurgitate stored token lists that are to be
612processed again.
613@^eyes and mouth@>
614
615The main duty of |get_next| is to input one token and to set |cur_cmd|
616and |cur_chr| to that token's command code and modifier. Furthermore, if
617the input token is a control sequence, the |eqtb| location of that control
618sequence is stored in |cur_cs|; otherwise |cur_cs| is set to zero.
619
620Underlying this simple description is a certain amount of complexity
621because of all the cases that need to be handled.
622However, the inner loop of |get_next| is reasonably short and fast.
623
624When |get_next| is asked to get the next token of a \.{\\read} line,
625it sets |cur_cmd=cur_chr=cur_cs=0| in the case that no more tokens
626appear on that line. (There might not be any tokens at all, if the
627|end_line_char| has |ignore| as its catcode.)
628
629
630@ The value of |par_loc| is the |eqtb| address of `\.{\\par}'. This quantity
631is needed because a blank line of input is supposed to be exactly equivalent
632to the appearance of \.{\\par}; we must set |cur_cs:=par_loc|
633when detecting a blank line.
634
635@c
636halfword par_loc;               /* location of `\.{\\par}' in |eqtb| */
637halfword par_token;             /* token representing `\.{\\par}' */
638
639
@ Parts of |get_next| are executed more often than any other instructions of \TeX.
641@^mastication@>@^inner loop@>
642
643
644
645@ The global variable |force_eof| is normally |false|; it is set |true|
646by an \.{\\endinput} command. |luacstrings| is the number of lua print
647statements waiting to be input, it is changed by |luatokencall|.
648
649@c
650boolean force_eof;              /* should the next \.{\\input} be aborted early? */
651int luacstrings;                /* how many lua strings are waiting to be input? */
652
653
654@ If the user has set the |pausing| parameter to some positive value,
655and if nonstop mode has not been selected, each line of input is displayed
656on the terminal and the transcript file, followed by `\.{=>}'.
657\TeX\ waits for a response. If the response is simply |carriage_return|, the
658line is accepted as it stands, otherwise the line typed is
659used instead of the line in the file.
660
661@c
662void firm_up_the_line(void)
663{
664    int k;                      /* an index into |buffer| */
665    ilimit = last;
666    if (pausing > 0) {
667        if (interaction > nonstop_mode) {
668            wake_up_terminal();
669            print_ln();
670            if (istart < ilimit) {
671                for (k = istart; k <= ilimit - 1; k++)
672                    print_char(buffer[k]);
673            }
674            first = ilimit;
675            prompt_input("=>"); /* wait for user response */
676            if (last > first) {
677                for (k = first; k < +last - 1; k++)     /* move line down in buffer */
678                    buffer[k + istart - first] = buffer[k];
679                ilimit = istart + last - first;
680            }
681        }
682    }
683}
684
685
686
687@ Before getting into |get_next|, let's consider the subroutine that
688   is called when an `\.{\\outer}' control sequence has been scanned or
689   when the end of a file has been reached. These two cases are distinguished
690   by |cur_cs|, which is zero at the end of a file.
691
692@c
693void check_outer_validity(void)
694{
695    halfword p;                 /* points to inserted token list */
696    halfword q;                 /* auxiliary pointer */
697    if (suppress_outer_error)
698        return;
699    if (scanner_status != normal) {
700        deletions_allowed = false;
701        /* Back up an outer control sequence so that it can be reread; */
702        /* An outer control sequence that occurs in a \.{\\read} will not be reread,
703           since the error recovery for \.{\\read} is not very powerful. */
704        if (cur_cs != 0) {
705            if ((istate == token_list) || (iname < 1) || (iname > 17)) {
706                p = get_avail();
707                token_info(p) = cs_token_flag + cur_cs;
708                begin_token_list(p, backed_up); /* prepare to read the control sequence again */
709            }
710            cur_cmd = spacer_cmd;
711            cur_chr = ' ';      /* replace it by a space */
712        }
713        if (scanner_status > skipping) {
714            const char *errhlp[] =
715                { "I suspect you have forgotten a `}', causing me",
716                "to read past where you wanted me to stop.",
717                "I'll try to recover; but if the error is serious,",
718                "you'd better type `E' or `X' now and fix your file.",
719                NULL
720            };
721            char errmsg[256];
722            const char *startmsg;
723            const char *scannermsg;
724            /* Tell the user what has run away and try to recover */
725            runaway();          /* print a definition, argument, or preamble */
726            if (cur_cs == 0) {
727                startmsg = "File ended";
728            } else {
729                cur_cs = 0;
730                startmsg = "Forbidden control sequence found";
731            }
732            /* Print either `\.{definition}' or `\.{use}' or `\.{preamble}' or `\.{text}',
733               and insert tokens that should lead to recovery; */
734            /* The recovery procedure can't be fully understood without knowing more
735               about the \TeX\ routines that should be aborted, but we can sketch the
736               ideas here:  For a runaway definition we will insert a right brace; for a
737               runaway preamble, we will insert a special \.{\\cr} token and a right
738               brace; and for a runaway argument, we will set |long_state| to
739               |outer_call| and insert \.{\\par}. */
740            p = get_avail();
741            switch (scanner_status) {
742            case defining:
743                scannermsg = "definition";
744                token_info(p) = right_brace_token + '}';
745                break;
746            case matching:
747                scannermsg = "use";
748                token_info(p) = par_token;
749                long_state = outer_call_cmd;
750                break;
751            case aligning:
752                scannermsg = "preamble";
753                token_info(p) = right_brace_token + '}';
754                q = p;
755                p = get_avail();
756                token_link(p) = q;
757                token_info(p) = cs_token_flag + frozen_cr;
758                align_state = -1000000;
759                break;
760            case absorbing:
761                scannermsg = "text";
762                token_info(p) = right_brace_token + '}';
763                break;
764            default:           /* can't happen */
765                scannermsg = "unknown";
766                break;
767            }                   /*there are no other cases */
768            begin_token_list(p, inserted);
769            snprintf(errmsg, 255, "%s while scanning %s of %s",
770                     startmsg, scannermsg, cs_to_string(warning_index));
771            tex_error(errmsg, errhlp);
772        } else {
773            char errmsg[256];
774            const char *errhlp_no[] =
775                { "The file ended while I was skipping conditional text.",
776                "This kind of error happens when you say `\\if...' and forget",
777                "the matching `\\fi'. I've inserted a `\\fi'; this might work.",
778                NULL
779            };
780            const char *errhlp_cs[] =
781                { "A forbidden control sequence occurred in skipped text.",
782                "This kind of error happens when you say `\\if...' and forget",
783                "the matching `\\fi'. I've inserted a `\\fi'; this might work.",
784                NULL
785            };
786            const char **errhlp = (const char **) errhlp_no;
787            char *ss;
788            if (cur_cs != 0) {
789                errhlp = errhlp_cs;
790                cur_cs = 0;
791            }
792            ss = cmd_chr_to_string(if_test_cmd, cur_if);
793            snprintf(errmsg, 255,
794                     "Incomplete %s; all text was ignored after line %d",
795                     ss, (int) skip_line);
796            free(ss);
797            /* Incomplete \\if... */
798            cur_tok = cs_token_flag + frozen_fi;
799            /* back up one inserted token and call |error| */
800            {
801                OK_to_interrupt = false;
802                back_input();
803                token_type = inserted;
804                OK_to_interrupt = true;
805                tex_error(errmsg, errhlp);
806            }
807        }
808        deletions_allowed = true;
809    }
810}
811
812@ @c
/* Reads the next token from the current line of a file-like input level;
   |get_next| calls this whenever |istate| is not |token_list|.
   Returns |true| when |cur_cmd| and |cur_chr| (and |cur_cs| for control
   sequences) have been set, and |false| when the caller has to start over:
   after an invalid character was reported, or when |next_line| returned
   |next_line_restart|. */
static boolean get_next_file(void)
{
  SWITCH:
    if (iloc <= ilimit) {       /* current line not yet finished */
        do_buffer_to_unichar(cur_chr, iloc);

      RESWITCH:
        if (detokenized_line()) {
            /* the line carries no catcode table; NOTE(review): 10 and 12 are
               presumably |spacer_cmd| and |other_char_cmd|, confirm */
            cur_cmd = (cur_chr == ' ' ? 10 : 12);
        } else {
            do_get_cat_code(cur_cmd, cur_chr);
        }
        /*
           Change state if necessary, and |goto switch| if the current
           character should be ignored, or |goto reswitch| if the current
           character changes to another;
         */
        /* The following 48-way switch accomplishes the scanning quickly, assuming
           that a decent C compiler has translated the code. Note that the numeric
           values for |mid_line|, |skip_blanks|, and |new_line| are spaced
           apart from each other by |max_char_code+1|, so we can add a character's
           command code to the state to get a single number that characterizes both.
         */
        switch (istate + cur_cmd) {
        case mid_line + ignore_cmd:
        case skip_blanks + ignore_cmd:
        case new_line + ignore_cmd:
        case skip_blanks + spacer_cmd:
        case new_line + spacer_cmd:    /* Cases where character is ignored */
            goto SWITCH;
            break;
        case mid_line + escape_cmd:
        case new_line + escape_cmd:
        case skip_blanks + escape_cmd: /* Scan a control sequence ...; */
            /* the scanner also tells us which state to continue in */
            istate = (unsigned char) scan_control_sequence();
            if (cur_cmd >= outer_call_cmd)
                check_outer_validity();
            break;
        case mid_line + active_char_cmd:
        case new_line + active_char_cmd:
        case skip_blanks + active_char_cmd:    /* Process an active-character  */
            cur_cs = active_to_cs(cur_chr, false);
            cur_cmd = eq_type(cur_cs);
            cur_chr = equiv(cur_cs);
            istate = mid_line;
            if (cur_cmd >= outer_call_cmd)
                check_outer_validity();
            break;
        case mid_line + sup_mark_cmd:
        case new_line + sup_mark_cmd:
        case skip_blanks + sup_mark_cmd:       /* If this |sup_mark| starts */
            /* an expanded code was replaced by the character it denotes, so
               that character has to be classified again */
            if (process_sup_mark())
                goto RESWITCH;
            else
                istate = mid_line;
            break;
        case mid_line + invalid_char_cmd:
        case new_line + invalid_char_cmd:
        case skip_blanks + invalid_char_cmd:   /* Decry the invalid character and |goto restart|; */
            invalid_character_error();
            return false;       /* because state may be |token_list| now */
            break;
        case mid_line + spacer_cmd:    /* Enter |skip_blanks| state, emit a space; */
            istate = skip_blanks;
            cur_chr = ' ';
            break;
        case mid_line + car_ret_cmd:   /* Finish line, emit a space; */
            /* When a character of type |spacer| gets through, its character code is
               changed to $\.{"\ "}=040$. This means that the ASCII codes for tab and space,
               and for the space inserted at the end of a line, will
               be treated alike when macro parameters are being matched. We do this
               since such characters are indistinguishable on most computer terminal displays.
             */
            iloc = ilimit + 1;
            cur_cmd = spacer_cmd;
            cur_chr = ' ';
            break;
        case skip_blanks + car_ret_cmd:
        case mid_line + comment_cmd:
        case new_line + comment_cmd:
        case skip_blanks + comment_cmd:        /* Finish line, |goto switch|; */
            iloc = ilimit + 1;
            goto SWITCH;
            break;
        case new_line + car_ret_cmd:   /* Finish line, emit a \.{\\par}; */
            iloc = ilimit + 1;
            cur_cs = par_loc;
            cur_cmd = eq_type(cur_cs);
            cur_chr = equiv(cur_cs);
            if (cur_cmd >= outer_call_cmd)
                check_outer_validity();
            break;
        case skip_blanks + left_brace_cmd:
        case new_line + left_brace_cmd:
            istate = mid_line;  /* fall through */
        case mid_line + left_brace_cmd:
            align_state++;
            break;
        case skip_blanks + right_brace_cmd:
        case new_line + right_brace_cmd:
            istate = mid_line;  /* fall through */
        case mid_line + right_brace_cmd:
            align_state--;
            break;
        case mid_line + math_shift_cmd:
        case mid_line + tab_mark_cmd:
        case mid_line + mac_param_cmd:
        case mid_line + sub_mark_cmd:
        case mid_line + letter_cmd:
        case mid_line + other_char_cmd:
            break;              /* already in |mid_line|: nothing to change */
#if 0
               case skip_blanks + math_shift:
               case skip_blanks + tab_mark:
               case skip_blanks + mac_param:
               case skip_blanks + sub_mark:
               case skip_blanks + letter:
               case skip_blanks + other_char:
               case new_line    + math_shift:
               case new_line    + tab_mark:
               case new_line    + mac_param:
               case new_line    + sub_mark:
               case new_line    + letter:
               case new_line    + other_char:
#else
        default:
#endif
            /* every remaining combination just switches to |mid_line| */
            istate = mid_line;
            break;
        }
    } else {
        /* the line is exhausted; 21 is the |iname| that |next_line| labels
           as lua input, and for it |next_line| sets the state itself */
        if (iname != 21)
            istate = new_line;

        /*
           Move to next line of file,
           or |goto restart| if there is no next line,
           or |return| if a \.{\\read} line has finished;
         */
        do {
            next_line_retval r = next_line();
            if (r == next_line_return) {
                return true;
            } else if (r == next_line_restart) {
                return false;
            }
        } while (0);
        check_interrupt();
        goto SWITCH;
    }
    return true;
}
965
966@ @c
/* |is_hex| recognises the digits and the lowercase letters a to f only;
   uppercase letters are deliberately not accepted. The argument is
   parenthesized so that any expression can be passed safely. */
#define is_hex(a) (((a)>='0'&&(a)<='9')||((a)>='a'&&(a)<='f'))

/* Shifts |cur_chr| left by one hex digit and adds the value of the digit
   character |a|, which must satisfy |is_hex|. */
#define add_nybble(a)   do {                                            \
    if ((a)<='9') cur_chr=(cur_chr<<4)+(a)-'0';                         \
    else          cur_chr=(cur_chr<<4)+(a)-'a'+10;                      \
  } while (0)

/* The remaining macros read the digit characters from the local variables
   |c|, |cc|, |ccc|, |cccc|, |ccccc| and |cccccc| of the function that uses
   them, most significant digit first, and leave the value in |cur_chr|. */
#define hex_to_cur_chr do {                                             \
    if (c<='9')  cur_chr=c-'0';                                         \
    else         cur_chr=c-'a'+10;                                      \
    add_nybble(cc);                                                     \
  } while (0)

#define four_hex_to_cur_chr do {                                        \
    hex_to_cur_chr;                                                     \
    add_nybble(ccc); add_nybble(cccc);                                  \
  } while (0)

#define five_hex_to_cur_chr  do {                                       \
    four_hex_to_cur_chr;                                                \
    add_nybble(ccccc);                                                  \
  } while (0)

#define six_hex_to_cur_chr do {                                         \
    five_hex_to_cur_chr;                                                \
    add_nybble(cccccc);                                                 \
  } while (0)
994
995
996@ Notice that a code like \.{\^\^8} becomes \.x if not followed by a hex digit.
997
998@c
999static boolean process_sup_mark(void)
1000{
1001    if (cur_chr == buffer[iloc]) {
1002        int c, cc;
1003        if (iloc < ilimit) {
1004            if ((cur_chr == buffer[iloc + 1]) && (cur_chr == buffer[iloc + 2])
1005                && (cur_chr == buffer[iloc + 3])
1006                && (cur_chr == buffer[iloc + 4])
1007                && ((iloc + 10) <= ilimit)) {
1008                int ccc, cccc, ccccc, cccccc;   /* constituents of a possible expanded code */
1009                c = buffer[iloc + 5];
1010                cc = buffer[iloc + 6];
1011                ccc = buffer[iloc + 7];
1012                cccc = buffer[iloc + 8];
1013                ccccc = buffer[iloc + 9];
1014                cccccc = buffer[iloc + 10];
1015                if ((is_hex(c)) && (is_hex(cc)) && (is_hex(ccc))
1016                    && (is_hex(cccc))
1017                    && (is_hex(ccccc)) && (is_hex(cccccc))) {
1018                    iloc = iloc + 11;
1019                    six_hex_to_cur_chr;
1020                    return true;
1021                }
1022            }
1023            if ((cur_chr == buffer[iloc + 1]) && (cur_chr == buffer[iloc + 2])
1024                && (cur_chr == buffer[iloc + 3]) && ((iloc + 8) <= ilimit)) {
1025                int ccc, cccc, ccccc;   /* constituents of a possible expanded code */
1026                c = buffer[iloc + 4];
1027                cc = buffer[iloc + 5];
1028                ccc = buffer[iloc + 6];
1029                cccc = buffer[iloc + 7];
1030                ccccc = buffer[iloc + 8];
1031                if ((is_hex(c)) && (is_hex(cc)) && (is_hex(ccc))
1032                    && (is_hex(cccc)) && (is_hex(ccccc))) {
1033                    iloc = iloc + 9;
1034                    five_hex_to_cur_chr;
1035                    return true;
1036                }
1037            }
1038            if ((cur_chr == buffer[iloc + 1]) && (cur_chr == buffer[iloc + 2])
1039                && ((iloc + 6) <= ilimit)) {
1040                int ccc, cccc;  /* constituents of a possible expanded code */
1041                c = buffer[iloc + 3];
1042                cc = buffer[iloc + 4];
1043                ccc = buffer[iloc + 5];
1044                cccc = buffer[iloc + 6];
1045                if ((is_hex(c)) && (is_hex(cc)) && (is_hex(ccc))
1046                    && (is_hex(cccc))) {
1047                    iloc = iloc + 7;
1048                    four_hex_to_cur_chr;
1049                    return true;
1050                }
1051            }
1052            c = buffer[iloc + 1];
1053            if (c < 0200) {     /* yes we have an expanded char */
1054                iloc = iloc + 2;
1055                if (is_hex(c) && iloc <= ilimit) {
1056                    cc = buffer[iloc];
1057                    if (is_hex(cc)) {
1058                        incr(iloc);
1059                        hex_to_cur_chr;
1060                        return true;
1061                    }
1062                }
1063                cur_chr = (c < 0100 ? c + 0100 : c - 0100);
1064                return true;
1065            }
1066        }
1067    }
1068    return false;
1069}
1070
1071@ Control sequence names are scanned only when they appear in some line of
1072   a file; once they have been scanned the first time, their |eqtb| location
1073   serves as a unique identification, so \TeX\ doesn't need to refer to the
1074   original name any more except when it prints the equivalent in symbolic form.
1075
1076   The program that scans a control sequence has been written carefully
1077   in order to avoid the blowups that might otherwise occur if a malicious
1078   user tried something like `\.{\\catcode\'15=0}'. The algorithm might
1079   look at |buffer[ilimit+1]|, but it never looks at |buffer[ilimit+2]|.
1080
1081   If expanded characters like `\.{\^\^A}' or `\.{\^\^df}'
1082   appear in or just following
1083   a control sequence name, they are converted to single characters in the
1084   buffer and the process is repeated, slowly but surely.
1085
1086@c
static boolean check_expanded_code(int *kk);    /* below */

/* Scans the control sequence name that starts at |buffer[iloc]|, looks it
   up with |id_lookup|, and sets |cur_cs|, |cur_cmd| and |cur_chr|
   accordingly; |iloc| is left just after the name. The return value is the
   scanner state the caller should continue in: |skip_blanks| after a name
   made of letters or after a single space character, |mid_line| otherwise. */
static int scan_control_sequence(void)
{
    int retval = mid_line;
    if (iloc > ilimit) {
        cur_cs = null_cs;       /* |state| is irrelevant in this case */
    } else {
        register int cat;       /* |cat_code(cur_chr)|, usually */
        /* each pass rescans the name from |iloc|; a pass is repeated after
           |check_expanded_code| has rewritten an expanded code in the buffer */
        while (1) {
            int k = iloc;
            do_buffer_to_unichar(cur_chr, k);
            do_get_cat_code(cat, cur_chr);
            if (cat != letter_cmd || k > ilimit) {
                /* the name consists of this single character */
                retval = (cat == spacer_cmd ? skip_blanks : mid_line);
                if (cat == sup_mark_cmd && check_expanded_code(&k))     /* If an expanded...; */
                    continue;
            } else {
                retval = skip_blanks;
                /* swallow further letters until a nonletter or the end of the line */
                do {
                    do_buffer_to_unichar(cur_chr, k);
                    do_get_cat_code(cat, cur_chr);
                } while (cat == letter_cmd && k <= ilimit);

                if (cat == sup_mark_cmd && check_expanded_code(&k))     /* If an expanded...; */
                    continue;
                if (cat != letter_cmd) {
                    /* step back over the nonletter just read, one position
                       per byte; the thresholds match the UTF-8 length of
                       |cur_chr| (presumably what |do_buffer_to_unichar|
                       advanced |k| by, confirm) */
                    decr(k);
                    if (cur_chr > 0xFFFF)
                        decr(k);
                    if (cur_chr > 0x7FF)
                        decr(k);
                    if (cur_chr > 0x7F)
                        decr(k);
                }               /* now |k| points to first nonletter */
            }
            cur_cs = id_lookup(iloc, k - iloc);
            iloc = k;
            break;
        }
    }
    cur_cmd = eq_type(cur_cs);
    cur_chr = equiv(cur_cs);
    return retval;
}
1132
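@ Whenever we reach the following piece of code, we will have
   |cur_chr=buffer[k-1]| and |k<=ilimit+1| and |cat=get_cat_code(cat_code_table,cur_chr)|. If an
   expanded code like \.{\^\^A} or \.{\^\^df} appears in |buffer[(k-1)..(k+1)]|
   or |buffer[(k-1)..(k+2)]|, we
   will store the corresponding code in |buffer[k-1]| and shift the rest of
   the buffer left two or three places. The longer forms, with four, five or
   six superscript characters followed by the same number of hexadecimal
   digits, are handled in the same way, except that the resulting character
   is stored as UTF-8 and may therefore occupy up to four bytes starting at
   |buffer[k-1]|; the rest of the buffer is shifted left accordingly.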
1139
1140@c
1141static boolean check_expanded_code(int *kk)
1142{
1143    int l;
1144    int k = *kk;
1145    int d = 1;                  /* number of excess characters in an expanded code */
1146    int c, cc, ccc, cccc, ccccc, cccccc;        /* constituents of a possible expanded code */
1147    if (buffer[k] == cur_chr && k < ilimit) {
1148        if ((cur_chr == buffer[k + 1]) && (cur_chr == buffer[k + 2])
1149            && ((k + 6) <= ilimit)) {
1150            d = 4;
1151            if ((cur_chr == buffer[k + 3]) && ((k + 8) <= ilimit))
1152                d = 5;
1153            if ((cur_chr == buffer[k + 4]) && ((k + 10) <= ilimit))
1154                d = 6;
1155            c = buffer[k + d - 1];
1156            cc = buffer[k + d];
1157            ccc = buffer[k + d + 1];
1158            cccc = buffer[k + d + 2];
1159            if (d == 6) {
1160                ccccc = buffer[k + d + 3];
1161                cccccc = buffer[k + d + 4];
1162                if (is_hex(c) && is_hex(cc) && is_hex(ccc) && is_hex(cccc)
1163                    && is_hex(ccccc) && is_hex(cccccc))
1164                    six_hex_to_cur_chr;
1165            } else if (d == 5) {
1166                ccccc = buffer[k + d + 3];
1167                if (is_hex(c) && is_hex(cc) && is_hex(ccc) && is_hex(cccc)
1168                    && is_hex(ccccc))
1169                    five_hex_to_cur_chr;
1170            } else {
1171                if (is_hex(c) && is_hex(cc) && is_hex(ccc) && is_hex(cccc))
1172                    four_hex_to_cur_chr;
1173            }
1174        } else {
1175            c = buffer[k + 1];
1176            if (c < 0200) {
1177                d = 1;
1178                if (is_hex(c) && (k + 2) <= ilimit) {
1179                    cc = buffer[k + 2];
1180                    if (is_hex(c) && is_hex(cc)) {
1181                        d = 2;
1182                        hex_to_cur_chr;
1183                    }
1184                } else if (c < 0100) {
1185                    cur_chr = c + 0100;
1186                } else {
1187                    cur_chr = c - 0100;
1188                }
1189            }
1190        }
1191        if (d > 2)
1192            d = 2 * d - 1;
1193        else
1194            d++;
1195        if (cur_chr <= 0x7F) {
1196            buffer[k - 1] = (packed_ASCII_code) cur_chr;
1197        } else if (cur_chr <= 0x7FF) {
1198            buffer[k - 1] = (packed_ASCII_code) (0xC0 + cur_chr / 0x40);
1199            k++;
1200            d--;
1201            buffer[k - 1] = (packed_ASCII_code) (0x80 + cur_chr % 0x40);
1202        } else if (cur_chr <= 0xFFFF) {
1203            buffer[k - 1] = (packed_ASCII_code) (0xE0 + cur_chr / 0x1000);
1204            k++;
1205            d--;
1206            buffer[k - 1] =
1207                (packed_ASCII_code) (0x80 + (cur_chr % 0x1000) / 0x40);
1208            k++;
1209            d--;
1210            buffer[k - 1] =
1211                (packed_ASCII_code) (0x80 + (cur_chr % 0x1000) % 0x40);
1212        } else {
1213            buffer[k - 1] = (packed_ASCII_code) (0xF0 + cur_chr / 0x40000);
1214            k++;
1215            d--;
1216            buffer[k - 1] =
1217                (packed_ASCII_code) (0x80 + (cur_chr % 0x40000) / 0x1000);
1218            k++;
1219            d--;
1220            buffer[k - 1] =
1221                (packed_ASCII_code) (0x80 +
1222                                     ((cur_chr % 0x40000) % 0x1000) / 0x40);
1223            k++;
1224            d--;
1225            buffer[k - 1] =
1226                (packed_ASCII_code) (0x80 +
1227                                     ((cur_chr % 0x40000) % 0x1000) % 0x40);
1228        }
1229        l = k;
1230        ilimit = ilimit - d;
1231        while (l <= ilimit) {
1232            buffer[l] = buffer[l + d];
1233            l++;
1234        }
1235        *kk = k;
1236        return true;
1237    }
1238    return false;
1239}
1240
1241
1242@ All of the easy branches of |get_next| have now been taken care of.
1243  There is one more branch.
1244
1245@c
/* Moves the current file-like input level to its next line.
   Returns |next_line_ok| when a fresh line is in |buffer| with |iloc| set
   to |istart|; |next_line_restart| when the caller has to start over
   (the input level ended, the |every_eof| tokens were inserted, or a level
   holding inserted text was closed); and |next_line_return| when a
   \.{\\read} line has ended, in which case |cur_cmd| and |cur_chr| are
   both set to zero. */
static next_line_retval next_line(void)
{
    boolean inhibit_eol = false;        /* a way to end a pseudo file without trailing space */
    if (iname > 17) {
        /* Read next line of file into |buffer|, or |goto restart| if the file has ended */
        incr(line);
        first = istart;
        if (!force_eof) {
            if (iname <= 20) {
                /* values 18 to 20 are served by |pseudo_input| */
                if (pseudo_input()) {   /* not end of file */
                    firm_up_the_line(); /* this sets |ilimit| */
                    line_catcode_table = DEFAULT_CAT_TABLE;
                    /* no end-of-line character after the last line when |iname| is 19 */
                    if ((iname == 19) && (pseudo_lines(pseudo_files) == null))
                        inhibit_eol = true;
                } else if ((every_eof != null) && !eof_seen[iindex]) {
                    ilimit = first - 1;
                    eof_seen[iindex] = true;    /* fake one empty line */
                    if (iname != 19)
                        begin_token_list(every_eof, every_eof_text);
                    return next_line_restart;
                } else {
                    force_eof = true;
                }
            } else {
                if (iname == 21) {
                    if (luacstring_input()) {   /* not end of strings  */
                        firm_up_the_line();
                        /* the catcode table and the partial-line flag come with the string */
                        line_catcode_table = (short) luacstring_cattable();
                        line_partial = (signed char) luacstring_partial();
                        if (luacstring_final_line() || line_partial
                            || line_catcode_table == NO_CAT_TABLE)
                            inhibit_eol = true;
                        if (!line_partial)
                            istate = new_line;
                    } else {
                        force_eof = true;
                    }
                } else {
                    if (lua_input_ln(cur_file, 0, true)) {      /* not end of file */
                        firm_up_the_line();     /* this sets |ilimit| */
                        line_catcode_table = DEFAULT_CAT_TABLE;
                    } else if ((every_eof != null) && (!eof_seen[iindex])) {
                        ilimit = first - 1;
                        eof_seen[iindex] = true;        /* fake one empty line */
                        begin_token_list(every_eof, every_eof_text);
                        return next_line_restart;
                    } else {
                        force_eof = true;
                    }
                }
            }
        }
        if (force_eof) {
            if (tracing_nesting > 0)
                if ((grp_stack[in_open] != cur_boundary)
                    || (if_stack[in_open] != cond_ptr))
                    if (!((iname == 19) || (iname == 21)))
                        file_warning(); /* give warning for some unfinished groups and/or conditionals */
            if ((iname > 21) || (iname == 20)) {
                report_stop_file(filetype_tex);
                decr(open_parens);
#if 0
                update_terminal(); /* show user that file has been read */
#endif
            }
            force_eof = false;
            /* both branches close the level; only the second one also
               runs |check_outer_validity| */
            if (iname == 21 ||  /* lua input */
                iname == 19) {  /* \.{\\scantextokens} */
                end_file_reading();
            } else {
                end_file_reading();
                check_outer_validity();
            }
            return next_line_restart;
        }
        /* either drop the position at |ilimit| or put the end-of-line character there */
        if (inhibit_eol || end_line_char_inactive)
            ilimit--;
        else
            buffer[ilimit] = (packed_ASCII_code) end_line_char;
        first = ilimit + 1;
        iloc = istart;          /* ready to read */
    } else {
        if (!terminal_input) {  /* \.{\\read} line has ended */
            cur_cmd = 0;
            cur_chr = 0;
            return next_line_return;    /* OUTER */
        }
        if (input_ptr > 0) {    /* text was inserted during error recovery */
            end_file_reading();
            return next_line_restart;   /* resume previous level */
        }
        if (selector < log_only)
            open_log_file();
        if (interaction > nonstop_mode) {
            if (end_line_char_inactive)
                ilimit++;
            if (ilimit == istart) {     /* previous line was empty */
                tprint_nl("(Please type a command or say `\\end')");
            }
            print_ln();
            first = istart;
            prompt_input("*");  /* input on-line into |buffer| */
            ilimit = last;
            if (end_line_char_inactive)
                ilimit--;
            else
                buffer[ilimit] = (packed_ASCII_code) end_line_char;
            first = ilimit + 1;
            iloc = istart;
        } else {
            fatal_error("*** (job aborted, no legal \\end found)");
            /* nonstop mode, which is intended for overnight batch processing,
               never waits for on-line input */
        }
    }
    return next_line_ok;
}
1363
1364@ Let's consider now what happens when |get_next| is looking at a token list.
1365
1366@c
1367static boolean get_next_tokenlist(void)
1368{
1369    register halfword t;        /* a token */
1370    t = token_info(iloc);
1371    iloc = token_link(iloc);    /* move to next */
1372    if (t >= cs_token_flag) {   /* a control sequence token */
1373        cur_cs = t - cs_token_flag;
1374        cur_cmd = eq_type(cur_cs);
1375        if (cur_cmd >= outer_call_cmd) {
1376            if (cur_cmd == dont_expand_cmd) {   /* Get the next token, suppressing expansion */
1377                /* The present point in the program is reached only when the |expand|
1378                   routine has inserted a special marker into the input. In this special
1379                   case, |token_info(iloc)| is known to be a control sequence token, and |token_link(iloc)=null|.
1380                 */
1381                cur_cs = token_info(iloc) - cs_token_flag;
1382                iloc = null;
1383                cur_cmd = eq_type(cur_cs);
1384                if (cur_cmd > max_command_cmd) {
1385                    cur_cmd = relax_cmd;
1386                    cur_chr = no_expand_flag;
1387                    return true;
1388                }
1389            } else {
1390                check_outer_validity();
1391            }
1392        }
1393        cur_chr = equiv(cur_cs);
1394    } else {
1395        cur_cmd = token_cmd(t);
1396        cur_chr = token_chr(t);
1397        switch (cur_cmd) {
1398        case left_brace_cmd:
1399            align_state++;
1400            break;
1401        case right_brace_cmd:
1402            align_state--;
1403            break;
1404        case out_param_cmd:    /* Insert macro parameter and |goto restart|; */
1405            begin_token_list(param_stack[param_start + cur_chr - 1], parameter);
1406            return false;
1407            break;
1408        }
1409    }
1410    return true;
1411}
1412
1413@ Now we're ready to take the plunge into |get_next| itself. Parts of
1414   this routine are executed more often than any other instructions of \TeX.
1415   @^mastication@>@^inner loop@>
1416
1417@ sets |cur_cmd|, |cur_chr|, |cur_cs| to next token
1418
1419@c
1420void get_next(void)
1421{
1422  RESTART:
1423    cur_cs = 0;
1424    if (istate != token_list) {
1425        /* Input from external file, |goto restart| if no input found */
1426        if (!get_next_file())
1427            goto RESTART;
1428    } else {
1429        if (iloc == null) {
1430            end_token_list();
1431            goto RESTART;       /* list exhausted, resume previous level */
1432        } else if (!get_next_tokenlist()) {
1433            goto RESTART;       /* parameter needs to be expanded */
1434        }
1435    }
1436    /* If an alignment entry has just ended, take appropriate action */
1437    if ((cur_cmd == tab_mark_cmd || cur_cmd == car_ret_cmd) && align_state == 0) {
1438        insert_vj_template();
1439        goto RESTART;
1440    }
1441}
1442
1443
1444@ Since |get_next| is used so frequently in \TeX, it is convenient
1445to define three related procedures that do a little more:
1446
1447\yskip\hang|get_token| not only sets |cur_cmd| and |cur_chr|, it
1448also sets |cur_tok|, a packed halfword version of the current token.
1449
1450\yskip\hang|get_x_token|, meaning ``get an expanded token,'' is like
1451|get_token|, but if the current token turns out to be a user-defined
1452control sequence (i.e., a macro call), or a conditional,
1453or something like \.{\\topmark} or \.{\\expandafter} or \.{\\csname},
1454it is eliminated from the input by beginning the expansion of the macro
1455or the evaluation of the conditional.
1456
1457\yskip\hang|x_token| is like |get_x_token| except that it assumes that
1458|get_next| has already been called.
1459
1460\yskip\noindent
1461In fact, these three procedures account for almost every use of |get_next|.
1462
1463No new control sequences will be defined except during a call of
1464|get_token|, or when \.{\\csname} compresses a token list, because
1465|no_new_control_sequence| is always |true| at other times.
1466
1467@c
1468void get_token(void)
1469{                               /* sets |cur_cmd|, |cur_chr|, |cur_tok| */
1470    no_new_control_sequence = false;
1471    get_token_lua();
1472    no_new_control_sequence = true;
1473    if (cur_cs == 0)
1474        cur_tok = token_val(cur_cmd, cur_chr);
1475    else
1476        cur_tok = cs_token_flag + cur_cs;
1477}
1478
1479@ @c
1480void get_token_lua(void)
1481{
1482    register int callback_id;
1483    callback_id = callback_defined(token_filter_callback);
1484    if (callback_id > 0) {
1485        while (istate == token_list && iloc == null && iindex != v_template)
1486            end_token_list();
1487        /* there is some stuff we don't want to see inside the callback */
1488        if (!(istate == token_list &&
1489              ((nofilter == true) || (iindex == backed_up && iloc != null)))) {
1490            do_get_token_lua(callback_id);
1491            return;
1492        }
1493    }
1494    get_next();
1495}
1496
1497
1498@ changes the string |s| to a token list
1499@c
1500halfword string_to_toks(char *ss)
1501{
1502    halfword p;                 /* tail of the token list */
1503    halfword q;                 /* new node being added to the token list via |store_new_token| */
1504    halfword t;                 /* token being appended */
1505    char *s = ss, *se = ss + strlen(s);
1506    p = temp_token_head;
1507    set_token_link(p, null);
1508    while (s < se) {
1509        t = (halfword) str2uni((unsigned char *) s);
1510        s += utf8_size(t);
1511        if (t == ' ')
1512            t = space_token;
1513        else
1514            t = other_token + t;
1515        fast_store_new_token(t);
1516    }
1517    return token_link(temp_token_head);
1518}
1519
1520@ The token lists for macros and for other things like \.{\\mark} and \.{\\output}
1521and \.{\\write} are produced by a procedure called |scan_toks|.
1522
1523Before we get into the details of |scan_toks|, let's consider a much
1524simpler task, that of converting the current string into a token list.
1525The |str_toks| function does this; it classifies spaces as type |spacer|
1526and everything else as type |other_char|.
1527
1528The token list created by |str_toks| begins at |link(temp_token_head)| and ends
1529at the value |p| that is returned. (If |p=temp_token_head|, the list is empty.)
1530
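|lua_str_toks| is almost identical, but it also escapes the characters
that Lua considers special while scanning a literal string: a backslash
and both kinds of quote get a backslash in front of them, and newline and
carriage return are turned into \.{\\n} and \.{\\r}.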
1533
1534@c
1535static halfword lua_str_toks(lstring b)
1536{                               /* changes the string |str_pool[b..pool_ptr]| to a token list */
1537    halfword p;                 /* tail of the token list */
1538    halfword q;                 /* new node being added to the token list via |store_new_token| */
1539    halfword t;                 /* token being appended */
1540    unsigned char *k;           /* index into string */
1541    p = temp_token_head;
1542    set_token_link(p, null);
1543    k = (unsigned char *) b.s;
1544    while (k < (unsigned char *) b.s + b.l) {
1545        t = pool_to_unichar(k);
1546        k += utf8_size(t);
1547        if (t == ' ') {
1548            t = space_token;
1549        } else {
1550            if ((t == '\\') || (t == '"') || (t == '\'') || (t == 10)
1551                || (t == 13))
1552                fast_store_new_token(other_token + '\\');
1553            if (t == 10)
1554                t = 'n';
1555            if (t == 13)
1556                t = 'r';
1557            t = other_token + t;
1558        }
1559        fast_store_new_token(t);
1560    }
1561    return p;
1562}
1563
1564
1565@ Incidentally, the main reason for wanting |str_toks| is the function |the_toks|,
1566which has similar input/output characteristics.
1567
1568@c
1569halfword str_toks(lstring s)
1570{                               /* changes the string |str_pool[b..pool_ptr]| to a token list */
1571    halfword p;                 /* tail of the token list */
1572    halfword q;                 /* new node being added to the token list via |store_new_token| */
1573    halfword t;                 /* token being appended */
1574    unsigned char *k, *l;       /* index into string */
1575    p = temp_token_head;
1576    set_token_link(p, null);
1577    k = s.s;
1578    l = k + s.l;
1579    while (k < l) {
1580        t = pool_to_unichar(k);
1581        k += utf8_size(t);
1582        if (t == ' ')
1583            t = space_token;
1584        else
1585            t = other_token + t;
1586        fast_store_new_token(t);
1587    }
1588    return p;
1589}
1590
1591@ Here's part of the |expand| subroutine that we are now ready to complete:
1592@c
1593void ins_the_toks(void)
1594{
1595    (void) the_toks();
1596    ins_list(token_link(temp_token_head));
1597}
1598
1599@ This routine, used in the next one, prints the job name, possibly
1600modified by the |process_jobname| callback.
1601
1602@c
1603static void print_job_name(void)
1604{
1605   if (job_name) {
1606      char *s, *ss; /* C strings for jobname before and after processing */
1607      int callback_id, lua_retval;
1608      s = (char*)str_string(job_name);
1609      callback_id = callback_defined(process_jobname_callback);
1610      if (callback_id > 0) {
1611        lua_retval = run_callback(callback_id, "S->S", s, &ss);
1612        if ((lua_retval == true) && (ss != NULL))
1613            s = ss;
1614      }
1615      tprint(s);
1616   } else {
1617      print(job_name);
1618   }
1619}
1620
@ Here is a routine that prints the result of a convert command, using
   the argument |i|. It returns |false| if it does not know how to print
   the code |c|. The function exists because lua code and tex code can
   both call it to convert something.
1625
1626@c
1627static boolean print_convert_string(halfword c, int i)
1628{
1629    int ff;                     /* for use with |set_ff| */
1630    boolean ret = true;
1631    switch (c) {
1632    case number_code:
1633        print_int(i);
1634        break;
1635    case uchar_code:
1636        print(i);
1637        break;
1638    case roman_numeral_code:
1639        print_roman_int(i);
1640        break;
1641    case etex_code:
1642        tprint(eTeX_version_string);
1643        break;
1644    case pdftex_revision_code:
1645        tprint(pdftex_revision);
1646        break;
1647    case luatex_revision_code:
1648        print(get_luatexrevision());
1649        break;
1650    case luatex_date_code:
1651        print_int(get_luatex_date_info());
1652        break;
1653    case luatex_banner_code:
1654        tprint(luatex_banner);
1655        break;
1656    case uniform_deviate_code:
1657        print_int(unif_rand(i));
1658        break;
1659    case normal_deviate_code:
1660        print_int(norm_rand());
1661        break;
1662    case format_name_code:
1663        print(format_name);
1664        break;
1665    case job_name_code:
1666        print_job_name();
1667        break;
1668    case font_name_code:
1669        append_string((unsigned char *) font_name(i),
1670                      (unsigned) strlen(font_name(i)));
1671        if (font_size(i) != font_dsize(i)) {
1672            tprint(" at ");
1673            print_scaled(font_size(i));
1674            tprint("pt");
1675        }
1676        break;
1677    case font_id_code:
1678        print_int(i);
1679        break;
1680    case math_style_code:
1681        print_math_style();
1682        break;
1683    case pdf_font_name_code:
1684    case pdf_font_objnum_code:
1685        set_ff(i);
1686        if (c == pdf_font_name_code)
1687            print_int(obj_info(static_pdf, pdf_font_num(ff)));
1688        else
1689            print_int(pdf_font_num(ff));
1690        break;
1691    case pdf_font_size_code:
1692        print_scaled(font_size(i));
1693        tprint("pt");
1694        break;
1695    case pdf_page_ref_code:
1696        print_int(pdf_get_obj(static_pdf, obj_type_page, i, false));
1697        break;
1698    case pdf_xform_name_code:
1699        print_int(obj_info(static_pdf, i));
1700        break;
1701    case eTeX_revision_code:
1702        tprint(eTeX_revision);
1703        break;
1704    default:
1705        ret = false;
1706        break;
1707    }
1708    return ret;
1709}
1710
1711@ @c
1712int scan_lua_state(void) /* hh-ls: optional name or number (not optional name optional number) */
1713{
1714    /* Parse optional lua state integer, or an instance name to be stored in |sn| */
1715    /* Get the next non-blank non-relax non-call token */
1716    int sn = 0;
1717    do {
1718        get_x_token();
1719    } while ((cur_cmd == spacer_cmd) || (cur_cmd == relax_cmd));
1720    back_input();               /* have to push it back, whatever it is  */
1721    if (cur_cmd != left_brace_cmd) {
1722        if (scan_keyword("name")) {
1723            (void) scan_toks(false, true);
1724            sn = def_ref;
1725        } else {
1726            scan_register_num();
1727            if (get_lua_name(cur_val))
1728                sn = (cur_val - 65536);
1729        }
1730    }
1731    return sn;
1732}
1733
1734
1735
1736@ The procedure |conv_toks| uses |str_toks| to insert the token list
1737for |convert| functions into the scanner; `\.{\\outer}' control sequences
1738are allowed to follow `\.{\\string}' and `\.{\\meaning}'.
1739
1740The extra temp string |u| is needed because |pdf_scan_ext_toks| incorporates
1741any pending string in its output. In order to save such a pending string,
1742we have to create a temporary string that is destroyed immediately after.
1743
1744@c
void conv_toks(void)
{
    /* Expands a |convert| command; |cur_chr| selects the conversion. The first
       |switch| scans whatever argument that conversion needs, and a few
       conversions insert their tokens right there and return. All the others
       continue to the second half, which prints the result while
       |selector=new_string|, turns the finished string into tokens with
       |str_toks| and inserts those tokens with |ins_list|. */
    int old_setting;            /* holds |selector| setting */
    halfword p, q;
    int save_scanner_status;    /* |scanner_status| upon entry */
    halfword save_def_ref;      /* |def_ref| upon entry, important if inside `\.{\\message}' */
    halfword save_warning_index;
    boolean bool;               /* temp boolean */
    str_number s;               /* first temp string */
    int sn;                     /* lua chunk name */
    str_number u = 0;           /* third temp string, will become non-nil if a string is already being built */
    int i = 0;                  /* first temp integer */
    int j = 0;                  /* second temp integer */
    int c = cur_chr;            /* desired type of conversion */
    str_number str;
    /* Scan the argument for command |c| */
    switch (c) {
    case uchar_code:
        scan_char_num();
        break;
    case number_code:
    case roman_numeral_code:
        scan_int();
        break;
    case string_code:
    case meaning_code:
        /* fetch the next token with |scanner_status=normal|, then put the old status back */
        save_scanner_status = scanner_status;
        scanner_status = normal;
        get_token();
        scanner_status = save_scanner_status;
        break;
    case etex_code:
        break;
    case font_name_code:
    case font_id_code:
        scan_font_ident();
        break;
    case pdftex_revision_code:
    case luatex_revision_code:
    case luatex_date_code:
    case luatex_banner_code:
        break;
    case pdf_font_name_code:
    case pdf_font_objnum_code:
    case pdf_font_size_code:
        scan_font_ident();
        if (cur_val == null_font)
            pdf_error("font", "invalid font identifier");
        /* the name and object number variants additionally initialize a font that is not yet in use */
        if (c != pdf_font_size_code) {
            pdf_check_vf(cur_val);
            if (!font_used(cur_val))
                pdf_init_font(static_pdf, cur_val);
        }
        break;
    case pdf_page_ref_code:
        scan_int();
        if (cur_val <= 0)
            pdf_error("pageref", "invalid page number");
        break;
    case left_margin_kern_code:
    case right_margin_kern_code:
        scan_int();
        if ((box(cur_val) == null) || (type(box(cur_val)) != hlist_node))
            pdf_error("marginkern", "a non-empty hbox expected");
        break;
    case pdf_xform_name_code:
        scan_int();
        check_obj_type(static_pdf, obj_type_xform, cur_val);
        break;
    case pdf_creation_date_code:
        /* inserted directly; nothing is printed for this code */
        ins_list(string_to_toks(getcreationdate(static_pdf)));
        return;
        break;
    case format_name_code:
    case job_name_code:
        if (job_name == 0)
            open_log_file();
        break;
    case pdf_colorstack_init_code:
        bool = scan_keyword("page");
        if (scan_keyword("direct"))
            cur_val = direct_always;
        else if (scan_keyword("page"))
            cur_val = direct_page;
        else
            cur_val = set_origin;
        save_scanner_status = scanner_status;
        save_warning_index = warning_index;
        save_def_ref = def_ref;
        u = save_cur_string();
        scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
        s = tokens_to_string(def_ref);
        delete_token_ref(def_ref);
        def_ref = save_def_ref;
        warning_index = save_warning_index;
        scanner_status = save_scanner_status;
        cur_val = newcolorstack(s, cur_val, bool);
        flush_str(s);
        cur_val_level = int_val_level;
        /* NOTE(review): |u| is restored only on the error path below; confirm
           whether a pending string must also be restored after a successful
           |newcolorstack| call */
        if (cur_val < 0) {
            print_err("Too many color stacks");
            help2("The number of color stacks is limited to 32768.",
                  "I'll use the default color stack 0 here.");
            error();
            cur_val = 0;
            restore_cur_string(u);
        }
        break;
    case uniform_deviate_code:
        scan_int();
        break;
    case normal_deviate_code:
        break;
    case lua_escape_string_code:
        {
            lstring escstr;
            int l = 0;
            save_scanner_status = scanner_status;
            save_def_ref = def_ref;
            save_warning_index = warning_index;
            scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
            /* |in_lua_escape| is switched on only for the duration of the conversion to a C string */
            bool = in_lua_escape;
            in_lua_escape = true;
            escstr.s = (unsigned char *) tokenlist_to_cstring(def_ref, false, &l);
            escstr.l = (unsigned) l;
            in_lua_escape = bool;
            delete_token_ref(def_ref);
            def_ref = save_def_ref;
            warning_index = save_warning_index;
            scanner_status = save_scanner_status;
            /* build the escaped token list, insert it, and release the C string */
            (void) lua_str_toks(escstr);
            ins_list(token_link(temp_token_head));
            free(escstr.s);
            return;
        }
        break;
    case math_style_code:
        break;
    case expanded_code:
        /* the scanned token list itself is inserted; nothing is printed */
        save_scanner_status = scanner_status;
        save_warning_index = warning_index;
        save_def_ref = def_ref;
        u = save_cur_string();
        scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
        warning_index = save_warning_index;
        scanner_status = save_scanner_status;
        ins_list(token_link(def_ref));
        def_ref = save_def_ref;
        restore_cur_string(u);
        return;
        break;
    case lua_code:
        u = save_cur_string();
        save_scanner_status = scanner_status;
        save_def_ref = def_ref;
        save_warning_index = warning_index;
        sn = scan_lua_state();
        scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
        s = def_ref;
        warning_index = save_warning_index;
        def_ref = save_def_ref;
        scanner_status = save_scanner_status;
        /* |luacstrings| is reset before the call and checked afterwards */
        luacstrings = 0;
        luatokencall(s, sn);
        delete_token_ref(s);
        restore_cur_string(u);  /* TODO: check this, was different */
        if (luacstrings > 0)
            lua_string_start();
        return;
        break;
    case lua_function_code:
        scan_int();
        if (cur_val <= 0) {
            pdf_error("luafunction", "invalid number");
        } else {
            u = save_cur_string();
            luacstrings = 0;
            luafunctioncall(cur_val);
            restore_cur_string(u);
            if (luacstrings > 0)
                lua_string_start();
        }
        return;
        break;
    case pdf_insert_ht_code:
        scan_register_num();
        break;
    case pdf_ximage_bbox_code:
        /* two integers: the image object (kept in |i| as its data pointer) and the bbox component |j| in 1..4 */
        scan_int();
        check_obj_type(static_pdf, obj_type_ximage, cur_val);
        i = obj_data_ptr(static_pdf, cur_val);
        scan_int();
        j = cur_val;
        if ((j < 1) || (j > 4))
            pdf_error("pdfximagebbox", "invalid parameter");
        break;
        /* Cases of 'Scan the argument for command |c|' */
    case eTeX_revision_code:
        break;
    default:
        confusion("convert");
        break;
    }

    /* from here on everything printed is collected in a new string */
    old_setting = selector;
    selector = new_string;

    /* Print the result of command |c| */
    if (!print_convert_string(c, cur_val)) {
        /* the codes below are the ones |print_convert_string| does not handle */
        switch (c) {
        case string_code:
            if (cur_cs != 0)
                sprint_cs(cur_cs);
            else
                print(cur_chr);
            break;
        case meaning_code:
            print_meaning();
            break;
        case left_margin_kern_code:
            /* step over a leading glue node of the expected subtype, then look for a left margin kern */
            p = list_ptr(box(cur_val));
            if ((p != null) && (!is_char_node(p)) &&
                (type(p) == glue_node) && (subtype(p) == left_skip_code + 1))
                p = vlink(p);
            if ((p != null) && (!is_char_node(p)) &&
                (type(p) == margin_kern_node) && (subtype(p) == left_side))
                print_scaled(width(p));
            else
                print_char('0');
            tprint("pt");
            break;
        case right_margin_kern_code:
            /* same idea from the right end, using |prev_rightmost| to walk backwards */
            q = list_ptr(box(cur_val));
            p = null;
            if (q != null) {
                p = prev_rightmost(q, null);
                if ((p != null) && (!is_char_node(p)) && (type(p) == glue_node)
                    && (subtype(p) == right_skip_code + 1))
                    p = prev_rightmost(q, p);
            }
            if ((p != null) && (!is_char_node(p)) &&
                (type(p) == margin_kern_node) && (subtype(p) == right_side))
                print_scaled(width(p));
            else
                print_char('0');
            tprint("pt");
            break;
        case pdf_colorstack_init_code:
            print_int(cur_val);
            break;
        case pdf_insert_ht_code:
            /* walk the list at |page_ins_head| while the next subtype does not exceed |i| */
            i = cur_val;
            p = page_ins_head;
            while (i >= subtype(vlink(p)))
                p = vlink(p);
            if (subtype(p) == i)
                print_scaled(height(p));
            else
                print_char('0');
            tprint("pt");
            break;
        case pdf_ximage_bbox_code:
            /* |j| selects the component: 1 and 2 print the origin, 3 and 4 the origin plus the size */
            if (is_pdf_image(i)) {
                switch (j) {
                case 1:
                    print_scaled(epdf_orig_x(i));
                    break;
                case 2:
                    print_scaled(epdf_orig_y(i));
                    break;
                case 3:
                    print_scaled(epdf_orig_x(i) + epdf_xsize(i));
                    break;
                case 4:
                    print_scaled(epdf_orig_y(i) + epdf_ysize(i));
                    break;
                }
            } else {
                print_scaled(0);
            }
            tprint("pt");
            break;
        case pdf_creation_date_code:
        case lua_escape_string_code:
        case lua_code:
        case lua_function_code:
        case expanded_code:
            /* these codes all returned from the first |switch| */
            break;
        default:
            confusion("convert");
            break;
        }
    }

    /* turn what was printed into tokens, drop the temporary string, insert the tokens */
    selector = old_setting;
    str = make_string();
    (void) str_toks(str_lstring(str));
    flush_str(str);
    ins_list(token_link(temp_token_head));
}
2045
2046@ This boolean is keeping track of the lua string escape state
2047@c
boolean in_lua_escape;          /* set by |conv_toks| around its |tokenlist_to_cstring| call for |lua_escape_string_code|; while set, |tokenlist_to_cstring| prints a |mac_param_cmd| character once instead of twice */
2049
2050@ probably not needed anymore
2051@c
2052boolean is_convert(halfword c)
2053{
2054    return (c == convert_cmd);
2055}
2056
2057str_number the_convert_string(halfword c, int i)
2058{
2059    int old_setting;            /* saved |selector| setting */
2060    str_number ret = 0;
2061    old_setting = selector;
2062    selector = new_string;
2063    if (print_convert_string(c, i)) {
2064        ret = make_string();
2065    } else if (c == font_identifier_code) {
2066        print_font_identifier(i);
2067        ret = make_string();
2068    }
2069    selector = old_setting;
2070    return ret;
2071}
2072
2073@ Another way to create a token list is via the \.{\\read} command. The
2074sixteen files potentially usable for reading appear in the following
2075global variables. The value of |read_open[n]| will be |closed| if
2076stream number |n| has not been opened or if it has been fully read;
2077|just_open| if an \.{\\openin} but not a \.{\\read} has been done;
2078and |normal| if it is open and ready to read the next line.
2079
2080@c
2081FILE *read_file[16];            /* used for \.{\\read} */
2082int read_open[17];              /* state of |read_file[n]| */
2083
2084void initialize_read(void)
2085{
2086    int k;
2087    for (k = 0; k <= 16; k++)
2088        read_open[k] = closed;
2089}
2090
2091@ The |read_toks| procedure constructs a token list like that for any
2092macro definition, and makes |cur_val| point to it. Parameter |r| points
2093to the control sequence that will receive this token list.
2094
2095@c
void read_toks(int n, halfword r, halfword j)
{
    /* Builds a token list from input stream |n| and leaves a pointer to it in
       |cur_val| (and |def_ref|). |r| is the control sequence being defined; it
       becomes |warning_index| and is shown in the terminal prompt. When |j| is 1
       the line is stored character by character (spaces as |space_token|,
       everything else as |other_char|); otherwise the line is tokenized with
       |get_token|. Lines keep being read until |align_state| is back at 1000000.
       The caller's |align_state| is restored on exit and |scanner_status| is
       left at |normal|. */
    halfword p;                 /* tail of the token list */
    halfword q;                 /* new node being added to the token list via |store_new_token| */
    int s;                      /* saved value of |align_state| */
    int m;                      /* stream number */
    scanner_status = defining;
    warning_index = r;
    p = get_avail();
    def_ref = p;
    set_token_ref_count(def_ref, 0);
    p = def_ref;                /* the reference count */
    store_new_token(end_match_token);
    /* stream numbers outside 0..15 all use slot 16 */
    if ((n < 0) || (n > 15))
        m = 16;
    else
        m = n;
    s = align_state;
    align_state = 1000000;      /* disable tab marks, etc. */
    do {
        /* Input and store tokens from the next line of the file */
        begin_file_reading();
        iname = m + 1;
        if (read_open[m] == closed) {
            /* Input for \.{\\read} from the terminal */
            /* Here we input on-line into the |buffer| array, prompting the user explicitly
               if |n>=0|.  The value of |n| is set negative so that additional prompts
               will not be given in the case of multi-line input. */
            if (interaction > nonstop_mode) {
                if (n < 0) {
                    prompt_input("");
                } else {
                    wake_up_terminal();
                    print_ln();
                    sprint_cs(r);
                    prompt_input(" =");
                    n = -1;
                }
            } else {
                fatal_error
                    ("*** (cannot \\read from terminal in nonstop modes)");
            }

        } else if (read_open[m] == just_open) {
            /* Input the first line of |read_file[m]| */
            /* The first line of a file must be treated specially, since |lua_input_ln|
               must be told not to start with |get|. */
            if (lua_input_ln(read_file[m], (m + 1), false)) {
                read_open[m] = normal;
            } else {
                lua_a_close_in(read_file[m], (m + 1));
                read_open[m] = closed;
            }

        } else {
            /* Input the next line of |read_file[m]| */
            /*  An empty line is appended at the end of a |read_file|. */
            if (!lua_input_ln(read_file[m], (m + 1), true)) {
                lua_a_close_in(read_file[m], (m + 1));
                read_open[m] = closed;
                /* the file ended while braces were still open: report it and reset |align_state| */
                if (align_state != 1000000) {
                    runaway();
                    print_err("File ended within \\read");
                    help1("This \\read has unbalanced braces.");
                    align_state = 1000000;
                    error();
                }
            }

        }
        /* either drop the last position or store the end-of-line character there */
        ilimit = last;
        if (end_line_char_inactive)
            decr(ilimit);
        else
            buffer[ilimit] = (packed_ASCII_code) int_par(end_line_char_code);
        first = ilimit + 1;
        iloc = istart;
        istate = new_line;
        /* Handle \.{\\readline} and |goto done|; */
        if (j == 1) {
            while (iloc <= ilimit) {    /* current line not yet finished */
		do_buffer_to_unichar(cur_chr, iloc);
                if (cur_chr == ' ')
                    cur_tok = space_token;
                else
                    cur_tok = cur_chr + other_token;
                store_new_token(cur_tok);
            }
        } else {
            while (1) {
                get_token();
                if (cur_tok == 0)
                    break;      /* |cur_cmd=cur_chr=0| will occur at the end of the line */
                if (align_state < 1000000) {    /* unmatched `\.\}' aborts the line */
                    /* discard the rest of the line */
                    do {
                        get_token();
                    } while (cur_tok != 0);
                    align_state = 1000000;
                    break;
                }
                store_new_token(cur_tok);
            }
        }
        end_file_reading();

    } while (align_state != 1000000);
    cur_val = def_ref;
    scanner_status = normal;
    align_state = s;
}
2206
2207@ @c
2208str_number tokens_to_string(halfword p)
2209{                               /* return a string from tokens list */
2210    int old_setting;
2211    if (selector == new_string)
2212        pdf_error("tokens",
2213                  "tokens_to_string() called while selector = new_string");
2214    old_setting = selector;
2215    selector = new_string;
2216    show_token_list(token_link(p), null, -1);
2217    selector = old_setting;
2218    return make_string();
2219}
2220
2221@ @c
/* The macros below append to a growing byte buffer. They rely on these names
   being in scope at the point of use: |ret| (the buffer), |i| (the number of
   bytes written so far), |alloci| (the allocated size) and, for |Print_esc|,
   |e| (the escape character). */

/* Make sure |ret| can take |a| more bytes plus a terminating zero. The buffer
   grows in steps of 64 bytes, as many steps as are needed, so a request larger
   than one step no longer overruns the allocation. */
#define make_room(a) do {                                               \
    unsigned make_room_need = (unsigned) i + (unsigned) (a) + 1;        \
    if (make_room_need > alloci) {                                      \
        while (alloci < make_room_need)                                 \
            alloci = alloci + 64;                                       \
        ret = xrealloc(ret, alloci);                                    \
    }                                                                   \
  } while (0)


/* Store one byte without checking for room. */
#define append_i_byte(a) ret[i++] = (char)(a)

/* Store one byte after making room for it; a single statement, so it is safe
   as the body of an unbraced |if| or |while|. */
#define Print_char(a) do { make_room(1); append_i_byte(a); } while (0)

/* Store the code point |s| as UTF-8 (one to four bytes). Values of 0x110000
   and above are stored as the single byte |s-0x110000|. The argument is
   evaluated once. */
#define Print_uchar(s) {                                                      \
    const int print_uchar_v = (s);                                            \
    make_room(4);                                                             \
    if (print_uchar_v <= 0x7F) {                                              \
      append_i_byte(print_uchar_v);                                           \
    } else if (print_uchar_v <= 0x7FF) {                                      \
      append_i_byte(0xC0 + (print_uchar_v / 0x40));                           \
      append_i_byte(0x80 + (print_uchar_v % 0x40));                           \
    } else if (print_uchar_v <= 0xFFFF) {                                     \
      append_i_byte(0xE0 + (print_uchar_v / 0x1000));                         \
      append_i_byte(0x80 + ((print_uchar_v % 0x1000) / 0x40));                \
      append_i_byte(0x80 + ((print_uchar_v % 0x1000) % 0x40));                \
    } else if (print_uchar_v >= 0x110000) {                                   \
      append_i_byte(print_uchar_v - 0x110000);                                \
    } else {                                                                  \
      append_i_byte(0xF0 + (print_uchar_v / 0x40000));                        \
      append_i_byte(0x80 + ((print_uchar_v % 0x40000) / 0x1000));             \
      append_i_byte(0x80 + (((print_uchar_v % 0x40000) % 0x1000) / 0x40));    \
      append_i_byte(0x80 + (((print_uchar_v % 0x40000) % 0x1000) % 0x40));    \
    } }


/* Store the escape character |e| (only when |0<e<STRING_OFFSET|) followed by
   the bytes of the C string |b|. */
#define Print_esc(b) {                                          \
    const char *v = b;                                          \
    if (e>0 && e<STRING_OFFSET) {                               \
        Print_uchar (e);                                        \
    }                                                           \
    make_room(strlen(v));                                       \
    while (*v) { append_i_byte(*v); v++; }                      \
  }

/* True when the first character of string |a| has category code 11. */
#define is_cat_letter(a)                                                \
    (get_char_cat_code(pool_to_unichar(str_string((a)))) == 11)
2265
@ The actual token conversion in this function is now functionally
   equivalent to |show_token_list|, except that it always prints the
   whole token list.
   TODO: check whether this causes problems in the lua library.
2270
2271@c
char *tokenlist_to_cstring(int pp, int inhibit_par, int *siz)
{
    /* Renders the token list whose reference count node is |pp| as a zero
       terminated C string in a buffer obtained from |xmalloc|, and returns that
       buffer. If |siz| is not NULL it receives the number of bytes written, not
       counting the terminator. A null |pp| gives a NULL result with |*siz=0|.
       When |inhibit_par| is nonzero, |par_token| is skipped instead of printed.
       The locals |ret|, |i|, |alloci| and |e| are used by the |Print_...|
       macros, so they must keep these names. */
    register int p, c, m;
    int q;
    int infop;
    char *s, *sh;
    int e = 0;                  /* escape character; stays 0 for an empty list */
    char *ret;                  /* output buffer */
    int match_chr = '#';        /* character printed in front of an |out_param_cmd| number */
    int n = '0';                /* digit printed for the most recent |match_cmd| */
    unsigned alloci = 1024;     /* allocated size of |ret| */
    int i = 0;                  /* bytes written to |ret| so far */
    p = pp;
    if (p == null) {
        if (siz != NULL)
            *siz = 0;
        return NULL;
    }
    ret = xmalloc(alloci);
    p = token_link(p);          /* skip refcount */
    if (p != null) {
        e = int_par(escape_char_code);
    }
    while (p != null) {
        /* a pointer outside the token memory ends the conversion */
        if (p < (int) fix_mem_min || p > (int) fix_mem_end) {
            Print_esc("CLOBBERED.");
            break;
        }
        infop = token_info(p);
        if (infop >= cs_token_flag) {
            /* a control sequence token */
            if (!(inhibit_par && infop == par_token)) {
                q = infop - cs_token_flag;
                if (q < hash_base) {
                    if (q == null_cs) {
                        Print_esc("csname");
                        Print_esc("endcsname");
                    } else {
                        Print_esc("IMPOSSIBLE.");
                    }
                } else if ((q >= undefined_control_sequence)
                           && ((q <= eqtb_size)
                               || (q > eqtb_size + hash_extra))) {
                    Print_esc("IMPOSSIBLE.");
                } else if ((cs_text(q) < 0) || (cs_text(q) >= str_ptr)) {
                    Print_esc("NONEXISTENT.");
                } else {
                    str_number txt = cs_text(q);
                    sh = makecstring(txt);      /* freed below, once the name has been copied */
                    s = sh;
                    if (is_active_cs(txt)) {
                        /* the first three bytes of an active character's name are skipped and no escape character is printed */
                        s = s + 3;
                        while (*s) {
                            Print_char(*s);
                            s++;
                        }
                    } else {
                        if (e>=0 && e<0x110000) Print_uchar(e);
                        while (*s) {
                            Print_char(*s);
                            s++;
                        }
                        /* a space follows unless the name is a single character that is not a letter */
                        if ((!single_letter(txt)) || is_cat_letter(txt)) {
                            Print_char(' ');
                        }
                    }
                    free(sh);
                }
            }
        } else {
            /* a character token: split it into command |m| and character |c| */
            if (infop < 0) {
                Print_esc("BAD.");
            } else {
                m = token_cmd(infop);
                c = token_chr(infop);
                switch (m) {
                case left_brace_cmd:
                case right_brace_cmd:
                case math_shift_cmd:
                case tab_mark_cmd:
                case sup_mark_cmd:
                case sub_mark_cmd:
                case spacer_cmd:
                case letter_cmd:
                case other_char_cmd:
                    Print_uchar(c);
                    break;
                case mac_param_cmd:
                    /* printed twice, except while |in_lua_escape| is set */
                    if (!in_lua_escape)
                        Print_uchar(c);
                    Print_uchar(c);
                    break;
                case out_param_cmd:
                    Print_uchar(match_chr);
                    if (c <= 9) {
                        Print_char(c + '0');
                    } else {
                        /* an impossible parameter number ends the conversion */
                        Print_char('!');
                        goto EXIT;
                    }
                    break;
                case match_cmd:
                    match_chr = c;
                    Print_uchar(c);
                    n++;
                    Print_char(n);
                    /* more than nine parameters ends the conversion */
                    if (n > '9')
                        goto EXIT;
                    break;
                case end_match_cmd:
                    if (c == 0) {
                        Print_char('-');
                        Print_char('>');
                    }
                    break;
                default:
                    Print_esc("BAD.");
                    break;
                }
            }
        }
        p = token_link(p);
    }
  EXIT:
    /* terminate the string and report its length */
    ret[i] = '\0';
    if (siz != NULL)
        *siz = i;
    return ret;
}
2400
2401@ @c
2402lstring *tokenlist_to_lstring(int pp, int inhibit_par)
2403{
2404    int siz;
2405    lstring *ret = xmalloc(sizeof(lstring));
2406    ret->s = (unsigned char *) tokenlist_to_cstring(pp, inhibit_par, &siz);
2407    ret->l = (size_t) siz;
2408    return ret;
2409}
2410
2411@ @c
2412void free_lstring(lstring * ls)
2413{
2414    if (ls == NULL)
2415        return;
2416    if (ls->s != NULL)
2417        free(ls->s);
2418    free(ls);
2419}
2420