1% primitive.w
2%
3% Copyright 2008-2010 Taco Hoekwater <taco@@luatex.org>
4%
5% This file is part of LuaTeX.
6%
7% LuaTeX is free software; you can redistribute it and/or modify it under
8% the terms of the GNU General Public License as published by the Free
9% Software Foundation; either version 2 of the License, or (at your
10% option) any later version.
11%
12% LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14% FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15% License for more details.
16%
17% You should have received a copy of the GNU General Public License along
18% with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
19
20@ @c
21
22
23#include "ptexlib.h"
24
25@ Control sequences are stored and retrieved by means of a fairly standard hash
26table algorithm called the method of ``coalescing lists'' (cf.\ Algorithm 6.4C
27in {\sl The Art of Computer Programming\/}). Once a control sequence enters the
28table, it is never removed, because there are complicated situations
29involving \.{\\gdef} where the removal of a control sequence at the end of
30a group would be a mistake preventable only by the introduction of a
31complicated reference-count mechanism.
32
33The actual sequence of letters forming a control sequence identifier is
34stored in the |str_pool| array together with all the other strings. An
35auxiliary array |hash| consists of items with two halfword fields per
36word. The first of these, called |next(p)|, points to the next identifier
37belonging to the same coalesced list as the identifier corresponding to~|p|;
38and the other, called |text(p)|, points to the |str_start| entry for
39|p|'s identifier. If position~|p| of the hash table is empty, we have
40|text(p)=0|; if position |p| is either empty or the end of a coalesced
41hash list, we have |next(p)=0|. An auxiliary pointer variable called
42|hash_used| is maintained in such a way that all locations |p>=hash_used|
43are nonempty. The global variable |cs_count| tells how many multiletter
44control sequences have been defined, if statistics are being kept.
45
46A global boolean variable called |no_new_control_sequence| is set to
47|true| during the time that new hash table entries are forbidden.
48
49@c
50two_halves *hash;               /* the hash table */
51halfword hash_used;             /* allocation pointer for |hash| */
52int hash_extra;                 /* |hash_extra=hash| above |eqtb_size| */
53halfword hash_top;              /* maximum of the hash array */
54halfword hash_high;             /* pointer to next high hash location */
55boolean no_new_control_sequence;        /* are new identifiers legal? */
56int cs_count;                   /* total number of known identifiers */
57
58#define hash_is_full (hash_used==hash_base)     /* test if all positions are occupied */
59
60@ \.{\\primitive} support needs a few extra variables and definitions
61
62@c
63#define prim_base 1
64
65@ The arrays |prim| and |prim_eqtb| are used for name -> cmd,chr lookups.
66
67 The are  modelled after |hash| and |eqtb|, except that primitives do not
68  have an |eq_level|, that field is replaced by |origin|.
69
70@c
71#define prim_next(a) prim[(a)].lhfield  /* link for coalesced lists */
72#define prim_text(a) prim[(a)].rh       /* string number for control sequence name */
73#define prim_is_full (prim_used==prim_base)     /* test if all positions are occupied */
74
75#define prim_origin_field(a) (a).hh.b1
76#define prim_eq_type_field(a)  (a).hh.b0
77#define prim_equiv_field(a) (a).hh.rh
78#define prim_origin(a) prim_origin_field(prim_eqtb[(a)])        /* level of definition */
79#define prim_eq_type(a) prim_eq_type_field(prim_eqtb[(a)])      /* command code for equivalent */
80#define prim_equiv(a) prim_equiv_field(prim_eqtb[(a)])  /* equivalent value */
81
82static pointer prim_used;       /* allocation pointer for |prim| */
83static two_halves prim[(prim_size + 1)];        /* the primitives table */
84static memory_word prim_eqtb[(prim_size + 1)];
85
86@ The array |prim_data| works the other way around, it is used for
87   cmd,chr -> name lookups.
88
89@c
90typedef struct prim_info {
91    halfword subids;            /* number of name entries */
92    halfword offset;            /* offset to be used for |chr_code|s */
93    str_number *names;          /* array of names */
94} prim_info;
95
96static prim_info prim_data[(last_cmd + 1)];
97
98@ initialize the memory arrays
99@c
100void init_primitives(void)
101{
102    int k;
103    memset(prim_data, 0, (sizeof(prim_info) * (last_cmd + 1)));
104    memset(prim, 0, (sizeof(two_halves) * (prim_size + 1)));
105    memset(prim_eqtb, 0, (sizeof(memory_word) * (prim_size + 1)));
106    for (k = 0; k <= prim_size; k++)
107        prim_eq_type(k) = undefined_cs_cmd;
108}
109
110void ini_init_primitives(void)
111{
112    prim_used = prim_size;      /* nothing is used */
113}
114
115
116@ The value of |hash_prime| should be roughly 85\%! of |hash_size|, and it
117   should be a prime number.  The theory of hashing tells us to expect fewer
118   than two table probes, on the average, when the search is successful.
119   [See J.~S. Vitter, {\sl Journal of the ACM\/ \bf30} (1983), 231--258.]
120   @^Vitter, Jeffrey Scott@>
121
122@c
123static halfword compute_hash(const char *j, unsigned int l,
124                             halfword prime_number)
125{
126    int k;
127    halfword h = (unsigned char) *j;
128    for (k = 1; k <= (int)(l - 1); k++) {
129        h = h + h + (unsigned char) *(j + k);
130        while (h >= prime_number)
131            h = h - prime_number;
132    }
133    return h;
134}
135
136
137@ Here is the subroutine that searches the primitive table for an identifier
138@c
139pointer prim_lookup(str_number s)
140{
141    int h;                      /* hash code */
142    pointer p;                  /* index in |hash| array */
143    unsigned char *j;
144    unsigned l;
145    if (s < STRING_OFFSET) {
146        p = s;
147        if ((p < 0) || (get_prim_eq_type(p) == undefined_cs_cmd)) {
148            p = undefined_primitive;
149        }
150    } else {
151        j = str_string(s);
152        l = (unsigned) str_length(s);
153        h = compute_hash((char *) j, l, prim_prime);
154        p = h + prim_base;      /* we start searching here; note that |0<=h<hash_prime| */
155        while (1) {
156            if (prim_text(p) > 0)
157                if (str_length(prim_text(p)) == l)
158                    if (str_eq_str(prim_text(p), s))
159                        goto FOUND;
160            if (prim_next(p) == 0) {
161                if (no_new_control_sequence) {
162                    p = undefined_primitive;
163                } else {
164                    /* Insert a new primitive after |p|, then make |p| point to it */
165                    if (prim_text(p) > 0) {
166                        do {    /* search for an empty location in |prim| */
167                            if (prim_is_full)
168                                overflow("primitive size", prim_size);
169                            decr(prim_used);
170                        } while (prim_text(prim_used) != 0);
171                        prim_next(p) = prim_used;
172                        p = prim_used;
173                    }
174                    prim_text(p) = s;
175                }
176                goto FOUND;
177            }
178            p = prim_next(p);
179        }
180    }
181  FOUND:
182    return p;
183}
184
185@ how to test a csname for primitive-ness
186@c
187boolean is_primitive(str_number csname)
188{
189    int n, m;
190    char *ss;
191    m = prim_lookup(csname);
192    ss = makecstring(csname);
193    n = string_lookup(ss, str_length(csname));
194    free(ss);
195    return ((n != undefined_cs_cmd) &&
196            (m != undefined_primitive) &&
197            (eq_type(n) == prim_eq_type(m)) && (equiv(n) == prim_equiv(m)));
198}
199
200
201@ a few simple accessors
202@c
203quarterword get_prim_eq_type(int p)
204{
205    return prim_eq_type(p);
206}
207
208quarterword get_prim_origin(int p)
209{
210    return prim_origin(p);
211}
212
213halfword get_prim_equiv(int p)
214{
215    return prim_equiv(p);
216}
217
218str_number get_prim_text(int p)
219{
220    return prim_text(p);
221}
222
223
224@ dumping and undumping
225@c
226void dump_primitives(void)
227{
228    int p, q;
229    for (p = 0; p <= prim_size; p++)
230        dump_hh(prim[p]);
231    for (p = 0; p <= prim_size; p++)
232        dump_wd(prim_eqtb[p]);
233    for (p = 0; p <= last_cmd; p++) {
234        dump_int(prim_data[p].offset);
235        dump_int(prim_data[p].subids);
236        for (q = 0; q < prim_data[p].subids; q++) {
237            dump_int(prim_data[p].names[q]);
238        }
239    }
240}
241
242void undump_primitives(void)
243{
244    int p, q;
245    for (p = 0; p <= prim_size; p++)
246        undump_hh(prim[p]);
247    for (p = 0; p <= prim_size; p++)
248        undump_wd(prim_eqtb[p]);
249
250    for (p = 0; p <= last_cmd; p++) {
251        undump_int(prim_data[p].offset);
252        undump_int(prim_data[p].subids);
253        if (prim_data[p].subids > 0) {
254            prim_data[p].names = (str_number *)
255                xmalloc((unsigned)
256                        ((unsigned) prim_data[p].subids *
257                         sizeof(str_number *)));
258            for (q = 0; q < prim_data[p].subids; q++)
259                undump_int(prim_data[p].names[q]);
260        }
261    }
262}
263
264@   We need to put \TeX's ``primitive'' control sequences into the hash
265   table, together with their command code (which will be the |eq_type|)
266   and an operand (which will be the |equiv|). The |primitive| procedure
267   does this, in a way that no \TeX\ user can. The global value |cur_val|
268   contains the new |eqtb| pointer after |primitive| has acted.
269
270
271@  Because the definitions of the actual user-accessible name of a
272   primitive can be postponed until runtime, the function |primitive_def|
273   is needed that does nothing except creating the control sequence name.
274
275@c
276void primitive_def(const char *s, size_t l, quarterword c, halfword o)
277{
278    int nncs = no_new_control_sequence;
279    no_new_control_sequence = false;
280    cur_val = string_lookup(s, l);      /* this creates the |text()| string */
281    no_new_control_sequence = nncs;
282    eq_level(cur_val) = level_one;
283    eq_type(cur_val) = c;
284    equiv(cur_val) = o;
285}
286
287@ The function |store_primitive_name| sets up the bookkeeping for the
288   reverse lookup. It is quite paranoid, because it is easy to mess this up
289   accidentally.
290
291   The |offset| is needed because sometimes character codes (in |o|)
292   are indices into |eqtb| or are offset by a magical value to make
293   sure they do not conflict with something else. We don't want the
294   |prim_data[c].names| to have too many entries as it will just be
295   wasted room, so |offset| is substracted from |o| because creating
296   or accessing the array. The |assert(idx<=0xFFFF)| is not strictly
297   needed, but it helps catch errors of this kind.
298
299@c
300static void
301store_primitive_name(str_number s, quarterword c, halfword o, halfword offset)
302{
303    int idx;
304    if (prim_data[c].offset != 0 && prim_data[c].offset != offset) {
305        assert(false);
306    }
307    prim_data[c].offset = offset;
308    idx = ((int) o - offset);
309    assert(idx >= 0);
310    assert(idx <= 0xFFFF);
311    if (prim_data[c].subids < (idx + 1)) {
312        str_number *new =
313            (str_number *) xcalloc((unsigned) (idx + 1), sizeof(str_number *));
314        if (prim_data[c].names != NULL) {
315            assert(prim_data[c].subids);
316            memcpy(new, (prim_data[c].names),
317                   (unsigned) (prim_data[c].subids) * sizeof(str_number));
318            free(prim_data[c].names);
319        }
320        prim_data[c].names = new;
321        prim_data[c].subids = idx + 1;
322    }
323    prim_data[c].names[idx] = s;
324}
325
326@ Compared to tex82, |primitive| has two extra parameters. The |off| is an offset
327   that will be passed on to |store_primitive_name|, the |cmd_origin| is the bit
328   that is used to group primitives by originator.
329
330@c
331void
332primitive(const char *thes, quarterword c, halfword o, halfword off,
333          int cmd_origin)
334{
335    int prim_val;               /* needed to fill |prim_eqtb| */
336    str_number ss;
337    assert(o >= off);
338    ss = maketexstring(thes);
339    if (cmd_origin == tex_command || cmd_origin == core_command) {
340        primitive_def(thes, strlen(thes), c, o);
341    }
342    prim_val = prim_lookup(ss);
343    prim_origin(prim_val) = (quarterword) cmd_origin;
344    prim_eq_type(prim_val) = c;
345    prim_equiv(prim_val) = o;
346    store_primitive_name(ss, c, o, off);
347}
348
349
350
351@ Here is a helper that does the actual hash insertion.
352
353@c
354static halfword insert_id(halfword p, const unsigned char *j, unsigned int l)
355{
356    unsigned saved_cur_length;
357    unsigned saved_cur_string_size;
358    unsigned char *saved_cur_string;
359    const unsigned char *k;
360    /* This code far from ideal: the existance of |hash_extra| changes
361       all the potential (short) coalesced lists into a single (long)
362       one. This will create a slowdown. */
363    if (cs_text(p) > 0) {
364        if (hash_high < hash_extra) {
365            incr(hash_high);
366            /* can't use |eqtb_top| here (perhaps because that is not finalized
367               yet when called from |primitive|?) */
368            cs_next(p) = hash_high + eqtb_size;
369            p = cs_next(p);
370        } else {
371            do {
372                if (hash_is_full)
373                    overflow("hash size", (unsigned) (hash_size + hash_extra));
374                decr(hash_used);
375            } while (cs_text(hash_used) != 0);  /* search for an empty location in |hash| */
376            cs_next(p) = hash_used;
377            p = hash_used;
378        }
379    }
380    saved_cur_length = cur_length;
381    saved_cur_string = cur_string;
382    saved_cur_string_size = cur_string_size;
383    reset_cur_string();
384    for (k = j; k <= j + l - 1; k++)
385        append_char(*k);
386    cs_text(p) = make_string();
387    cur_length = saved_cur_length;
388    xfree(cur_string);
389    cur_string = saved_cur_string;
390    cur_string_size = saved_cur_string_size;
391    incr(cs_count);
392    return p;
393}
394
395
396@ Here is the subroutine that searches the hash table for an identifier
397 that matches a given string of length |l>1| appearing in |buffer[j..
398 (j+l-1)]|. If the identifier is found, the corresponding hash table address
399 is returned. Otherwise, if the global variable |no_new_control_sequence|
400 is |true|, the dummy address |undefined_control_sequence| is returned.
401 Otherwise the identifier is inserted into the hash table and its location
402 is returned.
403
404@c
405pointer id_lookup(int j, int l)
406{                               /* search the hash table */
407    int h;                      /* hash code */
408    pointer p;                  /* index in |hash| array */
409
410    h = compute_hash((char *) (buffer + j), (unsigned) l, hash_prime);
411#ifdef VERBOSE
412    {
413        unsigned char *todo = xmalloc(l + 2);
414        strncpy(todo, (buffer + j), l);
415        todo[l] = '\0';
416        todo[l + 1] = '\0';
417        fprintf(stdout, "id_lookup(%s)\n", todo);
418        free(todo);
419    }
420#endif
421    p = h + hash_base;          /* we start searching here; note that |0<=h<hash_prime| */
422    while (1) {
423        if (cs_text(p) > 0)
424            if (str_length(cs_text(p)) == (unsigned) l)
425                if (str_eq_buf(cs_text(p), j))
426                    goto FOUND;
427        if (cs_next(p) == 0) {
428            if (no_new_control_sequence) {
429                p = undefined_control_sequence;
430            } else {
431                p = insert_id(p, (buffer + j), (unsigned) l);
432            }
433            goto FOUND;
434        }
435        p = cs_next(p);
436    }
437  FOUND:
438    return p;
439}
440
441@ Here is a similar subroutine for finding a primitive in the hash.
442This one is based on a C string.
443
444@c
445pointer string_lookup(const char *s, size_t l)
446{                               /* search the hash table */
447    int h;                      /* hash code */
448    pointer p;                  /* index in |hash| array */
449    h = compute_hash(s, (unsigned) l, hash_prime);
450    p = h + hash_base;          /* we start searching here; note that |0<=h<hash_prime| */
451    while (1) {
452        if (cs_text(p) > 0)
453            if (str_eq_cstr(cs_text(p), s, l))
454                goto FOUND;
455        if (cs_next(p) == 0) {
456            if (no_new_control_sequence) {
457                p = undefined_control_sequence;
458            } else {
459                p = insert_id(p, (const unsigned char *) s, (unsigned) l);
460            }
461            goto FOUND;
462        }
463        p = cs_next(p);
464    }
465  FOUND:
466    return p;
467}
468
469@ The |print_cmd_chr| routine prints a symbolic interpretation of a
470   command code and its modifier. This is used in certain `\.{You can\'t}'
471   error messages, and in the implementation of diagnostic routines like
472   \.{\\show}.
473
474   The body of |print_cmd_chr| use to be  a rather tedious listing of print
475   commands, and most of it was essentially an inverse to the |primitive|
476   routine that enters a \TeX\ primitive into |eqtb|.
477
478   Thanks to |prim_data|, there is no need for all that tediousness. What
479   is left of |primt_cnd_chr| are just the exceptions to the general rule
480   that the  |cmd,chr_code| pair represents in a single primitive command.
481
482@c
483#define chr_cmd(A) do { tprint(A); print(chr_code); } while (0)
484
485static void prim_cmd_chr(quarterword cmd, halfword chr_code)
486{
487    int idx = chr_code - prim_data[cmd].offset;
488    if (cmd <= last_cmd &&
489        idx >= 0 && idx < prim_data[cmd].subids &&
490        prim_data[cmd].names != NULL && prim_data[cmd].names[idx] != 0) {
491        tprint_esc("");
492        print(prim_data[cmd].names[idx]);
493    } else {
494        /* TEX82 didn't print the |cmd,idx| information, but it may be useful */
495        tprint("[unknown command code! (");
496        print_int(cmd);
497        tprint(", ");
498        print_int(idx);
499        tprint(")]");
500    }
501}
502
503void print_cmd_chr(quarterword cmd, halfword chr_code)
504{
505    int n;                      /* temp variable */
506    switch (cmd) {
507    case left_brace_cmd:
508        chr_cmd("begin-group character ");
509        break;
510    case right_brace_cmd:
511        chr_cmd("end-group character ");
512        break;
513    case math_shift_cmd:
514        chr_cmd("math shift character ");
515        break;
516    case mac_param_cmd:
517        if (chr_code == tab_mark_cmd_code)
518            tprint_esc("alignmark");
519        else
520            chr_cmd("macro parameter character ");
521        break;
522    case sup_mark_cmd:
523        chr_cmd("superscript character ");
524        break;
525    case sub_mark_cmd:
526        chr_cmd("subscript character ");
527        break;
528    case endv_cmd:
529        tprint("end of alignment template");
530        break;
531    case spacer_cmd:
532        chr_cmd("blank space ");
533        break;
534    case letter_cmd:
535        chr_cmd("the letter ");
536        break;
537    case other_char_cmd:
538        chr_cmd("the character ");
539        break;
540    case tab_mark_cmd:
541        if (chr_code == span_code)
542            tprint_esc("span");
543        else if (chr_code == tab_mark_cmd_code)
544            tprint_esc("aligntab");
545        else
546            chr_cmd("alignment tab character ");
547        break;
548    case if_test_cmd:
549        if (chr_code >= unless_code)
550            tprint_esc("unless");
551        prim_cmd_chr(cmd, (chr_code % unless_code));
552        break;
553    case char_given_cmd:
554        tprint_esc("char");
555        print_hex(chr_code);
556        break;
557    case math_given_cmd:
558        tprint_esc("mathchar");
559        show_mathcode_value(mathchar_from_integer(chr_code, tex_mathcode));
560        break;
561    case xmath_given_cmd:
562        tprint_esc("Umathchar");
563        show_mathcode_value(mathchar_from_integer(chr_code, xetex_mathcode));
564        break;
565    case set_font_cmd:
566        tprint("select font ");
567        tprint(font_name(chr_code));
568        if (font_size(chr_code) != font_dsize(chr_code)) {
569            tprint(" at ");
570            print_scaled(font_size(chr_code));
571            tprint("pt");
572        }
573        break;
574    case undefined_cs_cmd:
575        tprint("undefined");
576        break;
577    case call_cmd:
578    case long_call_cmd:
579    case outer_call_cmd:
580    case long_outer_call_cmd:
581        n = cmd - call_cmd;
582        if (token_info(token_link(chr_code)) == protected_token)
583            n = n + 4;
584        if (odd(n / 4))
585            tprint_esc("protected");
586        if (odd(n))
587            tprint_esc("long");
588        if (odd(n / 2))
589            tprint_esc("outer");
590        if (n > 0)
591            tprint(" ");
592        tprint("macro");
593        break;
594    case extension_cmd:
595        if (chr_code < prim_data[cmd].subids &&
596            prim_data[cmd].names[chr_code] != 0) {
597            prim_cmd_chr(cmd, chr_code);
598        } else {
599            tprint("[unknown extension! (");
600            print_int(chr_code);
601            tprint(")]");
602
603        }
604        break;
605    case assign_glue_cmd:
606    case assign_mu_glue_cmd:
607        if (chr_code < skip_base) {
608            prim_cmd_chr(cmd, chr_code);
609        } else if (chr_code < mu_skip_base) {
610            tprint_esc("skip");
611            print_int(chr_code - skip_base);
612        } else {
613            tprint_esc("muskip");
614            print_int(chr_code - mu_skip_base);
615        }
616        break;
617    case assign_toks_cmd:
618        if (chr_code >= toks_base) {
619            tprint_esc("toks");
620            print_int(chr_code - toks_base);
621        } else {
622            prim_cmd_chr(cmd, chr_code);
623        }
624        break;
625    case assign_int_cmd:
626        if (chr_code < count_base) {
627            prim_cmd_chr(cmd, chr_code);
628        } else {
629            tprint_esc("count");
630            print_int(chr_code - count_base);
631        }
632        break;
633    case assign_attr_cmd:
634        tprint_esc("attribute");
635        print_int(chr_code - attribute_base);
636        break;
637    case assign_dimen_cmd:
638        if (chr_code < scaled_base) {
639            prim_cmd_chr(cmd, chr_code);
640        } else {
641            tprint_esc("dimen");
642            print_int(chr_code - scaled_base);
643        }
644        break;
645    default:
646        /* these are most commands, actually */
647        prim_cmd_chr(cmd, chr_code);
648        break;
649    }
650}
651