1% primitive.w 2% 3% Copyright 2008-2010 Taco Hoekwater <taco@@luatex.org> 4% 5% This file is part of LuaTeX. 6% 7% LuaTeX is free software; you can redistribute it and/or modify it under 8% the terms of the GNU General Public License as published by the Free 9% Software Foundation; either version 2 of the License, or (at your 10% option) any later version. 11% 12% LuaTeX is distributed in the hope that it will be useful, but WITHOUT 13% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14% FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 15% License for more details. 16% 17% You should have received a copy of the GNU General Public License along 18% with LuaTeX; if not, see <http://www.gnu.org/licenses/>. 19 20@ @c 21 22 23#include "ptexlib.h" 24 25@ Control sequences are stored and retrieved by means of a fairly standard hash 26table algorithm called the method of ``coalescing lists'' (cf.\ Algorithm 6.4C 27in {\sl The Art of Computer Programming\/}). Once a control sequence enters the 28table, it is never removed, because there are complicated situations 29involving \.{\\gdef} where the removal of a control sequence at the end of 30a group would be a mistake preventable only by the introduction of a 31complicated reference-count mechanism. 32 33The actual sequence of letters forming a control sequence identifier is 34stored in the |str_pool| array together with all the other strings. An 35auxiliary array |hash| consists of items with two halfword fields per 36word. The first of these, called |next(p)|, points to the next identifier 37belonging to the same coalesced list as the identifier corresponding to~|p|; 38and the other, called |text(p)|, points to the |str_start| entry for 39|p|'s identifier. If position~|p| of the hash table is empty, we have 40|text(p)=0|; if position |p| is either empty or the end of a coalesced 41hash list, we have |next(p)=0|. An auxiliary pointer variable called 42|hash_used| is maintained in such a way that all locations |p>=hash_used| 43are nonempty. The global variable |cs_count| tells how many multiletter 44control sequences have been defined, if statistics are being kept. 45 46A global boolean variable called |no_new_control_sequence| is set to 47|true| during the time that new hash table entries are forbidden. 48 49@c 50two_halves *hash; /* the hash table */ 51halfword hash_used; /* allocation pointer for |hash| */ 52int hash_extra; /* |hash_extra=hash| above |eqtb_size| */ 53halfword hash_top; /* maximum of the hash array */ 54halfword hash_high; /* pointer to next high hash location */ 55boolean no_new_control_sequence; /* are new identifiers legal? */ 56int cs_count; /* total number of known identifiers */ 57 58#define hash_is_full (hash_used==hash_base) /* test if all positions are occupied */ 59 60@ \.{\\primitive} support needs a few extra variables and definitions 61 62@c 63#define prim_base 1 64 65@ The arrays |prim| and |prim_eqtb| are used for name -> cmd,chr lookups. 66 67 The are modelled after |hash| and |eqtb|, except that primitives do not 68 have an |eq_level|, that field is replaced by |origin|. 69 70@c 71#define prim_next(a) prim[(a)].lhfield /* link for coalesced lists */ 72#define prim_text(a) prim[(a)].rh /* string number for control sequence name */ 73#define prim_is_full (prim_used==prim_base) /* test if all positions are occupied */ 74 75#define prim_origin_field(a) (a).hh.b1 76#define prim_eq_type_field(a) (a).hh.b0 77#define prim_equiv_field(a) (a).hh.rh 78#define prim_origin(a) prim_origin_field(prim_eqtb[(a)]) /* level of definition */ 79#define prim_eq_type(a) prim_eq_type_field(prim_eqtb[(a)]) /* command code for equivalent */ 80#define prim_equiv(a) prim_equiv_field(prim_eqtb[(a)]) /* equivalent value */ 81 82static pointer prim_used; /* allocation pointer for |prim| */ 83static two_halves prim[(prim_size + 1)]; /* the primitives table */ 84static memory_word prim_eqtb[(prim_size + 1)]; 85 86@ The array |prim_data| works the other way around, it is used for 87 cmd,chr -> name lookups. 88 89@c 90typedef struct prim_info { 91 halfword subids; /* number of name entries */ 92 halfword offset; /* offset to be used for |chr_code|s */ 93 str_number *names; /* array of names */ 94} prim_info; 95 96static prim_info prim_data[(last_cmd + 1)]; 97 98@ initialize the memory arrays 99@c 100void init_primitives(void) 101{ 102 int k; 103 memset(prim_data, 0, (sizeof(prim_info) * (last_cmd + 1))); 104 memset(prim, 0, (sizeof(two_halves) * (prim_size + 1))); 105 memset(prim_eqtb, 0, (sizeof(memory_word) * (prim_size + 1))); 106 for (k = 0; k <= prim_size; k++) 107 prim_eq_type(k) = undefined_cs_cmd; 108} 109 110void ini_init_primitives(void) 111{ 112 prim_used = prim_size; /* nothing is used */ 113} 114 115 116@ The value of |hash_prime| should be roughly 85\%! of |hash_size|, and it 117 should be a prime number. The theory of hashing tells us to expect fewer 118 than two table probes, on the average, when the search is successful. 119 [See J.~S. Vitter, {\sl Journal of the ACM\/ \bf30} (1983), 231--258.] 120 @^Vitter, Jeffrey Scott@> 121 122@c 123static halfword compute_hash(const char *j, unsigned int l, 124 halfword prime_number) 125{ 126 int k; 127 halfword h = (unsigned char) *j; 128 for (k = 1; k <= (int)(l - 1); k++) { 129 h = h + h + (unsigned char) *(j + k); 130 while (h >= prime_number) 131 h = h - prime_number; 132 } 133 return h; 134} 135 136 137@ Here is the subroutine that searches the primitive table for an identifier 138@c 139pointer prim_lookup(str_number s) 140{ 141 int h; /* hash code */ 142 pointer p; /* index in |hash| array */ 143 unsigned char *j; 144 unsigned l; 145 if (s < STRING_OFFSET) { 146 p = s; 147 if ((p < 0) || (get_prim_eq_type(p) == undefined_cs_cmd)) { 148 p = undefined_primitive; 149 } 150 } else { 151 j = str_string(s); 152 l = (unsigned) str_length(s); 153 h = compute_hash((char *) j, l, prim_prime); 154 p = h + prim_base; /* we start searching here; note that |0<=h<hash_prime| */ 155 while (1) { 156 if (prim_text(p) > 0) 157 if (str_length(prim_text(p)) == l) 158 if (str_eq_str(prim_text(p), s)) 159 goto FOUND; 160 if (prim_next(p) == 0) { 161 if (no_new_control_sequence) { 162 p = undefined_primitive; 163 } else { 164 /* Insert a new primitive after |p|, then make |p| point to it */ 165 if (prim_text(p) > 0) { 166 do { /* search for an empty location in |prim| */ 167 if (prim_is_full) 168 overflow("primitive size", prim_size); 169 decr(prim_used); 170 } while (prim_text(prim_used) != 0); 171 prim_next(p) = prim_used; 172 p = prim_used; 173 } 174 prim_text(p) = s; 175 } 176 goto FOUND; 177 } 178 p = prim_next(p); 179 } 180 } 181 FOUND: 182 return p; 183} 184 185@ how to test a csname for primitive-ness 186@c 187boolean is_primitive(str_number csname) 188{ 189 int n, m; 190 char *ss; 191 m = prim_lookup(csname); 192 ss = makecstring(csname); 193 n = string_lookup(ss, str_length(csname)); 194 free(ss); 195 return ((n != undefined_cs_cmd) && 196 (m != undefined_primitive) && 197 (eq_type(n) == prim_eq_type(m)) && (equiv(n) == prim_equiv(m))); 198} 199 200 201@ a few simple accessors 202@c 203quarterword get_prim_eq_type(int p) 204{ 205 return prim_eq_type(p); 206} 207 208quarterword get_prim_origin(int p) 209{ 210 return prim_origin(p); 211} 212 213halfword get_prim_equiv(int p) 214{ 215 return prim_equiv(p); 216} 217 218str_number get_prim_text(int p) 219{ 220 return prim_text(p); 221} 222 223 224@ dumping and undumping 225@c 226void dump_primitives(void) 227{ 228 int p, q; 229 for (p = 0; p <= prim_size; p++) 230 dump_hh(prim[p]); 231 for (p = 0; p <= prim_size; p++) 232 dump_wd(prim_eqtb[p]); 233 for (p = 0; p <= last_cmd; p++) { 234 dump_int(prim_data[p].offset); 235 dump_int(prim_data[p].subids); 236 for (q = 0; q < prim_data[p].subids; q++) { 237 dump_int(prim_data[p].names[q]); 238 } 239 } 240} 241 242void undump_primitives(void) 243{ 244 int p, q; 245 for (p = 0; p <= prim_size; p++) 246 undump_hh(prim[p]); 247 for (p = 0; p <= prim_size; p++) 248 undump_wd(prim_eqtb[p]); 249 250 for (p = 0; p <= last_cmd; p++) { 251 undump_int(prim_data[p].offset); 252 undump_int(prim_data[p].subids); 253 if (prim_data[p].subids > 0) { 254 prim_data[p].names = (str_number *) 255 xmalloc((unsigned) 256 ((unsigned) prim_data[p].subids * 257 sizeof(str_number *))); 258 for (q = 0; q < prim_data[p].subids; q++) 259 undump_int(prim_data[p].names[q]); 260 } 261 } 262} 263 264@ We need to put \TeX's ``primitive'' control sequences into the hash 265 table, together with their command code (which will be the |eq_type|) 266 and an operand (which will be the |equiv|). The |primitive| procedure 267 does this, in a way that no \TeX\ user can. The global value |cur_val| 268 contains the new |eqtb| pointer after |primitive| has acted. 269 270 271@ Because the definitions of the actual user-accessible name of a 272 primitive can be postponed until runtime, the function |primitive_def| 273 is needed that does nothing except creating the control sequence name. 274 275@c 276void primitive_def(const char *s, size_t l, quarterword c, halfword o) 277{ 278 int nncs = no_new_control_sequence; 279 no_new_control_sequence = false; 280 cur_val = string_lookup(s, l); /* this creates the |text()| string */ 281 no_new_control_sequence = nncs; 282 eq_level(cur_val) = level_one; 283 eq_type(cur_val) = c; 284 equiv(cur_val) = o; 285} 286 287@ The function |store_primitive_name| sets up the bookkeeping for the 288 reverse lookup. It is quite paranoid, because it is easy to mess this up 289 accidentally. 290 291 The |offset| is needed because sometimes character codes (in |o|) 292 are indices into |eqtb| or are offset by a magical value to make 293 sure they do not conflict with something else. We don't want the 294 |prim_data[c].names| to have too many entries as it will just be 295 wasted room, so |offset| is substracted from |o| because creating 296 or accessing the array. The |assert(idx<=0xFFFF)| is not strictly 297 needed, but it helps catch errors of this kind. 298 299@c 300static void 301store_primitive_name(str_number s, quarterword c, halfword o, halfword offset) 302{ 303 int idx; 304 if (prim_data[c].offset != 0 && prim_data[c].offset != offset) { 305 assert(false); 306 } 307 prim_data[c].offset = offset; 308 idx = ((int) o - offset); 309 assert(idx >= 0); 310 assert(idx <= 0xFFFF); 311 if (prim_data[c].subids < (idx + 1)) { 312 str_number *new = 313 (str_number *) xcalloc((unsigned) (idx + 1), sizeof(str_number *)); 314 if (prim_data[c].names != NULL) { 315 assert(prim_data[c].subids); 316 memcpy(new, (prim_data[c].names), 317 (unsigned) (prim_data[c].subids) * sizeof(str_number)); 318 free(prim_data[c].names); 319 } 320 prim_data[c].names = new; 321 prim_data[c].subids = idx + 1; 322 } 323 prim_data[c].names[idx] = s; 324} 325 326@ Compared to tex82, |primitive| has two extra parameters. The |off| is an offset 327 that will be passed on to |store_primitive_name|, the |cmd_origin| is the bit 328 that is used to group primitives by originator. 329 330@c 331void 332primitive(const char *thes, quarterword c, halfword o, halfword off, 333 int cmd_origin) 334{ 335 int prim_val; /* needed to fill |prim_eqtb| */ 336 str_number ss; 337 assert(o >= off); 338 ss = maketexstring(thes); 339 if (cmd_origin == tex_command || cmd_origin == core_command) { 340 primitive_def(thes, strlen(thes), c, o); 341 } 342 prim_val = prim_lookup(ss); 343 prim_origin(prim_val) = (quarterword) cmd_origin; 344 prim_eq_type(prim_val) = c; 345 prim_equiv(prim_val) = o; 346 store_primitive_name(ss, c, o, off); 347} 348 349 350 351@ Here is a helper that does the actual hash insertion. 352 353@c 354static halfword insert_id(halfword p, const unsigned char *j, unsigned int l) 355{ 356 unsigned saved_cur_length; 357 unsigned saved_cur_string_size; 358 unsigned char *saved_cur_string; 359 const unsigned char *k; 360 /* This code far from ideal: the existance of |hash_extra| changes 361 all the potential (short) coalesced lists into a single (long) 362 one. This will create a slowdown. */ 363 if (cs_text(p) > 0) { 364 if (hash_high < hash_extra) { 365 incr(hash_high); 366 /* can't use |eqtb_top| here (perhaps because that is not finalized 367 yet when called from |primitive|?) */ 368 cs_next(p) = hash_high + eqtb_size; 369 p = cs_next(p); 370 } else { 371 do { 372 if (hash_is_full) 373 overflow("hash size", (unsigned) (hash_size + hash_extra)); 374 decr(hash_used); 375 } while (cs_text(hash_used) != 0); /* search for an empty location in |hash| */ 376 cs_next(p) = hash_used; 377 p = hash_used; 378 } 379 } 380 saved_cur_length = cur_length; 381 saved_cur_string = cur_string; 382 saved_cur_string_size = cur_string_size; 383 reset_cur_string(); 384 for (k = j; k <= j + l - 1; k++) 385 append_char(*k); 386 cs_text(p) = make_string(); 387 cur_length = saved_cur_length; 388 xfree(cur_string); 389 cur_string = saved_cur_string; 390 cur_string_size = saved_cur_string_size; 391 incr(cs_count); 392 return p; 393} 394 395 396@ Here is the subroutine that searches the hash table for an identifier 397 that matches a given string of length |l>1| appearing in |buffer[j.. 398 (j+l-1)]|. If the identifier is found, the corresponding hash table address 399 is returned. Otherwise, if the global variable |no_new_control_sequence| 400 is |true|, the dummy address |undefined_control_sequence| is returned. 401 Otherwise the identifier is inserted into the hash table and its location 402 is returned. 403 404@c 405pointer id_lookup(int j, int l) 406{ /* search the hash table */ 407 int h; /* hash code */ 408 pointer p; /* index in |hash| array */ 409 410 h = compute_hash((char *) (buffer + j), (unsigned) l, hash_prime); 411#ifdef VERBOSE 412 { 413 unsigned char *todo = xmalloc(l + 2); 414 strncpy(todo, (buffer + j), l); 415 todo[l] = '\0'; 416 todo[l + 1] = '\0'; 417 fprintf(stdout, "id_lookup(%s)\n", todo); 418 free(todo); 419 } 420#endif 421 p = h + hash_base; /* we start searching here; note that |0<=h<hash_prime| */ 422 while (1) { 423 if (cs_text(p) > 0) 424 if (str_length(cs_text(p)) == (unsigned) l) 425 if (str_eq_buf(cs_text(p), j)) 426 goto FOUND; 427 if (cs_next(p) == 0) { 428 if (no_new_control_sequence) { 429 p = undefined_control_sequence; 430 } else { 431 p = insert_id(p, (buffer + j), (unsigned) l); 432 } 433 goto FOUND; 434 } 435 p = cs_next(p); 436 } 437 FOUND: 438 return p; 439} 440 441@ Here is a similar subroutine for finding a primitive in the hash. 442This one is based on a C string. 443 444@c 445pointer string_lookup(const char *s, size_t l) 446{ /* search the hash table */ 447 int h; /* hash code */ 448 pointer p; /* index in |hash| array */ 449 h = compute_hash(s, (unsigned) l, hash_prime); 450 p = h + hash_base; /* we start searching here; note that |0<=h<hash_prime| */ 451 while (1) { 452 if (cs_text(p) > 0) 453 if (str_eq_cstr(cs_text(p), s, l)) 454 goto FOUND; 455 if (cs_next(p) == 0) { 456 if (no_new_control_sequence) { 457 p = undefined_control_sequence; 458 } else { 459 p = insert_id(p, (const unsigned char *) s, (unsigned) l); 460 } 461 goto FOUND; 462 } 463 p = cs_next(p); 464 } 465 FOUND: 466 return p; 467} 468 469@ The |print_cmd_chr| routine prints a symbolic interpretation of a 470 command code and its modifier. This is used in certain `\.{You can\'t}' 471 error messages, and in the implementation of diagnostic routines like 472 \.{\\show}. 473 474 The body of |print_cmd_chr| use to be a rather tedious listing of print 475 commands, and most of it was essentially an inverse to the |primitive| 476 routine that enters a \TeX\ primitive into |eqtb|. 477 478 Thanks to |prim_data|, there is no need for all that tediousness. What 479 is left of |primt_cnd_chr| are just the exceptions to the general rule 480 that the |cmd,chr_code| pair represents in a single primitive command. 481 482@c 483#define chr_cmd(A) do { tprint(A); print(chr_code); } while (0) 484 485static void prim_cmd_chr(quarterword cmd, halfword chr_code) 486{ 487 int idx = chr_code - prim_data[cmd].offset; 488 if (cmd <= last_cmd && 489 idx >= 0 && idx < prim_data[cmd].subids && 490 prim_data[cmd].names != NULL && prim_data[cmd].names[idx] != 0) { 491 tprint_esc(""); 492 print(prim_data[cmd].names[idx]); 493 } else { 494 /* TEX82 didn't print the |cmd,idx| information, but it may be useful */ 495 tprint("[unknown command code! ("); 496 print_int(cmd); 497 tprint(", "); 498 print_int(idx); 499 tprint(")]"); 500 } 501} 502 503void print_cmd_chr(quarterword cmd, halfword chr_code) 504{ 505 int n; /* temp variable */ 506 switch (cmd) { 507 case left_brace_cmd: 508 chr_cmd("begin-group character "); 509 break; 510 case right_brace_cmd: 511 chr_cmd("end-group character "); 512 break; 513 case math_shift_cmd: 514 chr_cmd("math shift character "); 515 break; 516 case mac_param_cmd: 517 if (chr_code == tab_mark_cmd_code) 518 tprint_esc("alignmark"); 519 else 520 chr_cmd("macro parameter character "); 521 break; 522 case sup_mark_cmd: 523 chr_cmd("superscript character "); 524 break; 525 case sub_mark_cmd: 526 chr_cmd("subscript character "); 527 break; 528 case endv_cmd: 529 tprint("end of alignment template"); 530 break; 531 case spacer_cmd: 532 chr_cmd("blank space "); 533 break; 534 case letter_cmd: 535 chr_cmd("the letter "); 536 break; 537 case other_char_cmd: 538 chr_cmd("the character "); 539 break; 540 case tab_mark_cmd: 541 if (chr_code == span_code) 542 tprint_esc("span"); 543 else if (chr_code == tab_mark_cmd_code) 544 tprint_esc("aligntab"); 545 else 546 chr_cmd("alignment tab character "); 547 break; 548 case if_test_cmd: 549 if (chr_code >= unless_code) 550 tprint_esc("unless"); 551 prim_cmd_chr(cmd, (chr_code % unless_code)); 552 break; 553 case char_given_cmd: 554 tprint_esc("char"); 555 print_hex(chr_code); 556 break; 557 case math_given_cmd: 558 tprint_esc("mathchar"); 559 show_mathcode_value(mathchar_from_integer(chr_code, tex_mathcode)); 560 break; 561 case xmath_given_cmd: 562 tprint_esc("Umathchar"); 563 show_mathcode_value(mathchar_from_integer(chr_code, xetex_mathcode)); 564 break; 565 case set_font_cmd: 566 tprint("select font "); 567 tprint(font_name(chr_code)); 568 if (font_size(chr_code) != font_dsize(chr_code)) { 569 tprint(" at "); 570 print_scaled(font_size(chr_code)); 571 tprint("pt"); 572 } 573 break; 574 case undefined_cs_cmd: 575 tprint("undefined"); 576 break; 577 case call_cmd: 578 case long_call_cmd: 579 case outer_call_cmd: 580 case long_outer_call_cmd: 581 n = cmd - call_cmd; 582 if (token_info(token_link(chr_code)) == protected_token) 583 n = n + 4; 584 if (odd(n / 4)) 585 tprint_esc("protected"); 586 if (odd(n)) 587 tprint_esc("long"); 588 if (odd(n / 2)) 589 tprint_esc("outer"); 590 if (n > 0) 591 tprint(" "); 592 tprint("macro"); 593 break; 594 case extension_cmd: 595 if (chr_code < prim_data[cmd].subids && 596 prim_data[cmd].names[chr_code] != 0) { 597 prim_cmd_chr(cmd, chr_code); 598 } else { 599 tprint("[unknown extension! ("); 600 print_int(chr_code); 601 tprint(")]"); 602 603 } 604 break; 605 case assign_glue_cmd: 606 case assign_mu_glue_cmd: 607 if (chr_code < skip_base) { 608 prim_cmd_chr(cmd, chr_code); 609 } else if (chr_code < mu_skip_base) { 610 tprint_esc("skip"); 611 print_int(chr_code - skip_base); 612 } else { 613 tprint_esc("muskip"); 614 print_int(chr_code - mu_skip_base); 615 } 616 break; 617 case assign_toks_cmd: 618 if (chr_code >= toks_base) { 619 tprint_esc("toks"); 620 print_int(chr_code - toks_base); 621 } else { 622 prim_cmd_chr(cmd, chr_code); 623 } 624 break; 625 case assign_int_cmd: 626 if (chr_code < count_base) { 627 prim_cmd_chr(cmd, chr_code); 628 } else { 629 tprint_esc("count"); 630 print_int(chr_code - count_base); 631 } 632 break; 633 case assign_attr_cmd: 634 tprint_esc("attribute"); 635 print_int(chr_code - attribute_base); 636 break; 637 case assign_dimen_cmd: 638 if (chr_code < scaled_base) { 639 prim_cmd_chr(cmd, chr_code); 640 } else { 641 tprint_esc("dimen"); 642 print_int(chr_code - scaled_base); 643 } 644 break; 645 default: 646 /* these are most commands, actually */ 647 prim_cmd_chr(cmd, chr_code); 648 break; 649 } 650} 651