1 /* $Header: d:/cvsroot/tads/tads3/tctok.h,v 1.5 1999/07/11 00:46:59 MJRoberts Exp $ */ 2 3 /* 4 * Copyright (c) 1999, 2002 Michael J. Roberts. All Rights Reserved. 5 * 6 * Please see the accompanying license file, LICENSE.TXT, for information 7 * on using and copying this software. 8 */ 9 /* 10 Name 11 tctok.h - TADS3 compiler tokenizer and preprocessor 12 Function 13 14 Notes 15 The tokenizer is layered with the preprocessor, so that the preprocessor 16 can deal with include files, macro expansion, and preprocessor directives. 17 Modified 18 04/12/99 MJRoberts - Creation 19 */ 20 21 #ifndef TCTOK_H 22 #define TCTOK_H 23 24 #include <stdlib.h> 25 #include <string.h> 26 #include <assert.h> 27 28 #include "os.h" 29 #include "t3std.h" 30 #include "utf8.h" 31 #include "vmhash.h" 32 #include "vmerr.h" 33 #include "tcerr.h" 34 #include "tcerrnum.h" 35 36 37 /* ------------------------------------------------------------------------ */ 38 /* 39 * Constants 40 */ 41 42 /* maximum length of a symbol name, in characters */ 43 const size_t TOK_SYM_MAX_LEN = 80; 44 45 /* 46 * Maximum buffer required to hold a symbol, in bytes. Each UTF-8 47 * character may take up three bytes, plus we need a null terminator 48 * byte. 49 */ 50 const size_t TOK_SYM_MAX_BUFFER = (3*TOK_SYM_MAX_LEN + 1); 51 52 /* maximum #if nesting level */ 53 const size_t TOK_MAX_IF_NESTING = 100; 54 55 /* maximum number of parameters per macro */ 56 const int TOK_MAX_MACRO_ARGS = 128; 57 58 /* 59 * Special token flag characters - these are a characters that can't 60 * occur in an input file (we guarantee this by converting any 61 * occurrences of this character to a space on reading input). We use 62 * these to flag certain special properties of tokens in the input 63 * buffer. 
64 * 65 * We use ASCII characters in the control range (0x01 (^A) through 0x1A 66 * (^Z), excluding 0x09 (tab), 0x0A (LF), 0x0D (CR), and 0x0C (Page 67 * Feed); a well-formed source file would never use any of these 68 * characters in input. Even if it does, we won't get confused, since 69 * we'll always translate these to a space if we find them in input; but 70 * choosing characters that *should* never occur in valid input will 71 * ensure that we never alter the meaning of valid source by this 72 * translation. 73 */ 74 75 /* 76 * macro parameter flag - we use this in the internal storage of a 77 * #define expansion to flag where the formal parameters are mentioned, 78 * so that we can substitute the actuals when expanding the macro 79 */ 80 const char TOK_MACRO_FORMAL_FLAG = 0x01; 81 82 /* 83 * Token fully expanded flag. Whenever we detect that a particular 84 * token has been fully expanded in the course of a particular macro 85 * expansion, we'll insert this byte before the token; on subsequent 86 * re-scans, whenever we see this flag, we'll realize that the token 87 * needs no further consideration of expansion. 88 */ 89 const char TOK_FULLY_EXPANDED_FLAG = 0x02; 90 91 /* 92 * Macro substitution end marker. Each time we expand a macro, we'll 93 * insert immediately after the macro expansion a special pseudo-token, 94 * consisting of this flag followed by a pointer to the symbol table 95 * entry for the symbol expanded. As we expand macros, we'll check to 96 * see if any of these special flags appear in the buffer after the 97 * macro about to be expanded. If we find such a flag matching the 98 * symbol about to be expanded, we'll know the symbol has already been 99 * fully expanded on a previous scan and thus must not be expanded 100 * again. 101 */ 102 const char TOK_MACRO_EXP_END = 0x03; 103 104 /* 105 * End-of-line flag. This serves as a local end-of-file marker for 106 * preprocessor lines. 
Because preprocessor lines must be considered in 107 * isolation, we need some way when parsing one to tell the tokenizer 108 * not to try to read another line when it reaches the end of the 109 * current line. This flag serves this purpose: when the tokenizer 110 * encounters one of these flags, it will simply return end-of-file 111 * until the caller explicitly reads a new source line. 112 */ 113 const char TOK_END_PP_LINE = 0x04; 114 115 /* 116 * "#foreach" marker flag. This marks the presence of a #foreach token in 117 * a macro's expansion. We leave the text of the expansion area intact, 118 * but we replace the #foreach token with this marker character. 119 */ 120 const char TOK_MACRO_FOREACH_FLAG = 0x05; 121 122 /* 123 * "#argcount" marker flag. This marks the presence of a #argcount token 124 * in a macro's expansion. 125 */ 126 const char TOK_MACRO_ARGCOUNT_FLAG = 0x06; 127 128 /* 129 * "#ifempty" and #ifnempty" marker flags 130 */ 131 const char TOK_MACRO_IFEMPTY_FLAG = 0x07; 132 const char TOK_MACRO_IFNEMPTY_FLAG = 0x08; 133 134 135 /* ------------------------------------------------------------------------ */ 136 /* 137 * #if state 138 */ 139 enum tok_if_t 140 { 141 TOKIF_NONE, /* not in a #if block at all */ 142 TOKIF_IF_YES, /* processing a true #if branch */ 143 TOKIF_IF_NO, /* processing a false #if branch */ 144 TOKIF_IF_DONE, /* done with true #if/#elif; skip #elif's and #else */ 145 TOKIF_ELSE_YES, /* processing a true #else branch */ 146 TOKIF_ELSE_NO /* processing a false #else branch */ 147 }; 148 149 /* 150 * #if stack entry 151 */ 152 struct tok_if_info_t 153 { 154 /* state */ 155 tok_if_t state; 156 157 /* file descriptor and line number of starting #if */ 158 class CTcTokFileDesc *desc; 159 long linenum; 160 }; 161 162 /* ------------------------------------------------------------------------ */ 163 /* 164 * Token Types 165 */ 166 167 enum tc_toktyp_t 168 { 169 TOKT_INVALID, /* invalid token */ 170 TOKT_NULLTOK, /* null token - caller 
should read another token */ 171 TOKT_EOF, /* end of file */ 172 TOKT_MACRO_FORMAL, /* formal parameter replacement placeholder */ 173 TOKT_MACRO_FOREACH, /* macro varargs #foreach placeholder */ 174 TOKT_MACRO_ARGCOUNT, /* macro varargs #argcount placeholder */ 175 TOKT_MACRO_IFEMPTY, /* #ifempty macro placeholder */ 176 TOKT_MACRO_IFNEMPTY, /* #ifnempty macro placeholder */ 177 TOKT_SYM, /* symbolic name */ 178 TOKT_INT, /* integer */ 179 TOKT_SSTR, /* single-quoted string */ 180 TOKT_DSTR, /* double-quoted string */ 181 TOKT_DSTR_START, /* start of a dstring with embedding - "...<< */ 182 TOKT_DSTR_MID, /* middle of a dstring with embedding - >>...<< */ 183 TOKT_DSTR_END, /* end of a dstring with embedding - >>..." */ 184 TOKT_LPAR, /* left paren '(' */ 185 TOKT_RPAR, /* right paren ')' */ 186 TOKT_COMMA, /* comma ',' */ 187 TOKT_DOT, /* period '.' */ 188 TOKT_LBRACE, /* left brace '{' */ 189 TOKT_RBRACE, /* right brace '}' */ 190 TOKT_LBRACK, /* left square bracket '[' */ 191 TOKT_RBRACK, /* right square bracket ']' */ 192 TOKT_EQ, /* equals sign '=' */ 193 TOKT_EQEQ, /* double-equals sign '==' */ 194 TOKT_ASI, /* colon-equals assignment operator ':=' */ 195 TOKT_PLUS, /* plus sign '+' */ 196 TOKT_MINUS, /* minus sign '-' */ 197 TOKT_TIMES, /* multiplication symbol '*' */ 198 TOKT_DIV, /* division symbol '/' */ 199 TOKT_MOD, /* modulo '%' */ 200 TOKT_GT, /* greater-than sign '>' */ 201 TOKT_LT, /* less-than sign '<' */ 202 TOKT_GE, /* greater-or-equal sign '>=' */ 203 TOKT_LE, /* less-or-equal sign '<=' */ 204 TOKT_NE, /* not-equals sign '!=' or '<>' */ 205 TOKT_ARROW, /* arrow symbol '->' */ 206 TOKT_COLON, /* colon ':' */ 207 TOKT_SEM, /* semicolon ';' */ 208 TOKT_AND, /* bitwise AND '&' */ 209 TOKT_ANDAND, /* logical AND '&&' */ 210 TOKT_OR, /* bitwise OR '|' */ 211 TOKT_OROR, /* logical OR '||' */ 212 TOKT_XOR, /* bitwise XOR '^' */ 213 TOKT_SHL, /* shift left '<<' */ 214 TOKT_SHR, /* shift right '>>' */ 215 TOKT_INC, /* increment '++' */ 216 TOKT_DEC, /* 
decrement '--' */ 217 TOKT_PLUSEQ, /* plus-equals '+=' */ 218 TOKT_MINEQ, /* minus-equals '-=' */ 219 TOKT_TIMESEQ, /* times-equals '*=' */ 220 TOKT_DIVEQ, /* divide-equals '/=' */ 221 TOKT_MODEQ, /* mod-equals '%=' */ 222 TOKT_ANDEQ, /* and-equals '&=' */ 223 TOKT_OREQ, /* or-equals '|=' */ 224 TOKT_XOREQ, /* xor-equals '^=' */ 225 TOKT_SHLEQ, /* shift-left-and-assign '<<=' */ 226 TOKT_SHREQ, /* shift-right-and-assign '>>=' */ 227 TOKT_NOT, /* logical not '!' */ 228 TOKT_BNOT, /* bitwise not '~' */ 229 TOKT_POUND, /* pound '#' */ 230 TOKT_POUNDPOUND, /* double-pound '##' */ 231 TOKT_POUNDAT, /* pound-at '#@' */ 232 TOKT_ELLIPSIS, /* ellipsis '...' */ 233 TOKT_QUESTION, /* question mark '?' */ 234 TOKT_COLONCOLON, /* double-colon '::' */ 235 TOKT_FLOAT, /* floating-point number */ 236 TOKT_AT, /* at-sign */ 237 238 /* keywords */ 239 TOKT_SELF, 240 TOKT_INHERITED, 241 TOKT_ARGCOUNT, 242 TOKT_IF, 243 TOKT_ELSE, 244 TOKT_FOR, 245 TOKT_WHILE, 246 TOKT_DO, 247 TOKT_SWITCH, 248 TOKT_CASE, 249 TOKT_DEFAULT, 250 TOKT_GOTO, 251 TOKT_BREAK, 252 TOKT_CONTINUE, 253 TOKT_FUNCTION, 254 TOKT_RETURN, 255 TOKT_LOCAL, 256 TOKT_OBJECT, 257 TOKT_NIL, 258 TOKT_TRUE, 259 TOKT_PASS, 260 TOKT_EXTERNAL, 261 TOKT_EXTERN, 262 TOKT_FORMATSTRING, 263 TOKT_CLASS, 264 TOKT_REPLACE, 265 TOKT_MODIFY, 266 TOKT_NEW, 267 TOKT_DELETE, 268 TOKT_THROW, 269 TOKT_TRY, 270 TOKT_CATCH, 271 TOKT_FINALLY, 272 TOKT_INTRINSIC, 273 TOKT_DICTIONARY, 274 TOKT_GRAMMAR, 275 TOKT_ENUM, 276 TOKT_TEMPLATE, 277 TOKT_STATIC, 278 TOKT_FOREACH, 279 TOKT_EXPORT, 280 TOKT_DELEGATED, 281 TOKT_TARGETPROP, 282 TOKT_PROPERTYSET, 283 TOKT_TARGETOBJ, 284 TOKT_DEFININGOBJ, 285 TOKT_TRANSIENT, 286 TOKT_REPLACED, 287 288 /* type names */ 289 TOKT_VOID, 290 TOKT_INTKW, 291 TOKT_STRING, 292 TOKT_LIST, 293 TOKT_BOOLEAN, 294 TOKT_PROPERTY, 295 TOKT_ANY 296 }; 297 298 /* ------------------------------------------------------------------------ */ 299 /* 300 * Source Block. 
/*
 *   As we read the source file, we need to keep quoted strings and symbol
 *   names around for later reference, in case they're needed after reading
 *   more tokens and flushing the line buffer.  We'll copy needed text into
 *   our source blocks, which we keep in memory throughout the compilation,
 *   so that we can be certain we can reference these strings at any time.
 */

/* size of a source block, in bytes */
const size_t TCTOK_SRC_BLOCK_SIZE = 50000;

/*
 *   Source block class.  Each block is a fixed-size byte buffer plus a
 *   link to the next block.  A block owns its successor: deleting a block
 *   deletes every block that follows it in the chain.
 */
class CTcTokSrcBlock
{
public:
    CTcTokSrcBlock()
    {
        /* no next block yet */
        nxt_ = 0;
    }

    ~CTcTokSrcBlock()
    {
        /*
         *   Delete the remainder of the chain iteratively.  We unlink
         *   each successor before deleting it, so that its own destructor
         *   finds nothing left to delete; this keeps the stack depth
         *   constant no matter how long the chain is (a recursive delete
         *   would use one stack frame per block).
         */
        CTcTokSrcBlock *cur = nxt_;
        nxt_ = 0;
        while (cur != 0)
        {
            /* remember the rest of the chain, then detach this block */
            CTcTokSrcBlock *rest = cur->nxt_;
            cur->nxt_ = 0;

            /* delete the detached block and move on */
            delete cur;
            cur = rest;
        }
    }

    /* get/set the next block */
    CTcTokSrcBlock *get_next() const { return nxt_; }
    void set_next(CTcTokSrcBlock *blk) { nxt_ = blk; }

    /* get a pointer to the block's buffer (TCTOK_SRC_BLOCK_SIZE bytes) */
    char *get_buf() { return buf_; }

private:
    /* the next block in the list (owned by this block) */
    CTcTokSrcBlock *nxt_;

    /* bytes of the list entry */
    char buf_[TCTOK_SRC_BLOCK_SIZE];
};


/* ------------------------------------------------------------------------ */
/*
 *   String Buffer.  We use these buffers for reading input lines and
 *   expanding macros.
 */
348 */ 349 class CTcTokString 350 { 351 public: CTcTokString()352 CTcTokString() 353 { 354 /* no buffer yet */ 355 buf_ = 0; 356 buf_len_ = 0; 357 buf_size_ = 0; 358 } 359 ~CTcTokString()360 virtual ~CTcTokString() 361 { 362 /* delete our buffer */ 363 if (buf_ != 0) 364 t3free(buf_); 365 } 366 367 /* ensure that a given amount of space if available */ ensure_space(size_t siz)368 virtual void ensure_space(size_t siz) 369 { 370 /* make sure there's room for the requested size plus a null byte */ 371 if (buf_size_ < siz + 1) 372 { 373 /* increase to the next 4k increment */ 374 buf_size_ = (siz + 4095 + 1) & ~4095; 375 376 /* allocate or re-allocate the buffer */ 377 if (buf_ == 0) 378 buf_ = (char *)t3malloc(buf_size_); 379 else 380 buf_ = (char *)t3realloc(buf_, buf_size_); 381 382 /* throw an error if that failed */ 383 if (buf_ == 0) 384 err_throw(TCERR_NO_STRBUF_MEM); 385 } 386 } 387 388 /* expand the buffer */ expand()389 void expand() 390 { 391 /* expand to the next 4k increment */ 392 ensure_space(buf_size_ + 4096); 393 } 394 395 /* get the text and the length of the text */ get_text()396 const char *get_text() const { return buf_; } get_text_len()397 size_t get_text_len() const { return buf_len_; } 398 399 /* get the end of the text */ get_text_end()400 const char *get_text_end() const { return buf_ + buf_len_; } 401 402 /* append text to the buffer */ append(const char * p,size_t len)403 virtual void append(const char *p, size_t len) 404 { 405 /* make sure we have space available */ 406 ensure_space(buf_len_ + len); 407 408 /* copy the text onto the end of our buffer */ 409 memcpy(buf_ + buf_len_, p, len); 410 411 /* add it to the length of the text */ 412 buf_len_ += len; 413 414 /* null-terminte it */ 415 buf_[buf_len_] = '\0'; 416 } 417 418 /* copy text into the buffer, replacing existing text */ copy(const char * p,size_t len)419 virtual void copy(const char *p, size_t len) 420 { 421 /* ensure we have enough space */ 422 ensure_space(len); 423 424 /* 
copy the text */ 425 memcpy(buf_, p, len); 426 427 /* set our length */ 428 buf_len_ = len; 429 430 /* null-terminate it */ 431 buf_[buf_len_] = '\0'; 432 } 433 434 /* clear any existing text */ clear_text()435 virtual void clear_text() 436 { 437 /* zero the length */ 438 buf_len_ = 0; 439 440 /* put a null terminator at the start of the buffer if possible */ 441 if (buf_size_ > 0) 442 buf_[0] = '\0'; 443 } 444 445 /* get the buffer, for copying text directly into it */ get_buf()446 virtual char *get_buf() const { return buf_; } get_buf_size()447 size_t get_buf_size() const { return buf_size_; } 448 449 /* 450 * Set the text length - use this after copying directly into the 451 * buffer to set the length, excluding the null terminator. We'll 452 * add a null terminator at the given length. 453 */ set_text_len(size_t len)454 virtual void set_text_len(size_t len) 455 { 456 /* set the new length */ 457 buf_len_ = len; 458 459 /* add a null terminator after the new length */ 460 if (len < buf_size_) 461 buf_[len] = '\0'; 462 } 463 464 protected: 465 /* buffer */ 466 char *buf_; 467 468 /* size of the buffer */ 469 size_t buf_size_; 470 471 /* length of the text in the buffer (excluding trailing null) */ 472 size_t buf_len_; 473 }; 474 475 476 /* 477 * String buffer subclass for a non-allocated string that merely 478 * references another buffer. This can be used anywhere a CTcString is 479 * required, but does not require any allocation. 480 * 481 * These objects can only be used in 'const' contexts: the underlying 482 * buffer cannot be changed or expanded, since we do not own the 483 * underlying buffer. 
484 */ 485 class CTcTokStringRef: public CTcTokString 486 { 487 public: CTcTokStringRef()488 CTcTokStringRef() 489 { 490 /* we have no referenced buffer yet */ 491 buf_ = 0; 492 buf_size_ = 0; 493 buf_len_ = 0; 494 } 495 ~CTcTokStringRef()496 ~CTcTokStringRef() 497 { 498 /* we don't own the underlying buffer, so simply forget about it */ 499 buf_ = 0; 500 } 501 502 /* we can't make any changes to the underlying buffer */ ensure_space(size_t)503 void ensure_space(size_t) { } append(const char *,size_t)504 void append(const char *, size_t) { assert(FALSE); } copy(const char *,size_t)505 void copy(const char *, size_t) { assert(FALSE); } clear_text()506 void clear_text() { assert(FALSE); } get_buf()507 char *get_buf() const { assert(FALSE); return 0; } set_text_len(size_t)508 void set_text_len(size_t) { assert(FALSE); } 509 510 /* set my underlying buffer */ set_buffer(const char * buf,size_t len)511 void set_buffer(const char *buf, size_t len) 512 { 513 buf_ = (char *)buf; 514 buf_size_ = len + 1; 515 buf_len_ = len; 516 } 517 }; 518 519 /* ------------------------------------------------------------------------ */ 520 /* 521 * Token 522 */ 523 class CTcToken 524 { 525 public: 526 /* get/set the token type */ gettyp()527 tc_toktyp_t gettyp() const { return typ_; } settyp(tc_toktyp_t typ)528 void settyp(tc_toktyp_t typ) { typ_ = typ; } 529 530 /* get/set the fully-expanded flag */ get_fully_expanded()531 int get_fully_expanded() const { return fully_expanded_; } set_fully_expanded(int flag)532 void set_fully_expanded(int flag) { fully_expanded_ = flag; } 533 534 /* get/set the text pointer */ get_text()535 const char *get_text() const { return text_; } get_text_len()536 size_t get_text_len() const { return text_len_; } set_text(const char * txt,size_t len)537 void set_text(const char *txt, size_t len) 538 { 539 text_ = txt; 540 text_len_ = len; 541 } 542 543 /* get/set the integer value */ get_int_val()544 long get_int_val() const { return int_val_; } set_int_val(long 
val)545 void set_int_val(long val) { typ_ = TOKT_INT; int_val_ = val; } 546 547 /* 548 * compare the text to the given string - returns true if the text 549 * matches, false if not 550 */ text_matches(const char * txt,size_t len)551 int text_matches(const char *txt, size_t len) const 552 { 553 return (len == text_len_ 554 && memcmp(txt, text_, len) == 0); 555 } 556 557 private: 558 /* token type */ 559 tc_toktyp_t typ_; 560 561 /* 562 * Pointer to the token's text. This is a pointer into the 563 * tokenizer's symbol table or into the token list itself, so this 564 * pointer is valid as long as the tokenizer and its token list are 565 * valid. 566 */ 567 const char *text_; 568 size_t text_len_; 569 570 /* integer value - valid when the token type is TOKT_INT */ 571 long int_val_; 572 573 /* 574 * flag: the token has been fully expanded, and should not be 575 * expanded further on any subsequent rescan for macros 576 */ 577 uint fully_expanded_ : 1; 578 }; 579 580 581 /* ------------------------------------------------------------------------ */ 582 /* 583 * Macro Expansion Resource object. This object is a collection of 584 * resources that are needed for a macro expansion. To avoid frequent 585 * allocating and freeing of these resources, we keep a pool of these 586 * objects around so that we can re-use them as needed. We'll 587 * dynamically expand the pool as necessary, so this doesn't impose any 588 * pre-set limits; it simply avoids lots of memory allocation activity. 
589 */ 590 class CTcMacroRsc 591 { 592 public: CTcMacroRsc()593 CTcMacroRsc() 594 { 595 /* we're not in any lists yet */ 596 next_avail_ = 0; 597 next_ = 0; 598 } 599 600 /* buffer for expansion of the whole line */ 601 CTcTokString line_exp_; 602 603 /* buffer for expansion of current macro on line */ 604 CTcTokString macro_exp_; 605 606 /* buffer for expansion of an actual parameter value */ 607 CTcTokString actual_exp_buf_; 608 609 /* next resource object in the "available" list */ 610 CTcMacroRsc *next_avail_; 611 612 /* next resource object in the master list */ 613 CTcMacroRsc *next_; 614 }; 615 616 617 /* ------------------------------------------------------------------------ */ 618 /* 619 * Abstract token source interface. This is used to allow external code 620 * to inject their own substreams into the main token stream. 621 */ 622 class CTcTokenSource 623 { 624 public: 625 /* 626 * Get the next token from the source. Returns null if there are no 627 * more tokens. 628 */ 629 virtual const CTcToken *get_next_token() = 0; 630 631 /* set the enclosing external token source and current token */ set_enclosing_source(CTcTokenSource * src,const CTcToken * tok)632 void set_enclosing_source(CTcTokenSource *src, const CTcToken *tok) 633 { 634 /* remember the enclosing source */ 635 enclosing_src_ = src; 636 637 /* remember the current token */ 638 enclosing_curtok_ = *tok; 639 } 640 641 /* get the enclosing external token source */ get_enclosing_source()642 CTcTokenSource *get_enclosing_source() const 643 { return enclosing_src_; } 644 645 /* get the token that was current when this source was inserted */ get_enclosing_curtok()646 const CTcToken *get_enclosing_curtok() const 647 { return &enclosing_curtok_; } 648 649 protected: 650 /* the enclosing external token source */ 651 CTcTokenSource *enclosing_src_; 652 653 /* 654 * the current token in effect enclosing this source - this is the 655 * token that comes immediately after the source's tokens, because a 656 * 
source is inserted before the current token 657 */ 658 CTcToken enclosing_curtok_; 659 }; 660 661 662 /* ------------------------------------------------------------------------ */ 663 /* 664 * Tokenizer. This object reads a file and constructs a representation 665 * of the file as a token list in memory. The tokenizer interprets 666 * preprocessor directives and expands macros. 667 */ 668 class CTcTokenizer 669 { 670 public: 671 /* 672 * Create the tokenizer and start reading from the given file. The 673 * default character set is generally specified by the user (on the 674 * compiler command line, for example), or obtained from the 675 * operating system. 676 */ 677 CTcTokenizer(class CResLoader *res_loader, const char *default_charset); 678 679 /* destroy the tokenizer */ 680 ~CTcTokenizer(); 681 682 /* 683 * Reset the tokenizer. Deletes the current source object and all 684 * saved token text. This can be used after compilation of a unit 685 * is completed and the intermediate parser state can be completely 686 * discarded. 687 */ 688 void reset(); 689 690 /* 691 * Set the source file. 'src_filename' is the fully-resolved local 692 * filename of the source file; 'orig_name' is the original name as 693 * given on the command line, in the makefile, or wherever it came 694 * from. We keep track of the original name so that we can pass 695 * information to the debugger indicating the name as it was originally 696 * given; this is more useful than the resolved filename, because we 697 * might want to run the debugger on another machine with a different 698 * local directory structure. 699 */ 700 int set_source(const char *src_filename, const char *orig_name); 701 702 /* set the source to a memory buffer */ 703 void set_source_buf(const char *buf); 704 705 /* 706 * Add a #include directory to the include path. We search the 707 * include path in the order in which they were defined. 
708 */ 709 void add_inc_path(const char *path); 710 711 /* 712 * Set preprocess-only mode. In this mode, we'll retain 713 * preprocessor directives that will be needed if the preprocessed 714 * result is itself compiled; for example, we'll retain #line, 715 * #pragma C, #error, and #pragma message directives. 716 */ set_mode_pp_only(int flag)717 void set_mode_pp_only(int flag) { pp_only_mode_ = flag; } 718 719 /* 720 * Set list-includes mode. In this mode, we'll simply scan source 721 * files and write to the standard output a list of the names of all 722 * of the #include files. 723 */ set_list_includes_mode(int flag)724 void set_list_includes_mode(int flag) { list_includes_mode_ = flag; } 725 726 /* 727 * Get/set the test-report mode. In this mode, we'll expand __FILE__ 728 * macros with the root name only. 729 */ get_test_report_mode()730 int get_test_report_mode() const { return test_report_mode_; } set_test_report_mode(int flag)731 void set_test_report_mode(int flag) { test_report_mode_ = flag; } 732 733 /* enable or disable preprocessing directives */ enable_pp(int enable)734 void enable_pp(int enable) { allow_pp_ = enable; } 735 736 /* get the type of the current token */ cur()737 tc_toktyp_t cur() const { return curtok_.gettyp(); } 738 739 /* get the next token, reading a new line of source if necessary */ 740 tc_toktyp_t next(); 741 742 /* 743 * Un-get the current token and back up to the previous token. The 744 * maximum un-get depth is one token - after un-getting one token, 745 * another token must not be un-gotten until after reading another 746 * token. 747 * 748 * Tokens un-got with this routine are accessible only to next(), 749 * not to any of the lower-level token readers. 750 */ 751 void unget(); 752 753 /* get the current token */ getcur()754 const class CTcToken *getcur() const { return &curtok_; } 755 756 /* 757 * Copy the current token. 
This makes a copy of the token's text in 758 * tokenizer source memory, to ensure that the reference to the text 759 * buffer the caller is keeping will remain valid forever. 760 */ 761 const class CTcToken *copycur(); 762 763 /* make a safely storable copy of a given token */ 764 void copytok(class CTcToken *dst, const class CTcToken *src); 765 766 /* check to see if the current token matches the given text */ 767 int cur_tok_matches(const char *txt, size_t len); 768 769 /* 770 * Set an external token source. We'll read tokens from this source 771 * until it is exhausted, at which point we'll revert to the enclosing 772 * source. 773 * 774 * The new source is inserted before the current token, so the current 775 * token will become current once again when this source is exhausted. 776 * We'll automatically advance to the next token, which (unless we 777 * have an ungotten token stashed) will go to the first token in the 778 * new source. 779 */ set_external_source(CTcTokenSource * src)780 void set_external_source(CTcTokenSource *src) 781 { 782 /* 783 * store the old source in the new source, so we can restore the 784 * old source when we have exhausted the new source 785 */ 786 src->set_enclosing_source(ext_src_, &curtok_); 787 788 /* set the new external source */ 789 ext_src_ = src; 790 791 /* skip to the next token */ 792 next(); 793 } 794 795 /* clear all external sources, returning to the real token stream */ 796 void clear_external_sources(); 797 798 /* 799 * assume that we should have found '>>' sequence after an embedded 800 * expression in a string - used by parsers to resynchronize after 801 * an apparent syntax error 802 */ 803 void assume_missing_dstr_cont(); 804 805 /* define a macro */ 806 void add_define(const char *sym, size_t len, const char *expansion, 807 size_t expan_len); 808 add_define(const char * sym,const char * expansion,size_t expan_len)809 void add_define(const char *sym, const char *expansion, size_t expan_len) 810 { add_define(sym, 
strlen(sym), expansion, expan_len); } 811 add_define(const char * sym,const char * expansion)812 void add_define(const char *sym, const char *expansion) 813 { add_define(sym, strlen(sym), expansion, strlen(expansion)); } 814 815 /* add a macro, given the symbol entry */ 816 void add_define(class CTcHashEntryPp *entry); 817 818 /* undefine a previously defined macro */ 819 void undefine(const char *sym, size_t len); undefine(const char * sym)820 void undefine(const char *sym) { undefine(sym, strlen(sym)); } 821 822 /* find a #define symbol */ 823 class CTcHashEntryPp *find_define(const char *sym, size_t len) const; 824 825 /* find an #undef symbol */ 826 class CTcHashEntryPp *find_undef(const char *sym, size_t len) const; 827 828 /* enumerate all of the #define symbols through a callback */ 829 void enum_defines(void (*func)(void *ctx, class CTcHashEntryPp *entry), 830 void *ctx); 831 832 /* read the next line and handle preprocessor directives */ 833 int read_line_pp(); 834 835 /* get the file descriptor and line number of the last line read */ get_last_desc()836 class CTcTokFileDesc *get_last_desc() const { return last_desc_; } get_last_linenum()837 long get_last_linenum() const { return last_linenum_; } get_last_pos(class CTcTokFileDesc ** desc,long * linenum)838 void get_last_pos(class CTcTokFileDesc **desc, long *linenum) const 839 { 840 *desc = last_desc_; 841 *linenum = last_linenum_; 842 } 843 844 /* 845 * set the current file descriptor and line number -- this can be 846 * used to force the line position to a previously-saved value 847 * (during code generation, for example) for error-reporting and 848 * debug-record purposes 849 */ set_line_info(class CTcTokFileDesc * desc,long linenum)850 void set_line_info(class CTcTokFileDesc *desc, long linenum) 851 { 852 last_desc_ = desc; 853 last_linenum_ = linenum; 854 } 855 856 /* 857 * Parse a preprocessor constant expression. 
We always parse out of 858 * the macro expansion buffer (expbuf_), but the caller must set p_ 859 * to point to the starting point on the expansion line prior to 860 * calling this routine. 861 * 862 * If 'read_first' is true, we'll read a token into curtok_ before 863 * parsing; otherwise, we'll assume the caller has already primed 864 * the pump by reading the first token. 865 * 866 * If 'last_on_line' is true, we'll flag an error if anything is 867 * left on the line after we finish parsing the expression. 868 * 869 * If 'add_line_ending' is true, we'll add an end-of-line marker to 870 * the expansion buffer, so that the tokenizer won't attempt to read 871 * past the end of the line. Since a preprocessor expression must 872 * be contained entirely on a single logical line, we must never try 873 * to read past the end of the current line when parsing a 874 * preprocessor expression. 875 */ 876 int pp_parse_expr(class CTcConstVal *result, 877 int read_first, int last_on_line, int add_line_ending); 878 879 /* log an error, optionally with parameters */ 880 static void log_error(int errnum, ...); 881 882 /* 883 * log an error with the current token text as the parameter, 884 * suitable for a "%.*s" format list entry (hence we'll provide two 885 * parameters: an integer with the length of the token text, and a 886 * pointer to the token text string) 887 */ 888 void log_error_curtok(int errnum); 889 890 /* log a warning, optionally with parameters */ 891 static void log_warning(int errnum, ...); 892 893 /* log a warning with the current token as the parameter */ 894 void log_warning_curtok(int errnum); 895 896 /* log a warning or error for the current token */ 897 void log_error_or_warning_curtok(tc_severity_t sev, int errnum); 898 899 /* log a warning or error for a given token */ 900 void log_error_or_warning_with_tok(tc_severity_t sev, int errnum, 901 const CTcToken *tok); 902 903 /* 904 * log then throw a fatal error (this is different from an internal 905 * error 
in that it indicates an unrecoverable error in the input; 906 * an internal error indicates that something is wrong with the 907 * compiler itself) 908 */ 909 static void throw_fatal_error(int errnum, ...); 910 911 /* 912 * log then throw an internal error (internal errors are always 913 * fatal: these indicate that something has gone wrong in the 914 * compiler, and are equivalent to an assert failure) 915 */ 916 static void throw_internal_error(int errnum, ...); 917 918 /* display a string/number value */ 919 void msg_str(const char *str, size_t len) const; 920 void msg_long(long val) const; 921 922 /* get the current line */ get_cur_line()923 const char *get_cur_line() const { return linebuf_.get_text(); } get_cur_line_len()924 size_t get_cur_line_len() const { return linebuf_.get_text_len(); } 925 926 /* get the #define hash table */ get_defines_table()927 class CVmHashTable *get_defines_table() const { return defines_; } 928 929 /* 930 * look up a token as a keyword; returns true and fills in 'kw' with 931 * the keyword token ID if the token is in fact a keyword, or 932 * returns false if it's not a keyword 933 */ 934 int look_up_keyword(const CTcToken *tok, tc_toktyp_t *kw); 935 936 /* 937 * Get the next token on the line, filling in the token object. 938 * Advances the pointer to the character immediately following the 939 * token. 940 * 941 * If the token is a string, and the string contains backslash 942 * sequences, we'll modify the source string by translating each 943 * backslash sequences; for example, a "\n" sequence is changed into 944 * an ASCII 10. 945 */ 946 static tc_toktyp_t next_on_line(utf8_ptr *p, CTcToken *tok, 947 int *in_embedding); 948 949 /* 950 * Get the text of an operator token. Returns a pointer to a 951 * constant, static, null-terminated string, suitable for use in 952 * error messages. 953 */ 954 static const char *get_op_text(tc_toktyp_t op); 955 956 /* 957 * Store text in the source list. 
Text stored here is available 958 * throughout compilation. 959 */ 960 const char *store_source(const char *txt, size_t len); 961 962 /* 963 * Get the index of the next source file descriptor that will be 964 * created. The linker can use this information to fix up 965 * references to file descriptors in an object file when loading 966 * multiple object files. 967 */ get_next_filedesc_index()968 int get_next_filedesc_index() const { return next_filedesc_id_; } 969 970 /* get the number of source file descriptors in the master list */ get_filedesc_count()971 int get_filedesc_count() const { return next_filedesc_id_; } 972 973 /* get the file descriptor at the given (0-based) index */ get_filedesc(size_t idx)974 class CTcTokFileDesc *get_filedesc(size_t idx) const 975 { 976 /* return the array entry at the index, if the index is valid */ 977 return (idx < desc_list_cnt_ ? desc_list_[idx] : 0); 978 } 979 980 /* get the head of the master source file descriptor list */ get_first_filedesc()981 class CTcTokFileDesc *get_first_filedesc() const { return desc_head_; } 982 983 /* 984 * Create a new file descriptor and add it to the master list. This 985 * creates the new descriptor unconditionally, even if a descriptor 986 * for the same source file already exists. 987 */ create_file_desc(const char * fname,size_t len)988 class CTcTokFileDesc *create_file_desc(const char *fname, size_t len) 989 { return get_file_desc(fname, len, TRUE, fname, len); } 990 991 /* 992 * Set the string capture file. Once this is set, we'll write the 993 * contents of each string token that we encounter to this file, 994 * with a newline after each token. 995 */ 996 void set_string_capture(osfildef *fp); 997 998 /* write macros to a file, for debugger use */ 999 void write_macros_to_file_for_debug(class CVmFile *fp); 1000 1001 /* 1002 * Load macros from a file. If any errors occur, we'll flag them 1003 * through the error handler object and return a non-zero value. 1004 * Returns zero on success. 
1005 */ 1006 int load_macros_from_file(class CVmStream *fp, 1007 class CTcTokLoadMacErr *err_handler); 1008 1009 /* receive notification that the compiler is done with all parsing */ parsing_done()1010 void parsing_done() 1011 { 1012 /* forget any input file position */ 1013 set_line_info(0, 0); 1014 } 1015 1016 private: 1017 /* skip whitespace and token markers */ 1018 static void skip_ws_and_markers(utf8_ptr *p); 1019 1020 /* 1021 * get the next token on the line; if we go past the end of the 1022 * string buffer, we'll return EOF 1023 */ 1024 static tc_toktyp_t next_on_line(const CTcTokString *srcbuf, utf8_ptr *p, 1025 CTcToken *tok, int *in_embedding); 1026 1027 /* 1028 * get the next token on the current line, updating the internal 1029 * character position pointer to point just past the token, and filling 1030 * in the internal current token object with the token data 1031 */ next_on_line()1032 tc_toktyp_t next_on_line() { return next_on_line(&p_, &curtok_, 0); } 1033 1034 /* get the next token on the line, with string translation */ next_on_line_xlat(int * in_embedding)1035 tc_toktyp_t next_on_line_xlat(int *in_embedding) 1036 { return next_on_line_xlat(&p_, &curtok_, in_embedding); } 1037 1038 /* 1039 * get the next token, translating strings and storing string and 1040 * symbol text in the source block list 1041 */ 1042 tc_toktyp_t next_on_line_xlat_keep(); 1043 1044 /* 1045 * get the next token on the line, translating strings to internal 1046 * format 1047 */ 1048 tc_toktyp_t next_on_line_xlat(utf8_ptr *p, CTcToken *tok, 1049 int *in_embedding); 1050 1051 /* 1052 * translate a string to internal format by converting escape 1053 * sequences; overwrites the original buffer 1054 */ 1055 tc_toktyp_t xlat_string(utf8_ptr *p, CTcToken *tok, 1056 int *in_embedding); 1057 1058 /* 1059 * translate a string into a given buffer; if 'force_embed_end' is 1060 * true, we'll act as though we're continuing the string after the 1061 * '>>' after an embedded expression, no
matter what the actual 1062 * input looks like 1063 */ 1064 tc_toktyp_t xlat_string_to(char *dst, utf8_ptr *p, CTcToken *tok, 1065 int *in_embedding, int force_embed_end); 1066 1067 /* 1068 * Translate a string, saving the translated version in the source 1069 * block list. If 'force_end_embed' is true, we'll act as though we 1070 * were looking at '>>' (or, more precisely, we'll act as though 1071 * '>>' immediately preceded the current input), regardless of what 1072 * the actual input looks like. 1073 */ 1074 tc_toktyp_t xlat_string_to_src(int *in_embedding, int force_end_embed); 1075 1076 /* initialize the source block list */ 1077 void init_src_block_list(); 1078 1079 /* delete current source file, including all including parents */ 1080 void delete_source(); 1081 1082 /* 1083 * read the next line; processes comments, but does not expand 1084 * macros or parse preprocessor directives 1085 */ 1086 char *read_line(int append); 1087 1088 /* set the source read pointer to the start of a new line */ start_new_line(char * p,size_t len)1089 void start_new_line(char *p, size_t len) 1090 { 1091 /* set the read pointer to the start of the line */ 1092 p_.set(p); 1093 1094 /* remember where the current line starts and its total length */ 1095 line_start_ = p; 1096 line_len_ = len; 1097 } 1098 1099 /* reserve space for text in the source list */ 1100 void reserve_source(size_t len); 1101 1102 /* 1103 * Commit space in the source list - this is used when text is 1104 * directly stored after reserving space. The size reserved may be 1105 * greater than the size committed, because it is sometimes more 1106 * efficient to make a guess that may overestimate the amount we 1107 * actually end up needing. 
1108 */ 1109 void commit_source(size_t len); 1110 1111 /* unsplice text from the current line and make it the next line */ 1112 void unsplice_line(const char *new_line_start); 1113 1114 /* parse a string */ 1115 static tc_toktyp_t tokenize_string(utf8_ptr *p, CTcToken *tok, 1116 int *in_embedding); 1117 1118 /* process comments */ 1119 void process_comments(size_t start_ofs); 1120 1121 /* splice lines for a string that runs across multiple lines */ 1122 void splice_string(); 1123 1124 /* expand macros in the current line */ 1125 int expand_macros_curline(int read_more, int allow_defined, 1126 int append_to_expbuf); 1127 1128 /* expand all of the macros in the given text */ 1129 int expand_macros(class CTcTokString *srcbuf, utf8_ptr *src, 1130 class CTcTokString *expbuf, int read_more, 1131 int allow_defined, int append); 1132 1133 /* expand the macro at the current token on the current line */ 1134 int expand_macro(class CTcMacroRsc *res, class CTcTokString *expbuf, 1135 const class CTcTokString *srcbuf, utf8_ptr *src, 1136 size_t macro_srcbuf_ofs, CTcHashEntryPp *entry, 1137 int read_more, int allow_defined, int *expanded); 1138 1139 /* scan for a prior expansion of a macro within the current context */ 1140 static int scan_for_prior_expansion(utf8_ptr src, const char *src_end, 1141 const class CTcHashEntryPp *entry); 1142 1143 /* remove end-of-macro-expansion flags from a buffer */ 1144 static void remove_end_markers(class CTcTokString *buf); 1145 1146 /* change a buffer to use individual token full-expansion markers */ 1147 void mark_full_exp_tokens(CTcTokString *dstbuf, 1148 const class CTcTokString *srcbuf, 1149 int append) const; 1150 1151 /* allocate a macro expansion resource */ 1152 class CTcMacroRsc *alloc_macro_rsc(); 1153 1154 /* release a macro expansion resource */ 1155 void release_macro_rsc(class CTcMacroRsc *rsc); 1156 1157 /* 1158 * Parse the actual parameters to a macro. 
Fills in argofs[] and 1159 * arglen[] with the offsets (from srcbuf->get_buf()) and lengths, 1160 * respectively, of each actual parameter's text. 1161 */ 1162 int parse_macro_actuals(const class CTcTokString *srcbuf, utf8_ptr *src, 1163 const CTcHashEntryPp *macro_entry, 1164 size_t argofs[TOK_MAX_MACRO_ARGS], 1165 size_t arglen[TOK_MAX_MACRO_ARGS], 1166 int read_more, int *found_actuals); 1167 1168 /* splice the next line for reading more macro actuals */ 1169 tc_toktyp_t actual_splice_next_line(const CTcTokString *srcbuf, 1170 utf8_ptr *src, CTcToken *tok); 1171 1172 /* substitute the actual parameters in a macro's expansion */ 1173 int substitute_macro_actuals(class CTcMacroRsc *rsc, 1174 class CTcTokString *subexp, 1175 CTcHashEntryPp *macro_entry, 1176 const class CTcTokString *srcbuf, 1177 const size_t *argofs, const size_t *arglen, 1178 int allow_defined); 1179 1180 /* stringize a macro actual parameter into an expansion buffer */ 1181 void stringize_macro_actual(class CTcTokString *expbuf, 1182 const char *actual_val, size_t actual_len, 1183 char quote_char, int add_open_quote, 1184 int add_close_quote); 1185 1186 /* skip a delimited macro expansion area (#foreach, #ifempty, etc) */ 1187 void skip_delimited_group(utf8_ptr *p, int parts_to_skip); 1188 1189 /* expand a defined() preprocessor operator */ 1190 int expand_defined(class CTcTokString *subexp, 1191 const class CTcTokString *srcbuf, utf8_ptr *src); 1192 1193 /* add a file to the list of files to be included only once */ 1194 void add_include_once(const char *fname); 1195 1196 /* find a file in the list of files to be included only once */ 1197 int find_include_once(const char *fname); 1198 1199 /* process a #pragma directive */ 1200 void pp_pragma(); 1201 1202 /* process a #charset directive */ 1203 void pp_charset(); 1204 1205 /* process a #include directive */ 1206 void pp_include(); 1207 1208 /* process a #define directive */ 1209 void pp_define(); 1210 1211 /* process a #if directive */ 1212 
void pp_if(); 1213 1214 /* process a #ifdef directive */ 1215 void pp_ifdef(); 1216 1217 /* process a #ifndef directive */ 1218 void pp_ifndef(); 1219 1220 /* process a #ifdef or #ifndef */ 1221 void pp_ifdef_or_ifndef(int sense); 1222 1223 /* process a #else directive */ 1224 void pp_else(); 1225 1226 /* process a #elif directive */ 1227 void pp_elif(); 1228 1229 /* process a #endif directive */ 1230 void pp_endif(); 1231 1232 /* process a #error directive */ 1233 void pp_error(); 1234 1235 /* process a #undef directive */ 1236 void pp_undef(); 1237 1238 /* process a #line directive */ 1239 void pp_line(); 1240 1241 /* get a lone identifier for a preprocessor directive */ 1242 int pp_get_lone_ident(char *buf, size_t bufl); 1243 1244 /* process a #pragma C directive */ 1245 // void pragma_c(); - not currently used 1246 1247 /* process a #pragma once directive */ 1248 void pragma_once(); 1249 1250 /* process a #pragma all_once directive */ 1251 void pragma_all_once(); 1252 1253 /* process a #pragma message directive */ 1254 void pragma_message(); 1255 1256 /* process a #pragma newline_spacing(on/off) directive */ 1257 void pragma_newline_spacing(); 1258 1259 /* 1260 * Determine if we're in a false #if branch. If we're inside a #if 1261 * block, and the state is either IF_NO, IF_DONE, or ELSE_NO, or 1262 * we're inside a #if nested within any negative branch, we're in a 1263 * not-taken branch of a #if block.
1264 */ in_false_if()1265 int in_false_if() const 1266 { 1267 return (if_sp_ != 0 1268 && (if_false_level_ != 0 1269 || if_stack_[if_sp_ - 1].state == TOKIF_IF_NO 1270 || if_stack_[if_sp_ - 1].state == TOKIF_IF_DONE 1271 || if_stack_[if_sp_ - 1].state == TOKIF_ELSE_NO)); 1272 } 1273 1274 /* push a new #if level with the given state */ 1275 void push_if(tok_if_t state); 1276 1277 /* get the current #if state */ get_if_state()1278 tok_if_t get_if_state() const 1279 { 1280 if (if_sp_ == 0) 1281 return TOKIF_NONE; 1282 else 1283 return if_stack_[if_sp_ - 1].state; 1284 } 1285 1286 /* switch the current #if level to the given state */ change_if_state(tok_if_t state)1287 void change_if_state(tok_if_t state) 1288 { 1289 if (if_sp_ != 0) 1290 if_stack_[if_sp_ - 1].state = state; 1291 } 1292 1293 /* pop the current #if level */ 1294 void pop_if(); 1295 1296 /* 1297 * Find or create a descriptor for the given filename. 'fname' is 1298 * the full file system path specifying the file. 'orig_fname' is 1299 * the filename as originally specified by the user, if different; 1300 * in the case of #include files, this indicates the name that was 1301 * specified in the directive itself, whereas 'fname' is the actual 1302 * filename that resulted from searching the include path for the 1303 * given name. 1304 */ 1305 class CTcTokFileDesc *get_file_desc(const char *fname, size_t fname_len, 1306 int always_create, 1307 const char *orig_fname, 1308 size_t orig_fname_len); 1309 1310 /* clear the line buffer */ 1311 void clear_linebuf(); 1312 1313 /* flag: ALL_ONCE mode - we include each file only once */ 1314 int all_once_ : 1; 1315 1316 /* flag: warn on ignoring a redundant #include file */ 1317 int warn_on_ignore_incl_ : 1; 1318 1319 /* 1320 * Flag: in preprocess-only mode. In this mode, we'll leave certain 1321 * preprocessor directives intact in the source, since they'll be 1322 * needed in a subsequent compilation of the preprocessed source. 
1323 * For example, we'll leave #line directives, #pragma C, #error, and 1324 * #pragma message directives in the preprocessed result. 1325 */ 1326 int pp_only_mode_ : 1; 1327 1328 /* 1329 * Flag: in test reporting mode. In this mode, we'll expand __FILE__ 1330 * macros with the root name only. 1331 */ 1332 int test_report_mode_ : 1; 1333 1334 /* 1335 * Flag: in preprocess-for-includes mode. In this mode, we'll do 1336 * nothing except run the preprocessor and generate a list of the 1337 * header files that are included, along with header files they 1338 * include, and so on. 1339 */ 1340 int list_includes_mode_ : 1; 1341 1342 /* 1343 * Flag: treat newlines in strings as whitespace. When this is true, 1344 * whenever we find a newline character in a string, we'll convert the 1345 * newline and all leading whitespace on the next line to a single 1346 * space character. When this is false, we'll entirely strip out each 1347 * newline in a string and all whitespace that immediately follows; 1348 * this mode is desirable for some languages, such as Chinese, where 1349 * whitespace is not conventionally used as a token separator in 1350 * ordinary text. 
1351 */ 1352 int string_newline_spacing_ : 1; 1353 1354 /* 1355 * flag: we're parsing a preprocessor constant expression (for a 1356 * #if, for example; this doesn't apply to simple macro expansion) 1357 */ 1358 int in_pp_expr_ : 1; 1359 1360 /* resource loader */ 1361 class CResLoader *res_loader_; 1362 1363 /* 1364 * name of our default character set - this is generally specified 1365 * by the user (on the compiler command line, for example), or 1366 * obtained from the operating system 1367 */ 1368 char *default_charset_; 1369 1370 /* input (to unicode) character mapper for the default character set */ 1371 class CCharmapToUni *default_mapper_; 1372 1373 /* head of list of previously-included files */ 1374 struct tctok_incfile_t *prev_includes_; 1375 1376 /* head and tail of include path list */ 1377 struct tctok_incpath_t *incpath_head_; 1378 struct tctok_incpath_t *incpath_tail_; 1379 1380 /* file descriptor and line number of last line read */ 1381 class CTcTokFileDesc *last_desc_; 1382 long last_linenum_; 1383 1384 /* file descriptor and line number of last line appended */ 1385 class CTcTokFileDesc *appended_desc_; 1386 long appended_linenum_; 1387 1388 /* current input stream */ 1389 class CTcTokStream *str_; 1390 1391 /* master list of file descriptors */ 1392 class CTcTokFileDesc *desc_head_; 1393 class CTcTokFileDesc *desc_tail_; 1394 1395 /* 1396 * array of file descriptors (we keep the list in both an array and 1397 * a linked list, since we need both sequential and indexed access; 1398 * this isn't a lot of trouble since we never need to remove an 1399 * entry from the list) 1400 */ 1401 class CTcTokFileDesc **desc_list_; 1402 1403 /* number of entries in desc_list_ */ 1404 size_t desc_list_cnt_; 1405 1406 /* number of slots allocated in desc_list_ array */ 1407 size_t desc_list_alo_; 1408 1409 /* next file descriptor ID to be assigned */ 1410 int next_filedesc_id_; 1411 1412 /* pointer to current position in current line */ 1413 utf8_ptr p_; 1414 
1415 /* pointer to start of current line, and length of current line */ 1416 const char *line_start_; 1417 size_t line_len_; 1418 1419 /* input buffer */ 1420 CTcTokString linebuf_; 1421 1422 /* 1423 * unsplice buffer - we'll put any unspliced text into this buffer, 1424 * then read it back at the next read_line() 1425 */ 1426 CTcTokString unsplicebuf_; 1427 1428 /* 1429 * Flag: in a string. If this is '\0', we're not in a string; 1430 * otherwise, this is the quote character that ends the string. 1431 */ 1432 wchar_t in_quote_; 1433 1434 /* flag: in an embedded expression during line processing */ 1435 uint comment_in_embedding_ : 1; 1436 1437 /* flag: macro processing token stream is in an embedded expression */ 1438 int macro_in_embedding_; 1439 1440 /* flag: main token stream is in an embedded expression */ 1441 int main_in_embedding_; 1442 1443 /* 1444 * #if state stack. if_sp_ is the index of the next nesting slot; 1445 * if if_sp_ is zero, it means that we're not in a #if at all. 1446 * 1447 * Separately, the if_false_level_ is the level of #if's contained 1448 * within a false #if branch. This is separate because, once we're 1449 * in a false #if branch, everything within it is false. 1450 */ 1451 int if_sp_; 1452 tok_if_info_t if_stack_[TOK_MAX_IF_NESTING]; 1453 int if_false_level_; 1454 1455 /* source block list head */ 1456 CTcTokSrcBlock *src_head_; 1457 1458 /* current (and last) source block */ 1459 CTcTokSrcBlock *src_cur_; 1460 1461 /* pointer to next available byte in the current source block */ 1462 char *src_ptr_; 1463 1464 /* number of bytes remaining in the current source block */ 1465 size_t src_rem_; 1466 1467 /* current token */ 1468 CTcToken curtok_; 1469 1470 /* previous token (for unget) */ 1471 CTcToken prvtok_; 1472 1473 /* 1474 * next token, if a token has been un-gotten, and a flag indicating 1475 * that this is indeed the case. 
1476 */ 1477 CTcToken nxttok_; 1478 unsigned int nxttok_valid_ : 1; 1479 1480 /* the external token source, if any */ 1481 CTcTokenSource *ext_src_; 1482 1483 /* macro expansion buffer */ 1484 CTcTokString expbuf_; 1485 1486 /* symbol table for #define symbols */ 1487 class CVmHashTable *defines_; 1488 1489 /* 1490 * symbol table for symbols explicitly undefined; we keep track of 1491 * these so that we can exclude anything ever undefined from the debug 1492 * macro records, since only static global macros can be handled in the 1493 * debug records 1494 */ 1495 class CVmHashTable *undefs_; 1496 1497 /* symbol table for TADS keywords */ 1498 class CVmHashTable *kw_; 1499 1500 /* head of macro resource pool list */ 1501 class CTcMacroRsc *macro_res_head_; 1502 1503 /* head of list of available macro resources */ 1504 class CTcMacroRsc *macro_res_avail_; 1505 1506 /* 1507 * string capture file - if this is non-null, we'll capture all of 1508 * the strings we read to this file, one string per line 1509 */ 1510 osfildef *string_fp_; 1511 1512 /* character mapper for writing to the string capture file */ 1513 class CCharmapToLocal *string_fp_map_; 1514 1515 /* true -> allow preprocessor directives */ 1516 unsigned int allow_pp_; 1517 }; 1518 1519 /* ------------------------------------------------------------------------ */ 1520 /* 1521 * Error handler interface. Callers of load_macros_from_file() in 1522 * CTcTokenizer must provide an implementation of this interface to handle 1523 * errors that occur while loading macros. 1524 */ 1525 class CTcTokLoadMacErr 1526 { 1527 public: 1528 /* 1529 * Flag an error. 
The error codes are taken from the following list: 1530 * 1531 * 1 - a macro name symbol in the file is too long (it exceeds the 1532 * maximum symbol length for the preprocessor) 1533 * 1534 * 2 - a formal parameter name is too long 1535 */ 1536 virtual void log_error(int err) = 0; 1537 }; 1538 1539 /* ------------------------------------------------------------------------ */ 1540 /* 1541 * Tokenizer File Descriptor. Each unique source file has a separate 1542 * file descriptor, which keeps track of the file's name. 1543 */ 1544 class CTcTokFileDesc 1545 { 1546 public: 1547 /* create a file descriptor */ 1548 CTcTokFileDesc(const char *fname, size_t fname_len, int index, 1549 CTcTokFileDesc *orig_desc, 1550 const char *orig_fname, size_t orig_fname_len); 1551 1552 /* delete the descriptor */ 1553 ~CTcTokFileDesc(); 1554 1555 /* get the filename */ get_fname()1556 const char *get_fname() const { return fname_; } 1557 1558 /* get the original filename string */ get_orig_fname()1559 const char *get_orig_fname() const { return orig_fname_; } 1560 1561 /* 1562 * get the filename as a double-quoted string (backslashes and 1563 * double-quotes will be escaped with backslashes) 1564 */ get_dquoted_fname()1565 const char *get_dquoted_fname() const { return dquoted_fname_; } 1566 1567 /* 1568 * get the root filename (i.e., with no path prefix) as a 1569 * double-quoted string 1570 */ get_dquoted_rootname()1571 const char *get_dquoted_rootname() const { return dquoted_rootname_; } 1572 1573 /* get the filename as a single-quoted string */ get_squoted_fname()1574 const char *get_squoted_fname() const { return squoted_fname_; } 1575 1576 /* get the root filename as a single-quoted string */ get_squoted_rootname()1577 const char *get_squoted_rootname() const { return squoted_rootname_; } 1578 1579 /* get/set the next file descriptor in the descriptor chain */ get_next()1580 CTcTokFileDesc *get_next() const { return next_; } set_next(CTcTokFileDesc * nxt)1581 void 
set_next(CTcTokFileDesc *nxt) { next_ = nxt; } 1582 1583 /* get my index in the master list */ get_index()1584 int get_index() const { return index_; } 1585 1586 /* get the original descriptor for this file in the list */ get_orig()1587 CTcTokFileDesc *get_orig() const { return orig_; } 1588 1589 /* 1590 * get the list index of the original entry (returns my own list 1591 * index if I am the original entry) 1592 */ get_orig_index()1593 int get_orig_index() const 1594 { return orig_ == 0 ? index_ : orig_->get_index(); } 1595 1596 /* 1597 * Add a source line position to our list. We keep an index of the 1598 * byte-code address for each executable source line, so that 1599 * debuggers can find the compiled code corresponding to a source 1600 * location. The image builder gives us this information during the 1601 * linking process. The address is the absolute location in the 1602 * image file of the executable code for the given source line (the 1603 * first line in the file is numbered 1). 1604 */ 1605 void add_source_line(ulong linenum, ulong line_addr); 1606 1607 /* 1608 * Enumerate the source lines, calling the callback for each one. 1609 * We will only enumerate source lines which actually have an 1610 * associated code location - source lines that generated no 1611 * executable code are skipped. We'll enumerate the lines in 1612 * ascending order of line number, and each line number will appear 1613 * only once. 
1614 */ 1615 void enum_source_lines(void (*cbfunc)(void *ctx, ulong linenum, 1616 ulong byte_code_addr), 1617 void *cbctx); 1618 1619 private: 1620 /* index in the master list */ 1621 int index_; 1622 1623 /* filename string - this is the actual file system filename */ 1624 char *fname_; 1625 1626 /* 1627 * original filename string, if different from fname_ - this is the 1628 * filename as specified by the user, before it was adjusted with 1629 * include paths or other extra location information 1630 */ 1631 char *orig_fname_; 1632 1633 /* double-quoted version of the filename */ 1634 char *dquoted_fname_; 1635 1636 /* single-quoted version of the filename */ 1637 char *squoted_fname_; 1638 1639 /* single-quoted version of the root filename */ 1640 char *squoted_rootname_; 1641 1642 /* double-quoted version of the root filename */ 1643 char *dquoted_rootname_; 1644 1645 /* next descriptor in the master descriptor list */ 1646 CTcTokFileDesc *next_; 1647 1648 /* 1649 * The original file descriptor with the same filename. If we 1650 * create multiple descriptors for the same filename (because, for 1651 * example, the same header is included in several different object 1652 * files), we'll keep track of the original descriptor for the file 1653 * in all of the copies. 
1654 */ 1655 CTcTokFileDesc *orig_; 1656 1657 /* source line pages */ 1658 struct CTcTokSrcPage **src_pages_; 1659 1660 /* number of source line page slots allocated */ 1661 size_t src_pages_alo_; 1662 }; 1663 1664 1665 /* ------------------------------------------------------------------------ */ 1666 /* 1667 * Tokenizer Input Stream 1668 */ 1669 class CTcTokStream 1670 { 1671 public: 1672 /* create a token stream */ 1673 CTcTokStream(class CTcTokFileDesc *desc, class CTcSrcObject *src, 1674 CTcTokStream *parent, int charset_error, 1675 int init_if_level); 1676 1677 /* delete the stream */ 1678 ~CTcTokStream(); 1679 1680 /* get/set the associated file descriptor */ get_desc()1681 class CTcTokFileDesc *get_desc() const { return desc_; } set_desc(class CTcTokFileDesc * desc)1682 void set_desc(class CTcTokFileDesc *desc) { desc_ = desc; } 1683 1684 /* get the underlying source file */ get_src()1685 class CTcSrcObject *get_src() const { return src_; } 1686 1687 /* get the line number of the next line to be read */ get_next_linenum()1688 long get_next_linenum() const { return next_linenum_; } 1689 1690 /* set the next line number */ set_next_linenum(long l)1691 void set_next_linenum(long l) { next_linenum_ = l; } 1692 1693 /* get the enclosing stream */ get_parent()1694 CTcTokStream *get_parent() const { return parent_; } 1695 1696 /* count having read a line */ count_line()1697 void count_line() { ++next_linenum_; } 1698 1699 /* was there a #charset error when opening the file? 
*/ get_charset_error()1700 int get_charset_error() const { return charset_error_; } 1701 1702 /* get/set the in-comment status */ is_in_comment()1703 int is_in_comment() const { return in_comment_; } set_in_comment(int f)1704 void set_in_comment(int f) { in_comment_ = f; } 1705 1706 /* get/set the pragma C mode */ 1707 // int is_pragma_c() const { return pragma_c_; } 1708 // void set_pragma_c(int f) { pragma_c_ = f; } 1709 1710 /* get/set if nesting level at the start of the file */ get_init_if_level()1711 int get_init_if_level() const { return init_if_level_; } set_init_if_level(int level)1712 void set_init_if_level(int level) { init_if_level_ = level; } 1713 1714 /* get/set the newline spacing mode */ get_newline_spacing()1715 int get_newline_spacing() const { return newline_spacing_; } set_newline_spacing(int f)1716 void set_newline_spacing(int f) { newline_spacing_ = f; } 1717 1718 private: 1719 /* file descriptor associated with this file */ 1720 class CTcTokFileDesc *desc_; 1721 1722 /* the underlying source reader */ 1723 class CTcSrcObject *src_; 1724 1725 /* 1726 * the enclosing stream - this is the stream that #include'd the 1727 * current stream 1728 */ 1729 CTcTokStream *parent_; 1730 1731 /* line number of next line to be read */ 1732 ulong next_linenum_; 1733 1734 /* #if nesting level at the start of the file */ 1735 int init_if_level_; 1736 1737 /* flag: we were unable to load the map in the #charset directive */ 1738 uint charset_error_ : 1; 1739 1740 /* the stream is in a multi-line comment */ 1741 uint in_comment_ : 1; 1742 1743 /* newline_spacing mode when the stream was stacked */ 1744 uint newline_spacing_ : 1; 1745 1746 /* flag: we're in #pragma C+ mode */ 1747 // uint pragma_c_ : 1; - #pragma C is not currently used 1748 }; 1749 1750 /* ------------------------------------------------------------------------ */ 1751 /* 1752 * Keyword Hash Table Entry 1753 */ 1754 class CTcHashEntryKw: public CVmHashEntryCS 1755 { 1756 public: 
CTcHashEntryKw(const textchar_t * str,tc_toktyp_t tokid)1757 CTcHashEntryKw(const textchar_t *str, tc_toktyp_t tokid) 1758 : CVmHashEntryCS(str, strlen(str), FALSE) 1759 { 1760 /* save the token ID for the keyword */ 1761 tokid_ = tokid; 1762 } 1763 1764 /* get the token ID */ get_tok_id()1765 tc_toktyp_t get_tok_id() const { return tokid_; } 1766 1767 private: 1768 /* our token ID */ 1769 tc_toktyp_t tokid_; 1770 }; 1771 1772 /* ------------------------------------------------------------------------ */ 1773 /* 1774 * basic #define symbol table entry 1775 */ 1776 class CTcHashEntryPp: public CVmHashEntryCS 1777 { 1778 public: CTcHashEntryPp(const textchar_t * str,size_t len,int copy)1779 CTcHashEntryPp(const textchar_t *str, size_t len, int copy) 1780 : CVmHashEntryCS(str, len, copy) 1781 { 1782 /* by default, we have no arguments */ 1783 has_args_ = FALSE; 1784 has_varargs_ = FALSE; 1785 argc_ = 0; 1786 argv_ = 0; 1787 params_table_ = 0; 1788 } 1789 1790 /* get the expansion text */ 1791 virtual const char *get_expansion() const = 0; 1792 virtual size_t get_expan_len() const = 0; 1793 1794 /* certain special macros (__LINE__, __FILE__) aren't undef'able */ is_undefable()1795 virtual int is_undefable() const { return TRUE; } 1796 1797 /* 1798 * most macros are real symbols, created by #define's, but some are 1799 * special pseudo-macros, like __LINE__ and __FILE__, that the 1800 * preprocessor provides 1801 */ is_pseudo()1802 virtual int is_pseudo() const { return FALSE; } 1803 1804 /* does the macro have an argument list? */ has_args()1805 int has_args() const { return has_args_; } 1806 1807 /* get the number of arguments */ get_argc()1808 int get_argc() const { return argc_; } 1809 1810 /* do we have a variable number of arguments? 
*/ has_varargs()1811 int has_varargs() const { return has_varargs_; } 1812 1813 /* 1814 * get the minimum number of allowed arguments - if we have varargs, 1815 * this is one less than the number of formals listed, since the last 1816 * formal can correspond to any number of actuals, including zero 1817 */ get_min_argc()1818 int get_min_argc() const { return has_varargs_ ? argc_ - 1 : argc_; } 1819 1820 /* get the name of an argument by position (0 = first argument) */ get_arg_name(int idx)1821 const char *get_arg_name(int idx) const { return argv_[idx]; } 1822 1823 /* get the parameter hash table entry for the parameter */ get_arg_entry(int idx)1824 class CTcHashEntryPpArg *get_arg_entry(int idx) const 1825 { return arg_entry_[idx]; } 1826 1827 /* get the parameters hash table */ get_params_table()1828 const CVmHashTable *get_params_table() const { return params_table_; } 1829 1830 protected: 1831 /* argument list */ 1832 char **argv_; 1833 1834 /* list of parameter hash entries */ 1835 class CTcHashEntryPpArg **arg_entry_; 1836 1837 /* parameter hash table */ 1838 CVmHashTable *params_table_; 1839 1840 /* argument count */ 1841 int argc_; 1842 1843 /* flag: the macro has a parameter list */ 1844 uint has_args_ : 1; 1845 1846 /* 1847 * flag: the parameter list takes a variable number of arguments; if 1848 * this is set, then argc_ is one greater than the minimum number of 1849 * arguments required, and the last formal receives the varying part 1850 * of the actual parameter list, which can contain zero or more 1851 * actuals 1852 */ 1853 uint has_varargs_ : 1; 1854 }; 1855 1856 /* 1857 * #define symbol hash table entry 1858 */ 1859 class CTcHashEntryPpDefine: public CTcHashEntryPp 1860 { 1861 public: 1862 /* 1863 * Create the hash entry. argc is the number of arguments to the 1864 * macro, and argv is an array of pointers to null-terminated 1865 * strings with the argument names, in the order defined in the 1866 * macro. 
1867 * 1868 * If has_args is false, the macro does not take a parameter list at 1869 * all. Note that it is possible for has_args to be true and argc 1870 * to be zero, because a macro can be defined to take an argument 1871 * list with no arguments (i.e., empty parens). A macro with an 1872 * empty argument list is distinct from a macro with no argument 1873 * list: in the former case, the empty parens are required, and are 1874 * removed from the input stream and replaced with the macro's 1875 * expansion. 1876 * 1877 * We'll make a copy of the argument list vector, strings, and 1878 * expansion text, so the caller is free to forget all of that after 1879 * creating the entry instance. 1880 */ 1881 CTcHashEntryPpDefine(const textchar_t *str, size_t len, int copy, 1882 int has_args, int argc, int has_varargs, 1883 const char **argv, const size_t *argvlen, 1884 const char *expansion, size_t expan_len); 1885 1886 ~CTcHashEntryPpDefine(); 1887 1888 /* get the expansion text and its length */ get_expansion()1889 const char *get_expansion() const { return expan_; } get_expan_len()1890 size_t get_expan_len() const { return expan_len_; } 1891 1892 private: 1893 /* expansion */ 1894 char *expan_; 1895 size_t expan_len_; 1896 }; 1897 1898 1899 /* 1900 * Hash table entry for __FILE__ and __LINE__ 1901 */ 1902 class CTcHashEntryPpSpecial: public CTcHashEntryPp 1903 { 1904 public: CTcHashEntryPpSpecial(CTcTokenizer * tok,const char * str)1905 CTcHashEntryPpSpecial(CTcTokenizer *tok, const char *str) 1906 : CTcHashEntryPp(str, strlen(str), FALSE) 1907 { 1908 /* remember my tokenizer */ 1909 tok_ = tok; 1910 } 1911 1912 /* these special macros are not undef'able */ is_undefable()1913 virtual int is_undefable() const { return FALSE; } 1914 1915 /* special macros are pseudo-macros provided by the preprocessor */ is_pseudo()1916 virtual int is_pseudo() const { return TRUE; } 1917 1918 protected: 1919 /* my tokenizer */ 1920 CTcTokenizer *tok_; 1921 }; 1922 1923 class 
CTcHashEntryPpFILE: public CTcHashEntryPpSpecial 1924 { 1925 public: CTcHashEntryPpFILE(CTcTokenizer * tok)1926 CTcHashEntryPpFILE(CTcTokenizer *tok) 1927 : CTcHashEntryPpSpecial(tok, "__FILE__") { } 1928 1929 /* our expansion is the current filename, in single quotes */ get_expansion()1930 const char *get_expansion() const { return get_base_text(); } get_expan_len()1931 size_t get_expan_len() const { return strlen(get_base_text()); } 1932 1933 private: 1934 /* get our expansion base text */ get_base_text()1935 const char *get_base_text() const 1936 { 1937 /* 1938 * if we're in test-report mode, use the root name only; 1939 * otherwise, use the full name with path 1940 */ 1941 if (tok_->get_test_report_mode()) 1942 return tok_->get_last_desc()->get_squoted_rootname(); 1943 else 1944 return tok_->get_last_desc()->get_squoted_fname(); 1945 } 1946 }; 1947 1948 class CTcHashEntryPpLINE: public CTcHashEntryPpSpecial 1949 { 1950 public: CTcHashEntryPpLINE(CTcTokenizer * tok)1951 CTcHashEntryPpLINE(CTcTokenizer *tok) 1952 : CTcHashEntryPpSpecial(tok, "__LINE__") { } 1953 1954 /* our expansion is the line number as a decimal string */ get_expansion()1955 const char *get_expansion() const 1956 { gen_expansion(tok_); return buf_; } get_expan_len()1957 size_t get_expan_len() const 1958 { gen_expansion(tok_); return strlen(buf_); } 1959 1960 private: 1961 /* generate the expansion text into our internal buffer */ gen_expansion(CTcTokenizer * tok)1962 static void gen_expansion(CTcTokenizer *tok) 1963 { sprintf(buf_, "%ld", tok->get_last_linenum()); } 1964 1965 /* internal buffer */ 1966 static char buf_[20]; 1967 }; 1968 1969 1970 /* 1971 * Hash entry for preprocessor arguments 1972 */ 1973 class CTcHashEntryPpArg: public CVmHashEntryCS 1974 { 1975 public: CTcHashEntryPpArg(const char * str,size_t len,int copy,int argnum)1976 CTcHashEntryPpArg(const char *str, size_t len, int copy, int argnum) 1977 : CVmHashEntryCS(str, len, copy) 1978 { 1979 /* remember the argument number */ 
1980 argnum_ = argnum; 1981 } 1982 1983 /* get my argument number */ get_argnum()1984 int get_argnum() const { return argnum_; } 1985 1986 private: 1987 /* argument number */ 1988 int argnum_; 1989 }; 1990 1991 1992 /* ------------------------------------------------------------------------ */ 1993 /* 1994 * Previously-included file list entry. Each time we include a file, 1995 * we'll add an entry to a list of files; in the future, we'll consult 1996 * this list to ensure that we don't include the same file again. 1997 */ 1998 struct tctok_incfile_t 1999 { 2000 /* next entry in the list of previously-included files */ 2001 tctok_incfile_t *nxt; 2002 2003 /* name of this file (we'll allocate memory to hold the name) */ 2004 char fname[1]; 2005 }; 2006 2007 /* ------------------------------------------------------------------------ */ 2008 /* 2009 * Include path list entry. This structure defines one include path; we 2010 * maintain a list of these structures. 2011 */ 2012 struct tctok_incpath_t 2013 { 2014 /* next entry in the list */ 2015 tctok_incpath_t *nxt; 2016 2017 /* path */ 2018 char path[1]; 2019 }; 2020 2021 #endif /* TCTOK_H */ 2022 2023