1 /* $Header: d:/cvsroot/tads/tads3/tctok.h,v 1.5 1999/07/11 00:46:59 MJRoberts Exp $ */
2 
3 /*
4  *   Copyright (c) 1999, 2002 Michael J. Roberts.  All Rights Reserved.
5  *
6  *   Please see the accompanying license file, LICENSE.TXT, for information
7  *   on using and copying this software.
8  */
9 /*
10 Name
11   tctok.h - TADS3 compiler tokenizer and preprocessor
12 Function
13 
14 Notes
15   The tokenizer is layered with the preprocessor, so that the preprocessor
16   can deal with include files, macro expansion, and preprocessor directives.
17 Modified
18   04/12/99 MJRoberts  - Creation
19 */
20 
21 #ifndef TCTOK_H
22 #define TCTOK_H
23 
24 #include <stdlib.h>
25 #include <string.h>
26 #include <assert.h>
27 
28 #include "os.h"
29 #include "t3std.h"
30 #include "utf8.h"
31 #include "vmhash.h"
32 #include "vmerr.h"
33 #include "tcerr.h"
34 #include "tcerrnum.h"
35 
36 
37 /* ------------------------------------------------------------------------ */
38 /*
39  *   Constants
40  */
41 
/* longest symbol name we accept, measured in characters */
const size_t TOK_SYM_MAX_LEN = 80;

/*
 *   Size in bytes of a buffer large enough for the longest symbol: we
 *   allow up to three bytes for each UTF-8 character, and add one more
 *   byte for the null terminator.
 */
const size_t TOK_SYM_MAX_BUFFER = TOK_SYM_MAX_LEN*3 + 1;

/* deepest level of #if nesting we allow */
const size_t TOK_MAX_IF_NESTING = 100;

/* largest number of parameters a single macro can have */
const int TOK_MAX_MACRO_ARGS = 128;
57 
/*
 *   Special token flag characters - these are characters that can't
 *   occur in an input file (we guarantee this by converting any
 *   occurrences of these characters to a space on reading input).  We use
 *   these to flag certain special properties of tokens in the input
 *   buffer.
 *
 *   We use ASCII characters in the control range (0x01 (^A) through 0x1A
 *   (^Z), excluding 0x09 (tab), 0x0A (LF), 0x0D (CR), and 0x0C (Page
 *   Feed)); a well-formed source file would never use any of these
 *   characters in input.  Even if it does, we won't get confused, since
 *   we'll always translate these to a space if we find them in input; but
 *   choosing characters that *should* never occur in valid input will
 *   ensure that we never alter the meaning of valid source by this
 *   translation.
 */
74 
/*
 *   Macro formal parameter flag.  In the stored form of a #define
 *   expansion, this byte marks each place where a formal parameter is
 *   mentioned, so that the actual value can be substituted there when
 *   the macro is expanded.
 */
const char TOK_MACRO_FORMAL_FLAG = '\x01';

/*
 *   Token-fully-expanded flag.  When we find, during a macro expansion,
 *   that a token has been fully expanded, we insert this byte just
 *   before the token.  On later re-scans, seeing this byte tells us the
 *   token needs no further consideration for expansion.
 */
const char TOK_FULLY_EXPANDED_FLAG = '\x02';

/*
 *   Macro substitution end marker.  After each macro expansion we insert
 *   a special pseudo-token made up of this flag followed by a pointer to
 *   the symbol table entry of the symbol that was expanded.  Before
 *   expanding a macro, we look for these markers in the buffer after the
 *   macro; if one matches the symbol about to be expanded, the symbol
 *   was already fully expanded on an earlier scan and must not be
 *   expanded again.
 */
const char TOK_MACRO_EXP_END = '\x03';

/*
 *   End-of-line flag - a local end-of-file marker for preprocessor
 *   lines.  Preprocessor lines must be considered in isolation, so the
 *   tokenizer must not read another line when it reaches the end of the
 *   current one.  When the tokenizer encounters this flag, it simply
 *   returns end-of-file until the caller explicitly reads a new source
 *   line.
 */
const char TOK_END_PP_LINE = '\x04';

/*
 *   "#foreach" marker flag.  The text of the expansion area is left
 *   intact, but the #foreach token itself in a macro's expansion is
 *   replaced with this marker character.
 */
const char TOK_MACRO_FOREACH_FLAG = '\x05';

/*
 *   "#argcount" marker flag - marks the presence of a #argcount token in
 *   a macro's expansion.
 */
const char TOK_MACRO_ARGCOUNT_FLAG = '\x06';

/*
 *   "#ifempty" and "#ifnempty" marker flags
 */
const char TOK_MACRO_IFEMPTY_FLAG = '\x07';
const char TOK_MACRO_IFNEMPTY_FLAG = '\x08';
133 
134 
135 /* ------------------------------------------------------------------------ */
136 /*
137  *   #if state
138  */
enum tok_if_t
{
    /* not in a #if block at all */
    TOKIF_NONE = 0,

    /* processing a true #if branch */
    TOKIF_IF_YES,

    /* processing a false #if branch */
    TOKIF_IF_NO,

    /* done with a true #if/#elif; skip remaining #elif's and #else */
    TOKIF_IF_DONE,

    /* processing a true #else branch */
    TOKIF_ELSE_YES,

    /* processing a false #else branch */
    TOKIF_ELSE_NO
};
148 
149 /*
150  *   #if stack entry
151  */
152 struct tok_if_info_t
153 {
154     /* state */
155     tok_if_t state;
156 
157     /* file descriptor and line number of starting #if */
158     class CTcTokFileDesc *desc;
159     long linenum;
160 };
161 
162 /* ------------------------------------------------------------------------ */
163 /*
164  *   Token Types
165  */
166 
enum tc_toktyp_t
{
    /* ---- special tokens and macro placeholders ---- */
    TOKT_INVALID = 0,       /* invalid token */
    TOKT_NULLTOK,           /* null token - caller should read another token */
    TOKT_EOF,               /* end of file */
    TOKT_MACRO_FORMAL,      /* formal parameter replacement placeholder */
    TOKT_MACRO_FOREACH,     /* macro varargs #foreach placeholder */
    TOKT_MACRO_ARGCOUNT,    /* macro varargs #argcount placeholder */
    TOKT_MACRO_IFEMPTY,     /* #ifempty macro placeholder */
    TOKT_MACRO_IFNEMPTY,    /* #ifnempty macro placeholder */

    /* ---- symbols and literals ---- */
    TOKT_SYM,               /* symbolic name */
    TOKT_INT,               /* integer */
    TOKT_SSTR,              /* single-quoted string */
    TOKT_DSTR,              /* double-quoted string */
    TOKT_DSTR_START,        /* start of a dstring with embedding - "...<< */
    TOKT_DSTR_MID,          /* middle of a dstring with embedding - >>...<< */
    TOKT_DSTR_END,          /* end of a dstring with embedding - >>..." */

    /* ---- punctuation and operators ---- */
    TOKT_LPAR,              /* left paren '(' */
    TOKT_RPAR,              /* right paren ')' */
    TOKT_COMMA,             /* comma ',' */
    TOKT_DOT,               /* period '.' */
    TOKT_LBRACE,            /* left brace '{' */
    TOKT_RBRACE,            /* right brace '}' */
    TOKT_LBRACK,            /* left square bracket '[' */
    TOKT_RBRACK,            /* right square bracket ']' */
    TOKT_EQ,                /* equals sign '=' */
    TOKT_EQEQ,              /* double-equals sign '==' */
    TOKT_ASI,               /* colon-equals assignment operator ':=' */
    TOKT_PLUS,              /* plus sign '+' */
    TOKT_MINUS,             /* minus sign '-' */
    TOKT_TIMES,             /* multiplication symbol '*' */
    TOKT_DIV,               /* division symbol '/' */
    TOKT_MOD,               /* modulo '%' */
    TOKT_GT,                /* greater-than sign '>' */
    TOKT_LT,                /* less-than sign '<' */
    TOKT_GE,                /* greater-or-equal sign '>=' */
    TOKT_LE,                /* less-or-equal sign '<=' */
    TOKT_NE,                /* not-equals sign '!=' or '<>' */
    TOKT_ARROW,             /* arrow symbol '->' */
    TOKT_COLON,             /* colon ':' */
    TOKT_SEM,               /* semicolon ';' */
    TOKT_AND,               /* bitwise AND '&' */
    TOKT_ANDAND,            /* logical AND '&&' */
    TOKT_OR,                /* bitwise OR '|' */
    TOKT_OROR,              /* logical OR '||' */
    TOKT_XOR,               /* bitwise XOR '^' */
    TOKT_SHL,               /* shift left '<<' */
    TOKT_SHR,               /* shift right '>>' */
    TOKT_INC,               /* increment '++' */
    TOKT_DEC,               /* decrement '--' */
    TOKT_PLUSEQ,            /* plus-equals '+=' */
    TOKT_MINEQ,             /* minus-equals '-=' */
    TOKT_TIMESEQ,           /* times-equals '*=' */
    TOKT_DIVEQ,             /* divide-equals '/=' */
    TOKT_MODEQ,             /* mod-equals '%=' */
    TOKT_ANDEQ,             /* and-equals '&=' */
    TOKT_OREQ,              /* or-equals '|=' */
    TOKT_XOREQ,             /* xor-equals '^=' */
    TOKT_SHLEQ,             /* shift-left-and-assign '<<=' */
    TOKT_SHREQ,             /* shift-right-and-assign '>>=' */
    TOKT_NOT,               /* logical not '!' */
    TOKT_BNOT,              /* bitwise not '~' */
    TOKT_POUND,             /* pound '#' */
    TOKT_POUNDPOUND,        /* double-pound '##' */
    TOKT_POUNDAT,           /* pound-at '#@' */
    TOKT_ELLIPSIS,          /* ellipsis '...' */
    TOKT_QUESTION,          /* question mark '?' */
    TOKT_COLONCOLON,        /* double-colon '::' */
    TOKT_FLOAT,             /* floating-point number */
    TOKT_AT,                /* at-sign '@' */

    /* ---- keywords ---- */
    TOKT_SELF,
    TOKT_INHERITED,
    TOKT_ARGCOUNT,
    TOKT_IF,
    TOKT_ELSE,
    TOKT_FOR,
    TOKT_WHILE,
    TOKT_DO,
    TOKT_SWITCH,
    TOKT_CASE,
    TOKT_DEFAULT,
    TOKT_GOTO,
    TOKT_BREAK,
    TOKT_CONTINUE,
    TOKT_FUNCTION,
    TOKT_RETURN,
    TOKT_LOCAL,
    TOKT_OBJECT,
    TOKT_NIL,
    TOKT_TRUE,
    TOKT_PASS,
    TOKT_EXTERNAL,
    TOKT_EXTERN,
    TOKT_FORMATSTRING,
    TOKT_CLASS,
    TOKT_REPLACE,
    TOKT_MODIFY,
    TOKT_NEW,
    TOKT_DELETE,
    TOKT_THROW,
    TOKT_TRY,
    TOKT_CATCH,
    TOKT_FINALLY,
    TOKT_INTRINSIC,
    TOKT_DICTIONARY,
    TOKT_GRAMMAR,
    TOKT_ENUM,
    TOKT_TEMPLATE,
    TOKT_STATIC,
    TOKT_FOREACH,
    TOKT_EXPORT,
    TOKT_DELEGATED,
    TOKT_TARGETPROP,
    TOKT_PROPERTYSET,
    TOKT_TARGETOBJ,
    TOKT_DEFININGOBJ,
    TOKT_TRANSIENT,
    TOKT_REPLACED,

    /* ---- type names ---- */
    TOKT_VOID,
    TOKT_INTKW,
    TOKT_STRING,
    TOKT_LIST,
    TOKT_BOOLEAN,
    TOKT_PROPERTY,
    TOKT_ANY
};
297 
298 /* ------------------------------------------------------------------------ */
299 /*
300  *   Source Block.  As we read the source file, we need to keep quoted
301  *   strings and symbol names around for later reference, in case they're
302  *   needed after reading more tokens and flushing the line buffer.  We'll
303  *   copy needed text into our source blocks, which we keep in memory
304  *   throughout the compilation, so that we can be certain we can
305  *   reference these strings at any time.
306  */
307 
/* size in bytes of the text buffer in each source block */
const size_t TCTOK_SRC_BLOCK_SIZE = 50000;

/*
 *   Source block: one fixed-size text buffer plus a link to the next
 *   block.  A block owns every block that follows it in the list, so
 *   deleting the head block deletes the whole list.
 */
class CTcTokSrcBlock
{
public:
    CTcTokSrcBlock()
    {
        /* no next block yet */
        nxt_ = 0;
    }

    ~CTcTokSrcBlock()
    {
        /*
         *   Delete the rest of the list.  Walk the list in a loop rather
         *   than letting each block's destructor delete its successor:
         *   the nested form uses one stack frame per block, so stack
         *   use would grow with the length of the list.
         */
        CTcTokSrcBlock *cur = nxt_;
        nxt_ = 0;
        while (cur != 0)
        {
            /* remember the following block before we delete this one */
            CTcTokSrcBlock *following = cur->nxt_;

            /* detach it, so its own destructor has nothing to delete */
            cur->nxt_ = 0;
            delete cur;

            /* move on */
            cur = following;
        }
    }

    /* get/set the next block */
    CTcTokSrcBlock *get_next() const { return nxt_; }
    void set_next(CTcTokSrcBlock *blk) { nxt_ = blk; }

    /* get a pointer to the block's buffer (TCTOK_SRC_BLOCK_SIZE bytes) */
    char *get_buf() { return buf_; }

private:
    /* the next block in the list */
    CTcTokSrcBlock *nxt_;

    /* bytes of the list entry */
    char buf_[TCTOK_SRC_BLOCK_SIZE];
};
342 
343 
344 /* ------------------------------------------------------------------------ */
345 /*
346  *   String Buffer.  We use these buffers for reading input lines and
347  *   expanding macros.
348  */
349 class CTcTokString
350 {
351 public:
CTcTokString()352     CTcTokString()
353     {
354         /* no buffer yet */
355         buf_ = 0;
356         buf_len_ = 0;
357         buf_size_ = 0;
358     }
359 
~CTcTokString()360     virtual ~CTcTokString()
361     {
362         /* delete our buffer */
363         if (buf_ != 0)
364             t3free(buf_);
365     }
366 
367     /* ensure that a given amount of space if available */
ensure_space(size_t siz)368     virtual void ensure_space(size_t siz)
369     {
370         /* make sure there's room for the requested size plus a null byte */
371         if (buf_size_ < siz + 1)
372         {
373             /* increase to the next 4k increment */
374             buf_size_ = (siz + 4095 + 1) & ~4095;
375 
376             /* allocate or re-allocate the buffer */
377             if (buf_ == 0)
378                 buf_ = (char *)t3malloc(buf_size_);
379             else
380                 buf_ = (char *)t3realloc(buf_, buf_size_);
381 
382             /* throw an error if that failed */
383             if (buf_ == 0)
384                 err_throw(TCERR_NO_STRBUF_MEM);
385         }
386     }
387 
388     /* expand the buffer */
expand()389     void expand()
390     {
391         /* expand to the next 4k increment */
392         ensure_space(buf_size_ + 4096);
393     }
394 
395     /* get the text and the length of the text */
get_text()396     const char *get_text() const { return buf_; }
get_text_len()397     size_t get_text_len() const { return buf_len_; }
398 
399     /* get the end of the text */
get_text_end()400     const char *get_text_end() const { return buf_ + buf_len_; }
401 
402     /* append text to the buffer */
append(const char * p,size_t len)403     virtual void append(const char *p, size_t len)
404     {
405         /* make sure we have space available */
406         ensure_space(buf_len_ + len);
407 
408         /* copy the text onto the end of our buffer */
409         memcpy(buf_ + buf_len_, p, len);
410 
411         /* add it to the length of the text */
412         buf_len_ += len;
413 
414         /* null-terminte it */
415         buf_[buf_len_] = '\0';
416     }
417 
418     /* copy text into the buffer, replacing existing text */
copy(const char * p,size_t len)419     virtual void copy(const char *p, size_t len)
420     {
421         /* ensure we have enough space */
422         ensure_space(len);
423 
424         /* copy the text */
425         memcpy(buf_, p, len);
426 
427         /* set our length */
428         buf_len_ = len;
429 
430         /* null-terminate it */
431         buf_[buf_len_] = '\0';
432     }
433 
434     /* clear any existing text */
clear_text()435     virtual void clear_text()
436     {
437         /* zero the length */
438         buf_len_ = 0;
439 
440         /* put a null terminator at the start of the buffer if possible */
441         if (buf_size_ > 0)
442             buf_[0] = '\0';
443     }
444 
445     /* get the buffer, for copying text directly into it */
get_buf()446     virtual char *get_buf() const { return buf_; }
get_buf_size()447     size_t get_buf_size() const { return buf_size_; }
448 
449     /*
450      *   Set the text length - use this after copying directly into the
451      *   buffer to set the length, excluding the null terminator.  We'll
452      *   add a null terminator at the given length.
453      */
set_text_len(size_t len)454     virtual void set_text_len(size_t len)
455     {
456         /* set the new length */
457         buf_len_ = len;
458 
459         /* add a null terminator after the new length */
460         if (len < buf_size_)
461             buf_[len] = '\0';
462     }
463 
464 protected:
465     /* buffer */
466     char *buf_;
467 
468     /* size of the buffer */
469     size_t buf_size_;
470 
471     /* length of the text in the buffer (excluding trailing null) */
472     size_t buf_len_;
473 };
474 
475 
/*
 *   String buffer subclass for a non-allocated string that merely
 *   references another buffer.  This can be used anywhere a CTcTokString
 *   is required, but does not require any allocation.
 *
 *   These objects can only be used in 'const' contexts: the underlying
 *   buffer cannot be changed or expanded, since we do not own the
 *   underlying buffer.
 */
485 class CTcTokStringRef: public CTcTokString
486 {
487 public:
CTcTokStringRef()488     CTcTokStringRef()
489     {
490         /* we have no referenced buffer yet */
491         buf_ = 0;
492         buf_size_ = 0;
493         buf_len_ = 0;
494     }
495 
~CTcTokStringRef()496     ~CTcTokStringRef()
497     {
498         /* we don't own the underlying buffer, so simply forget about it */
499         buf_ = 0;
500     }
501 
502     /* we can't make any changes to the underlying buffer */
ensure_space(size_t)503     void ensure_space(size_t) { }
append(const char *,size_t)504     void append(const char *, size_t) { assert(FALSE); }
copy(const char *,size_t)505     void copy(const char *, size_t) { assert(FALSE); }
clear_text()506     void clear_text() { assert(FALSE); }
get_buf()507     char *get_buf() const { assert(FALSE); return 0; }
set_text_len(size_t)508     void set_text_len(size_t) { assert(FALSE); }
509 
510     /* set my underlying buffer */
set_buffer(const char * buf,size_t len)511     void set_buffer(const char *buf, size_t len)
512     {
513         buf_ = (char *)buf;
514         buf_size_ = len + 1;
515         buf_len_ = len;
516     }
517 };
518 
519 /* ------------------------------------------------------------------------ */
520 /*
521  *   Token
522  */
523 class CTcToken
524 {
525 public:
526     /* get/set the token type */
gettyp()527     tc_toktyp_t gettyp() const { return typ_; }
settyp(tc_toktyp_t typ)528     void settyp(tc_toktyp_t typ) { typ_ = typ; }
529 
530     /* get/set the fully-expanded flag */
get_fully_expanded()531     int get_fully_expanded() const { return fully_expanded_; }
set_fully_expanded(int flag)532     void set_fully_expanded(int flag) { fully_expanded_ = flag; }
533 
534     /* get/set the text pointer */
get_text()535     const char *get_text() const { return text_; }
get_text_len()536     size_t get_text_len() const { return text_len_; }
set_text(const char * txt,size_t len)537     void set_text(const char *txt, size_t len)
538     {
539         text_ = txt;
540         text_len_ = len;
541     }
542 
543     /* get/set the integer value */
get_int_val()544     long get_int_val() const { return int_val_; }
set_int_val(long val)545     void set_int_val(long val) { typ_ = TOKT_INT; int_val_ = val; }
546 
547     /*
548      *   compare the text to the given string - returns true if the text
549      *   matches, false if not
550      */
text_matches(const char * txt,size_t len)551     int text_matches(const char *txt, size_t len) const
552     {
553         return (len == text_len_
554                 && memcmp(txt, text_, len) == 0);
555     }
556 
557 private:
558     /* token type */
559     tc_toktyp_t typ_;
560 
561     /*
562      *   Pointer to the token's text.  This is a pointer into the
563      *   tokenizer's symbol table or into the token list itself, so this
564      *   pointer is valid as long as the tokenizer and its token list are
565      *   valid.
566      */
567     const char *text_;
568     size_t text_len_;
569 
570     /* integer value - valid when the token type is TOKT_INT */
571     long int_val_;
572 
573     /*
574      *   flag: the token has been fully expanded, and should not be
575      *   expanded further on any subsequent rescan for macros
576      */
577     uint fully_expanded_ : 1;
578 };
579 
580 
581 /* ------------------------------------------------------------------------ */
582 /*
583  *   Macro Expansion Resource object.  This object is a collection of
584  *   resources that are needed for a macro expansion.  To avoid frequent
585  *   allocating and freeing of these resources, we keep a pool of these
586  *   objects around so that we can re-use them as needed.  We'll
587  *   dynamically expand the pool as necessary, so this doesn't impose any
588  *   pre-set limits; it simply avoids lots of memory allocation activity.
589  */
590 class CTcMacroRsc
591 {
592 public:
CTcMacroRsc()593     CTcMacroRsc()
594     {
595         /* we're not in any lists yet */
596         next_avail_ = 0;
597         next_ = 0;
598     }
599 
600     /* buffer for expansion of the whole line */
601     CTcTokString line_exp_;
602 
603     /* buffer for expansion of current macro on line */
604     CTcTokString macro_exp_;
605 
606     /* buffer for expansion of an actual parameter value */
607     CTcTokString actual_exp_buf_;
608 
609     /* next resource object in the "available" list */
610     CTcMacroRsc *next_avail_;
611 
612     /* next resource object in the master list */
613     CTcMacroRsc *next_;
614 };
615 
616 
617 /* ------------------------------------------------------------------------ */
618 /*
619  *   Abstract token source interface.  This is used to allow external code
620  *   to inject their own substreams into the main token stream.
621  */
622 class CTcTokenSource
623 {
624 public:
625     /*
626      *   Get the next token from the source.  Returns null if there are no
627      *   more tokens.
628      */
629     virtual const CTcToken *get_next_token() = 0;
630 
631     /* set the enclosing external token source and current token */
set_enclosing_source(CTcTokenSource * src,const CTcToken * tok)632     void set_enclosing_source(CTcTokenSource *src, const CTcToken *tok)
633     {
634         /* remember the enclosing source */
635         enclosing_src_ = src;
636 
637         /* remember the current token */
638         enclosing_curtok_ = *tok;
639     }
640 
641     /* get the enclosing external token source */
get_enclosing_source()642     CTcTokenSource *get_enclosing_source() const
643         { return enclosing_src_; }
644 
645     /* get the token that was current when this source was inserted */
get_enclosing_curtok()646     const CTcToken *get_enclosing_curtok() const
647         { return &enclosing_curtok_; }
648 
649 protected:
650     /* the enclosing external token source */
651     CTcTokenSource *enclosing_src_;
652 
653     /*
654      *   the current token in effect enclosing this source - this is the
655      *   token that comes immediately after the source's tokens, because a
656      *   source is inserted before the current token
657      */
658     CTcToken enclosing_curtok_;
659 };
660 
661 
662 /* ------------------------------------------------------------------------ */
663 /*
664  *   Tokenizer.  This object reads a file and constructs a representation
665  *   of the file as a token list in memory.  The tokenizer interprets
666  *   preprocessor directives and expands macros.
667  */
668 class CTcTokenizer
669 {
670 public:
671     /*
672      *   Create the tokenizer and start reading from the given file.  The
673      *   default character set is generally specified by the user (on the
674      *   compiler command line, for example), or obtained from the
675      *   operating system.
676      */
677     CTcTokenizer(class CResLoader *res_loader, const char *default_charset);
678 
679     /* destroy the tokenizer */
680     ~CTcTokenizer();
681 
682     /*
683      *   Reset the tokenizer.  Deletes the current source object and all
684      *   saved token text.  This can be used after compilation of a unit
685      *   is completed and the intermediate parser state can be completely
686      *   discarded.
687      */
688     void reset();
689 
690     /*
691      *   Set the source file.  'src_filename' is the fully-resolved local
692      *   filename of the source file; 'orig_name' is the original name as
693      *   given on the command line, in the makefile, or wherever it came
694      *   from.  We keep track of the original name so that we can pass
695      *   information to the debugger indicating the name as it was originally
696      *   given; this is more useful than the resolved filename, because we
697      *   might want to run the debugger on another machine with a different
698      *   local directory structure.
699      */
700     int set_source(const char *src_filename, const char *orig_name);
701 
702     /* set the source to a memory buffer */
703     void set_source_buf(const char *buf);
704 
705     /*
706      *   Add a #include directory to the include path.  We search the
707      *   include path in the order in which they were defined.
708      */
709     void add_inc_path(const char *path);
710 
711     /*
712      *   Set preprocess-only mode.  In this mode, we'll retain
713      *   preprocessor directives that will be needed if the preprocessed
714      *   result is itself compiled; for example, we'll retain #line,
715      *   #pragma C, #error, and #pragma message directives.
716      */
set_mode_pp_only(int flag)717     void set_mode_pp_only(int flag) { pp_only_mode_ = flag; }
718 
719     /*
720      *   Set list-includes mode.  In this mode, we'll simply scan source
721      *   files and write to the standard output a list of the names of all
722      *   of the #include files.
723      */
set_list_includes_mode(int flag)724     void set_list_includes_mode(int flag) { list_includes_mode_ = flag; }
725 
726     /*
727      *   Get/set the test-report mode.  In this mode, we'll expand __FILE__
728      *   macros with the root name only.
729      */
get_test_report_mode()730     int get_test_report_mode() const { return test_report_mode_; }
set_test_report_mode(int flag)731     void set_test_report_mode(int flag) { test_report_mode_ = flag; }
732 
733     /* enable or disable preprocessing directives */
enable_pp(int enable)734     void enable_pp(int enable) { allow_pp_ = enable; }
735 
736     /* get the type of the current token */
cur()737     tc_toktyp_t cur() const { return curtok_.gettyp(); }
738 
739     /* get the next token, reading a new line of source if necessary */
740     tc_toktyp_t next();
741 
742     /*
743      *   Un-get the current token and back up to the previous token.  The
744      *   maximum un-get depth is one token - after un-getting one token,
745      *   another token must not be un-gotten until after reading another
746      *   token.
747      *
748      *   Tokens un-got with this routine are accessible only to next(),
749      *   not to any of the lower-level token readers.
750      */
751     void unget();
752 
753     /* get the current token */
getcur()754     const class CTcToken *getcur() const { return &curtok_; }
755 
756     /*
757      *   Copy the current token.  This makes a copy of the token's text in
758      *   tokenizer source memory, to ensure that the reference to the text
759      *   buffer the caller is keeping will remain valid forever.
760      */
761     const class CTcToken *copycur();
762 
763     /* make a safely storable copy of a given token */
764     void copytok(class CTcToken *dst, const class CTcToken *src);
765 
766     /* check to see if the current token matches the given text */
767     int cur_tok_matches(const char *txt, size_t len);
768 
769     /*
770      *   Set an external token source.  We'll read tokens from this source
771      *   until it is exhausted, at which point we'll revert to the enclosing
772      *   source.
773      *
774      *   The new source is inserted before the current token, so the current
775      *   token will become current once again when this source is exhausted.
776      *   We'll automatically advance to the next token, which (unless we
777      *   have an ungotten token stashed) will go to the first token in the
778      *   new source.
779      */
set_external_source(CTcTokenSource * src)780     void set_external_source(CTcTokenSource *src)
781     {
782         /*
783          *   store the old source in the new source, so we can restore the
784          *   old source when we have exhausted the new source
785          */
786         src->set_enclosing_source(ext_src_, &curtok_);
787 
788         /* set the new external source */
789         ext_src_ = src;
790 
791         /* skip to the next token */
792         next();
793     }
794 
795     /* clear all external sources, returning to the real token stream */
796     void clear_external_sources();
797 
798     /*
799      *   assume that we should have found '>>' sequence after an embedded
800      *   expression in a string - used by parsers to resynchronize after
801      *   an apparent syntax error
802      */
803     void assume_missing_dstr_cont();
804 
805     /* define a macro */
806     void add_define(const char *sym, size_t len, const char *expansion,
807                     size_t expan_len);
808 
add_define(const char * sym,const char * expansion,size_t expan_len)809     void add_define(const char *sym, const char *expansion, size_t expan_len)
810         { add_define(sym, strlen(sym), expansion, expan_len); }
811 
add_define(const char * sym,const char * expansion)812     void add_define(const char *sym, const char *expansion)
813         { add_define(sym, strlen(sym), expansion, strlen(expansion)); }
814 
815     /* add a macro, given the symbol entry */
816     void add_define(class CTcHashEntryPp *entry);
817 
818     /* undefine a previously defined macro */
819     void undefine(const char *sym, size_t len);
undefine(const char * sym)820     void undefine(const char *sym) { undefine(sym, strlen(sym)); }
821 
822     /* find a #define symbol */
823     class CTcHashEntryPp *find_define(const char *sym, size_t len) const;
824 
825     /* find an #undef symbol */
826     class CTcHashEntryPp *find_undef(const char *sym, size_t len) const;
827 
828     /* enumerate all of the #define symbols through a callback */
829     void enum_defines(void (*func)(void *ctx, class CTcHashEntryPp *entry),
830                       void *ctx);
831 
832     /* read the next line and handle preprocessor directives */
833     int read_line_pp();
834 
835     /* get the file descriptor and line number of the last line read */
get_last_desc()836     class CTcTokFileDesc *get_last_desc() const { return last_desc_; }
get_last_linenum()837     long get_last_linenum() const { return last_linenum_; }
get_last_pos(class CTcTokFileDesc ** desc,long * linenum)838     void get_last_pos(class CTcTokFileDesc **desc, long *linenum) const
839     {
840         *desc = last_desc_;
841         *linenum = last_linenum_;
842     }
843 
844     /*
845      *   set the current file descriptor and line number -- this can be
846      *   used to force the line position to a previously-saved value
847      *   (during code generation, for example) for error-reporting and
848      *   debug-record purposes
849      */
set_line_info(class CTcTokFileDesc * desc,long linenum)850     void set_line_info(class CTcTokFileDesc *desc, long linenum)
851     {
852         last_desc_ = desc;
853         last_linenum_ = linenum;
854     }
855 
856     /*
857      *   Parse a preprocessor constant expression.  We always parse out of
858      *   the macro expansion buffer (expbuf_), but the caller must set p_
859      *   to point to the starting point on the expansion line prior to
860      *   calling this routine.
861      *
862      *   If 'read_first' is true, we'll read a token into curtok_ before
863      *   parsing; otherwise, we'll assume the caller has already primed
864      *   the pump by reading the first token.
865      *
866      *   If 'last_on_line' is true, we'll flag an error if anything is
867      *   left on the line after we finish parsing the expression.
868      *
869      *   If 'add_line_ending' is true, we'll add an end-of-line marker to
870      *   the expansion buffer, so that the tokenizer won't attempt to read
871      *   past the end of the line.  Since a preprocessor expression must
872      *   be contained entirely on a single logical line, we must never try
873      *   to read past the end of the current line when parsing a
874      *   preprocessor expression.
875      */
876     int pp_parse_expr(class CTcConstVal *result,
877                       int read_first, int last_on_line, int add_line_ending);
878 
879     /* log an error, optionally with parameters */
880     static void log_error(int errnum, ...);
881 
882     /*
883      *   log an error with the current token text as the parameter,
884      *   suitable for a "%.*s" format list entry (hence we'll provide two
885      *   parameters: an integer with the length of the token text, and a
886      *   pointer to the token text string)
887      */
888     void log_error_curtok(int errnum);
889 
890     /* log a warning, optionally with parameters */
891     static void log_warning(int errnum, ...);
892 
893     /* log a warning with the current token as the parameter */
894     void log_warning_curtok(int errnum);
895 
896     /* log a warning or error for the current token */
897     void log_error_or_warning_curtok(tc_severity_t sev, int errnum);
898 
899     /* log a warning or error for a given token */
900     void log_error_or_warning_with_tok(tc_severity_t sev, int errnum,
901                                        const CTcToken *tok);
902 
903     /*
904      *   log then throw a fatal error (this is different from an internal
905      *   error in that it indicates an unrecoverable error in the input;
906      *   an internal error indicates that something is wrong with the
907      *   compiler itself)
908      */
909     static void throw_fatal_error(int errnum, ...);
910 
911     /*
912      *   log then throw an internal error (internal errors are always
913      *   fatal: these indicate that something has gone wrong in the
914      *   compiler, and are equivalent to an assert failure)
915      */
916     static void throw_internal_error(int errnum, ...);
917 
918     /* display a string/number value */
919     void msg_str(const char *str, size_t len) const;
920     void msg_long(long val) const;
921 
922     /* get the current line */
get_cur_line()923     const char *get_cur_line() const { return linebuf_.get_text(); }
get_cur_line_len()924     size_t get_cur_line_len() const { return linebuf_.get_text_len(); }
925 
926     /* get the #define hash table */
get_defines_table()927     class CVmHashTable *get_defines_table() const { return defines_; }
928 
929     /*
930      *   look up a token as a keyword; returns true and fills in 'kw' with
931      *   the keyword token ID if the token is in fact a keyword, or
932      *   returns false if it's not a keyword
933      */
934     int look_up_keyword(const CTcToken *tok, tc_toktyp_t *kw);
935 
936     /*
937      *   Get the next token on the line, filling in the token object.
938      *   Advances the pointer to the character immediately following the
939      *   token.
940      *
941      *   If the token is a string, and the string contains backslash
942      *   sequences, we'll modify the source string by translating each
     *   backslash sequence; for example, a "\n" sequence is changed into
944      *   an ASCII 10.
945      */
946     static tc_toktyp_t next_on_line(utf8_ptr *p, CTcToken *tok,
947                                     int *in_embedding);
948 
949     /*
950      *   Get the text of an operator token.  Returns a pointer to a
951      *   constant, static, null-terminated string, suitable for use in
952      *   error messages.
953      */
954     static const char *get_op_text(tc_toktyp_t op);
955 
956     /*
957      *   Store text in the source list.  Text stored here is available
958      *   throughout compilation.
959      */
960     const char *store_source(const char *txt, size_t len);
961 
962     /*
963      *   Get the index of the next source file descriptor that will be
964      *   created.  The linker can use this information to fix up
965      *   references to file descriptors in an object file when loading
966      *   multiple object files.
967      */
get_next_filedesc_index()968     int get_next_filedesc_index() const { return next_filedesc_id_; }
969 
970     /* get the number of source file descriptors in the master list */
get_filedesc_count()971     int get_filedesc_count() const { return next_filedesc_id_; }
972 
973     /* get the file descriptor at the given (0-based) index */
get_filedesc(size_t idx)974     class CTcTokFileDesc *get_filedesc(size_t idx) const
975     {
976         /* return the array entry at the index, if the index is valid */
977         return (idx < desc_list_cnt_ ? desc_list_[idx] : 0);
978     }
979 
980     /* get the head of the master source file descriptor list */
get_first_filedesc()981     class CTcTokFileDesc *get_first_filedesc() const { return desc_head_; }
982 
983     /*
984      *   Create a new file descriptor and add it to the master list.  This
985      *   creates the new descriptor unconditionally, even if a descriptor
986      *   for the same source file already exists.
987      */
create_file_desc(const char * fname,size_t len)988     class CTcTokFileDesc *create_file_desc(const char *fname, size_t len)
989         { return get_file_desc(fname, len, TRUE, fname, len); }
990 
991     /*
992      *   Set the string capture file.  Once this is set, we'll write the
993      *   contents of each string token that we encounter to this file,
994      *   with a newline after each token.
995      */
996     void set_string_capture(osfildef *fp);
997 
998     /* write macros to a file, for debugger use */
999     void write_macros_to_file_for_debug(class CVmFile *fp);
1000 
1001     /*
1002      *   Load macros from a file.  If any errors occur, we'll flag them
1003      *   through the error handler object and return a non-zero value.
1004      *   Returns zero on success.
1005      */
1006     int load_macros_from_file(class CVmStream *fp,
1007                               class CTcTokLoadMacErr *err_handler);
1008 
1009     /* receive notification that the compiler is done with all parsing */
parsing_done()1010     void parsing_done()
1011     {
1012         /* forget any input file position */
1013         set_line_info(0, 0);
1014     }
1015 
1016 private:
1017     /* skip whitespace and token markers */
1018     static void skip_ws_and_markers(utf8_ptr *p);
1019 
1020     /*
1021      *   get the next token on the line; if we go past the end of the
1022      *   string buffer, we'll return EOF
1023      */
1024     static tc_toktyp_t next_on_line(const CTcTokString *srcbuf, utf8_ptr *p,
1025                                     CTcToken *tok, int *in_embedding);
1026 
1027     /*
1028      *   get the next token on the current line, updating the internal
1029      *   character position pointer to point just past the token, and filling
     *   in the internal current token object with the token data
1031      */
next_on_line()1032     tc_toktyp_t next_on_line() { return next_on_line(&p_, &curtok_, 0); }
1033 
1034     /* get the next token on the line, with string translation */
next_on_line_xlat(int * in_embedding)1035     tc_toktyp_t next_on_line_xlat(int *in_embedding)
1036         { return next_on_line_xlat(&p_, &curtok_, in_embedding); }
1037 
1038     /*
1039      *   get the next token, translating strings and storing string and
1040      *   symbol text in the source block list
1041      */
1042     tc_toktyp_t next_on_line_xlat_keep();
1043 
1044     /*
1045      *   get the next token on the line, translating strings to internal
1046      *   format
1047      */
1048     tc_toktyp_t next_on_line_xlat(utf8_ptr *p, CTcToken *tok,
1049                                   int *in_embedding);
1050 
1051     /*
1052      *   translate a string to internal format by converting escape
1053      *   sequences; overwrites the original buffer
1054      */
1055     tc_toktyp_t xlat_string(utf8_ptr *p, CTcToken *tok,
1056                             int *in_embedding);
1057 
1058     /*
1059      *   translate a string into a given buffer; if 'force_embed_end' is
1060      *   true, we'll act as though we're continuing the string after the
1061      *   '>>' after an embedded expression, no matter what the actual
1062      *   input looks like
1063      */
1064     tc_toktyp_t xlat_string_to(char *dst, utf8_ptr *p, CTcToken *tok,
1065                                int *in_embedding, int force_embed_end);
1066 
1067     /*
1068      *   Translate a string, saving the translated version in the source
1069      *   block list.  If 'force_end_embed' is true, we'll act as though we
1070      *   were looking at '>>' (or, more precisely, we'll act as though
1071      *   '>>' immediately preceded the current input), regardless of what
1072      *   the actual input looks like.
1073      */
1074     tc_toktyp_t xlat_string_to_src(int *in_embedding, int force_end_embed);
1075 
1076     /* initialize the source block list */
1077     void init_src_block_list();
1078 
1079     /* delete current source file, including all including parents */
1080     void delete_source();
1081 
1082     /*
1083      *   read the next line; processes comments, but does not expand
1084      *   macros or parse preprocessor directives
1085      */
1086     char *read_line(int append);
1087 
1088     /* set the source read pointer to the start of a new line */
start_new_line(char * p,size_t len)1089     void start_new_line(char *p, size_t len)
1090     {
1091         /* set the read pointer to the start of the line */
1092         p_.set(p);
1093 
1094         /* remember where the current line starts and its total length */
1095         line_start_ = p;
1096         line_len_ = len;
1097     }
1098 
1099     /* reserve space for text in the source list */
1100     void reserve_source(size_t len);
1101 
1102     /*
1103      *   Commit space in the source list - this is used when text is
1104      *   directly stored after reserving space.  The size reserved may be
1105      *   greater than the size committed, because it is sometimes more
1106      *   efficient to make a guess that may overestimate the amount we
1107      *   actually end up needing.
1108      */
1109     void commit_source(size_t len);
1110 
1111     /* unsplice text from the current line and make it the next line */
1112     void unsplice_line(const char *new_line_start);
1113 
1114     /* parse a string */
1115     static tc_toktyp_t tokenize_string(utf8_ptr *p, CTcToken *tok,
1116                                        int *in_embedding);
1117 
1118     /* process comments */
1119     void process_comments(size_t start_ofs);
1120 
1121     /* splice lines for a string that runs across multiple lines */
1122     void splice_string();
1123 
1124     /* expand macros in the current line */
1125     int expand_macros_curline(int read_more, int allow_defined,
1126                               int append_to_expbuf);
1127 
1128     /* expand all of the macros in the given text */
1129     int expand_macros(class CTcTokString *srcbuf, utf8_ptr *src,
1130                       class CTcTokString *expbuf, int read_more,
1131                       int allow_defined, int append);
1132 
1133     /* expand the macro at the current token on the current line */
1134     int expand_macro(class CTcMacroRsc *res, class CTcTokString *expbuf,
1135                      const class CTcTokString *srcbuf, utf8_ptr *src,
1136                      size_t macro_srcbuf_ofs, CTcHashEntryPp *entry,
1137                      int read_more, int allow_defined, int *expanded);
1138 
1139     /* scan for a prior expansion of a macro within the current context */
1140     static int scan_for_prior_expansion(utf8_ptr src, const char *src_end,
1141                                         const class CTcHashEntryPp *entry);
1142 
1143     /* remove end-of-macro-expansion flags from a buffer */
1144     static void remove_end_markers(class CTcTokString *buf);
1145 
1146     /* change a buffer to use individual token full-expansion markers */
1147     void mark_full_exp_tokens(CTcTokString *dstbuf,
1148                               const class CTcTokString *srcbuf,
1149                               int append) const;
1150 
1151     /* allocate a macro expansion resource */
1152     class CTcMacroRsc *alloc_macro_rsc();
1153 
1154     /* release a macro expansion resource */
1155     void release_macro_rsc(class CTcMacroRsc *rsc);
1156 
1157     /*
1158      *   Parse the actual parameters to a macro.  Fills in argofs[] and
1159      *   arglen[] with the offsets (from srcbuf->get_buf()) and lengths,
1160      *   respectively, of each actual parameter's text.
1161      */
1162     int parse_macro_actuals(const class CTcTokString *srcbuf, utf8_ptr *src,
1163                             const CTcHashEntryPp *macro_entry,
1164                             size_t argofs[TOK_MAX_MACRO_ARGS],
1165                             size_t arglen[TOK_MAX_MACRO_ARGS],
1166                             int read_more, int *found_actuals);
1167 
1168     /* splice the next line for reading more macro actuals */
1169     tc_toktyp_t actual_splice_next_line(const CTcTokString *srcbuf,
1170                                         utf8_ptr *src, CTcToken *tok);
1171 
1172     /* substitute the actual parameters in a macro's expansion */
1173     int substitute_macro_actuals(class CTcMacroRsc *rsc,
1174                                  class CTcTokString *subexp,
1175                                  CTcHashEntryPp *macro_entry,
1176                                  const class CTcTokString *srcbuf,
1177                                  const size_t *argofs, const size_t *arglen,
1178                                  int allow_defined);
1179 
1180     /* stringize a macro actual parameter into an expansion buffer */
1181     void stringize_macro_actual(class CTcTokString *expbuf,
1182                                 const char *actual_val, size_t actual_len,
1183                                 char quote_char, int add_open_quote,
1184                                 int add_close_quote);
1185 
1186     /* skip a delimited macro expansion area (#foreach, #ifempty, etc) */
1187     void skip_delimited_group(utf8_ptr *p, int parts_to_skip);
1188 
1189     /* expand a defined() preprocessor operator */
1190     int expand_defined(class CTcTokString *subexp,
1191                        const class CTcTokString *srcbuf, utf8_ptr *src);
1192 
1193     /* add a file to the list of files to be included only once */
1194     void add_include_once(const char *fname);
1195 
1196     /* find a file in the list of files to be included only once */
1197     int find_include_once(const char *fname);
1198 
1199     /* process a #pragma directive */
1200     void pp_pragma();
1201 
1202     /* process a #charset directive */
1203     void pp_charset();
1204 
1205     /* process a #include directive */
1206     void pp_include();
1207 
1208     /* process a #define directive */
1209     void pp_define();
1210 
1211     /* process a #if directive */
1212     void pp_if();
1213 
1214     /* process a #ifdef directive */
1215     void pp_ifdef();
1216 
    /* process a #ifndef directive */
1218     void pp_ifndef();
1219 
1220     /* process a #ifdef or #ifndef */
1221     void pp_ifdef_or_ifndef(int sense);
1222 
1223     /* process a #else directive */
1224     void pp_else();
1225 
1226     /* process a #elif directive */
1227     void pp_elif();
1228 
1229     /* process a #endif directive */
1230     void pp_endif();
1231 
1232     /* process a #error directive */
1233     void pp_error();
1234 
1235     /* process a #undef directive */
1236     void pp_undef();
1237 
1238     /* process a #line directive */
1239     void pp_line();
1240 
1241     /* get a lone identifier for a preprocessor directive */
1242     int pp_get_lone_ident(char *buf, size_t bufl);
1243 
1244     /* process a #pragma C directive */
1245     // void pragma_c(); - not currently used
1246 
1247     /* process a #pragma once directive */
1248     void pragma_once();
1249 
1250     /* process a #pragma all_once directive */
1251     void pragma_all_once();
1252 
1253     /* process a #pragma message directive */
1254     void pragma_message();
1255 
1256     /* process a #pragma newline_spacing(on/off) directive */
1257     void pragma_newline_spacing();
1258 
1259     /*
1260      *   Determine if we're in a false #if branch.  If we're inside a #if
1261      *   block, and the state is either IF_NO, IF_DONE, or ELSE_NO, or
1262      *   we're inside a #if nested within any negative branch, we're in a
1263      *   not-taken branch of a #if block.
1264      */
in_false_if()1265     int in_false_if() const
1266     {
1267         return (if_sp_ != 0
1268                 && (if_false_level_ != 0
1269                     || if_stack_[if_sp_ - 1].state == TOKIF_IF_NO
1270                     || if_stack_[if_sp_ - 1].state == TOKIF_IF_DONE
1271                     || if_stack_[if_sp_ - 1].state == TOKIF_ELSE_NO));
1272     }
1273 
1274     /* push a new #if level with the given state */
1275     void push_if(tok_if_t state);
1276 
1277     /* get the current #if state */
get_if_state()1278     tok_if_t get_if_state() const
1279     {
1280         if (if_sp_ == 0)
1281             return TOKIF_NONE;
1282         else
1283             return if_stack_[if_sp_ - 1].state;
1284     }
1285 
1286     /* switch the current #if level to the given state */
change_if_state(tok_if_t state)1287     void change_if_state(tok_if_t state)
1288     {
1289         if (if_sp_ != 0)
1290             if_stack_[if_sp_ - 1].state = state;
1291     }
1292 
1293     /* pop the current #if level */
1294     void pop_if();
1295 
1296     /*
1297      *   Find or create a descriptor for the given filename.  'fname' is
1298      *   the full file system path specifying the file.  'orig_fname' is
1299      *   the filename as originally specified by the user, if different;
1300      *   in the case of #include files, this indicates the name that was
1301      *   specified in the directive itself, whereas 'fname' is the actual
1302      *   filename that resulted from searching the include path for the
1303      *   given name.
1304      */
1305     class CTcTokFileDesc *get_file_desc(const char *fname, size_t fname_len,
1306                                         int always_create,
1307                                         const char *orig_fname,
1308                                         size_t orig_fname_len);
1309 
1310     /* clear the line buffer */
1311     void clear_linebuf();
1312 
1313     /* flag: ALL_ONCE mode - we include each file only once */
1314     int all_once_ : 1;
1315 
1316     /* flag: warn on ignoring a redundant #include file */
1317     int warn_on_ignore_incl_ : 1;
1318 
1319     /*
1320      *   Flag: in preprocess-only mode.  In this mode, we'll leave certain
1321      *   preprocessor directives intact in the source, since they'll be
1322      *   needed in a subsequent compilation of the preprocessed source.
1323      *   For example, we'll leave #line directives, #pragma C, #error, and
1324      *   #pragma message directives in the preprocessed result.
1325      */
1326     int pp_only_mode_ : 1;
1327 
1328     /*
1329      *   Flag: in test reporting mode.  In this mode, we'll expand __FILE__
1330      *   macros with the root name only.
1331      */
1332     int test_report_mode_ : 1;
1333 
1334     /*
1335      *   Flag: in preprocess-for-includes mode.  In this mode, we'll do
1336      *   nothing except run the preprocessor and generate a list of the
1337      *   header files that are included, along with header files they
1338      *   include, and so on.
1339      */
1340     int list_includes_mode_ : 1;
1341 
1342     /*
1343      *   Flag: treat newlines in strings as whitespace.  When this is true,
1344      *   whenever we find a newline character in a string, we'll convert the
1345      *   newline and all leading whitespace on the next line to a single
1346      *   space character.  When this is false, we'll entirely strip out each
1347      *   newline in a string and all whitespace that immediately follows;
1348      *   this mode is desirable for some languages, such as Chinese, where
1349      *   whitespace is not conventionally used as a token separator in
1350      *   ordinary text.
1351      */
1352     int string_newline_spacing_ : 1;
1353 
1354     /*
1355      *   flag: we're parsing a preprocessor constant expression (for a
1356      *   #if, for example; this doesn't apply to simple macro expansion)
1357      */
1358     int in_pp_expr_ : 1;
1359 
1360     /* resource loader */
1361     class CResLoader *res_loader_;
1362 
1363     /*
1364      *   name of our default character set - this is generally specified
1365      *   by the user (on the compiler command line, for example), or
1366      *   obtained from the operating system
1367      */
1368     char *default_charset_;
1369 
1370     /* input (to unicode) character mapper for the default character set */
1371     class CCharmapToUni *default_mapper_;
1372 
1373     /* head of list of previously-included files */
1374     struct tctok_incfile_t *prev_includes_;
1375 
1376     /* head and tail of include path list */
1377     struct tctok_incpath_t *incpath_head_;
1378     struct tctok_incpath_t *incpath_tail_;
1379 
1380     /* file descriptor and line number of last line read */
1381     class CTcTokFileDesc *last_desc_;
1382     long last_linenum_;
1383 
1384     /* file descriptor and line number of last line appended */
1385     class CTcTokFileDesc *appended_desc_;
1386     long appended_linenum_;
1387 
1388     /* current input stream */
1389     class CTcTokStream *str_;
1390 
1391     /* master list of file descriptors */
1392     class CTcTokFileDesc *desc_head_;
1393     class CTcTokFileDesc *desc_tail_;
1394 
1395     /*
1396      *   array of file descriptors (we keep the list in both an array and
1397      *   a linked list, since we need both sequential and indexed access;
1398      *   this isn't a lot of trouble since we never need to remove an
1399      *   entry from the list)
1400      */
1401     class CTcTokFileDesc **desc_list_;
1402 
1403     /* number of entries in desc_list_ */
1404     size_t desc_list_cnt_;
1405 
1406     /* number of slots allocated in desc_list_ array */
1407     size_t desc_list_alo_;
1408 
1409     /* next file descriptor ID to be assigned */
1410     int next_filedesc_id_;
1411 
1412     /* pointer to current position in current line */
1413     utf8_ptr p_;
1414 
1415     /* pointer to start of current line, and length of current line */
1416     const char *line_start_;
1417     size_t line_len_;
1418 
1419     /* input buffer */
1420     CTcTokString linebuf_;
1421 
1422     /*
1423      *   unsplice buffer - we'll put any unspliced text into this buffer,
1424      *   then read it back at the next read_line()
1425      */
1426     CTcTokString unsplicebuf_;
1427 
1428     /*
1429      *   Flag: in a string.  If this is '\0', we're not in a string;
1430      *   otherwise, this is the quote character that ends the string.
1431      */
1432     wchar_t in_quote_;
1433 
1434     /* flag: in an embedded expression during line processing */
1435     uint comment_in_embedding_ : 1;
1436 
1437     /* flag: macro processing token stream is in an embedded expression */
1438     int macro_in_embedding_;
1439 
1440     /* flag: main token stream is in an embedded expression */
1441     int main_in_embedding_;
1442 
1443     /*
1444      *   #if state stack.  if_sp_ is the index of the next nesting slot;
1445      *   if if_sp_ is zero, it means that we're not in a #if at all.
1446      *
1447      *   Separately, the if_false_level_ is the level of #if's contained
1448      *   within a false #if branch.  This is separate because, once we're
1449      *   in a false #if branch, everything within it is false.
1450      */
1451     int if_sp_;
1452     tok_if_info_t if_stack_[TOK_MAX_IF_NESTING];
1453     int if_false_level_;
1454 
1455     /* source block list head */
1456     CTcTokSrcBlock *src_head_;
1457 
1458     /* current (and last) source block */
1459     CTcTokSrcBlock *src_cur_;
1460 
1461     /* pointer to next available byte in the current source block */
1462     char *src_ptr_;
1463 
1464     /* number of bytes remaining in the current source block */
1465     size_t src_rem_;
1466 
1467     /* current token */
1468     CTcToken curtok_;
1469 
1470     /* previous token (for unget) */
1471     CTcToken prvtok_;
1472 
1473     /*
1474      *   next token, if a token has been un-gotten, and a flag indicating
1475      *   that this is indeed the case.
1476      */
1477     CTcToken nxttok_;
1478     unsigned int nxttok_valid_ : 1;
1479 
1480     /* the external token source, if any */
1481     CTcTokenSource *ext_src_;
1482 
1483     /* macro expansion buffer */
1484     CTcTokString expbuf_;
1485 
1486     /* symbol table for #define symbols */
1487     class CVmHashTable *defines_;
1488 
1489     /*
1490      *   symbol table for symbols explicitly undefined; we keep track of
1491      *   these so that we can exclude anything ever undefined from the debug
1492      *   macro records, since only static global macros can be handled in the
1493      *   debug records
1494      */
1495     class CVmHashTable *undefs_;
1496 
1497     /* symbol table for TADS keywords */
1498     class CVmHashTable *kw_;
1499 
1500     /* head of macro resource pool list */
1501     class CTcMacroRsc *macro_res_head_;
1502 
1503     /* head of list of available macro resources */
1504     class CTcMacroRsc *macro_res_avail_;
1505 
1506     /*
1507      *   string capture file - if this is non-null, we'll capture all of
1508      *   the strings we read to this file, one string per line
1509      */
1510     osfildef *string_fp_;
1511 
1512     /* character mapper for writing to the string capture file */
1513     class CCharmapToLocal *string_fp_map_;
1514 
1515     /* true -> allow preprocessor directives */
1516     unsigned int allow_pp_;
1517 };
1518 
1519 /* ------------------------------------------------------------------------ */
1520 /*
1521  *   Error handler interface.  Callers of load_macros_from_file() in
1522  *   CTcTokenizer must provide an implementation of this interface to handle
1523  *   errors that occur while loading macros.
1524  */
class CTcTokLoadMacErr
{
public:
    /*
     *   Virtual destructor.  This is a polymorphic interface, so an
     *   implementation may be destroyed through a pointer to this base
     *   class; the destructor must be virtual for that to be well defined.
     */
    virtual ~CTcTokLoadMacErr() { }

    /*
     *   Flag an error.  The error codes are taken from the following list:
     *
     *   1 - a macro name symbol in the file is too long (it exceeds the
     *   maximum symbol length for the preprocessor)
     *
     *   2 - a formal parameter name is too long
     */
    virtual void log_error(int err) = 0;
};
1538 
1539 /* ------------------------------------------------------------------------ */
1540 /*
1541  *   Tokenizer File Descriptor.  Each unique source file has a separate
1542  *   file descriptor, which keeps track of the file's name.
1543  */
1544 class CTcTokFileDesc
1545 {
1546 public:
1547     /* create a file descriptor */
1548     CTcTokFileDesc(const char *fname, size_t fname_len, int index,
1549                    CTcTokFileDesc *orig_desc,
1550                    const char *orig_fname, size_t orig_fname_len);
1551 
1552     /* delete the descriptor */
1553     ~CTcTokFileDesc();
1554 
1555     /* get the filename */
get_fname()1556     const char *get_fname() const { return fname_; }
1557 
1558     /* get the original filename string */
get_orig_fname()1559     const char *get_orig_fname() const { return orig_fname_; }
1560 
1561     /*
1562      *   get the filename as a double-quoted string (backslashes and
1563      *   double-quotes will be escaped with backslashes)
1564      */
get_dquoted_fname()1565     const char *get_dquoted_fname() const { return dquoted_fname_; }
1566 
1567     /*
1568      *   get the root filename (i.e., with no path prefix) as a
1569      *   double-quoted string
1570      */
get_dquoted_rootname()1571     const char *get_dquoted_rootname() const { return dquoted_rootname_; }
1572 
1573     /* get the filename as a single-quoted string */
get_squoted_fname()1574     const char *get_squoted_fname() const { return squoted_fname_; }
1575 
1576     /* get the root filename as a single-quoted string */
get_squoted_rootname()1577     const char *get_squoted_rootname() const { return squoted_rootname_; }
1578 
1579     /* get/set the next file descriptor in the descriptor chain */
get_next()1580     CTcTokFileDesc *get_next() const { return next_; }
set_next(CTcTokFileDesc * nxt)1581     void set_next(CTcTokFileDesc *nxt) { next_ = nxt; }
1582 
1583     /* get my index in the master list */
get_index()1584     int get_index() const { return index_; }
1585 
1586     /* get the original descriptor for this file in the list */
get_orig()1587     CTcTokFileDesc *get_orig() const { return orig_; }
1588 
1589     /*
1590      *   get the list index of the original entry (returns my own list
1591      *   index if I am the original entry)
1592      */
get_orig_index()1593     int get_orig_index() const
1594         { return orig_ == 0 ? index_ : orig_->get_index(); }
1595 
1596     /*
1597      *   Add a source line position to our list.  We keep an index of the
1598      *   byte-code address for each executable source line, so that
1599      *   debuggers can find the compiled code corresponding to a source
1600      *   location.  The image builder gives us this information during the
1601      *   linking process.  The address is the absolute location in the
1602      *   image file of the executable code for the given source line (the
1603      *   first line in the file is numbered 1).
1604      */
1605     void add_source_line(ulong linenum, ulong line_addr);
1606 
1607     /*
1608      *   Enumerate the source lines, calling the callback for each one.
1609      *   We will only enumerate source lines which actually have an
1610      *   associated code location - source lines that generated no
1611      *   executable code are skipped.  We'll enumerate the lines in
1612      *   ascending order of line number, and each line number will appear
1613      *   only once.
1614      */
1615     void enum_source_lines(void (*cbfunc)(void *ctx, ulong linenum,
1616                                           ulong byte_code_addr),
1617                            void *cbctx);
1618 
1619 private:
1620     /* index in the master list */
1621     int index_;
1622 
1623     /* filename string - this is the actual file system filename */
1624     char *fname_;
1625 
1626     /*
1627      *   original filename string, if different from fname_ - this is the
1628      *   filename as specified by the user, before it was adjusted with
1629      *   include paths or other extra location information
1630      */
1631     char *orig_fname_;
1632 
1633     /* double-quoted version of the filename */
1634     char *dquoted_fname_;
1635 
1636     /* single-quoted version of the filename */
1637     char *squoted_fname_;
1638 
1639     /* single-quoted version of the root filename */
1640     char *squoted_rootname_;
1641 
1642     /* double-quoted version of the root filename */
1643     char *dquoted_rootname_;
1644 
1645     /* next descriptor in the master descriptor list */
1646     CTcTokFileDesc *next_;
1647 
1648     /*
1649      *   The original file descriptor with the same filename.  If we
1650      *   create multiple descriptors for the same filename (because, for
1651      *   example, the same header is included in several different object
1652      *   files), we'll keep track of the original descriptor for the file
1653      *   in all of the copies.
1654      */
1655     CTcTokFileDesc *orig_;
1656 
1657     /* source line pages */
1658     struct CTcTokSrcPage **src_pages_;
1659 
1660     /* number of source line page slots allocated */
1661     size_t src_pages_alo_;
1662 };
1663 
1664 
1665 /* ------------------------------------------------------------------------ */
1666 /*
1667  *   Tokenizer Input Stream
1668  */
1669 class CTcTokStream
1670 {
1671 public:
1672     /* create a token stream */
1673     CTcTokStream(class CTcTokFileDesc *desc, class CTcSrcObject *src,
1674                  CTcTokStream *parent, int charset_error,
1675                  int init_if_level);
1676 
1677     /* delete the stream */
1678     ~CTcTokStream();
1679 
1680     /* get/set the associated file descriptor */
get_desc()1681     class CTcTokFileDesc *get_desc() const { return desc_; }
set_desc(class CTcTokFileDesc * desc)1682     void set_desc(class CTcTokFileDesc *desc) { desc_ = desc; }
1683 
1684     /* get the underlying source file */
get_src()1685     class CTcSrcObject *get_src() const { return src_; }
1686 
1687     /* get the line number of the next line to be read */
get_next_linenum()1688     long get_next_linenum() const { return next_linenum_; }
1689 
1690     /* set the next line number */
set_next_linenum(long l)1691     void set_next_linenum(long l) { next_linenum_ = l; }
1692 
1693     /* get the enclosing stream */
get_parent()1694     CTcTokStream *get_parent() const { return parent_; }
1695 
1696     /* count having read a line */
count_line()1697     void count_line() { ++next_linenum_; }
1698 
1699     /* was there a #charset error when opening the file? */
get_charset_error()1700     int get_charset_error() const { return charset_error_; }
1701 
1702     /* get/set the in-comment status */
is_in_comment()1703     int is_in_comment() const { return in_comment_; }
set_in_comment(int f)1704     void set_in_comment(int f) { in_comment_ = f; }
1705 
1706     /* get/set the pragma C mode */
1707     // int is_pragma_c() const { return pragma_c_; }
1708     // void set_pragma_c(int f) { pragma_c_ = f; }
1709 
1710     /* get/set if nesting level at the start of the file */
get_init_if_level()1711     int get_init_if_level() const { return init_if_level_; }
set_init_if_level(int level)1712     void set_init_if_level(int level) { init_if_level_ = level; }
1713 
1714     /* get/set the newline spacing mode */
get_newline_spacing()1715     int get_newline_spacing() const { return newline_spacing_; }
set_newline_spacing(int f)1716     void set_newline_spacing(int f) { newline_spacing_ = f; }
1717 
1718 private:
1719     /* file descriptor associated with this file */
1720     class CTcTokFileDesc *desc_;
1721 
1722     /* the underlying source reader */
1723     class CTcSrcObject *src_;
1724 
1725     /*
1726      *   the enclosing stream - this is the stream that #include'd the
1727      *   current stream
1728      */
1729     CTcTokStream *parent_;
1730 
1731     /* line number of next line to be read */
1732     ulong next_linenum_;
1733 
1734     /* #if nesting level at the start of the file */
1735     int init_if_level_;
1736 
1737     /* flag: we were unable to load the map in the #charset directive */
1738     uint charset_error_ : 1;
1739 
1740     /* the stream is in a multi-line comment */
1741     uint in_comment_ : 1;
1742 
1743     /* newline_spacing mode when the stream was stacked */
1744     uint newline_spacing_ : 1;
1745 
1746     /* flag: we're in #pragma C+ mode */
1747     // uint pragma_c_ : 1; - #pragma C is not currently used
1748 };
1749 
1750 /* ------------------------------------------------------------------------ */
1751 /*
1752  *   Keyword Hash Table Entry
1753  */
1754 class CTcHashEntryKw: public CVmHashEntryCS
1755 {
1756 public:
CTcHashEntryKw(const textchar_t * str,tc_toktyp_t tokid)1757     CTcHashEntryKw(const textchar_t *str, tc_toktyp_t tokid)
1758         : CVmHashEntryCS(str, strlen(str), FALSE)
1759     {
1760         /* save the token ID for the keyword */
1761         tokid_ = tokid;
1762     }
1763 
1764     /* get the token ID */
get_tok_id()1765     tc_toktyp_t get_tok_id() const { return tokid_; }
1766 
1767 private:
1768     /* our token ID */
1769     tc_toktyp_t tokid_;
1770 };
1771 
1772 /* ------------------------------------------------------------------------ */
1773 /*
1774  *   basic #define symbol table entry
1775  */
1776 class CTcHashEntryPp: public CVmHashEntryCS
1777 {
1778 public:
CTcHashEntryPp(const textchar_t * str,size_t len,int copy)1779     CTcHashEntryPp(const textchar_t *str, size_t len, int copy)
1780         : CVmHashEntryCS(str, len, copy)
1781     {
1782         /* by default, we have no arguments */
1783         has_args_ = FALSE;
1784         has_varargs_ = FALSE;
1785         argc_ = 0;
1786         argv_ = 0;
1787         params_table_ = 0;
1788     }
1789 
1790     /* get the expansion text */
1791     virtual const char *get_expansion() const = 0;
1792     virtual size_t get_expan_len() const = 0;
1793 
1794     /* certain special macros (__LINE__, __FILE__) aren't undef'able */
is_undefable()1795     virtual int is_undefable() const { return TRUE; }
1796 
1797     /*
1798      *   most macros are real symbols, created by #define's, but some are
1799      *   special pseudo-macros, like __LINE__ and __FILE__, that the
1800      *   preprocessor provides
1801      */
is_pseudo()1802     virtual int is_pseudo() const { return FALSE; }
1803 
1804     /* does the macro have an argument list? */
has_args()1805     int has_args() const { return has_args_; }
1806 
1807     /* get the number of arguments */
get_argc()1808     int get_argc() const { return argc_; }
1809 
1810     /* do we have a variable number of arguments? */
has_varargs()1811     int has_varargs() const { return has_varargs_; }
1812 
1813     /*
1814      *   get the minimum number of allowed arguments - if we have varargs,
1815      *   this is one less than the number of formals listed, since the last
1816      *   formal can correspond to any number of actuals, including zero
1817      */
get_min_argc()1818     int get_min_argc() const { return has_varargs_ ? argc_ - 1 : argc_; }
1819 
1820     /* get the name of an argument by position (0 = first argument) */
get_arg_name(int idx)1821     const char *get_arg_name(int idx) const { return argv_[idx]; }
1822 
1823     /* get the parameter hash table entry for the parameter */
get_arg_entry(int idx)1824     class CTcHashEntryPpArg *get_arg_entry(int idx) const
1825         { return arg_entry_[idx]; }
1826 
1827     /* get the parameters hash table */
get_params_table()1828     const CVmHashTable *get_params_table() const { return params_table_; }
1829 
1830 protected:
1831     /* argument list */
1832     char **argv_;
1833 
1834     /* list of parameter hash entries */
1835     class CTcHashEntryPpArg **arg_entry_;
1836 
1837     /* parameter hash table */
1838     CVmHashTable *params_table_;
1839 
1840     /* argument count */
1841     int argc_;
1842 
1843     /* flag: the macro has a parameter list */
1844     uint has_args_ : 1;
1845 
1846     /*
1847      *   flag: the parameter list takes a variable number of arguments; if
1848      *   this is set, then argc_ is one greater than the minimum number of
1849      *   arguments required, and the last formal receives the varying part
1850      *   of the actual parameter list, which can contain zero or more
1851      *   actuals
1852      */
1853     uint has_varargs_ : 1;
1854 };
1855 
1856 /*
1857  *   #define symbol hash table entry
1858  */
1859 class CTcHashEntryPpDefine: public CTcHashEntryPp
1860 {
1861 public:
1862     /*
1863      *   Create the hash entry.  argc is the number of arguments to the
1864      *   macro, and argv is an array of pointers to null-terminated
1865      *   strings with the argument names, in the order defined in the
1866      *   macro.
1867      *
1868      *   If has_args is false, the macro does not take a parameter list at
1869      *   all.  Note that it is possible for has_args to be true and argc
1870      *   to be zero, because a macro can be defined to take an argument
1871      *   list with no arguments (i.e., empty parens).  A macro with an
1872      *   empty argument list is distinct from a macro with no argument
1873      *   list: in the former case, the empty parens are required, and are
1874      *   removed from the input stream and replaced with the macro's
1875      *   expansion.
1876      *
1877      *   We'll make a copy of the argument list vector, strings, and
1878      *   expansion text, so the caller is free to forget all of that after
1879      *   creating the entry instance.
1880      */
1881     CTcHashEntryPpDefine(const textchar_t *str, size_t len, int copy,
1882                          int has_args, int argc, int has_varargs,
1883                          const char **argv, const size_t *argvlen,
1884                          const char *expansion, size_t expan_len);
1885 
1886     ~CTcHashEntryPpDefine();
1887 
1888     /* get the expansion text and its length */
get_expansion()1889     const char *get_expansion() const { return expan_; }
get_expan_len()1890     size_t get_expan_len() const { return expan_len_; }
1891 
1892 private:
1893     /* expansion */
1894     char *expan_;
1895     size_t expan_len_;
1896 };
1897 
1898 
1899 /*
1900  *   Hash table entry for __FILE__ and __LINE__
1901  */
1902 class CTcHashEntryPpSpecial: public CTcHashEntryPp
1903 {
1904 public:
CTcHashEntryPpSpecial(CTcTokenizer * tok,const char * str)1905     CTcHashEntryPpSpecial(CTcTokenizer *tok, const char *str)
1906         : CTcHashEntryPp(str, strlen(str), FALSE)
1907     {
1908         /* remember my tokenizer */
1909         tok_ = tok;
1910     }
1911 
1912     /* these special macros are not undef'able */
is_undefable()1913     virtual int is_undefable() const { return FALSE; }
1914 
1915     /* special macros are pseudo-macros provided by the preprocessor */
is_pseudo()1916     virtual int is_pseudo() const { return TRUE; }
1917 
1918 protected:
1919     /* my tokenizer */
1920     CTcTokenizer *tok_;
1921 };
1922 
1923 class CTcHashEntryPpFILE: public CTcHashEntryPpSpecial
1924 {
1925 public:
CTcHashEntryPpFILE(CTcTokenizer * tok)1926     CTcHashEntryPpFILE(CTcTokenizer *tok)
1927         : CTcHashEntryPpSpecial(tok, "__FILE__") { }
1928 
1929     /* our expansion is the current filename, in single quotes */
get_expansion()1930     const char *get_expansion() const { return get_base_text(); }
get_expan_len()1931     size_t get_expan_len() const { return strlen(get_base_text()); }
1932 
1933 private:
1934     /* get our expansion base text */
get_base_text()1935     const char *get_base_text() const
1936     {
1937         /*
1938          *   if we're in test-report mode, use the root name only;
1939          *   otherwise, use the full name with path
1940          */
1941         if (tok_->get_test_report_mode())
1942             return tok_->get_last_desc()->get_squoted_rootname();
1943         else
1944             return tok_->get_last_desc()->get_squoted_fname();
1945     }
1946 };
1947 
1948 class CTcHashEntryPpLINE: public CTcHashEntryPpSpecial
1949 {
1950 public:
CTcHashEntryPpLINE(CTcTokenizer * tok)1951     CTcHashEntryPpLINE(CTcTokenizer *tok)
1952         : CTcHashEntryPpSpecial(tok, "__LINE__") { }
1953 
1954     /* our expansion is the line number as a decimal string */
get_expansion()1955     const char *get_expansion() const
1956         { gen_expansion(tok_); return buf_; }
get_expan_len()1957     size_t get_expan_len() const
1958         { gen_expansion(tok_); return strlen(buf_); }
1959 
1960 private:
1961     /* generate the expansion text into our internal buffer */
gen_expansion(CTcTokenizer * tok)1962     static void gen_expansion(CTcTokenizer *tok)
1963         { sprintf(buf_, "%ld", tok->get_last_linenum()); }
1964 
1965     /* internal buffer */
1966     static char buf_[20];
1967 };
1968 
1969 
1970 /*
1971  *   Hash entry for preprocessor arguments
1972  */
1973 class CTcHashEntryPpArg: public CVmHashEntryCS
1974 {
1975 public:
CTcHashEntryPpArg(const char * str,size_t len,int copy,int argnum)1976     CTcHashEntryPpArg(const char *str, size_t len, int copy, int argnum)
1977         : CVmHashEntryCS(str, len, copy)
1978     {
1979         /* remember the argument number */
1980         argnum_ = argnum;
1981     }
1982 
1983     /* get my argument number */
get_argnum()1984     int get_argnum() const { return argnum_; }
1985 
1986 private:
1987     /* argument number */
1988     int argnum_;
1989 };
1990 
1991 
1992 /* ------------------------------------------------------------------------ */
1993 /*
1994  *   Previously-included file list entry.  Each time we include a file,
1995  *   we'll add an entry to a list of files; in the future, we'll consult
1996  *   this list to ensure that we don't include the same file again.
1997  */
struct tctok_incfile_t
{
    /* link to the next previously-included file entry */
    tctok_incfile_t *nxt;

    /*
     *   The filename.  This is declared with a single element, but the
     *   structure is meant to be allocated with enough extra space to
     *   hold the whole name.  The declared size of one must be kept,
     *   since it contributes to the structure size that allocations are
     *   based upon.
     */
    char fname[1];
};
2006 
2007 /* ------------------------------------------------------------------------ */
2008 /*
2009  *   Include path list entry.  This structure defines one include path; we
2010  *   maintain a list of these structures.
2011  */
struct tctok_incpath_t
{
    /* link to the next include path entry */
    tctok_incpath_t *nxt;

    /*
     *   The path.  Only one element is declared; presumably, as with the
     *   included-file entries, the structure is allocated with enough
     *   extra space to hold the whole path (verify against the
     *   allocating code).  The declared size of one must be kept, since
     *   it contributes to the structure size.
     */
    char path[1];
};
2020 
2021 #endif /* TCTOK_H */
2022 
2023