1 #ifdef RCSID
2 static char RCSid[] =
3 "$Header: d:/cvsroot/tads/tads3/tctok.cpp,v 1.5 1999/07/11 00:46:58 MJRoberts Exp $";
4 #endif
5 
6 /*
7  *   Copyright (c) 1999, 2002 Michael J. Roberts.  All Rights Reserved.
8  *
9  *   Please see the accompanying license file, LICENSE.TXT, for information
10  *   on using and copying this software.
11  */
12 /*
13 Name
14   tctok.cpp - TADS3 compiler tokenizer
15 Function
16   Tokenizes TADS 3 source text, applying C-style preprocessing in the same pass
17 Notes
18   The tokenizer features an integrated C-style preprocessor.  The
19   preprocessor is integrated into the tokenizer for efficiency; since
20   the preprocessor uses the same lexical structure as the TADS
21   language, we need only tokenize the input stream once, and the result
22   can be used both for preprocessing and for parsing.
23 Modified
24   04/12/99 MJRoberts  - Creation
25 */
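
/*
 *   Typical usage, as an illustrative sketch only (not code from this file;
 *   the resource loader, character set name, and file names shown are
 *   assumptions):
 *
 *     CTcTokenizer tok(res_loader, "utf-8");
 *     if (tok.set_source("game.t", "game.t") == 0)
 *     {
 *         // pull tokens until end of file; preprocessing happens implicitly
 *         while (tok.next() != TOKT_EOF)
 *         {
 *             const CTcToken *cur = tok.getcur();
 *             // ... hand the token to the parser ...
 *         }
 *     }
 */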
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include <stdarg.h>
30 #include <time.h>
31 
32 #include "os.h"
33 #include "t3std.h"
34 #include "vmerr.h"
35 #include "vmhash.h"
36 #include "tcerr.h"
37 #include "tcerrnum.h"
38 #include "tctok.h"
39 #include "tcsrc.h"
40 #include "tcmain.h"
41 #include "tchost.h"
42 #include "tcprs.h"
43 #include "tctarg.h"
44 #include "charmap.h"
45 #include "vmfile.h"
46 
47 
48 /* ------------------------------------------------------------------------ */
49 /*
50  *   Initialize the tokenizer
51  */
52 CTcTokenizer::CTcTokenizer(CResLoader *res_loader,
53                            const char *default_charset)
54 {
55     int i;
56     time_t timer;
57     struct tm *tblk;
58     const char *tstr;
59     char timebuf[50];
60     struct kwdef
61     {
62         const char *kw_text;
63         tc_toktyp_t kw_tok_id;
64     };
65     static const kwdef kwlist[] =
66     {
67         { "self", TOKT_SELF },
68         { "targetprop", TOKT_TARGETPROP },
69         { "targetobj", TOKT_TARGETOBJ },
70         { "definingobj", TOKT_DEFININGOBJ },
71         { "inherited", TOKT_INHERITED },
72         { "delegated", TOKT_DELEGATED },
73         { "argcount", TOKT_ARGCOUNT },
74         { "if", TOKT_IF },
75         { "else", TOKT_ELSE },
76         { "for", TOKT_FOR },
77         { "while", TOKT_WHILE },
78         { "do", TOKT_DO },
79         { "switch", TOKT_SWITCH },
80         { "case", TOKT_CASE },
81         { "default", TOKT_DEFAULT },
82         { "goto", TOKT_GOTO },
83         { "break", TOKT_BREAK },
84         { "continue", TOKT_CONTINUE },
85 //      { "and", TOKT_AND },
86 //      { "or", TOKT_OR },
87 //      { "not", TOKT_NOT },
88         { "function", TOKT_FUNCTION },
89         { "return", TOKT_RETURN },
90         { "local", TOKT_LOCAL },
91         { "object", TOKT_OBJECT },
92         { "nil", TOKT_NIL },
93         { "true", TOKT_TRUE },
94         { "pass", TOKT_PASS },
95         { "external", TOKT_EXTERNAL },
96         { "extern", TOKT_EXTERN },
97         { "formatstring", TOKT_FORMATSTRING },
98         { "class", TOKT_CLASS },
99         { "replace", TOKT_REPLACE },
100         { "modify", TOKT_MODIFY },
101         { "new", TOKT_NEW },
102         { "delete", TOKT_DELETE },
103         { "throw", TOKT_THROW },
104         { "try", TOKT_TRY },
105         { "catch", TOKT_CATCH },
106         { "finally", TOKT_FINALLY },
107         { "intrinsic", TOKT_INTRINSIC },
108         { "dictionary", TOKT_DICTIONARY },
109         { "grammar", TOKT_GRAMMAR },
110         { "enum", TOKT_ENUM },
111         { "template", TOKT_TEMPLATE },
112         { "static", TOKT_STATIC },
113         { "foreach", TOKT_FOREACH },
114         { "export", TOKT_EXPORT },
115         { "propertyset", TOKT_PROPERTYSET },
116         { "transient", TOKT_TRANSIENT },
117         { "replaced", TOKT_REPLACED },
118 
119         { "void", TOKT_VOID },
120         { "int", TOKT_INT },
121         { "string", TOKT_STRING },
122         { "list", TOKT_LIST },
123         { "boolean", TOKT_BOOLEAN },
124         { "property", TOKT_PROPERTY },
125         { "any", TOKT_ANY },
126 
127         /* end-of-table marker */
128         { 0, TOKT_INVALID }
129     };
130     const kwdef *kwp;
131 
132     /* remember my resource loader */
133     res_loader_ = res_loader;
134 
135     /* there's no stream yet */
136     str_ = 0;
137 
138     /* no external source yet */
139     ext_src_ = 0;
140 
141     /* start numbering the file descriptors at zero */
142     next_filedesc_id_ = 0;
143 
144     /* there are no file descriptors yet */
145     desc_head_ = 0;
146     desc_tail_ = 0;
147     desc_list_ = 0;
148     desc_list_cnt_ = desc_list_alo_ = 0;
149 
150     /* empty out the input line buffer */
151     clear_linebuf();
152 
153     /* start out with a minimal line buffer size */
154     linebuf_.ensure_space(4096);
155     expbuf_.ensure_space(4096);
156 
157     /* set up at the beginning of the input line buffer */
158     start_new_line(linebuf_.get_buf(), linebuf_.get_text_len());
159 
160     /* remember the default character set */
161     default_charset_ = lib_copy_str(default_charset);
162 
163     /* we don't have a default character mapper yet */
164     default_mapper_ = 0;
165 
166     /* create an input mapper for the default character set, if specified */
167     if (default_charset != 0)
168         default_mapper_ = CCharmapToUni::load(res_loader, default_charset);
169 
170     /*
171      *   if the default character set wasn't specified, or we failed to
172      *   load a mapper for the specified character set, use a plain ASCII
173      *   mapper
174      */
175     if (default_mapper_ == 0)
176         default_mapper_ = new CCharmapToUniASCII();
177 
178     /* presume we're not in preprocessor-only mode */
179     pp_only_mode_ = FALSE;
180 
181     /* presume we're not in list-includes mode */
182     list_includes_mode_ = FALSE;
183 
184     /* presume we're not in test report mode */
185     test_report_mode_ = FALSE;
186 
187     /* allow preprocessing directives */
188     allow_pp_ = TRUE;
189 
190     /* there are no previously-included files yet */
191     prev_includes_ = 0;
192 
193     /* presume we'll convert newlines in strings to whitespace */
194     string_newline_spacing_ = TRUE;
195 
196     /* start out with ALL_ONCE mode off */
197     all_once_ = FALSE;
198 
199     /* by default, ignore redundant includes without warning */
200     warn_on_ignore_incl_ = FALSE;
201 
202     /* there are no include path entries yet */
203     incpath_head_ = incpath_tail_ = 0;
204 
205     /* not in a quoted string yet */
206     in_quote_ = '\0';
207 
208     /* not in an embedded expression yet */
209     comment_in_embedding_ = FALSE;
210     macro_in_embedding_ = FALSE;
211     main_in_embedding_ = FALSE;
212 
213     /* not in a #if block yet */
214     if_sp_ = 0;
215     if_false_level_ = 0;
216 
217     /* not processing a preprocessor constant expression */
218     in_pp_expr_ = FALSE;
219 
220     /* we don't have a current or appended line yet */
221     last_desc_ = 0;
222     last_linenum_ = 0;
223     appended_desc_ = 0;
224     appended_linenum_ = 0;
225 
226     /* allocate the first token-list block */
227     init_src_block_list();
228 
229     /* create the #define and #undef symbol tables */
230     defines_ = new CVmHashTable(512, new CVmHashFuncCS(), TRUE);
231     undefs_ = new CVmHashTable(64, new CVmHashFuncCS(), TRUE);
232 
233     /* create the special __LINE__ and __FILE__ macros */
234     defines_->add(new CTcHashEntryPpLINE(this));
235     defines_->add(new CTcHashEntryPpFILE(this));
236 
237     /* get the current time and date */
238     timer = time(0);
239     tblk = localtime(&timer);
240     tstr = asctime(tblk);
241 
242     /*
243      *   add the __DATE__ macro - the format is "Mmm dd yyyy", where "Mmm"
244      *   is the three-letter month name generated by asctime(), "dd" is
245      *   the day of the month, with a leading space for numbers less than
246      *   ten, and "yyyy" is the year.
247      */
248     sprintf(timebuf, "'%.3s %2d %4d'",
249             tstr + 4, tblk->tm_mday, tblk->tm_year + 1900);
250     add_define("__DATE__", timebuf);
251 
252     /* add the __TIME__ macro - 24-hour "hh:mm:ss" format */
253     sprintf(timebuf, "'%.8s'", tstr + 11);
254     add_define("__TIME__", timebuf);
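
    /*
     *   For example (illustrative values only): a build run on Jul 11 1999
     *   at 00:46:58 would define __DATE__ as 'Jul 11 1999' and __TIME__ as
     *   '00:46:58', in the single-quoted string format built above.
     */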
255 
256     /*
257      *   Allocate a pool of macro resources.  The number we start with is
258      *   arbitrary, since we'll add more as needed, but we want to try to
259      *   allocate enough up front that we avoid time-consuming memory
260      *   allocations later.  On the other hand, we don't want to
261      *   pre-allocate a huge number of objects that we'll never use.
262      */
263     for (macro_res_avail_ = 0, macro_res_head_ = 0, i = 0 ; i < 7 ; ++i)
264     {
265         CTcMacroRsc *rsc;
266 
267         /* allocate a new object */
268         rsc = new CTcMacroRsc();
269 
270         /* add it onto the master list */
271         rsc->next_ = macro_res_head_;
272         macro_res_head_ = rsc;
273 
274         /* add it onto the available list */
275         rsc->next_avail_ = macro_res_avail_;
276         macro_res_avail_ = rsc;
277     }
278 
279     /* create the keyword hash table */
280     kw_ = new CVmHashTable(64, new CVmHashFuncCS(), TRUE);
281 
282     /* populate the keyword table */
283     for (kwp = kwlist ; kwp->kw_text != 0 ; ++kwp)
284         kw_->add(new CTcHashEntryKw(kwp->kw_text, kwp->kw_tok_id));
285 
286     /* no ungot token yet */
287     nxttok_valid_ = FALSE;
288 
289     /* no string capture file */
290     string_fp_ = 0;
291     string_fp_map_ = 0;
292 }
293 
294 /*
295  *   Initialize the source save block list
296  */
297 void CTcTokenizer::init_src_block_list()
298 {
299     /* allocate the first source block */
300     src_cur_ = src_head_ = new CTcTokSrcBlock();
301 
302     /* set up to write into the first block */
303     src_ptr_ = src_head_->get_buf();
304     src_rem_ = TCTOK_SRC_BLOCK_SIZE;
305 }
306 
307 
308 /* ------------------------------------------------------------------------ */
309 /*
310  *   Delete the tokenizer
311  */
312 CTcTokenizer::~CTcTokenizer()
313 {
314     /* delete all streams */
315     delete_source();
316 
317     /* delete all file descriptors */
318     while (desc_head_ != 0)
319     {
320         CTcTokFileDesc *nxt;
321 
322         /* remember the next descriptor */
323         nxt = desc_head_->get_next();
324 
325         /* delete this one */
326         delete desc_head_;
327 
328         /* move on to the next one */
329         desc_head_ = nxt;
330     }
331 
332     /* delete the file descriptor index array */
333     if (desc_list_ != 0)
334         t3free(desc_list_);
335 
336     /* delete our default character set string copy */
337     lib_free_str(default_charset_);
338 
339     /* release our reference on our default character mapper */
340     default_mapper_->release_ref();
341 
342     /* forget about all of our previous include files */
343     while (prev_includes_ != 0)
344     {
345         tctok_incfile_t *nxt;
346 
347         /* remember the next file */
348         nxt = prev_includes_->nxt;
349 
350         /* delete this one */
351         t3free(prev_includes_);
352 
353         /* move on to the next one */
354         prev_includes_ = nxt;
355     }
356 
357     /* delete the include path list */
358     while (incpath_head_ != 0)
359     {
360         tctok_incpath_t *nxt;
361 
362         /* remember the next entry in the path */
363         nxt = incpath_head_->nxt;
364 
365         /* delete this entry */
366         t3free(incpath_head_);
367 
368         /* move on to the next one */
369         incpath_head_ = nxt;
370     }
371 
372     /* delete the macro resources */
373     while (macro_res_head_ != 0)
374     {
375         CTcMacroRsc *nxt;
376 
377         /* remember the next one */
378         nxt = macro_res_head_->next_;
379 
380         /* delete this one */
381         delete macro_res_head_;
382 
383         /* move on to the next one */
384         macro_res_head_ = nxt;
385     }
386 
387     /* delete the token list */
388     delete src_head_;
389 
390     /* delete the #define and #undef symbol tables */
391     delete defines_;
392     delete undefs_;
393 
394     /* delete the keyword hash table */
395     delete kw_;
396 
397     /* if we created a mapping for the string capture file, release it */
398     if (string_fp_map_ != 0)
399         string_fp_map_->release_ref();
400 }
401 
402 /* ------------------------------------------------------------------------ */
403 /*
404  *   Clear the line buffer
405  */
406 void CTcTokenizer::clear_linebuf()
407 {
408     /* clear the buffer */
409     linebuf_.clear_text();
410 
411     /* reset our read point to the start of the line buffer */
412     p_.set(linebuf_.get_buf());
413 }
414 
415 /* ------------------------------------------------------------------------ */
416 /*
417  *   Get a textual representation of an operator token
418  */
419 const char *CTcTokenizer::get_op_text(tc_toktyp_t op)
420 {
421     struct tokname_t
422     {
423         tc_toktyp_t typ;
424         const char *nm;
425     };
426     static const tokname_t toknames[] =
427     {
428         { TOKT_EOF, "<end of file>" },
429         { TOKT_SYM, "<symbol>" },
430         { TOKT_INT, "<integer>" },
431         { TOKT_SSTR, "<single-quoted string>" },
432         { TOKT_DSTR, "<double-quoted string>" },
433         { TOKT_DSTR_START, "<double-quoted string>" },
434         { TOKT_DSTR_MID, "<double-quoted string>" },
435         { TOKT_DSTR_END, "<double-quoted string>" },
436         { TOKT_LPAR, "(" },
437         { TOKT_RPAR, ")" },
438         { TOKT_COMMA, "," },
439         { TOKT_DOT, "." },
440         { TOKT_LBRACE, "{" },
441         { TOKT_RBRACE, "}", },
442         { TOKT_LBRACK, "[", },
443         { TOKT_RBRACK, "]", },
444         { TOKT_EQ, "=", },
445         { TOKT_EQEQ, "==", },
446         { TOKT_ASI, ":=" },
447         { TOKT_PLUS, "+" },
448         { TOKT_MINUS, "-" },
449         { TOKT_TIMES, "*" },
450         { TOKT_DIV, "/", },
451         { TOKT_MOD, "%" },
452         { TOKT_GT, ">" },
453         { TOKT_LT, "<" },
454         { TOKT_GE, ">=" },
455         { TOKT_LE, "<=" },
456         { TOKT_NE, "!=" },
457         { TOKT_ARROW, "->" },
458         { TOKT_COLON, ":" },
459         { TOKT_SEM, ";" },
460         { TOKT_AND, "&" },
461         { TOKT_ANDAND, "&&" },
462         { TOKT_OR, "|" },
463         { TOKT_OROR, "||" },
464         { TOKT_XOR, "^" },
465         { TOKT_SHL, "<<" },
466         { TOKT_SHR, ">>" },
467         { TOKT_INC, "++" },
468         { TOKT_DEC, "--" },
469         { TOKT_PLUSEQ, "+=" },
470         { TOKT_MINEQ, "-=" },
471         { TOKT_TIMESEQ, "*=" },
472         { TOKT_DIVEQ, "/=" },
473         { TOKT_MODEQ, "%=" },
474         { TOKT_ANDEQ, "&=" },
475         { TOKT_OREQ, "|=" },
476         { TOKT_XOREQ, "^=" },
477         { TOKT_SHLEQ, "<<=" },
478         { TOKT_SHREQ, ">>=" },
479         { TOKT_NOT, "! (not)" },
480         { TOKT_BNOT, "~" },
481         { TOKT_POUND, "#" },
482         { TOKT_POUNDPOUND, "##" },
483         { TOKT_POUNDAT, "#@" },
484         { TOKT_ELLIPSIS, "..." },
485         { TOKT_QUESTION, "?" },
486         { TOKT_COLONCOLON, "::" },
487         { TOKT_FLOAT, "<float>" },
488         { TOKT_AT, "@" },
489         { TOKT_SELF, "self" },
490         { TOKT_TARGETPROP, "targetprop" },
491         { TOKT_TARGETOBJ, "targetobj" },
492         { TOKT_DEFININGOBJ, "definingobj" },
493         { TOKT_INHERITED, "inherited" },
494         { TOKT_DELEGATED, "delegated" },
495         { TOKT_IF, "if" },
496         { TOKT_ELSE, "else" },
497         { TOKT_FOR, "for" },
498         { TOKT_WHILE, "while" },
499         { TOKT_DO, "do" },
500         { TOKT_SWITCH, "switch" },
501         { TOKT_CASE, "case" },
502         { TOKT_DEFAULT, "default" },
503         { TOKT_GOTO, "goto" },
504         { TOKT_BREAK, "break" },
505         { TOKT_CONTINUE, "continue" },
506         { TOKT_FUNCTION, "function" },
507         { TOKT_RETURN, "return" },
508         { TOKT_LOCAL, "local" },
509         { TOKT_OBJECT, "object" },
510         { TOKT_NIL, "nil" },
511         { TOKT_TRUE, "true" },
512         { TOKT_PASS, "pass" },
513         { TOKT_EXTERNAL, "external" },
514         { TOKT_EXTERN, "extern" },
515         { TOKT_FORMATSTRING, "formatstring" },
516         { TOKT_CLASS, "class" },
517         { TOKT_REPLACE, "replace" },
518         { TOKT_MODIFY, "modify" },
519         { TOKT_NEW, "new" },
520         { TOKT_DELETE, "delete" },
521         { TOKT_THROW, "throw" },
522         { TOKT_TRY, "try" },
523         { TOKT_CATCH, "catch" },
524         { TOKT_FINALLY, "finally" },
525         { TOKT_INTRINSIC, "intrinsic" },
526         { TOKT_DICTIONARY, "dictionary" },
527         { TOKT_GRAMMAR, "grammar" },
528         { TOKT_ENUM, "enum" },
529         { TOKT_TEMPLATE, "template" },
530         { TOKT_STATIC, "static" },
531         { TOKT_FOREACH, "foreach" },
532         { TOKT_EXPORT, "export" },
533         { TOKT_PROPERTYSET, "propertyset" },
534         { TOKT_TRANSIENT, "transient" },
535         { TOKT_REPLACED, "replaced" },
536         { TOKT_VOID, "void" },
537         { TOKT_INTKW, "int" },
538         { TOKT_STRING, "string" },
539         { TOKT_LIST, "list" },
540         { TOKT_BOOLEAN, "boolean" },
541         { TOKT_PROPERTY, "property" },
542         { TOKT_ANY, "any"},
543         { TOKT_INVALID, 0 }
544     };
545     const tokname_t *p;
546 
547     /* search for the token */
548     for (p = toknames ; p->nm != 0 ; ++p)
549     {
550         /* if this is our token, return the associated name string */
551         if (p->typ == op)
552             return p->nm;
553     }
554 
555     /* we didn't find it */
556     return "<unknown>";
557 }
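
/*
 *   Illustrative use (a sketch of hypothetical caller code; assumes the
 *   routine is callable as shown):
 *
 *     printf("expected '%s'\n", CTcTokenizer::get_op_text(TOKT_RPAR));
 *
 *   which prints "expected ')'", per the table above.
 */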
558 
559 /* ------------------------------------------------------------------------ */
560 /*
561  *   Reset the tokenizer.  Delete the current source object and all of the
562  *   saved source text.  This can be used after compilation of a unit
563  *   (such as a debugger expression) is completed and the intermediate
564  *   parser state is no longer needed.
565  */
566 void CTcTokenizer::reset()
567 {
568     /* delete the source object */
569     delete_source();
570 
571     /* delete saved token text */
572     if (src_head_ != 0)
573     {
574         /* delete the list */
575         delete src_head_;
576 
577         /* re-initialize the source block list */
578         init_src_block_list();
579     }
580 }
581 
582 /* ------------------------------------------------------------------------ */
583 /*
584  *   Delete the source file, if any, including any parent include files.
585  */
586 void CTcTokenizer::delete_source()
587 {
588     /* delete the current stream and all enclosing parents */
589     while (str_ != 0)
590     {
591         CTcTokStream *nxt;
592 
593         /* remember the next stream in the list */
594         nxt = str_->get_parent();
595 
596         /* delete this stream */
597         delete str_;
598 
599         /* move up to the next one */
600         str_ = nxt;
601     }
602 
603     /* there are no more streams */
604     str_ = 0;
605 }
606 
607 
608 /* ------------------------------------------------------------------------ */
609 /*
610  *   Set up to read a source file.  Returns zero on success, or a non-zero
611  *   error code on failure.
612  */
613 int CTcTokenizer::set_source(const char *src_filename, const char *orig_name)
614 {
615     CTcTokFileDesc *desc;
616     CTcSrcFile *src;
617     int charset_error;
618     int default_charset_error;
619 
620     /* empty out the input line buffer */
621     clear_linebuf();
622 
623     /* set up at the beginning of the input line buffer */
624     start_new_line(linebuf_.get_buf(), linebuf_.get_text_len());
625 
626     /* create a reader for the source file */
627     src = CTcSrcFile::open_source(src_filename, res_loader_,
628                                   default_charset_, &charset_error,
629                                   &default_charset_error);
630     if (src == 0)
631     {
632         /* if we had a problem loading the default character set, log it */
633         if (default_charset_error)
634             log_error(TCERR_CANT_LOAD_DEFAULT_CHARSET, default_charset_);
635 
636         /* return failure */
637         return TCERR_CANT_OPEN_SRC;
638     }
639 
640     /* find or create a file descriptor for this filename */
641     desc = get_file_desc(src_filename, strlen(src_filename), FALSE,
642                          orig_name, strlen(orig_name));
643 
644     /*
645      *   Create a stream to read the source file.  The new stream has no
646      *   parent, because this is the top-level source file, and was not
647      *   included from any other file.
648      */
649     str_ = new CTcTokStream(desc, src, 0, charset_error, if_sp_);
650 
651     /* success */
652     return 0;
653 }
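
/*
 *   Usage sketch (illustrative only; the file names are assumptions):
 *
 *     int err = tok.set_source("lib/adv3.t", "adv3.t");
 *     if (err != 0)
 *     {
 *         // the source file couldn't be opened (TCERR_CANT_OPEN_SRC)
 *     }
 */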
654 
655 /*
656  *   Set up to read source code from a memory buffer
657  */
658 void CTcTokenizer::set_source_buf(const char *buf)
659 {
660     CTcSrcMemory *src;
661 
662     /* empty out the input line buffer */
663     clear_linebuf();
664 
665     /* reset the scanning state to the start of a brand new stream */
666     in_pp_expr_ = FALSE;
667     last_linenum_ = 0;
668     unsplicebuf_.clear_text();
669     in_quote_ = 0;
670     comment_in_embedding_ = FALSE;
671     macro_in_embedding_ = FALSE;
672     main_in_embedding_ = FALSE;
673     if_sp_ = 0;
674     if_false_level_ = 0;
675     nxttok_valid_ = FALSE;
676 
677     /* set up at the beginning of the input line buffer */
678     start_new_line(linebuf_.get_buf(), linebuf_.get_text_len());
679 
680     /* create a reader for the memory buffer */
681     src = new CTcSrcMemory(buf, default_mapper_);
682 
683     /*
684      *   Create a stream to read the source file.  The new stream has no
685      *   parent, because this is the top-level source file, and was not
686      *   included from any other file.
687      */
688     str_ = new CTcTokStream(0, src, 0, 0, if_sp_);
689 }
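
/*
 *   Usage sketch (illustrative only): compiling a debugger expression held
 *   in memory rather than in a file:
 *
 *     tok.set_source_buf("x + y * 2");
 *     while (tok.next() != TOKT_EOF)
 *     {
 *         // ... parse the expression tokens ...
 *     }
 */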
690 
691 /* ------------------------------------------------------------------------ */
692 /*
693  *   Find or create a file descriptor for a given filename
694  */
695 CTcTokFileDesc *CTcTokenizer::get_file_desc(const char *fname,
696                                             size_t fname_len,
697                                             int always_create,
698                                             const char *orig_fname,
699                                             size_t orig_fname_len)
700 {
701     CTcTokFileDesc *orig_desc;
702     CTcTokFileDesc *desc;
703 
704     /* presume we won't find an original descriptor in the list */
705     orig_desc = 0;
706 
707     /*
708      *   Search the list of existing descriptors to find one that matches.
709      *   Do this regardless of whether we're allowed to re-use an existing
710      *   one or not - even if we're creating a new one unconditionally, we
711      *   need to know if there's an earlier copy that already exists so we
712      *   can associate the new one with the original.
713      */
714     for (desc = desc_head_ ; desc != 0 ; desc = desc->get_next())
715     {
716         /* check for a name match */
717         if (strlen(desc->get_fname()) == fname_len
718             && memcmp(desc->get_fname(), fname, fname_len) == 0)
719         {
720             /*
721              *   if we're allowed to return an existing descriptor, return
722              *   this one, since it's for the same filename
723              */
724             if (!always_create)
725                 return desc;
726 
727             /*
728              *   we have to create a new descriptor even though we have an
729              *   existing one - remember the original so we can point the
730              *   new one back to the original
731              */
732             orig_desc = desc;
733 
734             /*
735              *   no need to look any further - we've found the first
736              *   instance of this filename in our list
737              */
738             break;
739         }
740     }
741 
742     /* we didn't find a match - create a new descriptor */
743     desc = new CTcTokFileDesc(fname, fname_len, next_filedesc_id_++,
744                               orig_desc, orig_fname, orig_fname_len);
745 
746     /* link it in at the end of the master list */
747     desc->set_next(0);
748     if (desc_tail_ == 0)
749         desc_head_ = desc;
750     else
751         desc_tail_->set_next(desc);
752     desc_tail_ = desc;
753 
754     /* expand our array index if necessary */
755     if (desc_list_cnt_ >= desc_list_alo_)
756     {
757         size_t siz;
758 
759         /* allocate or expand the array */
760         desc_list_alo_ += 10;
761         siz = desc_list_alo_ * sizeof(desc_list_[0]);
762         if (desc_list_ == 0)
763             desc_list_ = (CTcTokFileDesc **)t3malloc(siz);
764         else
765             desc_list_ = (CTcTokFileDesc **)t3realloc(desc_list_, siz);
766     }
767 
768     /* add the new array entry */
769     desc_list_[desc_list_cnt_++] = desc;
770 
771     /* return it */
772     return desc;
773 }
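
/*
 *   Illustrative semantics (a sketch, not code from this file):
 *
 *     CTcTokFileDesc *a = tok.get_file_desc("foo.t", 5, FALSE, "foo.t", 5);
 *     CTcTokFileDesc *b = tok.get_file_desc("foo.t", 5, FALSE, "foo.t", 5);
 *     // b == a: the existing descriptor is reused
 *
 *     CTcTokFileDesc *c = tok.get_file_desc("foo.t", 5, TRUE, "foo.t", 5);
 *     // c != a, but c points back to a as its original descriptor
 */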
774 
775 
776 /* ------------------------------------------------------------------------ */
777 /*
778  *   Add an include path entry.  Each new entry goes at the end of the
779  *   list, after all previous entries.
780  */
781 void CTcTokenizer::add_inc_path(const char *path)
782 {
783     tctok_incpath_t *entry;
784 
785     /* create a new path list entry */
786     entry = (tctok_incpath_t *)t3malloc(sizeof(tctok_incpath_t)
787                                         + strlen(path));
788 
789     /* store the path in the entry */
790     strcpy(entry->path, path);
791 
792     /* link this entry at the end of our list */
793     if (incpath_tail_ != 0)
794         incpath_tail_->nxt = entry;
795     else
796         incpath_head_ = entry;
797     incpath_tail_ = entry;
798     entry->nxt = 0;
799 }
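
/*
 *   Usage sketch (paths are illustrative): since each new entry goes at the
 *   end of the list, directories are presumably searched in the order added:
 *
 *     tok.add_inc_path("./include");
 *     tok.add_inc_path("/usr/local/share/tads3/include");
 */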
800 
801 
802 /* ------------------------------------------------------------------------ */
803 /*
804  *   Set the string capture file.
805  */
806 void CTcTokenizer::set_string_capture(osfildef *fp)
807 {
808     /* remember the capture file */
809     string_fp_ = fp;
810 
811     /*
812      *   if we don't already have a character mapping to translate from
813      *   our internal unicode characters back into the source file
814      *   character set, create one now
815      */
816     if (string_fp_map_ == 0)
817     {
818         /* try creating a mapping for the default character set */
819         if (default_charset_ != 0)
820             string_fp_map_ =
821                 CCharmapToLocal::load(res_loader_, default_charset_);
822 
823         /* if we couldn't create the mapping, use a default ASCII mapping */
824         if (string_fp_map_ == 0)
825             string_fp_map_ = CCharmapToLocal::load(res_loader_, "us-ascii");
826     }
827 }
828 
829 
830 /* ------------------------------------------------------------------------ */
831 /*
832  *   Get the next token in the input stream, reading additional lines from
833  *   the source file as needed.
834  */
835 tc_toktyp_t CTcTokenizer::next()
836 {
837     /* the current token is about to become the previous token */
838     prvtok_ = curtok_;
839 
840     /* if there's an un-got token, return it */
841     if (nxttok_valid_)
842     {
843         /* get the previously-saved token */
844         curtok_ = nxttok_;
845 
846         /* we've now consumed nxttok_ */
847         nxttok_valid_ = FALSE;
848 
849         /* return the new token's type */
850         return curtok_.gettyp();
851     }
852 
853     /* if there's an external source, get its next token */
854     if (ext_src_ != 0)
855     {
856         const CTcToken *ext_tok;
857 
858         /* get the next token from the external source */
859         ext_tok = ext_src_->get_next_token();
860 
861         /* check to see if we got a token */
862         if (ext_tok == 0)
863         {
864             /*
865              *   restore the current token in effect before this source was
866              *   active
867              */
868             curtok_ = *ext_src_->get_enclosing_curtok();
869 
870             /*
871              *   this source has no more tokens - restore the enclosing
872              *   source, and keep going so we try getting a token from it
873              */
874             ext_src_ = ext_src_->get_enclosing_source();
875 
876             /* return the token type */
877             return curtok_.gettyp();
878         }
879         else
880         {
881             /* we got a token - copy it to our internal token buffer */
882             curtok_ = *ext_tok;
883 
884             /* return its type */
885             return curtok_.gettyp();
886         }
887     }
888 
889     /* keep going until we get a valid token */
890     for (;;)
891     {
892         tc_toktyp_t typ;
893 
894         /*
895          *   read the next token from the current line, applying
896          *   appropriate string translations and storing strings and
897          *   symbols in the source block list
898          */
899         typ = next_on_line_xlat_keep();
900 
901         /* if it's the "null" token, skip it and read another token */
902         if (typ == TOKT_NULLTOK)
903             continue;
904 
905         /* if we found a valid token, we're done - return the token */
906         if (typ != TOKT_EOF)
907             return typ;
908 
909         /*
910          *   if we're at the end of a preprocess line, don't read another
911          *   line - just return end of file
912          */
913         if (p_.getch() == TOK_END_PP_LINE)
914             return TOKT_EOF;
915 
916         /*
917          *   we've reached the end of the line - read another line,
918          *   applying preprocessing directives and expanding macros as
919          *   needed
920          */
921         if (read_line_pp())
922         {
923             /* no more lines are available - return end of file */
924             return TOKT_EOF;
925         }
926     }
927 }
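
/*
 *   Illustrative consumption pattern (hypothetical caller code):
 *
 *     switch (tok.next())
 *     {
 *     case TOKT_SYM:
 *         // symbol text: tok.getcur()->get_text(), length:
 *         // tok.getcur()->get_text_len()
 *         break;
 *     case TOKT_EOF:
 *         // no more input
 *         break;
 *     default:
 *         break;
 *     }
 */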
928 
929 /* ------------------------------------------------------------------------ */
930 /*
931  *   clear external token sources, returning to the true input stream
932  */
933 void CTcTokenizer::clear_external_sources()
934 {
935     /*
936      *   restore the current token as it was before the outermost external
937      *   source was first established
938      */
939     if (ext_src_ != 0)
940     {
941         CTcTokenSource *outer;
942 
943         /* find the outermost source */
944         for (outer = ext_src_ ; outer->get_enclosing_source() != 0 ;
945              outer = outer->get_enclosing_source()) ;
946 
947         /* restore its original next token */
948         curtok_ = *outer->get_enclosing_curtok();
949     }
950 
951     /* there's no external source now */
952     ext_src_ = 0;
953 }
954 
955 /* ------------------------------------------------------------------------ */
956 /*
957  *   Make a safely storable copy of the current token.
958  */
959 const CTcToken *CTcTokenizer::copycur()
960 {
961     /* if the current token is a symbol, it already has a safe copy */
962     if (curtok_.gettyp() == TOKT_SYM)
963         return getcur();
964 
965     /* save the current token's text in permanent tokenizer memory */
966     curtok_.set_text(store_source(curtok_.get_text(), curtok_.get_text_len()),
967                      curtok_.get_text_len());
968 
969     /* return the current token, now that we've made it safe */
970     return &curtok_;
971 }
972 
973 /*
974  *   Make a safely storable copy of a given token.
975  */
976 void CTcTokenizer::copytok(CTcToken *dst, const CTcToken *src)
977 {
978     /* start with an exact copy of the token */
979     *dst = *src;
980 
981     /* if the token is a symbol, it already has a safe copy */
982     if (src->gettyp() == TOKT_SYM)
983         return;
984 
985     /* save the token's text in permanent tokenizer memory */
986     dst->set_text(store_source(dst->get_text(), dst->get_text_len()),
987                   dst->get_text_len());
988 }
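
/*
 *   Illustrative use (a sketch): a caller that must hold a token across
 *   further next() calls should copy it first, since the text of a
 *   non-symbol token points into the source line buffer, which is
 *   overwritten when the next line is read:
 *
 *     CTcToken saved;
 *     tok.copytok(&saved, tok.getcur());
 *     tok.next();
 *     // 'saved' still has valid text; the uncopied original may not
 */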
989 
990 
991 /* ------------------------------------------------------------------------ */
992 /*
993  *   Check to see if the current token matches the given text
994  */
995 int CTcTokenizer::cur_tok_matches(const char *txt, size_t len)
996 {
997     /* if the length matches, and the text matches exactly, it matches */
998     return (getcur()->get_text_len() == len
999             && memcmp(getcur()->get_text(), txt, len) == 0);
1000 }
1001 
1002 /* ------------------------------------------------------------------------ */
1003 /*
1004  *   Un-get the current token
1005  */
1006 void CTcTokenizer::unget()
1007 {
1008     /*
1009      *   remember the current token as the next one to fetch, and flag
1010      *   that this is valid
1011      */
1012     nxttok_ = curtok_;
1013     nxttok_valid_ = TRUE;
1014 
1015     /* go back to the previous token */
1016     curtok_ = prvtok_;
1017 }
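
/*
 *   Illustrative peek-and-restore pattern (hypothetical caller code):
 *
 *     if (tok.next() == TOKT_LPAR)
 *     {
 *         // ... consume the parenthesized form ...
 *     }
 *     else
 *     {
 *         // not what we wanted - put the token back
 *         tok.unget();
 *     }
 */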
1018 
1019 /* ------------------------------------------------------------------------ */
1020 /*
1021  *   Assume that we should have just found a '>>' terminating an embedded
1022  *   expression in a double-quoted string.  If possible, back out the
1023  *   previous token and re-scan it as though it had started with '>>'.
1024  *
1025  *   This is to be called by a higher-level parser when it determines
1026  *   that, syntactically, we should have found the '>>' leaving an
1027  *   embedded expression.
1028  */
1029 void CTcTokenizer::assume_missing_dstr_cont()
1030 {
1031     /* act as though we had just seen '>>' */
1032     xlat_string_to_src(&main_in_embedding_, TRUE);
1033 }
1034 
1035 
1036 /* ------------------------------------------------------------------------ */
1037 /*
1038  *   Skip whitespace and macro expansion markers
1039  */
1040 void CTcTokenizer::skip_ws_and_markers(utf8_ptr *p)
1041 {
1042     /* keep going until we find something interesting */
1043     for (;;)
1044     {
1045         wchar_t cur;
1046 
1047         /* get the current character */
1048         cur = p->getch();
1049 
1050         /*
1051          *   if it's a macro expansion end marker, skip it as though it
1052          *   were whitespace; otherwise, if it's whitespace, skip it;
1053          *   otherwise, we're done skipping leading whitespace
1054          */
1055         if (cur == TOK_MACRO_EXP_END)
1056         {
1057             /* skip the embedded pointer value that follows */
1058             p->set(p->getptr() + 1 + sizeof(CTcHashEntryPp *));
1059         }
1060         else if (is_space(cur))
1061         {
1062             /* skip the space */
1063             p->inc();
1064         }
1065         else
1066         {
1067             /* it's not whitespace or equivalent - we're done */
1068             return;
1069         }
1070     }
1071 }
1072 
1073 /* ------------------------------------------------------------------------ */
1074 /*
1075  *   Get the next token from the input stream, operating on the current
1076  *   line only.
1077  */
1078 tc_toktyp_t CTcTokenizer::next_on_line(utf8_ptr *p, CTcToken *tok,
1079                                        int *in_embedding)
1080 {
1081     wchar_t cur;
1082     tc_toktyp_t typ;
1083     utf8_ptr start;
1084     int num_minus;
1085 
1086     /* skip whitespace */
1087     skip_ws_and_markers(p);
1088 
1089     /* remember where the token starts */
1090     start = *p;
1091 
1092     /* if there's nothing left in the current line, return EOF */
1093     if (p->getch() == '\0')
1094     {
1095         /* indicate end of file */
1096         typ = TOKT_EOF;
1097         goto done;
1098     }
1099 
1100     /* get the initial character, and skip it */
1101     cur = p->getch();
1102     p->inc();
1103 
1104     /* presume the token will not be marked as fully macro-expanded */
1105     tok->set_fully_expanded(FALSE);
1106 
1107     /* presume it's not a number with a minus sign */
1108     num_minus = FALSE;
1109 
1110     /* see what we have */
1111     switch(cur)
1112     {
1113     case TOK_MACRO_FORMAL_FLAG:
1114         /*
1115          *   this is a two-byte formal parameter sequence in a macro
1116          *   expansion - skip the second byte of the two-byte sequence,
1117          *   and return the special token type for this sequence
1118          */
1119         typ = TOKT_MACRO_FORMAL;
1120 
1121         /*
1122          *   skip the second byte - note that we want to skip exactly one
1123          *   byte, regardless of what the byte looks like as a utf-8
1124          *   partial character, since it's not a utf-8 character at all
1125          */
1126         p->set(p->getptr() + 1);
1127         break;
1128 
1129     case TOK_MACRO_FOREACH_FLAG:
1130         /*
1131          *   this is the special macro '#foreach' flag - return it as a
1132          *   special pseudo-token
1133          */
1134         typ = TOKT_MACRO_FOREACH;
1135         break;
1136 
1137     case TOK_MACRO_IFEMPTY_FLAG:
1138         /* #ifempty macro flag */
1139         typ = TOKT_MACRO_IFEMPTY;
1140         break;
1141 
1142     case TOK_MACRO_IFNEMPTY_FLAG:
1143         /* #ifnempty macro flag */
1144         typ = TOKT_MACRO_IFNEMPTY;
1145         break;
1146 
1147     case TOK_MACRO_ARGCOUNT_FLAG:
1148         /* it's the special macro '#argcount' flag */
1149         typ = TOKT_MACRO_ARGCOUNT;
1150         break;
1151 
1152     case TOK_FULLY_EXPANDED_FLAG:
1153         /* set the token flag indicating that it has been fully expanded */
1154         tok->set_fully_expanded(TRUE);
1155 
1156         /* the token symbol starts at the byte after the flag byte */
1157         start = p->getptr();
1158 
1159         /* read the first character of the symbol */
1160         cur = p->getch();
1161         p->inc();
1162 
1163         /* tokenize the symbol that follows */
1164         goto tokenize_symbol;
1165 
1166     case TOK_END_PP_LINE:
1167         /*
1168          *   Preprocess line-ending marker - when we reach the end of a
1169          *   preprocessor line, we can't read another source line, because
1170          *   a preprocessor directive consists of only a single logical
1171          *   source line.  Once we see this, return end-of-file until the
1172          *   caller explicitly reads a new source line.
1173          *
1174          *   Keep the read pointer stuck on this flag byte, so that we
1175          *   return end-of-file on a subsequent attempt to get the next
1176          *   token.
1177          */
1178         *p = start;
1179         typ = TOKT_EOF;
1180         break;
1181 
1182     case '0':
1183     case '1':
1184     case '2':
1185     case '3':
1186     case '4':
1187     case '5':
1188     case '6':
1189     case '7':
1190     case '8':
1191     case '9':
1192         {
1193             long acc;
1194 
1195             /*
1196              *   Start out with the leading digit in the accumulator.  Note
1197              *   that the character set internally is always UTF-8.
1198              */
1199             acc = value_of_digit(cur);
1200 
1201             /*
1202              *   If it's a leading zero, treat as octal or hex.  '0x' means
1203              *   hex; otherwise, '0' means octal.
1204              */
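            /*
             *   For example: "0x1A" yields 26, "017" yields 15 (octal),
             *   and "123" yields 123 (decimal).
             */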
1205             if (cur == '0')
1206             {
1207                 /* check for hex - if it's not hex, it's octal */
1208                 if (p->getch() == 'x' || p->getch() == 'X')
1209                 {
1210                     /* skip the 'x' */
1211                     p->inc();
1212 
1213                     /*
1214                      *   scan the hex number - keep going until we find
1215                      *   something that's not a hex digit
1216                      */
1217                     for (;;)
1218                     {
1219                         /* get this character */
1220                         cur = p->getch();
1221 
1222                         /* if it's not a hex digit, stop scanning */
1223                         if (!is_xdigit(cur))
1224                             break;
1225 
1226                         /*
1227                          *   Shift the accumulator and add this digit's value.
1228                          *   Note that we can save a test - if the character is
1229                          *   >= lower-case 'a', we know it's not an upper-case
1230                          *   letter because the lower-case letters all have
1231                          *   values above the upper-case letters in UTF-8
1232                          *   encoding (which we always use as the internal
1233                          *   character set).  Since we already know it's a
1234                          *   valid hex digit (we wouldn't be here if it
1235                          *   weren't), we can just check to see if it's at
1236                          *   least lower-case 'a', and we automatically know
1237                          *   then whether it's in the 'a'-'f' range or the
1238                          *   'A'-'F' range.
1239                          */
1240                         acc *= 16;
1241                         acc += value_of_xdigit(cur);
1242 
1243                         /* move on */
1244                         p->inc();
1245                     }
1246                 }
1247                 else
1248                 {
1249                     /* scan octal digits */
1250                     for ( ; is_odigit(p->getch()) ; p->inc())
1251                         acc = 8*acc + value_of_odigit(p->getch());
1252                 }
1253             }
1254             else
1255             {
1256                 /* scan decimal digits */
1257                 for ( ; is_digit(p->getch()) ; p->inc())
1258                     acc = 10*acc + value_of_digit(p->getch());
1259             }
1260 
1261             /* negate the value if we had a minus sign */
1262             if (num_minus)
1263                 acc = -acc;
1264 
1265             /*
1266              *   if we stopped at a decimal point or an exponent, it's a
1267              *   floating point number
1268              */
1269             if (p->getch() == '.' || p->getch() == 'e' || p->getch() == 'E')
1270                 goto do_float;
1271 
1272             /* it's an integer value */
1273             typ = TOKT_INT;
1274 
1275             /* set the integer value */
1276             tok->set_int_val(acc);
1277         }
1278         break;
1279 
1280     do_float:
1281         {
1282             int found_decpt;
1283 
1284             /* start over and parse the float */
1285             for (*p = start, found_decpt = FALSE ; ; p->inc())
1286             {
1287                 /* get this character and move on */
1288                 cur = p->getch();
1289 
1290                 /* see what we have */
1291                 if (is_digit(cur))
1292                 {
1293                     /* we have another digit; just keep going */
1294                 }
1295                 else if (!found_decpt && cur == '.')
1296                 {
1297                     /* it's the decimal point - note it and keep going */
1298                     found_decpt = TRUE;
1299                 }
1300                 else if (cur == 'e' || cur == 'E')
1301                 {
1302                     /* it's the exponent - if there's a sign, skip it */
1303                     p->inc();
1304                     cur = p->getch();
1305                     if (cur == '+' || cur == '-')
1306                         p->inc();
1307 
1308                     /* keep going until we find no more digits */
1309                     while (is_digit(p->getch()))
1310                         p->inc();
1311 
1312                     /* the end of the exponent is the end of the number */
1313                     break;
1314                 }
1315                 else
1316                 {
1317                     /* everything else ends the number */
1318                     break;
1319                 }
1320             }
1321         }
1322 
1323         /* it's a float */
1324         typ = TOKT_FLOAT;
1325         break;
1326 
1327     case '"':
1328     case '\'':
1329         *p = start;
1330         return tokenize_string(p, tok, in_embedding);
1331 
1332     case '(':
1333         typ = TOKT_LPAR;
1334         break;
1335 
1336     case ')':
1337         typ = TOKT_RPAR;
1338         break;
1339 
1340     case ',':
1341         typ = TOKT_COMMA;
1342         break;
1343 
1344     case '.':
1345         /* check for '...' and floating-point numbers */
1346         if (p->getch() == '.' && p->getch_at(1) == '.')
1347         {
1348             p->inc();
1349             p->inc();
1350             typ = TOKT_ELLIPSIS;
1351         }
1352         else if (is_digit(p->getch()))
1353             goto do_float;
1354         else
1355             typ = TOKT_DOT;
1356         break;
1357 
1358     case '{':
1359         typ = TOKT_LBRACE;
1360         break;
1361 
1362     case '}':
1363         typ = TOKT_RBRACE;
1364         break;
1365 
1366     case '[':
1367         typ = TOKT_LBRACK;
1368         break;
1369 
1370     case ']':
1371         typ = TOKT_RBRACK;
1372         break;
1373 
1374     case '=':
1375         /* check for '==' */
1376         if (p->getch() == '=')
1377         {
1378             p->inc();
1379             typ = TOKT_EQEQ;
1380         }
1381         else
1382             typ = TOKT_EQ;
1383         break;
1384 
1385     case ':':
1386         /* check for ':=' and '::' */
1387         if (p->getch() == '=')
1388         {
1389             p->inc();
1390             typ = TOKT_ASI;
1391         }
1392         else if (p->getch() == ':')
1393         {
1394             p->inc();
1395             typ = TOKT_COLONCOLON;
1396         }
1397         else
1398             typ = TOKT_COLON;
1399         break;
1400 
1401     case '?':
1402         typ = TOKT_QUESTION;
1403         break;
1404 
1405     case '+':
1406         /* check for '++' and '+=' */
1407         if (p->getch() == '+')
1408         {
1409             p->inc();
1410             typ = TOKT_INC;
1411         }
1412         else if (p->getch() == '=')
1413         {
1414             p->inc();
1415             typ = TOKT_PLUSEQ;
1416         }
1417         else
1418             typ = TOKT_PLUS;
1419         break;
1420 
1421     case '-':
1422         /* check for '--', '->' and '-=' */
1423         if (p->getch() == '-')
1424         {
1425             p->inc();
1426             typ = TOKT_DEC;
1427         }
1428         else if (p->getch() == '=')
1429         {
1430             p->inc();
1431             typ = TOKT_MINEQ;
1432         }
1433         else if (p->getch() == '>')
1434         {
1435             p->inc();
1436             typ = TOKT_ARROW;
1437         }
1438         else
1439             typ = TOKT_MINUS;
1440         break;
1441 
1442     case '*':
1443         /* check for '*=' */
1444         if (p->getch() == '=')
1445         {
1446             p->inc();
1447             typ = TOKT_TIMESEQ;
1448         }
1449         else
1450             typ = TOKT_TIMES;
1451         break;
1452 
1453     case '/':
1454         /* check for '/=' */
1455         if (p->getch() == '=')
1456         {
1457             p->inc();
1458             typ = TOKT_DIVEQ;
1459         }
1460         else
1461             typ = TOKT_DIV;
1462         break;
1463 
1464     case '%':
1465         /* check for '%=' */
1466         if (p->getch() == '=')
1467         {
1468             p->inc();
1469             typ = TOKT_MODEQ;
1470         }
1471         else
1472             typ = TOKT_MOD;
1473         break;
1474 
1475     case '>':
1476         /* check for '>>=', '>>' and '>=' */
1477         if (p->getch() == '=')
1478         {
1479             p->inc();
1480             typ = TOKT_GE;
1481         }
1482         else if (p->getch() == '>')
1483         {
1484             /* check for the end of an embedded expression */
1485             if (in_embedding != 0 && *in_embedding)
1486             {
1487                 *p = start;
1488                 return tokenize_string(p, tok, in_embedding);
1489             }
1490 
1491             /* check for '>>=' */
1492             p->inc();
1493             if (p->getch() == '=')
1494             {
1495                 p->inc();
1496                 typ = TOKT_SHREQ;
1497             }
1498             else
1499                 typ = TOKT_SHR;
1500         }
1501         else
1502             typ = TOKT_GT;
1503         break;
1504 
1505     case '<':
1506         /* check for '<<=', '<<', '<>', and '<=' */
1507         if (p->getch() == '=')
1508         {
1509             p->inc();
1510             typ = TOKT_LE;
1511         }
1512         else if (p->getch() == '<')
1513         {
1514             /* check for '<<=' */
1515             p->inc();
1516             if (p->getch() == '=')
1517             {
1518                 p->inc();
1519                 typ = TOKT_SHLEQ;
1520             }
1521             else
1522                 typ = TOKT_SHL;
1523         }
1524         else if (p->getch() == '>')
1525         {
1526             p->inc();
1527             typ = TOKT_NE;
1528         }
1529         else
1530             typ = TOKT_LT;
1531         break;
1532 
1533     case ';':
1534         typ = TOKT_SEM;
1535         break;
1536 
1537     case '&':
1538         /* check for '&&' and '&=' */
1539         if (p->getch() == '&')
1540         {
1541             p->inc();
1542             typ = TOKT_ANDAND;
1543         }
1544         else if (p->getch() == '=')
1545         {
1546             p->inc();
1547             typ = TOKT_ANDEQ;
1548         }
1549         else
1550             typ = TOKT_AND;
1551         break;
1552 
1553     case '|':
1554         /* check for '||' and '|=' */
1555         if (p->getch() == '|')
1556         {
1557             p->inc();
1558             typ = TOKT_OROR;
1559         }
1560         else if (p->getch() == '=')
1561         {
1562             p->inc();
1563             typ = TOKT_OREQ;
1564         }
1565         else
1566             typ = TOKT_OR;
1567         break;
1568 
1569     case '^':
1570         /* check for '^=' */
1571         if (p->getch() == '=')
1572         {
1573             p->inc();
1574             typ = TOKT_XOREQ;
1575         }
1576         else
1577             typ = TOKT_XOR;
1578         break;
1579 
1580     case '!':
1581         /* check for '!=' */
1582         if (p->getch() == '=')
1583         {
1584             p->inc();
1585             typ = TOKT_NE;
1586         }
1587         else
1588             typ = TOKT_NOT;
1589         break;
1590 
1591     case '~':
1592         typ = TOKT_BNOT;
1593         break;
1594 
1595     case '@':
1596         typ = TOKT_AT;
1597         break;
1598 
1599     case '#':
1600         /* check for '##' and '#@' */
1601         if (p->getch() == '#')
1602         {
1603             p->inc();
1604             typ = TOKT_POUNDPOUND;
1605         }
1606         else if (p->getch() == '@')
1607         {
1608             p->inc();
1609             typ = TOKT_POUNDAT;
1610         }
1611         else
1612             typ = TOKT_POUND;
1613         break;
1614 
1615     default:
1616         /* check to see if it's a symbol */
1617         if (is_syminit(cur))
1618         {
1619             size_t len, full_len;
1620 
1621             /*
1622              *   scan the identifier (note that we've already skipped the
1623              *   first character, so we start out at length = 1)
1624              */
1625         tokenize_symbol:
1626             for (len = full_len = 1 ; is_sym(p->getch()) ; p->inc())
1627             {
1628                 /* count the full length */
1629                 ++full_len;
1630 
1631                 /*
1632                  *   count this character if we're not over the maximum
1633                  *   length
1634                  */
1635                 if (len < TOK_SYM_MAX_LEN)
1636                     ++len;
1637             }
1638 
1639             /* if we truncated the symbol, issue a warning */
1640             if (full_len != len)
1641                 log_warning(TCERR_SYMBOL_TRUNCATED,
1642                             (int)full_len, start.getptr(),
1643                             (int)len, start.getptr());
1644 
1645             /* it's a symbol */
1646             typ = TOKT_SYM;
1647         }
1648         else
1649         {
1650             /* invalid token */
1651             typ = TOKT_INVALID;
1652         }
1653         break;
1654     }
1655 
1656 done:
1657     /* set the type */
1658     tok->settyp(typ);
1659 
1660     /* set the text */
1661     tok->set_text(start.getptr(), p->getptr() - start.getptr());
1662 
1663     /* return the type */
1664     return typ;
1665 }
1666 
1667 /*
1668  *   get the next token, limiting to the length of the source buffer
1669  */
1670 tc_toktyp_t CTcTokenizer::next_on_line(const CTcTokString *srcbuf,
1671                                        utf8_ptr *p, CTcToken *tok,
1672                                        int *in_embedding)
1673 {
1674     /* get the next token */
1675     next_on_line(p, tok, in_embedding);
1676 
1677     /* if the token is past the end of the line, return EOF */
1678     if (tok->get_text() >= srcbuf->get_text_end())
1679     {
1680         /* set the token to indicate end of line */
1681         tok->settyp(TOKT_EOF);
1682 
1683         /* set the token to point to the end of the buffer */
1684         tok->set_text(srcbuf->get_text_end(), 0);
1685     }
1686 
1687     /* return the token type */
1688     return tok->gettyp();
1689 }
1690 
1691 /*
1692  *   Get the next token on the line, translating escapes in strings.  This
1693  *   updates the line buffer in-place to incorporate the translated string
1694  *   text.
1695  */
1696 tc_toktyp_t CTcTokenizer::next_on_line_xlat(utf8_ptr *p, CTcToken *tok,
1697                                             int *in_embedding)
1698 {
1699     /* skip whitespace */
1700     skip_ws_and_markers(p);
1701 
1702     /* if this is a string, translate escapes */
1703     switch(p->getch())
1704     {
1705     case '"':
1706     case '\'':
1707         /* translate the string */
1708         return xlat_string(p, tok, in_embedding);
1709 
1710     case '>':
1711         /* if we're in an embedding, check for '>>' */
1712         if (in_embedding != 0 && *in_embedding && p->getch_at(1) == '>')
1713             return tokenize_string(p, tok, in_embedding);
1714 
1715         /* use the default case */
1716         goto do_normal;
1717 
1718     default:
1719     do_normal:
1720         /* for anything else, use the default tokenizer */
1721         return next_on_line(p, tok, in_embedding);
1722     }
1723 }
1724 
1725 /*
1726  *   Look up a keyword
1727  */
1728 int CTcTokenizer::look_up_keyword(const CTcToken *tok, tc_toktyp_t *kwtok)
1729 {
1730     CTcHashEntryKw *kw;
1731 
1732     /* look it up in the keyword table */
1733     kw = (CTcHashEntryKw *)kw_->find(tok->get_text(), tok->get_text_len());
1734     if (kw != 0)
1735     {
1736         /* we found the keyword - set 'kw' to the keyword token id */
1737         *kwtok = kw->get_tok_id();
1738 
1739         /* tell the caller we found it */
1740         return TRUE;
1741     }
1742     else
1743     {
1744         /* tell the caller it's not a keyword */
1745         return FALSE;
1746     }
1747 }
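
/*
 *   Usage sketch (hypothetical caller code):
 *
 *     tc_toktyp_t kwtok;
 *     if (tok.look_up_keyword(tok.getcur(), &kwtok))
 *     {
 *         // the current symbol is a reserved word; kwtok is its token type
 *     }
 */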
1748 
1749 /*
1750  *   Get the next token on the line, translating escape sequences in
1751  *   strings, and storing strings and symbols in the source block list.
1752  *   This routine also translates keywords to their token types.
1753  */
1754 tc_toktyp_t CTcTokenizer::next_on_line_xlat_keep()
1755 {
1756     tc_toktyp_t typ;
1757 
1758     /* keep going until we find a valid symbol */
1759     for (;;)
1760     {
1761         /* skip whitespace and macro expansion flags */
1762         skip_ws_and_markers(&p_);
1763 
1764         /* see what we have */
1765         switch(p_.getch())
1766         {
1767         case '"':
1768         case '\'':
1769             /* it's a string - translate and save it */
1770             return xlat_string_to_src(&main_in_embedding_, FALSE);
1771 
1772         case '>':
1773             /* if we're in an embedding, this is the end of it */
1774             if (main_in_embedding_ && p_.getch_at(1) == '>')
1775                 return xlat_string_to_src(&main_in_embedding_, FALSE);
1776 
1777             /* use the normal parsing */
1778             goto do_normal;
1779 
1780         default:
1781         do_normal:
1782             /* for anything else, use the default tokenizer */
1783             typ = next_on_line(&p_, &curtok_, &main_in_embedding_);
1784 
1785             /* check the token type */
1786             switch(typ)
1787             {
1788             case TOKT_SYM:
1789                 /* symbol */
1790                 {
1791                     const char *p;
1792                     CTcHashEntryKw *kw;
1793 
1794                     /* look it up in the keyword table */
1795                     kw = (CTcHashEntryKw *)kw_->find(curtok_.get_text(),
1796                         curtok_.get_text_len());
1797                     if (kw != 0)
1798                     {
1799                         /* replace the token with the keyword token type */
1800                         typ = kw->get_tok_id();
1801                         curtok_.settyp(typ);
1802                     }
1803                     else
1804                     {
1805                         /* ordinary symbol - save the text */
1806                         p = store_source(curtok_.get_text(),
1807                                          curtok_.get_text_len());
1808 
1809                         /*
1810                          *   change the token's text to point to the
1811                          *   source block, so that this token's text
1812                          *   pointer will remain permanently valid (the
1813                          *   original copy, in the source line buffer,
1814                          *   will be overwritten as soon as we read
1815                          *   another source line; we don't want the caller
1816                          *   to have to worry about this, so we return the
1817                          *   permanent copy)
1818                          */
1819                         curtok_.set_text(p, curtok_.get_text_len());
1820                     }
1821                 }
1822                 break;
1823 
1824             case TOKT_FLOAT:
1825                 /* floating-point number */
1826                 {
1827                     const char *p;
1828 
1829                     /*
1830                      *   save the text so that it remains permanently
1831                      *   valid - we keep track of floats by the original
1832                      *   text, and let the code generator produce the
1833                      *   appropriate object file representation
1834                      */
1835                     p = store_source(curtok_.get_text(),
1836                                      curtok_.get_text_len());
1837                     curtok_.set_text(p, curtok_.get_text_len());
1838                 }
1839                 break;
1840 
1841             case TOKT_INVALID:
1842                 /* log an error for the invalid token */
1843                 log_error_curtok(TCERR_INVALID_CHAR);
1844 
1845                 /* skip this character */
1846                 p_.inc();
1847 
1848                 /* keep going */
1849                 continue;
1850             }
1851         }
1852 
1853         /* return the type */
1854         return typ;
1855     }
1856 }
1857 
1858 
1859 /*
1860  *   Translate the string at the current token position in the input
1861  *   stream to the source block list.
1862  */
1863 tc_toktyp_t CTcTokenizer::xlat_string_to_src(int *in_embedding,
1864                                              int force_embed_end)
1865 {
1866     tc_toktyp_t typ;
1867 
1868     /*
1869      *   Reserve space for the entire rest of the line.  This is
1870      *   conservative, in that we will never need more space than
1871      *   this.  This might cause us to waste a little space here and
1872      *   there, since we will over-allocate when we have a short string
1873      *   early in a long line, but this will save us the time of scanning
1874      *   the string twice just to see how long it is.
1875      */
1876     reserve_source(line_len_ - (p_.getptr() - line_start_));
1877 
1878     /* translate into the source block */
1879     typ = xlat_string_to(src_ptr_, &p_, &curtok_,
1880                          in_embedding, force_embed_end);
1881 
1882     /* commit the space in the source block */
1883     commit_source(curtok_.get_text_len() + 1);
1884 
1885     /* return the string token */
1886     return typ;
1887 }
1888 
1889 /*
1890  *   Translate a string, setting up the token structure for the string,
1891  *   and writing the translated version of the string directly over the
1892  *   original source buffer of the string.
1893  *
1894  *   Since a translated string can only shrink (because a translated
1895  *   escape sequence is always shorter than the original source version),
1896  *   we don't need a separate buffer, but can simply translate into the
1897  *   source buffer, overwriting the original string as we go.
1898  */
1899 tc_toktyp_t CTcTokenizer::xlat_string(utf8_ptr *p, CTcToken *tok,
1900                                       int *in_embedding)
1901 {
1902     char *dst;
1903 
1904     /*
1905      *   write the translated string over the original string's text,
1906      *   starting at the character after the quote
1907      */
1908     dst = p->getptr() + 1;
1909 
1910     /* translate the string into our destination buffer */
1911     return xlat_string_to(dst, p, tok, in_embedding, FALSE);
1912 }
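
/*
 *   Worked example of the in-place shrinkage (hypothetical input): the
 *   string
 *
 *      "a\tb\u0041"
 *
 *   has 10 bytes between its quotes, but translates to just the four
 *   characters 'a', TAB, 'b', 'A' plus a null terminator, so writing the
 *   result over the source buffer starting one byte past the open quote
 *   always fits.
 */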
1913 
1914 /*
1915  *   Translate a string, setting up the token structure for the string.
1916  *   We will update the line buffer in-place to incorporate the translated
1917  *   string text.
1918  */
1919 tc_toktyp_t CTcTokenizer::xlat_string_to(char *dstp, utf8_ptr *p,
1920                                          CTcToken *tok, int *in_embedding,
1921                                          int force_embed_end)
1922 {
1923     utf8_ptr dst;
1924     wchar_t qu;
1925     utf8_ptr start, end;
1926     int i;
1927 
1928     /* set up our output utf8 pointer */
1929     dst.set(dstp);
1930 
1931     /* note the open quote character */
1932     qu = p->getch();
1933 
1934     /* set the appropriate string token type */
1935     tok->settyp(qu == '"'
1936                 ? TOKT_DSTR
1937                 : (qu == '>' ? TOKT_DSTR_END : TOKT_SSTR));
1938 
1939     /* skip the open quote */
1940     p->inc();
1941 
1942     /* skip the second '>' if it's a '>>' */
1943     if (force_embed_end)
1944     {
1945         /*
1946          *   they want us to assume the embedding ends here, regardless of
1947          *   what we're looking at - act the same as though we had
1948          *   actually seen '>>', but don't skip any input (in fact, back
1949          *   up one, since we already skipped one character for what we
1950          *   had thought was the open quote)
1951          */
1952         p->dec();
1953 
1954         /* clear the caller's in-embedding status */
1955         *in_embedding = FALSE;
1956 
1957         /* close with a double quote */
1958         qu = '"';
1959 
1960         /* it's a double-quoted string continuation */
1961         tok->settyp(TOKT_DSTR_END);
1962     }
1963     else if (qu == '>')
1964     {
1965         /* skip the second '>' */
1966         p->inc();
1967 
1968         /* clear the caller's in-embedding status */
1969         *in_embedding = FALSE;
1970 
1971         /* close with a double quote */
1972         qu = '"';
1973     }
1974 
1975     /* remember where the string's contents start */
1976     start = *p;
1977 
1978     /* scan the string and translate quotes */
1979     for (;;)
1980     {
1981         wchar_t cur;
1982 
1983         /* get this character */
1984         cur = p->getch();
1985 
1986         /* if this is the matching quote, we're done */
1987         if (cur == qu)
1988             break;
1989 
1990         /*
1991          *   if we find an end-of-line within the string, it's an error -
1992          *   we should always splice strings together onto a single line
1993          *   before starting to tokenize the line
1994          */
1995         if (cur == '\0')
1996         {
1997             size_t len;
1998             utf8_ptr p;
1999 
2000             /* note where the string ends */
2001             end = dst;
2002 
2003             /* set the token's text pointer */
2004             tok->set_text(dstp, end.getptr() - dstp);
2005 
2006             /* null-terminate the result string */
2007             dst.setch('\0');
2008 
2009             /*
2010              *   get the length of the unterminated string so far, but for
2011              *   error logging, limit the length to twenty characters --
2012              *   we just want to give the user enough information to find
2013              *   the string in error, without making the error message
2014              *   huge
2015              */
2016             p.set(dstp);
2017             len = p.len(end.getptr() - dstp);
2018             if (len > 20)
2019                 len = p.bytelen(20);
2020 
2021             /*
2022              *   Check for a special heuristic case.  If the string was of
2023              *   zero length, and we have something sitting in our
2024              *   unsplice buffer, here's what probably happened: the input
2025              *   was missing a ">>" sequence at the end of an embedded
2026              *   expression, and the parser told us to put it back in.  We
2027              *   had earlier decided we needed to splice up to a quote to
2028              *   end what looked to us like an unterminated string.  If
2029              *   this is the case, we and the parser are working at cross
2030              *   purposes; the parser is smarter than we are, so we should
2031              *   synchronize with it.
2032              */
2033             if (tok->get_text_len() == 0
2034                 && qu == '"'
2035                 && unsplicebuf_.get_text_len() != 0)
2036             {
2037                 char *buf;
2038 
2039                 /*
2040                  *   we must have spliced a line to finish a string -
2041                  *   insert the quote into the splice buffer, and ignore
2042                  *   it here
2043                  */
2044 
2045                 /*
2046                  *   make sure there's room for one more character (plus a
2047                  *   null byte)
2048                  */
2049                 unsplicebuf_.ensure_space(unsplicebuf_.get_text_len() + 2);
2050 
2051                 /* get the buffer pointer */
2052                 buf = unsplicebuf_.get_buf();
2053 
2054                 /* make room for the '"' */
2055                 memmove(buf + 1, buf, unsplicebuf_.get_text_len());
2056                 unsplicebuf_.set_text_len(unsplicebuf_.get_text_len() + 1);
2057 
2058                 /* add the '"' */
2059                 *buf = '"';
2060 
2061                 /*
2062                  *   return the 'null token' to tell the caller to try
2063                  *   again - do not log an error at this point
2064                  */
2065                 return TOKT_NULLTOK;
2066             }
2067 
2068             /* log the error */
2069             log_error(TCERR_UNTERM_STRING,
2070                       (char)qu, (int)len, dstp, (char)qu);
2071 
2072             /* return the string type */
2073             return tok->gettyp();
2074         }
2075 
2076         /* if this is an escape, translate it */
2077         if (cur == '\\')
2078         {
2079             long acc;
2080 
2081             /* get the character after the escape */
2082             p->inc();
2083             cur = p->getch();
2084 
2085             /* see what we have */
2086             switch(cur)
2087             {
2088             case '^':
2089                 /* caps - 0x000F */
2090                 cur = 0x000F;
2091                 break;
2092 
2093             case 'v':
2094                 /* minuscules - 0x000E */
2095                 cur = 0x000E;
2096                 break;
2097 
2098             case 'b':
2099                 /* blank line - 0x000B */
2100                 cur = 0x000B;
2101                 break;
2102 
2103             case ' ':
2104                 /* quoted space - 0x0015 */
2105                 cur = 0x0015;
2106                 break;
2107 
2108             case 'n':
2109                 /* newline - explicitly use Unicode 10 character */
2110                 cur = 10;
2111                 break;
2112 
2113             case 't':
2114                 /* tab - explicitly use Unicode 9 character */
2115                 cur = 9;
2116                 break;
2117 
2118             case 'u':
2119                 /*
2120                  *   Hex unicode character number.  Read up to 4 hex
2121                  *   digits that follow the 'u', and use that as a Unicode
2122                  *   character ID.
2123                  */
2124                 for (i = 0, acc = 0, p->inc() ; i < 4 ; ++i, p->inc())
2125                 {
2126                     /* get the next character */
2127                     cur = p->getch();
2128 
2129                     /*
2130                      *   if it's another hex digit, add it into the
2131                      *   accumulator; otherwise, we're done
2132                      */
2133                     if (is_xdigit(cur))
2134                         acc = 16*acc + value_of_xdigit(cur);
2135                     else
2136                         break;
2137                 }
2138 
2139                 /* use the accumulated value as the character number */
2140                 dst.setch((wchar_t)acc);
2141 
2142                 /*
2143                  *   continue with the current character, since we've
2144                  *   already skipped ahead to the next one
2145                  */
2146                 continue;
2147 
2148             case '0':
2149             case '1':
2150             case '2':
2151             case '3':
2152             case '4':
2153             case '5':
2154             case '6':
2155             case '7':
2156                 /*
2157                  *   Octal ASCII character number.  Accumulate octal digits
2158                  *   into a character ID, stopping if the value would exceed 255.
2159                  */
2160                 for (i = 0, acc = 0 ; i < 4 ; ++i, p->inc())
2161                 {
2162                     /* get the next character */
2163                     cur = p->getch();
2164 
2165                     /*
2166                      *   if it's another digit, and it would leave our
2167                      *   result in the 0-255 range, count it; if not,
2168                      *   we're done
2169                      */
2170                     if (is_odigit(cur))
2171                     {
2172                         long new_acc;
2173 
2174                         /* compute the new value */
2175                         new_acc = 8*acc + value_of_odigit(cur);
2176 
2177                         /* if this would be too high, don't count it */
2178                         if (new_acc > 255)
2179                             break;
2180                         else
2181                             acc = new_acc;
2182                     }
2183                     else
2184                         break;
2185                 }
2186 
2187                 /* use the accumulated value as the character number */
2188                 dst.setch((wchar_t)acc);
2189 
2190                 /*
2191                  *   continue with the current character, since we've
2192                  *   already skipped ahead to the next one
2193                  */
2194                 continue;
2195 
2196             case 'x':
2197                 /*
2198                  *   Hex ASCII character number.  Read up to two hex
2199                  *   digits as a character number.
2200                  */
2201                 for (i = 0, acc = 0, p->inc() ; i < 2 ; ++i, p->inc())
2202                 {
2203                     /* get the next character */
2204                     cur = p->getch();
2205 
2206                     /*
2207                      *   if it's another hex digit, add it into the
2208                      *   accumulator; otherwise, we're done
2209                      */
2210                     if (is_xdigit(cur))
2211                         acc = 16*acc + value_of_xdigit(cur);
2212                     else
2213                         break;
2214                 }
2215 
2216                 /* use the accumulated value as the character number */
2217                 dst.setch((wchar_t)acc);
2218 
2219                 /*
2220                  *   continue with the current character, since we've
2221                  *   already skipped ahead to the next one
2222                  */
2223                 continue;
2224 
2225             default:
2226                 /* copy anything else as-is */
2227                 break;
2228             }
2229         }
2230         else if (in_embedding != 0 && !*in_embedding
2231                  && cur == '<' && p->getch_at(1) == '<')
2232         {
2233             /*
2234              *   it's the start of an embedded expression - change the
2235              *   token type to indicate this
2236              */
2237             tok->settyp(tok->gettyp() == TOKT_DSTR
2238                         ? TOKT_DSTR_START : TOKT_DSTR_MID);
2239 
2240             /* tell the caller we're in an embedding */
2241             *in_embedding = TRUE;
2242 
2243             /* stop scanning */
2244             break;
2245         }
2246 
2247         /* copy this character to the output position */
2248         dst.setch(cur);
2249 
2250         /* get the next character */
2251         p->inc();
2252     }
2253 
2254     /* note where the string ends */
2255     end = dst;
2256 
2257     /* set the token's text pointer */
2258     tok->set_text(dstp, end.getptr() - dstp);
2259 
2260     /* null-terminate the result string */
2261     dst.setch('\0');
2262 
2263     /* skip an extra character if this is the start of an embedding */
2264     if (p->getch() == '<')
2265         p->inc();
2266 
2267     /* skip the closing quote */
2268     p->inc();
2269 
2270     /* return the string type */
2271     return tok->gettyp();
2272 }
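
/*
 *   A few worked examples of the escape accumulators above, following the
 *   loops directly: '\x41' accumulates 16*0+4 = 4, then 16*4+1 = 65,
 *   giving 'A'; '\101' accumulates 1, then 8, then 65, also 'A'; and
 *   '\u222B' accumulates to 0x222B, which dst.setch() stores back into
 *   the buffer as a multi-byte UTF-8 sequence.
 */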
2273 
2274 
2275 /*
2276  *   Skip a string, setting up the token structure for the string.  This
2277  *   routine only parses to the end of the line; if the line ends with the
2278  *   string unterminated, we'll flag an error.
2279  */
2280 tc_toktyp_t CTcTokenizer::tokenize_string(utf8_ptr *p, CTcToken *tok,
2281                                           int *in_embedding)
2282 {
2283     const char *start;
2284     const char *contents_start;
2285     const char *contents_end;
2286     tc_toktyp_t typ;
2287     wchar_t qu;
2288     int allow_embedding;
2289 
2290     /* remember where the text starts */
2291     start = p->getptr();
2292 
2293     /* note the quote type */
2294     qu = p->getch();
2295 
2296     /* skip the quote in the input */
2297     p->inc();
2298 
2299     /* determine the token type based on the quote type */
2300     switch(qu)
2301     {
2302     case '\'':
2303         /* single-quoted string */
2304         typ = TOKT_SSTR;
2305         allow_embedding = FALSE;
2306         break;
2307 
2308     case '>':
2309         /*
2310          *   this must be the next part of a string with embeddings; for now,
2311          *   assume it's the end of the string, although it may just turn out
2312          *   to be the middle
2313          */
2314         typ = TOKT_DSTR_END;
2315         allow_embedding = (in_embedding != 0);
2316 
2317         /* skip the extra '>' character */
2318         p->inc();
2319 
2320         /* clear the embedding flag */
2321         if (in_embedding != 0)
2322             *in_embedding = FALSE;
2323 
2324         /* look for a closing double quote */
2325         qu = '"';
2326         break;
2327 
2328     case '"':
2329         /* regular double-quoted string */
2330         typ = TOKT_DSTR;
2331         allow_embedding = (in_embedding != 0);
2332         break;
2333 
2334     default:
2335         /* anything else is invalid */
2336         typ = TOKT_INVALID;
2337         allow_embedding = FALSE;
2338         break;
2339     }
2340 
2341     /* this is where the string's contents start */
2342     contents_start = p->getptr();
2343 
2344     /* scan the string */
2345     for (;;)
2346     {
2347         wchar_t cur;
2348 
2349         /* get the current character */
2350         cur = p->getch();
2351 
2352         /* see what we have */
2353         if (cur == '\\')
2354         {
2355             /* escape sequence - skip an extra character */
2356             p->inc();
2357         }
2358         else if (cur == '<' && allow_embedding && p->getch_at(1) == '<')
2359         {
2360             /*
2361              *   it's the start of an embedded expression - return the
2362              *   appropriate embedded string part type
2363              */
2364             if (typ == TOKT_DSTR)
2365                 typ = TOKT_DSTR_START;
2366             else
2367                 typ = TOKT_DSTR_MID;
2368 
2369             /* remember that we're in an embedding in the token stream */
2370             *in_embedding = TRUE;
2371 
2372             /* this is where the contents end */
2373             contents_end = p->getptr();
2374 
2375             /* skip the two embedding characters */
2376             p->inc();
2377             p->inc();
2378 
2379             /* we're done - set the text in the token */
2380             tok->set_text(start, p->getptr() - start);
2381 
2382             /* done */
2383             break;
2384         }
2385         else if (cur == qu)
2386         {
2387             /* this is where the contents end */
2388             contents_end = p->getptr();
2389 
2390             /* skip the closing quote */
2391             p->inc();
2392 
2393             /* we're done - set the text in the token */
2394             tok->set_text(start, p->getptr() - start);
2395 
2396             /* done */
2397             break;
2398         }
2399         else if (cur == '\0')
2400         {
2401             /* this is where the contents end */
2402             contents_end = p->getptr();
2403 
2404             /*
2405              *   We have an unterminated string.  If we're evaluating a
2406              *   preprocessor constant expression, log an error; otherwise
2407              *   let it go for now, since we'll catch the error during the
2408              *   normal tokenizing pass for parsing.
2409              */
2410             if (G_tok->in_pp_expr_)
2411                 log_error(TCERR_PP_UNTERM_STRING);
2412 
2413             /* set the partial text */
2414             tok->set_text(start, p->getptr() - start);
2415 
2416             /* end of line - return with the string unfinished */
2417             break;
2418         }
2419 
2420         /* skip this character of input */
2421         p->inc();
2422     }
2423 
2424     /*
2425      *   if we're not in preprocessor mode, and we're saving string text,
2426      *   write the string to the string text output file
2427      */
2428     if (!G_tok->in_pp_expr_ && G_tok->string_fp_ != 0
2429         && contents_start != contents_end)
2430     {
2431         /* write the line, translating back to the source character set */
2432         G_tok->string_fp_map_
2433             ->write_file(G_tok->string_fp_, contents_start,
2434                          (size_t)(contents_end - contents_start));
2435 
2436         /* add a newline */
2437         osfwb(G_tok->string_fp_, "\n", 1);
2438     }
2439 
2440     /* set the type in the token */
2441     tok->settyp(typ);
2442 
2443     /* return the token type */
2444     return tok->gettyp();
2445 }
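
/*
 *   Contrast with xlat_string(): here the token text covers the entire
 *   source span, quotes included, and escapes are merely skipped rather
 *   than translated.  For a hypothetical input '"say \"hi\""', this
 *   routine yields the raw 12-character span as the token text, whereas
 *   xlat_string() would have produced the 8-character contents
 *   'say "hi"'.
 */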
2446 
2447 
2448 /* ------------------------------------------------------------------------ */
2449 /*
2450  *   Read a source line and handle preprocessor directives.  This routine
2451  *   will transparently handle #include, #define, and other directives;
2452  *   when this routine returns, the input buffer will have a line of text
2453  *   that contains no # directive.
2454  *
2455  *   Returns zero on success, non-zero upon reaching the end of the input.
2456  */
2457 int CTcTokenizer::read_line_pp()
2458 {
2459     int started_in_string;
2460     char *p;
2461 
2462     /*
2463      *   Read the next line from the input.  If that fails, return an end
2464      *   of file indication.
2465      */
2466     p = read_line(FALSE);
2467     if (p == 0)
2468         return 1;
2469 
2470     /*
2471      *   before we process comments, note whether or not the line started
2472      *   out within a character string
2473      */
2474     started_in_string = (in_quote_ != '\0');
2475 
2476     /* set up our source pointer to the start of the new line */
2477     start_new_line(p, linebuf_.get_text_len());
2478 
2479     /* skip leading whitespace */
2480     while (is_space(p_.getch()))
2481         p_.inc();
2482 
2483     /*
2484      *   If this line begins with a '#', process the directive.  Ignore
2485      *   any initial '#' if the line started off in a string.
2486      */
2487     if (!started_in_string && p_.getch() == '#' && allow_pp_)
2488     {
2489         struct pp_kw_def
2490         {
2491             const char *kw;
2492             int process_in_false_if;
2493             void (CTcTokenizer::*func)();
2494         };
2495         static pp_kw_def kwlist[] =
2496         {
2497             { "charset", FALSE, &CTcTokenizer::pp_charset },
2498             { "pragma",  FALSE, &CTcTokenizer::pp_pragma },
2499             { "include", FALSE, &CTcTokenizer::pp_include },
2500             { "define",  FALSE, &CTcTokenizer::pp_define },
2501             { "if",      TRUE,  &CTcTokenizer::pp_if },
2502             { "ifdef",   TRUE,  &CTcTokenizer::pp_ifdef },
2503             { "ifndef",  TRUE,  &CTcTokenizer::pp_ifndef },
2504             { "else",    TRUE,  &CTcTokenizer::pp_else },
2505             { "elif",    TRUE,  &CTcTokenizer::pp_elif },
2506             { "endif",   TRUE,  &CTcTokenizer::pp_endif },
2507             { "error",   FALSE, &CTcTokenizer::pp_error },
2508             { "undef",   FALSE, &CTcTokenizer::pp_undef },
2509             { "line",    FALSE, &CTcTokenizer::pp_line },
2510             { 0, 0, 0 }
2511         };
2512         pp_kw_def *kwp;
2513         const char *kwtxt;
2514         size_t kwlen;
2515 
2516         /* skip the '#' */
2517         p_.inc();
2518 
2519         /*
2520          *   If the line ended inside a comment, read the next line until
2521          *   we're no longer in a comment.  The ANSI C preprocessor rules
2522          *   say that a newline in a comment should not be treated as a
2523          *   lexical newline, so pretend that the next line is part of the
2524          *   preprocessor line in such a case.
2525          */
2526         while (str_->is_in_comment())
2527         {
2528             size_t p_ofs;
2529 
2530             /* remember the current offset in the line buffer */
2531             p_ofs = p_.getptr() - linebuf_.get_buf();
2532 
2533             /* append another line - stop at the end of the stream */
2534             if (read_line(TRUE))
2535                 break;
2536 
2537             /* restore the line pointer, in case the buffer moved */
2538             start_new_line(linebuf_.get_buf() + p_ofs,
2539                            linebuf_.get_text_len() - p_ofs);
2540         }
2541 
2542         /* read the directive */
2543         next_on_line();
2544 
2545         /*
2546          *   if we've reached the end of the line, it's a null directive;
2547          *   simply return an empty line
2548          */
2549         if (curtok_.gettyp() == TOKT_EOF)
2550         {
2551             clear_linebuf();
2552             return 0;
2553         }
2554 
2555         /* get the text and length of the keyword */
2556         kwtxt = curtok_.get_text();
2557         kwlen = curtok_.get_text_len();
2558 
2559         /* if it's not a symbol, it's not a valid directive */
2560         if (curtok_.gettyp() != TOKT_SYM)
2561         {
2562             /* log the error and return an empty line */
2563             log_error(TCERR_INV_PP_DIR, (int)kwlen, kwtxt);
2564             clear_linebuf();
2565             return 0;
2566         }
2567 
2568         /* determine which keyword we have, and process it */
2569         for (kwp = kwlist ; kwp->kw != 0 ; ++kwp)
2570         {
2571             /* is this our keyword? */
2572             if (strlen(kwp->kw) == kwlen
2573                 && memcmp(kwtxt, kwp->kw, kwlen) == 0)
2574             {
2575                 /*
2576                  *   This is our directive.
2577                  *
2578                  *   If we're in the false branch of a #if block, only
2579                  *   process the directive if it's a kind of directive
2580                  *   that we should process in false #if branches.  The
2581                  *   only directives that we process in #if branches are
2582                  *   those that would affect the #if branching, such as a
2583                  *   #endif or a nested #if.
2584                  */
2585                 if (!in_false_if() || kwp->process_in_false_if)
2586                 {
2587                     /* invoke the handler to process the directive */
2588                     (this->*(kwp->func))();
2589                 }
2590                 else
2591                 {
2592                     /*
2593                      *   we're in a #if branch not taken - simply clear
2594                      *   the buffer
2595                      */
2596                     clear_linebuf();
2597                 }
2598 
2599                 /* we don't need to look any further */
2600                 break;
2601             }
2602         }
2603 
2604         /*
2605          *   if we didn't find the keyword, log an error and otherwise
2606          *   ignore the entire line
2607          */
2608         if (kwp->kw == 0)
2609             log_error(TCERR_INV_PP_DIR, (int)kwlen, kwtxt);
2610 
2611         /*
2612          *   Preprocessor lines must always be entirely self-contained.
2613          *   Therefore, it's not valid for a string to start on a
2614          *   preprocessor line and continue onto subsequent lines.  If
2615          *   we're marked as being inside a string, there must have been
2616          *   an error on the preprocessor line.  Simply clear the
2617          *   in-string flag; we don't need to issue an error at this
2618          *   point, since the preprocessor line handler should have
2619          *   already caught the problem and reported an error.
2620          */
2621         in_quote_ = '\0';
2622     }
2623     else
2624     {
2625         /*
2626          *   There's no preprocessor directive.
2627          *
2628          *   If we're in a false #if branch, return an empty line.  We
2629          *   return an empty line rather than skipping to the next line so
2630          *   that the caller sees the same number of lines as are in the
2631          *   original source.
2632          */
2633         if (in_false_if())
2634         {
2635             /*
2636              *   it's a #if not taken - we don't want to compile the line
2637              *   at all, so just clear it out
2638              */
2639             clear_linebuf();
2640             expbuf_.clear_text();
2641         }
2642         else
2643         {
2644             /*
2645              *   If we ended the line in a string, splice additional lines
2646              *   onto the end of this line until we find the end of the
2647              *   string, then unsplice the part after the end of the
2648              *   string.
2649              */
2650             if (in_quote_ != '\0')
2651             {
2652                 /* splice additional lines to finish the quote */
2653                 splice_string();
2654             }
2655 
2656             /*
2657              *   Expand macros in the line, splicing additional source
2658              *   lines if necessary to fill out any incomplete actual
2659              *   parameter lists.
2660              */
2661             start_new_line(linebuf_.get_buf(), linebuf_.get_text_len());
2662             expand_macros_curline(TRUE, FALSE, FALSE);
2663         }
2664 
2665         /* store the line in the appropriate place */
2666         if (pp_only_mode_)
2667         {
2668             /*
2669              *   we're only preprocessing - store the macro-expanded line
2670              *   back in the line buffer so that the caller can read out
2671              *   the final preprocessed text
2672              */
2673             linebuf_.copy(expbuf_.get_text(), expbuf_.get_text_len());
2674         }
2675         else
2676         {
2677             /*
2678              *   We're compiling - simply read subsequent tokens out of
2679              *   the expansion buffer.
2680              */
2681             start_new_line(expbuf_.get_buf(), expbuf_.get_text_len());
2682         }
2683     }
2684 
2685     /* return success */
2686     return 0;
2687 }
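
/*
 *   Directive dispatch example (hypothetical input line): for a source
 *   line reading
 *
 *      #  ifdef __DEBUG
 *
 *   we skip the leading whitespace and the '#', next_on_line() hands back
 *   the symbol 'ifdef', the table lookup selects &CTcTokenizer::pp_ifdef,
 *   and because 'ifdef' is marked process_in_false_if, the handler runs
 *   even inside a false #if branch, keeping the #if/#endif nesting
 *   balanced.
 */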
2688 
2689 /* ------------------------------------------------------------------------ */
2690 /*
2691  *   Read the next line from the input file.  Returns a pointer to the
2692  *   start of the newly-read data on success, or null if we reach the end
2693  *   of the input.
2694  *
2695  *   If 'append' is true, we'll add the line on to the end of the existing
2696  *   buffer; otherwise, we'll overwrite what's in the buffer.
2697  *
2698  *   The only preprocessing performed in this routine is line-splicing.
2699  *   Any line that ends with a backslash character will be spliced with
2700  *   the following line, with the backslash and newline removed.
2701  *
2702  *   The new line will be stored in our internal buffer, and will be
2703  *   null-terminated with the trailing newline removed.
2704  *
2705  *   If we reach the end of the current file, and there's an enclosing
2706  *   file, we'll resume reading from the enclosing file.  Hence, when this
2707  *   routine returns non-zero, it indicates that we've reached the end of
2708  *   the entire source, not just of the current file.
2709  */
2710 char *CTcTokenizer::read_line(int append)
2711 {
2712     size_t len;
2713     size_t start_len;
2714 
2715     /* if there's no input stream, indicate end-of-file */
2716     if (str_ == 0)
2717         return 0;
2718 
2719     /* if we're not appending, clear out the line buffer */
2720     if (!append)
2721     {
2722         /* start with an empty line */
2723         clear_linebuf();
2724 
2725         /* note the current input position */
2726         last_desc_ = str_->get_desc();
2727         last_linenum_ = str_->get_next_linenum();
2728     }
2729 
2730     /* note where the new data starts */
2731     len = linebuf_.get_text_len();
2732     start_len = len;
2733 
2734     /*
2735      *   if there's anything in the unsplice buffer, use it as the new
2736      *   line
2737      */
2738     if (unsplicebuf_.get_text_len() != 0)
2739     {
2740         /*
2741          *   Copy the unsplice buffer as the current line.  Note that we
2742          *   don't have to worry about any of the complicated cases, such
2743          *   as whether or not it ends with a newline or a backslash,
2744          *   because the unspliced line was already processed as an input
2745          *   line when we read it in the first place.
2746          */
2747         linebuf_.append(unsplicebuf_.get_text(), unsplicebuf_.get_text_len());
2748 
2749         /* clear the unsplice buffer, since it's been consumed now */
2750         unsplicebuf_.clear_text();
2751 
2752         /*
2753          *   make the current line the appended line - if we're
2754          *   unsplicing, it means that we appended, so the current line is
2755          *   now the line from which the last appended text came
2756          */
2757         last_desc_ = appended_desc_;
2758         last_linenum_ = appended_linenum_;
2759 
2760         /* return the new text */
2761         return linebuf_.get_buf() + start_len;
2762     }
2763 
2764     /* if we're appending, note where the appendage is coming from */
2765     if (append)
2766     {
2767         /* remember the last source line appended */
2768         appended_desc_ = str_->get_desc();
2769         appended_linenum_ = str_->get_next_linenum();
2770     }
2771 
2772     /* keep going until we finish reading the input line */
2773     for ( ;; )
2774     {
2775         size_t curlen;
2776 
2777         /* read a line of text from the input file */
2778         curlen = str_->get_src()->
2779                  read_line(linebuf_.get_buf() + len,
2780                            linebuf_.get_buf_size() - len);
2781 
2782         /* check for end of file */
2783         if (curlen == 0)
2784         {
2785             CTcTokStream *old_str;
2786 
2787             /*
2788              *   We've reached the end of the current input stream.  If
2789              *   we've already read anything into the current line, it
2790              *   means that the file ended in mid-line, without a final
2791              *   newline character; ignore this and proceed with the line
2792              *   as it now stands in this case.
2793              */
2794             if (len > start_len)
2795                 break;
2796 
2797             /*
2798              *   We've finished with this stream.  If there's a parent
2799              *   stream, return to it; otherwise, we're at the end of the
2800              *   source.
2801              */
2802 
2803             /*
2804              *   if we didn't close all of the #if/#ifdef levels opened
2805              *   within this file, flag one or more errors
2806              */
2807             while (if_sp_ > str_->get_init_if_level())
2808             {
2809                 const char *fname;
2810 
2811                 /* get the filename from the #if stack */
2812                 fname = if_stack_[if_sp_ - 1].desc->get_fname();
2813 
2814                 /* if we're in test reporting mode, use the root name only */
2815                 if (test_report_mode_)
2816                     fname = os_get_root_name((char *)fname);
2817 
2818                 /* log the error */
2819                 log_error(TCERR_IF_WITHOUT_ENDIF,
2820                           if_stack_[if_sp_ - 1].linenum,
2821                           (int)strlen(fname), fname);
2822 
2823                 /* discard the #if level */
2824                 pop_if();
2825             }
2826 
2827             /* remember the old stream */
2828             old_str = str_;
2829 
2830             /* return to the parent stream, if there is one */
2831             str_ = str_->get_parent();
2832 
2833             /* delete the old stream now that we're done with it */
2834             delete old_str;
2835 
2836             /* note the new file the line will be coming from */
2837             if (!append && str_ != 0)
2838             {
2839                 last_desc_ = str_->get_desc();
2840                 last_linenum_ = str_->get_next_linenum();
2841             }
2842 
2843             /* if there's no stream, return end of file */
2844             if (str_ == 0)
2845                 return 0;
2846 
2847             /*
2848              *   restore the #pragma newline_spacing mode that was in effect
2849              *   when we interrupted the parent stream
2850              */
2851             string_newline_spacing_ = str_->get_newline_spacing();
2852 
2853             /* if there's a parser, notify it of the new pragma C mode */
2854             // if (G_prs != 0)
2855             //    G_prs->set_pragma_c(str_->is_pragma_c());
2856 
2857             /* go back to read the next line from the parent */
2858             continue;
2859         }
2860 
2861         /* set the new length of the buffer contents */
2862         len += curlen - 1;
2863         linebuf_.set_text_len(len);
2864 
2865         /*
2866          *   Check the result to see if it ends in a newline.  If not, it
2867          *   means either that we don't have room in the buffer for the
2868          *   full source line, or we've reached the last line in the file,
2869          *   and it doesn't end with a newline.
2870          *
2871          *   Note that the file reader will always supply us with '\n'
2872          *   newlines, regardless of the local operating system
2873          *   conventions.
2874          *
2875          *   Also, check to see if the line ends with '\\'.  If so, remove
2876          *   the '\\' character and read the next line, since this
2877          *   indicates that the logical line continues onto the next
2878          *   newline-delimited line.
2879          */
2880         if (len != 0 && linebuf_.get_text()[len - 1] != '\n')
2881         {
2882             /*
2883              *   There's no newline, hence the file reader wasn't able to
2884              *   fit the entire line into our buffer, or else we've read
2885              *   the last line in the file and there's no newline at the
2886              *   end.  If we haven't reached the end of the file, expand
2887              *   our line buffer to make room to read more from this same
2888              *   line.
2889              */
2890             if (!str_->get_src()->at_eof())
2891                 linebuf_.expand();
2892         }
2893         else if (len > 1 && linebuf_.get_text()[len - 2] == '\\')
2894         {
2895             /*
2896              *   There's a backslash at the end of the line, so they want
2897              *   to continue this logical line.  Remove the backslash, and
2898              *   read the next line onto the end of the current line.
2899              *
2900              *   Note that we must remove two characters from the end of
2901              *   the line (and tested for buf_[len-2] above) because we
2902              *   have both a backslash and a newline at the end of the
2903              *   line.
2904              */
2905             len -= 2;
2906             linebuf_.set_text_len(len);
2907 
2908             /* count reading the physical line */
2909             str_->count_line();
2910         }
2911         else
2912         {
2913             /* remove the newline from the buffer */
2914             if (len != 0)
2915             {
2916                 --len;
2917                 linebuf_.set_text_len(len);
2918             }
2919 
2920             /* count reading the line */
2921             str_->count_line();
2922 
2923             /* done */
2924             break;
2925         }
2926     }
2927 
2928     /*
2929      *   remove comments from the newly-read material - this replaces each
2930      *   comment by a single whitespace character
2931      */
2932     process_comments(start_len);
2933 
2934     /*
2935      *   we've successfully read a line -- return a pointer to the start
2936      *   of the newly-read text
2937      */
2938     return linebuf_.get_buf() + start_len;
2939 }
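
/*
 *   Splicing example (hypothetical source): the two physical lines
 *
 *      #define SQUARE(x) \
 *          ((x) * (x))
 *
 *   come back from a single call as one logical line,
 *   '#define SQUARE(x)     ((x) * (x))' - the backslash and its newline
 *   are dropped and the next physical line is appended, while each
 *   physical line is still counted via count_line() so that reported
 *   line numbers stay accurate.
 */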
2940 
2941 /*
2942  *   Un-splice a line at the given point.  This breaks the current source
2943  *   line in two, keeping the part before the given point as the current
2944  *   line, but making the part from the given point to the end of the line
2945  *   a new source line.  We'll put the new source line into a special
2946  *   holding buffer, and then fetch this part as a new line the next time
2947  *   we read a line in read_line().
2948  */
2949 void CTcTokenizer::unsplice_line(const char *new_line_start)
2950 {
2951     size_t keep_len;
2952 
2953     /* make sure the starting point is within the current line */
2954     if (!(new_line_start >= linebuf_.get_text()
2955           && new_line_start <= linebuf_.get_text() + linebuf_.get_text_len()))
2956     {
2957         /* note the error - this is an internal problem */
2958         throw_internal_error(TCERR_UNSPLICE_NOT_CUR);
2959         return;
2960     }
2961 
2962     /*
2963      *   make sure the unsplice buffer is empty - we should never have to
2964      *   unsplice from a line more than once, because unsplicing should
2965      *   terminate the current line at the current point
2966      */
2967     if (unsplicebuf_.get_text_len() != 0)
2968     {
2969         throw_internal_error(TCERR_MULTI_UNSPLICE);
2970         return;
2971     }
2972 
2973     /* calculate the length of the part we're keeping */
2974     keep_len = new_line_start - linebuf_.get_text();
2975 
2976     /* put the remainder of the current line in the unsplice buffer */
2977     unsplicebuf_.append(new_line_start, linebuf_.get_text_len() - keep_len);
2978 
2979     /* cut off the current line at the given point */
2980     linebuf_.set_text_len(keep_len);
2981 }
2982 
2983 
2984 /* ------------------------------------------------------------------------ */
2985 /*
2986  *   Store text in the source array
2987  */
2988 const char *CTcTokenizer::store_source(const char *txt, size_t len)
2989 {
2990     const char *p;
2991 
2992     /* reserve space in the source array */
2993     reserve_source(len);
2994 
2995     /* remember where the string starts */
2996     p = src_ptr_;
2997 
2998     /* store the text */
2999     memcpy(src_ptr_, txt, len);
3000 
3001     /* advance the source block write position and length */
3002     src_ptr_ += len;
3003     src_rem_ -= len;
3004 
3005     /* null-terminate the copied text */
3006     *src_ptr_++ = '\0';
3007     --src_rem_;
3008 
3009     /* return the storage pointer */
3010     return p;
3011 }
3012 
3013 /*
3014  *   Reserve space for text in the source array
3015  */
3016 void CTcTokenizer::reserve_source(size_t len)
3017 {
3018     /*
3019      *   if we don't have enough space for this line in the current source
3020      *   block, start a new block
3021      */
3022     if (len + 1 > src_rem_)
3023     {
3024         CTcTokSrcBlock *blk;
3025 
3026         /*
3027          *   if the line is too long for a source block, throw a fatal
3028          *   error
3029          */
3030         if (len + 1 > TCTOK_SRC_BLOCK_SIZE)
3031             throw_fatal_error(TCERR_SRCLINE_TOO_LONG,
3032                               (long)TCTOK_SRC_BLOCK_SIZE);
3033 
3034         /* allocate a new block */
3035         blk = new CTcTokSrcBlock();
3036 
3037         /* link it into our list */
3038         src_cur_->set_next(blk);
3039 
3040         /* it's now the current block */
3041         src_cur_ = blk;
3042 
3043         /* start writing at the start of this block */
3044         src_rem_ = TCTOK_SRC_BLOCK_SIZE;
3045         src_ptr_ = blk->get_buf();
3046     }
3047 }
3048 
3049 /*
3050  *   Commit space previously reserved and now used in the source block
3051  *   list
3052  */
3053 void CTcTokenizer::commit_source(size_t len)
3054 {
3055     /* advance the write position past the committed text */
3056     src_ptr_ += len;
3057     src_rem_ -= len;
3058 }
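
/*
 *   Taken together, reserve_source(), store_source(), and commit_source()
 *   act as a simple grow-only arena over the CTcTokSrcBlock list:
 *   reserve_source() guarantees contiguous room in the current block
 *   (chaining a new block if needed), callers either copy text through
 *   store_source() or write directly at src_ptr_ and then commit the
 *   bytes actually used, and nothing is released piecemeal - which is
 *   what lets token text stored here stay valid for the rest of the
 *   compilation.
 */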
3059 
3060 
3061 /* ------------------------------------------------------------------------ */
3062 /*
3063  *   Expand macros in the current line from the current source pointer,
3064  *   filling in expbuf_ with the expanded result.
3065  */
3066 int CTcTokenizer::expand_macros_curline(int read_more, int allow_defined,
3067                                         int append_to_expbuf)
3068 {
3069     int err;
3070     utf8_ptr p;
3071     char *src;
3072     char *dst;
3073 
3074     /* expand macros in the current line */
3075     err = expand_macros(&linebuf_, &p_, &expbuf_, read_more, allow_defined,
3076                         append_to_expbuf);
3077 
3078     /* if that failed, return an error */
3079     if (err != 0)
3080         return err;
3081 
3082     /*
3083      *   If we're not in preprocessor mode, there's no need to remove the
3084      *   FULLY_EXPANDED flag bytes, since the tokenizer will know to skip
3085      *   them.
3086      */
3087     if (!pp_only_mode_)
3088         return err;
3089 
3090     /*
3091      *   Scan the expansion buffer and remove all of the no-more-expansion
3092      *   flag bytes - we're done expanding the macro now, so we don't need
3093      *   this information any longer.  When we're preprocessing the file,
3094      *   we don't want to leave these in the expanded source.
3095      */
3096     for (src = dst = expbuf_.get_buf(), p.set(src) ; p.getch() != '\0' ; )
3097     {
3098         /* if this isn't a macro flag, copy it */
3099         if (p.getch() == TOK_MACRO_EXP_END)
3100         {
3101             /* skip the flag byte and the following embedded pointer */
3102             src += 1 + sizeof(CTcHashEntryPp *);
3103             p.set(src);
3104         }
3105         else if (p.getch() == TOK_FULLY_EXPANDED_FLAG)
3106         {
3107             /* skip the flag byte */
3108             ++src;
3109             p.set(src);
3110         }
3111         else
3112         {
3113             /* skip this character */
3114             p.inc();
3115 
3116             /* copy the bytes of this character as-is */
3117             while (src < p.getptr())
3118                 *dst++ = *src++;
3119         }
3120     }
3121 
3122     /* set the new buffer length */
3123     expbuf_.set_text_len(dst - expbuf_.get_buf());
3124 
3125     /* return the result */
3126     return err;
3127 }
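
/*
 *   Sketch of the flag stripping above (hypothetical buffer contents):
 *   if expansion left the bytes
 *
 *      'f' 'o' 'o' TOK_FULLY_EXPANDED_FLAG 'b' 'a' 'r'
 *
 *   in expbuf_, the copy loop drops the flag byte (and, for a
 *   TOK_MACRO_EXP_END marker, the CTcHashEntryPp pointer embedded after
 *   it), leaving just 'foobar' in the preprocessed output.
 */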
3128 
3129 /* ------------------------------------------------------------------------ */
3130 /*
3131  *   Expand macros in the current line, reading additional source lines if
3132  *   necessary.
3133  *
3134  *   'src' is a pointer to the start of the text to expand; it must point
3135  *   into the 'srcbuf' buffer.  If 'src' is null, we'll simply start at
3136  *   the beginning of the source buffer.
3137  */
3138 int CTcTokenizer::expand_macros(CTcTokString *srcbuf, utf8_ptr *src,
3139                                 CTcTokString *expbuf, int read_more,
3140                                 int allow_defined, int append)
3141 {
3142     tc_toktyp_t typ;
3143     CTcToken tok;
3144     CTcTokString *subexp;
3145     size_t startofs;
3146     utf8_ptr local_src;
3147     CTcTokStringRef local_srcbuf;
3148     CTcMacroRsc *res;
3149     int err;
3150 
3151     /* presume success */
3152     err = 0;
3153 
3154     /* get a macro expansion resource object */
3155     res = alloc_macro_rsc();
3156     if (res == 0)
3157         return 1;
3158 
3159     /* get our subexpression buffer from the resource object */
3160     subexp = &res->line_exp_;
3161 
3162     /* if there's no source buffer or source pointer, provide one */
3163     if (srcbuf == 0)
3164     {
3165         /*
3166          *   there's no source buffer - provide our own non-allocated
3167          *   buffer tied to the caller's buffer
3168          */
3169         local_srcbuf.set_buffer(src->getptr(), strlen(src->getptr()));
3170         srcbuf = &local_srcbuf;
3171     }
3172     else if (src == 0)
3173     {
3174         /*
3175          *   there's no source pointer - start at the beginning of the
3176          *   source buffer
3177          */
3178         local_src.set((char *)srcbuf->get_text());
3179         src = &local_src;
3180     }
3181 
3182     /* clear the expansion buffer, unless we're appending to the buffer */
3183     if (!append)
3184         expbuf->clear_text();
3185 
3186     /*
3187      *   Make sure we have room for a copy of the source line.  This is an
3188      *   optimization for the simple case where we'll just copy the source
3189      *   line unchanged, so that we don't have to repeatedly expand the
3190      *   buffer; we will, however, expand the buffer dynamically later, if
3191      *   this pre-allocation should prove to be insufficient.
3192      */
3193     expbuf->ensure_space(expbuf->get_text_len() + srcbuf->get_text_len());
3194 
3195     /* note the starting offset, if we have an underlying string buffer */
3196     startofs = src->getptr() - srcbuf->get_text();
3197 
3198     /* read the first token */
3199     typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
3200 
3201     /* scan through the tokens on the line, looking for macros to expand */
3202     while (typ != TOKT_EOF)
3203     {
3204         /*
3205          *   if it's a symbol, and it hasn't already been marked as fully
3206          *   expanded, look it up in the #define table
3207          */
3208         if (typ == TOKT_SYM && !tok.get_fully_expanded())
3209         {
3210             CTcHashEntryPp *entry;
3211 
3212             /*
3213              *   Look up the symbol in the #define symbol table.  If we
3214              *   find it, expand the macro.  Otherwise, if the "defined"
3215              *   operator is active, check for that.
3216              *
3217              *   Do not expand the macro if we find that it has already
3218              *   been expanded on a prior scan through the current text.
3219              */
3220             entry = find_define(tok.get_text(), tok.get_text_len());
3221             if ((entry != 0
3222                  && !scan_for_prior_expansion(*src, srcbuf->get_text_end(),
3223                                               entry))
3224                 || (allow_defined
3225                     && tok.get_text_len() == 7
3226                     && memcmp(tok.get_text(), "defined", 7) == 0))
3227             {
3228                 size_t macro_ofs;
3229                 size_t rem_len;
3230                 int expanded;
3231 
3232                 /* get the offset of the macro token in the source buffer */
3233                 macro_ofs = tok.get_text() - srcbuf->get_text();
3234 
3235                 /* expand it into our sub-expansion buffer */
3236                 if (entry != 0)
3237                 {
3238                     /* expand the macro */
3239                     err = expand_macro(res, subexp, srcbuf, src,
3240                                        macro_ofs, entry,
3241                                        read_more, allow_defined, &expanded);
3242                 }
3243                 else
3244                 {
3245                     /* parse and expand the defined() operator */
3246                     err = expand_defined(subexp, srcbuf, src);
3247 
3248                     /* "defined" always expands if there's not an error */
3249                     expanded = TRUE;
3250                 }
3251 
3252                 /* if an error occurred, return failure */
3253                 if (err)
3254                     goto done;
3255 
3256                 /*
3257                  *   if we expanded something, append everything we
3258                  *   skipped preceding the macro, then rescan; otherwise,
3259                  *   just keep going without a rescan
3260                  */
3261                 if (expanded)
3262                 {
3263                     /* copy the preceding text to the output */
3264                     expbuf->append(srcbuf->get_text() + startofs,
3265                                    macro_ofs - startofs);
3266                 }
3267                 else
3268                 {
3269                     /*
3270                      *   we didn't expand - get the next token after the
3271                      *   macro
3272                      */
3273                     typ = next_on_line(srcbuf, src, &tok,
3274                                        &macro_in_embedding_);
3275 
3276                     /* continue processing from this token */
3277                     continue;
3278                 }
3279 
3280                 /*
3281                  *   We must now insert the expansion into the source
3282                  *   buffer at the current point, and re-scan the
3283                  *   expansion, *along with* the rest of the original
3284                  *   source line (this is how ANSI C specifies the
3285                  *   process).
3286                  *
3287                  *   If we can read more, we must be reading out of the
3288                  *   main input line buffer, so insert the expansion text
3289                  *   directly into the original source stream, and
3290                  *   continue reading out of the source stream; this will
3291                  *   simplify the case where we must read more data from
3292                  *   the file in the course of the expansion.  If we can't
3293                  *   read more, simply copy the remainder of the current
3294                  *   input line onto the expanded macro and use it as the
3295                  *   new input buffer.
3296                  */
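                /*
                 *   A sketch of the effect (not taken from this file): given
                 *
                 *       #define TWO   ONE + 1
                 *       #define ONE   1
                 *
                 *   a line containing "TWO * 3" first expands TWO to
                 *   "ONE + 1", splices that text into the line in place of
                 *   the macro name, and then rescans it together with the
                 *   remaining " * 3", so ONE is expanded on the rescan and
                 *   the final result is "1 + 1 * 3".
                 */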
3297 
3298                 /* get the current offset in the source line */
3299                 startofs = src->getptr() - srcbuf->get_text();
3300 
3301                 /* figure out how much is left on the current line */
3302                 rem_len = srcbuf->get_text_len() - startofs;
3303 
3304                 /* check to see if we can read more */
3305                 if (read_more)
3306                 {
3307                     /*
3308                      *   we're reading from the original line input buffer
3309                      *   -- insert the expansion into the source buffer at
3310                      *   the current point, replacing the original macro
3311                      *   text
3312                      */
3313 
3314                     /* make sure we have room for adding the expansion text */
3315                     srcbuf->ensure_space(macro_ofs + rem_len
3316                                          + subexp->get_text_len());
3317 
3318                     /* make sure src is still pointing to the right place */
3319                     src->set(srcbuf->get_buf() + macro_ofs);
3320 
3321                     /* move the remainder of the current line to make room */
3322                     memmove(srcbuf->get_buf() + macro_ofs
3323                             + subexp->get_text_len(),
3324                             srcbuf->get_buf() + startofs,
3325                             rem_len);
3326 
3327                     /* insert the expansion text */
3328                     memcpy(srcbuf->get_buf() + macro_ofs, subexp->get_buf(),
3329                            subexp->get_text_len());
3330 
3331                     /* set the new source length */
3332                     srcbuf->set_text_len(macro_ofs + rem_len
3333                                          + subexp->get_text_len());
3334 
3335                     /* the new starting offset is the current position */
3336                     startofs = macro_ofs;
3337 
3338                     /* get the next token */
3339                     typ = next_on_line(srcbuf, src, &tok,
3340                                        &macro_in_embedding_);
3341 
3342                     /* continue processing from this token */
3343                     continue;
3344                 }
3345                 else
3346                 {
3347                     /*
3348                      *   we're reading from a read-only buffer -- add the
3349                      *   remainder of the source to the expansion buffer,
3350                      *   and recursively parse the remainder
3351                      */
3352                     subexp->append(srcbuf->get_text() + startofs, rem_len);
3353 
3354                     /*
3355                      *   evaluate the remainder recursively and append it
3356                      *   to the expansion already in progress
3357                      */
3358                     err = expand_macros(subexp, 0, expbuf, FALSE,
3359                                         allow_defined, TRUE);
3360 
3361                     /* we're done */
3362                     goto done;
3363                 }
3364             }
3365         }
3366 
3367         /* get the next token */
3368         typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
3369     }
3370 
3371     /* add the remainder of the source to the output */
3372     expbuf->append(srcbuf->get_text() + startofs,
3373                    tok.get_text() - startofs - srcbuf->get_text());
3374 
3375 done:
3376     /* release our macro resource object */
3377     release_macro_rsc(res);
3378 
3379     /* return the result */
3380     return err;
3381 }
3382 
3383 /*
3384  *   Allocate a macro resource object.  If we're out of resource objects
3385  *   in the pool, we'll add another object to the pool.
3386  */
3387 CTcMacroRsc *CTcTokenizer::alloc_macro_rsc()
3388 {
3389     CTcMacroRsc *rsc;
3390 
3391     /*
3392      *   if there's anything in the available list, take the first item
3393      *   off the list and return it
3394      */
3395     if (macro_res_avail_ != 0)
3396     {
3397         /* remember the item to return */
3398         rsc = macro_res_avail_;
3399 
3400         /* remove it from the list */
3401         macro_res_avail_ = macro_res_avail_->next_avail_;
3402 
3403         /* return it */
3404         return rsc;
3405     }
3406 
3407     /* there's nothing on the available list - allocate a new item */
3408     rsc = new CTcMacroRsc();
3409 
3410     /* if that failed, return failure */
3411     if (rsc == 0)
3412     {
3413         log_error(TCERR_OUT_OF_MEM_MAC_EXP);
3414         return 0;
3415     }
3416 
3417     /* add it onto the master list */
3418     rsc->next_ = macro_res_head_;
3419     macro_res_head_ = rsc;
3420 
3421     /* return it */
3422     return rsc;
3423 }
3424 
3425 /*
3426  *   Release a macro resource, returning it to the pool
3427  */
3428 void CTcTokenizer::release_macro_rsc(CTcMacroRsc *rsc)
3429 {
3430     /* put it back at the head of the available list */
3431     rsc->next_avail_ = macro_res_avail_;
3432     macro_res_avail_ = rsc;
3433 }
3434 
3435 /*
3436  *   Scan a buffer for a prior-expansion flag for a given macro.  We'll
3437  *   look through the buffer for a TOK_MACRO_EXP_END byte that mentions
3438  *   the given symbol table entry; we'll return true if found, false if
3439  *   not.  True means that the symbol has already been expanded on a prior
3440  *   scan of the text, so it should not be re-expanded now.
3441  */
3442 int CTcTokenizer::scan_for_prior_expansion(utf8_ptr src, const char *src_end,
3443                                            const CTcHashEntryPp *entry)
3444 {
3445     /* scan the buffer for the expansion flag byte */
3446     while (src.getptr() < src_end)
3447     {
3448         /* if this is the flag, check what follows */
3449         if (src.getch() == TOK_MACRO_EXP_END)
3450         {
3451             CTcHashEntryPp *flag_entry;
3452 
3453             /* read the entry from the buffer */
3454             memcpy(&flag_entry, src.getptr() + 1, sizeof(flag_entry));
3455 
3456             /* if it matches, indicate that we found it */
3457             if (entry == flag_entry)
3458                 return TRUE;
3459 
3460             /* it's not a match - keep scanning after this flag sequence */
3461             src.set(src.getptr() + 1 + sizeof(flag_entry));
3462         }
3463         else
3464         {
3465             /* it's not the flag - skip this character */
3466             src.inc();
3467         }
3468     }
3469 
3470     /* we didn't find it */
3471     return FALSE;
3472 }
3473 
3474 /*
3475  *   Go through a macro expansion and translate from end-of-expansion
3476  *   markers to individual token full-expansion markers.  This is used
3477  *   after we leave a recursion level to convert expanded text into text
3478  *   suitable for use in further expansion at an enclosing recursion
3479  *   level.
3480  */
3481 void CTcTokenizer::mark_full_exp_tokens(CTcTokString *dstbuf,
3482                                         const CTcTokString *srcbuf,
3483                                         int append) const
3484 {
3485     utf8_ptr p;
3486     CTcToken tok;
3487     const char *start;
3488     int in_embedding;
3489 
3490     /* clear the output buffer if we're not appending to existing text */
3491     if (!append)
3492         dstbuf->clear_text();
3493 
3494     /* remember the starting point */
3495     start = srcbuf->get_text();
3496 
3497     /* not in an embedded expression within the expansion text yet */
3498     in_embedding = FALSE;
3499 
3500     /* scan the source buffer */
3501     p.set((char *)start);
3502     for (;;)
3503     {
3504         CTcHashEntryPp *cur_entry;
3505         tc_toktyp_t typ;
3506         char ch;
3507 
3508         /* get the next token; stop at the end of the line */
3509         typ = next_on_line(srcbuf, &p, &tok, &in_embedding);
3510         if (typ == TOKT_EOF)
3511             break;
3512 
3513         /*
3514          *   if this macro token is being expanded, and it's not already
3515          *   marked for no more expansion, mark it
3516          */
3517         if (typ == TOKT_SYM
3518             && !tok.get_fully_expanded()
3519             && (cur_entry = find_define(tok.get_text(),
3520                                         tok.get_text_len())) != 0
3521             && scan_for_prior_expansion(p, srcbuf->get_text_end(), cur_entry))
3522         {
3523             /*
3524              *   This token has been fully expanded in the substitution
3525              *   buffer but hasn't yet been marked as such - we must
3526              *   insert the fully-expanded marker.  First, copy everything
3527              *   up to the current point to the output buffer.
3528              */
3529             if (tok.get_text() > start)
3530                 dstbuf->append(start, tok.get_text() - start);
3531 
3532             /* add the fully-expanded marker */
3533             ch = TOK_FULLY_EXPANDED_FLAG;
3534             dstbuf->append(&ch, 1);
3535 
3536             /* the new starting point is the start of the symbol token */
3537             start = tok.get_text();
3538         }
3539     }
3540 
3541     /* copy any remaining text to the output */
3542     if (tok.get_text() > start)
3543         dstbuf->append(start, tok.get_text() - start);
3544 
3545     /*
3546      *   Remove any macro expansion end markers from the output buffer.
3547      *   We don't want to leave these around, because they don't apply to
3548      *   the enclosing buffer into which we'll substitute this result.
3549      *   Note that we've already ensured that these markers will be
3550      *   respected for the substitution text by inserting "fully expanded"
3551      *   markers in front of each token to which any of the markers we're
3552      *   removing should apply.
3553      */
3554     remove_end_markers(dstbuf);
3555 }
3556 
3557 
3558 /*
3559  *   Remove end markers from a buffer
3560  */
3561 void CTcTokenizer::remove_end_markers(CTcTokString *buf)
3562 {
3563     char *src;
3564     char *dst;
3565     utf8_ptr p;
3566 
3567     /* scan the buffer */
3568     for (src = dst = buf->get_buf(), p.set(src) ;
3569          p.getptr() < buf->get_text_end() ; )
3570     {
3571         /* check for our flag */
3572         if (p.getch() == TOK_MACRO_EXP_END)
3573         {
3574             /* skip the flag byte and the following embedded pointer */
3575             src += 1 + sizeof(CTcHashEntryPp *);
3576             p.set(src);
3577         }
3578         else
3579         {
3580             /* skip this character */
3581             p.inc();
3582 
3583             /* copy the bytes of this character as-is */
3584             while (src < p.getptr())
3585                 *dst++ = *src++;
3586         }
3587     }
3588 
3589     /* set the new buffer size */
3590     buf->set_text_len(dst - buf->get_buf());
3591 }
3592 
3593 
3594 /*
3595  *   Expand the macro at the current token in the current line.
3596  *
3597  *   'src' is a pointer to the current position in 'srcbuf'.  We'll update
3598  *   'src' to point to the next token after the macro or its actual
3599  *   parameter list, if it has one.
3600  */
3601 int CTcTokenizer::expand_macro(CTcMacroRsc *rsc, CTcTokString *expbuf,
3602                                const CTcTokString *srcbuf, utf8_ptr *src,
3603                                size_t macro_srcbuf_ofs,
3604                                CTcHashEntryPp *entry, int read_more,
3605                                int allow_defined, int *expanded)
3606 {
3607     CTcTokString *subexp;
3608     size_t argofs[TOK_MAX_MACRO_ARGS];
3609     size_t arglen[TOK_MAX_MACRO_ARGS];
3610     size_t startofs;
3611     const char *start;
3612     const char *end;
3613     int err;
3614     char flagbuf[1 + sizeof(entry)];
3615 
3616     /* presume we won't do any expansion */
3617     *expanded = FALSE;
3618 
3619     /* get our resources */
3620     subexp = &rsc->macro_exp_;
3621 
3622     /* remember our parsing starting offset */
3623     startofs = src->getptr() - srcbuf->get_text();
3624 
3625     /* clear the expansion output buffer */
3626     expbuf->clear_text();
3627 
3628     /* if the macro has arguments, scan the actuals */
3629     if (entry->has_args())
3630     {
3631         int found_actuals;
3632 
3633         /* read the macro arguments */
3634         if (parse_macro_actuals(srcbuf, src, entry, argofs, arglen,
3635                                 read_more, &found_actuals))
3636         {
3637             err = 1;
3638             goto done;
3639         }
3640 
3641         /*
3642          *   If we found no actuals, then this wasn't really an invocation
3643          *   of the macro after all - a function-like macro invoked with
3644          *   no arguments is simply not replaced.  Store the original text
3645          *   in the output buffer and return success.
3646          */
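        /*
         *   For instance (a sketch, not from this file): with
         *   "#define F(x) ((x) * 2)", a line containing "F + 1" has no
         *   '(' after F, so F is left unchanged rather than expanded.
         */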
3647         if (!found_actuals)
3648         {
3649             /* copy the original text */
3650             expbuf->copy(srcbuf->get_text() + macro_srcbuf_ofs,
3651                          startofs - macro_srcbuf_ofs);
3652 
3653             /*
3654              *   restore the source read pointer to where it was when we
3655              *   started
3656              */
3657             src->set((char *)srcbuf->get_text() + startofs);
3658 
3659             /* return success */
3660             err = 0;
3661             goto done;
3662         }
3663     }
3664 
3665     /*
3666      *   if there are arguments, replace the macro and substitute actuals
3667      *   for the formals; otherwise, just copy the replacement text
3668      *   directly
3669      */
3670     if (entry->get_argc() != 0)
3671     {
3672         /* substitute the actuals */
3673         if (substitute_macro_actuals(rsc, subexp, entry, srcbuf,
3674                                      argofs, arglen, allow_defined))
3675         {
3676             err = 1;
3677             goto done;
3678         }
3679 
3680         /* set up to parse from the expansion buffer */
3681         start = subexp->get_text();
3682         end = start + subexp->get_text_len();
3683     }
3684     else
3685     {
3686         /*
3687          *   use our local source buffer that simply references the
3688          *   original expansion text, rather than making a copy of the
3689          *   expansion text
3690          */
3691         start = entry->get_expansion();
3692         end = start + entry->get_expan_len();
3693     }
3694 
3695     /* copy the expansion into the output buffer */
3696     expbuf->copy(start, end - start);
3697 
3698     /*
3699      *   After the end of the expansion sequence, insert the
3700      *   fully-expanded flag plus a pointer to the symbol table entry that
3701      *   we just expanded.  This will allow us to detect during the
3702      *   re-scan of the expansion text that this symbol has already been
3703      *   expanded, in which case we must suppress further expansion of the
3704      *   symbol.  This allows us to follow the ANSI C rules for recursive
3705      *   macro usage.
3706      */
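    /*
     *   Concretely, the marker appended below occupies 1 + sizeof(entry)
     *   bytes - e.g., five bytes with 4-byte pointers: the
     *   TOK_MACRO_EXP_END flag byte followed by the raw bytes of the
     *   'entry' pointer value.
     */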
3707     flagbuf[0] = TOK_MACRO_EXP_END;
3708     memcpy(&flagbuf[1], &entry, sizeof(entry));
3709     expbuf->append(flagbuf, sizeof(flagbuf));
3710 
3711     /* indicate that we expanded the macro */
3712     *expanded = TRUE;
3713 
3714     /* success */
3715     err = 0;
3716 
3717 done:
3718     /* return the result */
3719     return err;
3720 }
3721 
3722 /*
3723  *   Parse a macro's actual parameter list, filling in the given argument
3724  *   offset and length arrays.  Returns zero on success, non-zero on
3725  *   error.  'entry' is the macro's defining symbol table entry.
3726  */
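/*
 *   As a sketch of the results (not from this file): for an invocation
 *   such as "F( a, b + c )" in the source buffer, argofs[0]/arglen[0]
 *   will delimit "a" and argofs[1]/arglen[1] will delimit "b + c", with
 *   each actual's leading and trailing whitespace trimmed off.
 */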
3727 int CTcTokenizer::parse_macro_actuals(const CTcTokString *srcbuf,
3728                                       utf8_ptr *src,
3729                                       const CTcHashEntryPp *entry,
3730                                       size_t argofs[TOK_MAX_MACRO_ARGS],
3731                                       size_t arglen[TOK_MAX_MACRO_ARGS],
3732                                       int read_more, int *found_actuals)
3733 {
3734     tc_toktyp_t typ;
3735     CTcToken tok;
3736     int argc;
3737     int spliced;
3738     int i;
3739 
3740     /* presume we're not going to do any line splicing */
3741     spliced = FALSE;
3742 
3743     /* no arguments parsed yet */
3744     argc = 0;
3745 
3746     /* get the next token after the macro symbol */
3747     typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
3748 
3749     /* splice another line if necessary */
3750     if (typ == TOKT_EOF && read_more)
3751     {
3752         /* splice a line */
3753         typ = actual_splice_next_line(srcbuf, src, &tok);
3754 
3755         /* note the splice */
3756         spliced = TRUE;
3757     }
3758 
3759     /* if we didn't find an open paren, there's no actual list after all */
3760     if (typ != TOKT_LPAR)
3761     {
3762         /* tell the caller we didn't find any actuals */
3763         *found_actuals = FALSE;
3764 
3765         /* if we spliced a line, unsplice it at the current token */
3766         if (spliced)
3767             unsplice_line(tok.get_text());
3768 
3769         /* return success */
3770         return 0;
3771     }
3772 
3773     /* remember the offset of the start of the first argument */
3774     argofs[argc] = tok.get_text() + tok.get_text_len() - srcbuf->get_text();
3775 
3776     /* skip the open paren */
3777     typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
3778 
3779     /* read the arguments */
3780     while (typ != TOKT_RPAR)
3781     {
3782         utf8_ptr p;
3783         int paren_depth, bracket_depth, brace_depth;
3784         int sp_cnt;
3785 
3786         /* if we have too many arguments, it's an error */
3787         if ((argc >= entry->get_argc() && !entry->has_varargs())
3788             || argc >= TOK_MAX_MACRO_ARGS)
3789         {
3790             /* log the error */
3791             log_error(TCERR_PP_MANY_MACRO_ARGS,
3792                       (int)entry->getlen(), entry->getstr());
3793 
3794             /* scan ahead to the close paren or end of line */
3795             while (typ != TOKT_RPAR && typ != TOKT_EOF)
3796                 typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
3797 
3798             /* done scanning arguments */
3799             break;
3800         }
3801 
3802         /*
3803          *   skip tokens until we find a comma outside of nested parens,
3804          *   square brackets, or curly braces
3805          */
3806         paren_depth = bracket_depth = brace_depth = 0;
3807         while (paren_depth != 0
3808                || bracket_depth != 0
3809                || brace_depth != 0
3810                || (typ != TOKT_COMMA && typ != TOKT_RPAR))
3811         {
3812             /*
3813              *   if it's an open or close paren, brace, or bracket, adjust
3814              *   the depth accordingly
3815              */
3816             switch(typ)
3817             {
3818             case TOKT_LPAR:
3819                 ++paren_depth;
3820                 break;
3821 
3822             case TOKT_RPAR:
3823                 --paren_depth;
3824                 break;
3825 
3826             case TOKT_LBRACE:
3827                 ++brace_depth;
3828                 break;
3829 
3830             case TOKT_RBRACE:
3831                 --brace_depth;
3832                 break;
3833 
3834             case TOKT_LBRACK:
3835                 ++bracket_depth;
3836                 break;
3837 
3838             case TOKT_RBRACK:
3839                 --bracket_depth;
3840                 break;
3841             }
3842 
3843             /* get the next token */
3844             typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
3845 
3846             /*
3847              *   if we're at the end of the line, and we're allowed to
3848              *   read more, splice the next line onto the current line
3849              */
3850             if (typ == TOKT_EOF && read_more)
3851             {
3852                 /* splice a line */
3853                 typ = actual_splice_next_line(srcbuf, src, &tok);
3854 
3855                 /* note that we've done some line splicing */
3856                 spliced = TRUE;
3857             }
3858 
3859             /* if we've reached the end of the file, stop */
3860             if (typ == TOKT_EOF)
3861                 break;
3862         }
3863 
3864         /* if we've reached the end of the file, stop */
3865         if (typ == TOKT_EOF)
3866             break;
3867 
3868         /* remove any trailing whitespace from the actual's text */
3869         sp_cnt = 0;
3870         p.set((char *)tok.get_text());
3871         while (p.getptr() > srcbuf->get_text() + argofs[argc])
3872         {
3873             wchar_t ch;
3874 
3875             /* move to the prior character */
3876             p.dec();
3877 
3878             /* if it's not a space, stop looking */
3879             ch = p.getch();
3880             if (!is_space(ch))
3881             {
3882                 /*
3883                  *   advance past this character so that we keep it in the
3884                  *   expansion
3885                  */
3886                 p.inc();
3887 
3888                 /*
3889                  *   if this last character was a backslash, and we removed
3890                  *   at least one space following it, keep the one space
3891                  *   that immediately follows the backslash, since that
3892                  *   space is part of the backslash's two-character escape
3893                  *   sequence
3894                  */
3895                 if (ch == '\\' && sp_cnt != 0)
3896                     p.inc();
3897 
3898                 /* stop scanning */
3899                 break;
3900             }
3901 
3902             /* that's one more trailing space we've removed - count it */
3903             ++sp_cnt;
3904         }
3905 
3906         /* note the argument length */
3907         arglen[argc] = (p.getptr() - srcbuf->get_text()) - argofs[argc];
3908 
3909         /* count the argument */
3910         ++argc;
3911 
3912         /* check for another argument */
3913         if (typ == TOKT_COMMA)
3914         {
3915             /* remember the offset of the start of this argument */
3916             argofs[argc] = tok.get_text() + tok.get_text_len()
3917                            - srcbuf->get_text();
3918 
3919             /* skip the comma and go back for another argument */
3920             typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
3921         }
3922         else if (typ == TOKT_RPAR)
3923         {
3924             /*
3925              *   No need to look any further.  Note that we don't want to
3926              *   get another token, since we're done parsing the input
3927              *   now, and we want to leave the token stream positioned for
3928              *   the caller just after the extent of the macro, which, in
3929              *   the case of this function-like macro, ends with the
3930              *   closing paren.
3931              */
3932             break;
3933         }
3934     }
3935 
3936     /* if we didn't find the right paren, flag the error */
3937     if (typ != TOKT_RPAR)
3938     {
3939         log_error(read_more
3940                   ? TCERR_PP_MACRO_ARG_RPAR : TCERR_PP_MACRO_ARG_RPAR_1LINE,
3941                   (int)entry->getlen(), entry->getstr());
3942         return 1;
3943     }
3944 
3945     /* remove leading and trailing whitespace from each argument */
3946     for (i = 0 ; i < argc ; ++i)
3947     {
3948         const char *start;
3949         const char *end;
3950         utf8_ptr p;
3951         size_t del_len;
3952         int sp_cnt;
3953 
3954         /* figure the limits of the argument text */
3955         start = srcbuf->get_text() + argofs[i];
3956         end = start + arglen[i];
3957 
3958         /* remove leading whitespace */
3959         for (p.set((char *)start) ; p.getptr() < end && is_space(p.getch()) ;
3960              p.inc()) ;
3961 
3962         /* set the new offset and length */
3963         del_len = p.getptr() - start;
3964         argofs[i] += del_len;
3965         arglen[i] -= del_len;
3966         start += del_len;
3967 
3968         /* remove trailing whitespace */
3969         p.set((char *)end);
3970         sp_cnt = 0;
3971         while (p.getptr() > start)
3972         {
3973             wchar_t ch;
3974 
3975             /* go to the prior character */
3976             p.dec();
3977 
3978             /* if it's not whitespace, keep it */
3979             ch = p.getch();
3980             if (!is_space(ch))
3981             {
3982                 /* put the character back */
3983                 p.inc();
3984 
3985                 /*
3986                  *   if this is a backslash, and a space follows, keep the
3987                  *   immediately following space, since it's part of the
3988                  *   backslash sequence
3989                  */
3990                 if (ch == '\\' && sp_cnt != 0)
3991                     p.inc();
3992 
3993                 /* we're done scanning */
3994                 break;
3995             }
3996 
3997             /* count another removed trailing space */
3998             ++sp_cnt;
3999         }
4000 
4001         /* adjust the length */
4002         arglen[i] -= (end - p.getptr());
4003     }
4004 
4005     /*
4006      *   if we did any line splicing, cut off the rest of the line and
4007      *   push it back into the logical input stream as a new line - this
4008      *   will allow better error message positioning if errors occur in
4009      *   the remainder of the line, since this means we'll only
4010      *   artificially join onto one line the part of the new line that
4011      *   contained the macro parameters
4012      */
4013     if (spliced)
4014         unsplice_line(tok.get_text() + tok.get_text_len());
4015 
4016     /* make sure we found enough arguments */
4017     if (argc < entry->get_min_argc())
4018     {
4019         /* fill in the remaining arguments with empty strings */
4020         for ( ; argc < entry->get_argc() ; ++argc)
4021         {
4022             argofs[argc] = 0;
4023             arglen[argc] = 0;
4024         }
4025 
4026         /* note the error, but proceed with empty arguments */
4027         log_warning(TCERR_PP_FEW_MACRO_ARGS,
4028                     (int)entry->getlen(), entry->getstr());
4029     }
4030 
4031     /*
4032      *   if we have varargs, always supply an empty marker for the last
4033      *   argument
4034      */
4035     if (entry->has_varargs() && argc < TOK_MAX_MACRO_ARGS)
4036     {
4037         argofs[argc] = 0;
4038         arglen[argc] = 0;
4039     }
4040 
4041     /* success - we found an actual parameter list */
4042     *found_actuals = TRUE;
4043     return 0;
4044 }
4045 
4046 /*
4047  *   Splice a line for macro actual parameters.  Sets the source pointer
4048  *   to the start of the new line.  Reads the first token on the spliced
4049  *   line and returns it.
4050  *
4051  *   We will splice new lines until we find a non-empty line or reach the
4052  *   end of the input.  If this returns EOF, it indicates that we've
4053  *   reached the end of the entire input.
4054  */
4055 tc_toktyp_t CTcTokenizer::
4056    actual_splice_next_line(const CTcTokString *srcbuf,
4057                            utf8_ptr *src, CTcToken *tok)
4058 {
4059     /* add a space onto the end of the current line */
4060     linebuf_.append(" ", 1);
4061 
4062     /* keep going until we find a non-empty line */
4063     for (;;)
4064     {
4065         char *new_line_p;
4066         tc_toktyp_t typ;
4067 
4068         /* splice the next line onto the current line */
4069         new_line_p = read_line(TRUE);
4070 
4071         /*
4072          *   make sure we read additional lines as needed to complete any
4073          *   strings left open at the end of the line
4074          */
4075         if (in_quote_ != '\0')
4076             splice_string();
4077 
4078         /* if there was no more, return end of file */
4079         if (new_line_p == 0)
4080             return TOKT_EOF;
4081 
4082         /* set the source to the start of the additional line */
4083         src->set((char *)new_line_p);
4084 
4085         /* get the next token */
4086         typ = next_on_line(srcbuf, src, tok, &macro_in_embedding_);
4087 
4088         /* if we didn't get EOF, it means we found a non-empty line */
4089         if (typ != TOKT_EOF)
4090             return typ;
4091     }
4092 }
4093 
4094 /*
4095  *   Substitute the actual parameters in a macro's expansion
4096  */
4097 int CTcTokenizer::substitute_macro_actuals(CTcMacroRsc *rsc,
4098                                            CTcTokString *subexp,
4099                                            CTcHashEntryPp *entry,
4100                                            const CTcTokString *srcbuf,
4101                                            const size_t *argofs,
4102                                            const size_t *arglen,
4103                                            int allow_defined)
4104 {
4105     const char *start;
4106     utf8_ptr expsrc;
4107     CTcToken prvtok;
4108     CTcToken prvprvtok;
4109     CTcToken tok;
4110     tc_toktyp_t typ;
4111     const CVmHashTable *actuals;
4112     CTcTokString *actual_exp_buf;
4113     const size_t expand_max = 10;
4114     static struct expand_info_t
4115     {
4116         /* type of expansion (#foreach, #ifempty, #ifnempty) */
4117         tc_toktyp_t typ;
4118 
4119         /*
4120          *   flag: this is an iterator type (if this is true, the varargs
4121          *   formal should be expanded to the current argument given by our
4122          *   'arg' member; if this is false, the varargs formal should be
4123          *   expanded as the full varargs list)
4124          */
4125         int is_iterator;
4126 
4127         /* the marker character that delimits the foreach arguments */
4128         wchar_t delim;
4129 
4130         /* location of start of expansion region for foreach */
4131         utf8_ptr start;
4132 
4133         /* current argument index */
4134         int arg;
4135 
4136         /* the current expansion part (0 = first part, etc) */
4137         int part;
4138     }
4139     expand_stack[expand_max], *expand_sp;
4140 
4141     /* get the actuals table */
4142     actuals = entry->get_params_table();
4143 
4144     /* get the actual expansion buffer from the resource object */
4145     actual_exp_buf = &rsc->actual_exp_buf_;
4146 
4147     /*
4148      *   Scan the replacement text for formals, and replace each formal
4149      *   with the actual.  Set up a pointer at the start of the expansion
4150      *   text.
4151      */
4152     start = entry->get_expansion();
4153     expsrc.set((char *)start);
4154 
4155     /* we don't yet have a previous token */
4156     prvtok.settyp(TOKT_EOF);
4157     prvprvtok.settyp(TOKT_EOF);
4158 
4159     /* clear the expansion buffer */
4160     subexp->clear_text();
4161 
4162     /* we have no #foreach/#ifempty/#ifnempty stack yet */
4163     expand_sp = expand_stack;
4164 
4165     /* scan the tokens in the expansion text */
4166     for (typ = next_on_line(&expsrc, &tok, &macro_in_embedding_) ;
4167          typ != TOKT_EOF ; )
4168     {
4169         /*
4170          *   check to see if we've reached the end of a
4171          *   #foreach/#ifempty/#ifnempty
4172          */
4173         if (expand_sp != expand_stack)
4174         {
4175             /* check to see if we're at the delimiter */
4176             if (utf8_ptr::s_getch(tok.get_text()) == (expand_sp-1)->delim)
4177             {
4178                 /* copy the prior expansion so far */
4179                 if (tok.get_text() > start)
4180                     subexp->append(start, tok.get_text() - start);
4181 
4182                 /* go back to the start of the token */
4183                 expsrc.set((char *)tok.get_text());
4184 
4185                 /* see what kind of token we're expanding */
4186                 switch((expand_sp-1)->typ)
4187                 {
4188                 case TOKT_MACRO_FOREACH:
4189                     /* it's a #foreach - process the appropriate part */
4190                     switch ((expand_sp-1)->part)
4191                     {
4192                     case 0:
4193                         /*
4194                          *   We've been doing the first part, which is the
4195                          *   main expansion per actual.  This delimiter thus
4196                          *   introduces the 'between' portion, which we copy
4197                          *   between each iteration, but not after the last
4198                          *   iteration.  So, if we've just done the last
4199                          *   actual, skip this part entirely; otherwise,
4200                          *   keep going, using this part.
4201                          */
4202                         if (argofs[(expand_sp-1)->arg + 1] == 0)
4203                         {
4204                             /* skip this one remaining part */
4205                             skip_delimited_group(&expsrc, 1);
4206 
4207                             /* we're finished with the iteration */
4208                             goto end_foreach;
4209                         }
4210                         else
4211                         {
4212                             /*
4213                              *   we have more arguments, so we want to
4214                              *   expand this part - skip the delimiter and
4215                              *   keep going
4216                              */
4217                             expsrc.inc();
4218 
4219                             /* we're now in the next part of the iterator */
4220                             (expand_sp-1)->part++;
4221                         }
4222                         break;
4223 
4224                     case 1:
4225                         /*
4226                          *   We've reached the end of the entire #foreach
4227                          *   string, so we're done with this iteration.
4228                          *   Skip the delimiter.
4229                          */
4230                         expsrc.inc();
4231 
4232                     end_foreach:
4233                         /*
4234                          *   if we have more arguments, start over with the
4235                          *   next iteration; otherwise, pop the #foreach
4236                          *   level
4237                          */
4238                         if (argofs[(expand_sp-1)->arg + 1] == 0)
4239                         {
4240                             /* no more arguments - pop the #foreach level */
4241                             --expand_sp;
4242                         }
4243                         else
4244                         {
4245                             /* we have more arguments - move to the next */
4246                             (expand_sp-1)->arg++;
4247 
4248                             /* go back to the start of the expansion */
4249                             expsrc = (expand_sp-1)->start;
4250 
4251                             /* we have no previous token for pasting ops */
4252                             prvtok.settyp(TOKT_EOF);
4253                             prvprvtok.settyp(TOKT_EOF);
4254 
4255                             /* we're back in the first part of the iterator */
4256                             (expand_sp-1)->part = 0;
4257                         }
4258                         break;
4259                     }
4260                     break;
4261 
4262                 case TOKT_MACRO_IFEMPTY:
4263                 case TOKT_MACRO_IFNEMPTY:
4264                     /*
4265                      *   #ifempty or #ifnempty - we've reached the end of
4266                      *   the conditional text, so simply pop a level and
4267                      *   keep going after the delimiter
4268                      */
4269 
4270                     /* skip the delimiter */
4271                     expsrc.inc();
4272 
4273                     /* pop a level */
4274                     --expand_sp;
4275 
4276                     /* done */
4277                     break;
4278                 }
4279 
4280                 /* the next chunk starts here */
4281                 start = expsrc.getptr();
4282 
4283                 /* get the next token */
4284                 typ = next_on_line(&expsrc, &tok, &macro_in_embedding_);
4285 
4286                 /* we have the next token, so go back and process it */
4287                 continue;
4288             }
4289         }
4290 
4291         /* if it's a #foreach marker, start a #foreach iteration */
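        /*
         *   A sketch of the syntax this handles (the exact form shown is
         *   an assumption, not quoted from the language documentation):
         *   in a definition such as
         *
         *       #define SUM(x...)  (#foreach/x/ + /)
         *
         *   the character following #foreach ('/') is the delimiter, the
         *   first delimited section ("x") is expanded once per varargs
         *   actual, and the second section (" + ") is copied between
         *   iterations but not after the last, so SUM(1, 2, 3) would
         *   yield "(1 + 2 + 3)".
         */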
4292         if (typ == TOKT_MACRO_FOREACH && entry->has_varargs())
4293         {
4294             /* copy the prior expansion so far */
4295             if (tok.get_text() > start)
4296                 subexp->append(start, tok.get_text() - start);
4297 
4298             /* push a #foreach level, if possible */
4299             if (expand_sp - expand_stack >= expand_max)
4300             {
4301                 /*
4302                  *   we can't create another level - log an error and ignore
4303                  *   this new level
4304                  */
4305                 log_error(TCERR_PP_FOREACH_TOO_DEEP);
4306             }
4307             else if (argofs[entry->get_argc() - 1] == 0)
4308             {
4309                 /*
4310                  *   we have no actuals for the variable part of the
4311                  *   formals, so we must iterate zero times through the
4312                  *   #foreach part - in other words, simply skip ahead to
4313                  *   the end of the #foreach
4314                  */
4315                 skip_delimited_group(&expsrc, 2);
4316             }
4317             else
4318             {
4319                 /* remember and skip the marker character */
4320                 expand_sp->delim = expsrc.getch();
4321                 expsrc.inc();
4322 
4323                 /* set the expansion type */
4324                 expand_sp->typ = typ;
4325 
4326                 /*
4327                  *   remember the position where the #foreach started, since
4328                  *   we need to come back here for each use of the variable
4329                  */
4330                 expand_sp->start = expsrc;
4331 
4332                 /* we're an iterator type */
4333                 expand_sp->is_iterator = TRUE;
4334 
4335                 /*
4336                  *   Start at the first argument in the variable part of the
4337                  *   argument list.  The last formal corresponds to the
4338                  *   first variable argument.
4339                  */
4340                 expand_sp->arg = entry->get_argc() - 1;
4341 
4342                 /* we're in the main expansion part of the expression */
4343                 expand_sp->part = 0;
4344 
4345                 /* push the new level */
4346                 ++expand_sp;
4347             }
4348 
4349             /* the next chunk starts here */
4350             start = expsrc.getptr();
4351 
4352             /* get the next token */
4353             typ = next_on_line(&expsrc, &tok, &macro_in_embedding_);
4354 
4355             /* we have the next token, so go back and process it */
4356             continue;
4357         }
4358 
4359         /* if it's a varargs #ifempty or #ifnempty flag, expand it */
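        /*
         *   Roughly (hedged; the exact syntax is an assumption here):
         *   "#ifempty/text/" expands the delimited text only when the
         *   varargs list is empty, and "#ifnempty/text/" only when it is
         *   non-empty; as with #foreach, the character after the keyword
         *   serves as the delimiter.
         */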
4360         if ((typ == TOKT_MACRO_IFEMPTY || typ == TOKT_MACRO_IFNEMPTY)
4361             && entry->has_varargs())
4362         {
4363             int is_empty;
4364             int expand;
4365 
4366             /* determine if the varargs list is empty or not */
4367             is_empty = (argofs[entry->get_argc() - 1] == 0);
4368 
4369             /*
4370              *   decide whether or not to expand it, according to the empty
4371              *   state and the flag type
4372              */
4373             expand = ((is_empty && typ == TOKT_MACRO_IFEMPTY)
4374                       || (!is_empty && typ == TOKT_MACRO_IFNEMPTY));
4375 
4376             /*
4377              *   if we're going to expand it, push a level; otherwise, just
4378              *   skip the entire expansion
4379              */
4380             if (expand)
4381             {
4382                 /* make sure we have room for another level */
4383                 if (expand_sp - expand_stack >= expand_max)
4384                 {
4385                     /* no room - log an error and ignore the new level */
4386                     log_error(TCERR_PP_FOREACH_TOO_DEEP);
4387                 }
4388                 else
4389                 {
4390                     /* remember and skip the delimiter */
4391                     expand_sp->delim = expsrc.getch();
4392                     expsrc.inc();
4393 
4394                     /*
4395                      *   we're not an iterator type, so inherit the
4396                      *   enclosing level's meaning of the varargs formal
4397                      */
4398                     if (expand_sp - expand_stack == 0)
4399                     {
4400                         /* outermost level - use the whole varargs list */
4401                         expand_sp->is_iterator = FALSE;
4402                     }
4403                     else
4404                     {
4405                         /* use the enclosing level's meaning */
4406                         expand_sp->is_iterator = (expand_sp-1)->is_iterator;
4407                         expand_sp->arg = (expand_sp-1)->arg;
4408                     }
4409 
4410                     /* set the expansion type */
4411                     expand_sp->typ = typ;
4412 
4413                     /* push the new level */
4414                     ++expand_sp;
4415                 }
4416             }
4417             else
4418             {
4419                 /* not expanding - just skip the entire expansion */
4420                 skip_delimited_group(&expsrc, 1);
4421             }
4422 
4423             /* the next chunk starts here */
4424             start = expsrc.getptr();
4425 
4426             /* get the next token */
4427             typ = next_on_line(&expsrc, &tok, &macro_in_embedding_);
4428 
4429             /* we have the next token, so go back and process it */
4430             continue;
4431         }
4432 
4433         /* if it's a varargs #argcount indicator, expand it */
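        /*
         *   For example (a sketch): with "#define COUNT(x...) #argcount",
         *   COUNT(a, b, c) would expand to "3" and COUNT() to "0", since
         *   only the variable portion of the argument list is counted.
         */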
4434         if (typ == TOKT_MACRO_ARGCOUNT && entry->has_varargs())
4435         {
4436             char buf[20];
4437             int i;
4438 
4439             /* copy the prior expansion so far */
4440             if (tok.get_text() > start)
4441                 subexp->append(start, tok.get_text() - start);
4442 
4443             /*
4444              *   count the number of arguments after and including the
4445              *   variable argument placeholder
4446              */
4447             for (i = entry->get_argc() - 1 ; argofs[i] != 0 ; ++i) ;
4448 
4449             /* make a string out of the variable argument count */
4450             sprintf(buf, "%d", i - (entry->get_argc() - 1));
4451 
4452             /* add the argument count to the output buffer */
4453             subexp->append(buf, strlen(buf));
4454 
4455             /* the next chunk starts after the #argcount */
4456             start = expsrc.getptr();
4457 
4458             /* get the next token */
4459             typ = next_on_line(&expsrc, &tok, &macro_in_embedding_);
4460 
4461             /* we have the next token, so go back and process it */
4462             continue;
4463         }
4464 
4465         /* if it's a symbol, check for an actual */
4466         if (typ == TOKT_MACRO_FORMAL)
4467         {
4468             const char *p;
4469             int argnum;
4470             size_t argnum_len;
4471             int pasting;
4472             int pasting_at_left, pasting_at_right;
4473             int stringize;
4474             char stringize_qu;
4475             tc_toktyp_t stringize_type;
4476             CTcToken paste_at_right_tok;
4477 
4478             /* assume we'll copy up to the start of this token */
4479             p = tok.get_text();
4480 
4481             /*
4482              *   get the index of the actual in the argument vector --
4483              *   this is given by the second byte of the special macro
4484              *   parameter flag token
4485              */
4486             argnum = (int)(uchar)tok.get_text()[1] - 1;
4487 
4488             /*
4489              *   If we have varargs, and this is the varargs argument, and
4490              *   the current #foreach stack level indicates that we're
4491              *   iterating through the varargs list, treat this as a
4492              *   reference to the current argument in the iteration.
4493              */
4494             if (expand_sp != expand_stack
4495                 && argnum == entry->get_argc() - 1
4496                 && (expand_sp-1)->is_iterator)
4497             {
4498                 /*
4499                  *   we're on a #foreach iterator, and this is the varargs
4500                  *   formal - use the current #foreach iteration element
4501                  *   instead
4502                  */
4503                 argnum = (expand_sp-1)->arg;
4504             }
4505 
4506             /*
4507              *   Get the length of this argument.  If we have varargs, and
4508              *   this is the last formal, which is the placeholder for the
4509              *   variable argument list, and we're not in a #foreach
4510              *   iterator, the value is the value of the entire string of
4511              *   variable arguments, including the commas.
4512              */
4513             if (expand_sp == expand_stack
4514                 && entry->has_varargs()
4515                 && argnum == entry->get_argc() - 1)
4516             {
4517                 int i;
4518 
4519                 /*
4520                  *   It's the full varargs list - use the length from the
4521                  *   first varargs argument to the last.  Find the last
4522                  *   argument.
4523                  */
4524                 for (i = argnum ;
4525                      i < TOK_MAX_MACRO_ARGS && argofs[i] != 0 ; ++i) ;
4526 
4527                 /*
4528                  *   The full list length is the distance from the offset of
4529                  *   the first to the end of the last.  If there are no
4530                  *   varargs arguments at all, the length is zero.
4531                  */
4532                 if (i == argnum)
4533                     argnum_len = 0;
4534                 else
4535                     argnum_len = argofs[i-1] + arglen[i-1] - argofs[argnum];
4536             }
4537             else
4538             {
4539                 /*
4540                  *   it's not the full varargs list, so just use the length
4541                  *   of this single actual
4542                  */
4543                 argnum_len = arglen[argnum];
4544             }
4545 
4546             /* assume we won't do any token pasting or stringizing */
4547             pasting = pasting_at_left = pasting_at_right = FALSE;
4548             stringize = FALSE;
4549 
4550             /*
4551              *   if the previous token was a token-pasting operator,
4552              *   remove it and any preceding whitespace from the source
4553              *   material, since we want to append the actual parameter
4554              *   text directly after the preceding token
4555              */
4556         check_paste_left:
4557             if (prvtok.gettyp() == TOKT_POUNDPOUND)
4558             {
4559                 wchar_t prv_ch;
4560 
4561                 /*
4562                  *   note that we have token pasting - we're pasting
4563                  *   something to the left of this token (since we had a
4564                  *   "##" before this token)
4565                  */
4566                 pasting = TRUE;
4567                 pasting_at_left = TRUE;
4568 
4569                 /* go back to the ## token */
4570                 p = prvtok.get_text();
4571 
4572                 /* remove any preceding whitespace */
4573                 for (prv_ch = 0 ; p > start ; )
4574                 {
4575                     const char *prvp;
4576 
4577                     /* get the previous character */
4578                     prvp = utf8_ptr::s_dec((char *)p);
4579                     prv_ch = utf8_ptr::s_getch((char *)prvp);
4580 
4581                     /* if it's not a space, we're done */
4582                     if (!is_space(prv_ch))
4583                         break;
4584 
4585                     /* move back over this character */
4586                     p = prvp;
4587                 }
4588 
4589                 /*
4590                  *   Weird special case: if the previous character was a
4591                  *   comma, and the formal we're pasting is a variable
4592                  *   argument formal (i.e., the last formal in a varargs
4593                  *   macro), and the varargs list is empty, then remove the
4594                  *   comma.  This is a handy shorthand notation that allows
4595                  *   the varargs list to be added to a comma-delimited list,
4596                  *   such as a function call's actuals or the contents of a
4597                  *   list.
4598                  */
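                /*
                 *   A typical use (sketched, not from this file): with
                 *   "#define CALL(f, a...)  f(1, ##a)", an invocation
                 *   CALL(g) leaves the varargs list empty, so the comma
                 *   before "##a" is deleted and the result is "g(1)"
                 *   rather than "g(1,)".
                 */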
4599                 if (prv_ch == ','
4600                     && entry->has_varargs()
4601                     && argnum == entry->get_argc() - 1
4602                     && argofs[argnum] == 0)
4603                 {
4604                     /*
4605                      *   it's the special case - move back one more
4606                      *   character to delete the comma
4607                      */
4608                     p = utf8_ptr::s_dec((char *)p);
4609                 }
4610             }
4611             else if (prvtok.gettyp() == TOKT_POUND
4612                      || prvtok.gettyp() == TOKT_POUNDAT)
4613             {
4614                 /* go back to the # token */
4615                 p = prvtok.get_text();
4616 
4617                 /* note that we have stringizing */
4618                 stringize = TRUE;
4619                 stringize_type = prvtok.gettyp();
4620                 stringize_qu = (prvtok.gettyp() == TOKT_POUND
4621                                 ? '"' : '\'');
4622 
4623                 /* go back one more token */
4624                 prvtok = prvprvtok;
4625                 prvprvtok.settyp(TOKT_EOF);
4626 
4627                 /*
4628                  *   go back and check for pasting again, since we could
4629                  *   be pasting to a stringized token
4630                  */
4631                 goto check_paste_left;
4632             }
4633 
4634             /* copy the prior expansion so far */
4635             if (p > start)
4636                 subexp->append(start, p - start);
4637 
4638             /* remember the symbol as the previous token */
4639             prvprvtok = prvtok;
4640             prvtok = tok;
4641 
4642             /* get the next token after the formal */
4643             typ = next_on_line(&expsrc, &tok, &macro_in_embedding_);
4644 
4645             /*
4646              *   If it's followed by a token-pasting operator, we need to
4647              *   paste the next token directly onto the end of the text we
4648              *   just added to the buffer, skipping any intervening
4649              *   whitespace; otherwise, we want to start adding again at
4650              *   the next character after the original token.
4651              */
4652             if (typ == TOKT_POUNDPOUND)
4653             {
4654                 utf8_ptr old_expsrc;
4655                 CTcToken old_tok;
4656 
4657                 /* note that we have pasting to the right of this token */
4658                 pasting = TRUE;
4659                 pasting_at_right = TRUE;
4660 
4661                 /* remember where we started */
4662                 old_expsrc = expsrc;
4663 
4664                 /* remember the current token for a moment */
4665                 old_tok = tok;
4666 
4667                 /* skip to the next token after the ## */
4668                 typ = next_on_line(&expsrc, &tok, &macro_in_embedding_);
4669 
4670                 /* remember the token we're pasting to the right */
4671                 paste_at_right_tok = tok;
4672 
4673                 /* check for pasting to a stringizer */
4674                 if (stringize && typ == stringize_type)
4675                 {
4676                     /*
4677                      *   leave the ## in the stream for now - we'll fix it
4678                      *   up when we stringize the next token, rather than
4679                      *   doing so now
4680                      */
4681                     expsrc = old_expsrc;
4682                     tok = old_tok;
4683                 }
4684                 else
4685                 {
4686                     /*
4687                      *   remember that we have a token-pasting operator,
4688                      *   so that we can tell that we're pasting when we
4689                      *   look at the next token
4690                      */
4691                     prvprvtok = prvtok;
4692                     prvtok = old_tok;
4693                 }
4694 
4695                 /* start next text from here */
4696                 start = tok.get_text();
4697             }
4698             else
4699             {
4700                 /* Start at the end of the symbol token */
4701                 start = prvtok.get_text() + prvtok.get_text_len();
4702             }
4703 
4704             /*
4705              *   If we're not doing any pasting, recursively expand macros
4706              *   in the actual expansion text.  If we're pasting, do not
4707              *   expand any macros in the expansion, since we want to do
4708              *   the pasting before we do any expanding.
4709              */
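            /*
             *   Illustrative sketch (hypothetical macro, not from the
             *   original comments): given a definition along the lines of
             *.      #define SAY(x)  "You see " ## #x ## "."
             *   the stringized actual and the string literals pasted onto
             *   it must merge into one string, so the adjacent close and
             *   open quotes are dropped in the branches below.
             */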
4710             if (pasting && stringize)
4711             {
4712                 int add_open;
4713                 int add_close;
4714 
4715                 /* presume we'll include the open and close quotes */
4716                 add_close = TRUE;
4717                 add_open = TRUE;
4718 
4719                 /*
4720                  *   If we're pasting to the left, and the buffer so far
4721                  *   ends in the same quote we're adding to this token,
4722                  *   combine the strings by removing the preceding quote
4723                  *   and not adding the open quote on the new string
4724                  */
4725                 if (subexp->get_text_len() > 0
4726                     && *(subexp->get_text_end() - 1) == stringize_qu)
4727                 {
4728                     /* remove the close quote from the expansion so far */
4729                     subexp->set_text_len(subexp->get_text_len() - 1);
4730 
4731                     /* don't add the open quote to the new string */
4732                     add_open = FALSE;
4733                 }
4734 
4735                 /*
4736                  *   If we're pasting to the right, and we have a string
4737                  *   of the same type following, or we will be pasting a
4738                  *   stringizing pair, paste the two strings together to
4739                  *   form one string by removing the close quote from this
4740                  *   string and the open quote from the next string
4741                  */
4742                 if (pasting_at_right && *tok.get_text() == stringize_qu)
4743                     add_close = FALSE;
4744 
4745                 /*
4746                  *   We're both stringizing this argument and pasting
4747                  *   another token - first stringize the actual.
4748                  */
4749                 stringize_macro_actual(subexp,
4750                                        srcbuf->get_text()
4751                                        + argofs[argnum], argnum_len,
4752                                        stringize_qu, add_open, add_close);
4753 
4754                 /*
4755                  *   if we decided to remove the closing quote, we want to
4756                  *   remove the open quote from the following string as
4757                  *   well - copy in the following string without its open
4758                  *   quote
4759                  */
4760                 if (!add_close)
4761                 {
4762                     /*
4763                      *   append the following token without its first
4764                      *   character (its open quote)
4765                      */
4766                     subexp->append(tok.get_text() + 1,
4767                                    tok.get_text_len() - 1);
4768 
4769                     /* move on to the next token */
4770                     prvprvtok = prvtok;
4771                     prvtok = tok;
4772                     typ = next_on_line(&expsrc, &tok, &macro_in_embedding_);
4773 
4774                     /* start from the new token */
4775                     start = tok.get_text();
4776                 }
4777             }
4778             else if (pasting)
4779             {
4780                 const char *argp;
4781                 size_t len;
4782                 int done;
4783                 wchar_t quote_char;
4784 
4785                 /* get the actual argument information */
4786                 argp = srcbuf->get_text() + argofs[argnum];
4787                 len = argnum_len;
4788 
4789                 /*
4790                  *   if we're pasting to the left of this token, and the
4791                  *   token starts with a fully-expanded flag, remove the
4792                  *   flag - we're making up a new token out of this and
4793                  *   what comes before, so the token that we fully
4794                  *   expanded is disappearing, so the fully-expanded
4795                  *   status no longer applies
4796                  */
4797                 if (pasting_at_left && *argp == TOK_FULLY_EXPANDED_FLAG)
4798                 {
4799                     /* skip the flag */
4800                     ++argp;
4801                     --len;
4802                 }
4803 
4804                 /* presume we won't find any quoted strings */
4805                 quote_char = 0;
4806 
4807                 /*
4808                  *   check for string concatenation to the left - if we're
4809                  *   concatenating two strings of the same type, remove
4810                  *   the adjacent quotes to make it a single string
4811                  */
4812                 if (pasting_at_left
4813                     && subexp->get_text_len() > 0
4814                     && (*argp == '\'' || *argp == '"')
4815                     && *(subexp->get_text_end() - 1) == *argp)
4816                 {
4817                     /* remove the close quote from the expansion so far */
4818                     subexp->set_text_len(subexp->get_text_len() - 1);
4819 
4820                     /* remember the quote character */
4821                     quote_char = *argp;
4822 
4823                     /* don't add the open quote to the new string */
4824                     ++argp;
4825                     --len;
4826                 }
4827 
4828                 /* presume we won't have to do anything special */
4829                 done = FALSE;
4830 
4831                 /*
4832                  *   If we're pasting at the right, also remove any
4833                  *   fully-expanded flag just before the last token in the
4834                  *   expansion.
4835                  */
4836                 if (pasting_at_right)
4837                 {
4838                     CTcToken old_tok;
4839                     CTcToken tok;
4840                     utf8_ptr p;
4841 
4842                     /* scan for the final token in the expansion string */
4843                     p.set((char *)argp);
4844                     old_tok.settyp(TOKT_INVALID);
4845                     while (p.getptr() < argp + len)
4846                     {
4847                         /*
4848                          *   get another token - stop at EOF or if we go
4849                          *   past the bounds of the expansion text
4850                          */
4851                         if (next_on_line(&p, &tok, &macro_in_embedding_)
4852                             == TOKT_EOF
4853                             || tok.get_text() >= argp + len)
4854                             break;
4855 
4856                         /* remember the previous token */
4857                         old_tok = tok;
4858                     }
4859 
4860                     /*
4861                      *   if the final token is a symbol, and it has the
4862                      *   fully-expanded flag, we must omit the flag from
4863                      *   the appended text
4864                      */
4865                     if (old_tok.gettyp() == TOKT_SYM
4866                         && old_tok.get_fully_expanded())
4867                     {
4868                         /*
4869                          *   append up to but not including the flag byte
4870                          *   preceding the final token
4871                          */
4872                         subexp->append(argp, tok.get_text() - 1 - argp);
4873 
4874                         /*
4875                          *   append from the last token to the end of the
4876                          *   expansion, skipping the flag byte
4877                          */
4878                         subexp->append(tok.get_text(),
4879                                        len - (tok.get_text() - argp));
4880 
4881                         /* we've done the appending */
4882                         done = TRUE;
4883                     }
4884                     else if (quote_char != 0
4885                              && paste_at_right_tok.get_text_len() != 0
4886                              && *paste_at_right_tok.get_text() == quote_char)
4887                     {
4888                         /*
4889                          *   we're pasting two strings together - append
4890                          *   up to but not including the close quote
4891                          */
4892                         subexp->append(argp, len - 1);
4893 
4894                         /*
4895                          *   append the next token, but do not include the
4896                          *   open quote
4897                          */
4898                         subexp->append(paste_at_right_tok.get_text() + 1,
4899                                        paste_at_right_tok.get_text_len() - 1);
4900 
4901                         /*
4902                          *   restart after the right token, since we've
4903                          *   now fully processed that token
4904                          */
4905                         start = paste_at_right_tok.get_text()
4906                                 + paste_at_right_tok.get_text_len();
4907 
4908                         /* we're done */
4909                         done = TRUE;
4910                     }
4911                 }
4912 
4913                 /*
4914                  *   append the actual without expansion, if we haven't
4915                  *   already handled it specially
4916                  */
4917                 if (!done)
4918                     subexp->append(argp, len);
4919             }
4920             else if (stringize)
4921             {
4922                 /* stringize the actual */
4923                 stringize_macro_actual(subexp,
4924                                        srcbuf->get_text()
4925                                        + argofs[argnum], argnum_len,
4926                                        stringize_qu, TRUE, TRUE);
4927             }
4928             else
4929             {
4930                 CTcTokStringRef actual_src_buf;
4931 
4932                 /* recursively expand macros in the actual text */
4933                 actual_src_buf.
4934                     set_buffer(srcbuf->get_text() + argofs[argnum],
4935                                argnum_len);
4936                 if (expand_macros(&actual_src_buf, 0, actual_exp_buf,
4937                                   FALSE, allow_defined, FALSE))
4938                     return 1;
4939 
4940                 /*
4941                  *   Append the expanded actual, marking any
4942                  *   fully-expanded tokens as such and removing
4943                  *   end-of-expansion markers.
4944                  *
4945                  *   We can't leave end-of-expansion markers in the
4946                  *   expanded actual text, because end-of-expansion
4947                  *   markers apply only to the current recursion level,
4948                  *   and we've now exited the actual's recursion level.
4949                  *   However, we must not expand further anything in the
4950                  *   actual's expansion that has already been fully
4951                  *   expanded.  To achieve both of these goals, we switch
4952                  *   here from marking the run of text (with the end
4953                  *   marker) to marking individual tokens.
4954                  */
4955                 mark_full_exp_tokens(subexp, actual_exp_buf, TRUE);
4956             }
4957 
4958             /* we've already read the next token, so proceed */
4959             continue;
4960         }
4961 
4962         /* remember the current token as the previous token */
4963         prvprvtok = prvtok;
4964         prvtok = tok;
4965 
4966         /* get the next token of the expansion */
4967         typ = next_on_line(&expsrc, &tok, &macro_in_embedding_);
4968     }
4969 
4970     /* copy the remaining replacement text */
4971     subexp->append(start, tok.get_text() - start);
4972 
4973     /* success */
4974     return 0;
4975 }
4976 
4977 /*
4978  *   Skip the source of a delimited macro expansion area (#foreach,
4979  *   #ifempty, #ifnempty).
4980  */
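/*
 *   Rough illustration (inferred from how these groups are encoded): a
 *   #foreach group appears in the stored expansion as a flag byte, a
 *   delimiter character, and two delimiter-terminated parts, roughly
 *.      <FOREACH-FLAG> / per-argument-expansion / separator /
 *   so skipping a nested #foreach means skipping two parts, while a nested
 *   #ifempty or #ifnempty group contributes a single part to skip.
 */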
4981 void CTcTokenizer::skip_delimited_group(utf8_ptr *p, int parts_to_skip)
4982 {
4983     wchar_t delim;
4984 
4985     /* get the delimiter character */
4986     delim = p->getch();
4987 
4988     /*
4989      *   if the delimiter put us at the end of the line, there's nothing to
4990      *   skip
4991      */
4992     if (delim == 0 || delim == TOK_END_PP_LINE)
4993         return;
4994 
4995     /* skip the delimiter */
4996     p->inc();
4997 
4998     /* keep going until we've skipped the desired number of parts */
4999     while (parts_to_skip != 0)
5000     {
5001         wchar_t ch;
5002 
5003         /* read the next character */
5004         ch = p->getch();
5005 
5006         /* if it's the end of the line, give up */
5007         if (ch == 0 || ch == TOK_END_PP_LINE)
5008         {
5009             /*
5010              *   we ran out of input before reaching the delimiter, so this
5011              *   is implicitly the end of it
5012              */
5013             return;
5014         }
5015 
5016         /* check what we have */
5017         if (ch == delim)
5018         {
5019             /* that's one less part to skip */
5020             --parts_to_skip;
5021 
5022             /* skip it */
5023             p->inc();
5024         }
5025         else if (ch == TOK_MACRO_FOREACH_FLAG)
5026         {
5027             /* it's a nested #foreach - skip all of its parts */
5028             skip_delimited_group(p, 2);
5029         }
5030         else if (ch == TOK_MACRO_IFEMPTY_FLAG
5031                  || ch == TOK_MACRO_IFNEMPTY_FLAG)
5032         {
5033             /* nested #ifempty or #ifnempty - skip its expansion */
5034             skip_delimited_group(p, 1);
5035         }
5036         else
5037         {
5038             /* it's nothing special to us - skip it */
5039             p->inc();
5040         }
5041     }
5042 }
5043 
5044 /*
5045  *   Stringize a macro actual parameter value into a macro expansion
5046  *   buffer
5047  */
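/*
 *   For illustration (a hedged sketch with a hypothetical macro): given
 *.      #define STR(x)  #x
 *   the invocation STR(  hello,   "two words"  ) expands to the string
 *.      "hello, \"two words\""
 *   - leading and trailing whitespace is trimmed, interior runs of
 *   whitespace collapse to single spaces, and quote marks matching the
 *   enclosing quote character are backslash-escaped.
 */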
5048 void CTcTokenizer::stringize_macro_actual(CTcTokString *expbuf,
5049                                           const char *actual_val,
5050                                           size_t actual_len, char quote_char,
5051                                           int add_open_quote,
5052                                           int add_close_quote)
5053 {
5054     utf8_ptr src;
5055     const char *start;
5056     int in_inner_quote;
5057     wchar_t inner_quote_char;
5058     wchar_t prvch;
5059 
5060     /* add the open quote if desired */
5061     if (add_open_quote)
5062         expbuf->append(&quote_char, 1);
5063 
5064     /* remember the start of the current segment */
5065     start = actual_val;
5066 
5067     /*
5068      *   add the characters of the actual parameter value, quoting any
5069      *   quotes or backslashes
5070      */
5071     for (src.set((char *)actual_val), in_inner_quote = FALSE, prvch = '\0' ;
5072          src.getptr() < actual_val + actual_len ; )
5073     {
5074         wchar_t cur;
5075 
5076         /* get this character */
5077         cur = src.getch();
5078 
5079         /* compress runs of whitespace to single spaces */
5080         if (is_space(cur) && prvch != '\\')
5081         {
5082             /* append up to this character */
5083             if (src.getptr() > start)
5084                 expbuf->append(start, src.getptr() - start);
5085 
5086             /* find the next non-space character */
5087             for ( ; src.getptr() < actual_val + actual_len ; src.inc())
5088             {
5089                 if (!is_space(src.getch()))
5090                     break;
5091             }
5092 
5093             /*
5094              *   if we're not at the start or end of the string, add a
5095              *   single space to replace the entire run of whitespace --
5096              *   don't do this at the start or end of the string, since
5097              *   we must remove leading and trailing whitespace
5098              */
5099             if (prvch != '\0' && src.getptr() < actual_val + actual_len)
5100                 expbuf->append(" ", 1);
5101 
5102             /* note that the previous character is a space */
5103             prvch = cur;
5104 
5105             /* this is the new starting point */
5106             start = src.getptr();
5107 
5108             /* proceed - we're already at the next character */
5109             continue;
5110         }
5111 
5112         /*
5113          *   Check to see if we need to quote this character.  Quote any
5114          *   quote mark matching the enclosing quotes; also quote any
5115          *   backslash that occurs within nested quotes within the source
5116          *   material, but not backslashes that occur originally outside
5117          *   quotes.
5118          */
5119         if (cur == quote_char
5120             || (cur == '\\' && in_inner_quote))
5121         {
5122             /* append the segment up to (but not including) this character */
5123             if (src.getptr() > start)
5124                 expbuf->append(start, src.getptr() - start);
5125 
5126             /* add an extra backslash */
5127             expbuf->append("\\", 1);
5128 
5129             /* remember the start of the next segment */
5130             start = src.getptr();
5131         }
5132 
5133         /*
5134          *   if this is a quote character, and it's not itself escaped,
5135          *   reverse our in-quote flag
5136          */
5137         if (prvch != '\\')
5138         {
5139             /*
5140              *   If we're in an inner quote, and it's a match for the open
5141              *   inner quote, we're no longer in a quote.  Otherwise, if
5142              *   we're not in quotes and this is some kind of quote, enter
5143              *   the new quotes.
5144              */
5145             if (in_inner_quote && cur == inner_quote_char)
5146             {
5147                 /* we're leaving the inner quoted string */
5148                 in_inner_quote = FALSE;
5149             }
5150             else if (!in_inner_quote && (cur == '"' || cur == '\''))
5151             {
5152                 /* we're entering a new inner quoted string */
5153                 in_inner_quote = TRUE;
5154                 inner_quote_char = cur;
5155             }
5156         }
5157 
5158         /* remember this as the previous character */
5159         prvch = cur;
5160 
5161         /* move on to the next character */
5162         src.inc();
5163     }
5164 
5165     /* if there's anything in the final segment, append it */
5166     if (src.getptr() > start)
5167         expbuf->append(start, src.getptr() - start);
5168 
5169     /* add the close quote if desired */
5170     if (add_close_quote)
5171         expbuf->append(&quote_char, 1);
5172 }
5173 
5174 /*
5175  *   Expand a "defined" preprocessor operator
5176  */
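/*
 *   For example, in a preprocessor expression such as
 *.      #if defined(MY_FLAG) || defined OTHER_FLAG
 *   each "defined" operator expands to "1" if the named symbol currently
 *   has a #define in effect and to "0" if not.  The parentheses are
 *   optional, but if an open paren is present the matching close paren is
 *   required.  (MY_FLAG and OTHER_FLAG are hypothetical names.)
 */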
5177 int CTcTokenizer::expand_defined(CTcTokString *subexp,
5178                                  const CTcTokString *srcbuf, utf8_ptr *src)
5179 {
5180     CTcToken tok;
5181     tc_toktyp_t typ;
5182     int paren;
5183     int found;
5184 
5185     /* get the next token */
5186     typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
5187 
5188     /* note whether we have an open paren; if we do, skip it */
5189     paren = (typ == TOKT_LPAR);
5190     if (paren)
5191         typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
5192 
5193     /* get the symbol */
5194     if (typ != TOKT_SYM)
5195     {
5196         log_error(TCERR_PP_DEFINED_NO_SYM,
5197                   (int)tok.get_text_len(), tok.get_text());
5198         return 1;
5199     }
5200 
5201     /* look to see if the symbol is defined */
5202     found = (find_define(tok.get_text(), tok.get_text_len()) != 0);
5203 
5204     /* expand the macro to "1" if found, "0" if not */
5205     subexp->copy(found ? "1" : "0", 1);
5206 
5207     /* check for and skip the matching close paren */
5208     if (paren)
5209     {
5210         /* require the closing paren */
5211         if (next_on_line(srcbuf, src, &tok, &macro_in_embedding_)
5212             != TOKT_RPAR)
5213         {
5214             /* generate an error if we don't find it */
5215             log_error(TCERR_PP_DEFINED_RPAR);
5216             return 1;
5217         }
5218     }
5219 
5220     /* success */
5221     return 0;
5222 }
5223 
5224 
5225 /* ------------------------------------------------------------------------ */
5226 /*
5227  *   Process comments.  Replaces each comment with a single space.
5228  */
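/*
 *   For illustration (hedged): a "//" comment truncates the line, so
 *.      local x = 1;   // set up the counter
 *   comes out as just "local x = 1;", while a block comment in the middle
 *   of a line collapses to a single space in the output.  Comment markers
 *   inside string literals are left alone, since we track the in-string
 *   state while scanning.
 */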
5229 void CTcTokenizer::process_comments(size_t start_ofs)
5230 {
5231     utf8_ptr src;
5232     utf8_ptr dst;
5233     int trailing_sp_after_bs;
5234 
5235     /* we haven't found a backslash followed by trailing space yet */
5236     trailing_sp_after_bs = FALSE;
5237 
5238     /*
5239      *   Scan the line.  When inside a comment, replace each character of
5240      *   the comment with a space.  When outside comments, simply copy
5241      *   characters intact.
5242      *
5243      *   Note that we need a separate src and dst pointer, because the
5244      *   character length of the original and replaced characters may
5245      *   change.  Fortunately, the length will never do anything but
5246      *   shrink or stay the same, since the only change we make is to
5247      *   insert spaces, which are always one byte apiece in UTF-8; we can
5248      *   therefore update the buffer in place.
5249      */
5250     for (src.set(linebuf_.get_buf() + start_ofs),
5251          dst.set(linebuf_.get_buf() + start_ofs) ;
5252          src.getch() != '\0' ; src.inc())
5253     {
5254         wchar_t cur;
5255 
5256         /* get the current character */
5257         cur = src.getch();
5258 
5259         /* check to see if we're in a comment */
5260         if (str_->is_in_comment())
5261         {
5262             /* check to see if the comment is ending */
5263             if (cur == '*' && src.getch_at(1) == '/')
5264             {
5265                 /*
5266                  *   skip an extra character of the source - we'll skip
5267                  *   one in the main loop, so we only need to skip one
5268                  *   more now
5269                  */
5270                 src.inc();
5271 
5272                 /* we're no longer in a comment */
5273                 str_->set_in_comment(FALSE);
5274             }
5275 
5276             /* continue without copying anything from inside the comment */
5277             continue;
5278         }
5279         else if (in_quote_ != '\0')
5280         {
5281             /* see what we have */
5282             if (cur == '\\')
5283             {
5284                 /*
5285                  *   It's a backslash sequence -- copy the backslash to
5286                  *   the output, and skip it.  Note that we don't have to
5287                  *   worry about the line ending with a backslash, since
5288                  *   the line reader will already have considered that to
5289                  *   be a line splice.
5290                  */
5291                 src.inc();
5292                 dst.setch(cur);
5293 
5294                 /* get the next character, so we copy it directly */
5295                 cur = src.getch();
5296             }
5297             else if (cur == in_quote_)
5298             {
5299                 /*
5300                  *   this is the closing quote character - simply note
5301                  *   that we're no longer in a quoted string
5302                  */
5303                 in_quote_ = '\0';
5304             }
5305             else if (in_quote_ == '"' && !comment_in_embedding_
5306                      && cur == '<' && src.getch_at(1) == '<')
5307             {
5308                 /*
5309                  *   it's an embedded expression starting point - skip the
5310                  *   first of the '<' characters (the enclosing loop will
5311                  *   skip the second one)
5312                  */
5313                 src.inc();
5314 
5315                 /* the string is done */
5316                 in_quote_ = '\0';
5317 
5318                 /* we're in an embedding now */
5319                 comment_in_embedding_ = TRUE;
5320 
5321                 /* copy the extra '<' to the output */
5322                 dst.setch('<');
5323             }
5324         }
5325         else
5326         {
5327             /*
5328              *   Monitor the stream for a backslash followed by trailing
5329              *   spaces.  If this is a backslash, note that we might have a
5330              *   backslash with trailing spaces; if it's a space, we might
5331              *   still have this, so leave the flag alone; if it's anything
5332              *   else, clear the flag, since we've found something other
5333              *   than backslashes and spaces.
5334              */
5335             if (cur == '\\')
5336                 trailing_sp_after_bs = TRUE;
5337             else if (!is_space(cur))
5338                 trailing_sp_after_bs = FALSE;
5339 
5340             /* check to see if we're starting a comment */
5341             if (cur == '/')
5342             {
5343                 switch(src.getch_at(1))
5344                 {
5345                 case '*':
5346                     /* note that we're starting a comment */
5347                     str_->set_in_comment(TRUE);
5348 
5349                     /*
5350                      *   replace the starting slash with a space - this
5351                      *   will effectively replace the entire comment with
5352                      *   a single space, since we won't copy anything else
5353                      *   from inside the comment
5354                      */
5355                     cur = ' ';
5356                     break;
5357 
5358                 case '/':
5359                     /*
5360                      *   comment to end of line - we can terminate the
5361                      *   line at the opening slash and return immediately,
5362                      *   because the entire rest of the line is to be
5363                      *   ignored
5364                      */
5365                     dst.setch('\0');
5366                     return;
5367 
5368                 default:
5369                     /* not a comment - copy it as-is */
5370                     break;
5371                 }
5372             }
5373             else if (cur == '"' || cur == '\'')
5374             {
5375                 /* it's the start of a new string */
5376                 in_quote_ = cur;
5377             }
5378             else if (cur < 0x09)
5379             {
5380                 /*
5381                  *   it's a special flag character - we need to guarantee
5382                  *   that this character never occurs in input (it
5383                  *   shouldn't anyway, since it's a control character), so
5384                  *   translate it to a space
5385                  */
5386                 cur = ' ';
5387             }
5388             else if (comment_in_embedding_
5389                      && cur == '>' && src.getch_at(1) == '>')
5390             {
5391                 /*
5392                  *   it's the end of an embedded expression - we're back
5393                  *   in a double-quoted string (only double-quoted strings
5394                  *   can have embedded expressions)
5395                  */
5396                 in_quote_ = '"';
5397                 comment_in_embedding_ = FALSE;
5398 
5399                 /* skip the extra '>' and copy it to the output */
5400                 src.inc();
5401                 dst.setch('>');
5402             }
5403         }
5404 
5405         /* set the current character in the output */
5406         dst.setch(cur);
5407     }
5408 
5409     /* set the updated line buffer length */
5410     linebuf_.set_text_len(dst.getptr() - linebuf_.get_buf());
5411 
5412     /*
5413      *   if we found a backslash with nothing following but whitespace, flag
5414      *   a warning, since they might have meant the backslash as a line
5415      *   continuation signal, but we're not interpreting it that way because
5416      *   of the trailing whitespace
5417      */
5418     if (trailing_sp_after_bs)
5419         log_warning(TCERR_TRAILING_SP_AFTER_BS);
5420 }
5421 
5422 /*
5423  *   Splice strings.  Splice additional lines onto the current line until
5424  *   we find the end of the string.
5425  */
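/*
 *   For illustration (hedged, with a hypothetical property): a string that
 *   runs past the end of its line, as in
 *.      desc = "This description runs
 *.              onto a second line."
 *   has the following line spliced onto it, with the newline replaced by a
 *   single space (or by nothing under #pragma newline_spacing(off)) and
 *   the spliced line's leading whitespace removed.
 */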
5426 void CTcTokenizer::splice_string()
5427 {
5428     utf8_ptr p;
5429     int in_quote;
5430     int in_embedding;
5431     char unterm;
5432 
5433     /* presume we'll find proper termination */
5434     unterm = '\0';
5435 
5436     /*
5437      *   remember the current in-quote and in-embedding status, as of the
5438      *   end of the current line - when we splice, the line reader will
5439      *   update these to the status at the end of the newly-read material,
5440      *   but we want to scan from the beginning of the newly-read material
5441      */
5442     in_quote = in_quote_;
5443     in_embedding = comment_in_embedding_;
5444 
5445     /* keep going until we find the end of the string */
5446     for (;;)
5447     {
5448         char *new_line_p;
5449         wchar_t cur;
5450 
5451         /*
5452          *   append a space at the end of the line, to replace the newline
5453          *   that we've eliminated
5454          */
5455         if (string_newline_spacing_)
5456             linebuf_.append(" ", 1);
5457 
5458         /* splice another line */
5459         new_line_p = read_line(TRUE);
5460 
5461         /* if we reached end of file, there's no more splicing we can do */
5462         if (new_line_p == 0)
5463             break;
5464 
5465         /* skip leading spaces in the new line */
5466         for (p.set(new_line_p) ; is_space(p.getch()) ; p.inc()) ;
5467 
5468         /* if we skipped any spaces, remove them from the text */
5469         if (p.getptr() > new_line_p)
5470         {
5471             size_t rem;
5472             size_t new_len;
5473 
5474             /* calculate the length of the rest of the line */
5475             rem = linebuf_.get_text_len()
5476                   - (p.getptr() - linebuf_.get_buf());
5477 
5478             /* calculate the new length of the line */
5479             new_len = (new_line_p - linebuf_.get_buf()) + rem;
5480 
5481             /* move the rest of the line down over the spaces */
5482             memmove(new_line_p, p.getptr(), rem);
5483 
5484             /* set the new length */
5485             linebuf_.set_text_len(new_len);
5486         }
5487 
5488         /*
5489          *   If the new line contains only "}" or ";", presume that the
5490          *   string is unterminated and terminate it here.  (This
5491          *   heuristic could flag well-formed strings as erroneous, but
5492          *   users can always work around this by moving these characters
5493          *   onto lines that contain at least one other non-whitespace
5494          *   character.)
5495          */
5496         p.set(new_line_p);
5497         if (p.getch() == '}' || p.getch() == ';')
5498         {
5499             /* skip trailing whitespace */
5500             for (p.inc() ; is_space(p.getch()) ; p.inc()) ;
5501 
5502             /*
5503              *   if there's nothing else on the line, presume it's an
5504              *   unterminated string
5505              */
5506             if (p.getch() == '\0')
5507             {
5508                 /* log the error */
5509                 log_error(TCERR_POSSIBLE_UNTERM_STR,
5510                           appended_linenum_);
5511 
5512                 /* remember that it's unterminated */
5513                 unterm = (char)in_quote;
5514 
5515                 /*
5516                  *   since we're adding a presumed close quote that never
5517                  *   appears in the text, we need to figure the new
5518                  *   in-string status for the line; clear the in-quote
5519                  *   flag, and re-scan comments from the current point on
5520                  *   the line
5521                  */
5522                 in_quote_ = '\0';
5523                 process_comments(new_line_p - linebuf_.get_buf());
5524 
5525                 /* we're done - unsplice from the start of the new line */
5526                 p.set(new_line_p);
5527                 goto done;
5528             }
5529         }
5530 
5531         /* scan for the end of the string */
5532         for (p.set(new_line_p) ;; p.inc())
5533         {
5534             /* get this character */
5535             cur = p.getch();
5536 
5537             /* see what we have */
5538             if (cur == '\\')
5539             {
5540                 /* it's a backslash sequence - skip the extra character */
5541                 p.inc();
5542             }
5543             else if (cur == in_quote)
5544             {
5545                 /* it's our quote character - skip it, and we're done */
5546                 p.inc();
5547                 goto done;
5548             }
5549             else if (in_quote == '"' && !in_embedding
5550                      && cur == '<' && p.getch_at(1) == '<')
5551             {
5552                 /*
5553                  *   it's an embedded expression starter - skip the '<<'
5554                  *   sequence and stop scanning
5555                  */
5556                 p.inc();
5557                 p.inc();
5558                 goto done;
5559             }
5560             else if (cur == '\0')
5561             {
5562                 /* end of line - go back and splice another line */
5563                 break;
5564             }
5565         }
5566     }
5567 
5568 done:
5569     /* unsplice the line at the current point */
5570     unsplice_line(p.getptr());
5571 
5572     /* if we found an unterminated string, supply implicit termination */
5573     if (unterm != '\0')
5574         linebuf_.append(&unterm, 1);
5575 }
5576 
5577 
5578 /* ------------------------------------------------------------------------ */
5579 /*
5580  *   Process a #pragma directive
5581  */
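/*
 *   For reference (summarizing the keyword table below), the forms handled
 *   here look roughly like
 *.      #pragma once
 *.      #pragma all_once +
 *.      #pragma message("text to display")
 *.      #pragma newline_spacing(on)
 *   Unrecognized pragma keywords simply draw a warning.
 */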
5582 void CTcTokenizer::pp_pragma()
5583 {
5584     struct pp_kw_def
5585     {
5586         const char *kw;
5587         void (CTcTokenizer::*func)();
5588     };
5589     static pp_kw_def kwlist[] =
5590     {
5591 //      { "c", &CTcTokenizer::pragma_c }, -- obsolete
5592         { "once", &CTcTokenizer::pragma_once },
5593         { "all_once", &CTcTokenizer::pragma_all_once },
5594         { "message", &CTcTokenizer::pragma_message },
5595         { "newline_spacing", &CTcTokenizer::pragma_newline_spacing },
5596         { 0, 0 }
5597     };
5598     pp_kw_def *kwp;
5599     size_t kwlen;
5600 
5601     /* get the pragma keyword */
5602     if (next_on_line() != TOKT_SYM)
5603     {
5604         log_warning(TCERR_UNKNOWN_PRAGMA,
5605                     (int)curtok_.get_text_len(), curtok_.get_text());
5606         return;
5607     }
5608 
5609     /* get the keyword length */
5610     kwlen = curtok_.get_text_len();
5611 
5612     /* scan the pragma list */
5613     for (kwp = kwlist ; kwp->kw != 0 ; ++kwp)
5614     {
5615         /* is this our keyword? */
5616         if (strlen(kwp->kw) == kwlen
5617             && memicmp(curtok_.get_text(), kwp->kw, kwlen) == 0)
5618         {
5619             /* this is our keyword - invoke the handler */
5620             (this->*(kwp->func))();
5621 
5622             /* we're done */
5623             return;
5624         }
5625     }
5626 
5627     /* we didn't find it - generate a warning */
5628     log_warning(TCERR_UNKNOWN_PRAGMA, kwlen, curtok_.get_text());
5629 }
5630 
5631 #if 0 // #pragma C is not currently used
5632 /*
5633  *   Process a #pragma C directive
5634  */
5635 void CTcTokenizer::pragma_c()
5636 {
5637     tc_toktyp_t tok;
5638     int new_pragma_c;
5639 
5640     /* get the next token */
5641     tok = next_on_line();
5642 
5643     /*
5644      *   "+" or empty (end of line or whitespace) indicates C mode; "-"
5645      *   indicates standard mode
5646      */
5647     if (tok == TOKT_PLUS || tok == TOKT_EOF)
5648         new_pragma_c = TRUE;
5649     else if (tok == TOKT_MINUS)
5650         new_pragma_c = FALSE;
5651     else
5652     {
5653         log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5654         new_pragma_c = str_->is_pragma_c();
5655     }
5656 
5657     /*
5658      *   retain the pragma in the result if we're in preprocess-only mode,
5659      *   otherwise remove it
5660      */
5661     if (!pp_only_mode_)
5662         clear_linebuf();
5663 
5664     /* set the mode in the stream */
5665     str_->set_pragma_c(new_pragma_c);
5666 
5667     /* if there's a parser, notify it of the change */
5668     if (G_prs != 0)
5669         G_prs->set_pragma_c(new_pragma_c);
5670 }
5671 #endif
5672 
5673 /*
5674  *   Process a #pragma once directive
5675  */
5676 void CTcTokenizer::pragma_once()
5677 {
5678     /* add this file to the ONCE list */
5679     add_include_once(str_->get_desc()->get_fname());
5680 
5681     /* don't retain this pragma in the result */
5682     clear_linebuf();
5683 }
5684 
5685 /*
5686  *   Process a #pragma all_once directive
5687  */
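/*
 *   For illustration (hedged): "#pragma all_once +" (or "#pragma all_once"
 *   with nothing after the keyword) turns the mode on, and
 *   "#pragma all_once -" turns it off.  While the mode is on, every
 *   #include is treated as though the included file used #pragma once.
 */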
5688 void CTcTokenizer::pragma_all_once()
5689 {
5690     tc_toktyp_t tok;
5691 
5692     /* get the next token */
5693     tok = next_on_line();
5694 
5695     /*
5696      *   "+" or empty (end of line or whitespace) indicates ALL_ONCE mode;
5697      *   '-' indicates standard mode
5698      */
5699     if (tok == TOKT_PLUS || tok == TOKT_EOF)
5700         all_once_ = TRUE;
5701     else if (tok == TOKT_MINUS)
5702         all_once_ = FALSE;
5703     else
5704         log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5705 
5706     /* don't retain this pragma in the result */
5707     clear_linebuf();
5708 }
5709 
5710 /*
5711  *   Process a #pragma message directive
5712  */
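/*
 *   For illustration (hedged, with hypothetical message text): a directive
 *   such as
 *.      #pragma message("building the FOO extension")
 *   has its argument list macro-expanded, and the resulting string (plus
 *   any integer or symbol tokens) is printed during a normal compilation;
 *   in preprocess-only mode the expanded directive is instead retained in
 *   the output.
 */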
5713 void CTcTokenizer::pragma_message()
5714 {
5715     size_t startofs;
5716 
5717     /*
5718      *   copy the source line through the "message" token to the macro
5719      *   expansion buffer - we don't want to expand that part, but we want
5720      *   it to appear in the expansion, so just copy the original
5721      */
5722     startofs = (curtok_.get_text() + curtok_.get_text_len()
5723                 - linebuf_.get_text());
5724     expbuf_.copy(linebuf_.get_text(), startofs);
5725 
5726     /* expand macros; don't allow reading additional lines */
5727     if (expand_macros_curline(FALSE, FALSE, TRUE))
5728     {
5729         clear_linebuf();
5730         return;
5731     }
5732 
5733     /*
5734      *   If we're in normal compilation mode, display the message.  If we're
5735      *   in preprocess-only mode, simply retain the message in the
5736      *   preprocessed result, so that it shows up when the result is
5737      *   compiled.
5738      *
5739      *   Ignore messages in list-includes mode.
5740      */
5741     if (!pp_only_mode_ && !list_includes_mode_)
5742     {
5743         /* set up at the first post-processed token */
5744         start_new_line(expbuf_.get_buf() + startofs,
5745                        expbuf_.get_text_len() - startofs);
5746 
5747         /* if there's an open paren, skip it */
5748         if (next_on_line_xlat(0) == TOKT_LPAR)
5749             next_on_line_xlat(0);
5750         else
5751             log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5752 
5753         /* keep going until we reach the closing paren */
5754         while (curtok_.gettyp() != TOKT_RPAR
5755                && curtok_.gettyp() != TOKT_EOF)
5756         {
5757             /* display this token */
5758             switch(curtok_.gettyp())
5759             {
5760             case TOKT_SSTR:
5761             case TOKT_DSTR:
5762             case TOKT_SYM:
5763                 /* display the text of the token */
5764                 msg_str(curtok_.get_text(), curtok_.get_text_len());
5765                 break;
5766 
5767             case TOKT_INT:
5768                 /* display the integer */
5769                 msg_long(curtok_.get_int_val());
5770                 break;
5771 
5772             default:
5773                 /* ignore anything else */
5774                 break;
5775             }
5776 
5777             /* get the next token */
5778             next_on_line_xlat(0);
5779         }
5780 
5781         /* end the line */
5782         msg_str("\n", 1);
5783 
5784         /* remove the message from the result text */
5785         clear_linebuf();
5786     }
5787     else
5788     {
5789         /* preprocessing - copy expanded text to line buffer */
5790         linebuf_.copy(expbuf_.get_text(), expbuf_.get_text_len());
5791     }
5792 }
5793 
5794 /*
5795  *   Process a #pragma newline_spacing(on/off) directive
5796  */
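/*
 *   For illustration (hedged): under "#pragma newline_spacing(on)" the
 *   string splicer replaces the newline of a run-on string with a single
 *   space, while under "#pragma newline_spacing(off)" the lines are joined
 *   with no space added (see splice_string() above).
 */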
5797 void CTcTokenizer::pragma_newline_spacing()
5798 {
5799     int f;
5800 
5801     /* if we're in preprocess-only mode, just pass the pragma through */
5802     if (pp_only_mode_)
5803         return;
5804 
5805     /* get the '(' token and the on/off token */
5806     if (next_on_line() != TOKT_LPAR || next_on_line() != TOKT_SYM)
5807     {
5808         log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5809         goto done;
5810     }
5811 
5812     /* note the new mode flag */
5813     if (curtok_.get_text_len() == 2
5814         && memcmp(curtok_.get_text(), "on", 2) == 0)
5815     {
5816         /* it's 'on' */
5817         f = TRUE;
5818     }
5819     else if (curtok_.get_text_len() == 3
5820              && memcmp(curtok_.get_text(), "off", 3) == 0)
5821     {
5822         /* it's 'off' */
5823         f = FALSE;
5824     }
5825     else
5826     {
5827         log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5828         goto done;
5829     }
5830 
5831     /* make sure we have the ')' token */
5832     if (next_on_line() != TOKT_RPAR)
5833     {
5834         log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5835         goto done;
5836     }
5837 
5838     /* set the new mode */
5839     string_newline_spacing_ = f;
5840 
5841 done:
5842     /* done - discard this line buffer */
5843     clear_linebuf();
5844 }
5845 
5846 
5847 /* ------------------------------------------------------------------------ */
5848 /*
5849  *   Process a #charset directive
5850  */
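/*
 *   For illustration (hedged): a directive such as
 *.      #charset "utf-8"
 *   is meaningful only at the very start of a source file, where the file
 *   opener consumes it before we ever see the text; any #charset that
 *   reaches the tokenizer here is therefore out of place and is reported
 *   as an error.
 */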
5851 void CTcTokenizer::pp_charset()
5852 {
5853     /*
5854      *   Encountering a #charset directive within the tokenizer is always
5855      *   an error.  If the file opener managed to use a #charset, we'll
5856      *   never see it, because the file opener will have skipped it before
5857      *   giving us the file.
5858      *
5859      *   If we flagged a #charset error when opening the file, indicate
5860      *   that the problem is that the character set given was unloadable;
5861      *   otherwise, the problem is that #charset is in the wrong place.
5862      */
5863     log_error(str_->get_charset_error()
5864               ? TCERR_CANT_LOAD_CHARSET : TCERR_UNEXPECTED_CHARSET);
5865 
5866     /* don't retain this pragma in the result */
5867     clear_linebuf();
5868 }
5869 
5870 /* ------------------------------------------------------------------------ */
5871 /*
5872  *   Process a #include directive
5873  */
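/*
 *   For illustration (hedged; the file names are hypothetical), the two
 *   accepted forms are
 *.      #include "localfile.h"
 *.      #include <systemfile.h>
 *   The quoted form is searched for first in the directory of the
 *   including file (and of its parents in the include chain) and then on
 *   the include path; the angle-bracket form goes straight to the include
 *   path.
 */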
5874 void CTcTokenizer::pp_include()
5875 {
5876     wchar_t match;
5877     int is_local;
5878     int is_absolute;
5879     utf8_ptr fname;
5880     CTcSrcFile *new_src;
5881     int charset_error;
5882     int default_charset_error;
5883     char full_name[OSFNMAX];
5884     char lcl_name[OSFNMAX];
5885     int found;
5886     CTcTokFileDesc *desc;
5887     int expand;
5888     utf8_ptr start;
5889 
5890     /* presume we'll expand macros */
5891     expand = TRUE;
5892 
5893     /*
5894      *   Check to see if expansion is needed.  Macro expansion is needed
5895      *   only if the source line is not of one of the following forms:
5896      *
5897      *.  #include "filename"
5898      *.  #include <filename>
5899      */
5900     for (start = p_ ; is_space(p_.getch()) ; p_.inc()) ;
5901     switch(p_.getch())
5902     {
5903     case '<':
5904         /* look for a matching '>' */
5905         match = '>';
5906         goto find_match;
5907 
5908     case '"':
5909         /* look for a matching '"' */
5910         match = '"';
5911         goto find_match;
5912 
5913     find_match:
5914         /* find the matching character */
5915         for (p_.inc() ; p_.getch() != '\0' && p_.getch() != match ;
5916              p_.inc()) ;
5917 
5918         /* if we found it, check for other characters on the line */
5919         if (p_.getch() == match)
5920         {
5921             /* skip the matching character */
5922             p_.inc();
5923 
5924             /* skip whitespace */
5925             while (is_space(p_.getch()))
5926                 p_.inc();
5927 
5928             /*
5929              *   make sure there's nothing else on the line - if not, it's
5930              *   one of the approved formats, so there's no need to do
5931              *   macro expansion
5932              */
5933             if (p_.getch() == 0)
5934                 expand = FALSE;
5935         }
5936         break;
5937     }
5938 
5939     /* expand macros if necessary */
5940     if (expand)
5941     {
5942         /* do the expansion */
5943         if (expand_macros_curline(FALSE, FALSE, FALSE))
5944         {
5945             /* clear the buffer and abort */
5946             clear_linebuf();
5947             return;
5948         }
5949 
5950         /* read from the expansion buffer */
5951         start_new_line(expbuf_.get_buf(), expbuf_.get_text_len());
5952     }
5953     else
5954     {
5955         /* no expansion needed - read from the original starting point */
5956         p_ = start;
5957     }
5958 
5959     /* skip leading whitespace */
5960     for ( ; is_space(p_.getch()) ; p_.inc()) ;
5961 
5962     /* we have to be looking at a '"' or '<' character */
5963     if (p_.getch() == '"')
5964     {
5965         /* look for a matching quote, and look for a local file */
5966         match = '"';
5967         is_local = TRUE;
5968     }
5969     else if (p_.getch() == '<')
5970     {
5971         /* look for a matching angle bracket, and look for a system file */
5972         match = '>';
5973         is_local = FALSE;
5974     }
5975     else
5976     {
5977         /* invalid syntax - log an error and ignore the line */
5978         log_error(TCERR_BAD_INC_SYNTAX);
5979         clear_linebuf();
5980         return;
5981     }
5982 
5983     /* skip the open quote, and remember where the filename starts */
5984     p_.inc();
5985     fname = p_;
5986 
5987     /* find the matching quote */
5988     for ( ; p_.getch() != '\0' && p_.getch() != match ; p_.inc()) ;
5989 
5990     /* if we didn't find the match, log an error and ignore the line */
5991     if (p_.getch() == '\0')
5992     {
5993         log_error(TCERR_BAD_INC_SYNTAX);
5994         clear_linebuf();
5995         return;
5996     }
5997 
5998     /*
5999      *   null-terminate the filename (we don't care what else is in the
6000      *   buffer at this point, so overwriting it isn't a problem)
6001      */
6002     p_.setch('\0');
6003 
6004     /* check to see if the filename is absolute */
6005     is_absolute = os_is_file_absolute(fname.getptr());
6006 
6007     /* we have yet to find the file */
6008     found = FALSE;
6009 
6010     /*
6011      *   in case the name is in portable URL notation, convert from URL
6012      *   notation to local notation; we'll consider this form of the name
6013      *   first, and only if we can't find it in this form will we try
6014      *   treating the name as using local filename conventions
6015      */
6016     os_cvt_url_dir(lcl_name, sizeof(lcl_name), fname.getptr(), FALSE);
6017 
6018     /*
6019      *   Search for the included file.
6020      *
6021      *   First, if it's a local file (in quotes rather than angle
6022      *   brackets), start the search in the directory containing the
6023      *   current file, then look in the directory containing the parent
6024      *   file, and so on.  If we fail to find it, proceed as for a
6025      *   non-local file.
6026      */
6027     if (is_local)
6028     {
6029         CTcTokStream *cur_str;
6030         char pathbuf[OSFNMAX];
6031 
6032         /* start with the current file, and search parents */
6033         for (cur_str = str_ ; cur_str != 0 ; cur_str = cur_str->get_parent())
6034         {
6035             /* get the path to the current file */
6036             os_get_path_name(pathbuf, sizeof(pathbuf),
6037                              last_desc_->get_fname());
6038 
6039             /*
6040              *   try the URL-converted name first - this takes precedence
6041              *   over a local interpretation of the name
6042              */
6043             os_build_full_path(full_name, sizeof(full_name),
6044                                pathbuf, lcl_name);
6045             if (!osfacc(full_name))
6046             {
6047                 found = TRUE;
6048                 break;
6049             }
6050 
6051             /* if it's a relative local name, try again with local naming */
6052             if (!is_absolute)
6053             {
6054                 /*
6055                  *   build the full filename, treating the name as using
6056                  *   local system conventions
6057                  */
6058                 os_build_full_path(full_name, sizeof(full_name),
6059                                    pathbuf, fname.getptr());
6060 
6061                 /* if we found it, so note and stop searching */
6062                 if (!osfacc(full_name))
6063                 {
6064                     found = TRUE;
6065                     break;
6066                 }
6067             }
6068         }
6069     }
6070 
6071     /*
6072      *   If we still haven't found the file (or if it's a non-local file,
6073      *   in angle brackets), search the include path.
6074      */
6075     if (!found)
6076     {
6077         tctok_incpath_t *inc_path;
6078 
6079         /* scan the include path */
6080         for (inc_path = incpath_head_ ; inc_path != 0 ;
6081              inc_path = inc_path->nxt)
6082         {
6083             /* try the URL-converted local name first */
6084             os_build_full_path(full_name, sizeof(full_name),
6085                                inc_path->path, lcl_name);
6086             if (!osfacc(full_name))
6087             {
6088                 found = TRUE;
6089                 break;
6090             }
6091 
6092             /* try with the local name, if it's a relative local name */
6093             if (!is_absolute)
6094             {
6095                 /* build the full name for the file in this directory */
6096                 os_build_full_path(full_name, sizeof(full_name),
6097                                    inc_path->path, fname.getptr());
6098 
6099                 /* if we found it, stop searching */
6100                 if (!osfacc(full_name))
6101                 {
6102                     found = TRUE;
6103                     break;
6104                 }
6105             }
6106         }
6107     }
6108 
6109     /*
6110      *   If the filename specified an absolute path, and we didn't find a
6111      *   file with any of the local interpretations, look at the absolute
6112      *   path.  Note that our portable URL-style notation doesn't allow
6113      *   absolute notation, so we use only the exact name as specified in
6114      *   the #include directive as the absolute form.
6115      */
6116     if (is_absolute && !found)
6117     {
6118         /* use the original filename as the full name */
6119         strcpy(full_name, fname.getptr());
6120 
6121         /* try finding the file */
6122         found = !osfacc(full_name);
6123     }
6124 
6125     /*
6126      *   we have our copy of the filename now; we don't want to retain
6127      *   this directive in the preprocessed source, so clear out the line
6128      *   buffer now
6129      */
6130     clear_linebuf();
6131 
6132     /*
6133      *   if we didn't find the file anywhere, show an error and ignore the
6134      *   #include directive
6135      */
6136     if (!found)
6137     {
6138         log_error(TCERR_INC_NOT_FOUND,
6139                   (int)strlen(fname.getptr()), fname.getptr());
6140         return;
6141     }
6142 
6143     /*
6144      *   Check the list of included files that are marked for inclusion
6145      *   only once.  If we've already included this file, ignore this
6146      *   redundant inclusion.  Check based on the full filename that we
6147      *   resolved from the search path.
6148      */
6149     if (find_include_once(full_name))
6150     {
6151         /* log an error if appropriate */
6152         if (warn_on_ignore_incl_)
6153             log_warning(TCERR_REDUNDANT_INCLUDE,
6154                         (int)strlen(full_name), full_name);
6155 
6156         /* ignore this #include directive */
6157         return;
6158     }
6159 
6160     /* open a file source to read the file */
6161     new_src = CTcSrcFile::open_source(full_name, res_loader_,
6162                                       default_charset_, &charset_error,
6163                                       &default_charset_error);
6164 
6165     /* if we couldn't open the file, log an error and ignore the line */
6166     if (new_src == 0)
6167     {
6168         /*
6169          *   if the error was due to the default character set, log that
6170          *   problem; otherwise, log the general file-open problem
6171          */
6172         if (default_charset_error)
6173             log_error(TCERR_CANT_LOAD_DEFAULT_CHARSET, default_charset_);
6174         else
6175             log_error(TCERR_INC_NOT_FOUND,
6176                       (int)strlen(full_name), full_name);
6177 
6178         /* we can go no further */
6179         return;
6180     }
6181 
6182     /* get the descriptor for the source file */
6183     desc = get_file_desc(full_name, strlen(full_name), FALSE,
6184                          fname.getptr(),
6185                          fname.getptr() != 0 ? strlen(fname.getptr()) : 0);
6186 
6187     /*
6188      *   remember the current #pragma newline_spacing mode, so we can restore
6189      *   it when we reinstate the current stream
6190      */
6191     str_->set_newline_spacing(string_newline_spacing_);
6192 
6193     /*
6194      *   Create and install the new file reader stream object.  By
6195      *   installing it as the current reader, we'll activate it so that
6196      *   the next line read will come from the new stream.  Note that the
6197      *   current stream becomes the parent of the new stream, so that we
6198      *   revert to the current stream when the new stream is exhausted;
6199      *   this will allow us to pick up reading from the current stream at
6200      *   the next line after the #include directive when we've finished
6201      *   including the new file.
6202      */
6203     str_ = new CTcTokStream(desc, new_src, str_, charset_error, if_sp_);
6204 
6205     /*
6206      *   If we're in ALL_ONCE mode, it means that every single file we
6207      *   include should be included only once.
6208      */
6209     if (all_once_)
6210         add_include_once(full_name);
6211 
6212     /*
6213      *   if we're in list-includes mode, write the name of the include file
6214      *   to the standard output
6215      */
6216     if (list_includes_mode_)
6217         G_hostifc->print_msg("#include %s\n", full_name);
6218 }
6219 
6220 /* ------------------------------------------------------------------------ */
6221 /*
6222  *   Add a file to the include-once list.  Once a file is in this list, we
6223  *   won't include it again.
6224  */
6225 void CTcTokenizer::add_include_once(const char *fname)
6226 {
6227     tctok_incfile_t *prvinc;
6228 
6229     /* if the file is already in the list, don't add it again */
6230     if (find_include_once(fname))
6231         return;
6232 
6233     /* create a new entry for the filename */
6234     prvinc = (tctok_incfile_t *)t3malloc(sizeof(tctok_incfile_t)
6235                                          + strlen(fname));
6236 
6237     /* save the filename */
6238     strcpy(prvinc->fname, fname);
6239 
6240     /* link the new entry into our list */
6241     prvinc->nxt = prev_includes_;
6242     prev_includes_ = prvinc;
6243 }
6244 
6245 /*
6246  *   Find a file in the list of files to be included only once.  Returns
6247  *   true if the file is in the list, false if not.
6248  */
6249 int CTcTokenizer::find_include_once(const char *fname)
6250 {
6251     tctok_incfile_t *prvinc;
6252 
6253     /* search the list */
6254     for (prvinc = prev_includes_ ; prvinc != 0 ; prvinc = prvinc->nxt)
6255     {
6256         /* if this one matches, we found it, so return true */
6257         if (strcmp(fname, prvinc->fname) == 0)
6258             return TRUE;
6259     }
6260 
6261     /* we didn't find the file */
6262     return FALSE;
6263 }
6264 
6265 /* ------------------------------------------------------------------------ */
6266 /*
6267  *   Process a #define directive
6268  */
6269 void CTcTokenizer::pp_define()
6270 {
6271     const char *macro_name;
6272     size_t macro_len;
6273     const char *argv[TOK_MAX_MACRO_ARGS];
6274     size_t argvlen[TOK_MAX_MACRO_ARGS];
6275     int argc;
6276     int has_args;
6277     const char *expan;
6278     size_t expan_len;
6279     CTcHashEntryPp *entry;
6280     int has_varargs;
6281 
6282     /* get the macro name */
6283     if (next_on_line() != TOKT_SYM)
6284     {
6285         log_error(TCERR_BAD_DEFINE_SYM,
6286                   (int)curtok_.get_text_len(), curtok_.get_text());
6287         clear_linebuf();
6288         return;
6289     }
6290 
6291     /* make a copy of the macro name */
6292     macro_name = curtok_.get_text();
6293     macro_len = curtok_.get_text_len();
6294 
6295     /* no arguments yet */
6296     argc = 0;
6297 
6298     /* presume we won't find a varargs marker */
6299     has_varargs = FALSE;
6300 
6301     /*
6302      *   If there's a '(' immediately after the macro name, without any
6303      *   intervening whitespace, it has arguments; otherwise, it has no
6304      *   arguments.  Note which case we have.
6305      */
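    /*
     *   For example (illustrative only): "#define MAX(a, b) ..." defines a
     *   function-like macro with two formals, while "#define MAX (a, b) ..."
     *   - with whitespace before the paren - defines an object-like macro
     *   whose expansion text simply begins with "(a, b)".
     */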
6306     if (p_.getch() == '(')
6307     {
6308         int done;
6309         tc_toktyp_t tok;
6310 
6311         /* note that we have an argument list */
6312         has_args = TRUE;
6313 
6314         /* assume we're not done yet */
6315         done = FALSE;
6316 
6317         /* skip the paren and get the next token */
6318         p_.inc();
6319         tok = next_on_line();
6320 
6321         /* check for an empty argument list */
6322         if (tok == TOKT_RPAR)
6323         {
6324             /* note that we're done with the arguments */
6325             done = TRUE;
6326         }
6327 
6328         /* scan the argument list */
6329         while (!done)
6330         {
6331             /* if we have too many arguments, it's an error */
6332             if (argc >= TOK_MAX_MACRO_ARGS)
6333             {
6334                 log_error(TCERR_TOO_MANY_MAC_PARMS,
6335                           (int)macro_len, macro_name, TOK_MAX_MACRO_ARGS);
6336                 clear_linebuf();
6337                 return;
6338             }
6339 
6340             /* if we're at the end of the macro, it's an error */
6341             if (tok == TOKT_EOF)
6342             {
6343                 /* log the error and ignore the line */
6344                 log_error(TCERR_MACRO_NO_RPAR);
6345                 clear_linebuf();
6346                 return;
6347             }
6348 
6349             /* check for a valid initial symbol character */
6350             if (tok != TOKT_SYM)
6351             {
6352                 log_error_curtok(TCERR_BAD_MACRO_ARG_NAME);
6353                 clear_linebuf();
6354                 return;
6355             }
6356 
6357             /* remember the argument name */
6358             argvlen[argc] = curtok_.get_text_len();
6359             argv[argc++] = curtok_.get_text();
6360 
6361             /* get the next token */
6362             tok = next_on_line();
6363 
6364             /* make sure we have a comma or paren following */
6365             if (tok == TOKT_COMMA)
6366             {
6367                 /* we have more arguments - skip the comma */
6368                 tok = next_on_line();
6369             }
6370             else if (tok == TOKT_ELLIPSIS)
6371             {
6372                 /* skip the ellipsis */
6373                 tok = next_on_line();
6374 
6375                 /* note the varargs marker */
6376                 has_varargs = TRUE;
6377 
6378                 /* this must be the last argument */
6379                 if (tok != TOKT_RPAR)
6380                 {
6381                     /* log the error */
6382                     log_error_curtok(TCERR_MACRO_ELLIPSIS_REQ_RPAR);
6383 
6384                     /* discard the line and give up */
6385                     clear_linebuf();
6386                     return;
6387                 }
6388 
6389                 /* that's the last argument - we can stop now */
6390                 done = TRUE;
6391             }
6392             else if (tok == TOKT_RPAR)
6393             {
6394                 /* no more arguments - note that we can stop now */
6395                 done = TRUE;
6396             }
6397             else
6398             {
6399                 /* invalid argument - log an error and discard the line */
6400                 log_error_curtok(TCERR_MACRO_EXP_COMMA);
6401                 clear_linebuf();
6402                 return;
6403             }
6404         }
6405     }
6406     else
6407     {
6408         /*
6409          *   there are no arguments - the macro's expansion starts
6410          *   immediately after the end of the name and any subsequent
6411          *   whitespace
6412          */
6413         has_args = FALSE;
6414     }
6415 
6416     /* skip whitespace leading up to the expansion */
6417     while (is_space(p_.getch()))
6418         p_.inc();
6419 
6420     /* the rest of the line is the expansion */
6421     expan = p_.getptr();
6422 
6423     /* don't allow defining "defined" */
6424     if (macro_len == 7 && memcmp(macro_name, "defined", 7) == 0)
6425     {
6426         /* log an error */
6427         log_error(TCERR_REDEF_OP_DEFINED);
6428 
6429         /* don't retain the directive in the preprocessed result */
6430         clear_linebuf();
6431 
6432         /* ignore the definition */
6433         return;
6434     }
6435 
6436     /* get the length of the expansion text */
6437     expan_len = strlen(expan);
6438 
6439     /*
6440      *   remove any trailing whitespace from the expansion text; however,
6441      *   leave a trailing space if it's preceded by a backslash
6442      */
6443     while (expan_len > 0
6444            && is_space(expan[expan_len-1])
6445            && !(expan_len > 1 && expan[expan_len-2] == '\\'))
6446         --expan_len;
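
    /*
     *   For example (illustrative): in "#define EOL text\ " the final space
     *   survives because it's preceded by a backslash, whereas ordinary
     *   trailing blanks and tabs are trimmed away here.
     */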
6447 
6448     /*
6449      *   If there are arguments, scan the expansion for formal parameter
6450      *   names.  For each one we find, replace it with the special
6451      *   TOK_MACRO_FORMAL_FLAG character followed by a one-byte value
6452      *   giving the argument index.  This special sequence is less costly
6453      *   to find when we're expanding the macros - by doing the search
6454      *   here, we only need to do it once, rather than each time we expand
6455      *   the macro.
6456      */
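    /*
     *   Illustrative sketch (not from the original source): for
     *
     *      #define ADD(a, b)  ((a) + (b))
     *
     *   the stored expansion becomes "((<F><1>) + (<F><2>))", where <F>
     *   stands for the TOK_MACRO_FORMAL_FLAG byte and <1>/<2> are the
     *   one-based formal indices.  A varargs suffix such as "b#argcount"
     *   would instead collapse to the corresponding one-byte flag
     *   (TOK_MACRO_ARGCOUNT_FLAG, and so on).
     */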
6457     if (argc != 0)
6458     {
6459         utf8_ptr src;
6460         size_t dstofs;
6461         tc_toktyp_t typ;
6462         CTcToken tok;
6463         const char *start;
6464         int in_embedding = FALSE;
6465 
6466         /*
6467          *   Generate our modified expansion text in the macro expansion
6468          *   buffer.  Initially, make sure we have room for a copy of the
6469          *   text; we'll resize the buffer later if we find we need even
6470          *   more.
6471          */
6472         expbuf_.ensure_space(expan_len);
6473 
6474         /* scan for argument names, and replace them */
6475         for (start = expan, dstofs = 0, src.set((char *)expan) ;; )
6476         {
6477             /* get the next token */
6478             typ = next_on_line(&src, &tok, &in_embedding);
6479 
6480             /* if we've reached the end of the expansion, we're done */
6481             if (typ == TOKT_EOF)
6482                 break;
6483 
6484             /*
6485              *   If this is a formal parameter name, we'll replace it with
6486              *   a special two-byte sequence; otherwise, we'll keep it
6487              *   unchanged.
6488              */
6489             if (typ == TOKT_SYM)
6490             {
6491                 int i;
6492 
6493                 /* find it in the table */
6494                 for (i = 0 ; i < argc ; ++i)
6495                 {
6496                     /* does it match this argument name? */
6497                     if (argvlen[i] == tok.get_text_len()
6498                         && memcmp(argv[i], tok.get_text(),
6499                                   tok.get_text_len()) == 0)
6500                     {
6501                         size_t new_len;
6502                         size_t arg_len;
6503                         size_t repl_len;
6504                         char flag_byte;
6505 
6506                         /* get the length of the formal name */
6507                         arg_len = argvlen[i];
6508 
6509                         /*
6510                          *   the normal replacement length for a formal
6511                          *   parameter is two bytes - one byte for the flag,
6512                          *   and one for the formal parameter index
6513                          */
6514                         repl_len = 2;
6515 
6516                         /* by default, the flag byte is the formal flag */
6517                         flag_byte = TOK_MACRO_FORMAL_FLAG;
6518 
6519                         /*
6520                          *   Check for special varargs control suffixes.  If
6521                          *   we matched the last argument name, and this is
6522                          *   a varargs macro, we might have a suffix.
6523                          */
6524                         if (has_varargs
6525                             && i == argc - 1
6526                             && src.getch() == '#')
6527                         {
6528                             /* check for the various suffixes */
6529                             if (memcmp(src.getptr() + 1, "foreach", 7) == 0
6530                                 && !is_sym(src.getch_at(8)))
6531                             {
6532                                 /*
6533                                  *   include the suffix length in the token
6534                                  *   length
6535                                  */
6536                                 arg_len += 8;
6537 
6538                                 /*
6539                                  *   the flag byte is the #foreach flag,
6540                                  *   which is a one-byte sequence
6541                                  */
6542                                 flag_byte = TOK_MACRO_FOREACH_FLAG;
6543                                 repl_len = 1;
6544                             }
6545                             else if (memcmp(src.getptr() + 1,
6546                                             "argcount", 8) == 0
6547                                      && !is_sym(src.getch_at(9)))
6548                             {
6549                                 /*
6550                                  *   include the suffix length in the token
6551                                  *   length
6552                                  */
6553                                 arg_len += 9;
6554 
6555                                 /*
6556                                  *   the flag byte is the #argcount flag,
6557                                  *   which is a one-byte sequence
6558                                  */
6559                                 flag_byte = TOK_MACRO_ARGCOUNT_FLAG;
6560                                 repl_len = 1;
6561                             }
6562                             else if (memcmp(src.getptr() + 1,
6563                                             "ifempty", 7) == 0
6564                                      && !is_sym(src.getch_at(8)))
6565                             {
6566                                 /* include the length */
6567                                 arg_len += 8;
6568 
6569                                 /* set the one-byte flag */
6570                                 flag_byte = TOK_MACRO_IFEMPTY_FLAG;
6571                                 repl_len = 1;
6572                             }
6573                             else if (memcmp(src.getptr() + 1,
6574                                             "ifnempty", 8) == 0
6575                                      && !is_sym(src.getch_at(9)))
6576                             {
6577                                 /* include the length */
6578                                 arg_len += 9;
6579 
6580                                 /* set the one-byte flag */
6581                                 flag_byte = TOK_MACRO_IFNEMPTY_FLAG;
6582                                 repl_len = 1;
6583                             }
6584                         }
6585 
6586                         /*
6587                          *   calculate the new length - we're removing the
6588                          *   argument name and adding the replacement string
6589                          *   in its place
6590                          */
6591                         new_len = expan_len + repl_len - arg_len;
6592 
6593                         /*
6594                          *   we need repl_len bytes for the replacement -
6595                          *   if this is more than we're replacing, make
6596                          *   sure we have room for the extra
6597                          */
6598                         if (new_len > expan_len)
6599                             expbuf_.ensure_space(new_len);
6600 
6601                         /*
6602                          *   copy everything up to but not including the
6603                          *   formal name
6604                          */
6605                         if (tok.get_text() > start)
6606                         {
6607                             /* store the text */
6608                             memcpy(expbuf_.get_buf() + dstofs,
6609                                    start, tok.get_text() - start);
6610 
6611                             /* move past the stored text in the output */
6612                             dstofs += tok.get_text() - start;
6613                         }
6614 
6615                         /* the next segment starts after this token */
6616                         start = tok.get_text() + arg_len;
6617 
6618                         /* store the flag byte */
6619                         expbuf_.get_buf()[dstofs++] = flag_byte;
6620 
6621                         /*
6622                          *   If appropriate, store the argument index - this
6623                          *   always fits in one byte because our hard limit
6624                          *   on formal parameters is less than 128 per
6625                          *   macro.  Note that we add one to the index so
6626                          *   that we never store a zero byte, to avoid any
6627                          *   potential confusion with a null terminator
6628                          *   byte.
6629                          */
6630                         if (repl_len > 1)
6631                             expbuf_.get_buf()[dstofs++] = (char)(i + 1);
6632 
6633                         /* remember the new length */
6634                         expan_len = new_len;
6635 
6636                         /* no need to search further for it */
6637                         break;
6638                     }
6639                 }
6640             }
6641         }
6642 
6643         /* copy the last segment */
6644         if (tok.get_text() > start)
6645         {
6646             /* store the text */
6647             memcpy(expbuf_.get_buf() + dstofs, start,
6648                    tok.get_text() - start);
6649         }
6650 
6651         /* set the new length */
6652         expbuf_.set_text_len(expan_len);
6653 
6654         /* use the modified expansion text instead of the original */
6655         expan = expbuf_.get_text();
6656     }
6657 
6658     /*
6659      *   check the symbol table to see if this symbol is already defined -
6660      *   if so, show a warning, but honor the new definition
6661      */
6662     entry = find_define(macro_name, macro_len);
6663     if (entry != 0)
6664     {
6665         /*
6666          *   Check for a trivial redefinition - if the number of arguments
6667          *   is the same, and the type (object-like or function-like) is
6668          *   the same, and the expansion string is identical, there's no
6669          *   need to warn, because the redefinition has no effect and can
6670          *   thus be safely ignored.  Note that we must ignore any
6671          *   differences in the whitespace in the expansions for this
6672          *   comparison.
6673          */
6674         if ((entry->has_args() != 0) == (has_args != 0)
6675             && entry->get_argc() == argc
6676             && lib_strequal_collapse_spaces(expan, expan_len,
6677                                             entry->get_expansion(),
6678                                             entry->get_expan_len()))
6679         {
6680             /* it's a trivial redefinition - ignore it */
6681             goto done;
6682         }
6683 
6684         /* log a warning about the redefinition */
6685         log_warning(TCERR_MACRO_REDEF, (int)macro_len, macro_name);
6686 
6687         /* remove and delete the old entry */
6688         defines_->remove(entry);
6689 
6690         /* if the item isn't already in the #undef table, add it */
6691         if (find_undef(macro_name, macro_len) == 0)
6692         {
6693             /*
6694              *   move the entry to the #undef table so that we can keep track
6695              *   of the fact that this macro's definition has changed in the
6696              *   course of the compilation
6697              */
6698             undefs_->add(entry);
6699         }
6700         else
6701         {
6702             /*
6703              *   the name is already in the #undef table, so we don't need
6704              *   another copy - just forget about the old entry entirely
6705              */
6706             delete entry;
6707         }
6708     }
6709 
6710     /* create an entry for the new macro */
6711     entry = new CTcHashEntryPpDefine(macro_name, macro_len, TRUE,
6712                                      has_args, argc, has_varargs,
6713                                      argv, argvlen, expan, expan_len);
6714 
6715     /* add it to the hash table */
6716     defines_->add(entry);
6717 
6718 done:
6719     /* don't retain the directive in the preprocessed source */
6720     clear_linebuf();
6721 }
6722 
6723 /* ------------------------------------------------------------------------ */
6724 /*
6725  *   Process a #ifdef directive
6726  */
6727 void CTcTokenizer::pp_ifdef()
6728 {
6729     /* process the ifdef/ifndef with a positive sense */
6730     pp_ifdef_or_ifndef(TRUE);
6731 }
6732 
6733 /*
6734  *   Process a #ifndef directive
6735  */
6736 void CTcTokenizer::pp_ifndef()
6737 {
6738     /* process the ifdef/ifndef with a negative sense */
6739     pp_ifdef_or_ifndef(FALSE);
6740 }
6741 
6742 /*
6743  *   Process a #ifdef or #ifndef.  If 'sense' is true, we'll take the
6744  *   branch if the symbol is defined (hence #ifdef), otherwise we'll take
6745  *   it if the symbol isn't defined (hence #ifndef).
6746  */
6747 void CTcTokenizer::pp_ifdef_or_ifndef(int sense)
6748 {
6749     char macro_name[TOK_SYM_MAX_BUFFER];
6750     int found;
6751     tok_if_t state;
6752 
6753     /* make sure we have a valid symbol */
6754     if (pp_get_lone_ident(macro_name, sizeof(macro_name)))
6755     {
6756         /* clear the line buffer */
6757         clear_linebuf();
6758 
6759         /*
6760          *   push a true if to avoid cascading errors for matching #endif
6761          *   or #else
6762          */
6763         push_if(TOKIF_IF_YES);
6764 
6765         /* we're done */
6766         return;
6767     }
6768 
6769     /* check to see if it's defined */
6770     found = (find_define(macro_name, strlen(macro_name)) != 0);
6771 
6772     /*
6773      *   if we found it and they wanted it found, or we didn't find it and
6774      *   they didn't want it found, take a true branch; otherwise, take a
6775      *   false branch
6776      */
6777     if ((sense != 0) == (found != 0))
6778         state = TOKIF_IF_YES;
6779     else
6780         state = TOKIF_IF_NO;
6781 
6782     /* push the new #if state */
6783     push_if(state);
6784 
6785     /* don't retain the directive in the preprocessed source */
6786     clear_linebuf();
6787 }
6788 
6789 /* ------------------------------------------------------------------------ */
6790 /*
6791  *   Process a #if directive
6792  */
6793 void CTcTokenizer::pp_if()
6794 {
6795     CTcConstVal val;
6796 
6797     /* expand macros; don't allow reading additional lines */
6798     if (expand_macros_curline(FALSE, TRUE, FALSE))
6799         goto do_error;
6800 
6801     /*
6802      *   we don't need the original source line any more, and we don't
6803      *   want to copy it to the preprocessed output, so clear it
6804      */
6805     clear_linebuf();
6806 
6807     /* parse out of the expansion buffer */
6808     start_new_line(expbuf_.get_buf(), expbuf_.get_text_len());
6809 
6810     /* parse the preprocessor expression */
6811     if (pp_parse_expr(&val, TRUE, TRUE, TRUE))
6812     {
6813         /*
6814          *   we can't get a value; treat the expression as true and
6815          *   continue parsing, so that we don't throw off the #if nesting
6816          *   level
6817          */
6818         val.set_bool(TRUE);
6819     }
6820 
6821     /* push the new state according to the value of the expression */
6822     push_if(val.get_val_bool() ? TOKIF_IF_YES : TOKIF_IF_NO);
6823 
6824     /* done */
6825     return;
6826 
6827 do_error:
6828     /* clear the line buffer */
6829     clear_linebuf();
6830 
6831     /*
6832      *   push a true if - even though we can't evaluate the condition, we
6833      *   can at least avoid a cascade of errors for the matching #endif
6834      *   and #else
6835      */
6836     push_if(TOKIF_IF_YES);
6837 }
6838 
6839 /* ------------------------------------------------------------------------ */
6840 /*
6841  *   Process a #elif directive
6842  */
6843 void CTcTokenizer::pp_elif()
6844 {
6845     CTcConstVal val;
6846 
6847     /* expand macros; don't allow reading additional lines */
6848     if (expand_macros_curline(FALSE, TRUE, FALSE))
6849     {
6850         clear_linebuf();
6851         return;
6852     }
6853 
6854     /* parse out of the expansion buffer */
6855     start_new_line(expbuf_.get_buf(), expbuf_.get_text_len());
6856 
6857     /* parse the preprocessor expression */
6858     if (pp_parse_expr(&val, TRUE, TRUE, TRUE))
6859     {
6860         clear_linebuf();
6861         return;
6862     }
6863 
6864     /*
6865      *   make sure that the #elif occurs in the same file as the
6866      *   corresponding #if
6867      */
6868     if (if_sp_ <= str_->get_init_if_level())
6869     {
6870         /* log the error */
6871         log_error(TCERR_PP_ELIF_NOT_IN_SAME_FILE);
6872 
6873         /* clear the text and abort */
6874         clear_linebuf();
6875         return;
6876     }
6877 
6878     /* check the current #if state */
6879     switch(get_if_state())
6880     {
6881     case TOKIF_IF_YES:
6882         /*
6883          *   we just took the #if branch, so don't take this or any
6884          *   subsequent #elif or #else branch, regardless of the value of
6885          *   the condition - set the state to DONE to indicate that we're
6886          *   skipping everything through the endif
6887          */
6888         change_if_state(TOKIF_IF_DONE);
6889         break;
6890 
6891     case TOKIF_IF_NO:
6892         /*
6893          *   We haven't yet taken a #if or #elif branch, so we can take
6894          *   this branch if its condition is true.  If this branch's
6895          *   condition is false, stay with NO so that we will consider
6896          *   future #elif and #else branches.
6897          */
6898         if (val.get_val_bool())
6899             change_if_state(TOKIF_IF_YES);
6900         break;
6901 
6902     case TOKIF_IF_DONE:
6903         /*
6904          *   we've already taken a #if or #elif branch, so we must ignore
6905          *   this and subsequent #elif and #else branches until we get to
6906          *   our #endif - just stay in state DONE
6907          */
6908         break;
6909 
6910     case TOKIF_NONE:
6911     case TOKIF_ELSE_YES:
6912     case TOKIF_ELSE_NO:
6913         /*
6914          *   we're not in a #if branch at all, or we're inside a #else; a
6915          *   #elif is not legal here
6916          */
6917         log_error(TCERR_PP_ELIF_WITHOUT_IF);
6918         break;
6919     }
6920 
6921     /* don't retain the directive in the preprocessed source */
6922     clear_linebuf();
6923 }
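
/*
 *   Worked example (for illustration only): given the sequence
 *
 *      #if 0        -> push TOKIF_IF_NO
 *      #elif 1      -> IF_NO + true condition -> TOKIF_IF_YES (taken)
 *      #elif 1      -> IF_YES                 -> TOKIF_IF_DONE (skipped)
 *      #else        -> IF_DONE                -> TOKIF_ELSE_NO (skipped)
 *      #endif       -> pop
 *
 *   at most one branch of the chain is ever processed.
 */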
6924 
6925 /* ------------------------------------------------------------------------ */
6926 /*
6927  *   Process a #else directive
6928  */
6929 void CTcTokenizer::pp_else()
6930 {
6931     /* make sure there's nothing but whitespace on the line */
6932     if (next_on_line() != TOKT_EOF)
6933         log_error(TCERR_PP_EXTRA);
6934 
6935     /*
6936      *   make sure that the #else occurs in the same file as the
6937      *   corresponding #if
6938      */
6939     if (if_sp_ <= str_->get_init_if_level())
6940     {
6941         /* log the error */
6942         log_error(TCERR_PP_ELSE_NOT_IN_SAME_FILE);
6943 
6944         /* clear the text and abort */
6945         clear_linebuf();
6946         return;
6947     }
6948 
6949     /* check our current #if state */
6950     switch(get_if_state())
6951     {
6952     case TOKIF_IF_YES:
6953     case TOKIF_IF_DONE:
6954         /*
6955          *   we've already taken a true #if branch, so we don't want to
6956          *   process the #else part - switch to a false #else branch
6957          */
6958         change_if_state(TOKIF_ELSE_NO);
6959         break;
6960 
6961     case TOKIF_IF_NO:
6962         /*
6963          *   we haven't yet found a true #if branch, so take the #else
6964          *   branch -- switch to a true #else branch
6965          */
6966         change_if_state(TOKIF_ELSE_YES);
6967         break;
6968 
6969     case TOKIF_NONE:
6970     case TOKIF_ELSE_YES:
6971     case TOKIF_ELSE_NO:
6972         /*
6973          *   we're not in a #if at all, or we're in a #else - log an error
6974          *   and ignore it
6975          */
6976         log_error(TCERR_PP_ELSE_WITHOUT_IF);
6977         break;
6978     }
6979 
6980     /* don't retain the directive in the preprocessed source */
6981     clear_linebuf();
6982 }
6983 
6984 /* ------------------------------------------------------------------------ */
6985 /*
6986  *   Process a #endif directive
6987  */
6988 void CTcTokenizer::pp_endif()
6989 {
6990     /* make sure the rest of the line is blank */
6991     if (next_on_line() != TOKT_EOF)
6992         log_error(TCERR_PP_EXTRA);
6993 
6994     /* ignore the rest of the line */
6995     clear_linebuf();
6996 
6997     /* it's an error if we're not in a #if, or not in one in this file */
6998     if (if_sp_ == 0)
6999     {
7000         log_error(TCERR_PP_ENDIF_WITHOUT_IF);
7001         return;
7002     }
7003     else if (if_sp_ <= str_->get_init_if_level())
7004     {
7005         log_error(TCERR_PP_ENDIF_NOT_IN_SAME_FILE);
7006         return;
7007     }
7008 
7009     /* pop a #if level */
7010     pop_if();
7011 
7012     /* don't retain the directive in the preprocessed source */
7013     clear_linebuf();
7014 }
7015 
7016 /* ------------------------------------------------------------------------ */
7017 /*
7018  *   Process a #error directive
7019  */
7020 void CTcTokenizer::pp_error()
7021 {
7022     size_t startofs;
7023 
7024     /*
7025      *   copy the source line through the "error" token to the macro
7026      *   expansion buffer - we don't want to expand that part, but we want
7027      *   it to appear in the expansion, so just copy the original
7028      */
7029     startofs = (curtok_.get_text() + curtok_.get_text_len()
7030                 - linebuf_.get_text());
7031     expbuf_.copy(linebuf_.get_text(), startofs);
7032 
7033     /* expand macros; don't allow reading additional lines */
7034     if (expand_macros_curline(FALSE, FALSE, TRUE))
7035     {
7036         clear_linebuf();
7037         return;
7038     }
7039 
7040     /*
7041      *   If we're in preprocess-only mode, simply retain the text in the
7042      *   processed result, so that the error is processed on a subsequent
7043      *   compilation of the result; otherwise, display the error.
7044      *
7045      *   Ignore #error directives in list-includes mode as well.
7046      */
7047     if (!pp_only_mode_ && !list_includes_mode_)
7048     {
7049         /* display the error */
7050         log_error(TCERR_ERROR_DIRECTIVE,
7051                   (int)expbuf_.get_text_len() - startofs,
7052                   expbuf_.get_text() + startofs);
7053 
7054         /* clear the directive from the result */
7055         clear_linebuf();
7056     }
7057     else
7058     {
7059         /* preprocessing - copy expanded text to line buffer */
7060         linebuf_.copy(expbuf_.get_text(), expbuf_.get_text_len());
7061     }
7062 }
7063 
7064 /* ------------------------------------------------------------------------ */
7065 /*
7066  *   Process a #undef directive
7067  */
7068 void CTcTokenizer::pp_undef()
7069 {
7070     char macro_name[TOK_SYM_MAX_BUFFER];
7071 
7072     /* get the macro name */
7073     if (pp_get_lone_ident(macro_name, sizeof(macro_name)))
7074     {
7075         clear_linebuf();
7076         return;
7077     }
7078 
7079     /* remove it */
7080     undefine(macro_name);
7081 
7082     /* don't retain the directive in the preprocessed source */
7083     clear_linebuf();
7084 }
7085 
7086 /*
7087  *   Programmatically delete a preprocessor symbol
7088  */
7089 void CTcTokenizer::undefine(const char *sym, size_t len)
7090 {
7091     CTcHashEntryPp *entry;
7092 
7093     /*
7094      *   find the macro - if it wasn't defined, silently ignore it, since
7095      *   it's legal to #undef a symbol that wasn't previously defined
7096      */
7097     entry = find_define(sym, len);
7098     if (entry != 0 && entry->is_undefable())
7099     {
7100         /* remove it */
7101         defines_->remove(entry);
7102 
7103         /* if it's not already in the #undef table, move it there */
7104         if (find_undef(sym, len) == 0)
7105         {
7106             /* move it to the #undef table */
7107             undefs_->add(entry);
7108         }
7109         else
7110         {
7111             /*
7112              *   the name is already in the #undef table, so we don't need to
7113              *   add it again - we can forget about this entry entirely
7114              */
7115             delete entry;
7116         }
7117     }
7118 }
7119 
7120 /* ------------------------------------------------------------------------ */
7121 /*
7122  *   Process a #line directive
7123  */
7124 void CTcTokenizer::pp_line()
7125 {
7126     CTcConstVal val_line;
7127     CTcConstVal val_fname;
7128     CTcTokFileDesc *desc;
7129 
7130     /* expand macros; don't allow reading additional lines */
7131     if (expand_macros_curline(FALSE, TRUE, FALSE))
7132     {
7133         clear_linebuf();
7134         return;
7135     }
7136 
7137     /*
7138      *   we don't need the original source line any more, and we don't
7139      *   want to copy it to the preprocessed output, so clear it
7140      */
7141     clear_linebuf();
7142 
7143     /* set up to parse from the expansion */
7144     start_new_line(expbuf_.get_buf(), expbuf_.get_text_len());
7145 
7146     /* evaluate the line number expression */
7147     if (pp_parse_expr(&val_line, TRUE, FALSE, TRUE))
7148         return;
7149 
7150     /* if it's not an integer constant, it's an error */
7151     if (val_line.get_type() != TC_CVT_INT)
7152     {
7153         log_error(TCERR_LINE_REQ_INT);
7154         return;
7155     }
7156 
7157     /* evaluate the filename expression */
7158     if (pp_parse_expr(&val_fname, FALSE, TRUE, TRUE))
7159         return;
7160 
7161     /* the filename must be a string expression */
7162     if (val_fname.get_type() != TC_CVT_SSTR)
7163     {
7164         log_error(TCERR_LINE_FILE_REQ_STR);
7165         return;
7166     }
7167 
7168     /* find or create a descriptor for the filename */
7169     desc = get_file_desc(val_fname.get_val_str(),
7170                          val_fname.get_val_str_len(), FALSE, 0, 0);
7171 
7172     /* set the new line number and descriptor in the current stream */
7173     if (str_ != 0)
7174     {
7175         str_->set_next_linenum(val_line.get_val_int());
7176         str_->set_desc(desc);
7177     }
7178 
7179     /*
7180      *   retain the directive in the result if we're in preprocess-only
7181      *   mode, otherwise remove it
7181      *   otherwise remove it
7182      */
7183     if (!pp_only_mode_)
7184         clear_linebuf();
7185 }
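
/*
 *   Usage sketch (illustrative): a directive such as
 *
 *      #line 250 "kitchen.t"
 *
 *   makes the next line of the current stream report as line 250 of
 *   "kitchen.t" in error messages and debug records; both operands are
 *   macro-expanded first and must evaluate to an integer and a string
 *   constant, respectively.
 */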
7186 
7187 /* ------------------------------------------------------------------------ */
7188 /*
7189  *   Look up a symbol in the #define symbol table
7190  */
7191 CTcHashEntryPp *CTcTokenizer::find_define(const char *sym, size_t len) const
7192 {
7193     /* look it up in the #define symbol table and return the result */
7194     return (CTcHashEntryPp *)defines_->find(sym, len);
7195 }
7196 
7197 /*
7198  *   Look up a symbol in the #undef table
7199  */
7200 CTcHashEntryPp *CTcTokenizer::find_undef(const char *sym, size_t len) const
7201 {
7202     /* look it up in the #undef symbol table and return the result */
7203     return (CTcHashEntryPp *)undefs_->find(sym, len);
7204 }
7205 
7206 /*
7207  *   Add a preprocessor macro definition
7208  */
7209 void CTcTokenizer::add_define(const char *sym, size_t len,
7210                               const char *expansion, size_t expan_len)
7211 {
7212     CTcHashEntryPp *entry;
7213 
7214     /* create an entry for the macro, with no argument list */
7215     entry = new CTcHashEntryPpDefine(sym, len, TRUE, FALSE, 0, FALSE, 0, 0,
7216                                      expansion, expan_len);
7217 
7218     /* add the new entry to the table */
7219     defines_->add(entry);
7220 }
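
/*
 *   Usage sketch (hypothetical call site): a "-D" command-line option could
 *   be registered as
 *
 *      G_tok->add_define("SYMBOL", 6, "1", 1);
 *
 *   which creates an object-like (argumentless) macro that expands to "1".
 */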
7221 
7222 /*
7223  *   Add a preprocessor macro definition
7224  */
7225 void CTcTokenizer::add_define(CTcHashEntryPp *entry)
7226 {
7227     /* add the entry to our symbol table */
7228     defines_->add(entry);
7229 }
7230 
7231 /*
7232  *   parse an expression
7233  */
7234 int CTcTokenizer::pp_parse_expr(CTcConstVal *val, int read_first,
7235                                 int last_on_line, int add_line_ending)
7236 {
7237     CTcPrsNode *expr_tree;
7238     char ch;
7239 
7240     /* add the line ending marker if required */
7241     if (add_line_ending)
7242     {
7243         /*
7244          *   append the special end-of-preprocess-line to the macro
7245          *   expansion buffer
7246          */
7247         ch = TOK_END_PP_LINE;
7248         expbuf_.append(&ch, 1);
7249     }
7250 
7251     /*
7252      *   note that we're parsing a preprocessor expression; this affects
7253      *   error logging in certain cases
7254      */
7255     in_pp_expr_ = TRUE;
7256 
7257     /*
7258      *   parse the expression in preprocessor mode, so that double-quoted
7259      *   strings can be concatenated and compared
7260      */
7261     G_prs->set_pp_expr_mode(TRUE);
7262 
7263     /* get the first token on the line if desired */
7264     if (read_first)
7265         next();
7266 
7267     /* parse the expression */
7268     expr_tree = G_prs->parse_expr();
7269 
7270     /* make sure we're at the end of the line if desired */
7271     if (last_on_line && next() != TOKT_EOF)
7272         log_error(TCERR_PP_EXPR_EXTRA);
7273 
7274     /* if we added the special pp-line-ending marker, remove it */
7275     if (add_line_ending)
7276     {
7277         /*
7278          *   the marker is always the last character - remove it simply by
7279          *   shortening the buffer by a character
7280          */
7281         expbuf_.set_text_len(expbuf_.get_text_len() - 1);
7282     }
7283 
7284     /* return to normal expression mode */
7285     G_prs->set_pp_expr_mode(FALSE);
7286 
7287     /* return to normal tokenizing mode */
7288     in_pp_expr_ = FALSE;
7289 
7290     /* if we didn't get a valid expression, return failure */
7291     if (expr_tree == 0)
7292         return 1;
7293 
7294     /* make sure we got a constant */
7295     if (!expr_tree->is_const())
7296     {
7297         log_error(TCERR_PP_EXPR_NOT_CONST);
7298         return 1;
7299     }
7300 
7301     /* fill in the caller's value */
7302     *val = *expr_tree->get_const_val();
7303 
7304     /* success */
7305     return 0;
7306 }
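
/*
 *   Usage sketch (illustrative): #if and #elif lines arrive here after macro
 *   expansion, so a directive such as
 *
 *      #if defined(DEBUG) && VERSION > 2
 *
 *   is re-tokenized from the expansion buffer, parsed in preprocessor mode
 *   via G_prs->parse_expr(), and must fold to a constant; the boolean value
 *   of that constant then drives push_if()/change_if_state().
 */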
7307 
7308 /* ------------------------------------------------------------------------ */
7309 /*
7310  *   #define enumeration callback context
7311  */
7312 struct def_enum_cb_t
7313 {
7314     /* original callback function */
7315     void (*cb)(void *, CTcHashEntryPp *);
7316 
7317     /* original callback context */
7318     void *ctx;
7319 };
7320 
7321 /*
7322  *   #define enumeration callback.  This is a simple impedance matcher on the
7323  *   way to the real callback; we cast the generic hash entry type to the
7324  *   CTcHashEntryPp subclass for the benefit of the real callback.
7325  */
7326 static void enum_defines_cb(void *ctx0, CVmHashEntry *entry)
7327 {
7328     def_enum_cb_t *ctx;
7329 
7330     /* get our real context */
7331     ctx = (def_enum_cb_t *)ctx0;
7332 
7333     /* invoke the real callback, casting the entry reference appropriately */
7334     (*ctx->cb)(ctx->ctx, (CTcHashEntryPp *)entry);
7335 }
7336 
7337 /*
7338  *   Enumerate the entries in the #define table through a callback
7339  */
7340 void CTcTokenizer::enum_defines(void (*cb)(void *, CTcHashEntryPp *),
7341                                 void *ctx)
7342 {
7343     def_enum_cb_t myctx;
7344 
7345     /* set up our impedance-matcher context with the real callback info */
7346     myctx.cb = cb;
7347     myctx.ctx = ctx;
7348 
7349     /* enumerate through our impedance-matcher callback */
7350     defines_->enum_entries(&enum_defines_cb, &myctx);
7351 }
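
/*
 *   Usage sketch (the callback below is hypothetical): to count the macros
 *   currently #define'd, a caller could write
 *
 *      static void count_def(void *ctx, CTcHashEntryPp *) { ++*(int *)ctx; }
 *
 *      int n = 0;
 *      G_tok->enum_defines(&count_def, &n);
 */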
7352 
7353 /* ------------------------------------------------------------------------ */
7354 /*
7355  *   Get a lone identifier for a preprocessor directive.  The identifier
7356  *   must be the only thing left on the line; we'll generate an error if
7357  *   extra characters follow on the line.
7358  *
7359  *   If there's no identifier on the line, or there's more information
7360  *   after the identifier, logs an error and returns non-zero; returns
7361  *   zero on success.
7362  */
7363 int CTcTokenizer::pp_get_lone_ident(char *buf, size_t bufl)
7364 {
7365     /* get the next token, and make sure it's a symbol */
7366     if (next_on_line() != TOKT_SYM)
7367     {
7368         log_error_curtok(TCERR_BAD_DEFINE_SYM);
7369         return 1;
7370     }
7371 
7372     /* return an error if it doesn't fit, counting the null terminator */
7373     if (curtok_.get_text_len() + 1 > bufl)
7374         return 1;
7375 
7376     /* copy the text */
7377     memcpy(buf, curtok_.get_text(), curtok_.get_text_len());
7378     buf[curtok_.get_text_len()] = '\0';
7379 
7380     /* make sure there's nothing else on the line but whitespace */
7381     if (next_on_line() != TOKT_EOF)
7382     {
7383         log_error(TCERR_PP_EXTRA);
7384         return 1;
7385     }
7386 
7387     /* success */
7388     return 0;
7389 }
7390 
7391 /* ------------------------------------------------------------------------ */
7392 /*
7393  *   Push a new #if level
7394  */
7395 void CTcTokenizer::push_if(tok_if_t state)
7396 {
7397     /* if we're out of space in the stack, throw a fatal error */
7398     if (if_sp_ == TOK_MAX_IF_NESTING)
7399         throw_fatal_error(TCERR_IF_NESTING_OVERFLOW);
7400 
7401     /*
7402      *   if we're in a nested #if in a false #if, increase the nested
7403      *   false #if level
7404      */
7405     if (in_false_if())
7406         ++if_false_level_;
7407 
7408     /* push the state, remembering where the #if was defined */
7409     if_stack_[if_sp_].desc = last_desc_;
7410     if_stack_[if_sp_].linenum = last_linenum_;
7411     if_stack_[if_sp_++].state = state;
7412 }
7413 
7414 /*
7415  *   Pop a #if level
7416  */
7417 void CTcTokenizer::pop_if()
7418 {
7419     /* if we're in a nested #if in a false #if, pop the nesting level */
7420     if (if_false_level_ != 0)
7421         --if_false_level_;
7422 
7423     /* pop the main if level */
7424     if (if_sp_ != 0)
7425         --if_sp_;
7426 }
7427 
7428 
7429 /* ------------------------------------------------------------------------ */
7430 /*
7431  *   Log an error
7432  */
7433 void CTcTokenizer::log_error(int errnum, ...)
7434 {
7435     va_list marker;
7436 
7437     /* display the message */
7438     va_start(marker, errnum);
7439     G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7440                           TC_SEV_ERROR, errnum, marker);
7441     va_end(marker);
7442 }
7443 
7444 /*
7445  *   Log an error with the current token's text as the parameter data,
7446  *   suitable for use with a "%.*s" display format entry
7447  */
7448 void CTcTokenizer::log_error_curtok(int errnum)
7449 {
7450     /*
7451      *   display the message, passing "%.*s" parameter data for the
7452      *   current token text: an integer giving the length of the token
7453      *   text, and a pointer to the token text
7454      */
7455     log_error_or_warning_curtok(TC_SEV_ERROR, errnum);
7456 }
7457 
7458 /*
7459  *   Log an error or warning for the current token
7460  */
7461 void CTcTokenizer::log_error_or_warning_curtok(tc_severity_t sev, int errnum)
7462 {
7463     /* log the error with our current token */
7464     log_error_or_warning_with_tok(sev, errnum, getcur());
7465 }
7466 
7467 /*
7468  *   Log an error or warning with the given token
7469  */
7470 void CTcTokenizer::log_error_or_warning_with_tok(
7471     tc_severity_t sev, int errnum, const CTcToken *tok)
7472 {
7473     const char *tok_txt;
7474     size_t tok_len;
7475     char buf[128];
7476     const char *prefix;
7477     const char *suffix;
7478     utf8_ptr src;
7479     utf8_ptr dst;
7480     size_t rem;
7481     size_t outchars;
7482 
7483     /* see what we have */
7484     switch(tok->gettyp())
7485     {
7486     case TOKT_SSTR:
7487         /* show the string in quotes, but limit the length */
7488         prefix = "'";
7489         suffix = "'";
7490         goto format_string;
7491 
7492     case TOKT_DSTR:
7493         prefix = "\"";
7494         suffix = "\"";
7495         goto format_string;
7496 
7497     case TOKT_DSTR_START:
7498         prefix = "\"";
7499         suffix = "<<";
7500         goto format_string;
7501 
7502     case TOKT_DSTR_MID:
7503         prefix = ">>";
7504         suffix = "<<";
7505         goto format_string;
7506 
7507     case TOKT_DSTR_END:
7508         prefix = ">>";
7509         suffix = "\"";
7510         goto format_string;
7511 
7512     format_string:
7513         /* set the prefix */
7514         strcpy(buf, prefix);
7515 
7516         /*
7517          *   show the string, but limit the length, and convert control
7518          *   characters to escaped representation
7519          */
7520         src.set((char *)tok->get_text());
7521         rem = tok->get_text_len();
7522         for (dst.set(buf  + strlen(buf)), outchars = 0 ;
7523              rem != 0 && outchars < 20 ; src.inc(&rem), ++outchars)
7524         {
7525             /* if this is a control character, escape it */
7526             if (src.getch() < 32)
7527             {
7528                 dst.setch('\\');
7529 
7530                 switch(src.getch())
7531                 {
7532                 case 10:
7533                     dst.setch('n');
7534                     break;
7535 
7536                 case 0x000F:
7537                     dst.setch('^');
7538                     break;
7539 
7540                 case 0x000E:
7541                     dst.setch('v');
7542                     break;
7543 
7544                 case 0x000B:
7545                     dst.setch('b');
7546                     break;
7547 
7548                 case 0x0015:
7549                     dst.setch(' ');
7550                     break;
7551 
7552                 case 9:
7553                     dst.setch('t');
7554                     break;
7555 
7556                 default:
7557                     dst.setch('x');
7558                     dst.setch('0' + ((src.getch() >> 12) & 0xf));
7559                     dst.setch('0' + ((src.getch() >> 8) & 0xf));
7560                     dst.setch('0' + ((src.getch() >> 4) & 0xf));
7561                     dst.setch('0' + (src.getch() & 0xf));
7562                     break;
7563                 }
7564             }
7565             else
7566             {
7567                 /* put this character as-is */
7568                 dst.setch(src.getch());
7569             }
7570         }
7571 
7572         /* if there's more string left, add "..." */
7573         if (rem != 0)
7574         {
7575             dst.setch('.');
7576             dst.setch('.');
7577             dst.setch('.');
7578         }
7579 
7580         /* add the suffix */
7581         strcpy(dst.getptr(), suffix);
7582 
7583         /* use this buffer as the token string to display */
7584         tok_txt = buf;
7585         tok_len = strlen(tok_txt);
7586         break;
7587 
7588     case TOKT_EOF:
7589         /* show a special "<End Of File>" marker */
7590         tok_txt = "<End Of File>";
7591         tok_len = strlen(tok_txt);
7592         break;
7593 
7594     default:
7595         /* just show the current token text */
7596         tok_txt = tok->get_text();
7597         tok_len = tok->get_text_len();
7598         break;
7599     }
7600 
7601     /* log the error */
7602     G_tcmain->log_error(get_last_desc(), get_last_linenum(),
7603                         sev, errnum, tok_len, tok_txt);
7604 }
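
/*
 *   Illustrative example (not from the original source): a single-quoted
 *   string token whose text is "line one<newline>line two ..." would be
 *   reported as something like
 *
 *      'line one\nline two...'
 *
 *   i.e. wrapped in the appropriate quotes, with control characters shown
 *   as escapes and the text truncated to roughly 20 characters plus "...".
 */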
7605 
7606 /*
7607  *   Log a warning
7608  */
7609 void CTcTokenizer::log_warning(int errnum, ...)
7610 {
7611     va_list marker;
7612 
7613     /* display the message */
7614     va_start(marker, errnum);
7615     G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7616                           TC_SEV_WARNING, errnum, marker);
7617     va_end(marker);
7618 }
7619 
7620 /*
7621  *   Log a warning with the current token's text as the parameter data,
7622  *   suitable for use with a "%.*s" display format entry
7623  */
7624 void CTcTokenizer::log_warning_curtok(int errnum)
7625 {
7626     /*
7627      *   display the warning message, passing "%.*s" parameter data for
7628      *   the current token text: an integer giving the length of the token
7629      *   text, and a pointer to the token text
7630      */
7631     log_error_or_warning_curtok(TC_SEV_WARNING, errnum);
7632 }
7633 
7634 /*
7635  *   Log and throw an internal error
7636  */
7637 void CTcTokenizer::throw_internal_error(int errnum, ...)
7638 {
7639     va_list marker;
7640 
7641     /* display the message */
7642     va_start(marker, errnum);
7643     G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7644                           TC_SEV_INTERNAL, errnum, marker);
7645     va_end(marker);
7646 
7647     /* throw the generic internal error, since we've logged this */
7648     err_throw(TCERR_INTERNAL_ERROR);
7649 }
7650 
7651 /*
7652  *   Log and throw a fatal error
7653  */
7654 void CTcTokenizer::throw_fatal_error(int errnum, ...)
7655 {
7656     va_list marker;
7657 
7658     /* display the message */
7659     va_start(marker, errnum);
7660     G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7661                           TC_SEV_FATAL, errnum, marker);
7662     va_end(marker);
7663 
7664     /* throw the generic fatal error, since we've logged this */
7665     err_throw(TCERR_FATAL_ERROR);
7666 }
7667 
7668 /*
7669  *   display a string value
7670  */
7671 void CTcTokenizer::msg_str(const char *str, size_t len) const
7672 {
7673     /* display the string through the host interface */
7674     G_hostifc->print_msg("%.*s", (int)len, str);
7675 }
7676 
7677 /*
7678  *   display a numeric value
7679  */
7680 void CTcTokenizer::msg_long(long val) const
7681 {
7682     /* display the number through the host interface */
7683     G_hostifc->print_msg("%ld", val);
7684 }
7685 
7686 /* ------------------------------------------------------------------------ */
7687 /*
7688  *   Tokenizer Input Stream implementation
7689  */
7690 
7691 /*
7692  *   create a token input stream
7693  */
7694 CTcTokStream::CTcTokStream(CTcTokFileDesc *desc, CTcSrcObject *src,
7695                            CTcTokStream *parent, int charset_error,
7696                            int init_if_level)
7697 {
7698     /* remember the underlying source file */
7699     src_ = src;
7700 
7701     /* remember the file descriptor */
7702     desc_ = desc;
7703 
7704     /* remember the containing stream */
7705     parent_ = parent;
7706 
7707     /* the next line to read is line number 1 */
7708     next_linenum_ = 1;
7709 
7710     /* remember if there was a #charset error */
7711     charset_error_ = charset_error;
7712 
7713     /* we're not in a comment yet */
7714     in_comment_ = FALSE;
7715 
7716     /* remember the starting #if level */
7717     init_if_level_ = init_if_level;
7718 
7719 #if 0 // #pragma C is not currently used
7720     /*
7721      *   start out in parent's pragma C mode, or in non-C mode if we have
7722      *   no parent
7723      */
7724     if (parent != 0)
7725         pragma_c_ = parent->is_pragma_c();
7726     else
7727         pragma_c_ = TRUE;
7728 #endif
7729 }
7730 
7731 /*
7732  *   delete a token input stream
7733  */
7734 CTcTokStream::~CTcTokStream()
7735 {
7736     /* we own the underlying file, so delete it */
7737     if (src_ != 0)
7738         delete src_;
7739 }
7740 
7741 /* ------------------------------------------------------------------------ */
7742 /*
7743  *   File Descriptor
7744  */
7745 
7746 /*
7747  *   Get the length of a string with each instance of the given quote
7748  *   character escaped with a backslash.  We'll also count the escapes we
7749  *   need for each backslash.
7750  */
7751 static size_t get_quoted_len(const char *str, wchar_t qu)
7752 {
7753     utf8_ptr p;
7754     size_t len;
7755 
7756     /*
7757      *   scan the string for instances of the quote mark; each one adds an
7758      *   extra byte to the length needed, since each one requires a
7759      *   backslash character to escape the quote mark
7760      */
7761     for (p.set((char *)str), len = strlen(str) ; p.getch() != '\0' ; p.inc())
7762     {
7763         wchar_t ch;
7764 
7765         /*
7766          *   check to see if this character is quotable - it is quotable if
7767          *   it's a backslash or it's the quote character we're escaping
7768          */
7769         ch = p.getch();
7770         if (ch == qu || ch == '\\')
7771         {
7772             /*
7773              *   we need to escape this character, so add a byte for the
7774              *   backslash we'll need to insert
7775              */
7776             ++len;
7777         }
7778     }
7779 
7780     /* return the length we calculated */
7781     return len;
7782 }
7783 
7784 /*
7785  *   Build a quoted string.  Fills in dst with the source string with each
7786  *   of the given quote marks and each backslash escaped with a backslash.
7787  *   Use get_quoted_len() to determine how much space to allocate for the
7788  *   destination buffer.
7789  */
7790 static void build_quoted_str(char *dstbuf, const char *src, wchar_t qu)
7791 {
7792     utf8_ptr p;
7793     utf8_ptr dst;
7794 
7795     /* scan the source string for escapable characters */
7796     for (p.set((char *)src), dst.set(dstbuf), dst.setch(qu) ;
7797          p.getch() != '\0' ; p.inc())
7798     {
7799         wchar_t ch;
7800 
7801         /* get this source character */
7802         ch = p.getch();
7803 
7804         /* add a quote if we have a backslash or the quote character */
7805         if (ch == '\\' || ch == qu)
7806         {
7807             /* add a backslash to escape the character */
7808             dst.setch('\\');
7809         }
7810 
7811         /* add the character */
7812         dst.setch(ch);
7813     }
7814 
7815     /* add the close quote and trailing null */
7816     dst.setch(qu);
7817     dst.setch('\0');
7818 }
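
/*
 *   Worked example (illustrative): for the source name C:\games\deep.t and
 *   qu = '\'', the output buffer receives the literal text
 *
 *      'C:\\games\\deep.t'
 *
 *   i.e. the value is wrapped in the quote character and each backslash or
 *   embedded quote gains a preceding backslash.  get_quoted_len() returns
 *   the escaped length of the body, so callers allocate that plus three
 *   more bytes for the two quotes and the trailing null, as the descriptor
 *   constructor below does.
 */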
7819 
7820 /*
7821  *   create a file descriptor
7822  */
7823 CTcTokFileDesc::CTcTokFileDesc(const char *fname, size_t fname_len,
7824                                int index, CTcTokFileDesc *orig_desc,
7825                                const char *orig_fname, size_t orig_fname_len)
7826 {
7827     const char *rootname;
7828 
7829     /* no source pages are allocated yet */
7830     src_pages_ = 0;
7831     src_pages_alo_ = 0;
7832 
7833     /* remember the first instance of this filename in the list */
7834     orig_ = orig_desc;
7835 
7836     /* there's nothing else in our chain yet */
7837     next_ = 0;
7838 
7839     /* remember my index in the master list */
7840     index_ = index;
7841 
7842     /* if there's a filename, save a copy of the name */
7843     fname_ = lib_copy_str(fname, fname_len);
7844 
7845     /* if there's an original filename save it as well */
7846     orig_fname_ = lib_copy_str(orig_fname, orig_fname_len);
7847 
7848     /*
7849      *   get the root filename, since we need to build a quoted version of
7850      *   that as well as of the basic filename
7851      */
7852     rootname = os_get_root_name(fname_);
7853 
7854     /*
7855      *   Allocate space for the quoted versions of the filename - make room
7856      *   for the filename plus the quotes (one on each end) and a null
7857      *   terminator byte.
7858      */
7859     dquoted_fname_ = (char *)t3malloc(get_quoted_len(fname_, '"') + 3);
7860     squoted_fname_ = (char *)t3malloc(get_quoted_len(fname_, '\'') + 3);
7861     dquoted_rootname_ = (char *)t3malloc(get_quoted_len(rootname, '"') + 3);
7862     squoted_rootname_ = (char *)t3malloc(get_quoted_len(rootname, '\'') + 3);
7863 
7864     /* build the quoted version of the name */
7865     build_quoted_str(dquoted_fname_, fname_, '"');
7866     build_quoted_str(squoted_fname_, fname_, '\'');
7867     build_quoted_str(dquoted_rootname_, rootname, '"');
7868     build_quoted_str(squoted_rootname_, rootname, '\'');
7869 }
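
/*
 *   For illustration (hypothetical filename, not taken from the source):
 *   given an fname_ of C:\tads\games\test.t, the members built above
 *   would hold
 *
 *     dquoted_fname_    = C:\\tads\\games\\test.t
 *     squoted_fname_    = C:\\tads\\games\\test.t
 *     dquoted_rootname_ = test.t
 *     squoted_rootname_ = test.t
 *
 *   with each string additionally enclosed in the corresponding quote
 *   marks (double or single).  Presumably this lets the tokenizer splice
 *   the filename directly into either kind of string literal - for
 *   example, a __FILE__ expansion - without re-escaping it each time.
 */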

/*
 *   delete the descriptor
 */
CTcTokFileDesc::~CTcTokFileDesc()
{
    /* delete the filename and original filename strings */
    lib_free_str(fname_);
    lib_free_str(orig_fname_);

    /* delete the quotable filename strings */
    t3free(dquoted_fname_);
    t3free(squoted_fname_);
    t3free(dquoted_rootname_);
    t3free(squoted_rootname_);

    /* delete each source page we've allocated */
    if (src_pages_ != 0)
    {
        size_t i;

        /* go through the index array and delete each allocated page */
        for (i = 0 ; i < src_pages_alo_ ; ++i)
        {
            /* if this page was allocated, delete it */
            if (src_pages_[i] != 0)
                t3free(src_pages_[i]);
        }

        /* delete the source page index array */
        t3free(src_pages_);
    }
}

/*
 *   Source page structure.  Each page tracks a block of source lines.
 */
const size_t TCTOK_SRC_PAGE_CNT = 1024;
struct CTcTokSrcPage
{
    /*
     *   Array of line entries on this page.  Each entry is zero if it
     *   hasn't been assigned yet, and contains the absolute image file
     *   address of the generated code for the source line if it has been
     *   assigned.
     */
    ulong ofs[TCTOK_SRC_PAGE_CNT];
};
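
/*
 *   A quick worked example of the paging scheme (the line number is
 *   hypothetical, not from the source): with TCTOK_SRC_PAGE_CNT == 1024,
 *   source line 2500 lands on page 2, slot 452:
 *
 *     page_idx = 2500 / TCTOK_SRC_PAGE_CNT;    // == 2
 *     idx      = 2500 % TCTOK_SRC_PAGE_CNT;    // == 452
 *
 *   Only pages that actually receive at least one line entry are ever
 *   allocated (see add_source_line below), so a sparse set of line
 *   records stays cheap.
 */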


/*
 *   Add a source line
 */
void CTcTokFileDesc::add_source_line(ulong linenum, ulong line_addr)
{
    size_t page_idx;
    size_t idx;

    /* get the index of the page containing this source line */
    page_idx = linenum / TCTOK_SRC_PAGE_CNT;

    /* get the index of the entry within the page */
    idx = linenum % TCTOK_SRC_PAGE_CNT;

    /*
     *   determine if our page index table is large enough, and expand it
     *   if not
     */
    if (page_idx >= src_pages_alo_)
    {
        size_t siz;
        size_t new_alo;

        /* allocate or expand the source pages array */
        new_alo = page_idx + 16;
        siz = new_alo * sizeof(src_pages_[0]);
        if (src_pages_ == 0)
            src_pages_ = (CTcTokSrcPage **)t3malloc(siz);
        else
            src_pages_ = (CTcTokSrcPage **)t3realloc(src_pages_, siz);

        /* clear the new part */
        memset(src_pages_ + src_pages_alo_, 0,
               (new_alo - src_pages_alo_) * sizeof(src_pages_[0]));

        /* remember the new allocation size */
        src_pages_alo_ = new_alo;
    }

    /* if this page isn't allocated, do so now */
    if (src_pages_[page_idx] == 0)
    {
        /* allocate the new page */
        src_pages_[page_idx] = (CTcTokSrcPage *)
                               t3malloc(sizeof(CTcTokSrcPage));

        /* clear it */
        memset(src_pages_[page_idx], 0, sizeof(CTcTokSrcPage));
    }

    /*
     *   if this source line entry has been previously set, don't change
     *   it; otherwise, store the new setting
     */
    if (src_pages_[page_idx]->ofs[idx] == 0)
        src_pages_[page_idx]->ofs[idx] = line_addr;
}

/*
 *   Enumerate source lines
 */
void CTcTokFileDesc::enum_source_lines(void (*cbfunc)(void *, ulong, ulong),
                                       void *cbctx)
{
    size_t page_idx;
    CTcTokSrcPage **pg;

    /* loop over all of the pages */
    for (page_idx = 0, pg = src_pages_ ; page_idx < src_pages_alo_ ;
         ++page_idx, ++pg)
    {
        size_t i;
        ulong linenum;
        ulong *p;

        /* if this page is not populated, skip it */
        if (*pg == 0)
            continue;

        /* calculate the starting line number for this page */
        linenum = page_idx * TCTOK_SRC_PAGE_CNT;

        /* loop over the entries on this page */
        for (i = 0, p = (*pg)->ofs ; i < TCTOK_SRC_PAGE_CNT ;
             ++i, ++p, ++linenum)
        {
            /* if this entry has been set, call the callback */
            if (*p != 0)
                (*cbfunc)(cbctx, linenum, *p);
        }
    }
}
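
/*
 *   Minimal sketch of using the enumerator above.  The callback, context,
 *   and 'desc' descriptor pointer here are hypothetical, not part of the
 *   compiler:
 *
 *     static void count_lines_cb(void *ctx, ulong linenum, ulong addr)
 *     {
 *         // tally one source line that has generated code at image
 *         // address 'addr'
 *         ++*(ulong *)ctx;
 *     }
 *
 *     ulong cnt = 0;
 *     desc->enum_source_lines(&count_lines_cb, &cnt);
 */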

/* ------------------------------------------------------------------------ */
/*
 *   #define symbol table hash entry
 */

/*
 *   create an entry
 */
CTcHashEntryPpDefine::CTcHashEntryPpDefine(const textchar_t *str, size_t len,
                                           int copy, int has_args, int argc,
                                           int has_varargs,
                                           const char **argv,
                                           const size_t *argvlen,
                                           const char *expansion,
                                           size_t expan_len)
    : CTcHashEntryPp(str, len, copy)
{
    /* copy the argument list if necessary */
    has_args_ = has_args;
    has_varargs_ = has_varargs;
    argc_ = argc;
    if (argc != 0)
    {
        int i;

        /* allocate the argument list */
        argv_ = (char **)t3malloc(argc * sizeof(*argv_));

        /* allocate the parameters hash table */
        params_table_ = new CVmHashTable(16, new CVmHashFuncCS(), TRUE);

        /* allocate the entry list */
        arg_entry_ = (CTcHashEntryPpArg **)
                     t3malloc(argc * sizeof(arg_entry_[0]));

        /* copy the arguments */
        for (i = 0 ; i < argc ; ++i)
        {
            CTcHashEntryPpArg *entry;

            /* copy the argument name */
            argv_[i] = lib_copy_str(argv[i], argvlen[i]);

            /*
             *   Create the hash entry for this parameter.  We'll use this
             *   entry to look up tokens in the expansion text for matches
             *   to the formal names when expanding the macro.
             *
             *   Note that we'll refer directly to our local copy of the
             *   argument name, so we don't need to make another copy in
             *   the hash entry.
             */
            entry = new CTcHashEntryPpArg(argv_[i], argvlen[i], FALSE, i);
            params_table_->add(entry);

            /* add it to our by-index list */
            arg_entry_[i] = entry;
        }
    }
    else
    {
        /* no arguments */
        argv_ = 0;
        params_table_ = 0;
        arg_entry_ = 0;
    }

    /* save the expansion */
    expan_ = lib_copy_str(expansion, expan_len);
    expan_len_ = expan_len;
}
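
/*
 *   Sketch of how the constructor's parameters line up for a hypothetical
 *   two-argument macro (the macro is illustrative, not from the source):
 *
 *     #define ADD(a, b) ((a) + (b))
 *
 *   corresponds roughly to
 *
 *     const char *argv[] = { "a", "b" };
 *     const size_t argvlen[] = { 1, 1 };
 *     new CTcHashEntryPpDefine("ADD", 3, TRUE, TRUE, 2, FALSE,
 *                              argv, argvlen, "((a) + (b))", 11);
 *
 *   i.e. has_args true, argc == 2, no varargs, and the expansion stored
 *   as raw text.
 */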

/*
 *   delete
 */
CTcHashEntryPpDefine::~CTcHashEntryPpDefine()
{
    int i;

    /* delete the argument list */
    if (argv_ != 0)
    {
        /* delete each argument string */
        for (i = 0 ; i < argc_ ; ++i)
            lib_free_str(argv_[i]);

        /* delete the argument vector */
        t3free(argv_);

        /* delete the argument entry list */
        t3free(arg_entry_);

        /* delete the hash table */
        delete params_table_;
    }

    /* delete the expansion */
    lib_free_str(expan_);
}

/*
 *   __LINE__ static buffer
 */
char CTcHashEntryPpLINE::buf_[20];


/* ------------------------------------------------------------------------ */
/*
 *   Load macro definitions from a file.
 */
int CTcTokenizer::load_macros_from_file(CVmStream *fp,
                                        CTcTokLoadMacErr *err_handler)
{
    long cnt;
    long i;
    size_t curarg;
    char *argv[TOK_MAX_MACRO_ARGS];
    size_t argvlen[TOK_MAX_MACRO_ARGS];
    size_t maxarg;
    int result;
    char *expan;
    size_t expmaxlen;

    /* we haven't allocated any argument buffers yet */
    maxarg = 0;

    /* allocate an initial expansion buffer */
    expmaxlen = 1024;
    expan = (char *)t3malloc(expmaxlen);

    /* presume success */
    result = 0;

    /* read the number of macros */
    cnt = fp->read_int4();

    /* read each macro */
    for (i = 0 ; i < cnt ; ++i)
    {
        char namebuf[TOK_SYM_MAX_LEN];
        size_t namelen;
        int flags;
        size_t argc;
        size_t explen;
        CTcHashEntryPp *entry;
        int has_args;
        int has_varargs;

        /* read the name's length */
        namelen = fp->read_uint2();
        if (namelen > sizeof(namebuf))
        {
            /* log an error through the handler */
            err_handler->log_error(1);

            /* give up - we can't read any more of the file */
            result = 1;
            goto done;
        }

        /* read the name */
        fp->read_bytes(namebuf, namelen);

        /* read and decode the flags */
        flags = fp->read_uint2();
        has_args = ((flags & 1) != 0);
        has_varargs = ((flags & 2) != 0);

        /* read the number of arguments, and read each argument */
        argc = fp->read_uint2();
        for (curarg = 0 ; curarg < argc ; ++curarg)
        {
            /* read the length, and make sure it's valid */
            argvlen[curarg] = fp->read_uint2();
            if (argvlen[curarg] > TOK_SYM_MAX_LEN)
            {
                /* log an error */
                err_handler->log_error(2);

                /* give up - we can't read any more of the file */
                result = 2;
                goto done;
            }

            /*
             *   if we haven't allocated a buffer for this argument slot yet,
             *   allocate it now; allocate the buffer at the maximum symbol
             *   size, so we can reuse the same buffer for an argument of
             *   other macros we read later
             */
            while (curarg >= maxarg)
                argv[maxarg++] = (char *)t3malloc(TOK_SYM_MAX_LEN);

            /* read the argument text */
            fp->read_bytes(argv[curarg], argvlen[curarg]);
        }

        /* read the expansion size */
        explen = (size_t)fp->read_int4();

        /* expand the expansion buffer if necessary */
        if (explen > expmaxlen)
        {
            /*
             *   overshoot a bit, so that we won't have to reallocate again
             *   if we find a slightly larger expansion for a future macro
             */
            expmaxlen = explen + 512;

            /* allocate the new buffer */
            expan = (char *)t3realloc(expan, expmaxlen);
        }

        /* read the expansion */
        fp->read_bytes(expan, explen);

        /*
         *   Before we create the entry, check to see if there's an existing
         *   entry with the same name.
         */
        entry = find_define(namebuf, namelen);
        if (entry != 0)
        {
            /*
             *   We have another entry.  If the entry is exactly the same,
             *   then we can simply skip the current entry, because we only
             *   want to keep one copy of each macro that's defined
             *   identically in multiple compilation units.  If the entry is
             *   different from the new one, delete both - a macro which
             *   appears in two or more compilation units with different
             *   meanings is NOT a global macro, and thus we can't include it
             *   in the debugging records.
             */
            if (entry->is_pseudo()
                || entry->has_args() != has_args
                || entry->has_varargs() != has_varargs
                || entry->get_argc() != (int)argc
                || entry->get_expan_len() != explen
                || memcmp(entry->get_expansion(), expan, explen) != 0)
            {
                /*
                 *   The existing entry is different from the new entry, so
                 *   the macro has different meanings in different
                 *   compilation units, hence we cannot keep *either*
                 *   definition in the debug records.  Delete the existing
                 *   macro, and do not create the new macro.  If the existing
                 *   macro is a pseudo-macro, keep the old one (since it's
                 *   provided by the compiler itself), but still discard the
                 *   new one.
                 */
                if (!entry->is_pseudo())
                    undefine(namebuf, namelen);
            }
            else
            {
                /*
                 *   The new entry is identical to the old one, so keep it.
                 *   We only need one copy of the entry, though, so simply
                 *   keep the old one - there's no need to create a new entry
                 *   for the object file data.
                 */
            }
        }
        else
        {
            /*
             *   There's no existing macro with the same name, so create a
             *   new entry based on the object file data.
             */
            entry = new CTcHashEntryPpDefine(namebuf, namelen, TRUE,
                                             has_args, argc, has_varargs,
                                             (const char **)argv, argvlen,
                                             expan, explen);

            /* add it to the preprocessor's macro symbol table */
            add_define(entry);
        }
    }

done:
    /* free the argument buffers we allocated */
    for (curarg = 0 ; curarg < maxarg ; ++curarg)
        t3free(argv[curarg]);

    /* free the expansion buffer */
    t3free(expan);

    /* return the result code */
    return result;
}
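
/*
 *   For reference, the record format consumed above (and produced by
 *   write_macros_cb() below) looks like this, using the integer sizes of
 *   the CVmStream/CVmFile read and write calls:
 *
 *     INT4    macro count
 *     per macro:
 *       UINT2   name length
 *       BYTES   name
 *       UINT2   flags (0x0001 = has arguments, 0x0002 = has varargs)
 *       UINT2   argument count
 *       per argument:
 *         UINT2   argument name length
 *         BYTES   argument name
 *       INT4    expansion length
 *       BYTES   expansion text
 */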

/* ------------------------------------------------------------------------ */
/*
 *   Callback context for writing enumerated #define symbols to a file
 */
struct write_macro_ctx_t
{
    /* object file we're writing to */
    CVmFile *fp;

    /* number of symbols written so far */
    unsigned long cnt;
};

/*
 *   Enumeration callback for writing the #define symbols to a file
 */
static void write_macros_cb(void *ctx0, CTcHashEntryPp *entry)
{
    write_macro_ctx_t *ctx = (write_macro_ctx_t *)ctx0;
    int flags;
    int i;
    CVmFile *fp = ctx->fp;

    /*
     *   if this is a pseudo-macro (such as __LINE__ or __FILE__), ignore it
     *   - these macros do not have permanent global definitions, so they're
     *   not usable in the debugger
     */
    if (entry->is_pseudo())
        return;

    /*
     *   If the macro was ever redefined or undefined, ignore it - the
     *   debugger can only use truly global macros, which are macros that
     *   have stable meanings throughout the compilation units where they
     *   appear (and which do not have different meanings in different
     *   compilation units, but that's not our concern at the moment).  The
     *   preprocessor keeps an "undef" table of everything undefined
     *   (explicitly, or implicitly via redefinition), so look up this macro
     *   in the undef table, and ignore the macro if we find it there.
     */
    if (G_tok->find_undef(entry->getstr(), entry->getlen()) != 0)
        return;

    /* count this macro */
    ctx->cnt++;

    /* write the macro's name */
    fp->write_int2(entry->getlen());
    fp->write_bytes(entry->getstr(), entry->getlen());

    /* write the flag bits */
    flags = 0;
    if (entry->has_args()) flags |= 1;
    if (entry->has_varargs()) flags |= 2;
    fp->write_int2(flags);

    /* write the number of arguments, and write each argument */
    fp->write_int2(entry->get_argc());
    for (i = 0 ; i < entry->get_argc() ; ++i)
    {
        CTcHashEntryPpArg *arg;

        /* get the argument */
        arg = entry->get_arg_entry(i);

        /* write the parameter name */
        fp->write_int2(arg->getlen());
        fp->write_bytes(arg->getstr(), arg->getlen());
    }

    /* write the expansion */
    fp->write_int4(entry->get_expan_len());
    fp->write_bytes(entry->get_expansion(), entry->get_expan_len());
}

/*
 *   Write all #define symbols to a file, for debugging purposes.  Writes
 *   only symbols that have never been undefined or redefined, since the
 *   debugger can only make use of global symbols (i.e., symbols with
 *   consistent meanings through all compilation units in which they
 *   appear).
 */
void CTcTokenizer::write_macros_to_file_for_debug(CVmFile *fp)
{
    long pos;
    long endpos;
    write_macro_ctx_t ctx;

    /* write a placeholder for the symbol count */
    pos = fp->get_pos();
    fp->write_int4(0);

    /* write the symbols */
    ctx.fp = fp;
    ctx.cnt = 0;
    enum_defines(&write_macros_cb, &ctx);

    /* go back and fix up the symbol count */
    endpos = fp->get_pos();
    fp->set_pos(pos);
    fp->write_int4(ctx.cnt);

    /* seek back to where we left off */
    fp->set_pos(endpos);
}
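
/*
 *   The placeholder/back-patch pattern above is the general way to prefix
 *   a stream with a count that isn't known until the items have been
 *   written.  A minimal sketch of the same pattern in isolation, using
 *   the CVmFile positioning calls seen above (write_items() here is a
 *   hypothetical helper that returns how many items it wrote):
 *
 *     long pos = fp->get_pos();     // remember where the count belongs
 *     fp->write_int4(0);            // write a placeholder count
 *     ulong n = write_items(fp);    // write the items themselves
 *     long end = fp->get_pos();     // note the end of the data
 *     fp->set_pos(pos);
 *     fp->write_int4(n);            // back-patch the real count
 *     fp->set_pos(end);             // resume writing after the data
 */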