1 #ifdef RCSID
2 static char RCSid[] =
3 "$Header: d:/cvsroot/tads/tads3/tctok.cpp,v 1.5 1999/07/11 00:46:58 MJRoberts Exp $";
4 #endif
5
6 /*
7 * Copyright (c) 1999, 2002 Michael J. Roberts. All Rights Reserved.
8 *
9 * Please see the accompanying license file, LICENSE.TXT, for information
10 * on using and copying this software.
11 */
12 /*
13 Name
14 tctok.cpp - TADS3 compiler tokenizer
15 Function
16
17 Notes
18 The tokenizer features an integrated C-style preprocessor. The
19 preprocessor is integrated into the tokenizer for efficiency; since
 *   the preprocessor uses the same lexical structure as the TADS
21 language, we need only tokenize the input stream once, and the result
22 can be used both for preprocessing and for parsing.
23 Modified
24 04/12/99 MJRoberts - Creation
25 */
26
27 #include <stdio.h>
28 #include <string.h>
29 #include <stdarg.h>
30 #include <time.h>
31
32 #include "os.h"
33 #include "t3std.h"
34 #include "vmerr.h"
35 #include "vmhash.h"
36 #include "tcerr.h"
37 #include "tcerrnum.h"
38 #include "tctok.h"
39 #include "tcsrc.h"
40 #include "tcmain.h"
41 #include "tchost.h"
42 #include "tcprs.h"
43 #include "tctarg.h"
44 #include "charmap.h"
45 #include "vmfile.h"
46
47
48 /* ------------------------------------------------------------------------ */
49 /*
50 * Initialize the tokenizer
51 */
/*
 *   Construct the tokenizer.  'res_loader' is used to locate character-set
 *   mapping resources; 'default_charset' (may be null) names the character
 *   set assumed for source files that don't specify their own.
 */
CTcTokenizer::CTcTokenizer(CResLoader *res_loader,
                           const char *default_charset)
{
    int i;
    time_t timer;
    struct tm *tblk;
    const char *tstr;
    char timebuf[50];

    /* keyword-name/token-type pair for the keyword table below */
    struct kwdef
    {
        const char *kw_text;
        tc_toktyp_t kw_tok_id;
    };

    /* the complete set of reserved words, used to seed the keyword table */
    static const kwdef kwlist[] =
    {
        { "self", TOKT_SELF },
        { "targetprop", TOKT_TARGETPROP },
        { "targetobj", TOKT_TARGETOBJ },
        { "definingobj", TOKT_DEFININGOBJ },
        { "inherited", TOKT_INHERITED },
        { "delegated", TOKT_DELEGATED },
        { "argcount", TOKT_ARGCOUNT },
        { "if", TOKT_IF },
        { "else", TOKT_ELSE },
        { "for", TOKT_FOR },
        { "while", TOKT_WHILE },
        { "do", TOKT_DO },
        { "switch", TOKT_SWITCH },
        { "case", TOKT_CASE },
        { "default", TOKT_DEFAULT },
        { "goto", TOKT_GOTO },
        { "break", TOKT_BREAK },
        { "continue", TOKT_CONTINUE },
//      { "and", TOKT_AND },
//      { "or", TOKT_OR },
//      { "not", TOKT_NOT },
        { "function", TOKT_FUNCTION },
        { "return", TOKT_RETURN },
        { "local", TOKT_LOCAL },
        { "object", TOKT_OBJECT },
        { "nil", TOKT_NIL },
        { "true", TOKT_TRUE },
        { "pass", TOKT_PASS },
        { "external", TOKT_EXTERNAL },
        { "extern", TOKT_EXTERN },
        { "formatstring", TOKT_FORMATSTRING },
        { "class", TOKT_CLASS },
        { "replace", TOKT_REPLACE },
        { "modify", TOKT_MODIFY },
        { "new", TOKT_NEW },
        { "delete", TOKT_DELETE },
        { "throw", TOKT_THROW },
        { "try", TOKT_TRY },
        { "catch", TOKT_CATCH },
        { "finally", TOKT_FINALLY },
        { "intrinsic", TOKT_INTRINSIC },
        { "dictionary", TOKT_DICTIONARY },
        { "grammar", TOKT_GRAMMAR },
        { "enum", TOKT_ENUM },
        { "template", TOKT_TEMPLATE },
        { "static", TOKT_STATIC },
        { "foreach", TOKT_FOREACH },
        { "export", TOKT_EXPORT },
        { "propertyset", TOKT_PROPERTYSET },
        { "transient", TOKT_TRANSIENT },
        { "replaced", TOKT_REPLACED },

        /* type keywords (used in intrinsic function prototypes) */
        { "void", TOKT_VOID },
        /*
         *   NOTE(review): "int" maps to TOKT_INT here, but get_op_text()
         *   names TOKT_INTKW "int" and TOKT_INT "<integer>" - confirm
         *   which token type the keyword is meant to produce
         */
        { "int", TOKT_INT },
        { "string", TOKT_STRING },
        { "list", TOKT_LIST },
        { "boolean", TOKT_BOOLEAN },
        { "property", TOKT_PROPERTY },
        { "any", TOKT_ANY },

        /* end-of-table marker */
        { 0, TOKT_INVALID }
    };
    const kwdef *kwp;

    /* remember my resource loader */
    res_loader_ = res_loader;

    /* there's no stream yet */
    str_ = 0;

    /* no external source yet */
    ext_src_ = 0;

    /* start numbering the file descriptors at zero */
    next_filedesc_id_ = 0;

    /* there are no file descriptors yet */
    desc_head_ = 0;
    desc_tail_ = 0;
    desc_list_ = 0;
    desc_list_cnt_ = desc_list_alo_ = 0;

    /* empty out the input line buffer */
    clear_linebuf();

    /* start out with a minimal line buffer size */
    linebuf_.ensure_space(4096);
    expbuf_.ensure_space(4096);

    /* set up at the beginning of the input line buffer */
    start_new_line(linebuf_.get_buf(), linebuf_.get_text_len());

    /* remember the default character set (we own this copy) */
    default_charset_ = lib_copy_str(default_charset);

    /* we don't have a default character mapper yet */
    default_mapper_ = 0;

    /* create an input mapper for the default character set, if specified */
    if (default_charset != 0)
        default_mapper_ = CCharmapToUni::load(res_loader, default_charset);

    /*
     *   if the default character set wasn't specified, or we failed to
     *   load a mapper for the specified character set, use a plain ASCII
     *   mapper
     */
    if (default_mapper_ == 0)
        default_mapper_ = new CCharmapToUniASCII();

    /* presume we're not in preprocessor-only mode */
    pp_only_mode_ = FALSE;

    /* presume we're not in list-includes mode */
    list_includes_mode_ = FALSE;

    /* presume we're not in test report mode */
    test_report_mode_ = FALSE;

    /* allow preprocessing directives */
    allow_pp_ = TRUE;

    /* there are no previously-included files yet */
    prev_includes_ = 0;

    /* presume we'll convert newlines in strings to whitespace */
    string_newline_spacing_ = TRUE;

    /* start out with ALL_ONCE mode off */
    all_once_ = FALSE;

    /* by default, ignore redundant includes without warning */
    warn_on_ignore_incl_ = FALSE;

    /* there are no include path entries yet */
    incpath_head_ = incpath_tail_ = 0;

    /* not in a quoted string yet */
    in_quote_ = '\0';

    /* not in an embedded expression yet */
    comment_in_embedding_ = FALSE;
    macro_in_embedding_ = FALSE;
    main_in_embedding_ = FALSE;

    /* not in a #if block yet */
    if_sp_ = 0;
    if_false_level_ = 0;

    /* not processing a preprocessor constant expression */
    in_pp_expr_ = FALSE;

    /* we don't have a current or appended line yet */
    last_desc_ = 0;
    last_linenum_ = 0;
    appended_desc_ = 0;
    appended_linenum_ = 0;

    /* allocate the first token-list block */
    init_src_block_list();

    /* create the #define and #undef symbol tables */
    defines_ = new CVmHashTable(512, new CVmHashFuncCS(), TRUE);
    undefs_ = new CVmHashTable(64, new CVmHashFuncCS(), TRUE);

    /* create the special __LINE__ and __FILE__ macros */
    defines_->add(new CTcHashEntryPpLINE(this));
    defines_->add(new CTcHashEntryPpFILE(this));

    /* get the current time and date */
    timer = time(0);
    tblk = localtime(&timer);
    tstr = asctime(tblk);

    /*
     *   add the __DATE__ macro - the format is "Mmm dd yyyy", where "Mmm"
     *   is the three-letter month name generated by asctime(), "dd" is
     *   the day of the month, with a leading space for numbers less than
     *   ten, and "yyyy" is the year.  (tstr+4 points at the month name
     *   within asctime()'s fixed "Www Mmm dd hh:mm:ss yyyy" layout.)
     */
    sprintf(timebuf, "'%.3s %2d %4d'",
            tstr + 4, tblk->tm_mday, tblk->tm_year + 1900);
    add_define("__DATE__", timebuf);

    /* add the __TIME__ macro - 24-hour "hh:mm:ss" format */
    sprintf(timebuf, "'%.8s'", tstr + 11);
    add_define("__TIME__", timebuf);

    /*
     *   Allocate a pool of macro resources.  The number we start with is
     *   arbitrary, since we'll add more as needed, but we want to try to
     *   allocate enough up front that we avoid time-consuming memory
     *   allocations later.  On the other hand, we don't want to
     *   pre-allocate a huge number of objects that we'll never use.
     */
    for (macro_res_avail_ = 0, macro_res_head_ = 0, i = 0 ; i < 7 ; ++i)
    {
        CTcMacroRsc *rsc;

        /* allocate a new object */
        rsc = new CTcMacroRsc();

        /* add it onto the master list */
        rsc->next_ = macro_res_head_;
        macro_res_head_ = rsc;

        /* add it onto the available list */
        rsc->next_avail_ = macro_res_avail_;
        macro_res_avail_ = rsc;
    }

    /* create the keyword hash table */
    kw_ = new CVmHashTable(64, new CVmHashFuncCS(), TRUE);

    /* populate the keyword table */
    for (kwp = kwlist ; kwp->kw_text != 0 ; ++kwp)
        kw_->add(new CTcHashEntryKw(kwp->kw_text, kwp->kw_tok_id));

    /* no ungot token yet */
    nxttok_valid_ = FALSE;

    /* no string capture file */
    string_fp_ = 0;
    string_fp_map_ = 0;
}
293
294 /*
295 * Initialize the source save block list
296 */
init_src_block_list()297 void CTcTokenizer::init_src_block_list()
298 {
299 /* allocate the first source block */
300 src_cur_ = src_head_ = new CTcTokSrcBlock();
301
302 /* set up to write into the first block */
303 src_ptr_ = src_head_->get_buf();
304 src_rem_ = TCTOK_SRC_BLOCK_SIZE;
305 }
306
307
/* ------------------------------------------------------------------------ */
/*
 *   Delete the tokenizer.  Frees every list and table the constructor and
 *   subsequent tokenizing built up: streams, file descriptors, the
 *   previous-include and include-path lists, macro resources, the saved
 *   source block list, the macro symbol tables, and the keyword table.
 */
CTcTokenizer::~CTcTokenizer()
{
    /* delete all streams (the current stream and its include parents) */
    delete_source();

    /* delete all file descriptors */
    while (desc_head_ != 0)
    {
        CTcTokFileDesc *nxt;

        /* remember the next descriptor */
        nxt = desc_head_->get_next();

        /* delete this one */
        delete desc_head_;

        /* move on to the next one */
        desc_head_ = nxt;
    }

    /* delete the file descriptor index array */
    if (desc_list_ != 0)
        t3free(desc_list_);

    /* delete our default character set string copy */
    lib_free_str(default_charset_);

    /* release our reference on our default character mapper */
    default_mapper_->release_ref();

    /* forget about all of our previous include files */
    while (prev_includes_ != 0)
    {
        tctok_incfile_t *nxt;

        /* remember the next file */
        nxt = prev_includes_->nxt;

        /* delete this one (allocated with t3malloc in the include code) */
        t3free(prev_includes_);

        /* move on to the next one */
        prev_includes_ = nxt;
    }

    /* delete the include path list */
    while (incpath_head_ != 0)
    {
        tctok_incpath_t *nxt;

        /* remember the next entry in the path */
        nxt = incpath_head_->nxt;

        /* delete this entry (allocated with t3malloc in add_inc_path) */
        t3free(incpath_head_);

        /* move on to the next one */
        incpath_head_ = nxt;
    }

    /* delete the macro resources */
    while (macro_res_head_ != 0)
    {
        CTcMacroRsc *nxt;

        /* remember the next one */
        nxt = macro_res_head_->next_;

        /* delete this one */
        delete macro_res_head_;

        /* move on to the next one */
        macro_res_head_ = nxt;
    }

    /* delete the token list (the blocks chain-delete their successors) */
    delete src_head_;

    /* delete the #define and #undef symbol tables */
    delete defines_;
    delete undefs_;

    /* delete the keyword hash table */
    delete kw_;

    /* if we created a mapping for the string capture file, release it */
    if (string_fp_map_ != 0)
        string_fp_map_->release_ref();
}
401
402 /* ------------------------------------------------------------------------ */
403 /*
404 * Clear the line buffer
405 */
clear_linebuf()406 void CTcTokenizer::clear_linebuf()
407 {
408 /* clear the buffer */
409 linebuf_.clear_text();
410
411 /* reset our read point to the start of the line buffer */
412 p_.set(linebuf_.get_buf());
413 }
414
/* ------------------------------------------------------------------------ */
/*
 *   Get a textual representation of an operator token.  Returns a static
 *   display name for the given token type (the operator's source text, the
 *   keyword name, or a "<...>" description for literal/synthetic tokens);
 *   returns "<unknown>" for any type not in the table.
 */
const char *CTcTokenizer::get_op_text(tc_toktyp_t op)
{
    /* token-type/display-name pair for the lookup table below */
    struct tokname_t
    {
        tc_toktyp_t typ;
        const char *nm;
    };

    /* display names for every token type, ending with a TOKT_INVALID marker */
    static const tokname_t toknames[] =
    {
        { TOKT_EOF, "<end of file>" },
        { TOKT_SYM, "<symbol>" },
        { TOKT_INT, "<integer>" },
        { TOKT_SSTR, "<single-quoted string>" },
        { TOKT_DSTR, "<double-quoted string>" },
        { TOKT_DSTR_START, "<double-quoted string>" },
        { TOKT_DSTR_MID, "<double-quoted string>" },
        { TOKT_DSTR_END, "<double-quoted string>" },
        { TOKT_LPAR, "(" },
        { TOKT_RPAR, ")" },
        { TOKT_COMMA, "," },
        { TOKT_DOT, "." },
        { TOKT_LBRACE, "{" },
        { TOKT_RBRACE, "}", },
        { TOKT_LBRACK, "[", },
        { TOKT_RBRACK, "]", },
        { TOKT_EQ, "=", },
        { TOKT_EQEQ, "==", },
        { TOKT_ASI, ":=" },
        { TOKT_PLUS, "+" },
        { TOKT_MINUS, "-" },
        { TOKT_TIMES, "*" },
        { TOKT_DIV, "/", },
        { TOKT_MOD, "%" },
        { TOKT_GT, ">" },
        { TOKT_LT, "<" },
        { TOKT_GE, ">=" },
        { TOKT_LE, "<=" },
        { TOKT_NE, "!=" },
        { TOKT_ARROW, "->" },
        { TOKT_COLON, ":" },
        { TOKT_SEM, ";" },
        { TOKT_AND, "&" },
        { TOKT_ANDAND, "&&" },
        { TOKT_OR, "|" },
        { TOKT_OROR, "||" },
        { TOKT_XOR, "^" },
        { TOKT_SHL, "<<" },
        { TOKT_SHR, ">>" },
        { TOKT_INC, "++" },
        { TOKT_DEC, "--" },
        { TOKT_PLUSEQ, "+=" },
        { TOKT_MINEQ, "-=" },
        { TOKT_TIMESEQ, "*=" },
        { TOKT_DIVEQ, "/=" },
        { TOKT_MODEQ, "%=" },
        { TOKT_ANDEQ, "&=" },
        { TOKT_OREQ, "|=" },
        { TOKT_XOREQ, "^=" },
        { TOKT_SHLEQ, "<<=" },
        { TOKT_SHREQ, ">>=" },
        { TOKT_NOT, "! (not)" },
        { TOKT_BNOT, "~" },
        { TOKT_POUND, "#" },
        { TOKT_POUNDPOUND, "##" },
        { TOKT_POUNDAT, "#@" },
        { TOKT_ELLIPSIS, "..." },
        { TOKT_QUESTION, "?" },
        { TOKT_COLONCOLON, "::" },
        { TOKT_FLOAT, "<float>" },
        { TOKT_AT, "@" },
        { TOKT_SELF, "self" },
        { TOKT_TARGETPROP, "targetprop" },
        { TOKT_TARGETOBJ, "targetobj" },
        { TOKT_DEFININGOBJ, "definingobj" },
        { TOKT_INHERITED, "inherited" },
        { TOKT_DELEGATED, "delegated" },
        { TOKT_IF, "if" },
        { TOKT_ELSE, "else" },
        { TOKT_FOR, "for" },
        { TOKT_WHILE, "while" },
        { TOKT_DO, "do" },
        { TOKT_SWITCH, "switch" },
        { TOKT_CASE, "case" },
        { TOKT_DEFAULT, "default" },
        { TOKT_GOTO, "goto" },
        { TOKT_BREAK, "break" },
        { TOKT_CONTINUE, "continue" },
        { TOKT_FUNCTION, "function" },
        { TOKT_RETURN, "return" },
        { TOKT_LOCAL, "local" },
        { TOKT_OBJECT, "object" },
        { TOKT_NIL, "nil" },
        { TOKT_TRUE, "true" },
        { TOKT_PASS, "pass" },
        { TOKT_EXTERNAL, "external" },
        { TOKT_EXTERN, "extern" },
        { TOKT_FORMATSTRING, "formatstring" },
        { TOKT_CLASS, "class" },
        { TOKT_REPLACE, "replace" },
        { TOKT_MODIFY, "modify" },
        { TOKT_NEW, "new" },
        { TOKT_DELETE, "delete" },
        { TOKT_THROW, "throw" },
        { TOKT_TRY, "try" },
        { TOKT_CATCH, "catch" },
        { TOKT_FINALLY, "finally" },
        { TOKT_INTRINSIC, "intrinsic" },
        { TOKT_DICTIONARY, "dictionary" },
        { TOKT_GRAMMAR, "grammar" },
        { TOKT_ENUM, "enum" },
        { TOKT_TEMPLATE, "template" },
        { TOKT_STATIC, "static" },
        { TOKT_FOREACH, "foreach" },
        { TOKT_EXPORT, "export" },
        { TOKT_PROPERTYSET, "propertyset" },
        { TOKT_TRANSIENT, "transient" },
        { TOKT_REPLACED, "replaced" },
        { TOKT_VOID, "void" },
        { TOKT_INTKW, "int" },
        { TOKT_STRING, "string" },
        { TOKT_LIST, "list" },
        { TOKT_BOOLEAN, "boolean" },
        { TOKT_PROPERTY, "property" },
        { TOKT_ANY, "any"},
        { TOKT_INVALID, 0 }
    };
    const tokname_t *p;

    /* linear-search the table for the requested token type */
    for (p = toknames ; p->nm != 0 ; ++p)
    {
        /* if this is our token, return the associated name string */
        if (p->typ == op)
            return p->nm;
    }

    /* we didn't find it */
    return "<unknown>";
}
558
559 /* ------------------------------------------------------------------------ */
560 /*
561 * Reset the tokenizer. Delete the current source object and all of the
562 * saved source text. This can be used after compilation of a unit
563 * (such as a debugger expression) is completed and the intermediate
564 * parser state is no longer needed.
565 */
reset()566 void CTcTokenizer::reset()
567 {
568 /* delete the source object */
569 delete_source();
570
571 /* delete saved token text */
572 if (src_head_ != 0)
573 {
574 /* delete the list */
575 delete src_head_;
576
577 /* re-initialize the source block list */
578 init_src_block_list();
579 }
580 }
581
582 /* ------------------------------------------------------------------------ */
583 /*
584 * Delete the source file, if any, including any parent include files.
585 */
delete_source()586 void CTcTokenizer::delete_source()
587 {
588 /* delete the current stream and all enclosing parents */
589 while (str_ != 0)
590 {
591 CTcTokStream *nxt;
592
593 /* remember the next stream in the list */
594 nxt = str_->get_parent();
595
596 /* delete this stream */
597 delete str_;
598
599 /* move up to the next one */
600 str_ = nxt;
601 }
602
603 /* there are no more streams */
604 str_ = 0;
605 }
606
607
/* ------------------------------------------------------------------------ */
/*
 *   Set up to read a source file.  'src_filename' is the resolved path to
 *   open; 'orig_name' is the name as originally given (kept for error
 *   reporting and file-descriptor records).  Returns zero on success, or a
 *   non-zero error code on failure.
 */
int CTcTokenizer::set_source(const char *src_filename, const char *orig_name)
{
    CTcTokFileDesc *desc;
    CTcSrcFile *src;
    int charset_error;
    int default_charset_error;

    /* empty out the input line buffer */
    clear_linebuf();

    /* set up at the beginning of the input line buffer */
    start_new_line(linebuf_.get_buf(), linebuf_.get_text_len());

    /* create a reader for the source file */
    src = CTcSrcFile::open_source(src_filename, res_loader_,
                                  default_charset_, &charset_error,
                                  &default_charset_error);
    if (src == 0)
    {
        /* if we had a problem loading the default character set, log it */
        if (default_charset_error)
            log_error(TCERR_CANT_LOAD_DEFAULT_CHARSET, default_charset_);

        /* return failure */
        return TCERR_CANT_OPEN_SRC;
    }

    /* find or create a file descriptor for this filename */
    desc = get_file_desc(src_filename, strlen(src_filename), FALSE,
                         orig_name, strlen(orig_name));

    /*
     *   Create a stream to read the source file.  The new stream has no
     *   parent, because this is the top-level source file, and was not
     *   included from any other file.  (Any charset error encountered on
     *   open is carried into the stream for later reporting.)
     */
    str_ = new CTcTokStream(desc, src, 0, charset_error, if_sp_);

    /* success */
    return 0;
}
654
655 /*
656 * Set up to read source code from a memory buffer
657 */
set_source_buf(const char * buf)658 void CTcTokenizer::set_source_buf(const char *buf)
659 {
660 CTcSrcMemory *src;
661
662 /* empty out the input line buffer */
663 clear_linebuf();
664
665 /* reset the scanning state to the start of a brand new stream */
666 in_pp_expr_ = FALSE;
667 last_linenum_ = 0;
668 unsplicebuf_.clear_text();
669 in_quote_ = 0;
670 comment_in_embedding_ = FALSE;
671 macro_in_embedding_ = FALSE;
672 main_in_embedding_ = FALSE;
673 if_sp_ = 0;
674 if_false_level_ = 0;
675 nxttok_valid_ = FALSE;
676
677 /* set up at the beginning of the input line buffer */
678 start_new_line(linebuf_.get_buf(), linebuf_.get_text_len());
679
680 /* create a reader for the memory buffer */
681 src = new CTcSrcMemory(buf, default_mapper_);
682
683 /*
684 * Create a stream to read the source file. The new stream has no
685 * parent, because this is the top-level source file, and was not
686 * included from any other file.
687 */
688 str_ = new CTcTokStream(0, src, 0, 0, if_sp_);
689 }
690
691 /* ------------------------------------------------------------------------ */
692 /*
693 * Find or create a file descriptor for a given filename
694 */
get_file_desc(const char * fname,size_t fname_len,int always_create,const char * orig_fname,size_t orig_fname_len)695 CTcTokFileDesc *CTcTokenizer::get_file_desc(const char *fname,
696 size_t fname_len,
697 int always_create,
698 const char *orig_fname,
699 size_t orig_fname_len)
700 {
701 CTcTokFileDesc *orig_desc;
702 CTcTokFileDesc *desc;
703
704 /* presume we won't find an original descriptor in the list */
705 orig_desc = 0;
706
707 /*
708 * Search the list of existing descriptors to find one that matches.
709 * Do this regardless of whether we're allowed to re-use an existing
710 * one or not - even if we're creating a new one unconditionaly, we
711 * need to know if there's an earlier copy that already exists so we
712 * can associate the new one with the original.
713 */
714 for (desc = desc_head_ ; desc != 0 ; desc = desc->get_next())
715 {
716 /* check for a name match */
717 if (strlen(desc->get_fname()) == fname_len
718 && memcmp(desc->get_fname(), fname, fname_len) == 0)
719 {
720 /*
721 * if we're allowed to return an existing descriptor, return
722 * this one, since it's for the same filename
723 */
724 if (!always_create)
725 return desc;
726
727 /*
728 * we have to create a new descriptor even though we have an
729 * existing one - remember the original so we can point the
730 * new one back to the original
731 */
732 orig_desc = desc;
733
734 /*
735 * no need to look any further - we've found the first
736 * instance of this filename in our list
737 */
738 break;
739 }
740 }
741
742 /* we didn't find a match - create a new descriptor */
743 desc = new CTcTokFileDesc(fname, fname_len, next_filedesc_id_++,
744 orig_desc, orig_fname, orig_fname_len);
745
746 /* link it in at the end of the master list */
747 desc->set_next(0);
748 if (desc_tail_ == 0)
749 desc_head_ = desc;
750 else
751 desc_tail_->set_next(desc);
752 desc_tail_ = desc;
753
754 /* expand our array index if necessary */
755 if (desc_list_cnt_ >= desc_list_alo_)
756 {
757 size_t siz;
758
759 /* allocate or expand the array */
760 desc_list_alo_ += 10;
761 siz = desc_list_alo_ * sizeof(desc_list_[0]);
762 if (desc_list_ == 0)
763 desc_list_ = (CTcTokFileDesc **)t3malloc(siz);
764 else
765 desc_list_ = (CTcTokFileDesc **)t3realloc(desc_list_, siz);
766 }
767
768 /* add the new array entry */
769 desc_list_[desc_list_cnt_++] = desc;
770
771 /* return it */
772 return desc;
773 }
774
775
776 /* ------------------------------------------------------------------------ */
777 /*
778 * Add an include path entry. Each new entry goes at the end of the
779 * list, after all previous entries.
780 */
add_inc_path(const char * path)781 void CTcTokenizer::add_inc_path(const char *path)
782 {
783 tctok_incpath_t *entry;
784
785 /* create a new path list entry */
786 entry = (tctok_incpath_t *)t3malloc(sizeof(tctok_incpath_t)
787 + strlen(path));
788
789 /* store the path in the entry */
790 strcpy(entry->path, path);
791
792 /* link this entry at the end of our list */
793 if (incpath_tail_ != 0)
794 incpath_tail_->nxt = entry;
795 else
796 incpath_head_ = entry;
797 incpath_tail_ = entry;
798 entry->nxt = 0;
799 }
800
801
802 /* ------------------------------------------------------------------------ */
803 /*
804 * Set the string capture file.
805 */
set_string_capture(osfildef * fp)806 void CTcTokenizer::set_string_capture(osfildef *fp)
807 {
808 /* remember the capture file */
809 string_fp_ = fp;
810
811 /*
812 * if we don't already have a character mapping to translate from
813 * our internal unicode characters back into the source file
814 * character set, create one now
815 */
816 if (string_fp_map_ == 0)
817 {
818 /* try creating a mapping for the default character set */
819 if (default_charset_ != 0)
820 string_fp_map_ =
821 CCharmapToLocal::load(res_loader_, default_charset_);
822
823 /* if we couldn't create the mapping, use a default ASCII mapping */
824 if (string_fp_map_ == 0)
825 string_fp_map_ = CCharmapToLocal::load(res_loader_, "us-ascii");
826 }
827 }
828
829
/* ------------------------------------------------------------------------ */
/*
 *   Get the next token in the input stream, reading additional lines from
 *   the source file as needed.  Token sources are consulted in priority
 *   order: an ungot token first, then any active external token source,
 *   then the current input line (refilled via read_line_pp when it runs
 *   dry).  Returns the new current token's type.
 */
tc_toktyp_t CTcTokenizer::next()
{
    /* the current token is about to become the previous token */
    prvtok_ = curtok_;

    /* if there's an un-got token, return it */
    if (nxttok_valid_)
    {
        /* get the previously-saved token */
        curtok_ = nxttok_;

        /* we've now consumed nxttok_ */
        nxttok_valid_ = FALSE;

        /* return the new token's type */
        return curtok_.gettyp();
    }

    /* if there's an external source, get its next token */
    if (ext_src_ != 0)
    {
        const CTcToken *ext_tok;

        /* get the next token from the external source */
        ext_tok = ext_src_->get_next_token();

        /* check to see if we got a token */
        if (ext_tok == 0)
        {
            /*
             *   restore the current token in effect before this source was
             *   active
             */
            curtok_ = *ext_src_->get_enclosing_curtok();

            /*
             *   this source has no more tokens - restore the enclosing
             *   source, and keep going so we try getting a token from it
             */
            ext_src_ = ext_src_->get_enclosing_source();

            /* return the token type */
            return curtok_.gettyp();
        }
        else
        {
            /* we got a token - copy it to our internal token buffer */
            curtok_ = *ext_tok;

            /* return its type */
            return curtok_.gettyp();
        }
    }

    /* keep going until we get a valid token */
    for (;;)
    {
        tc_toktyp_t typ;

        /*
         *   read the next token from the current line, applying
         *   appropriate string translations and storing strings and
         *   symbols in the source block list
         */
        typ = next_on_line_xlat_keep();

        /* if it's the "null" token, skip it and read another token */
        if (typ == TOKT_NULLTOK)
            continue;

        /* if we found a valid token, we're done - return the token */
        if (typ != TOKT_EOF)
            return typ;

        /*
         *   if we're at the end of a preprocess line, don't read another
         *   line - just return end of file (a preprocessor directive is a
         *   single logical line)
         */
        if (p_.getch() == TOK_END_PP_LINE)
            return TOKT_EOF;

        /*
         *   we've reached the end of the line - read another line,
         *   applying preprocessing directives and expanding macros as
         *   needed
         */
        if (read_line_pp())
        {
            /* no more lines are available - return end of file */
            return TOKT_EOF;
        }
    }
}
928
929 /* ------------------------------------------------------------------------ */
930 /*
931 * clear external token sources, returning to the true input stream
932 */
clear_external_sources()933 void CTcTokenizer::clear_external_sources()
934 {
935 /*
936 * restore the current token as it was before the outermost external
937 * source was first established
938 */
939 if (ext_src_ != 0)
940 {
941 CTcTokenSource *outer;
942
943 /* find the outermost source */
944 for (outer = ext_src_ ; outer->get_enclosing_source() != 0 ;
945 outer = ext_src_->get_enclosing_source()) ;
946
947 /* restore its original next token */
948 curtok_ = *ext_src_->get_enclosing_curtok();
949 }
950
951 /* there's no external source now */
952 ext_src_ = 0;
953 }
954
955 /* ------------------------------------------------------------------------ */
956 /*
957 * Make a safely storable copy of the current token.
958 */
copycur()959 const CTcToken *CTcTokenizer::copycur()
960 {
961 /* if the current token is a symbol, it already has a safe copy */
962 if (curtok_.gettyp() == TOKT_SYM)
963 return getcur();
964
965 /* save the current token's text in permanent tokenizer memory */
966 curtok_.set_text(store_source(curtok_.get_text(), curtok_.get_text_len()),
967 curtok_.get_text_len());
968
969 /* return the current token, now that we've made it safe */
970 return &curtok_;
971 }
972
973 /*
974 * Make a safely storable copy of a given token.
975 */
copytok(CTcToken * dst,const CTcToken * src)976 void CTcTokenizer::copytok(CTcToken *dst, const CTcToken *src)
977 {
978 /* start with an exact copy of the token */
979 *dst = *src;
980
981 /* if the token is a symbol, it already has a safe copy */
982 if (src->gettyp() == TOKT_SYM)
983 return;
984
985 /* save the token's text in permanent tokenizer memory */
986 dst->set_text(store_source(dst->get_text(), dst->get_text_len()),
987 dst->get_text_len());
988 }
989
990
991 /* ------------------------------------------------------------------------ */
992 /*
993 * Check to see if the current token matches the given text
994 */
cur_tok_matches(const char * txt,size_t len)995 int CTcTokenizer::cur_tok_matches(const char *txt, size_t len)
996 {
997 /* if the length matches, and the text matches exactly, it matches */
998 return (getcur()->get_text_len() == len
999 && memcmp(getcur()->get_text(), txt, len) == 0);
1000 }
1001
1002 /* ------------------------------------------------------------------------ */
1003 /*
1004 * Un-get the current token
1005 */
unget()1006 void CTcTokenizer::unget()
1007 {
1008 /*
1009 * remember the current token as the next one to fetch, and flag
1010 * that this is valid
1011 */
1012 nxttok_ = curtok_;
1013 nxttok_valid_ = TRUE;
1014
1015 /* go back to the previous token */
1016 curtok_ = prvtok_;
1017 }
1018
/* ------------------------------------------------------------------------ */
/*
 *   Assume that we should have just found a '>>' terminating an embedded
 *   expression in a double-quoted string.  If possible, back out the
 *   previous token and re-scan it as though it had started with '>>'.
 *
 *   This is to be called by a higher-level parser when it determines
 *   that, syntactically, we should have found the '>>' leaving an
 *   embedded expression.
 */
void CTcTokenizer::assume_missing_dstr_cont()
{
    /* re-enter string mode as though we had just seen '>>' */
    xlat_string_to_src(&main_in_embedding_, TRUE);
}
1034
1035
1036 /* ------------------------------------------------------------------------ */
1037 /*
1038 * Skip whitespace and macro expansion markers
1039 */
skip_ws_and_markers(utf8_ptr * p)1040 void CTcTokenizer::skip_ws_and_markers(utf8_ptr *p)
1041 {
1042 /* keep going until we find something interesting */
1043 for (;;)
1044 {
1045 wchar_t cur;
1046
1047 /* get the current character */
1048 cur = p->getch();
1049
1050 /*
1051 * if it's a macro expansion end marker, skip it as though it
1052 * were whitespace; otherwise, if it's whitespace, skip it;
1053 * otherwise, we're done skipping leading whitespace
1054 */
1055 if (cur == TOK_MACRO_EXP_END)
1056 {
1057 /* skip the embedded pointer value that follows */
1058 p->set(p->getptr() + 1 + sizeof(CTcHashEntryPp *));
1059 }
1060 else if (is_space(cur))
1061 {
1062 /* skip the space */
1063 p->inc();
1064 }
1065 else
1066 {
1067 /* it's not whitespace or equivalent - we're done */
1068 return;
1069 }
1070 }
1071 }
1072
1073 /* ------------------------------------------------------------------------ */
1074 /*
1075 * Get the next token from the input stream, operating on the current
1076 * line only.
1077 */
next_on_line(utf8_ptr * p,CTcToken * tok,int * in_embedding)1078 tc_toktyp_t CTcTokenizer::next_on_line(utf8_ptr *p, CTcToken *tok,
1079 int *in_embedding)
1080 {
1081 wchar_t cur;
1082 tc_toktyp_t typ;
1083 utf8_ptr start;
1084 int num_minus;
1085
1086 /* skip whitespace */
1087 skip_ws_and_markers(p);
1088
1089 /* remember where the token starts */
1090 start = *p;
1091
1092 /* if there's nothing left in the current line, return EOF */
1093 if (p->getch() == '\0')
1094 {
1095 /* indicate end of file */
1096 typ = TOKT_EOF;
1097 goto done;
1098 }
1099
1100 /* get the initial character, and skip it */
1101 cur = p->getch();
1102 p->inc();
1103
1104 /* presume the token will not be marked as fully macro-expanded */
1105 tok->set_fully_expanded(FALSE);
1106
1107 /* presume it's not a number with a minus sign */
1108 num_minus = FALSE;
1109
1110 /* see what we have */
1111 switch(cur)
1112 {
1113 case TOK_MACRO_FORMAL_FLAG:
1114 /*
1115 * this is a two-byte formal parameter sequence in a macro
1116 * expansion - skip the second byte of the two-byte sequence,
1117 * and return the special token type for this sequence
1118 */
1119 typ = TOKT_MACRO_FORMAL;
1120
1121 /*
1122 * skip the second byte - note that we want to skip exactly one
1123 * byte, regardless of what the byte looks like as a utf-8
1124 * partial character, since it's not a utf-8 character at all
1125 */
1126 p->set(p->getptr() + 1);
1127 break;
1128
1129 case TOK_MACRO_FOREACH_FLAG:
1130 /*
1131 * this is the special macro '#foreach' flag - return it as a
1132 * special pseudo-token
1133 */
1134 typ = TOKT_MACRO_FOREACH;
1135 break;
1136
1137 case TOK_MACRO_IFEMPTY_FLAG:
1138 /* #ifempty macro flag */
1139 typ = TOKT_MACRO_IFEMPTY;
1140 break;
1141
1142 case TOK_MACRO_IFNEMPTY_FLAG:
1143 /* #ifnempty macro flag */
1144 typ = TOKT_MACRO_IFNEMPTY;
1145 break;
1146
1147 case TOK_MACRO_ARGCOUNT_FLAG:
1148 /* it's the special macro '#argcount' flag */
1149 typ = TOKT_MACRO_ARGCOUNT;
1150 break;
1151
1152 case TOK_FULLY_EXPANDED_FLAG:
1153 /* set the token flag indicating that it has been fully expanded */
1154 tok->set_fully_expanded(TRUE);
1155
1156 /* the token symbol starts at the byte after the flag byte */
1157 start = p->getptr();
1158
1159 /* read the first character of the symbol */
1160 cur = p->getch();
1161 p->inc();
1162
1163 /* tokenize the symbol that follows */
1164 goto tokenize_symbol;
1165
1166 case TOK_END_PP_LINE:
1167 /*
1168 * Preprocess line-ending marker - when we reach the end of a
1169 * preprocessor line, we can't read another source line, because
1170 * a preprocessor directive consists of only a single logical
1171 * source line. Once we see this, return end-of-file until the
1172 * caller explicitly reads a new source line.
1173 *
1174 * Keep the read pointer stuck on this flag byte, so that we
1175 * return end-of-file on a subsequent attempt to get the next
1176 * token.
1177 */
1178 *p = start;
1179 typ = TOKT_EOF;
1180 break;
1181
1182 case '0':
1183 case '1':
1184 case '2':
1185 case '3':
1186 case '4':
1187 case '5':
1188 case '6':
1189 case '7':
1190 case '8':
1191 case '9':
1192 {
1193 long acc;
1194
1195 /*
1196 * Start out with the leading digit in the accumulator. Note
1197 * that the character set internally is always UTF-8.
1198 */
1199 acc = value_of_digit(cur);
1200
1201 /*
1202 * If it's a leading zero, treat as octal or hex. '0x' means
1203 * hex; otherwise, '0' means octal.
1204 */
1205 if (cur == '0')
1206 {
1207 /* check for hex - if it's not hex, it's octal */
1208 if (p->getch() == 'x' || p->getch() == 'X')
1209 {
1210 /* skip the 'x' */
1211 p->inc();
1212
1213 /*
1214 * scan the hex number - keep going until we find
1215 * something that's not a hex digit
1216 */
1217 for (;;)
1218 {
1219 /* get this character */
1220 cur = p->getch();
1221
1222 /* if it's not a hex digit, stop scanning */
1223 if (!is_xdigit(cur))
1224 break;
1225
1226 /*
1227 * Shift the accumulator and add this digit's value.
1228 * Note that we can save a test - if the character is
1229 * >= lower-case 'a', we know it's not an upper-case
1230 * letter because the lower-case letters all have
1231 * values above the upper-case letters in UTF-8
1232 * encoding (which we always use as the internal
1233 * character set). Since we already know it's a
1234 * valid hex digit (we wouldn't be here if it
1235 * weren't), we can just check to see if it's at
1236 * least lower-case 'a', and we automatically know
1237 * then whether it's in the 'a'-'f' range or the
1238 * 'A'-'F' range.
1239 */
1240 acc *= 16;
1241 acc += value_of_xdigit(cur);
1242
1243 /* move on */
1244 p->inc();
1245 }
1246 }
1247 else
1248 {
1249 /* scan octal digits */
1250 for ( ; is_odigit(p->getch()) ; p->inc())
1251 acc = 8*acc + value_of_odigit(p->getch());
1252 }
1253 }
1254 else
1255 {
1256 /* scan decimal digits */
1257 for ( ; is_digit(p->getch()) ; p->inc())
1258 acc = 10*acc + value_of_digit(p->getch());
1259 }
1260
1261 /* negate the value if we had a minus sign */
1262 if (num_minus)
1263 acc = -acc;
1264
1265 /*
1266 * if we stopped at a decimal point or an exponent, it's a
1267 * floating point number
1268 */
1269 if (p->getch() == '.' || p->getch() == 'e' || p->getch() == 'E')
1270 goto do_float;
1271
1272 /* it's an integer value */
1273 typ = TOKT_INT;
1274
1275 /* set the integer value */
1276 tok->set_int_val(acc);
1277 }
1278 break;
1279
1280 do_float:
1281 {
1282 int found_decpt;
1283
1284 /* start over and parse the float */
1285 for (*p = start, found_decpt = FALSE ; ; p->inc())
1286 {
1287 /* get this character and move on */
1288 cur = p->getch();
1289
1290 /* see what we have */
1291 if (is_digit(cur))
1292 {
1293 /* we have another digit; just keep going */
1294 }
1295 else if (!found_decpt && cur == '.')
1296 {
1297 /* it's the decimal point - note it and keep going */
1298 found_decpt = TRUE;
1299 }
1300 else if (cur == 'e' || cur == 'E')
1301 {
1302 /* it's the exponent - if there's a sign, skip it */
1303 p->inc();
1304 cur = p->getch();
1305 if (cur == '+' || cur == '-')
1306 p->inc();
1307
1308 /* keep going until we find no more digits */
1309 while (is_digit(p->getch()))
1310 p->inc();
1311
1312 /* the end of the exponent is the end of the number */
1313 break;
1314 }
1315 else
1316 {
1317 /* everything else ends the number */
1318 break;
1319 }
1320 }
1321 }
1322
1323 /* it's a float */
1324 typ = TOKT_FLOAT;
1325 break;
1326
1327 case '"':
1328 case '\'':
1329 *p = start;
1330 return tokenize_string(p, tok, in_embedding);
1331
1332 case '(':
1333 typ = TOKT_LPAR;
1334 break;
1335
1336 case ')':
1337 typ = TOKT_RPAR;
1338 break;
1339
1340 case ',':
1341 typ = TOKT_COMMA;
1342 break;
1343
1344 case '.':
1345 /* check for '...' and floating-point numbers */
1346 if (p->getch() == '.' && p->getch_at(1) == '.')
1347 {
1348 p->inc();
1349 p->inc();
1350 typ = TOKT_ELLIPSIS;
1351 }
1352 else if (is_digit(p->getch()))
1353 goto do_float;
1354 else
1355 typ = TOKT_DOT;
1356 break;
1357
1358 case '{':
1359 typ = TOKT_LBRACE;
1360 break;
1361
1362 case '}':
1363 typ = TOKT_RBRACE;
1364 break;
1365
1366 case '[':
1367 typ = TOKT_LBRACK;
1368 break;
1369
1370 case ']':
1371 typ = TOKT_RBRACK;
1372 break;
1373
1374 case '=':
1375 /* check for '==' */
1376 if (p->getch() == '=')
1377 {
1378 p->inc();
1379 typ = TOKT_EQEQ;
1380 }
1381 else
1382 typ = TOKT_EQ;
1383 break;
1384
1385 case ':':
1386 /* check for ':=' and '::' */
1387 if (p->getch() == '=')
1388 {
1389 p->inc();
1390 typ = TOKT_ASI;
1391 }
1392 else if (p->getch() == ':')
1393 {
1394 p->inc();
1395 typ = TOKT_COLONCOLON;
1396 }
1397 else
1398 typ = TOKT_COLON;
1399 break;
1400
1401 case '?':
1402 typ = TOKT_QUESTION;
1403 break;
1404
1405 case '+':
1406 /* check for '++' and '+=' */
1407 if (p->getch() == '+')
1408 {
1409 p->inc();
1410 typ = TOKT_INC;
1411 }
1412 else if (p->getch() == '=')
1413 {
1414 p->inc();
1415 typ = TOKT_PLUSEQ;
1416 }
1417 else
1418 typ = TOKT_PLUS;
1419 break;
1420
1421 case '-':
1422 /* check for '--', '->' and '-=' */
1423 if (p->getch() == '-')
1424 {
1425 p->inc();
1426 typ = TOKT_DEC;
1427 }
1428 else if (p->getch() == '=')
1429 {
1430 p->inc();
1431 typ = TOKT_MINEQ;
1432 }
1433 else if (p->getch() == '>')
1434 {
1435 p->inc();
1436 typ = TOKT_ARROW;
1437 }
1438 else
1439 typ = TOKT_MINUS;
1440 break;
1441
1442 case '*':
1443 /* check for '*=' */
1444 if (p->getch() == '=')
1445 {
1446 p->inc();
1447 typ = TOKT_TIMESEQ;
1448 }
1449 else
1450 typ = TOKT_TIMES;
1451 break;
1452
1453 case '/':
1454 /* check for '/=' */
1455 if (p->getch() == '=')
1456 {
1457 p->inc();
1458 typ = TOKT_DIVEQ;
1459 }
1460 else
1461 typ = TOKT_DIV;
1462 break;
1463
1464 case '%':
1465 /* check for '%=' */
1466 if (p->getch() == '=')
1467 {
1468 p->inc();
1469 typ = TOKT_MODEQ;
1470 }
1471 else
1472 typ = TOKT_MOD;
1473 break;
1474
1475 case '>':
1476 /* check for '>>=', '>>' and '>=' */
1477 if (p->getch() == '=')
1478 {
1479 p->inc();
1480 typ = TOKT_GE;
1481 }
1482 else if (p->getch() == '>')
1483 {
1484 /* check for the end of an embedded expression */
1485 if (in_embedding != 0 && *in_embedding)
1486 {
1487 *p = start;
1488 return tokenize_string(p, tok, in_embedding);
1489 }
1490
1491 /* check for '>>=' */
1492 p->inc();
1493 if (p->getch() == '=')
1494 {
1495 p->inc();
1496 typ = TOKT_SHREQ;
1497 }
1498 else
1499 typ = TOKT_SHR;
1500 }
1501 else
1502 typ = TOKT_GT;
1503 break;
1504
1505 case '<':
1506 /* check for '<<=', '<<', '<>', and '<=' */
1507 if (p->getch() == '=')
1508 {
1509 p->inc();
1510 typ = TOKT_LE;
1511 }
1512 else if (p->getch() == '<')
1513 {
1514 /* check for '<<=' */
1515 p->inc();
1516 if (p->getch() == '=')
1517 {
1518 p->inc();
1519 typ = TOKT_SHLEQ;
1520 }
1521 else
1522 typ = TOKT_SHL;
1523 }
1524 else if (p->getch() == '>')
1525 {
1526 p->inc();
1527 typ = TOKT_NE;
1528 }
1529 else
1530 typ = TOKT_LT;
1531 break;
1532
1533 case ';':
1534 typ = TOKT_SEM;
1535 break;
1536
1537 case '&':
1538 /* check for '&&' and '&=' */
1539 if (p->getch() == '&')
1540 {
1541 p->inc();
1542 typ = TOKT_ANDAND;
1543 }
1544 else if (p->getch() == '=')
1545 {
1546 p->inc();
1547 typ = TOKT_ANDEQ;
1548 }
1549 else
1550 typ = TOKT_AND;
1551 break;
1552
1553 case '|':
1554 /* check for '||' and '|=' */
1555 if (p->getch() == '|')
1556 {
1557 p->inc();
1558 typ = TOKT_OROR;
1559 }
1560 else if (p->getch() == '=')
1561 {
1562 p->inc();
1563 typ = TOKT_OREQ;
1564 }
1565 else
1566 typ = TOKT_OR;
1567 break;
1568
1569 case '^':
1570 /* check for '^=' */
1571 if (p->getch() == '^')
1572 {
1573 p->inc();
1574 typ = TOKT_XOREQ;
1575 }
1576 else
1577 typ = TOKT_XOR;
1578 break;
1579
1580 case '!':
1581 /* check for '!=' */
1582 if (p->getch() == '=')
1583 {
1584 p->inc();
1585 typ = TOKT_NE;
1586 }
1587 else
1588 typ = TOKT_NOT;
1589 break;
1590
1591 case '~':
1592 typ = TOKT_BNOT;
1593 break;
1594
1595 case '@':
1596 typ = TOKT_AT;
1597 break;
1598
1599 case '#':
1600 /* check for '##' and '#@' */
1601 if (p->getch() == '#')
1602 {
1603 p->inc();
1604 typ = TOKT_POUNDPOUND;
1605 }
1606 else if (p->getch() == '@')
1607 {
1608 p->inc();
1609 typ = TOKT_POUNDAT;
1610 }
1611 else
1612 typ = TOKT_POUND;
1613 break;
1614
1615 default:
1616 /* check to see if it's a symbol */
1617 if (is_syminit(cur))
1618 {
1619 size_t len, full_len;
1620
1621 /*
1622 * scan the identifier (note that we've already skipped the
1623 * first character, so we start out at length = 1)
1624 */
1625 tokenize_symbol:
1626 for (len = full_len = 1 ; is_sym(p->getch()) ; p->inc())
1627 {
1628 /* count the full length */
1629 ++full_len;
1630
1631 /*
1632 * count this character if we're not over the maximum
1633 * length
1634 */
1635 if (len < TOK_SYM_MAX_LEN)
1636 ++len;
1637 }
1638
1639 /* if we truncated the symbol, issue a warning */
1640 if (full_len != len)
1641 log_warning(TCERR_SYMBOL_TRUNCATED,
1642 (int)full_len, start.getptr(),
1643 (int)len, start.getptr());
1644
1645 /* it's a symbol */
1646 typ = TOKT_SYM;
1647 }
1648 else
1649 {
1650 /* invalid token */
1651 typ = TOKT_INVALID;
1652 }
1653 break;
1654 }
1655
1656 done:
1657 /* set the type */
1658 tok->settyp(typ);
1659
1660 /* set the text */
1661 tok->set_text(start.getptr(), p->getptr() - start.getptr());
1662
1663 /* return the type */
1664 return typ;
1665 }
1666
1667 /*
1668 * get the next token, limiting to the length of the source buffer
1669 */
next_on_line(const CTcTokString * srcbuf,utf8_ptr * p,CTcToken * tok,int * in_embedding)1670 tc_toktyp_t CTcTokenizer::next_on_line(const CTcTokString *srcbuf,
1671 utf8_ptr *p, CTcToken *tok,
1672 int *in_embedding)
1673 {
1674 /* get the next token */
1675 next_on_line(p, tok, in_embedding);
1676
1677 /* if the token is past the end of the line, return EOF */
1678 if (tok->get_text() >= srcbuf->get_text_end())
1679 {
1680 /* set the token to indicate end of line */
1681 tok->settyp(TOKT_EOF);
1682
1683 /* set the token to point to the end of the buffer */
1684 tok->set_text(srcbuf->get_text_end(), 0);
1685 }
1686
1687 /* return the token type */
1688 return tok->gettyp();
1689 }
1690
1691 /*
1692 * Get the next token on the line, translating escapes in strings. This
1693 * updates the line buffer in-place to incorporate the translated string
1694 * text.
1695 */
next_on_line_xlat(utf8_ptr * p,CTcToken * tok,int * in_embedding)1696 tc_toktyp_t CTcTokenizer::next_on_line_xlat(utf8_ptr *p, CTcToken *tok,
1697 int *in_embedding)
1698 {
1699 /* skip whitespace */
1700 skip_ws_and_markers(p);
1701
1702 /* if this is a string, translate escapes */
1703 switch(p->getch())
1704 {
1705 case '"':
1706 case '\'':
1707 /* translate the string */
1708 return xlat_string(p, tok, in_embedding);
1709
1710 case '>':
1711 /* if we're in an embedding, check for '>>' */
1712 if (in_embedding != 0 && *in_embedding && p->getch_at(1) == '>')
1713 return tokenize_string(p, tok, in_embedding);
1714
1715 /* use the default case */
1716 goto do_normal;
1717
1718 default:
1719 do_normal:
1720 /* for anything else, use the default tokenizer */
1721 return next_on_line(p, tok, in_embedding);
1722 }
1723 }
1724
1725 /*
1726 * Look up a keyword
1727 */
look_up_keyword(const CTcToken * tok,tc_toktyp_t * kwtok)1728 int CTcTokenizer::look_up_keyword(const CTcToken *tok, tc_toktyp_t *kwtok)
1729 {
1730 CTcHashEntryKw *kw;
1731
1732 /* look it up in the keyword table */
1733 kw = (CTcHashEntryKw *)kw_->find(tok->get_text(), tok->get_text_len());
1734 if (kw != 0)
1735 {
1736 /* we found the keyword - set 'kw' to the keyword token id */
1737 *kwtok = kw->get_tok_id();
1738
1739 /* tell the caller we found it */
1740 return TRUE;
1741 }
1742 else
1743 {
1744 /* tell the caller it's not a keyword */
1745 return FALSE;
1746 }
1747 }
1748
1749 /*
1750 * Get the next token on the line, translating escape sequences in
1751 * strings, and storing strings and symbols in the source block list.
1752 * This routine also translates keywords for token types.
1753 */
tc_toktyp_t CTcTokenizer::next_on_line_xlat_keep()
{
    tc_toktyp_t typ;

    /* 
     *   keep going until we find a valid symbol - we loop only when we
     *   encounter an invalid token, which we log and skip 
     */
    for (;;)
    {
        /* skip whitespace and macro expansion flags */
        skip_ws_and_markers(&p_);

        /* see what we have */
        switch(p_.getch())
        {
        case '"':
        case '\'':
            /* it's a string - translate and save it */
            return xlat_string_to_src(&main_in_embedding_, FALSE);

        case '>':
            /* if we're in an embedding, this is the end of it */
            if (main_in_embedding_ && p_.getch_at(1) == '>')
                return xlat_string_to_src(&main_in_embedding_, FALSE);

            /* use the normal parsing */
            goto do_normal;

        default:
        do_normal:
            /* for anything else, use the default tokenizer */
            typ = next_on_line(&p_, &curtok_, &main_in_embedding_);

            /* check the token type */
            switch(typ)
            {
            case TOKT_SYM:
                /* symbol */
                {
                    const char *p;
                    CTcHashEntryKw *kw;
                    
                    /* look it up in the keyword table */
                    kw = (CTcHashEntryKw *)kw_->find(curtok_.get_text(),
                        curtok_.get_text_len());
                    if (kw != 0)
                    {
                        /* replace the token with the keyword token type */
                        typ = kw->get_tok_id();
                        curtok_.settyp(typ);
                    }
                    else
                    {
                        /* ordinary symbol - save the text */
                        p = store_source(curtok_.get_text(),
                                         curtok_.get_text_len());
                        
                        /* 
                         *   change the token's text to point to the
                         *   source block, so that this token's text
                         *   pointer will remain permanently valid (the
                         *   original copy, in the source line buffer,
                         *   will be overwritten as soon as we read
                         *   another source line; we don't want the caller
                         *   to have to worry about this, so we return the
                         *   permanent copy) 
                         */
                        curtok_.set_text(p, curtok_.get_text_len());
                    }
                }
                break;

            case TOKT_FLOAT:
                /* floating-point number */
                {
                    const char *p;

                    /* 
                     *   save the text so that it remains permanently
                     *   valid - we keep track of floats by the original
                     *   text, and let the code generator produce the
                     *   appropriate object file representation 
                     */
                    p = store_source(curtok_.get_text(),
                                     curtok_.get_text_len());
                    curtok_.set_text(p, curtok_.get_text_len());
                }
                break;

            case TOKT_INVALID:
                /* log an error for the invalid token */
                log_error_curtok(TCERR_INVALID_CHAR);

                /* skip this character */
                p_.inc();

                /* keep going (this continues the outer for(;;) loop) */
                continue;
            }
        }

        /* 
         *   return the type (any token type other than TOKT_INVALID falls
         *   out of the nested switches to here) 
         */
        return typ;
    }
}
1857
1858
1859 /*
1860 * Translate the string at the current token position in the input
1861 * stream to the source block list.
1862 */
xlat_string_to_src(int * in_embedding,int force_embed_end)1863 tc_toktyp_t CTcTokenizer::xlat_string_to_src(int *in_embedding,
1864 int force_embed_end)
1865 {
1866 tc_toktyp_t typ;
1867
1868 /*
1869 * Reserve space for the entire rest of the line. This is
1870 * conservative, in that we will definitely need less space than
1871 * this. This might cause us to waste a little space here and
1872 * there, since we will over-allocate when we have a short string
1873 * early in a long line, but this will save us the time of scanning
1874 * the string twice just to see how long it is.
1875 */
1876 reserve_source(line_len_ - (p_.getptr() - line_start_));
1877
1878 /* translate into the source block */
1879 typ = xlat_string_to(src_ptr_, &p_, &curtok_,
1880 in_embedding, force_embed_end);
1881
1882 /* commit the space in the source block */
1883 commit_source(curtok_.get_text_len() + 1);
1884
1885 /* return the string token */
1886 return typ;
1887 }
1888
1889 /*
1890 * Translate a string, setting up the token structure for the string,
1891 * and writing the translated version of the string directly over the
1892 * original source buffer of the string.
1893 *
1894 * Since a translated string can only shrink (because a translated
1895 * escape sequence is always shorter than the original source version),
1896 * we don't need a separate buffer, but can simply translate into the
1897 * source buffer, overwriting the original string as we go.
1898 */
xlat_string(utf8_ptr * p,CTcToken * tok,int * in_embedding)1899 tc_toktyp_t CTcTokenizer::xlat_string(utf8_ptr *p, CTcToken *tok,
1900 int *in_embedding)
1901 {
1902 char *dst;
1903
1904 /*
1905 * write the translated string over the original string's text,
1906 * starting at the character after the quote
1907 */
1908 dst = p->getptr() + 1;
1909
1910 /* translate the string into our destination buffer */
1911 return xlat_string_to(dst, p, tok, in_embedding, FALSE);
1912 }
1913
1914 /*
1915 * Translate a string, setting up the token structure for the string.
1916 * We will update the line buffer in-place to incorporate the translated
1917 * string text.
1918 */
tc_toktyp_t CTcTokenizer::xlat_string_to(char *dstp, utf8_ptr *p,
                                         CTcToken *tok, int *in_embedding,
                                         int force_embed_end)
{
    utf8_ptr dst;
    wchar_t qu;
    utf8_ptr start, end;
    int i;

    /* set up our output utf8 pointer */
    dst.set(dstp);
    
    /* note the open quote character */
    qu = p->getch();

    /* set the appropriate string token type */
    tok->settyp(qu == '"'
                ? TOKT_DSTR
                : (qu == '>' ? TOKT_DSTR_END : TOKT_SSTR));

    /* skip the open quote */
    p->inc();

    /* skip the second '>' if it's a '>>' */
    if (force_embed_end)
    {
        /* 
         *   they want us to assume the embedding ends here, regardless of
         *   what we're looking at - act the same as though we had
         *   actually seen '>>', but don't skip any input (in fact, back
         *   up one, since we already skipped one character for what we
         *   had thought was the open quote 
         */
        p->dec();

        /* clear the caller's in-embedding status */
        *in_embedding = FALSE;

        /* close with a double quote */
        qu = '"';

        /* it's a double-quoted string continuation */
        tok->settyp(TOKT_DSTR_END);
    }
    else if (qu == '>')
    {
        /* skip the second '>' */
        p->inc();

        /* clear the caller's in-embedding status */
        *in_embedding = FALSE;

        /* close with a double quote */
        qu = '"';
    }

    /* 
     *   remember where the string's contents start 
     *   
     *   NOTE(review): 'start' is never read after this assignment - it
     *   appears to be a leftover local; confirm before removing 
     */
    start = *p;

    /* scan the string and translate quotes */
    for (;;)
    {
        wchar_t cur;

        /* get this character */
        cur = p->getch();

        /* if this is the matching quote, we're done */
        if (cur == qu)
            break;

        /*
         *   if we find an end-of-line within the string, it's an error -
         *   we should always splice strings together onto a single line
         *   before starting to tokenize the line 
         */
        if (cur == '\0')
        {
            size_t len;
            /* note: this local deliberately shadows the 'p' parameter */
            utf8_ptr p;

            /* note where the string ends */
            end = dst;

            /* set the token's text pointer */
            tok->set_text(dstp, end.getptr() - dstp);

            /* null-terminate the result string */
            dst.setch('\0');
            
            /* 
             *   get the length of the unterminated string so far, but for
             *   error logging, limit the length to twenty characters --
             *   we just want to give the user enough information to find
             *   the string in error, without making the error message
             *   huge 
             */
            p.set(dstp);
            len = p.len(end.getptr() - dstp);
            if (len > 20)
                len = p.bytelen(20);

            /*
             *   Check for a special heuristic case.  If the string was of
             *   zero length, and we have something sitting in our
             *   unsplice buffer, here's what probably happened: the input
             *   was missing a ">>" sequence at the end of an embedded
             *   expression, and the parser told us to put it back in.  We
             *   had earlier decided we needed to splice up to a quote to
             *   end what looked to us like an unterminated string.  If
             *   this is the case, we and the parser are working at cross
             *   purposes; the parser is smarter than we are, so we should
             *   synchronize with it.  
             */
            if (tok->get_text_len() == 0
                && qu == '"'
                && unsplicebuf_.get_text_len() != 0)
            {
                char *buf;
                
                /* 
                 *   we must have spliced a line to finish a string -
                 *   insert the quote into the splice buffer, and ignore
                 *   it here 
                 */

                /* 
                 *   make sure there's room for one more character (plus a
                 *   null byte) 
                 */
                unsplicebuf_.ensure_space(unsplicebuf_.get_text_len() + 2);

                /* get the buffer pointer */
                buf = unsplicebuf_.get_buf();

                /* make room for the '"' */
                memmove(buf + 1, buf, unsplicebuf_.get_text_len());
                unsplicebuf_.set_text_len(unsplicebuf_.get_text_len() + 1);

                /* add the '"' */
                *buf = '"';

                /* 
                 *   return the 'null token' to tell the caller to try
                 *   again - do not log an error at this point 
                 */
                return TOKT_NULLTOK;
            }

            /* log the error */
            log_error(TCERR_UNTERM_STRING,
                      (char)qu, (int)len, dstp, (char)qu);

            /* return the string type */
            return tok->gettyp();
        }

        /* if this is an escape, translate it */
        if (cur == '\\')
        {
            long acc;
            
            /* get the character after the escape */
            p->inc();
            cur = p->getch();

            /* see what we have */
            switch(cur)
            {
            case '^':
                /* caps - 0x000F */
                cur = 0x000F;
                break;

            case 'v':
                /* miniscules - 0x000E */
                cur = 0x000E;
                break;

            case 'b':
                /* blank line - 0x000B */
                cur = 0x000B;
                break;

            case ' ':
                /* quoted space - 0x0015 */
                cur = 0x0015;
                break;

            case 'n':
                /* newline - explicitly use Unicode 10 character */
                cur = 10;
                break;

            case 't':
                /* tab - explicitly use Unicode 9 character */
                cur = 9;
                break;

            case 'u':
                /* 
                 *   Hex unicode character number.  Read up to 4 hex
                 *   digits that follow the 'u', and use that as a Unicode
                 *   character ID.  
                 */
                for (i = 0, acc = 0, p->inc() ; i < 4 ; ++i, p->inc())
                {
                    /* get the next character */
                    cur = p->getch();

                    /* 
                     *   if it's another hex digit, add it into the
                     *   accumulator; otherwise, we're done 
                     */
                    if (is_xdigit(cur))
                        acc = 16*acc + value_of_xdigit(cur);
                    else
                        break;
                }

                /* use the accumulated value as the character number */
                dst.setch((wchar_t)acc);

                /* 
                 *   continue with the current character, since we've
                 *   already skipped ahead to the next one 
                 */
                continue;

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                /* 
                 *   Octal ASCII character number.  Accumulate up to three
                 *   octal numbers, and use the result as a character ID.
                 *   (The loop bound of 4 allows a leading zero plus up to
                 *   three significant digits; the 255 cap below limits
                 *   the value in any case.)  
                 */
                for (i = 0, acc = 0 ; i < 4 ; ++i, p->inc())
                {
                    /* get the next character */
                    cur = p->getch();

                    /* 
                     *   if it's another digit, and it would leave our
                     *   result in the 0-255 range, count it; if not,
                     *   we're done 
                     */
                    if (is_odigit(cur))
                    {
                        long new_acc;

                        /* compute the new value */
                        new_acc = 8*acc + value_of_odigit(cur);

                        /* if this would be too high, don't count it */
                        if (new_acc > 255)
                            break;
                        else
                            acc = new_acc;
                    }
                    else
                        break;
                }

                /* use the accumulated value as the character number */
                dst.setch((wchar_t)acc);

                /* 
                 *   continue with the current character, since we've
                 *   already skipped ahead to the next one 
                 */
                continue;

            case 'x':
                /* 
                 *   Hex ASCII character number.  Read up to two hex
                 *   digits as a character number.  
                 */
                for (i = 0, acc = 0, p->inc() ; i < 2 ; ++i, p->inc())
                {
                    /* get the next character */
                    cur = p->getch();

                    /* 
                     *   if it's another hex digit, add it into the
                     *   accumulator; otherwise, we're done 
                     */
                    if (is_xdigit(cur))
                        acc = 16*acc + value_of_xdigit(cur);
                    else
                        break;
                }

                /* use the accumulated value as the character number */
                dst.setch((wchar_t)acc);

                /* 
                 *   continue with the current character, since we've
                 *   already skipped ahead to the next one 
                 */
                continue;

            default:
                /* copy anything else as-is */
                break;
            }
        }
        else if (in_embedding != 0 && !*in_embedding
                 && cur == '<' && p->getch_at(1) == '<')
        {
            /* 
             *   it's the start of an embedded expression - change the
             *   type to so indicate 
             */
            tok->settyp(tok->gettyp() == TOKT_DSTR
                        ? TOKT_DSTR_START : TOKT_DSTR_MID);

            /* tell the caller we're in an embedding */
            *in_embedding = TRUE;

            /* stop scanning */
            break;
        }

        /* copy this character to the output position */
        dst.setch(cur);

        /* get the next character */
        p->inc();
    }

    /* note where the string ends */
    end = dst;

    /* set the token's text pointer */
    tok->set_text(dstp, end.getptr() - dstp);

    /* null-terminate the result string */
    dst.setch('\0');

    /* 
     *   skip an extra character if this is the start of an embedding (we
     *   stopped on the first '<' of the '<<' sequence) 
     */
    if (p->getch() == '<')
        p->inc();

    /* skip the closing quote */
    p->inc();

    /* return the string type */
    return tok->gettyp();
}
2273
2274
2275 /*
2276 * Skip a string, setting up the token structure for the string. This
2277 * routine only parses to the end of the line; if the line ends with the
2278 * string unterminated, we'll flag an error
2279 */
tc_toktyp_t CTcTokenizer::tokenize_string(utf8_ptr *p, CTcToken *tok,
                                          int *in_embedding)
{
    const char *start;
    const char *contents_start;
    const char *contents_end;
    tc_toktyp_t typ;
    wchar_t qu;
    int allow_embedding;
    
    /* remember where the text starts */
    start = p->getptr();

    /* note the quote type */
    qu = p->getch();

    /* skip the quote in the input */
    p->inc();

    /* determine the token type based on the quote type */
    switch(qu)
    {
    case '\'':
        /* single-quoted string */
        typ = TOKT_SSTR;
        allow_embedding = FALSE;
        break;

    case '>':
        /*
         *   this must be the next part of a string with embeddings; for now,
         *   assume it's the end of the string, although it may just turn out
         *   to be the middle 
         */
        typ = TOKT_DSTR_END;
        allow_embedding = (in_embedding != 0);

        /* skip the extra '>' character */
        p->inc();

        /* clear the embedding flag */
        if (in_embedding != 0)
            *in_embedding = FALSE;

        /* look for a closing double quote */
        qu = '"';
        break;

    case '"':
        /* regular double-quoted string */
        typ = TOKT_DSTR;
        allow_embedding = (in_embedding != 0);
        break;

    default:
        /* anything else is invalid */
        typ = TOKT_INVALID;
        allow_embedding = FALSE;
        break;
    }

    /* this is where the string's contents start */
    contents_start = p->getptr();

    /* 
     *   scan the string - we don't translate escape sequences here, just
     *   find where the string ends so we can record its raw text 
     */
    for (;;)
    {
        wchar_t cur;

        /* get the current character */
        cur = p->getch();

        /* see what we have */
        if (cur == '\\')
        {
            /* escape sequence - skip an extra character */
            p->inc();
        }
        else if (cur == '<' && allow_embedding && p->getch_at(1) == '<')
        {
            /* 
             *   it's the start of an embedded expression - return the
             *   appropriate embedded string part type 
             */
            if (typ == TOKT_DSTR)
                typ = TOKT_DSTR_START;
            else
                typ = TOKT_DSTR_MID;

            /* remember that we're in an embedding in the token stream */
            *in_embedding = TRUE;

            /* this is where the contents end */
            contents_end = p->getptr();

            /* skip the two embedding characters */
            p->inc();
            p->inc();

            /* we're done - set the text in the token */
            tok->set_text(start, p->getptr() - start);

            /* done */
            break;
        }
        else if (cur == qu)
        {
            /* this is where the contents end */
            contents_end = p->getptr();

            /* skip the closing quote */
            p->inc();

            /* we're done - set the text in the token */
            tok->set_text(start, p->getptr() - start);

            /* done */
            break;
        }
        else if (cur == '\0')
        {
            /* this is where the contents end */
            contents_end = p->getptr();

            /*
             *   We have an unterminated string.  If we're evaluating a
             *   preprocessor constant expression, log an error; otherwise
             *   let it go for now, since we'll catch the error during the
             *   normal tokenizing pass for parsing.  
             */
            if (G_tok->in_pp_expr_)
                log_error(TCERR_PP_UNTERM_STRING);

            /* set the partial text */
            tok->set_text(start, p->getptr() - start);

            /* end of line - return with the string unfinished */
            break;
        }

        /* skip this character of input */
        p->inc();
    }

    /*
     *   if we're not in preprocessor mode, and we're saving string text,
     *   write the string to the string text output file 
     */
    if (!G_tok->in_pp_expr_ && G_tok->string_fp_ != 0
        && contents_start != contents_end)
    {
        /* write the line, translating back to the source character set */
        G_tok->string_fp_map_
            ->write_file(G_tok->string_fp_, contents_start,
                         (size_t)(contents_end - contents_start));

        /* add a newline */
        osfwb(G_tok->string_fp_, "\n", 1);
    }

    /* set the type in the token */
    tok->settyp(typ);

    /* return the token type */
    return tok->gettyp();
}
2446
2447
2448 /* ------------------------------------------------------------------------ */
2449 /*
2450 * Read a source line and handle preprocessor directives. This routine
2451 * will transparently handle #include, #define, and other directives;
2452 * when this routine returns, the input buffer will have a line of text
2453 * that contains no # directive.
2454 *
2455 * Returns zero on success, non-zero upon reaching the end of the input.
2456 */
int CTcTokenizer::read_line_pp()
{
    int started_in_string;
    char *p;

    /*
     * Read the next line from the input. If that fails, return an end
     * of file indication.
     */
    p = read_line(FALSE);
    if (p == 0)
        return 1;

    /*
     * before we process comments, note whether or not the line started
     * out within a character string
     */
    started_in_string = (in_quote_ != '\0');

    /* set up our source pointer to the start of the new line */
    start_new_line(p, linebuf_.get_text_len());

    /* skip leading whitespace */
    while (is_space(p_.getch()))
        p_.inc();

    /*
     * If this line begins with a '#', process the directive. Ignore
     * any initial '#' if the line started off in a string.
     */
    if (!started_in_string && p_.getch() == '#' && allow_pp_)
    {
        /* dispatch-table entry mapping a directive keyword to its handler */
        struct pp_kw_def
        {
            const char *kw;                /* directive keyword text */
            int process_in_false_if;       /* run even in a false #if branch? */
            void (CTcTokenizer::*func)();  /* handler member function */
        };
        static pp_kw_def kwlist[] =
        {
            { "charset", FALSE, &CTcTokenizer::pp_charset },
            { "pragma", FALSE, &CTcTokenizer::pp_pragma },
            { "include", FALSE, &CTcTokenizer::pp_include },
            { "define", FALSE, &CTcTokenizer::pp_define },
            { "if", TRUE, &CTcTokenizer::pp_if },
            { "ifdef", TRUE, &CTcTokenizer::pp_ifdef },
            { "ifndef", TRUE, &CTcTokenizer::pp_ifndef },
            { "else", TRUE, &CTcTokenizer::pp_else },
            { "elif", TRUE, &CTcTokenizer::pp_elif },
            { "endif", TRUE, &CTcTokenizer::pp_endif },
            { "error", FALSE, &CTcTokenizer::pp_error },
            { "undef", FALSE, &CTcTokenizer::pp_undef },
            { "line", FALSE, &CTcTokenizer::pp_line },
            { 0, 0, 0 }                    /* end-of-table sentinel */
        };
        pp_kw_def *kwp;
        const char *kwtxt;
        size_t kwlen;

        /* skip the '#' */
        p_.inc();

        /*
         * If the line ended inside a comment, read the next line until
         * we're no longer in a comment. The ANSI C preprocessor rules
         * say that a newline in a comment should not be treated as a
         * lexical newline, so pretend that the next line is part of the
         * preprocessor line in such a case.
         */
        while (str_->is_in_comment())
        {
            size_t p_ofs;

            /* remember the current offset in the line buffer */
            p_ofs = p_.getptr() - linebuf_.get_buf();

            /* append another line - stop at the end of the stream */
            if (read_line(TRUE))
                break;

            /* restore the line pointer, in case the buffer moved */
            start_new_line(linebuf_.get_buf() + p_ofs,
                           linebuf_.get_text_len() - p_ofs);
        }

        /* read the directive */
        next_on_line();

        /*
         * if we've reached the end of the line, it's a null directive;
         * simply return an empty line
         */
        if (curtok_.gettyp() == TOKT_EOF)
        {
            clear_linebuf();
            return 0;
        }

        /* get the text and length of the keyword */
        kwtxt = curtok_.get_text();
        kwlen = curtok_.get_text_len();

        /* if it's not a symbol, it's not a valid directive */
        if (curtok_.gettyp() != TOKT_SYM)
        {
            /* log the error and return an empty line */
            log_error(TCERR_INV_PP_DIR, (int)kwlen, kwtxt);
            clear_linebuf();
            return 0;
        }

        /* determine which keyword we have, and process it */
        for (kwp = kwlist ; kwp->kw != 0 ; ++kwp)
        {
            /* is this our keyword? */
            if (strlen(kwp->kw) == kwlen
                && memcmp(kwtxt, kwp->kw, kwlen) == 0)
            {
                /*
                 * This is our directive.
                 *
                 * If we're in the false branch of a #if block, only
                 * process the directive if it's a kind of directive
                 * that we should process in false #if branches. The
                 * only directives that we process in #if branches are
                 * those that would affect the #if branching, such as a
                 * #endif or a nested #if.
                 */
                if (!in_false_if() || kwp->process_in_false_if)
                {
                    /* invoke the handler to process the directive */
                    (this->*(kwp->func))();
                }
                else
                {
                    /*
                     * we're in a #if branch not taken - simply clear
                     * the buffer
                     */
                    clear_linebuf();
                }

                /* we don't need to look any further */
                break;
            }
        }

        /*
         * if we didn't find the keyword, log an error and otherwise
         * ignore the entire line
         */
        if (kwp->kw == 0)
            log_error(TCERR_INV_PP_DIR, (int)kwlen, kwtxt);

        /*
         * Preprocessor lines must always be entirely self-contained.
         * Therefore, it's not valid for a string to start on a
         * preprocessor line and continue onto subsequent lines. If
         * we're marked as being inside a string, there must have been
         * an error on the preprocessor line. Simply clear the
         * in-string flag; we don't need to issue an error at this
         * point, since the preprocessor line handler should have
         * already caught the problem and reported an error.
         */
        in_quote_ = '\0';
    }
    else
    {
        /*
         * There's no preprocessor directive.
         *
         * If we're in a false #if branch, return an empty line. We
         * return an empty line rather than skipping to the next line so
         * that the caller sees the same number of lines as are in the
         * original source.
         */
        if (in_false_if())
        {
            /*
             * it's a #if not taken - we don't want to compile the line
             * at all, so just clear it out
             */
            clear_linebuf();
            expbuf_.clear_text();
        }
        else
        {
            /*
             * If we ended the line in a string, splice additional lines
             * onto the end of this line until we find the end of the
             * string, then unsplice the part after the end of the
             * string.
             */
            if (in_quote_ != '\0')
            {
                /* splice additional lines to finish the quote */
                splice_string();
            }

            /*
             * Expand macros in the line, splicing additional source
             * lines if necessary to fill out any incomplete actual
             * parameter lists.
             */
            start_new_line(linebuf_.get_buf(), linebuf_.get_text_len());
            expand_macros_curline(TRUE, FALSE, FALSE);
        }

        /* store the line in the appropriate place */
        if (pp_only_mode_)
        {
            /*
             * we're only preprocessing - store the macro-expanded line
             * back in the line buffer so that the caller can read out
             * the final preprocessed text
             */
            linebuf_.copy(expbuf_.get_text(), expbuf_.get_text_len());
        }
        else
        {
            /*
             * We're compiling - simply read subsequent tokens out of
             * the expansion buffer.
             */
            start_new_line(expbuf_.get_buf(), expbuf_.get_text_len());
        }
    }

    /* return success */
    return 0;
}
2688
2689 /* ------------------------------------------------------------------------ */
2690 /*
2691 * Read the next line from the input file. Returns a pointer to the
2692 * start of the newly-read data on success, or null if we reach the end
2693 * of the input.
2694 *
2695 * If 'append' is true, we'll add the line on to the end of the existing
2696 * buffer; otherwise, we'll overwrite what's in the buffer.
2697 *
2698 * The only preprocessing performed in this routine is line-splicing.
2699 * Any line that ends with a backslash character will be spliced with
2700 * the following line, with the backslash and newline removed.
2701 *
2702 * The new line will be stored in our internal buffer, and will be
2703 * null-terminated with the trailing newline removed.
2704 *
2705 * If we reach the end of the current file, and there's an enclosing
2706 * file, we'll resume reading from the enclosing file. Hence, when this
2707 * routine returns non-zero, it indicates that we've reached the end of
2708 * the entire source, not just of the current file.
2709 */
char *CTcTokenizer::read_line(int append)
{
    size_t len;
    size_t start_len;

    /* if there's no input stream, indicate end-of-file */
    if (str_ == 0)
        return 0;

    /* if we're not appending, clear out the line buffer */
    if (!append)
    {
        /* start with an empty line */
        clear_linebuf();

        /* note the current input position */
        last_desc_ = str_->get_desc();
        last_linenum_ = str_->get_next_linenum();
    }

    /* note where the new data starts */
    len = linebuf_.get_text_len();
    start_len = len;

    /*
     * if there's anything in the unsplice buffer, use it as the new
     * line
     */
    if (unsplicebuf_.get_text_len() != 0)
    {
        /*
         * Copy the unsplice buffer as the current line. Note that we
         * don't have to worry about any of the complicated cases, such
         * as whether or not it ends with a newline or a backslash,
         * because the unspliced line was already processed as an input
         * line when we read it in the first place.
         */
        linebuf_.append(unsplicebuf_.get_text(), unsplicebuf_.get_text_len());

        /* clear the unsplice buffer, since it's been consumed now */
        unsplicebuf_.clear_text();

        /*
         * make the current line the appended line - if we're
         * unsplicing, it means that we appended, so the current line is
         * now the line from which the last appended text came
         */
        last_desc_ = appended_desc_;
        last_linenum_ = appended_linenum_;

        /* return the new text */
        return linebuf_.get_buf() + start_len;
    }

    /* if we're appending, note where the appendage is coming from */
    if (append)
    {
        /* remember the last source line appended */
        appended_desc_ = str_->get_desc();
        appended_linenum_ = str_->get_next_linenum();
    }

    /* keep going until we finish reading the input line */
    for ( ;; )
    {
        size_t curlen;

        /* read a line of text from the input file */
        curlen = str_->get_src()->
                 read_line(linebuf_.get_buf() + len,
                           linebuf_.get_buf_size() - len);

        /* check for end of file */
        if (curlen == 0)
        {
            CTcTokStream *old_str;

            /*
             * We've reached the end of the current input stream. If
             * we've already read anything into the current line, it
             * means that the file ended in mid-line, without a final
             * newline character; ignore this and proceed with the line
             * as it now stands in this case.
             */
            if (len > start_len)
                break;

            /*
             * We've finished with this stream. If there's a parent
             * stream, return to it; otherwise, we're at the end of the
             * source.
             */

            /*
             * if we didn't close all of the #if/#ifdef levels opened
             * within this file, flag one or more errors
             */
            while (if_sp_ > str_->get_init_if_level())
            {
                const char *fname;

                /* get the filename from the #if stack */
                fname = if_stack_[if_sp_ - 1].desc->get_fname();

                /* if we're in test reporting mode, use the root name only */
                if (test_report_mode_)
                    fname = os_get_root_name((char *)fname);

                /* log the error */
                log_error(TCERR_IF_WITHOUT_ENDIF,
                          if_stack_[if_sp_ - 1].linenum,
                          (int)strlen(fname), fname);

                /* discard the #if level */
                pop_if();
            }

            /* remember the old stream */
            old_str = str_;

            /* return to the parent stream, if there is one */
            str_ = str_->get_parent();

            /* delete the old stream now that we're done with it */
            delete old_str;

            /* note the new file the line will be coming from */
            if (!append && str_ != 0)
            {
                last_desc_ = str_->get_desc();
                last_linenum_ = str_->get_next_linenum();
            }

            /* if there's no stream, return end of file */
            if (str_ == 0)
                return 0;

            /*
             * restore the #pragma newline_spacing mode that was in effect
             * when we interrupted the parent stream
             */
            string_newline_spacing_ = str_->get_newline_spacing();

            /* if there's a parser, notify it of the new pragma C mode */
            // if (G_prs != 0)
            //     G_prs->set_pragma_c(str_->is_pragma_c());

            /* go back to read the next line from the parent */
            continue;
        }

        /* set the new length of the buffer contents */
        len += curlen - 1;
        linebuf_.set_text_len(len);

        /*
         * Check the result to see if it ends in a newline. If not, it
         * means either that we don't have room in the buffer for the
         * full source line, or we've reached the last line in the file,
         * and it doesn't end with a newline.
         *
         * Note that the file reader will always supply us with '\n'
         * newlines, regardless of the local operating system
         * conventions.
         *
         * Also, check to see if the line ends with '\\'. If so, remove
         * the '\\' character and read the next line, since this
         * indicates that the logical line continues onto the next
         * newline-deliminted line.
         */
        if (len != 0 && linebuf_.get_text()[len - 1] != '\n')
        {
            /*
             * There's no newline, hence the file reader wasn't able to
             * fit the entire line into our buffer, or else we've read
             * the last line in the file and there's no newline at the
             * end. If we haven't reached the end of the file, expand
             * our line buffer to make room to read more from this same
             * line.
             */
            if (!str_->get_src()->at_eof())
                linebuf_.expand();
        }
        else if (len > 1 && linebuf_.get_text()[len - 2] == '\\')
        {
            /*
             * There's a backslash at the end of the line, so they want
             * to continue this logical line. Remove the backslash, and
             * read the next line onto the end of the current line.
             *
             * Note that we must remove two characters from the end of
             * the line (and tested for buf_[len-2] above) because we
             * have both a backslash and a newline at the end of the
             * line.
             */
            len -= 2;
            linebuf_.set_text_len(len);

            /* count reading the physical line */
            str_->count_line();
        }
        else
        {
            /* remove the newline from the buffer */
            if (len != 0)
            {
                --len;
                linebuf_.set_text_len(len);
            }

            /* count reading the line */
            str_->count_line();

            /* done */
            break;
        }
    }

    /*
     * remove comments from the newly-read material - this replaces each
     * comment by a single whitespace character
     */
    process_comments(start_len);

    /*
     * we've successfully read a line -- return a pointer to the start
     * of the newly-read text
     */
    return linebuf_.get_buf() + start_len;
}
2940
2941 /*
2942 * Un-splice a line at the given point. This breaks the current source
2943 * line in two, keeping the part before the given point as the current
2944 * line, but making the part from the given point to the end of the line
2945 * a new source line. We'll put the new source line into a special
2946 * holding buffer, and then fetch this part as a new line the next time
2947 * we read a line in read_line().
2948 */
unsplice_line(const char * new_line_start)2949 void CTcTokenizer::unsplice_line(const char *new_line_start)
2950 {
2951 size_t keep_len;
2952
2953 /* make sure the starting point is within the current line */
2954 if (!(new_line_start >= linebuf_.get_text()
2955 && new_line_start <= linebuf_.get_text() + linebuf_.get_text_len()))
2956 {
2957 /* note the error - this is an internal problem */
2958 throw_internal_error(TCERR_UNSPLICE_NOT_CUR);
2959 return;
2960 }
2961
2962 /*
2963 * make sure the unsplice buffer is empty - we should never have to
2964 * unsplice from a line more than once, because unsplicing should
2965 * terminate the current line at the current point
2966 */
2967 if (unsplicebuf_.get_text_len() != 0)
2968 {
2969 throw_internal_error(TCERR_MULTI_UNSPLICE);
2970 return;
2971 }
2972
2973 /* calculate the length of the part we're keeping */
2974 keep_len = new_line_start - linebuf_.get_text();
2975
2976 /* put the remainder of the current line in the unsplice buffer */
2977 unsplicebuf_.append(new_line_start, linebuf_.get_text_len() - keep_len);
2978
2979 /* cut off the current line at the given point */
2980 linebuf_.set_text_len(keep_len);
2981 }
2982
2983
2984 /* ------------------------------------------------------------------------ */
2985 /*
2986 * Store text in the source array
2987 */
store_source(const char * txt,size_t len)2988 const char *CTcTokenizer::store_source(const char *txt, size_t len)
2989 {
2990 const char *p;
2991
2992 /* reserve space in the source array */
2993 reserve_source(len);
2994
2995 /* remember where the string starts */
2996 p = src_ptr_;
2997
2998 /* store the text */
2999 memcpy(src_ptr_, txt, len);
3000
3001 /* advance the source block write position and length */
3002 src_ptr_ += len;
3003 src_rem_ -= len;
3004
3005 /* null-terminate the copied text */
3006 *src_ptr_++ = '\0';
3007 --src_rem_;
3008
3009 /* return the storage pointer */
3010 return p;
3011 }
3012
3013 /*
3014 * Reserve space for text in the source array
3015 */
reserve_source(size_t len)3016 void CTcTokenizer::reserve_source(size_t len)
3017 {
3018 /*
3019 * if we don't have enough space for this line in the current source
3020 * block, start a new block
3021 */
3022 if (len + 1 > src_rem_)
3023 {
3024 CTcTokSrcBlock *blk;
3025
3026 /*
3027 * if the line is too long for a source block, throw a fatal
3028 * error
3029 */
3030 if (len + 1 > TCTOK_SRC_BLOCK_SIZE)
3031 throw_fatal_error(TCERR_SRCLINE_TOO_LONG,
3032 (long)TCTOK_SRC_BLOCK_SIZE);
3033
3034 /* allocate a new block */
3035 blk = new CTcTokSrcBlock();
3036
3037 /* link it into our list */
3038 src_cur_->set_next(blk);
3039
3040 /* it's now the current block */
3041 src_cur_ = blk;
3042
3043 /* start writing at the start of this block */
3044 src_rem_ = TCTOK_SRC_BLOCK_SIZE;
3045 src_ptr_ = blk->get_buf();
3046 }
3047 }
3048
3049 /*
3050 * Commit space previously reserved and now used in the source block
3051 * list
3052 */
commit_source(size_t len)3053 void CTcTokenizer::commit_source(size_t len)
3054 {
3055 /* advance the write position past the committed text */
3056 src_ptr_ += len;
3057 src_rem_ -= len;
3058 }
3059
3060
3061 /* ------------------------------------------------------------------------ */
3062 /*
3063 * Expand macros in the current line from the current source pointer,
3064 * filling in expbuf_ with the expanded result.
3065 */
int CTcTokenizer::expand_macros_curline(int read_more, int allow_defined,
                                        int append_to_expbuf)
{
    int err;
    utf8_ptr p;
    char *src;
    char *dst;

    /* expand macros in the current line */
    err = expand_macros(&linebuf_, &p_, &expbuf_, read_more, allow_defined,
                        append_to_expbuf);

    /* if that failed, return an error */
    if (err != 0)
        return err;

    /*
     * If we're not in preprocessor mode, there's no need to remove the
     * FULLY_EXPANDED flag bytes, since the tokenizer will know to skip
     * them.
     */
    if (!pp_only_mode_)
        return err;

    /*
     * Scan the expansion buffer and remove all of the no-more-expansion
     * flag bytes - we're done expanding the macro now, so we don't need
     * this information any longer. When we're preprocessing the file,
     * we don't want to leave these in the expanded source.
     */
    for (src = dst = expbuf_.get_buf(), p.set(src) ; p.getch() != '\0' ; )
    {
        /* check for an expansion-end flag byte */
        if (p.getch() == TOK_MACRO_EXP_END)
        {
            /* skip the flag byte and the following embedded pointer */
            src += 1 + sizeof(CTcHashEntryPp *);
            p.set(src);
        }
        else if (p.getch() == TOK_FULLY_EXPANDED_FLAG)
        {
            /* fully-expanded marker - skip just the flag byte */
            ++src;
            p.set(src);
        }
        else
        {
            /* ordinary character - skip this character */
            p.inc();

            /* copy the bytes of this character as-is */
            while (src < p.getptr())
                *dst++ = *src++;
        }
    }

    /* set the new buffer length */
    expbuf_.set_text_len(dst - expbuf_.get_buf());

    /* return the result */
    return err;
}
3128
3129 /* ------------------------------------------------------------------------ */
3130 /*
3131 * Expand macros in the current line, reading additional source lines if
3132 * necessary.
3133 *
3134 * 'src' is a pointer to the start of the text to expand; it must point
3135 * into the 'srcbuf' buffer. If 'src' is null, we'll simply start at
3136 * the beginning of the source buffer.
3137 */
int CTcTokenizer::expand_macros(CTcTokString *srcbuf, utf8_ptr *src,
                                CTcTokString *expbuf, int read_more,
                                int allow_defined, int append)
{
    tc_toktyp_t typ;
    CTcToken tok;
    CTcTokString *subexp;
    size_t startofs;
    utf8_ptr local_src;
    CTcTokStringRef local_srcbuf;
    CTcMacroRsc *res;
    int err;

    /* presume success */
    err = 0;

    /* get a macro expansion resource object */
    res = alloc_macro_rsc();
    if (res == 0)
        return 1;

    /* get our subexpression buffer from the resource object */
    subexp = &res->line_exp_;

    /* if there's no source buffer or source pointer, provide one */
    if (srcbuf == 0)
    {
        /*
         * there's no source buffer - provide our own non-allocated
         * buffer tied to the caller's buffer
         */
        local_srcbuf.set_buffer(src->getptr(), strlen(src->getptr()));
        srcbuf = &local_srcbuf;
    }
    else if (src == 0)
    {
        /*
         * there's no source pointer - start at the beginning of the
         * source buffer
         */
        local_src.set((char *)srcbuf->get_text());
        src = &local_src;
    }

    /* clear the expansion buffer, unless we're appending to the buffer */
    if (!append)
        expbuf->clear_text();

    /*
     * Make sure we have room for a copy of the source line. This is an
     * optimization for the simple case where we'll just copy the source
     * line unchanged, so that we don't have to repeatedly expand the
     * buffer; we will, however, expand the buffer dynamically later, if
     * this pre-allocation should prove to be insufficient.
     */
    expbuf->ensure_space(expbuf->get_text_len() + srcbuf->get_text_len());

    /* note the starting offset, if we have an underlying string buffer */
    startofs = src->getptr() - srcbuf->get_text();

    /* read the first token */
    typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);

    /* scan through the tokens on the line, looking for macros to expand */
    while (typ != TOKT_EOF)
    {
        /*
         * if it's a symbol, and it hasn't already been marked as fully
         * expanded, look it up in the #define table
         */
        if (typ == TOKT_SYM && !tok.get_fully_expanded())
        {
            CTcHashEntryPp *entry;

            /*
             * Look up the symbol in the #define symbol table. If we
             * find it, expand the macro. Otherwise, if the "defined"
             * operator is active, check for that.
             *
             * Do not expand the macro if we find that it has already
             * been expanded on a prior scan through the current text.
             */
            entry = find_define(tok.get_text(), tok.get_text_len());
            if ((entry != 0
                 && !scan_for_prior_expansion(*src, srcbuf->get_text_end(),
                                              entry))
                || (allow_defined
                    && tok.get_text_len() == 7
                    && memcmp(tok.get_text(), "defined", 7) == 0))
            {
                size_t macro_ofs;
                size_t rem_len;
                int expanded;

                /* get the offset of the macro token in the source buffer */
                macro_ofs = tok.get_text() - srcbuf->get_text();

                /* expand it into our sub-expansion buffer */
                if (entry != 0)
                {
                    /* expand the macro */
                    err = expand_macro(res, subexp, srcbuf, src,
                                       macro_ofs, entry,
                                       read_more, allow_defined, &expanded);
                }
                else
                {
                    /* parse and expand the defined() operator */
                    err = expand_defined(subexp, srcbuf, src);

                    /* "defined" always expands if there's not an error */
                    expanded = TRUE;
                }

                /* if an error occurred, return failure */
                if (err)
                    goto done;

                /*
                 * if we expanded something, append everything we
                 * skipped preceding the macro, then rescan; otherwise,
                 * just keep going without a rescan
                 */
                if (expanded)
                {
                    /* copy the preceding text to the output */
                    expbuf->append(srcbuf->get_text() + startofs,
                                   macro_ofs - startofs);
                }
                else
                {
                    /*
                     * we didn't expand - get the next token after the
                     * macro
                     */
                    typ = next_on_line(srcbuf, src, &tok,
                                       &macro_in_embedding_);

                    /* continue processing from this token */
                    continue;
                }

                /*
                 * We must now insert the expansion into the source
                 * buffer at the current point, and re-scan the
                 * expansion, *along with* the rest of the original
                 * source line (this is how ANSI C specifies the
                 * process).
                 *
                 * If we can read more, we must be reading out of the
                 * main input line buffer, so insert the expansion text
                 * directly into the original source stream, and
                 * continue reading out of the source stream; this will
                 * simplify the case where we must read more data from
                 * the file in the course of the expansion. If we can't
                 * read more, simply copy the remainder of the current
                 * input line onto the expanded macro and use it as the
                 * new input buffer.
                 */

                /* get the current offset in the source line */
                startofs = src->getptr() - srcbuf->get_text();

                /* figure out how much is left on the current line */
                rem_len = srcbuf->get_text_len() - startofs;

                /* check to see if we can read more */
                if (read_more)
                {
                    /*
                     * we're reading from the original line input buffer
                     * -- insert the expansion into the source buffer at
                     * the current point, replacing the original macro
                     * text
                     */

                    /* make sure we have room for adding the expansion text */
                    srcbuf->ensure_space(macro_ofs + rem_len
                                         + subexp->get_text_len());

                    /* make sure src is still pointing to the right place */
                    src->set(srcbuf->get_buf() + macro_ofs);

                    /* move the remainder of the current line to make room */
                    memmove(srcbuf->get_buf() + macro_ofs
                            + subexp->get_text_len(),
                            srcbuf->get_buf() + startofs,
                            rem_len);

                    /* insert the expansion text */
                    memcpy(srcbuf->get_buf() + macro_ofs, subexp->get_buf(),
                           subexp->get_text_len());

                    /* set the new source length */
                    srcbuf->set_text_len(macro_ofs + rem_len
                                         + subexp->get_text_len());

                    /* the new starting offset is the current position */
                    startofs = macro_ofs;

                    /* get the next token */
                    typ = next_on_line(srcbuf, src, &tok,
                                       &macro_in_embedding_);

                    /* continue processing from this token */
                    continue;
                }
                else
                {
                    /*
                     * we're reading from a read-only buffer -- add the
                     * remainder of the source to the expansion buffer,
                     * and recursively parse the remainder
                     */
                    subexp->append(srcbuf->get_text() + startofs, rem_len);

                    /*
                     * evaluate the remainder recursively and append it
                     * to the expansion already in progress
                     */
                    err = expand_macros(subexp, 0, expbuf, FALSE,
                                        allow_defined, TRUE);

                    /* we're done */
                    goto done;
                }
            }
        }

        /* get the next token */
        typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
    }

    /*
     * add the remainder of the source to the output - this is the text
     * from the last noted starting offset up to the start of the EOF
     * token's text
     */
    expbuf->append(srcbuf->get_text() + startofs,
                   tok.get_text() - startofs - srcbuf->get_text());

done:
    /* release our macro resource object */
    release_macro_rsc(res);

    /* return the result */
    return err;
}
3382
3383 /*
3384 * Allocate a macro resource object. If we're out of resource objects
3385 * in the pool, we'll add another object to the pool.
3386 */
alloc_macro_rsc()3387 CTcMacroRsc *CTcTokenizer::alloc_macro_rsc()
3388 {
3389 CTcMacroRsc *rsc;
3390
3391 /*
3392 * if there's anything in the available list, take the first item
3393 * off the list and return it
3394 */
3395 if (macro_res_avail_ != 0)
3396 {
3397 /* remember the item to return */
3398 rsc = macro_res_avail_;
3399
3400 /* remove it from the list */
3401 macro_res_avail_ = macro_res_avail_->next_avail_;
3402
3403 /* return it */
3404 return rsc;
3405 }
3406
3407 /* there's nothing on the available list - allocate a new item */
3408 rsc = new CTcMacroRsc();
3409
3410 /* if that failed, return failure */
3411 if (rsc == 0)
3412 {
3413 log_error(TCERR_OUT_OF_MEM_MAC_EXP);
3414 return 0;
3415 }
3416
3417 /* add it onto the master list */
3418 rsc->next_ = macro_res_head_;
3419 macro_res_head_ = rsc;
3420
3421 /* return it */
3422 return rsc;
3423 }
3424
3425 /*
3426 * Release a macro resource, returning it to the pool
3427 */
release_macro_rsc(CTcMacroRsc * rsc)3428 void CTcTokenizer::release_macro_rsc(CTcMacroRsc *rsc)
3429 {
3430 /* put it back at the head of the available list */
3431 rsc->next_avail_ = macro_res_avail_;
3432 macro_res_avail_ = rsc;
3433 }
3434
3435 /*
3436 * Scan a buffer for a prior-expansion flag for a given macro. We'll
3437 * look through the buffer for a TOK_MACRO_EXP_END byte that mentions
3438 * the given symbol table entry; we'll return true if found, false if
3439 * not. True means that the symbol has already been expanded on a prior
3440 * scan of the text, so it should not be re-expanded now.
3441 */
scan_for_prior_expansion(utf8_ptr src,const char * src_end,const CTcHashEntryPp * entry)3442 int CTcTokenizer::scan_for_prior_expansion(utf8_ptr src, const char *src_end,
3443 const CTcHashEntryPp *entry)
3444 {
3445 /* scan the buffer for the expansion flag byte */
3446 while (src.getptr() < src_end)
3447 {
3448 /* if this is the flag, check what follows */
3449 if (src.getch() == TOK_MACRO_EXP_END)
3450 {
3451 CTcHashEntryPp *flag_entry;
3452
3453 /* read the entry from the buffer */
3454 memcpy(&flag_entry, src.getptr() + 1, sizeof(flag_entry));
3455
3456 /* if it matches, indicate that we found it */
3457 if (entry == flag_entry)
3458 return TRUE;
3459
3460 /* it's not a match - keep scanning after this flag sequence */
3461 src.set(src.getptr() + 1 + sizeof(flag_entry));
3462 }
3463 else
3464 {
3465 /* it's not the flag - skip this character */
3466 src.inc();
3467 }
3468 }
3469
3470 /* we didn't find it */
3471 return FALSE;
3472 }
3473
3474 /*
3475 * Go through a macro expansion and translate from end-of-expansion
3476 * markers to individual token full-expansion markers. This is used
3477 * after we leave a recursion level to convert expanded text into text
3478 * suitable for use in further expansion at an enclosing recursion
3479 * level.
3480 */
void CTcTokenizer::mark_full_exp_tokens(CTcTokString *dstbuf,
                                        const CTcTokString *srcbuf,
                                        int append) const
{
    utf8_ptr p;
    CTcToken tok;
    const char *start;
    int in_embedding;

    /* clear the output buffer if we're not appending to existing text */
    if (!append)
        dstbuf->clear_text();

    /* remember the starting point */
    start = srcbuf->get_text();

    /* not in an embedded expression within the expansion text yet */
    in_embedding = FALSE;

    /* scan the source buffer */
    p.set((char *)start);
    for (;;)
    {
        CTcHashEntryPp *cur_entry;
        tc_toktyp_t typ;
        char ch;

        /* get the next token; stop at the end of the line */
        typ = next_on_line(srcbuf, &p, &tok, &in_embedding);
        if (typ == TOKT_EOF)
            break;

        /*
         * if this macro token is being expanded, and it's not already
         * marked for no more expansion, mark it
         */
        if (typ == TOKT_SYM
            && !tok.get_fully_expanded()
            && (cur_entry = find_define(tok.get_text(),
                                        tok.get_text_len())) != 0
            && scan_for_prior_expansion(p, srcbuf->get_text_end(), cur_entry))
        {
            /*
             * This token has been fully expanded in the substitution
             * buffer but hasn't yet been marked as such - we must
             * insert the fully-expanded marker. First, add up to the
             * current point to the output buffer.
             */
            if (tok.get_text() > start)
                dstbuf->append(start, tok.get_text() - start);

            /* add the fully-expanded marker */
            ch = TOK_FULLY_EXPANDED_FLAG;
            dstbuf->append(&ch, 1);

            /* the new starting point is the start of the symbol token */
            start = tok.get_text();
        }
    }

    /* copy any remaining text to the output */
    if (tok.get_text() > start)
        dstbuf->append(start, tok.get_text() - start);

    /*
     * Remove any macro expansion end markers from the output buffer.
     * We don't want to leave these around, because they don't apply to
     * the enclosing buffer into which we'll substitute this result.
     * Note that we've already ensured that these markers will be
     * respected for the substitution text by inserting "fully expanded"
     * markers in front of each token to which any of the markers we're
     * removing should apply.
     */
    remove_end_markers(dstbuf);
}
3556
3557
3558 /*
3559 * Remove end markers from a buffer
3560 */
remove_end_markers(CTcTokString * buf)3561 void CTcTokenizer::remove_end_markers(CTcTokString *buf)
3562 {
3563 char *src;
3564 char *dst;
3565 utf8_ptr p;
3566
3567 /* scan the buffer */
3568 for (src = dst = buf->get_buf(), p.set(src) ;
3569 p.getptr() < buf->get_text_end() ; )
3570 {
3571 /* check for our flag */
3572 if (p.getch() == TOK_MACRO_EXP_END)
3573 {
3574 /* skip the flag byte and the following embedded pointer */
3575 src += 1 + sizeof(CTcHashEntryPp *);
3576 p.set(src);
3577 }
3578 else
3579 {
3580 /* skip this character */
3581 p.inc();
3582
3583 /* copy the bytes of this character as-is */
3584 while (src < p.getptr())
3585 *dst++ = *src++;
3586 }
3587 }
3588
3589 /* set the new buffer size */
3590 buf->set_text_len(dst - buf->get_buf());
3591 }
3592
3593
3594 /*
3595 * Expand the macro at the current token in the current line.
3596 *
3597 * 'src' is a pointer to the current position in 'srcbuf'. We'll update
3598 * 'src' to point to the next token after macro or its actual parameters
3599 * list, if it has one.
3600 */
expand_macro(CTcMacroRsc * rsc,CTcTokString * expbuf,const CTcTokString * srcbuf,utf8_ptr * src,size_t macro_srcbuf_ofs,CTcHashEntryPp * entry,int read_more,int allow_defined,int * expanded)3601 int CTcTokenizer::expand_macro(CTcMacroRsc *rsc, CTcTokString *expbuf,
3602 const CTcTokString *srcbuf, utf8_ptr *src,
3603 size_t macro_srcbuf_ofs,
3604 CTcHashEntryPp *entry, int read_more,
3605 int allow_defined, int *expanded)
3606 {
3607 CTcTokString *subexp;
3608 size_t argofs[TOK_MAX_MACRO_ARGS];
3609 size_t arglen[TOK_MAX_MACRO_ARGS];
3610 size_t startofs;
3611 const char *start;
3612 const char *end;
3613 int err;
3614 char flagbuf[1 + sizeof(entry)];
3615
3616 /* presume we won't do any expansion */
3617 *expanded = FALSE;
3618
3619 /* get our resources */
3620 subexp = &rsc->macro_exp_;
3621
3622 /* remember our parsing starting offset */
3623 startofs = src->getptr() - srcbuf->get_text();
3624
3625 /* clear the expansion output buffer */
3626 expbuf->clear_text();
3627
3628 /* if the macro has arguments, scan the actuals */
3629 if (entry->has_args())
3630 {
3631 int found_actuals;
3632
3633 /* read the macro arguments */
3634 if (parse_macro_actuals(srcbuf, src, entry, argofs, arglen,
3635 read_more, &found_actuals))
3636 {
3637 err = 1;
3638 goto done;
3639 }
3640
3641 /*
3642 * If we found no actuals, then this wasn't really an invocation
3643 * of the macro after all - a function-like macro invoked with
3644 * no arguments is simply not replaced. Store the original text
3645 * in the output buffer and return success.
3646 */
3647 if (!found_actuals)
3648 {
3649 /* copy the original text */
3650 expbuf->copy(srcbuf->get_text() + macro_srcbuf_ofs,
3651 startofs - macro_srcbuf_ofs);
3652
3653 /*
3654 * restore the source read pointer to where it was when we
3655 * started
3656 */
3657 src->set((char *)srcbuf->get_text() + startofs);
3658
3659 /* return success */
3660 err = 0;
3661 goto done;
3662 }
3663 }
3664
3665 /*
3666 * if there are arguments, replace the macro and substitute actuals
3667 * for the formals; otherwise, just copy the replacement text
3668 * directly
3669 */
3670 if (entry->get_argc() != 0)
3671 {
3672 /* substitute the actuals */
3673 if (substitute_macro_actuals(rsc, subexp, entry, srcbuf,
3674 argofs, arglen, allow_defined))
3675 {
3676 err = 1;
3677 goto done;
3678 }
3679
3680 /* set up to parse from the expansion buffer */
3681 start = subexp->get_text();
3682 end = start + subexp->get_text_len();
3683 }
3684 else
3685 {
3686 /*
3687 * use our local source buffer that simply references the
3688 * original expansion text, rather than making a copy of the
3689 * expansion text
3690 */
3691 start = entry->get_expansion();
3692 end = start + entry->get_expan_len();
3693 }
3694
3695 /* copy the expansion into the output buffer */
3696 expbuf->copy(start, end - start);
3697
3698 /*
3699 * After the end of the expansion sequence, insert the
3700 * fully-expanded flag plus a pointer to the symbol table entry that
3701 * we just expanded. This will allow us to detect during the
3702 * re-scan of the expansion text that this symbol has already been
3703 * expanded, in which case we must suppress further expansion of the
3704 * symbol. This allows us to follow the ANSI C rules for recursive
3705 * macro usage.
3706 */
3707 flagbuf[0] = TOK_MACRO_EXP_END;
3708 memcpy(&flagbuf[1], &entry, sizeof(entry));
3709 expbuf->append(flagbuf, sizeof(flagbuf));
3710
3711 /* indicate that we expanded the macro */
3712 *expanded = TRUE;
3713
3714 /* success */
3715 err = 0;
3716
3717 done:
3718 /* return the result */
3719 return err;
3720 }
3721
3722 /*
3723 * Parse a macro's actual parameter list, filling in the given hash
3724 * table with the arguments. Returns zero on success, non-zero on
3725 * error. 'entry' is the macro's defining symbol table entry.
3726 */
int CTcTokenizer::parse_macro_actuals(const CTcTokString *srcbuf,
                                      utf8_ptr *src,
                                      const CTcHashEntryPp *entry,
                                      size_t argofs[TOK_MAX_MACRO_ARGS],
                                      size_t arglen[TOK_MAX_MACRO_ARGS],
                                      int read_more, int *found_actuals)
{
    tc_toktyp_t typ;
    CTcToken tok;
    int argc;
    int spliced;
    int i;

    /* presume we're not going to do any line splicing */
    spliced = FALSE;

    /* no arguments parsed yet */
    argc = 0;

    /* get the next token after the macro symbol */
    typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);

    /* 
     *   splice another line if the macro name ended the line and the
     *   caller allows us to read more source 
     */
    if (typ == TOKT_EOF && read_more)
    {
        /* splice a line */
        typ = actual_splice_next_line(srcbuf, src, &tok);

        /* note the splice */
        spliced = TRUE;
    }

    /* if we didn't find an open paren, there's no actual list after all */
    if (typ != TOKT_LPAR)
    {
        /* tell the caller we didn't find any actuals */
        *found_actuals = FALSE;

        /* if we spliced a line, unsplice it at the current token */
        if (spliced)
            unsplice_line(tok.get_text());

        /* return success */
        return 0;
    }

    /* 
     *   remember the offset of the start of the first argument - all
     *   argofs[] values are byte offsets from the start of srcbuf's text 
     */
    argofs[argc] = tok.get_text() + tok.get_text_len() - srcbuf->get_text();

    /* skip the open paren */
    typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);

    /* read the arguments */
    while (typ != TOKT_RPAR)
    {
        utf8_ptr p;
        int paren_depth, bracket_depth, brace_depth;
        int sp_cnt;

        /* 
         *   if we have too many arguments, it's an error - a varargs
         *   macro accepts extras, but never beyond the fixed array limit 
         */
        if ((argc >= entry->get_argc() && !entry->has_varargs())
            || argc >= TOK_MAX_MACRO_ARGS)
        {
            /* log the error */
            log_error(TCERR_PP_MANY_MACRO_ARGS,
                      (int)entry->getlen(), entry->getstr());

            /* scan ahead to the close paren or end of line */
            while (typ != TOKT_RPAR && typ != TOKT_EOF)
                typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);

            /* done scanning arguments */
            break;
        }

        /*
         *   skip tokens until we find a comma outside of nested parens,
         *   square brackets, or curly braces
         */
        paren_depth = bracket_depth = brace_depth = 0;
        while (paren_depth != 0
               || bracket_depth != 0
               || brace_depth != 0
               || (typ != TOKT_COMMA && typ != TOKT_RPAR))
        {
            /*
             *   if it's an open or close paren, brace, or bracket, adjust
             *   the depth accordingly
             */
            switch(typ)
            {
            case TOKT_LPAR:
                ++paren_depth;
                break;

            case TOKT_RPAR:
                --paren_depth;
                break;

            case TOKT_LBRACE:
                ++brace_depth;
                break;

            case TOKT_RBRACE:
                --brace_depth;
                break;

            case TOKT_LBRACK:
                ++bracket_depth;
                break;

            case TOKT_RBRACK:
                --bracket_depth;
                break;
            }

            /* get the next token */
            typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);

            /*
             *   if we're at the end of the line, and we're allowed to
             *   read more, splice the next line onto the current line
             */
            if (typ == TOKT_EOF && read_more)
            {
                /* splice a line */
                typ = actual_splice_next_line(srcbuf, src, &tok);

                /* note that we've done some line splicing */
                spliced = TRUE;
            }

            /* if we've reached the end of the file, stop */
            if (typ == TOKT_EOF)
                break;
        }

        /* if we've reached the end of the file, stop */
        if (typ == TOKT_EOF)
            break;

        /* 
         *   Remove any trailing whitespace from the actual's text by
         *   backing up from the delimiter token (the comma or paren).
         */
        sp_cnt = 0;
        p.set((char *)tok.get_text());
        while (p.getptr() > srcbuf->get_text() + argofs[argc])
        {
            wchar_t ch;

            /* move to the prior character */
            p.dec();

            /* if it's not a space, stop looking */
            ch = p.getch();
            if (!is_space(ch))
            {
                /*
                 *   advance past this character so that we keep it in the
                 *   expansion
                 */
                p.inc();

                /*
                 *   if this last character was a backslash, and we removed
                 *   at least one space following it, keep the one space
                 *   that immediately follows the backslash, since that
                 *   space is part of the backslash's two-character escape
                 *   sequence
                 */
                if (ch == '\\' && sp_cnt != 0)
                    p.inc();

                /* stop scanning */
                break;
            }

            /* that's one more trailing space we've removed - count it */
            ++sp_cnt;
        }

        /* note the argument length (offset arithmetic, as for argofs[]) */
        arglen[argc] = (p.getptr() - srcbuf->get_text()) - argofs[argc];

        /* count the argument */
        ++argc;

        /* check for another argument */
        if (typ == TOKT_COMMA)
        {
            /* remember the offset of the start of this argument */
            argofs[argc] = tok.get_text() + tok.get_text_len()
                           - srcbuf->get_text();

            /* skip the comma and go back for another argument */
            typ = next_on_line(srcbuf, src, &tok, &macro_in_embedding_);
        }
        else if (typ == TOKT_RPAR)
        {
            /*
             *   No need to look any further.  Note that we don't want to
             *   get another token, since we're done parsing the input
             *   now, and we want to leave the token stream positioned for
             *   the caller just after the extent of the macro, which, in
             *   the case of this function-like macro, ends with the
             *   closing paren.
             */
            break;
        }
    }

    /* if we didn't find the right paren, flag the error */
    if (typ != TOKT_RPAR)
    {
        log_error(read_more
                  ? TCERR_PP_MACRO_ARG_RPAR : TCERR_PP_MACRO_ARG_RPAR_1LINE,
                  (int)entry->getlen(), entry->getstr());
        return 1;
    }

    /* remove leading and trailing whitespace from each argument */
    for (i = 0 ; i < argc ; ++i)
    {
        const char *start;
        const char *end;
        utf8_ptr p;
        size_t del_len;
        int sp_cnt;

        /* figure the limits of the argument text */
        start = srcbuf->get_text() + argofs[i];
        end = start + arglen[i];

        /* remove leading whitespace */
        for (p.set((char *)start) ; p.getptr() < end && is_space(p.getch()) ;
             p.inc()) ;

        /* set the new offset and length */
        del_len = p.getptr() - start;
        argofs[i] += del_len;
        arglen[i] -= del_len;
        start += del_len;

        /* remove trailing whitespace */
        p.set((char *)end);
        sp_cnt = 0;
        while (p.getptr() > start)
        {
            wchar_t ch;

            /* go to the prior character */
            p.dec();

            /* if it's not whitespace, keep it */
            ch = p.getch();
            if (!is_space(ch))
            {
                /* put the character back */
                p.inc();

                /*
                 *   if this is a backslash, and a space follows, keep the
                 *   immediately following space, since it's part of the
                 *   backslash sequence
                 */
                if (ch == '\\' && sp_cnt != 0)
                    p.inc();

                /* we're done scanning */
                break;
            }

            /* count another removed trailing space */
            ++sp_cnt;
        }

        /* adjust the length */
        arglen[i] -= (end - p.getptr());
    }

    /*
     *   if we did any line splicing, cut off the rest of the line and
     *   push it back into the logical input stream as a new line - this
     *   will allow better error message positioning if errors occur in
     *   the remainder of the line, since this means we'll only
     *   artificially join onto one line the part of the new line that
     *   contained the macro parameters
     */
    if (spliced)
        unsplice_line(tok.get_text() + tok.get_text_len());

    /* make sure we found enough arguments */
    if (argc < entry->get_min_argc())
    {
        /* fill in the remaining arguments with empty strings */
        for ( ; argc < entry->get_argc() ; ++argc)
        {
            argofs[argc] = 0;
            arglen[argc] = 0;
        }

        /* note the error, but proceed with empty arguments */
        log_warning(TCERR_PP_FEW_MACRO_ARGS,
                    (int)entry->getlen(), entry->getstr());
    }

    /*
     *   If we have varargs, always supply an empty marker for the last
     *   argument - a zero argofs[] entry terminates the variable list
     *   for the expansion code.
     */
    if (entry->has_varargs() && argc < TOK_MAX_MACRO_ARGS)
    {
        argofs[argc] = 0;
        arglen[argc] = 0;
    }

    /* success - we found an actual parameter list */
    *found_actuals = TRUE;
    return 0;
}
4045
4046 /*
4047 * Splice a line for macro actual parameters. Sets the source pointer
4048 * to the start of the new line. Reads the first token on the spliced
4049 * line and returns it.
4050 *
4051 * We will splice new lines until we find a non-empty line or reach the
4052 * end of the input. If this returns EOF, it indicates that we've
4053 * reached the end of the entire input.
4054 */
4055 tc_toktyp_t CTcTokenizer::
actual_splice_next_line(const CTcTokString * srcbuf,utf8_ptr * src,CTcToken * tok)4056 actual_splice_next_line(const CTcTokString *srcbuf,
4057 utf8_ptr *src, CTcToken *tok)
4058 {
4059 /* add a space onto the end of the current line */
4060 linebuf_.append(" ", 1);
4061
4062 /* keep going until we find a non-empty line */
4063 for (;;)
4064 {
4065 char *new_line_p;
4066 tc_toktyp_t typ;
4067
4068 /* splice the next line onto the current line */
4069 new_line_p = read_line(TRUE);
4070
4071 /*
4072 * make sure we read additional lines as needed to complete any
4073 * strings left open at the end of the line
4074 */
4075 if (in_quote_ != '\0')
4076 splice_string();
4077
4078 /* if there was no more, return end of file */
4079 if (new_line_p == 0)
4080 return TOKT_EOF;
4081
4082 /* set the source to the start of the additional line */
4083 src->set((char *)new_line_p);
4084
4085 /* get the next token */
4086 typ = next_on_line(srcbuf, src, tok, ¯o_in_embedding_);
4087
4088 /* if we didn't get EOF, it means we found a non-empty line */
4089 if (typ != TOKT_EOF)
4090 return typ;
4091 }
4092 }
4093
4094 /*
4095 * Substitute the actual parameters in a macro's expansion
4096 */
substitute_macro_actuals(CTcMacroRsc * rsc,CTcTokString * subexp,CTcHashEntryPp * entry,const CTcTokString * srcbuf,const size_t * argofs,const size_t * arglen,int allow_defined)4097 int CTcTokenizer::substitute_macro_actuals(CTcMacroRsc *rsc,
4098 CTcTokString *subexp,
4099 CTcHashEntryPp *entry,
4100 const CTcTokString *srcbuf,
4101 const size_t *argofs,
4102 const size_t *arglen,
4103 int allow_defined)
4104 {
4105 const char *start;
4106 utf8_ptr expsrc;
4107 CTcToken prvtok;
4108 CTcToken prvprvtok;
4109 CTcToken tok;
4110 tc_toktyp_t typ;
4111 const CVmHashTable *actuals;
4112 CTcTokString *actual_exp_buf;
4113 const size_t expand_max = 10;
4114 static struct expand_info_t
4115 {
4116 /* type of expansion (#foreach, #ifempty, #ifnempty) */
4117 tc_toktyp_t typ;
4118
4119 /*
4120 * flag: this is an iterator type (if this is true, the varargs
4121 * formal should be expanded to the current argument given by our
4122 * 'arg' member; if this is false, the varargs formal should be
4123 * expanded as the full varargs list)
4124 */
4125 int is_iterator;
4126
4127 /* the marker character that delimits the foreach arguments */
4128 wchar_t delim;
4129
4130 /* location of start of expansion region for foreach */
4131 utf8_ptr start;
4132
4133 /* current argument index */
4134 int arg;
4135
4136 /* the current expansion part (0 = first part, etc) */
4137 int part;
4138 }
4139 expand_stack[expand_max], *expand_sp;
4140
4141 /* get the actuals table */
4142 actuals = entry->get_params_table();
4143
4144 /* get the actual expansion buffer from the resource object */
4145 actual_exp_buf = &rsc->actual_exp_buf_;
4146
4147 /*
4148 * Scan the replacement text for formals, and replace each formal
4149 * with the actual. Set up a pointer at the start of the expansion
4150 * text.
4151 */
4152 start = entry->get_expansion();
4153 expsrc.set((char *)start);
4154
4155 /* we don't yet have a previous token */
4156 prvtok.settyp(TOKT_EOF);
4157 prvprvtok.settyp(TOKT_EOF);
4158
4159 /* clear the expansion buffer */
4160 subexp->clear_text();
4161
4162 /* we have no #foreach/#ifempty/#ifnempty stack yet */
4163 expand_sp = expand_stack;
4164
4165 /* scan the tokens in the expansion text */
4166 for (typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_) ;
4167 typ != TOKT_EOF ; )
4168 {
4169 /*
4170 * check to see if we've reached the end of a
4171 * #foreach/#ifempty/#ifnempty
4172 */
4173 if (expand_sp != expand_stack)
4174 {
4175 /* check to see if we're at the delimiter */
4176 if (utf8_ptr::s_getch(tok.get_text()) == (expand_sp-1)->delim)
4177 {
4178 /* copy the prior expansion so far */
4179 if (tok.get_text() > start)
4180 subexp->append(start, tok.get_text() - start);
4181
4182 /* go back to the start of the token */
4183 expsrc.set((char *)tok.get_text());
4184
4185 /* see what kind of token we're expanding */
4186 switch((expand_sp-1)->typ)
4187 {
4188 case TOKT_MACRO_FOREACH:
4189 /* it's a #foreach - process the appropriate part */
4190 switch ((expand_sp-1)->part)
4191 {
4192 case 0:
4193 /*
4194 * We've been doing the first part, which is the
4195 * main expansion per actual. This delimiter thus
4196 * introduces the 'between' portion, which we copy
4197 * between each iteration, but not after the last
4198 * iteration. So, if we've just done the last
4199 * actual, skip this part entirely; otherwise,
4200 * keep going, using this part.
4201 */
4202 if (argofs[(expand_sp-1)->arg + 1] == 0)
4203 {
4204 /* skip this one remaining part */
4205 skip_delimited_group(&expsrc, 1);
4206
4207 /* we're finished with the iteration */
4208 goto end_foreach;
4209 }
4210 else
4211 {
4212 /*
4213 * we have more arguments, so we want to
4214 * expand this part - skip the deliter and
4215 * keep going
4216 */
4217 expsrc.inc();
4218
4219 /* we're now in the next part of the iterator */
4220 (expand_sp-1)->part++;
4221 }
4222 break;
4223
4224 case 1:
4225 /*
4226 * We've reached the end of the entire #foreach
4227 * string, so we're done with this iteration.
4228 * Skip the delimiter.
4229 */
4230 expsrc.inc();
4231
4232 end_foreach:
4233 /*
4234 * if we have more arguments, start over with the
4235 * next iteration; otherwise, pop the #foreach
4236 * level
4237 */
4238 if (argofs[(expand_sp-1)->arg + 1] == 0)
4239 {
4240 /* no more arguments - pop the #foreach level */
4241 --expand_sp;
4242 }
4243 else
4244 {
4245 /* we have more arguments - move to the next */
4246 (expand_sp-1)->arg++;
4247
4248 /* go back to the start of the expansion */
4249 expsrc = (expand_sp-1)->start;
4250
4251 /* we have no previous token for pasting ops */
4252 prvtok.settyp(TOKT_EOF);
4253 prvprvtok.settyp(TOKT_EOF);
4254
4255 /* we're back in the first part of the iterator */
4256 (expand_sp-1)->part = 0;
4257 }
4258 break;
4259 }
4260 break;
4261
4262 case TOKT_MACRO_IFEMPTY:
4263 case TOKT_MACRO_IFNEMPTY:
4264 /*
4265 * #ifempty or #ifnempty - we've reached the end of
4266 * the conditional text, so simply pop a level and
4267 * keep going after the delimiter
4268 */
4269
4270 /* skip the delimiter */
4271 expsrc.inc();
4272
4273 /* pop a level */
4274 --expand_sp;
4275
4276 /* done */
4277 break;
4278 }
4279
4280 /* the next chunk starts here */
4281 start = expsrc.getptr();
4282
4283 /* get the next token */
4284 typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_);
4285
4286 /* we have the next token, so back and process it */
4287 continue;
4288 }
4289 }
4290
4291 /* if it's a #foreach marker, start a #foreach iteration */
4292 if (typ == TOKT_MACRO_FOREACH && entry->has_varargs())
4293 {
4294 /* copy the prior expansion so far */
4295 if (tok.get_text() > start)
4296 subexp->append(start, tok.get_text() - start);
4297
4298 /* push a #foreach level, if possible */
4299 if (expand_sp - expand_stack >= expand_max)
4300 {
4301 /*
4302 * we can't create another level - log an error and ignore
4303 * this new level
4304 */
4305 log_error(TCERR_PP_FOREACH_TOO_DEEP);
4306 }
4307 else if (argofs[entry->get_argc() - 1] == 0)
4308 {
4309 /*
4310 * we have no actuals for the variable part of the
4311 * formals, so we must iterate zero times through the
4312 * #foreach part - in other words, simply skip ahead to
4313 * the end of the #foreach
4314 */
4315 skip_delimited_group(&expsrc, 2);
4316 }
4317 else
4318 {
4319 /* remember and skip the marker character */
4320 expand_sp->delim = expsrc.getch();
4321 expsrc.inc();
4322
4323 /* set the expansion type */
4324 expand_sp->typ = typ;
4325
4326 /*
4327 * remember the position where the #foreach started, since
4328 * we need to come back here for each use of the variable
4329 */
4330 expand_sp->start = expsrc;
4331
4332 /* we're an iterator type */
4333 expand_sp->is_iterator = TRUE;
4334
4335 /*
4336 * Start at the first argument in the variable part of the
4337 * argument list. The last formal corresponds to the
4338 * first variable argument.
4339 */
4340 expand_sp->arg = entry->get_argc() - 1;
4341
4342 /* we're in the main expansion part of the expression */
4343 expand_sp->part = 0;
4344
4345 /* push the new level */
4346 ++expand_sp;
4347 }
4348
4349 /* the next chunk starts here */
4350 start = expsrc.getptr();
4351
4352 /* get the next token */
4353 typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_);
4354
4355 /* we have the next token, so back and process it */
4356 continue;
4357 }
4358
4359 /* if it's a varargs #ifempty or #ifnempty flag, expand it */
4360 if ((typ == TOKT_MACRO_IFEMPTY || typ == TOKT_MACRO_IFNEMPTY)
4361 && entry->has_varargs())
4362 {
4363 int is_empty;
4364 int expand;
4365
4366 /* determine if the varargs list is empty or not */
4367 is_empty = (argofs[entry->get_argc() - 1] == 0);
4368
4369 /*
4370 * decide whether or not expand it, according to the empty
4371 * state and the flag type
4372 */
4373 expand = ((is_empty && typ == TOKT_MACRO_IFEMPTY)
4374 || (!is_empty && typ == TOKT_MACRO_IFNEMPTY));
4375
4376 /*
4377 * if we're going to expand it, push a level; otherwise, just
4378 * skip the entire expansion
4379 */
4380 if (expand)
4381 {
4382 /* make sure we have room for another level */
4383 if (expand_sp - expand_stack >= expand_max)
4384 {
4385 /* no room - log an error and ignore the new level */
4386 log_error(TCERR_PP_FOREACH_TOO_DEEP);
4387 }
4388 else
4389 {
4390 /* remember and skip the delimiter */
4391 expand_sp->delim = expsrc.getch();
4392 expsrc.inc();
4393
4394 /*
4395 * we're not an iterator type, so inherit the
4396 * enclosing level's meaning of the varargs formal
4397 */
4398 if (expand_sp - expand_stack == 0)
4399 {
4400 /* outermost level - use the whole varargs list */
4401 expand_sp->is_iterator = FALSE;
4402 }
4403 else
4404 {
4405 /* use the enclosing level's meaning */
4406 expand_sp->is_iterator = (expand_sp-1)->is_iterator;
4407 expand_sp->arg = (expand_sp-1)->arg;
4408 }
4409
4410 /* set the expansion type */
4411 expand_sp->typ = typ;
4412
4413 /* push the new level */
4414 ++expand_sp;
4415 }
4416 }
4417 else
4418 {
4419 /* not expanding - just skip the entire expansion */
4420 skip_delimited_group(&expsrc, 1);
4421 }
4422
4423 /* the next chunk starts here */
4424 start = expsrc.getptr();
4425
4426 /* get the next token */
4427 typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_);
4428
4429 /* we have the next token, so back and process it */
4430 continue;
4431 }
4432
4433 /* if it's a varargs #argcount indicator, expand it */
4434 if (typ == TOKT_MACRO_ARGCOUNT && entry->has_varargs())
4435 {
4436 char buf[20];
4437 int i;
4438
4439 /* copy the prior expansion so far */
4440 if (tok.get_text() > start)
4441 subexp->append(start, tok.get_text() - start);
4442
4443 /*
4444 * count the number of arguments after and including the
4445 * variable argument placeholder
4446 */
4447 for (i = entry->get_argc() - 1 ; argofs[i] != 0 ; ++i) ;
4448
4449 /* make a string out of the variable argument count */
4450 sprintf(buf, "%d", i - (entry->get_argc() - 1));
4451
4452 /* add the argument count to the output buffer */
4453 subexp->append(buf, strlen(buf));
4454
4455 /* the next chunk starts after the #argcount */
4456 start = expsrc.getptr();
4457
4458 /* get the next token */
4459 typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_);
4460
4461 /* we have the next token, so back and process it */
4462 continue;
4463 }
4464
4465 /* if it's a symbol, check for an actual */
4466 if (typ == TOKT_MACRO_FORMAL)
4467 {
4468 const char *p;
4469 int argnum;
4470 size_t argnum_len;
4471 int pasting;
4472 int pasting_at_left, pasting_at_right;
4473 int stringize;
4474 char stringize_qu;
4475 tc_toktyp_t stringize_type;
4476 CTcToken paste_at_right_tok;
4477
4478 /* assume we'll copy up to the start of this token */
4479 p = tok.get_text();
4480
4481 /*
4482 * get the index of the actual in the argument vector --
4483 * this is given by the second byte of the special macro
4484 * parameter flag token
4485 */
4486 argnum = (int)(uchar)tok.get_text()[1] - 1;
4487
4488 /*
4489 * If we have varargs, and this is the varargs argument, and
4490 * the current #foreach stack level indicates that we're
4491 * iterating through the varargs list, treat this as a
4492 * reference to the current argument in the iteration.
4493 */
4494 if (expand_sp != expand_stack
4495 && argnum == entry->get_argc() - 1
4496 && (expand_sp-1)->is_iterator)
4497 {
4498 /*
4499 * we're on a #foreach iterator, and this is the varargs
4500 * formal - use the current #foreach iteration element
4501 * instead
4502 */
4503 argnum = (expand_sp-1)->arg;
4504 }
4505
4506 /*
4507 * Get the length of this argument. If we have varargs, and
4508 * this is the last formal, which is the placeholder for the
4509 * variable argument list, and we're not in a #foreach
4510 * iterator, the value is the value of the entire string of
4511 * variable arguments, including the commas.
4512 */
4513 if (expand_sp == expand_stack
4514 && entry->has_varargs()
4515 && argnum == entry->get_argc() - 1)
4516 {
4517 int i;
4518
4519 /*
4520 * It's the full varargs list - use the length from the
4521 * first varargs argument to the last. Find the last
4522 * argument.
4523 */
4524 for (i = argnum ;
4525 i < TOK_MAX_MACRO_ARGS && argofs[i] != 0 ; ++i) ;
4526
4527 /*
4528 * The full list length is the distance from the offset of
4529 * the first to the end of the last. If there are no
4530 * varargs arguments at all, the length is zero.
4531 */
4532 if (i == argnum)
4533 argnum_len = 0;
4534 else
4535 argnum_len = argofs[i-1] + arglen[i-1] - argofs[argnum];
4536 }
4537 else
4538 {
4539 /*
4540 * it's not the full varargs list, so just use the length
4541 * of this single actual
4542 */
4543 argnum_len = arglen[argnum];
4544 }
4545
4546 /* assume we won't do any token pasting or stringizing */
4547 pasting = pasting_at_left = pasting_at_right = FALSE;
4548 stringize = FALSE;
4549
4550 /*
4551 * if the previous token was a token-pasting operator,
4552 * remove it and any preceding whitespace from the source
4553 * material, since we want to append the actual parameter
4554 * text directly after the preceding token
4555 */
4556 check_paste_left:
4557 if (prvtok.gettyp() == TOKT_POUNDPOUND)
4558 {
4559 wchar_t prv_ch;
4560
4561 /*
4562 * note that we have token pasting - we're pasting
4563 * something to the left of this token (since we had a
4564 * "##" before this token
4565 */
4566 pasting = TRUE;
4567 pasting_at_left = TRUE;
4568
4569 /* go back to the ## token */
4570 p = prvtok.get_text();
4571
4572 /* remove any preceding whitespace */
4573 for (prv_ch = 0 ; p > start ; )
4574 {
4575 const char *prvp;
4576
4577 /* get the previous character */
4578 prvp = utf8_ptr::s_dec((char *)p);
4579 prv_ch = utf8_ptr::s_getch((char *)prvp);
4580
4581 /* if it's not a space, we're done */
4582 if (!is_space(prv_ch))
4583 break;
4584
4585 /* move back over this character */
4586 p = prvp;
4587 }
4588
4589 /*
4590 * Weird special case: if the previous character was a
4591 * comma, and the formal we're pasting is a variable
4592 * argument formal (i.e., the last formal in a varargs
4593 * macro), and the varargs list is empty, then remove the
4594 * comma. This is a handy shorthand notation that allows
4595 * the varargs list to be added to a comma-delimited list,
4596 * such as a function call's actuals or the contents of a
4597 * list.
4598 */
4599 if (prv_ch == ','
4600 && entry->has_varargs()
4601 && argnum == entry->get_argc() - 1
4602 && argofs[argnum] == 0)
4603 {
4604 /*
4605 * it's the special case - move back one more
4606 * character to delete the comma
4607 */
4608 p = utf8_ptr::s_dec((char *)p);
4609 }
4610 }
4611 else if (prvtok.gettyp() == TOKT_POUND
4612 || prvtok.gettyp() == TOKT_POUNDAT)
4613 {
4614 /* go back to the # token */
4615 p = prvtok.get_text();
4616
4617 /* note that we have stringizing */
4618 stringize = TRUE;
4619 stringize_type = prvtok.gettyp();
4620 stringize_qu = (prvtok.gettyp() == TOKT_POUND
4621 ? '"' : '\'');
4622
4623 /* go back one more token */
4624 prvtok = prvprvtok;
4625 prvprvtok.settyp(TOKT_EOF);
4626
4627 /*
4628 * go back and check for pasting again, since we could
4629 * be pasting to a stringized token
4630 */
4631 goto check_paste_left;
4632 }
4633
4634 /* copy the prior expansion so far */
4635 if (p > start)
4636 subexp->append(start, p - start);
4637
4638 /* remember the symbol as the previous token */
4639 prvprvtok = prvtok;
4640 prvtok = tok;
4641
4642 /* get the next token after the formal */
4643 typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_);
4644
4645 /*
4646 * If it's followed by a token-pasting operator, we need to
4647 * paste the next token directly onto the end of the text we
4648 * just added to the buffer, skipping any intervening
4649 * whitespace; otherwise, we want to start adding again at
4650 * the next character after the original token.
4651 */
4652 if (typ == TOKT_POUNDPOUND)
4653 {
4654 utf8_ptr old_expsrc;
4655 CTcToken old_tok;
4656
4657 /* note that we have pasting to the right of this token */
4658 pasting = TRUE;
4659 pasting_at_right = TRUE;
4660
4661 /* remember where we started */
4662 old_expsrc = expsrc;
4663
4664 /* remember the current token for a moment */
4665 old_tok = tok;
4666
4667 /* skip to the next token after the ## */
4668 typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_);
4669
4670 /* remember the token we're pasting to the right */
4671 paste_at_right_tok = tok;
4672
4673 /* check for pasting to a stringizer */
4674 if (stringize && typ == stringize_type)
4675 {
4676 /*
4677 * leave the ## in the stream for now - we'll fix it
4678 * up when we stringize the next token, rather than
4679 * doing so now
4680 */
4681 expsrc = old_expsrc;
4682 tok = old_tok;
4683 }
4684 else
4685 {
4686 /*
4687 * remember that we have a token-pasting operator,
4688 * so that we can tell that we're pasting when we
4689 * look at the next token
4690 */
4691 prvprvtok = prvtok;
4692 prvtok = old_tok;
4693 }
4694
4695 /* start next text from here */
4696 start = tok.get_text();
4697 }
4698 else
4699 {
4700 /* Start at the end of the symbol token */
4701 start = prvtok.get_text() + prvtok.get_text_len();
4702 }
4703
4704 /*
4705 * If we're not doing any pasting, recursively expand macros
4706 * in the actual expansion text. If we're pasting, do not
4707 * expand any macros in the expansion, since we want to do
4708 * the pasting before we do any expanding.
4709 */
4710 if (pasting && stringize)
4711 {
4712 int add_open;
4713 int add_close;
4714
4715 /* presume we'll include the open and close quotes */
4716 add_close = TRUE;
4717 add_open = TRUE;
4718
4719 /*
4720 * If we're pasting to the left, and the buffer so far
4721 * ends in the same quote we're adding to this token,
4722 * combine the strings by removing the preceding quote
4723 * and not adding the open quote on the new string
4724 */
4725 if (subexp->get_text_len() > 0
4726 && *(subexp->get_text_end() - 1) == stringize_qu)
4727 {
4728 /* remove the close quote from the expansion so far */
4729 subexp->set_text_len(subexp->get_text_len() - 1);
4730
4731 /* don't add the open quote to the new string */
4732 add_open = FALSE;
4733 }
4734
4735 /*
4736 * If we're pasting to the right, and we have a string
4737 * of the same type following, or we will be pasting a
4738 * stringizing pair, paste the two strings together to
4739 * form one string by removing the close quote from this
4740 * string and the open quote from the next string
4741 */
4742 if (pasting_at_right && *tok.get_text() == stringize_qu)
4743 add_close = FALSE;
4744
4745 /*
4746 * We're both stringizing this argument and pasting
4747 * another token - first stringize the actual.
4748 */
4749 stringize_macro_actual(subexp,
4750 srcbuf->get_text()
4751 + argofs[argnum], argnum_len,
4752 stringize_qu, add_open, add_close);
4753
4754 /*
4755 * if we decided to remove the closing quote, we want to
4756 * remove the open quote from the following string as
4757 * well - copy in the following string without its open
4758 * quote
4759 */
4760 if (!add_close)
4761 {
4762 /*
4763 * append the following token without its first
4764 * character (its open quote)
4765 */
4766 subexp->append(tok.get_text() + 1,
4767 tok.get_text_len() - 1);
4768
4769 /* move on to the next token */
4770 prvprvtok = prvtok;
4771 prvtok = tok;
4772 typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_);
4773
4774 /* start from the new token */
4775 start = tok.get_text();
4776 }
4777 }
4778 else if (pasting)
4779 {
4780 const char *argp;
4781 size_t len;
4782 int done;
4783 wchar_t quote_char;
4784
4785 /* get the actual argument information */
4786 argp = srcbuf->get_text() + argofs[argnum];
4787 len = argnum_len;
4788
4789 /*
4790 * if we're pasting to the left of this token, and the
4791 * token starts with a fully-expanded flag, remove the
4792 * flag - we're making up a new token out of this and
4793 * what comes before, so the token that we fully
4794 * expanded is disappearing, so the fully-expanded
4795 * status no longer applies
4796 */
4797 if (pasting_at_left && *argp == TOK_FULLY_EXPANDED_FLAG)
4798 {
4799 /* skip the flag */
4800 ++argp;
4801 --len;
4802 }
4803
4804 /* presume we won't find any quoted strings */
4805 quote_char = 0;
4806
4807 /*
4808 * check for string concatenation to the left - if we're
4809 * concatenating two strings of the same type, remove
4810 * the adjacent quotes to make it a single string
4811 */
4812 if (pasting_at_left
4813 && subexp->get_text_len() > 0
4814 && (*argp == '\'' || *argp == '"')
4815 && *(subexp->get_text_end() - 1) == *argp)
4816 {
4817 /* remove the close quote from the expansion so far */
4818 subexp->set_text_len(subexp->get_text_len() - 1);
4819
4820 /* remember the quote character */
4821 quote_char = *argp;
4822
4823 /* don't add the open quote to the new string */
4824 ++argp;
4825 --len;
4826 }
4827
4828 /* presume we won't have to do anything special */
4829 done = FALSE;
4830
4831 /*
4832 * If we're pasting at the right, also remove any
4833 * fully-expanded flag just before the last token in the
4834 * expansion.
4835 */
4836 if (pasting_at_right)
4837 {
4838 CTcToken old_tok;
4839 CTcToken tok;
4840 utf8_ptr p;
4841
4842 /* scan for the final token in the expansion string */
4843 p.set((char *)argp);
4844 old_tok.settyp(TOKT_INVALID);
4845 while (p.getptr() < argp + len)
4846 {
4847 /*
4848 * get another token - stop at EOF or if we go
4849 * past the bounds of the expansion text
4850 */
4851 if (next_on_line(&p, &tok, ¯o_in_embedding_)
4852 == TOKT_EOF
4853 || tok.get_text() >= argp + len)
4854 break;
4855
4856 /* remember the previous token */
4857 old_tok = tok;
4858 }
4859
4860 /*
4861 * if the final token is a symbol, and it has the
4862 * fully-expanded flag, we must omit the flag from
4863 * the appended text
4864 */
4865 if (old_tok.gettyp() == TOKT_SYM
4866 && old_tok.get_fully_expanded())
4867 {
4868 /*
4869 * append up to but not including the flag byte
4870 * preceding the final token
4871 */
4872 subexp->append(argp, tok.get_text() - 1 - argp);
4873
4874 /*
4875 * append from the last token to the end of the
4876 * expansion, skipping the flag byte
4877 */
4878 subexp->append(tok.get_text(),
4879 len - (tok.get_text() - argp));
4880
4881 /* we've done the appending */
4882 done = TRUE;
4883 }
4884 else if (quote_char != 0
4885 && paste_at_right_tok.get_text_len() != 0
4886 && *paste_at_right_tok.get_text() == quote_char)
4887 {
4888 /*
4889 * we're pasting two strings together - append
4890 * up to but not including the close quote
4891 */
4892 subexp->append(argp, len - 1);
4893
4894 /*
4895 * append the next token, but do not include the
4896 * open quote
4897 */
4898 subexp->append(paste_at_right_tok.get_text() + 1,
4899 paste_at_right_tok.get_text_len() - 1);
4900
4901 /*
4902 * restart after the right token, since we've
4903 * now fully processed that token
4904 */
4905 start = paste_at_right_tok.get_text()
4906 + paste_at_right_tok.get_text_len();
4907
4908 /* we're done */
4909 done = TRUE;
4910 }
4911 }
4912
4913 /*
4914 * append the actual without expansion, if we haven't
4915 * already handled it specially
4916 */
4917 if (!done)
4918 subexp->append(argp, len);
4919 }
4920 else if (stringize)
4921 {
4922 /* stringize the actual */
4923 stringize_macro_actual(subexp,
4924 srcbuf->get_text()
4925 + argofs[argnum], argnum_len,
4926 stringize_qu, TRUE, TRUE);
4927 }
4928 else
4929 {
4930 CTcTokStringRef actual_src_buf;
4931
4932 /* recursively expand macros in the actual text */
4933 actual_src_buf.
4934 set_buffer(srcbuf->get_text() + argofs[argnum],
4935 argnum_len);
4936 if (expand_macros(&actual_src_buf, 0, actual_exp_buf,
4937 FALSE, allow_defined, FALSE))
4938 return 1;
4939
4940 /*
4941 * Append the expanded actual, marking any
4942 * fully-expanded tokens as such and removing
4943 * end-of-expansion markers.
4944 *
4945 * We can't leave end-of-expansion markers in the
4946 * expanded actual text, because end-of-expansion
4947 * markers apply only to the current recursion level,
4948 * and we've now exited the actual's recursion level.
4949 * However, we must not expand further anything in the
4950 * actual's expansion that has already been fully
4951 * expanded. To achieve both of these goals, we switch
4952 * here from marking the run of text (with the end
4953 * marker) to marking individual tokens.
4954 */
4955 mark_full_exp_tokens(subexp, actual_exp_buf, TRUE);
4956 }
4957
4958 /* we've already read the next token, so proceed */
4959 continue;
4960 }
4961
4962 /* remember the current token as the previous token */
4963 prvprvtok = prvtok;
4964 prvtok = tok;
4965
4966 /* get the next token of the expansion */
4967 typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_);
4968 }
4969
4970 /* copy the remaining replacement text */
4971 subexp->append(start, tok.get_text() - start);
4972
4973 /* success */
4974 return 0;
4975 }
4976
4977 /*
4978 * Skip the source of a delimited macro expansion area (#foreach,
4979 * #ifempty, #ifnempty).
4980 */
skip_delimited_group(utf8_ptr * p,int parts_to_skip)4981 void CTcTokenizer::skip_delimited_group(utf8_ptr *p, int parts_to_skip)
4982 {
4983 wchar_t delim;
4984
4985 /* get the delimiter character */
4986 delim = p->getch();
4987
4988 /*
4989 * if the delimiter put us at the end of the line, there's nothing to
4990 * skip
4991 */
4992 if (delim == 0 || delim == TOK_END_PP_LINE)
4993 return;
4994
4995 /* skip the delimiter */
4996 p->inc();
4997
4998 /* keep going until we've skipped the desired number of parts */
4999 while (parts_to_skip != 0)
5000 {
5001 wchar_t ch;
5002
5003 /* read the next character */
5004 ch = p->getch();
5005
5006 /* if it's the end of the line, give up */
5007 if (ch == 0 || ch == TOK_END_PP_LINE)
5008 {
5009 /*
5010 * we ran out of input before reaching the delimiter, so this
5011 * is implicitly the end of it
5012 */
5013 return;
5014 }
5015
5016 /* check what we have */
5017 if (ch == delim)
5018 {
5019 /* that's one less part to skip */
5020 --parts_to_skip;
5021
5022 /* skip it */
5023 p->inc();
5024 }
5025 else if (ch == TOK_MACRO_FOREACH_FLAG)
5026 {
5027 /* it's a nested #foreach - skip all of its parts */
5028 skip_delimited_group(p, 2);
5029 }
5030 else if (ch == TOK_MACRO_IFEMPTY_FLAG
5031 || ch == TOK_MACRO_IFNEMPTY_FLAG)
5032 {
5033 /* nested #ifempty or #ifnempty - skip its expansion */
5034 skip_delimited_group(p, 1);
5035 }
5036 else
5037 {
5038 /* it's nothing special to us - skip it */
5039 p->inc();
5040 }
5041 }
5042 }
5043
5044 /*
5045 * Stringize a macro actual parameter value into a macro expansion
5046 * buffer
5047 */
stringize_macro_actual(CTcTokString * expbuf,const char * actual_val,size_t actual_len,char quote_char,int add_open_quote,int add_close_quote)5048 void CTcTokenizer::stringize_macro_actual(CTcTokString *expbuf,
5049 const char *actual_val,
5050 size_t actual_len, char quote_char,
5051 int add_open_quote,
5052 int add_close_quote)
5053 {
5054 utf8_ptr src;
5055 const char *start;
5056 int in_inner_quote;
5057 wchar_t inner_quote_char;
5058 wchar_t prvch;
5059
5060 /* add the open quote if desired */
5061 if (add_open_quote)
5062 expbuf->append("e_char, 1);
5063
5064 /* remember the start of the current segment */
5065 start = actual_val;
5066
5067 /*
5068 * add the characters of the actual parameter value, quoting any
5069 * quotes or backslashes
5070 */
5071 for (src.set((char *)actual_val), in_inner_quote = FALSE, prvch = '\0' ;
5072 src.getptr() < actual_val + actual_len ; )
5073 {
5074 wchar_t cur;
5075
5076 /* get this character */
5077 cur = src.getch();
5078
5079 /* compress runs of whitespace to single spaces */
5080 if (is_space(cur) && prvch != '\\')
5081 {
5082 /* append up to this character */
5083 if (src.getptr() > start)
5084 expbuf->append(start, src.getptr() - start);
5085
5086 /* find the next non-space character */
5087 for ( ; src.getptr() < actual_val + actual_len ; src.inc())
5088 {
5089 if (!is_space(src.getch()))
5090 break;
5091 }
5092
5093 /*
5094 * if we're not at the start or end of the string, add a
5095 * single space to replace the entire run of whitespace --
5096 * don't do this at the start or end of the string, since
5097 * we must remove leading and trailing whitespace
5098 */
5099 if (prvch != '\0' && src.getptr() < actual_val + actual_len)
5100 expbuf->append(" ", 1);
5101
5102 /* note that the previous character is a space */
5103 prvch = cur;
5104
5105 /* this is the new starting point */
5106 start = src.getptr();
5107
5108 /* proceed - we're already at the next character */
5109 continue;
5110 }
5111
5112 /*
5113 * Check to see if we need to quote this character. Quote any
5114 * quote mark matching the enclosing quotes; also quote any
5115 * backslash that occurs within nested quotes within the source
5116 * material, but not backslashes that occur originally outside
5117 * quotes.
5118 */
5119 if (cur == quote_char
5120 || (cur == '\\' && in_inner_quote))
5121 {
5122 /* append the segment up to (but not including) this character */
5123 if (src.getptr() > start)
5124 expbuf->append(start, src.getptr() - start);
5125
5126 /* add an extra backslash */
5127 expbuf->append("\\", 1);
5128
5129 /* remember the start of the next segment */
5130 start = src.getptr();
5131 }
5132
5133 /*
5134 * if this is a quote character, and it's not itself escaped,
5135 * reverse our in-quote flag
5136 */
5137 if (prvch != '\\')
5138 {
5139 /*
5140 * If we're in an inner quote, and it's a match for the open
5141 * inner quote, we're no longer in a quote. Otherwise, if
5142 * we're not in quotes and this is some kind of quote, enter
5143 * the new quotes.
5144 */
5145 if (in_inner_quote && cur == inner_quote_char)
5146 {
5147 /* we're leaving the inner quoted string */
5148 in_inner_quote = FALSE;
5149 }
5150 else if (!in_inner_quote && (cur == '"' || cur == '\''))
5151 {
5152 /* we're entering a new inner quoted string */
5153 in_inner_quote = TRUE;
5154 inner_quote_char = cur;
5155 }
5156 }
5157
5158 /* remember this as the previous character */
5159 prvch = cur;
5160
5161 /* move on to the next character */
5162 src.inc();
5163 }
5164
5165 /* if there's anything in the final segment, append it */
5166 if (src.getptr() > start)
5167 expbuf->append(start, src.getptr() - start);
5168
5169 /* add the close quote if desired */
5170 if (add_close_quote)
5171 expbuf->append("e_char, 1);
5172 }
5173
5174 /*
5175 * Expand a "defined" preprocessor operator
5176 */
expand_defined(CTcTokString * subexp,const CTcTokString * srcbuf,utf8_ptr * src)5177 int CTcTokenizer::expand_defined(CTcTokString *subexp,
5178 const CTcTokString *srcbuf, utf8_ptr *src)
5179 {
5180 CTcToken tok;
5181 tc_toktyp_t typ;
5182 int paren;
5183 int found;
5184
5185 /* get the next token */
5186 typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_);
5187
5188 /* note whether we have an open paren; if we do, skip it */
5189 paren = (typ == TOKT_LPAR);
5190 if (paren)
5191 typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_);
5192
5193 /* get the symbol */
5194 if (typ != TOKT_SYM)
5195 {
5196 log_error(TCERR_PP_DEFINED_NO_SYM,
5197 (int)tok.get_text_len(), tok.get_text());
5198 return 1;
5199 }
5200
5201 /* look to see if the symbol is defined */
5202 found = (find_define(tok.get_text(), tok.get_text_len()) != 0);
5203
5204 /* expand the macro to "1" if found, "0" if not */
5205 subexp->copy(found ? "1" : "0", 1);
5206
5207 /* check for and skip the matching close paren */
5208 if (paren)
5209 {
5210 /* require the closing paren */
5211 if (next_on_line(srcbuf, src, &tok, ¯o_in_embedding_)
5212 != TOKT_RPAR)
5213 {
5214 /* generate an error if we don't find it */
5215 log_error(TCERR_PP_DEFINED_RPAR);
5216 return 1;
5217 }
5218 }
5219
5220 /* success */
5221 return 0;
5222 }
5223
5224
5225 /* ------------------------------------------------------------------------ */
5226 /*
5227 * Process comments. Replaces each character of a comment with a space.
5228 */
void CTcTokenizer::process_comments(size_t start_ofs)
{
    utf8_ptr src;
    utf8_ptr dst;
    int trailing_sp_after_bs;

    /* we haven't found a backslash followed by trailing space yet */
    trailing_sp_after_bs = FALSE;

    /*
     *   Scan the line.  When inside a comment, replace each character of
     *   the comment with a space.  When outside comments, simply copy
     *   characters intact.
     *
     *   Note that we need a separate src and dst pointer, because the
     *   character length of the original and replaced characters may
     *   change.  Fortunately, the length will never do anything but
     *   shrink or stay the same, since the only change we make is to
     *   insert spaces, which are always one byte apiece in UTF-8; we can
     *   therefore update the buffer in place.
     *
     *   NOTE(review): in-comment state lives on the stream object
     *   (str_->is_in_comment()), and in-string state in the members
     *   in_quote_ / comment_in_embedding_, so quoting and block-comment
     *   state carries over from line to line of the same stream.
     */
    for (src.set(linebuf_.get_buf() + start_ofs),
         dst.set(linebuf_.get_buf() + start_ofs) ;
         src.getch() != '\0' ; src.inc())
    {
        wchar_t cur;

        /* get the current character */
        cur = src.getch();

        /* check to see if we're in a comment */
        if (str_->is_in_comment())
        {
            /* check to see if the comment is ending */
            if (cur == '*' && src.getch_at(1) == '/')
            {
                /*
                 *   skip an extra character of the source - we'll skip
                 *   one in the main loop, so we only need to skip one
                 *   more now
                 */
                src.inc();

                /* we're no longer in a comment */
                str_->set_in_comment(FALSE);
            }

            /* continue without copying anything from inside the comment */
            continue;
        }
        else if (in_quote_ != '\0')
        {
            /* see what we have */
            if (cur == '\\')
            {
                /*
                 *   It's a backslash sequence -- copy the backslash to
                 *   the output, and skip it.  Note that we don't have to
                 *   worry about the line ending with a backslash, since
                 *   the line reader will already have considered that to
                 *   be a line splice.
                 */
                src.inc();
                dst.setch(cur);

                /* get the next character, so we copy it directly */
                cur = src.getch();
            }
            else if (cur == in_quote_)
            {
                /*
                 *   this is the closing quote character - simply note
                 *   that we're no longer in a quoted string
                 */
                in_quote_ = '\0';
            }
            else if (in_quote_ == '"' && !comment_in_embedding_
                     && cur == '<' && src.getch_at(1) == '<')
            {
                /*
                 *   it's an embedded expression starting point - skip the
                 *   first of the '<' characters (the enclosing loop will
                 *   skip the second one)
                 */
                src.inc();

                /* the string is done */
                in_quote_ = '\0';

                /* we're in an embedding now */
                comment_in_embedding_ = TRUE;

                /* copy the extra '<' to the output */
                dst.setch('<');
            }
        }
        else
        {
            /*
             *   Monitor the stream for a backslash followed by trailing
             *   spaces.  If this is a backslash, note that we might have a
             *   backslash with trailing spaces; if it's a space, we might
             *   still have this, so leave the flag alone; if it's anything
             *   else, clear the flag, since we've found something other
             *   than backslashes and spaces.
             */
            if (cur == '\\')
                trailing_sp_after_bs = TRUE;
            else if (!is_space(cur))
                trailing_sp_after_bs = FALSE;

            /* check to see if we're starting a comment */
            if (cur == '/')
            {
                switch(src.getch_at(1))
                {
                case '*':
                    /* note that we're starting a comment */
                    str_->set_in_comment(TRUE);

                    /*
                     *   replace the starting slash with a space - this
                     *   will effectively replace the entire comment with
                     *   a single space, since we won't copy anything else
                     *   from inside the comment
                     */
                    cur = ' ';
                    break;

                case '/':
                    /*
                     *   comment to end of line - we can terminate the
                     *   line at the opening slash and return immediately,
                     *   because the entire rest of the line is to be
                     *   ignored
                     */
                    dst.setch('\0');
                    return;

                default:
                    /* not a comment - copy it as-is */
                    break;
                }
            }
            else if (cur == '"' || cur == '\'')
            {
                /* it's the start of a new string */
                in_quote_ = cur;
            }
            else if (cur < 0x09)
            {
                /*
                 *   it's a special flag character - we need to guarantee
                 *   that this character never occurs in input (it
                 *   shouldn't anyway, since it's a control character), so
                 *   translate it to a space
                 */
                cur = ' ';
            }
            else if (comment_in_embedding_
                     && cur == '>' && src.getch_at(1) == '>')
            {
                /*
                 *   it's the end of an embedded expression - we're back
                 *   in a double-quoted string (only double-quoted strings
                 *   can have embedded expressions)
                 */
                in_quote_ = '"';
                comment_in_embedding_ = FALSE;

                /* skip the extra '>' and copy it to the output */
                src.inc();
                dst.setch('>');
            }
        }

        /* set the current character in the output */
        dst.setch(cur);
    }

    /* set the updated line buffer length (the buffer can only shrink) */
    linebuf_.set_text_len(dst.getptr() - linebuf_.get_buf());

    /*
     *   if we found a backslash with nothing following but whitespace, flag
     *   a warning, since they might have meant the backslash as a line
     *   continuation signal, but we're not interpreting it that way because
     *   of the trailing whitespace
     */
    if (trailing_sp_after_bs)
        log_warning(TCERR_TRAILING_SP_AFTER_BS);
}
5421
5422 /*
5423 * Splice strings. Splice additional lines onto the current line until
5424 * we find the end of the string.
5425 */
void CTcTokenizer::splice_string()
{
    utf8_ptr p;
    int in_quote;
    int in_embedding;
    char unterm;

    /* presume we'll find proper termination */
    unterm = '\0';

    /*
     *   remember the current in-quote and in-embedding status, as of the
     *   end of the current line - when we splice, the line reader will
     *   update these to the status at the end of the newly-read material,
     *   but we want to scan from the beginning of the newly-read material
     */
    in_quote = in_quote_;
    in_embedding = comment_in_embedding_;

    /* keep going until we find the end of the string */
    for (;;)
    {
        char *new_line_p;
        wchar_t cur;

        /*
         *   append a space at the end of the line, to replace the newline
         *   that we've eliminated (suppressed via
         *   #pragma newline_spacing(off))
         */
        if (string_newline_spacing_)
            linebuf_.append(" ", 1);

        /* splice another line */
        new_line_p = read_line(TRUE);

        /*
         *   if we reached end of file, there's no more splicing we can do
         *
         *   NOTE(review): if this happens on the very first iteration, p
         *   has never been assigned before the unsplice below - presumably
         *   utf8_ptr default-constructs to a safe value; confirm.
         */
        if (new_line_p == 0)
            break;

        /* skip leading spaces in the new line */
        for (p.set(new_line_p) ; is_space(p.getch()) ; p.inc()) ;

        /* if we skipped any spaces, remove them from the text */
        if (p.getptr() > new_line_p)
        {
            size_t rem;
            size_t new_len;

            /* calculate the length of the rest of the line */
            rem = linebuf_.get_text_len()
                  - (p.getptr() - linebuf_.get_buf());

            /* calculate the new length of the line */
            new_len = (new_line_p - linebuf_.get_buf()) + rem;

            /* move the rest of the line down over the spaces */
            memmove(new_line_p, p.getptr(), rem);

            /* set the new length */
            linebuf_.set_text_len(new_len);
        }

        /*
         *   If the new line contains only "}" or ";", presume that the
         *   string is unterminated and terminate it here.  (This
         *   heuristic could flag well-formed strings as erroneous, but
         *   users can always work around this by moving these characters
         *   onto lines that contain at least one other non-whitespace
         *   character.)
         */
        p.set(new_line_p);
        if (p.getch() == '}' || p.getch() == ';')
        {
            /* skip trailing whitespace */
            for (p.inc() ; is_space(p.getch()) ; p.inc()) ;

            /*
             *   if there's nothing else on the line, presume it's an
             *   unterminated string
             */
            if (p.getch() == '\0')
            {
                /* log the error */
                log_error(TCERR_POSSIBLE_UNTERM_STR,
                          appended_linenum_);

                /* remember that it's unterminated */
                unterm = (char)in_quote;

                /*
                 *   since we're adding a presumed close quote that never
                 *   appears in the text, we need to figure the new
                 *   in-string status for the line; clear the in-quote
                 *   flag, and re-scan comments from the current point on
                 *   the line
                 */
                in_quote_ = '\0';
                process_comments(new_line_p - linebuf_.get_buf());

                /* we're done - unsplice from the start of the new line */
                p.set(new_line_p);
                goto done;
            }
        }

        /* scan for the end of the string */
        for (p.set(new_line_p) ;; p.inc())
        {
            /* get this character */
            cur = p.getch();

            /* see what we have */
            if (cur == '\\')
            {
                /* it's a backslash sequence - skip the extra character */
                p.inc();
            }
            else if (cur == in_quote)
            {
                /* it's our quote character - skip it, and we're done */
                p.inc();
                goto done;
            }
            else if (in_quote == '"' && !in_embedding
                     && cur == '<' && p.getch_at(1) == '<')
            {
                /*
                 *   it's an embedded expression starter - skip the '<<'
                 *   sequence and stop scanning
                 */
                p.inc();
                p.inc();
                goto done;
            }
            else if (cur == '\0')
            {
                /* end of line - go back and splice another line */
                break;
            }
        }
    }

done:
    /* unsplice the line at the current point */
    unsplice_line(p.getptr());

    /* if we found an unterminated string, supply implicit termination */
    if (unterm != '\0')
        linebuf_.append(&unterm, 1);
}
5576
5577
5578 /* ------------------------------------------------------------------------ */
5579 /*
5580 * Process a #pragma directive
5581 */
pp_pragma()5582 void CTcTokenizer::pp_pragma()
5583 {
5584 struct pp_kw_def
5585 {
5586 const char *kw;
5587 void (CTcTokenizer::*func)();
5588 };
5589 static pp_kw_def kwlist[] =
5590 {
5591 // { "c", &CTcTokenizer::pragma_c }, -- obsolete
5592 { "once", &CTcTokenizer::pragma_once },
5593 { "all_once", &CTcTokenizer::pragma_all_once },
5594 { "message", &CTcTokenizer::pragma_message },
5595 { "newline_spacing", &CTcTokenizer::pragma_newline_spacing },
5596 { 0, 0 }
5597 };
5598 pp_kw_def *kwp;
5599 size_t kwlen;
5600
5601 /* get the pragma keyword */
5602 if (next_on_line() != TOKT_SYM)
5603 {
5604 log_warning(TCERR_UNKNOWN_PRAGMA,
5605 (int)curtok_.get_text_len(), curtok_.get_text());
5606 return;
5607 }
5608
5609 /* get the keyword length */
5610 kwlen = curtok_.get_text_len();
5611
5612 /* scan the pragma list */
5613 for (kwp = kwlist ; kwp->kw != 0 ; ++kwp)
5614 {
5615 /* is this our keyword? */
5616 if (strlen(kwp->kw) == kwlen
5617 && memicmp(curtok_.get_text(), kwp->kw, kwlen) == 0)
5618 {
5619 /* this is our keyword - invoke the handler */
5620 (this->*(kwp->func))();
5621
5622 /* we're done */
5623 return;
5624 }
5625 }
5626
5627 /* we didn't find it - generate a warning */
5628 log_warning(TCERR_UNKNOWN_PRAGMA, kwlen, curtok_.get_text());
5629 }
5630
#if 0 // #pragma C is not currently used
/*
 *   Process a #pragma C directive (disabled - retained for reference only)
 */
void CTcTokenizer::pragma_c()
{
    tc_toktyp_t tok;
    int new_pragma_c;

    /* get the next token */
    tok = next_on_line();

    /*
     *   "+" or empty (end of line or whitespace) indicates C mode; "-"
     *   indicates standard mode
     */
    if (tok == TOKT_PLUS || tok == TOKT_EOF)
        new_pragma_c = TRUE;
    else if (tok == TOKT_MINUS)
        new_pragma_c = FALSE;
    else
    {
        /* malformed - warn and leave the stream's current mode unchanged */
        log_warning(TCERR_BAD_PRAGMA_SYNTAX);
        new_pragma_c = str_->is_pragma_c();
    }

    /*
     *   retain the pragma in the result if we're in preprocess-only mode,
     *   otherwise remove it
     */
    if (!pp_only_mode_)
        clear_linebuf();

    /* set the mode in the stream */
    str_->set_pragma_c(new_pragma_c);

    /* if there's a parser, notify it of the change */
    if (G_prs != 0)
        G_prs->set_pragma_c(new_pragma_c);
}
#endif
5672
5673 /*
5674 * Process a #pragma once directive
5675 */
void CTcTokenizer::pragma_once()
{
    /*
     *   add the current source file's name to the include-once list, so
     *   that later attempts to #include this same file are skipped
     */
    add_include_once(str_->get_desc()->get_fname());

    /* don't retain this pragma in the result */
    clear_linebuf();
}
5684
5685 /*
5686 * Process a #pragma all_once directive
5687 */
pragma_all_once()5688 void CTcTokenizer::pragma_all_once()
5689 {
5690 tc_toktyp_t tok;
5691
5692 /* get the next token */
5693 tok = next_on_line();
5694
5695 /*
5696 * "+" or empty (end of line or whitespace) indicates ALL_ONCE mode;
5697 * '-' indicates standard mode
5698 */
5699 if (tok == TOKT_PLUS || tok == TOKT_EOF)
5700 all_once_ = TRUE;
5701 else if (tok == TOKT_MINUS)
5702 all_once_ = FALSE;
5703 else
5704 log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5705
5706 /* don't retain this pragma in the result */
5707 clear_linebuf();
5708 }
5709
5710 /*
5711 * Process a #pragma message directive
5712 */
pragma_message()5713 void CTcTokenizer::pragma_message()
5714 {
5715 size_t startofs;
5716
5717 /*
5718 * copy the source line through the "message" token to the macro
5719 * expansion buffer - we don't want to expand that part, but we want
5720 * it to appear in the expansion, so just copy the original
5721 */
5722 startofs = (curtok_.get_text() + curtok_.get_text_len()
5723 - linebuf_.get_text());
5724 expbuf_.copy(linebuf_.get_text(), startofs);
5725
5726 /* expand macros; don't allow reading additional lines */
5727 if (expand_macros_curline(FALSE, FALSE, TRUE))
5728 {
5729 clear_linebuf();
5730 return;
5731 }
5732
5733 /*
5734 * If we're in normal compilation mode, display the message. If we're
5735 * in preprocess-only mode, simply retain the message in the
5736 * preprocessed result, so that it shows up when the result is
5737 * compiled.
5738 *
5739 * Ignore messages in list-includes mode.
5740 */
5741 if (!pp_only_mode_ && !list_includes_mode_)
5742 {
5743 /* set up at the first post-processed token */
5744 start_new_line(expbuf_.get_buf() + startofs,
5745 expbuf_.get_text_len() - startofs);
5746
5747 /* if there's an open paren, skip it */
5748 if (next_on_line_xlat(0) == TOKT_LPAR)
5749 next_on_line_xlat(0);
5750 else
5751 log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5752
5753 /* keep going until we reach the closing paren */
5754 while (curtok_.gettyp() != TOKT_RPAR
5755 && curtok_.gettyp() != TOKT_EOF)
5756 {
5757 /* display this token */
5758 switch(curtok_.gettyp())
5759 {
5760 case TOKT_SSTR:
5761 case TOKT_DSTR:
5762 case TOKT_SYM:
5763 /* display the text of the token */
5764 msg_str(curtok_.get_text(), curtok_.get_text_len());
5765 break;
5766
5767 case TOKT_INT:
5768 /* display the integer */
5769 msg_long(curtok_.get_int_val());
5770 break;
5771
5772 default:
5773 /* ignore anything else */
5774 break;
5775 }
5776
5777 /* get the next token */
5778 next_on_line_xlat(0);
5779 }
5780
5781 /* end the line */
5782 msg_str("\n", 1);
5783
5784 /* remove the message from the result text */
5785 clear_linebuf();
5786 }
5787 else
5788 {
5789 /* preprocessing - copy expanded text to line buffer */
5790 linebuf_.copy(expbuf_.get_text(), expbuf_.get_text_len());
5791 }
5792 }
5793
5794 /*
5795 * Process a #pragma newline_spacing(on/off) directive
5796 */
pragma_newline_spacing()5797 void CTcTokenizer::pragma_newline_spacing()
5798 {
5799 int f;
5800
5801 /* if we're in preprocess-only mode, just pass the pragma through */
5802 if (pp_only_mode_)
5803 return;
5804
5805 /* get the '(' token and the on/off token */
5806 if (next_on_line() != TOKT_LPAR || next_on_line() != TOKT_SYM)
5807 {
5808 log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5809 goto done;
5810 }
5811
5812 /* note the new mode flag */
5813 if (curtok_.get_text_len() == 2
5814 && memcmp(curtok_.get_text(), "on", 2) == 0)
5815 {
5816 /* it's 'on' */
5817 f = TRUE;
5818 }
5819 else if (curtok_.get_text_len() == 3
5820 && memcmp(curtok_.get_text(), "off", 3) == 0)
5821 {
5822 /* it's 'off' */
5823 f = FALSE;
5824 }
5825 else
5826 {
5827 log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5828 goto done;
5829 }
5830
5831 /* make sure we have the ')' token */
5832 if (next_on_line() != TOKT_RPAR)
5833 {
5834 log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5835 goto done;
5836 }
5837
5838 /* set the new mode */
5839 string_newline_spacing_ = f;
5840
5841 done:
5842 /* done - discard this line buffer */
5843 clear_linebuf();
5844 }
5845
5846
5847 /* ------------------------------------------------------------------------ */
5848 /*
5849 * Process a #charset directive
5850 */
pp_charset()5851 void CTcTokenizer::pp_charset()
5852 {
5853 /*
5854 * Encountering a #charset directive within the tokenizer is always
5855 * an error. If the file opener managed to use a #charset, we'll
5856 * never see it, because the file opener will have skipped it before
5857 * giving us the file.
5858 *
5859 * If we flagged a #charset error when opening the file, indicate
5860 * that the problem is that the character set given was unloadable;
5861 * otherwise, the problem is that #charset is in the wrong place.
5862 */
5863 log_error(str_->get_charset_error()
5864 ? TCERR_CANT_LOAD_CHARSET : TCERR_UNEXPECTED_CHARSET);
5865
5866 /* don't retain this pragma in the result */
5867 clear_linebuf();
5868 }
5869
5870 /* ------------------------------------------------------------------------ */
5871 /*
5872 * Process a #include directive
5873 */
pp_include()5874 void CTcTokenizer::pp_include()
5875 {
5876 wchar_t match;
5877 int is_local;
5878 int is_absolute;
5879 utf8_ptr fname;
5880 CTcSrcFile *new_src;
5881 int charset_error;
5882 int default_charset_error;
5883 char full_name[OSFNMAX];
5884 char lcl_name[OSFNMAX];
5885 int found;
5886 CTcTokFileDesc *desc;
5887 int expand;
5888 utf8_ptr start;
5889
5890 /* presume we'll expand macros */
5891 expand = TRUE;
5892
5893 /*
5894 * Check to see if expansion is needed. Macro expansion is needed
5895 * only if the source line is not of one of the following forms:
5896 *
5897 *. #include "filename"
5898 *. #include <filename>
5899 */
5900 for (start = p_ ; is_space(p_.getch()) ; p_.inc()) ;
5901 switch(p_.getch())
5902 {
5903 case '<':
5904 /* look for a matching '>' */
5905 match = '>';
5906 goto find_match;
5907
5908 case '"':
5909 /* look for a matching '"' */
5910 match = '"';
5911 goto find_match;
5912
5913 find_match:
5914 /* find the matching character */
5915 for (p_.inc() ; p_.getch() != '\0' && p_.getch() != match ;
5916 p_.inc()) ;
5917
5918 /* if we found it, check for other characters on the line */
5919 if (p_.getch() == match)
5920 {
5921 /* skip the matching character */
5922 p_.inc();
5923
5924 /* skip whitespace */
5925 while (is_space(p_.getch()))
5926 p_.inc();
5927
5928 /*
5929 * make sure there's nothing else on the line - if not, it's
5930 * one of the approved formats, so there's no need to do
5931 * macro expansion
5932 */
5933 if (p_.getch() == 0)
5934 expand = FALSE;
5935 }
5936 break;
5937 }
5938
5939 /* expand macros if necessary */
5940 if (expand)
5941 {
5942 /* do the expansion */
5943 if (expand_macros_curline(FALSE, FALSE, FALSE))
5944 {
5945 /* clear the buffer and abort */
5946 clear_linebuf();
5947 return;
5948 }
5949
5950 /* read from the expansion buffer */
5951 start_new_line(expbuf_.get_buf(), expbuf_.get_text_len());
5952 }
5953 else
5954 {
5955 /* no expansion needed - read from the original starting point */
5956 p_ = start;
5957 }
5958
5959 /* skip leading whitespace */
5960 for ( ; is_space(p_.getch()) ; p_.inc()) ;
5961
5962 /* we have to be looking at at '"' or '<' character */
5963 if (p_.getch() == '"')
5964 {
5965 /* look for a matching quote, and look for a local file */
5966 match = '"';
5967 is_local = TRUE;
5968 }
5969 else if (p_.getch() == '<')
5970 {
5971 /* look for a matching angle bracket, and look for a system file */
5972 match = '>';
5973 is_local = FALSE;
5974 }
5975 else
5976 {
5977 /* invalid syntax - log an error and ignore the line */
5978 log_error(TCERR_BAD_INC_SYNTAX);
5979 clear_linebuf();
5980 return;
5981 }
5982
5983 /* skip the open quote, and remember where the filename starts */
5984 p_.inc();
5985 fname = p_;
5986
5987 /* find the matching quote */
5988 for ( ; p_.getch() != '\0' && p_.getch() != match ; p_.inc()) ;
5989
5990 /* if we didn't find the match, log an error and ignore the line */
5991 if (p_.getch() == '\0')
5992 {
5993 log_error(TCERR_BAD_INC_SYNTAX);
5994 clear_linebuf();
5995 return;
5996 }
5997
5998 /*
5999 * null-terminate the filename (we don't care what else is in the
6000 * buffer at this point, so overwriting it isn't a problem)
6001 */
6002 p_.setch('\0');
6003
6004 /* check to see if the filename is absolute */
6005 is_absolute = os_is_file_absolute(fname.getptr());
6006
6007 /* we have yet to find the file */
6008 found = FALSE;
6009
6010 /*
6011 * in case the name is in portable URL notation, convert from URL
6012 * notation to local notation; we'll consider this form of the name
6013 * first, and only if we can't find it in this form will we try
6014 * treating the name as using local filename conventions
6015 */
6016 os_cvt_url_dir(lcl_name, sizeof(lcl_name), fname.getptr(), FALSE);
6017
6018 /*
6019 * Search for the included file.
6020 *
6021 * First, if it's a local file (in quotes rather than angle
6022 * brackets), start the search in the directory containing the
6023 * current file, then look in the directory containing the parent
6024 * file, and so on. If we fail to find it, proceed as for a
6025 * non-local file.
6026 */
6027 if (is_local)
6028 {
6029 CTcTokStream *cur_str;
6030 char pathbuf[OSFNMAX];
6031
6032 /* start with the current file, and search parents */
6033 for (cur_str = str_ ; cur_str != 0 ; cur_str = cur_str->get_parent())
6034 {
6035 /* get the path to the current file */
6036 os_get_path_name(pathbuf, sizeof(pathbuf),
6037 last_desc_->get_fname());
6038
6039 /*
6040 * try the URL-converted name first - this takes precedence
6041 * over a local interpretation of the name
6042 */
6043 os_build_full_path(full_name, sizeof(full_name),
6044 pathbuf, lcl_name);
6045 if (!osfacc(full_name))
6046 {
6047 found = TRUE;
6048 break;
6049 }
6050
6051 /* if it's a relative local name, try again with local naming */
6052 if (!is_absolute)
6053 {
6054 /*
6055 * build the full filename, treating the name as using
6056 * local system conventions
6057 */
6058 os_build_full_path(full_name, sizeof(full_name),
6059 pathbuf, fname.getptr());
6060
6061 /* if we found it, so note and stop searching */
6062 if (!osfacc(full_name))
6063 {
6064 found = TRUE;
6065 break;
6066 }
6067 }
6068 }
6069 }
6070
6071 /*
6072 * If we still haven't found the file (or if it's a non-local file,
6073 * in angle brackets), search the include path.
6074 */
6075 if (!found)
6076 {
6077 tctok_incpath_t *inc_path;
6078
6079 /* scan the include path */
6080 for (inc_path = incpath_head_ ; inc_path != 0 ;
6081 inc_path = inc_path->nxt)
6082 {
6083 /* try the URL-converted local name first */
6084 os_build_full_path(full_name, sizeof(full_name),
6085 inc_path->path, lcl_name);
6086 if (!osfacc(full_name))
6087 {
6088 found = TRUE;
6089 break;
6090 }
6091
6092 /* try with the local name, if it's a relative local name */
6093 if (!is_absolute)
6094 {
6095 /* build the full name for the file in this directory */
6096 os_build_full_path(full_name, sizeof(full_name),
6097 inc_path->path, fname.getptr());
6098
6099 /* if we found it, stop searching */
6100 if (!osfacc(full_name))
6101 {
6102 found = TRUE;
6103 break;
6104 }
6105 }
6106 }
6107 }
6108
6109 /*
6110 * If the filename specified an absolute path, and we didn't find a
6111 * file with any of the local interpretations, look at the absolute
6112 * path. Note that our portable URL-style notation doesn't allow
6113 * absolute notation, so we use only the exact name as specified in
6114 * the #include directive as the absolute form.
6115 */
6116 if (is_absolute && !found)
6117 {
6118 /* use the original filename as the full name */
6119 strcpy(full_name, fname.getptr());
6120
6121 /* try finding the file */
6122 found = !osfacc(full_name);
6123 }
6124
6125 /*
6126 * we have our copy of the filename now; we don't want to retain
6127 * this directive in the preprocessed source, so clear out the line
6128 * buffer now
6129 */
6130 clear_linebuf();
6131
6132 /*
6133 * if we didn't find the file anywhere, show an error and ignore the
6134 * #include directive
6135 */
6136 if (!found)
6137 {
6138 log_error(TCERR_INC_NOT_FOUND,
6139 (int)strlen(fname.getptr()), fname.getptr());
6140 return;
6141 }
6142
6143 /*
6144 * Check the list of included files that are marked for inclusion
6145 * only once. If we've already included this file, ignore this
6146 * redundant inclusion. Check based on the full filename that we
6147 * resolved from the search path.
6148 */
6149 if (find_include_once(full_name))
6150 {
6151 /* log an error if appropriate */
6152 if (warn_on_ignore_incl_)
6153 log_warning(TCERR_REDUNDANT_INCLUDE,
6154 (int)strlen(full_name), full_name);
6155
6156 /* ignore this #include directive */
6157 return;
6158 }
6159
6160 /* open a file source to read the file */
6161 new_src = CTcSrcFile::open_source(full_name, res_loader_,
6162 default_charset_, &charset_error,
6163 &default_charset_error);
6164
6165 /* if we couldn't open the file, log an error and ignore the line */
6166 if (new_src == 0)
6167 {
6168 /*
6169 * if the error was due to the default character set, log that
6170 * problem; otherwise, log the general file-open problem
6171 */
6172 if (default_charset_error)
6173 log_error(TCERR_CANT_LOAD_DEFAULT_CHARSET, default_charset_);
6174 else
6175 log_error(TCERR_INC_NOT_FOUND,
6176 (int)strlen(full_name), full_name);
6177
6178 /* we can go no further */
6179 return;
6180 }
6181
6182 /* get the descriptor for the source file */
6183 desc = get_file_desc(full_name, strlen(full_name), FALSE,
6184 fname.getptr(),
6185 fname.getptr() != 0 ? strlen(fname.getptr()) : 0);
6186
6187 /*
6188 * remember the current #pragma newline_spacing mode, so we can restore
6189 * it when we reinstate the current stream
6190 */
6191 str_->set_newline_spacing(string_newline_spacing_);
6192
6193 /*
6194 * Create and install the new file reader stream object. By
6195 * installing it as the current reader, we'll activate it so that
6196 * the next line read will come from the new stream. Note that the
6197 * current stream becomes the parent of the new stream, so that we
6198 * revert to the current stream when the new stream is exhausted;
6199 * this will allow us to pick up reading from the current stream at
6200 * the next line after the #include directive when we've finished
6201 * including the new file.
6202 */
6203 str_ = new CTcTokStream(desc, new_src, str_, charset_error, if_sp_);
6204
6205 /*
6206 * If we're in ALL_ONCE mode, it means that every single file we
6207 * include should be included only once.
6208 */
6209 if (all_once_)
6210 add_include_once(full_name);
6211
6212 /*
6213 * if we're in list-includes mode, write the name of the include file
6214 * to the standard output
6215 */
6216 if (list_includes_mode_)
6217 G_hostifc->print_msg("#include %s\n", full_name);
6218 }
6219
6220 /* ------------------------------------------------------------------------ */
6221 /*
6222 * Add a file to the include-once list. Once a file is in this list, we
6223 * won't include it again.
6224 */
add_include_once(const char * fname)6225 void CTcTokenizer::add_include_once(const char *fname)
6226 {
6227 tctok_incfile_t *prvinc;
6228
6229 /* if the file is already in the list, don't add it again */
6230 if (find_include_once(fname))
6231 return;
6232
6233 /* create a new entry for the filename */
6234 prvinc = (tctok_incfile_t *)t3malloc(sizeof(tctok_incfile_t)
6235 + strlen(fname));
6236
6237 /* save the filename */
6238 strcpy(prvinc->fname, fname);
6239
6240 /* link the new entry into our list */
6241 prvinc->nxt = prev_includes_;
6242 prev_includes_ = prvinc;
6243 }
6244
6245 /*
6246 * Find a file in the list of files to be included only once. Returns
6247 * true if the file is in the list, false if not.
6248 */
find_include_once(const char * fname)6249 int CTcTokenizer::find_include_once(const char *fname)
6250 {
6251 tctok_incfile_t *prvinc;
6252
6253 /* search the list */
6254 for (prvinc = prev_includes_ ; prvinc != 0 ; prvinc = prvinc->nxt)
6255 {
6256 /* if this one matches, we found it, so return true */
6257 if (strcmp(fname, prvinc->fname) == 0)
6258 return TRUE;
6259 }
6260
6261 /* we didn't find the file */
6262 return FALSE;
6263 }
6264
6265 /* ------------------------------------------------------------------------ */
6266 /*
6267 * Process a #define directive
6268 */
void CTcTokenizer::pp_define()
{
    const char *macro_name;
    size_t macro_len;
    const char *argv[TOK_MAX_MACRO_ARGS];
    size_t argvlen[TOK_MAX_MACRO_ARGS];
    int argc;
    int has_args;
    const char *expan;
    size_t expan_len;
    CTcHashEntryPp *entry;
    int has_varargs;

    /* get the macro name */
    if (next_on_line() != TOKT_SYM)
    {
        log_error(TCERR_BAD_DEFINE_SYM,
                  (int)curtok_.get_text_len(), curtok_.get_text());
        clear_linebuf();
        return;
    }

    /* make a copy of the macro name */
    macro_name = curtok_.get_text();
    macro_len = curtok_.get_text_len();

    /* no arguments yet */
    argc = 0;

    /* presume we won't find a varargs marker */
    has_varargs = FALSE;

    /*
     *   If there's a '(' immediately after the macro name, without any
     *   intervening whitespace, it has arguments; otherwise, it has no
     *   arguments.  Note which case we have.
     */
    if (p_.getch() == '(')
    {
        int done;
        tc_toktyp_t tok;

        /* note that we have an argument list */
        has_args = TRUE;

        /* assume we're not done yet */
        done = FALSE;

        /* skip the paren and get the next token */
        p_.inc();
        tok = next_on_line();

        /* check for an empty argument list */
        if (tok == TOKT_RPAR)
        {
            /* note that we're done with the arguments */
            done = TRUE;
        }

        /* scan the argument list */
        while (!done)
        {
            /* if we have too many arguments, it's an error */
            if (argc >= TOK_MAX_MACRO_ARGS)
            {
                log_error(TCERR_TOO_MANY_MAC_PARMS,
                          macro_name, macro_len, TOK_MAX_MACRO_ARGS);
                clear_linebuf();
                return;
            }

            /* if we're at the end of the macro, it's an error */
            if (tok == TOKT_EOF)
            {
                /* log the error and ignore the line */
                log_error(TCERR_MACRO_NO_RPAR);
                clear_linebuf();
                return;
            }

            /* check for a valid initial symbol character */
            if (tok != TOKT_SYM)
            {
                log_error_curtok(TCERR_BAD_MACRO_ARG_NAME);
                clear_linebuf();
                return;
            }

            /* remember the argument name */
            argvlen[argc] = curtok_.get_text_len();
            argv[argc++] = curtok_.get_text();

            /* get the next token */
            tok = next_on_line();

            /* make sure we have a comma or paren following */
            if (tok == TOKT_COMMA)
            {
                /* we have more arguments - skip the comma */
                tok = next_on_line();
            }
            else if (tok == TOKT_ELLIPSIS)
            {
                /* skip the ellipsis */
                tok = next_on_line();

                /* note the varargs marker */
                has_varargs = TRUE;

                /* this must be the last argument */
                if (tok != TOKT_RPAR)
                {
                    /* log the error */
                    log_error_curtok(TCERR_MACRO_ELLIPSIS_REQ_RPAR);

                    /* discard the line and give up */
                    clear_linebuf();
                    return;
                }

                /* that's the last argument - we can stop now */
                done = TRUE;
            }
            else if (tok == TOKT_RPAR)
            {
                /* no more arguments - note that we can stop now */
                done = TRUE;
            }
            else
            {
                /* invalid argument - log an error and discard the line */
                log_error_curtok(TCERR_MACRO_EXP_COMMA);
                clear_linebuf();
                return;
            }
        }
    }
    else
    {
        /*
         *   there are no arguments - the macro's expansion starts
         *   immediately after the end of the name and any subsequent
         *   whitespace
         */
        has_args = FALSE;
    }

    /* skip whitespace leading up to the expansion */
    while (is_space(p_.getch()))
        p_.inc();

    /* the rest of the line is the expansion */
    expan = p_.getptr();

    /* don't allow defining "defined" - it's a preprocessor operator */
    if (macro_len == 7 && memcmp(macro_name, "defined", 7) == 0)
    {
        /* log an error */
        log_error(TCERR_REDEF_OP_DEFINED);

        /* don't retain the directive in the preprocessed result */
        clear_linebuf();

        /* ignore the definition */
        return;
    }

    /* get the length of the expansion text */
    expan_len = strlen(expan);

    /*
     *   remove any trailing whitespace from the expansion text; however,
     *   leave a trailing space if it's preceded by a backslash
     */
    while (expan_len > 0
           && is_space(expan[expan_len-1])
           && !(expan_len > 1 && expan[expan_len-2] == '\\'))
        --expan_len;

    /*
     *   If there are arguments, scan the expansion for formal parameter
     *   names.  For each one we find, replace it with the special
     *   TOK_MACRO_FORMAL_FLAG character followed by a one-byte value
     *   giving the argument index.  This special sequence is less costly
     *   to find when we're expanding the macros - by doing the search
     *   here, we only need to do it once, rather than each time we expand
     *   the macro.
     */
    if (argc != 0)
    {
        utf8_ptr src;
        size_t dstofs;
        tc_toktyp_t typ;
        CTcToken tok;
        const char *start;
        int in_embedding = FALSE;

        /*
         *   Generate our modified expansion text in the macro expansion
         *   buffer.  Initially, make sure we have room for a copy of the
         *   text; we'll resize the buffer later if we find we need even
         *   more.
         */
        expbuf_.ensure_space(expan_len);

        /* scan for argument names, and replace them */
        for (start = expan, dstofs = 0, src.set((char *)expan) ;; )
        {
            /* get the next token */
            typ = next_on_line(&src, &tok, &in_embedding);

            /* if we've reached the end of the expansion, we're done */
            if (typ == TOKT_EOF)
                break;

            /*
             *   If this is a formal parameter name, we'll replace it with
             *   a special two-byte sequence; otherwise, we'll keep it
             *   unchanged.
             */
            if (typ == TOKT_SYM)
            {
                int i;

                /* find it in the table of formal parameter names */
                for (i = 0 ; i < argc ; ++i)
                {
                    /* does it match this argument name? */
                    if (argvlen[i] == tok.get_text_len()
                        && memcmp(argv[i], tok.get_text(),
                                  tok.get_text_len()) == 0)
                    {
                        size_t new_len;
                        size_t arg_len;
                        size_t repl_len;
                        char flag_byte;

                        /* get the length of the formal name */
                        arg_len = argvlen[i];

                        /*
                         *   the normal replacement length for a formal
                         *   parameter is two bytes - one byte for the flag,
                         *   and one for the formal parameter index
                         */
                        repl_len = 2;

                        /* by default, the flag byte is the formal flag */
                        flag_byte = TOK_MACRO_FORMAL_FLAG;

                        /*
                         *   Check for special varargs control suffixes.  If
                         *   we matched the last argument name, and this is
                         *   a varargs macro, we might have a suffix.
                         *   Offsets below are relative to the '#': suffix
                         *   text starts at +1, and the character just past
                         *   the suffix must not be a symbol character.
                         */
                        if (has_varargs
                            && i == argc - 1
                            && src.getch() == '#')
                        {
                            /* check for the various suffixes */
                            if (memcmp(src.getptr() + 1, "foreach", 7) == 0
                                && !is_sym(src.getch_at(8)))
                            {
                                /*
                                 *   include the suffix length in the token
                                 *   length ('#' plus 7 suffix characters)
                                 */
                                arg_len += 8;

                                /*
                                 *   the flag byte is the #foreach flag,
                                 *   which is a one-byte sequence
                                 */
                                flag_byte = TOK_MACRO_FOREACH_FLAG;
                                repl_len = 1;
                            }
                            else if (memcmp(src.getptr() + 1,
                                            "argcount", 8) == 0
                                     && !is_sym(src.getch_at(9)))
                            {
                                /*
                                 *   include the suffix length in the token
                                 *   length ('#' plus 8 suffix characters)
                                 */
                                arg_len += 9;

                                /*
                                 *   the flag byte is the #argcount flag,
                                 *   which is a one-byte sequence
                                 */
                                flag_byte = TOK_MACRO_ARGCOUNT_FLAG;
                                repl_len = 1;
                            }
                            else if (memcmp(src.getptr() + 1,
                                            "ifempty", 7) == 0
                                     && !is_sym(src.getch_at(8)))
                            {
                                /* include the length */
                                arg_len += 8;

                                /* set the one-byte flag */
                                flag_byte = TOK_MACRO_IFEMPTY_FLAG;
                                repl_len = 1;
                            }
                            else if (memcmp(src.getptr() + 1,
                                            "ifnempty", 8) == 0
                                     && !is_sym(src.getch_at(9)))
                            {
                                /* include the length */
                                arg_len += 9;

                                /* set the one-byte flag */
                                flag_byte = TOK_MACRO_IFNEMPTY_FLAG;
                                repl_len = 1;
                            }
                        }

                        /*
                         *   calculate the new length - we're removing the
                         *   argument name and adding the replacement string
                         *   in its place
                         */
                        new_len = expan_len + repl_len - arg_len;

                        /*
                         *   we need two bytes for the replacement - if
                         *   this is more than we're replacing, make sure
                         *   we have room for the extra
                         */
                        if (new_len > expan_len)
                            expbuf_.ensure_space(new_len);

                        /*
                         *   copy everything up to but not including the
                         *   formal name
                         */
                        if (tok.get_text() > start)
                        {
                            /* store the text */
                            memcpy(expbuf_.get_buf() + dstofs,
                                   start, tok.get_text() - start);

                            /* move past the stored text in the output */
                            dstofs += tok.get_text() - start;
                        }

                        /* the next segment starts after this token */
                        start = tok.get_text() + arg_len;

                        /* store the flag byte */
                        expbuf_.get_buf()[dstofs++] = flag_byte;

                        /*
                         *   If appropriate, store the argument index - this
                         *   always fits in one byte because our hard limit
                         *   on formal parameters is less than 128 per
                         *   macro.  Note that we add one to the index so
                         *   that we never store a zero byte, to avoid any
                         *   potential confusion with a null terminator
                         *   byte.
                         */
                        if (repl_len > 1)
                            expbuf_.get_buf()[dstofs++] = (char)(i + 1);

                        /* remember the new length */
                        expan_len = new_len;

                        /* no need to search further for it */
                        break;
                    }
                }
            }
        }

        /* copy the last segment */
        if (tok.get_text() > start)
        {
            /* store the text */
            memcpy(expbuf_.get_buf() + dstofs, start,
                   tok.get_text() - start);
        }

        /* set the new length */
        expbuf_.set_text_len(expan_len);

        /* use the modified expansion text instead of the original */
        expan = expbuf_.get_text();
    }

    /*
     *   check the symbol table to see if this symbol is already defined -
     *   if so, show a warning, but honor the new definition
     */
    entry = find_define(macro_name, macro_len);
    if (entry != 0)
    {
        /*
         *   Check for a trivial redefinition - if the number of arguments
         *   is the same, and the type (object-like or function-like) is
         *   the same, and the expansion string is identical, there's no
         *   need to warn, because the redefinition has no effect and can
         *   thus be safely ignored.  Note that we must ignore any
         *   differences in the whitespace in the expansions for this
         *   comparison.
         */
        if ((entry->has_args() != 0) == (has_args != 0)
            && entry->get_argc() == argc
            && lib_strequal_collapse_spaces(expan, expan_len,
                                            entry->get_expansion(),
                                            entry->get_expan_len()))
        {
            /* it's a trivial redefinition - silently ignore it */
            goto done;
        }

        /* log a warning about the redefinition */
        log_warning(TCERR_MACRO_REDEF, (int)macro_len, macro_name);

        /* remove and delete the old entry */
        defines_->remove(entry);

        /* if the item isn't already in the #undef table, add it */
        if (find_undef(macro_name, macro_len) == 0)
        {
            /*
             *   move the entry to the #undef table so that we can keep track
             *   of the fact that this macro's definition has changed in the
             *   course of the compilation
             */
            undefs_->add(entry);
        }
        else
        {
            /*
             *   the name is already in the #undef table, so we don't need
             *   another copy - just forget about the old entry entirely
             */
            delete entry;
        }
    }

    /* create an entry for the new macro */
    entry = new CTcHashEntryPpDefine(macro_name, macro_len, TRUE,
                                     has_args, argc, has_varargs,
                                     argv, argvlen, expan, expan_len);

    /* add it to the hash table */
    defines_->add(entry);

done:
    /* don't retain the directive in the preprocessed source */
    clear_linebuf();
}
6722
6723 /* ------------------------------------------------------------------------ */
6724 /*
6725 * Process a #ifdef directive
6726 */
pp_ifdef()6727 void CTcTokenizer::pp_ifdef()
6728 {
6729 /* process the ifdef/ifndef with a positive sense */
6730 pp_ifdef_or_ifndef(TRUE);
6731 }
6732
6733 /*
6734 * Process a #ifndef directive
6735 */
pp_ifndef()6736 void CTcTokenizer::pp_ifndef()
6737 {
6738 /* process the ifdef/ifndef with a negative sense */
6739 pp_ifdef_or_ifndef(FALSE);
6740 }
6741
6742 /*
6743 * Process a #ifdef or #ifndef. If 'sense' is true, we'll take the
6744 * branch if the symbol is defined (hence #ifdef), otherwise we'll take
6745 * it if the symbol isn't defined (hence #ifndef).
6746 */
pp_ifdef_or_ifndef(int sense)6747 void CTcTokenizer::pp_ifdef_or_ifndef(int sense)
6748 {
6749 char macro_name[TOK_SYM_MAX_BUFFER];
6750 int found;
6751 tok_if_t state;
6752
6753 /* make sure we have a valid symbol */
6754 if (pp_get_lone_ident(macro_name, sizeof(macro_name)))
6755 {
6756 /* clear the line buffer */
6757 clear_linebuf();
6758
6759 /*
6760 * push a true if to avoid cascading errors for matching #endif
6761 * or #else
6762 */
6763 push_if(TOKIF_IF_YES);
6764
6765 /* we're done */
6766 return;
6767 }
6768
6769 /* check to see if it's defined */
6770 found = (find_define(macro_name, strlen(macro_name)) != 0);
6771
6772 /*
6773 * if we found it and they wanted it found, or we didn't find it and
6774 * they didn't want it found, take a true branch; otherwise, take a
6775 * false branch
6776 */
6777 if ((sense != 0) == (found != 0))
6778 state = TOKIF_IF_YES;
6779 else
6780 state = TOKIF_IF_NO;
6781
6782 /* push the new #if state */
6783 push_if(state);
6784
6785 /* don't retain the directive in the preprocessed source */
6786 clear_linebuf();
6787 }
6788
6789 /* ------------------------------------------------------------------------ */
6790 /*
6791 * Process a #if directive
6792 */
pp_if()6793 void CTcTokenizer::pp_if()
6794 {
6795 CTcConstVal val;
6796
6797 /* expand macros; don't allow reading additional lines */
6798 if (expand_macros_curline(FALSE, TRUE, FALSE))
6799 goto do_error;
6800
6801 /*
6802 * we don't need the original source line any more, and we don't
6803 * want to copy it to the preprocessed output, so clear it
6804 */
6805 clear_linebuf();
6806
6807 /* parse out of the expansion buffer */
6808 start_new_line(expbuf_.get_buf(), expbuf_.get_text_len());
6809
6810 /* parse the preprocessor expression */
6811 if (pp_parse_expr(&val, TRUE, TRUE, TRUE))
6812 {
6813 /*
6814 * we can't get a value; treat the expression as true and
6815 * continue parsing, so that we don't throw off the #if nesting
6816 * level
6817 */
6818 val.set_bool(TRUE);
6819 }
6820
6821 /* push the new state according to the value of the expression */
6822 push_if(val.get_val_bool() ? TOKIF_IF_YES : TOKIF_IF_NO);
6823
6824 /* done */
6825 return;
6826
6827 do_error:
6828 /* clear the line buffer */
6829 clear_linebuf();
6830
6831 /*
6832 * push a true if - even though we can't evaluate the condition, we
6833 * can at least avoid a cascade of errors for the matching #endif
6834 * and #else
6835 */
6836 push_if(TOKIF_IF_YES);
6837 }
6838
6839 /* ------------------------------------------------------------------------ */
6840 /*
6841 * Process a #elif directive
6842 */
pp_elif()6843 void CTcTokenizer::pp_elif()
6844 {
6845 CTcConstVal val;
6846
6847 /* expand macros; don't allow reading additional lines */
6848 if (expand_macros_curline(FALSE, TRUE, FALSE))
6849 {
6850 clear_linebuf();
6851 return;
6852 }
6853
6854 /* parse out of the expansion buffer */
6855 start_new_line(expbuf_.get_buf(), expbuf_.get_text_len());
6856
6857 /* parse the preprocessor expression */
6858 if (pp_parse_expr(&val, TRUE, TRUE, TRUE))
6859 {
6860 clear_linebuf();
6861 return;
6862 }
6863
6864 /*
6865 * make sure that the #elif occurs in the same file as the
6866 * corresponding #if
6867 */
6868 if (if_sp_ <= str_->get_init_if_level())
6869 {
6870 /* log the error */
6871 log_error(TCERR_PP_ELIF_NOT_IN_SAME_FILE);
6872
6873 /* clear the text and abort */
6874 clear_linebuf();
6875 return;
6876 }
6877
6878 /* check the current #if state */
6879 switch(get_if_state())
6880 {
6881 case TOKIF_IF_YES:
6882 /*
6883 * we just took the #if branch, so don't take this or any
6884 * subsequent #elif or #else branch, regardless of the value of
6885 * the condition - set the state to DONE to indicate that we're
6886 * skipping everything through the endif
6887 */
6888 change_if_state(TOKIF_IF_DONE);
6889 break;
6890
6891 case TOKIF_IF_NO:
6892 /*
6893 * We haven't yet taken a #if or #elif branch, so we can take
6894 * this branch if its condition is true. If this branch's
6895 * condition is false, stay with NO so that we will consider
6896 * future #elif and #else branches.
6897 */
6898 if (val.get_val_bool())
6899 change_if_state(TOKIF_IF_YES);
6900 break;
6901
6902 case TOKIF_IF_DONE:
6903 /*
6904 * we've already taken a #if or #elif branch, so we must ignore
6905 * this and subsequent #elif and #else branches until we get to
6906 * our #endif - just stay in state DONE
6907 */
6908 break;
6909
6910 case TOKIF_NONE:
6911 case TOKIF_ELSE_YES:
6912 case TOKIF_ELSE_NO:
6913 /*
6914 * we're not in a #if branch at all, or we're inside a #else; a
6915 * #elif is not legal here
6916 */
6917 log_error(TCERR_PP_ELIF_WITHOUT_IF);
6918 break;
6919 }
6920
6921 /* don't retain the directive in the preprocessed source */
6922 clear_linebuf();
6923 }
6924
6925 /* ------------------------------------------------------------------------ */
6926 /*
6927 * Process a #else directive
6928 */
pp_else()6929 void CTcTokenizer::pp_else()
6930 {
6931 /* make sure there's nothing but whitespace on the line */
6932 if (next_on_line() != TOKT_EOF)
6933 log_error(TCERR_PP_EXTRA);
6934
6935 /*
6936 * make sure that the #else occurs in the same file as the
6937 * corresponding #if
6938 */
6939 if (if_sp_ <= str_->get_init_if_level())
6940 {
6941 /* log the error */
6942 log_error(TCERR_PP_ELSE_NOT_IN_SAME_FILE);
6943
6944 /* clear the text and abort */
6945 clear_linebuf();
6946 return;
6947 }
6948
6949 /* check our current #if state */
6950 switch(get_if_state())
6951 {
6952 case TOKIF_IF_YES:
6953 case TOKIF_IF_DONE:
6954 /*
6955 * we've already taken a true #if branch, so we don't want to
6956 * process the #else part - switch to a false #else branch
6957 */
6958 change_if_state(TOKIF_ELSE_NO);
6959 break;
6960
6961 case TOKIF_IF_NO:
6962 /*
6963 * we haven't yet found a true #if branch, so take the #else
6964 * branch -- switch to a true #else branch
6965 */
6966 change_if_state(TOKIF_ELSE_YES);
6967 break;
6968
6969 case TOKIF_NONE:
6970 case TOKIF_ELSE_YES:
6971 case TOKIF_ELSE_NO:
6972 /*
6973 * we're not in a #if at all, or we're in a #else - log an error
6974 * and ignore it
6975 */
6976 log_error(TCERR_PP_ELSE_WITHOUT_IF);
6977 break;
6978 }
6979
6980 /* don't retain the directive in the preprocessed source */
6981 clear_linebuf();
6982 }
6983
6984 /* ------------------------------------------------------------------------ */
6985 /*
6986 * Process a #endif directive
6987 */
pp_endif()6988 void CTcTokenizer::pp_endif()
6989 {
6990 /* make sure the rest of the line is blank */
6991 if (next_on_line() != TOKT_EOF)
6992 log_error(TCERR_PP_EXTRA);
6993
6994 /* ignore the rest of the line */
6995 clear_linebuf();
6996
6997 /* if we're not in a #if in the same file it's an error */
6998 if (if_sp_ == 0)
6999 {
7000 log_error(TCERR_PP_ENDIF_WITHOUT_IF);
7001 return;
7002 }
7003 else if (if_sp_ <= str_->get_init_if_level())
7004 {
7005 log_error(TCERR_PP_ENDIF_NOT_IN_SAME_FILE);
7006 return;
7007 }
7008
7009 /* pop a #if level */
7010 pop_if();
7011
7012 /* don't retain the directive in the preprocessed source */
7013 clear_linebuf();
7014 }
7015
7016 /* ------------------------------------------------------------------------ */
7017 /*
7018 * Process a #error directive
7019 */
pp_error()7020 void CTcTokenizer::pp_error()
7021 {
7022 size_t startofs;
7023
7024 /*
7025 * copy the source line through the "error" token to the macro
7026 * expansion buffer - we don't want to expand that part, but we want
7027 * it to appear in the expansion, so just copy the original
7028 */
7029 startofs = (curtok_.get_text() + curtok_.get_text_len()
7030 - linebuf_.get_text());
7031 expbuf_.copy(linebuf_.get_text(), startofs);
7032
7033 /* expand macros; don't allow reading additional lines */
7034 if (expand_macros_curline(FALSE, FALSE, TRUE))
7035 {
7036 clear_linebuf();
7037 return;
7038 }
7039
7040 /*
7041 * If we're in preprocess-only mode, simply retain the text in the
7042 * processed result, so that the error is processed on a subsequent
7043 * compilation of the result; otherwise, display the error.
7044 *
7045 * Ignore #error directives in list-includes mode as well.
7046 */
7047 if (!pp_only_mode_ && !list_includes_mode_)
7048 {
7049 /* display the error */
7050 log_error(TCERR_ERROR_DIRECTIVE,
7051 (int)expbuf_.get_text_len() - startofs,
7052 expbuf_.get_text() + startofs);
7053
7054 /* clear the directive from the result */
7055 clear_linebuf();
7056 }
7057 else
7058 {
7059 /* preprocessing - copy expanded text to line buffer */
7060 linebuf_.copy(expbuf_.get_text(), expbuf_.get_text_len());
7061 }
7062 }
7063
7064 /* ------------------------------------------------------------------------ */
7065 /*
7066 * Process a #undef directive
7067 */
pp_undef()7068 void CTcTokenizer::pp_undef()
7069 {
7070 char macro_name[TOK_SYM_MAX_BUFFER];
7071
7072 /* get the macro name */
7073 if (pp_get_lone_ident(macro_name, sizeof(macro_name)))
7074 {
7075 clear_linebuf();
7076 return;
7077 }
7078
7079 /* remove it */
7080 undefine(macro_name);
7081
7082 /* don't retain the directive in the preprocessed source */
7083 clear_linebuf();
7084 }
7085
7086 /*
7087 * Programmatically delete a preprocesor symbol
7088 */
undefine(const char * sym,size_t len)7089 void CTcTokenizer::undefine(const char *sym, size_t len)
7090 {
7091 CTcHashEntryPp *entry;
7092
7093 /*
7094 * find the macro - if it wasn't defined, silently ignore it, since
7095 * it's legal to #undef a symbol that wasn't previously defined
7096 */
7097 entry = find_define(sym, len);
7098 if (entry != 0 && entry->is_undefable())
7099 {
7100 /* remove it */
7101 defines_->remove(entry);
7102
7103 /* if it's not already in the #undef table, move it there */
7104 if (find_undef(sym, len) == 0)
7105 {
7106 /* move it to the #undef table */
7107 undefs_->add(entry);
7108 }
7109 else
7110 {
7111 /*
7112 * the name is already in the #undef table, so we don't need to
7113 * add it again - we can forget about this entry entirely
7114 */
7115 delete entry;
7116 }
7117 }
7118 }
7119
/* ------------------------------------------------------------------------ */
/*
 *   Process a #line directive.  The directive takes an integer line
 *   number and a string filename (both may be macro-expanded); we reset
 *   the current stream's line counter and file descriptor accordingly.
 */
void CTcTokenizer::pp_line()
{
    CTcConstVal val_line;
    CTcConstVal val_fname;
    CTcTokFileDesc *desc;

    /* expand macros; don't allow reading additional lines */
    if (expand_macros_curline(FALSE, TRUE, FALSE))
    {
        clear_linebuf();
        return;
    }

    /*
     *   we don't need the original source line any more, and we don't
     *   want to copy it to the preprocessed output, so clear it 
     */
    clear_linebuf();

    /* set up to parse the operands from the macro expansion */
    start_new_line(expbuf_.get_buf(), expbuf_.get_text_len());

    /* evaluate the line number expression (reading the first token) */
    if (pp_parse_expr(&val_line, TRUE, FALSE, TRUE))
        return;

    /* if it's not an integer constant, it's an error */
    if (val_line.get_type() != TC_CVT_INT)
    {
        log_error(TCERR_LINE_REQ_INT);
        return;
    }

    /*
     *   evaluate the filename expression, which must be the last thing on
     *   the line 
     */
    if (pp_parse_expr(&val_fname, FALSE, TRUE, TRUE))
        return;

    /* the filename must be a string expression */
    if (val_fname.get_type() != TC_CVT_SSTR)
    {
        log_error(TCERR_LINE_FILE_REQ_STR);
        return;
    }

    /* find or create a descriptor for the filename */
    desc = get_file_desc(val_fname.get_val_str(),
                         val_fname.get_val_str_len(), FALSE, 0, 0);

    /* set the new line number and descriptor in the current stream */
    if (str_ != 0)
    {
        str_->set_next_linenum(val_line.get_val_int());
        str_->set_desc(desc);
    }

    /*
     *   retain the directive in the result if we're in preprocess-only
     *   mode, otherwise remove it 
     */
    if (!pp_only_mode_)
        clear_linebuf();
}
7186
7187 /* ------------------------------------------------------------------------ */
7188 /*
7189 * Look up a symbol in the #define symbol table
7190 */
find_define(const char * sym,size_t len) const7191 CTcHashEntryPp *CTcTokenizer::find_define(const char *sym, size_t len) const
7192 {
7193 /* look it up in the #define symbol table and return the result */
7194 return (CTcHashEntryPp *)defines_->find(sym, len);
7195 }
7196
/*
 *   Look up a symbol in the #undef table 
 */
CTcHashEntryPp *CTcTokenizer::find_undef(const char *sym, size_t len) const
{
    /* look it up in the #undef symbol table and return the result */
    return (CTcHashEntryPp *)undefs_->find(sym, len);
}
7205
7206 /*
7207 * Add a preprocessor macro definition
7208 */
add_define(const char * sym,size_t len,const char * expansion,size_t expan_len)7209 void CTcTokenizer::add_define(const char *sym, size_t len,
7210 const char *expansion, size_t expan_len)
7211 {
7212 CTcHashEntryPp *entry;
7213
7214 /* create an entry for the macro, with no argument list */
7215 entry = new CTcHashEntryPpDefine(sym, len, TRUE, FALSE, 0, FALSE, 0, 0,
7216 expansion, expan_len);
7217
7218 /* add the new entry to the table */
7219 defines_->add(entry);
7220 }
7221
7222 /*
7223 * Add a preprocessor macro definition
7224 */
add_define(CTcHashEntryPp * entry)7225 void CTcTokenizer::add_define(CTcHashEntryPp *entry)
7226 {
7227 /* add the entry to our symbol table */
7228 defines_->add(entry);
7229 }
7230
/*
 *   Parse a preprocessor constant expression.  'val' receives the
 *   constant result.  If 'read_first' is true, we fetch the first token
 *   of the expression ourselves; otherwise the caller has already primed
 *   the token stream.  If 'last_on_line' is true, we require the
 *   expression to be the last thing on the line.  If 'add_line_ending' is
 *   true, we temporarily append the special end-of-preprocess-line marker
 *   to the expansion buffer so tokenizing stops at the end of this line.
 *   
 *   Returns zero on success, non-zero if the expression is malformed or
 *   doesn't evaluate to a constant.
 */
int CTcTokenizer::pp_parse_expr(CTcConstVal *val, int read_first,
                                int last_on_line, int add_line_ending)
{
    CTcPrsNode *expr_tree;
    char ch;

    /* add the line ending marker if required */
    if (add_line_ending)
    {
        /*
         *   append the special end-of-preprocess-line marker to the macro
         *   expansion buffer 
         */
        ch = TOK_END_PP_LINE;
        expbuf_.append(&ch, 1);
    }

    /*
     *   note that we're parsing a preprocessor expression; this affects
     *   error logging in certain cases 
     */
    in_pp_expr_ = TRUE;

    /*
     *   parse the expression in preprocessor mode, so that double-quoted
     *   strings can be concatenated and compared 
     */
    G_prs->set_pp_expr_mode(TRUE);

    /* get the first token on the line if desired */
    if (read_first)
        next();

    /* parse the expression */
    expr_tree = G_prs->parse_expr();

    /* make sure we're at the end of the line if desired */
    if (last_on_line && next() != TOKT_EOF)
        log_error(TCERR_PP_EXPR_EXTRA);

    /* if we added the special pp-line-ending marker, remove it */
    if (add_line_ending)
    {
        /*
         *   the marker is always the last character - remove it simply by
         *   shortening the buffer by a character 
         */
        expbuf_.set_text_len(expbuf_.get_text_len() - 1);
    }

    /*
     *   return to normal expression mode (do this even on a parse
     *   failure, so the modes don't leak into subsequent parsing) 
     */
    G_prs->set_pp_expr_mode(FALSE);

    /* return to normal tokenizing mode */
    in_pp_expr_ = FALSE;

    /* if we didn't get a valid expression, return failure */
    if (expr_tree == 0)
        return 1;

    /* make sure we got a constant */
    if (!expr_tree->is_const())
    {
        log_error(TCERR_PP_EXPR_NOT_CONST);
        return 1;
    }

    /* fill in the caller's value */
    *val = *expr_tree->get_const_val();

    /* success */
    return 0;
}
7307
/* ------------------------------------------------------------------------ */
/*
 *   #define enumeration callback context.  This carries the client's
 *   typed callback and its context pointer through the generic hash-table
 *   enumerator, which only knows about CVmHashEntry callbacks.
 */
struct def_enum_cb_t
{
    /* original callback function */
    void (*cb)(void *, CTcHashEntryPp *);

    /* original callback context */
    void *ctx;
};
7320
7321 /*
7322 * #define enumeration callback. This is a simple impedence matcher on the
7323 * way to the real callbac; we cast the generic hash entry type to the
7324 * CTcHashEntryPp subclass for the benefit of the real callback.
7325 */
enum_defines_cb(void * ctx0,CVmHashEntry * entry)7326 static void enum_defines_cb(void *ctx0, CVmHashEntry *entry)
7327 {
7328 def_enum_cb_t *ctx;
7329
7330 /* get our real context */
7331 ctx = (def_enum_cb_t *)ctx0;
7332
7333 /* invoke the real callback, casting the entry reference appropriately */
7334 (*ctx->cb)(ctx->ctx, (CTcHashEntryPp *)entry);
7335 }
7336
7337 /*
7338 * Enumerate the entries in the #define table through a callback
7339 */
enum_defines(void (* cb)(void *,CTcHashEntryPp *),void * ctx)7340 void CTcTokenizer::enum_defines(void (*cb)(void *, CTcHashEntryPp *),
7341 void *ctx)
7342 {
7343 def_enum_cb_t myctx;
7344
7345 /* set up our impedence-matcher context with the real callback info */
7346 myctx.cb = cb;
7347 myctx.ctx = ctx;
7348
7349 /* enumerate through our impedence-matcher callback */
7350 defines_->enum_entries(&enum_defines_cb, &myctx);
7351 }
7352
7353 /* ------------------------------------------------------------------------ */
7354 /*
7355 * Get a lone identifier for a preprocessor directive. The identifier
7356 * must be the only thing left on the line; we'll generate an error if
7357 * extra characters follow on the line.
7358 *
7359 * If there's no identifier on the line, or there's more information
7360 * after the identifier, logs an error and returns non-zero; returns
7361 * zero on success.
7362 */
pp_get_lone_ident(char * buf,size_t bufl)7363 int CTcTokenizer::pp_get_lone_ident(char *buf, size_t bufl)
7364 {
7365 /* get the next token, and make sure it's a symbol */
7366 if (next_on_line() != TOKT_SYM)
7367 {
7368 log_error_curtok(TCERR_BAD_DEFINE_SYM);
7369 return 1;
7370 }
7371
7372 /* return an error if it doesn't fit */
7373 if (curtok_.get_text_len() > bufl)
7374 return 1;
7375
7376 /* copy the text */
7377 memcpy(buf, curtok_.get_text(), curtok_.get_text_len());
7378 buf[curtok_.get_text_len()] = '\0';
7379
7380 /* make sure there's nothing else on the line but whitespace */
7381 if (next_on_line() != TOKT_EOF)
7382 {
7383 log_error(TCERR_PP_EXTRA);
7384 return 1;
7385 }
7386
7387 /* success */
7388 return 0;
7389 }
7390
7391 /* ------------------------------------------------------------------------ */
7392 /*
7393 * Push a new #if level
7394 */
push_if(tok_if_t state)7395 void CTcTokenizer::push_if(tok_if_t state)
7396 {
7397 /* if we're out of space in the stack, throw a fatal error */
7398 if (if_sp_ == TOK_MAX_IF_NESTING)
7399 throw_fatal_error(TCERR_IF_NESTING_OVERFLOW);
7400
7401 /*
7402 * if we're in a nested #if in a false #if, increase the nested
7403 * false #if level
7404 */
7405 if (in_false_if())
7406 ++if_false_level_;
7407
7408 /* push the state, remembering where the #if was defined */
7409 if_stack_[if_sp_].desc = last_desc_;
7410 if_stack_[if_sp_].linenum = last_linenum_;
7411 if_stack_[if_sp_++].state = state;
7412 }
7413
7414 /*
7415 * Pop a #if level
7416 */
pop_if()7417 void CTcTokenizer::pop_if()
7418 {
7419 /* if we're in a nested #if in a false #if, pop the nesting level */
7420 if (if_false_level_ != 0)
7421 --if_false_level_;
7422
7423 /* pop the main if level */
7424 if (if_sp_ != 0)
7425 --if_sp_;
7426 }
7427
7428
7429 /* ------------------------------------------------------------------------ */
7430 /*
7431 * Log an error
7432 */
log_error(int errnum,...)7433 void CTcTokenizer::log_error(int errnum, ...)
7434 {
7435 va_list marker;
7436
7437 /* display the message */
7438 va_start(marker, errnum);
7439 G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7440 TC_SEV_ERROR, errnum, marker);
7441 va_end(marker);
7442 }
7443
/*
 *   Log an error with the current token's text as the parameter data,
 *   suitable for use with a "%.*s" display format entry 
 */
void CTcTokenizer::log_error_curtok(int errnum)
{
    /*
     *   display the message, passing "%.*s" parameter data for the
     *   current token text: an integer giving the length of the token
     *   text, and a pointer to the token text 
     */
    log_error_or_warning_curtok(TC_SEV_ERROR, errnum);
}
7457
/*
 *   Log an error or warning (per 'sev') for the current token 
 */
void CTcTokenizer::log_error_or_warning_curtok(tc_severity_t sev, int errnum)
{
    /* log the error with our current token */
    log_error_or_warning_with_tok(sev, errnum, getcur());
}
7466
7467 /*
7468 * Log an error or warning with the given token
7469 */
log_error_or_warning_with_tok(tc_severity_t sev,int errnum,const CTcToken * tok)7470 void CTcTokenizer::log_error_or_warning_with_tok(
7471 tc_severity_t sev, int errnum, const CTcToken *tok)
7472 {
7473 const char *tok_txt;
7474 size_t tok_len;
7475 char buf[128];
7476 const char *prefix;
7477 const char *suffix;
7478 utf8_ptr src;
7479 utf8_ptr dst;
7480 size_t rem;
7481 size_t outchars;
7482
7483 /* see what we have */
7484 switch(tok->gettyp())
7485 {
7486 case TOKT_SSTR:
7487 /* show the string in quotes, but limit the length */
7488 prefix = "'";
7489 suffix = "'";
7490 goto format_string;
7491
7492 case TOKT_DSTR:
7493 prefix = "\"";
7494 suffix = "\"";
7495 goto format_string;
7496
7497 case TOKT_DSTR_START:
7498 prefix = "\"";
7499 suffix = "<<";
7500 goto format_string;
7501
7502 case TOKT_DSTR_MID:
7503 prefix = ">>";
7504 suffix = "<<";
7505 goto format_string;
7506
7507 case TOKT_DSTR_END:
7508 prefix = ">>";
7509 suffix = "\"";
7510 goto format_string;
7511
7512 format_string:
7513 /* set the prefix */
7514 strcpy(buf, prefix);
7515
7516 /*
7517 * show the string, but limit the length, and convert control
7518 * characters to escaped representation
7519 */
7520 src.set((char *)tok->get_text());
7521 rem = tok->get_text_len();
7522 for (dst.set(buf + strlen(buf)), outchars = 0 ;
7523 rem != 0 && outchars < 20 ; src.inc(&rem), ++outchars)
7524 {
7525 /* if this is a control character, escape it */
7526 if (src.getch() < 32)
7527 {
7528 dst.setch('\\');
7529
7530 switch(src.getch())
7531 {
7532 case 10:
7533 dst.setch('n');
7534 break;
7535
7536 case 0x000F:
7537 dst.setch('^');
7538 break;
7539
7540 case 0x000E:
7541 dst.setch('v');
7542 break;
7543
7544 case 0x000B:
7545 dst.setch('b');
7546 break;
7547
7548 case 0x0015:
7549 dst.setch(' ');
7550 break;
7551
7552 case 9:
7553 dst.setch('t');
7554 break;
7555
7556 default:
7557 dst.setch('x');
7558 dst.setch('0' + (src.getch() >> 12) & 0xf);
7559 dst.setch('0' + (src.getch() >> 8) & 0xf);
7560 dst.setch('0' + (src.getch() >> 4) & 0xf);
7561 dst.setch('0' + (src.getch()) & 0xf);
7562 break;
7563 }
7564 }
7565 else
7566 {
7567 /* put this character as-is */
7568 dst.setch(src.getch());
7569 }
7570 }
7571
7572 /* if there's more string left, add "..." */
7573 if (rem != 0)
7574 {
7575 dst.setch('.');
7576 dst.setch('.');
7577 dst.setch('.');
7578 }
7579
7580 /* add the suffix */
7581 strcpy(dst.getptr(), suffix);
7582
7583 /* use this buffer as the token string to display */
7584 tok_txt = buf;
7585 tok_len = strlen(tok_txt);
7586 break;
7587
7588 case TOKT_EOF:
7589 /* show a special "<End Of File>" marker */
7590 tok_txt = "<End Of File>";
7591 tok_len = strlen(tok_txt);
7592 break;
7593
7594 default:
7595 /* just show the current token text */
7596 tok_txt = tok->get_text();
7597 tok_len = tok->get_text_len();
7598 break;
7599 }
7600
7601 /* log the error */
7602 G_tcmain->log_error(get_last_desc(), get_last_linenum(),
7603 sev, errnum, tok_len, tok_txt);
7604 }
7605
7606 /*
7607 * Log a warning
7608 */
log_warning(int errnum,...)7609 void CTcTokenizer::log_warning(int errnum, ...)
7610 {
7611 va_list marker;
7612
7613 /* display the message */
7614 va_start(marker, errnum);
7615 G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7616 TC_SEV_WARNING, errnum, marker);
7617 va_end(marker);
7618 }
7619
/*
 *   Log a warning with the current token's text as the parameter data,
 *   suitable for use with a "%.*s" display format entry 
 */
void CTcTokenizer::log_warning_curtok(int errnum)
{
    /*
     *   display the warning message, passing "%.*s" parameter data for
     *   the current token text: an integer giving the length of the token
     *   text, and a pointer to the token text 
     */
    log_error_or_warning_curtok(TC_SEV_WARNING, errnum);
}
7633
7634 /*
7635 * Log and throw an internal error
7636 */
throw_internal_error(int errnum,...)7637 void CTcTokenizer::throw_internal_error(int errnum, ...)
7638 {
7639 va_list marker;
7640
7641 /* display the message */
7642 va_start(marker, errnum);
7643 G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7644 TC_SEV_INTERNAL, errnum, marker);
7645 va_end(marker);
7646
7647 /* throw the generic internal error, since we've logged this */
7648 err_throw(TCERR_INTERNAL_ERROR);
7649 }
7650
7651 /*
7652 * Log and throw a fatal error
7653 */
throw_fatal_error(int errnum,...)7654 void CTcTokenizer::throw_fatal_error(int errnum, ...)
7655 {
7656 va_list marker;
7657
7658 /* display the message */
7659 va_start(marker, errnum);
7660 G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7661 TC_SEV_FATAL, errnum, marker);
7662 va_end(marker);
7663
7664 /* throw the generic fatal error, since we've logged this */
7665 err_throw(TCERR_FATAL_ERROR);
7666 }
7667
/*
 *   display a string value (counted, not null-terminated) 
 */
void CTcTokenizer::msg_str(const char *str, size_t len) const
{
    /* display the string through the host interface */
    G_hostifc->print_msg("%.*s", (int)len, str);
}
7676
/*
 *   display a numeric value 
 */
void CTcTokenizer::msg_long(long val) const
{
    /* display the number through the host interface */
    G_hostifc->print_msg("%ld", val);
}
7685
7686 /* ------------------------------------------------------------------------ */
7687 /*
7688 * Tokenizer Input Stream implementation
7689 */
7690
7691 /*
7692 * create a token input stream
7693 */
CTcTokStream(CTcTokFileDesc * desc,CTcSrcObject * src,CTcTokStream * parent,int charset_error,int init_if_level)7694 CTcTokStream::CTcTokStream(CTcTokFileDesc *desc, CTcSrcObject *src,
7695 CTcTokStream *parent, int charset_error,
7696 int init_if_level)
7697 {
7698 /* remember the underlying source file */
7699 src_ = src;
7700
7701 /* remember the file descriptor */
7702 desc_ = desc;
7703
7704 /* remember the containing stream */
7705 parent_ = parent;
7706
7707 /* the next line to read is line number 1 */
7708 next_linenum_ = 1;
7709
7710 /* remember if there was a #charset error */
7711 charset_error_ = charset_error;
7712
7713 /* we're not in a comment yet */
7714 in_comment_ = FALSE;
7715
7716 /* remember the starting #if level */
7717 init_if_level_ = init_if_level;
7718
7719 #if 0 // #pragma C is not currently used
7720 /*
7721 * start out in parent's pragma C mode, or in non-C mode if we have
7722 * no parent
7723 */
7724 if (parent != 0)
7725 pragma_c_ = parent->is_pragma_c();
7726 else
7727 pragma_c_ = TRUE;
7728 #endif
7729 }
7730
7731 /*
7732 * delete a token input stream
7733 */
~CTcTokStream()7734 CTcTokStream::~CTcTokStream()
7735 {
7736 /* we own the underlying file, so delete it */
7737 if (src_ != 0)
7738 delete src_;
7739 }
7740
7741 /* ------------------------------------------------------------------------ */
7742 /*
7743 * File Descriptor
7744 */
7745
7746 /*
7747 * Get the length of a string with each instance of the given quote
7748 * character escaped with a backslash. We'll also count the escapes we
7749 * need for each backslash.
7750 */
get_quoted_len(const char * str,wchar_t qu)7751 static size_t get_quoted_len(const char *str, wchar_t qu)
7752 {
7753 utf8_ptr p;
7754 size_t len;
7755
7756 /*
7757 * scan the string for instances of the quote mark; each one adds an
7758 * extra byte to the length needed, since each one requires a
7759 * backslash character to escape the quote mark
7760 */
7761 for (p.set((char *)str), len = strlen(str) ; p.getch() != '\0' ; p.inc())
7762 {
7763 wchar_t ch;
7764
7765 /*
7766 * check to see if this character is quotable - it is quotable if
7767 * it's a backslash or it's the quote character we're escaping
7768 */
7769 ch = p.getch();
7770 if (ch == qu || ch == '\\')
7771 {
7772 /*
7773 * we need to escape this character, so add a byte for the
7774 * backslash we'll need to insert
7775 */
7776 ++len;
7777 }
7778 }
7779
7780 /* return the length we calculated */
7781 return len;
7782 }
7783
7784 /*
7785 * Build a quoted string. Fills in dst with the source string with each
7786 * of the given quote marks and each backslash escaped with a backslash.
7787 * Use get_quoted_len() to determine how much space to allocate for the
7788 * destination buffer.
7789 */
build_quoted_str(char * dstbuf,const char * src,wchar_t qu)7790 static void build_quoted_str(char *dstbuf, const char *src, wchar_t qu)
7791 {
7792 utf8_ptr p;
7793 utf8_ptr dst;
7794
7795 /* scan the source string for escapable characters */
7796 for (p.set((char *)src), dst.set(dstbuf), dst.setch(qu) ;
7797 p.getch() != '\0' ; p.inc())
7798 {
7799 wchar_t ch;
7800
7801 /* get this source character */
7802 ch = p.getch();
7803
7804 /* add a quote if we have a backslash or the quote character */
7805 if (ch == '\\' || ch == qu)
7806 {
7807 /* add a backslash to escape the character */
7808 dst.setch('\\');
7809 }
7810
7811 /* add the character */
7812 dst.setch(ch);
7813 }
7814
7815 /* add the close quote and trailing null */
7816 dst.setch(qu);
7817 dst.setch('\0');
7818 }
7819
/*
 *   create a file descriptor 
 */
CTcTokFileDesc::CTcTokFileDesc(const char *fname, size_t fname_len,
                               int index, CTcTokFileDesc *orig_desc,
                               const char *orig_fname, size_t orig_fname_len)
{
    const char *rootname;

    /* no source pages are allocated yet */
    src_pages_ = 0;
    src_pages_alo_ = 0;

    /* remember the first instance of this filename in the list */
    orig_ = orig_desc;

    /* there's nothing else in our chain yet */
    next_ = 0;

    /* remember my index in the master list */
    index_ = index;

    /* if there's a filename, save a copy of the name */
    fname_ = lib_copy_str(fname, fname_len);

    /* if there's an original filename save it as well */
    orig_fname_ = lib_copy_str(orig_fname, orig_fname_len);

    /*
     *   get the root filename, since we need to build a quoted version of
     *   that as well as of the basic filename 
     */
    rootname = os_get_root_name(fname_);

    /*
     *   Allocate space for the quoted versions of the filename - make room
     *   for the escaped filename text (get_quoted_len counts the escape
     *   backslashes) plus the quotes (one on each end) and a null
     *   terminator byte, hence the "+ 3".  
     */
    dquoted_fname_ = (char *)t3malloc(get_quoted_len(fname_, '"') + 3);
    squoted_fname_ = (char *)t3malloc(get_quoted_len(fname_, '\'') + 3);
    dquoted_rootname_ = (char *)t3malloc(get_quoted_len(rootname, '"') + 3);
    squoted_rootname_ = (char *)t3malloc(get_quoted_len(rootname, '\'') + 3);

    /* build the quoted version of each name */
    build_quoted_str(dquoted_fname_, fname_, '"');
    build_quoted_str(squoted_fname_, fname_, '\'');
    build_quoted_str(dquoted_rootname_, rootname, '"');
    build_quoted_str(squoted_rootname_, rootname, '\'');
}
7870
7871 /*
7872 * delete the descriptor
7873 */
~CTcTokFileDesc()7874 CTcTokFileDesc::~CTcTokFileDesc()
7875 {
7876 /* delete the filename and original filename strings */
7877 lib_free_str(fname_);
7878 lib_free_str(orig_fname_);
7879
7880 /* delete the quotable filename strings */
7881 t3free(dquoted_fname_);
7882 t3free(squoted_fname_);
7883 t3free(dquoted_rootname_);
7884 t3free(squoted_rootname_);
7885
7886 /* delete each source page we've allocated */
7887 if (src_pages_ != 0)
7888 {
7889 size_t i;
7890
7891 /* go through the index array and delete each allocated page */
7892 for (i = 0 ; i < src_pages_alo_ ; ++i)
7893 {
7894 /* if this page was allocated, delete it */
7895 if (src_pages_[i] != 0)
7896 t3free(src_pages_[i]);
7897 }
7898
7899 /* delete the source page index array */
7900 t3free(src_pages_);
7901 }
7902 }
7903
/*
 *   Source page structure.  Each page tracks a fixed-size block of
 *   consecutive source lines.  
 */
const size_t TCTOK_SRC_PAGE_CNT = 1024;
struct CTcTokSrcPage
{
    /*
     *   Array of line entries on this page.  Each entry is zero if it
     *   hasn't been assigned yet, and contains the absolute image file
     *   address of the generated code for the source line if it has been
     *   assigned.  
     */
    ulong ofs[TCTOK_SRC_PAGE_CNT];
};
7918
7919
7920 /*
7921 * Add a source line
7922 */
add_source_line(ulong linenum,ulong line_addr)7923 void CTcTokFileDesc::add_source_line(ulong linenum, ulong line_addr)
7924 {
7925 size_t page_idx;
7926 size_t idx;
7927
7928 /* get the index of the page containing this source line */
7929 page_idx = linenum / TCTOK_SRC_PAGE_CNT;
7930
7931 /* get the index of the entry within the page */
7932 idx = linenum % TCTOK_SRC_PAGE_CNT;
7933
7934 /*
7935 * determine if our page index table is large enough, and expand it
7936 * if not
7937 */
7938 if (page_idx >= src_pages_alo_)
7939 {
7940 size_t siz;
7941 size_t new_alo;
7942
7943 /* allocate or expand the source pages array */
7944 new_alo = page_idx + 16;
7945 siz = new_alo * sizeof(src_pages_[0]);
7946 if (src_pages_ == 0)
7947 src_pages_ = (CTcTokSrcPage **)t3malloc(siz);
7948 else
7949 src_pages_ = (CTcTokSrcPage **)t3realloc(src_pages_, siz);
7950
7951 /* clear the new part */
7952 memset(src_pages_ + src_pages_alo_, 0,
7953 (new_alo - src_pages_alo_) * sizeof(src_pages_[0]));
7954
7955 /* remember the new allocation size */
7956 src_pages_alo_ = new_alo;
7957 }
7958
7959 /* if this page isn't allocated, do so now */
7960 if (src_pages_[page_idx] == 0)
7961 {
7962 /* allocate the new page */
7963 src_pages_[page_idx] = (CTcTokSrcPage *)
7964 t3malloc(sizeof(CTcTokSrcPage));
7965
7966 /* clear it */
7967 memset(src_pages_[page_idx], 0, sizeof(CTcTokSrcPage));
7968 }
7969
7970 /*
7971 * if this source line entry has been previously set, don't change
7972 * it; otherwise, store the new setting
7973 */
7974 if (src_pages_[page_idx]->ofs[idx] == 0)
7975 src_pages_[page_idx]->ofs[idx] = line_addr;
7976 }
7977
7978 /*
7979 * Enumerate source lines
7980 */
enum_source_lines(void (* cbfunc)(void *,ulong,ulong),void * cbctx)7981 void CTcTokFileDesc::enum_source_lines(void (*cbfunc)(void *, ulong, ulong),
7982 void *cbctx)
7983 {
7984 size_t page_idx;
7985 CTcTokSrcPage **pg;
7986
7987 /* loop over all of the pages */
7988 for (page_idx = 0, pg = src_pages_ ; page_idx < src_pages_alo_ ;
7989 ++page_idx, ++pg)
7990 {
7991 size_t i;
7992 ulong linenum;
7993 ulong *p;
7994
7995 /* if this page is not populated, skip it */
7996 if (*pg == 0)
7997 continue;
7998
7999 /* calculate the starting line number for this page */
8000 linenum = page_idx * TCTOK_SRC_PAGE_CNT;
8001
8002 /* loop over the entries on this page */
8003 for (i = 0, p = (*pg)->ofs ; i < TCTOK_SRC_PAGE_CNT ;
8004 ++i, ++p, ++linenum)
8005 {
8006 /* if this entry has been set, call the callback */
8007 if (*p != 0)
8008 (*cbfunc)(cbctx, linenum, *p);
8009 }
8010 }
8011 }
8012
8013 /* ------------------------------------------------------------------------ */
8014 /*
8015 * #define symbol table hash entry
8016 */
8017
/*
 *   create an entry 
 */
CTcHashEntryPpDefine::CTcHashEntryPpDefine(const textchar_t *str, size_t len,
                                           int copy, int has_args, int argc,
                                           int has_varargs,
                                           const char **argv,
                                           const size_t *argvlen,
                                           const char *expansion,
                                           size_t expan_len)
    : CTcHashEntryPp(str, len, copy)
{
    /* copy the argument list if necessary */
    has_args_ = has_args;
    has_varargs_ = has_varargs;
    argc_ = argc;
    if (argc != 0)
    {
        int i;

        /* allocate the argument name list */
        argv_ = (char **)t3malloc(argc * sizeof(*argv_));

        /* allocate the parameters hash table */
        params_table_ = new CVmHashTable(16, new CVmHashFuncCS(), TRUE);

        /* allocate the by-index entry list */
        arg_entry_ = (CTcHashEntryPpArg **)
                     t3malloc(argc * sizeof(arg_entry_[0]));

        /* copy the arguments */
        for (i = 0 ; i < argc ; ++i)
        {
            CTcHashEntryPpArg *entry;

            /* copy the argument name */
            argv_[i] = lib_copy_str(argv[i], argvlen[i]);

            /*
             *   Create the hash entry for this parameter.  We'll use
             *   this entry to look up tokens in the expansion text for
             *   matches to the formal names when expanding the macro.
             *   
             *   Note that we'll refer directly to our local copy of the
             *   argument name (hence the FALSE no-copy flag), so we don't
             *   need to make another copy in the hash entry.  
             */
            entry = new CTcHashEntryPpArg(argv_[i], argvlen[i], FALSE, i);
            params_table_->add(entry);

            /* add it to our by-index list */
            arg_entry_[i] = entry;
        }
    }
    else
    {
        /* no arguments - leave all of the parameter structures empty */
        argv_ = 0;
        params_table_ = 0;
        arg_entry_ = 0;
    }

    /* save a private copy of the expansion text */
    expan_ = lib_copy_str(expansion, expan_len);
    expan_len_ = expan_len;
}
8084
8085 /*
8086 * delete
8087 */
~CTcHashEntryPpDefine()8088 CTcHashEntryPpDefine::~CTcHashEntryPpDefine()
8089 {
8090 int i;
8091
8092 /* delete the argument list */
8093 if (argv_ != 0)
8094 {
8095 /* delete each argument string */
8096 for (i = 0 ; i < argc_ ; ++i)
8097 lib_free_str(argv_[i]);
8098
8099 /* delete the argument vector */
8100 t3free(argv_);
8101
8102 /* delete the argument entry list */
8103 t3free(arg_entry_);
8104
8105 /* delete the hash table */
8106 delete params_table_;
8107 }
8108
8109 /* delete the expansion */
8110 lib_free_str(expan_);
8111 }
8112
/*
 *   __LINE__ static buffer - scratch storage shared by the __LINE__
 *   pseudo-macro hash entry for formatting the current line number text
 *   (presumably large enough for any decimal line number - NOTE(review):
 *   single shared buffer, so expansions are not reentrant)
 */
char CTcHashEntryPpLINE::buf_[20];
8117
8118
8119 /* ------------------------------------------------------------------------ */
8120 /*
8121 * Load macro definitions from a file.
8122 */
load_macros_from_file(CVmStream * fp,CTcTokLoadMacErr * err_handler)8123 int CTcTokenizer::load_macros_from_file(CVmStream *fp,
8124 CTcTokLoadMacErr *err_handler)
8125 {
8126 long cnt;
8127 long i;
8128 size_t curarg;
8129 char *argv[TOK_MAX_MACRO_ARGS];
8130 size_t argvlen[TOK_MAX_MACRO_ARGS];
8131 size_t maxarg;
8132 int result;
8133 char *expan;
8134 size_t expmaxlen;
8135
8136 /* we haven't allocated any argument buffers yet */
8137 maxarg = 0;
8138
8139 /* allocate an initial expansion buffer */
8140 expmaxlen = 1024;
8141 expan = (char *)t3malloc(expmaxlen);
8142
8143 /* presume success */
8144 result = 0;
8145
8146 /* read the number of macros */
8147 cnt = fp->read_int4();
8148
8149 /* read each macro */
8150 for (i = 0 ; i < cnt ; ++i)
8151 {
8152 char namebuf[TOK_SYM_MAX_LEN];
8153 size_t namelen;
8154 int flags;
8155 size_t argc;
8156 size_t explen;
8157 CTcHashEntryPp *entry;
8158 int has_args;
8159 int has_varargs;
8160
8161 /* read the name's length */
8162 namelen = fp->read_uint2();
8163 if (namelen > sizeof(namebuf))
8164 {
8165 /* log an error through the handler */
8166 err_handler->log_error(1);
8167
8168 /* give up - we can't read any more of the file */
8169 result = 1;
8170 goto done;
8171 }
8172
8173 /* read the name */
8174 fp->read_bytes(namebuf, namelen);
8175
8176 /* read and decode the flags */
8177 flags = fp->read_uint2();
8178 has_args = ((flags & 1) != 0);
8179 has_varargs = ((flags & 2) != 0);
8180
8181 /* read the number of arguments, and read each argument */
8182 argc = fp->read_uint2();
8183 for (curarg = 0 ; curarg < argc ; ++curarg)
8184 {
8185 /* read the length, and make sure it's valid */
8186 argvlen[curarg] = fp->read_uint2();
8187 if (argvlen[curarg] > TOK_SYM_MAX_LEN)
8188 {
8189 /* log an error */
8190 err_handler->log_error(2);
8191
8192 /* give up - we can't read any more of the file */
8193 result = 2;
8194 goto done;
8195 }
8196
8197 /*
8198 * if we haven't allocated a buffer for this argument slot yet,
8199 * allocate it now; allocate the buffer at the maximum symbol
8200 * size, so we can reuse the same buffer for an argument of
8201 * other macros we read later
8202 */
8203 while (curarg >= maxarg)
8204 argv[maxarg++] = (char *)t3malloc(TOK_SYM_MAX_LEN);
8205
8206 /* read the argument text */
8207 fp->read_bytes(argv[curarg], argvlen[curarg]);
8208 }
8209
8210 /* read the expansion size */
8211 explen = (size_t)fp->read_int4();
8212
8213 /* expand the expansion buffer if necessary */
8214 if (explen > expmaxlen)
8215 {
8216 /*
8217 * overshoot a bit, so that we won't have to reallocate again
8218 * if we find a slightly larger expansion for a future macro
8219 */
8220 expmaxlen = explen + 512;
8221
8222 /* allocate the new buffer */
8223 expan = (char *)t3realloc(expan, expmaxlen);
8224 }
8225
8226 /* read the expansion */
8227 fp->read_bytes(expan, explen);
8228
8229 /*
8230 * Before we create the entry, check to see if there's an existing
8231 * entry with the same name.
8232 */
8233 entry = find_define(namebuf, namelen);
8234 if (entry != 0)
8235 {
8236 /*
8237 * We have another entry. If the entry is exactly the same,
8238 * then we can simply skip the current entry, because we simply
8239 * want to keep one copy of each macro that's defined
8240 * identically in mutiple compilation macros. If the entry is
8241 * different from the new one, delete both - a macro which
8242 * appears in two or more compilation units with different
8243 * meanings is NOT a global macro, and thus we can't include it
8244 * in the debugging records.
8245 */
8246 if (entry->is_pseudo()
8247 || entry->has_args() != has_args
8248 || entry->has_varargs() != has_varargs
8249 || entry->get_argc() != (int)argc
8250 || entry->get_expan_len() != explen
8251 || memcmp(entry->get_expansion(), expan, explen) != 0)
8252 {
8253 /*
8254 * The existing entry is different from the new entry, so
8255 * the macro has different meanings in different
8256 * compilation units, hence we cannot keep *either*
8257 * definition in the debug records. Delete the existing
8258 * macro, and do not create the new macro. If the existing
8259 * macro is a pseudo-macro, keep the old one (since it's
8260 * provided by the compiler itself), but still discard the
8261 * new one.
8262 */
8263 if (!entry->is_pseudo())
8264 undefine(namebuf, namelen);
8265 }
8266 else
8267 {
8268 /*
8269 * The new entry is identical to the old one, so keep it.
8270 * We only need one copy of the entry, though, so simply
8271 * keep the old one - there's no need to create a new entry
8272 * for the object file data.
8273 */
8274 }
8275 }
8276 else
8277 {
8278 /*
8279 * There's no existing macro with the same name, so create a
8280 * new entry based on the object file data.
8281 */
8282 entry = new CTcHashEntryPpDefine(namebuf, namelen, TRUE,
8283 has_args, argc, has_varargs,
8284 (const char **)argv, argvlen,
8285 expan, explen);
8286
8287 /* add it to the preprocessor's macro symbol table */
8288 add_define(entry);
8289 }
8290 }
8291
8292 done:
8293 /* free the argument buffers we allocated */
8294 for (curarg = 0 ; curarg < maxarg ; ++curarg)
8295 t3free(argv[curarg]);
8296
8297 /* free the expansion buffer */
8298 t3free(expan);
8299
8300 /* success */
8301 return result;
8302 }
8303
8304 /* ------------------------------------------------------------------------ */
/*
 *   Callback context for writing enumerated #define symbols to a file.
 *   Passed as the opaque context pointer to write_macros_cb() while
 *   enumerating the macro table.
 */
struct write_macro_ctx_t
{
    /* object file we're writing to */
    CVmFile *fp;

    /* number of symbols written so far (used to back-patch the count) */
    unsigned long cnt;
};
8316
8317 /*
8318 * Enumeration callback for writing the #define symbols to a file
8319 */
write_macros_cb(void * ctx0,CTcHashEntryPp * entry)8320 static void write_macros_cb(void *ctx0, CTcHashEntryPp *entry)
8321 {
8322 write_macro_ctx_t *ctx = (write_macro_ctx_t *)ctx0;
8323 int flags;
8324 int i;
8325 CVmFile *fp = ctx->fp;
8326
8327 /*
8328 * if this is a pseudo-macro (such as __LINE__ or __FILE__), ignore it
8329 * - these macros do not have permanent global definitions, so they're
8330 * not usable in the debugger
8331 */
8332 if (entry->is_pseudo())
8333 return;
8334
8335 /*
8336 * If the macro was ever redefined or undefined, ignore it - the
8337 * debugger can only use truly global macros, which are macros that
8338 * have stable meanings throughout the compilation units where they
8339 * appear (and which do not have different meanings in different
8340 * compilation units, but that's not our concern at the moment). The
8341 * preprocessor keeps an "undef" table of everything undefined
8342 * (explicitly, or implicitly via redefinition), so look up this macro
8343 * in the undef table, and ignore the macro if it we find it.
8344 */
8345 if (G_tok->find_undef(entry->getstr(), entry->getlen()) != 0)
8346 return;
8347
8348 /* count this macro */
8349 ctx->cnt++;
8350
8351 /* write the macro's name */
8352 fp->write_int2(entry->getlen());
8353 fp->write_bytes(entry->getstr(), entry->getlen());
8354
8355 /* write the flag bits */
8356 flags = 0;
8357 if (entry->has_args()) flags |= 1;
8358 if (entry->has_varargs()) flags |= 2;
8359 fp->write_int2(flags);
8360
8361 /* write the number of arguments, and write each argument */
8362 fp->write_int2(entry->get_argc());
8363 for (i = 0 ; i < entry->get_argc() ; ++i)
8364 {
8365 CTcHashEntryPpArg *arg;
8366
8367 /* get the argument */
8368 arg = entry->get_arg_entry(i);
8369
8370 /* write the parameter name */
8371 fp->write_int2(arg->getlen());
8372 fp->write_bytes(arg->getstr(), arg->getlen());
8373 }
8374
8375 /* write the expansion */
8376 fp->write_int4(entry->get_expan_len());
8377 fp->write_bytes(entry->get_expansion(), entry->get_expan_len());
8378 }
8379
8380 /*
8381 * Write all #define symbols to a file, for debugging purposes. Writes
8382 * only symbols that have never been undefined or redefined, since the
8383 * debugger can only make use of global symbols (i.e., symbols with
8384 * consistent meanings through all compilation units in which they
8385 * appear).
8386 */
write_macros_to_file_for_debug(CVmFile * fp)8387 void CTcTokenizer::write_macros_to_file_for_debug(CVmFile *fp)
8388 {
8389 long pos;
8390 long endpos;
8391 write_macro_ctx_t ctx;
8392
8393 /* write a placeholder for the symbol count */
8394 pos = fp->get_pos();
8395 fp->write_int4(0);
8396
8397 /* write the symbols */
8398 ctx.fp = fp;
8399 ctx.cnt = 0;
8400 enum_defines(&write_macros_cb, &ctx);
8401
8402 /* go back and fix up the symbol count */
8403 endpos = fp->get_pos();
8404 fp->set_pos(pos);
8405 fp->write_int4(ctx.cnt);
8406
8407 /* seek back to where we left off */
8408 fp->set_pos(endpos);
8409 }
8410