1 /*
2 * (C) Copyright 2001-2015 Diomidis Spinellis
3 *
4 * This file is part of CScout.
5 *
6 * CScout is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * CScout is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with CScout. If not, see <http://www.gnu.org/licenses/>.
18 *
19 *
20 * A preprocessor lexical token.
21 * The getnext() method for these tokens converts characters into tokens.
22 *
23 */
24
25 #ifndef PLTOKEN_
26 #define PLTOKEN_
27
28 #include "debug.h"
29 #include "tokid.h"
30 #include "ptoken.h"
31 #include "call.h"
32
33 class Fchar;
34
/*
 * The C-preprocessor context.
 * The context in effect influences lexical analysis.
 */
enum e_cpp_context {
	// Normal operation
	cpp_normal,
	// Set while processing a #include directive
	// (will recognize ABSFNAME and PATHFNAME)
	cpp_include,
	// Set while processing a #define directive
	// (will recognize CONCAT)
	cpp_define
};
41
42 // A C preprocessor lexical token
43 class Pltoken: public Ptoken {
44 private:
45 static enum e_cpp_context context;
46 // Allow line comments starting with a semicolon (inside Microsoft asm)
47 static bool semicolon_line_comments;
48 // Echo characters read on standard output
49 static bool echo;
50 template <class C> void update_parts(Tokid& base, Tokid& follow, const C& c0);
51 Tokid t; // Token identifier for delimeters: comma, bracket
52 template <class C> void getnext_analyze();
53 public:
54 template <class C> void getnext();
55 template <class C> void getnext_nospc();
set_context(enum e_cpp_context con)56 static void set_context(enum e_cpp_context con) { context = con; };
set_semicolon_line_comments(bool v)57 static void set_semicolon_line_comments(bool v) { semicolon_line_comments = v; }
get_delimiter_tokid()58 Tokid get_delimiter_tokid() const { return t; }
set_echo()59 static void set_echo() { echo = true; }
clear_echo()60 static void clear_echo() { echo = false; }
61 };
62
63 /*
64 * Given "base" that marks the beginning of a token
65 * "follow" that follows its characters as they are read, and
66 * c0, a new character read, check that the new character
67 * is indeed agreeing with the value of "follow".
68 * If not update "parts" and reset "follow" and "base".
69 */
70 template <class C>
71 void
update_parts(Tokid & base,Tokid & follow,const C & c0)72 Pltoken::update_parts(Tokid& base, Tokid& follow, const C& c0)
73 {
74 if (c0.get_tokid() != follow) {
75 // Discontinuity; save the Tokids we have
76 dequeTpart new_tokids = base.constituents(follow - base);
77 copy(new_tokids.begin(), new_tokids.end(),
78 back_inserter(parts));
79 follow = base = c0.get_tokid();
80 }
81 }
82
83 /*
84 * Construct a preprocessor lexical token using Fchar as the class to
85 * provide character input.
86 * Substitute comments with a single space.
87 * Continous character sequences that can be associated with an identifier
88 * letters and digits are given an equivalence class.
89 *
90 * Should probably be declared as export, but VC 5 can not handle it.
91 */
92 template <class C>
93 void
getnext_analyze()94 Pltoken::getnext_analyze()
95 {
96 int n;
97 C c0, c1;
98 Tokid base, follow;
99 dequeTpart new_tokids;
100
101 parts.clear();
102 c0.getnext();
103 switch (c0.get_char()) {
104 /*
105 * Single character C operators and punctuators
106 * ANSI 3.1.5 p. 32 and 3.1.6 p. 33
107 */
108 case '\n': // Needed for processing directives
109 context = cpp_normal;
110 // FALLTRHOUGH
111 case '[': case ']': case '(': case ')':
112 case '~': case '?': case ':': case ',':
113 case '{': case '}':
114 case EOF:
115 val = (char)(code = c0.get_char());
116 t = c0.get_tokid();
117 break;
118 case ';':
119 if (semicolon_line_comments)
120 goto line_comment;
121 else
122 val = (char)(code = c0.get_char());
123 break;
124 /*
125 * Double character C tokens with more than 2 different outcomes
126 * (e.g. +, +=, ++)
127 */
128 case '+':
129 c0.getnext();
130 switch (c0.get_char()) {
131 case '+': val = "++"; code = INC_OP; break;
132 case '=': val = "+="; code = ADD_ASSIGN; break;
133 default: C::putback(c0); val = (char)(code = '+'); break;
134 }
135 break;
136 case '-':
137 c0.getnext();
138 switch (c0.get_char()) {
139 case '-': val = "--"; code = DEC_OP; break;
140 case '=': val = "-="; code = SUB_ASSIGN; break;
141 case '>': val = "->"; code = PTR_OP; break;
142 default: C::putback(c0); val = (char)(code = '-'); break;
143 }
144 break;
145 case '&':
146 c0.getnext();
147 switch (c0.get_char()) {
148 case '&': val = "&&"; code = AND_OP; break;
149 case '=': val = "&="; code = AND_ASSIGN; break;
150 default: C::putback(c0); val = (char)(code = '&'); break;
151 }
152 break;
153 case '|':
154 c0.getnext();
155 switch (c0.get_char()) {
156 case '|': val = "||"; code = OR_OP; break;
157 case '=': val = "|="; code = OR_ASSIGN; break;
158 default: C::putback(c0); val = (char)(code = '|'); break;
159 }
160 break;
161 /* Simple single/double character tokens (e.g. !, !=) */
162 case '!':
163 c0.getnext();
164 if (c0.get_char() == '=') {
165 val = "!=";
166 code = NE_OP;
167 } else {
168 C::putback(c0);
169 val = (char)(code = '!');
170 }
171 break;
172 case '%':
173 c0.getnext();
174 if (c0.get_char() == '=') {
175 val = "%=";
176 code = MOD_ASSIGN;
177 break;
178 }
179 // Yacc tokens
180 if (Fchar::is_yacc_file()) {
181 extern bool parse_yacc_defs;
182
183 if (c0.get_char() == '%') {
184 val = "%%";
185 code = YMARK;
186 break;
187 }
188 if (c0.get_char() == '{') {
189 val = "%{";
190 code = YLCURL;
191 parse_yacc_defs = false;
192 break;
193 }
194 if (c0.get_char() == '}') {
195 val = "%}";
196 code = YRCURL;
197 parse_yacc_defs = true;
198 break;
199 }
200 }
201 C::putback(c0);
202 val = (char)(code = '%');
203 break;
204 case '*':
205 c0.getnext();
206 if (c0.get_char() == '=') {
207 val = "*=";
208 code = MUL_ASSIGN;
209 } else {
210 C::putback(c0);
211 val = (char)(code = '*');
212 }
213 break;
214 case '=':
215 c0.getnext();
216 if (c0.get_char() == '=') {
217 val = "==";
218 code = EQ_OP;
219 } else {
220 C::putback(c0);
221 val = (char)(code = '=');
222 }
223 break;
224 case '^':
225 c0.getnext();
226 if (c0.get_char() == '=') {
227 val = "^=";
228 code = XOR_ASSIGN;
229 } else {
230 C::putback(c0);
231 val = (char)(code = '^');
232 }
233 break;
234 case '#': /* C-preprocessor token only */
235 // incpp = true; // Overkill, but good enough
236 c0.getnext();
237 if (context == cpp_define && c0.get_char() == '#') {
238 val = "##";
239 code = CPP_CONCAT;
240 } else {
241 C::putback(c0);
242 val = (char)(code = '#');
243 }
244 break;
245 /* Operators starting with < or > */
246 case '>':
247 c0.getnext();
248 switch (c0.get_char()) {
249 case '=': /* >= */
250 code = GE_OP;
251 val = ">=";
252 break;
253 case '>':
254 c0.getnext();
255 if (c0.get_char() == '=') { /* >>= */
256 code = RIGHT_ASSIGN;
257 val = ">>=";
258 } else { /* >> */
259 C::putback(c0);
260 code = RIGHT_OP;
261 val = ">>";
262 }
263 break;
264 default: /* > */
265 C::putback(c0);
266 val = (char)(code = '>');
267 break;
268 }
269 break;
270 case '<':
271 if (context == cpp_include) {
272 // C preprocessor #include <filename>
273 val = "";
274 for (;;) {
275 c0.getnext();
276 if (c0.get_char() == EOF || c0.get_char() == '>')
277 break;
278 val += c0.get_char();
279 }
280 code = PATHFNAME;
281 break;
282 }
283 c0.getnext();
284 switch (c0.get_char()) {
285 case '=': /* <= */
286 code = LE_OP;
287 val = "<=";
288 break;
289 case '<':
290 c0.getnext();
291 if (c0.get_char() == '=') { /* <<= */
292 code = LEFT_ASSIGN;
293 val = "<<=";
294 } else { /* << */
295 C::putback(c0);
296 code = LEFT_OP;
297 val = "<<";
298 }
299 break;
300 default: /* < */
301 C::putback(c0);
302 val = (char)(code = '<');
303 break;
304 }
305 break;
306 /* Comments and / operators */
307 case '/':
308 c0.getnext();
309 switch (c0.get_char()) {
310 case '=': /* /= */
311 code = DIV_ASSIGN;
312 val = "/=";
313 break;
314 case '*': /* Block comment */
315 // Do not delete comments from expanded macros
316 if (!C::is_file_source())
317 goto no_comment;
318 c0.getnext();
319 for (;;) {
320 while (c0.get_char() != '*' && c0.get_char() != EOF) {
321 c0.getnext();
322 }
323 c0.getnext();
324 if (c0.get_char() == EOF)
325 /*
326 * @error
327 * The end of file was reached while
328 * processing a block comment
329 */
330 Error::error(E_FATAL, "EOF in comment");
331 if (c0.get_char() == '/')
332 break;
333 }
334 code = SPACE;
335 val = " ";
336 break;
337 case '/': /* Line comment */
338 // Do not delete comments from expanded macros
339 if (!C::is_file_source())
340 goto no_comment;
341 line_comment:
342 do {
343 c0.getnext();
344 } while (c0.get_char() != '\n' && c0.get_char() != EOF);
345 C::putback(c0);
346 code = SPACE;
347 val = " ";
348 break;
349 no_comment:
350 /*
351 * Comment in an expanded macro.
352 * Could issue a warning here, but Microsoft uses such
353 * line comments, so we handle it in pdtoken.cpp
354 */
355 default: /* / */
356 C::putback(c0);
357 val = (char)(code = '/');
358 break;
359 }
360 break;
361 case '.': /* . and ... */
362 follow = base = c0.get_tokid();
363 c0.getnext();
364 follow++;
365 if (isdigit(c0.get_char())) {
366 update_parts(base, follow, c0);
367 val = string(".") + (char)(c0.get_char());
368 if (DP())
369 cout << "val=[" << val << "]\n";
370 goto pp_number;
371 }
372 if (c0.get_char() != '.') {
373 C::putback(c0);
374 val = (char)(code = '.');
375 break;
376 }
377 c1.getnext();
378 if (c1.get_char() != '.') {
379 C::putback(c1);
380 C::putback(c0);
381 val = (char)(code = '.');
382 break;
383 }
384 code = ELLIPSIS;
385 val = "...";
386 break;
387 /*
388 * Convert whitespace into a single token; whitespace is needed
389 * by the C preprocessor.
390 */
391 case ' ': case '\t': case '\v': case '\f': case '\r':
392 do {
393 c0.getnext();
394 } while (c0.get_char() != EOF && c0.get_char() != '\n' && isspace(c0.get_char()));
395 C::putback(c0);
396 val = " ";
397 code = SPACE;
398 break;
399 /* Could be a long character or string */
400 case 'L':
401 c1.getnext();
402 switch (c1.get_char()) {
403 case '\'':
404 goto char_literal;
405 case '"':
406 goto string_literal;
407 default:
408 C::putback(c1);
409 goto identifier;
410 }
411 case '_': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
412 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm':
413 case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't':
414 case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
415 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
416 case 'H': case 'I': case 'J': case 'K': case 'M': case 'N': case 'O':
417 case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V':
418 case 'W': case 'X': case 'Y': case 'Z':
419 identifier:
420 {
421 val = c0.get_char();
422 Tokid base = c0.get_tokid();
423 if (DP()) cout << "Base:" << base << "\n";
424 Tokid follow = base;
425 for (;;) {
426 c0.getnext();
427 follow++;
428 if (c0.get_char() == EOF ||
429 (!isalnum(c0.get_char()) && c0.get_char() != '_'))
430 break;
431 update_parts(base, follow, c0);
432 val += c0.get_char();
433 }
434 C::putback(c0);
435 dequeTpart new_tokids = base.constituents(follow - base);
436 copy(new_tokids.begin(), new_tokids.end(), back_inserter(parts));
437 // Later it will become TYPE_NAME, IDENTIFIER, or reserved word
438 code = IDENTIFIER;
439 }
440 break;
441 case '\'':
442 char_literal:
443 n = 0;
444 val = "";
445 for (;;) {
446 c0.getnext();
447 if (c0.get_char() == '\\') {
448 // Consume one character after the backslash
449 // ... to deal with the '\'' problem
450 val += '\\';
451 c0.getnext();
452 if (c0.get_char() == EOF) {
453 /*
454 * @error
455 * The end of file was reached while
456 * processing a character literal:
457 * a single quote was never closed
458 */
459 Error::error(E_ERR, "End of file in character literal");
460 break;
461 }
462 val += c0.get_char();
463 // We will deal with escapes later
464 n++;
465 continue;
466 }
467 if (c0.get_char() == EOF || c0.get_char() == '\'')
468 break;
469 val += c0.get_char();
470 n++;
471 }
472 code = CHAR_LITERAL;
473 if (n == 0)
474 /*
475 * @error
476 * Character lirerals must include a character
477 */
478 Error::error(E_WARN, "Empty character literal");
479 if (c0.get_char() == EOF)
480 Error::error(E_ERR, "End of file in character literal");
481 break;
482 case '"':
483 string_literal:
484 val = "";
485 if (context == cpp_include) {
486 // C preprocessor #include "filename"
487 for (;;) {
488 c0.getnext();
489 if (c0.get_char() == EOF || c0.get_char() == '\n' || c0.get_char() == '"')
490 break;
491 val += c0.get_char();
492 }
493 code = ABSFNAME;
494 break;
495 }
496 for (;;) {
497 c0.getnext();
498 if (c0.get_char() == '\\') {
499 val += '\\';
500 // Consume one character after the backslash
501 c0.getnext();
502 if (c0.get_char() == EOF || c0.get_char() == '\n')
503 break;
504 val += c0.get_char();
505 // We will deal with escapes later
506 continue;
507 }
508 if (c0.get_char() == EOF || c0.get_char() == '\n' || c0.get_char() == '"')
509 break;
510 val += c0.get_char();
511 }
512 code = STRING_LITERAL;
513 if (c0.get_char() == EOF)
514 /*
515 * @error
516 * The end of the file was reached while
517 * processing a string
518 */
519 Error::error(E_ERR, "End of file in string literal");
520 if (c0.get_char() == '\n')
521 /*
522 * @error
523 * The end of the line was reached while
524 * processing a string
525 */
526 Error::error(E_ERR, "End of line in string literal");
527 break;
528 /* Various numbers */
529 case '0': case '1': case '2': case '3': case '4':
530 case '5': case '6': case '7': case '8': case '9':
531 val = c0.get_char();
532 follow = base = c0.get_tokid();
533 pp_number:
534 for (;;) {
535 c0.getnext();
536 follow++;
537 if (c0.get_char() == 'e' || c0.get_char() == 'E') {
538 update_parts(base, follow, c0);
539 val += c0.get_char();
540 c0.getnext();
541 follow++;
542 if (c0.get_char() == '+' || c0.get_char() == '-') {
543 update_parts(base, follow, c0);
544 val += c0.get_char();
545 continue;
546 }
547 }
548 if (c0.get_char() == EOF ||
549 (!isalnum(c0.get_char()) && c0.get_char() != '.' && c0.get_char() != '_'))
550 break;
551 update_parts(base, follow, c0);
552 val += c0.get_char();
553 }
554 C::putback(c0);
555 new_tokids = base.constituents(follow - base);
556 copy(new_tokids.begin(), new_tokids.end(), back_inserter(parts));
557 code = PP_NUMBER;
558 break;
559 default:
560 val = (char)(code = c0.get_char());
561 }
562 Call::process_token(*this);
563 // For metric counting filter out whitespace
564 if (code != SPACE && code != '\n')
565 Metrics::call_metrics(&Metrics::add_pptoken);
566 if (DP()) cout << "getnext returns: " << *this << "\n";
567 }
568
569 template <class C>
570 void
getnext()571 Pltoken::getnext()
572 {
573 getnext_analyze<C>();
574 if (echo)
575 cout << get_c_val();
576 }
577
578 template <class C>
579 void
getnext_nospc()580 Pltoken::getnext_nospc()
581 {
582 do {
583 getnext<C>();
584 } while (code == SPACE);
585 }
586
#endif // PLTOKEN_
588