1 /*
2  * (C) Copyright 2001-2015 Diomidis Spinellis
3  *
4  * This file is part of CScout.
5  *
6  * CScout is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * CScout is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with CScout.  If not, see <http://www.gnu.org/licenses/>.
18  *
19  *
20  * A preprocessor lexical token.
21  * The getnext() method for these tokens converts characters into tokens.
22  *
23  */
24 
25 #ifndef PLTOKEN_
26 #define PLTOKEN_
27 
28 #include "debug.h"
29 #include "tokid.h"
30 #include "ptoken.h"
31 #include "call.h"
32 
33 class Fchar;
34 
/*
 * Lexical analysis modes selected by the C preprocessor.
 * The active mode determines which context-sensitive tokens
 * the lexical analyzer will recognize.
 */
enum e_cpp_context {
	// Normal operation
	cpp_normal,
	// Set while processing a #include directive
	// (will recognize ABSFNAME and PATHFNAME)
	cpp_include,
	// Set while processing a #define directive
	// (will recognize CONCAT)
	cpp_define
};
41 
42 // A C preprocessor lexical token
43 class Pltoken: public Ptoken {
44 private:
45 	static enum e_cpp_context context;
46 	// Allow line comments starting with a semicolon (inside Microsoft asm)
47 	static bool semicolon_line_comments;
48 	// Echo characters read on standard output
49 	static bool echo;
50 	template <class C> void update_parts(Tokid& base, Tokid& follow, const C& c0);
51 	Tokid t;		// Token identifier for delimeters: comma, bracket
52 	template <class C> void getnext_analyze();
53 public:
54 	template <class C> void getnext();
55 	template <class C> void getnext_nospc();
set_context(enum e_cpp_context con)56 	static void set_context(enum e_cpp_context con) { context = con; };
set_semicolon_line_comments(bool v)57 	static void set_semicolon_line_comments(bool v) { semicolon_line_comments = v; }
get_delimiter_tokid()58 	Tokid get_delimiter_tokid() const { return t; }
set_echo()59 	static void set_echo() { echo = true; }
clear_echo()60 	static void clear_echo() { echo = false; }
61 };
62 
63 /*
64  * Given "base" that marks the beginning of a token
65  * "follow" that follows its characters as they are read, and
66  * c0, a new character read, check that the new character
67  * is indeed agreeing with the value of "follow".
68  * If not update "parts" and reset "follow" and "base".
69  */
70 template <class C>
71 void
update_parts(Tokid & base,Tokid & follow,const C & c0)72 Pltoken::update_parts(Tokid& base, Tokid& follow, const C& c0)
73 {
74 	if (c0.get_tokid() != follow) {
75 		// Discontinuity; save the Tokids we have
76 		dequeTpart new_tokids = base.constituents(follow - base);
77 		copy(new_tokids.begin(), new_tokids.end(),
78 		     back_inserter(parts));
79 		follow = base = c0.get_tokid();
80 	}
81 }
82 
83 /*
84  * Construct a preprocessor lexical token using Fchar as the class to
85  * provide character input.
86  * Substitute comments with a single space.
87  * Continous character sequences that can be associated with an identifier
88  * letters and digits are given an equivalence class.
89  *
90  * Should probably be declared as export, but VC 5 can not handle it.
91  */
92 template <class C>
93 void
getnext_analyze()94 Pltoken::getnext_analyze()
95 {
96 	int n;
97 	C c0, c1;
98 	Tokid base, follow;
99 	dequeTpart new_tokids;
100 
101 	parts.clear();
102 	c0.getnext();
103 	switch (c0.get_char()) {
104 	/*
105 	 * Single character C operators and punctuators
106 	 * ANSI 3.1.5 p. 32 and 3.1.6 p. 33
107 	 */
108 	case '\n':	// Needed for processing directives
109 		context = cpp_normal;
110 		// FALLTRHOUGH
111 	case '[': case ']': case '(': case ')':
112 	case '~': case '?': case ':': case ',':
113 	case '{': case '}':
114 	case EOF:
115 		val = (char)(code = c0.get_char());
116 		t = c0.get_tokid();
117 		break;
118 	case ';':
119 		if (semicolon_line_comments)
120 			goto line_comment;
121 		else
122 			val = (char)(code = c0.get_char());
123 		break;
124 	/*
125 	 * Double character C tokens with more than 2 different outcomes
126 	 * (e.g. +, +=, ++)
127 	 */
128 	case '+':
129 		c0.getnext();
130 		switch (c0.get_char()) {
131 		case '+': val = "++"; code = INC_OP; break;
132 		case '=': val = "+="; code = ADD_ASSIGN; break;
133 		default:  C::putback(c0); val = (char)(code = '+'); break;
134 		}
135 		break;
136 	case '-':
137 		c0.getnext();
138 		switch (c0.get_char()) {
139 		case '-': val = "--"; code = DEC_OP; break;
140 		case '=': val = "-="; code = SUB_ASSIGN; break;
141 		case '>': val = "->"; code = PTR_OP; break;
142 		default:  C::putback(c0); val = (char)(code = '-'); break;
143 		}
144 		break;
145 	case '&':
146 		c0.getnext();
147 		switch (c0.get_char()) {
148 		case '&': val = "&&"; code = AND_OP; break;
149 		case '=': val = "&="; code = AND_ASSIGN; break;
150 		default:  C::putback(c0); val = (char)(code = '&'); break;
151 		}
152 		break;
153 	case '|':
154 		c0.getnext();
155 		switch (c0.get_char()) {
156 		case '|': val = "||"; code = OR_OP; break;
157 		case '=': val = "|="; code = OR_ASSIGN; break;
158 		default:  C::putback(c0); val = (char)(code = '|'); break;
159 		}
160 		break;
161 	/* Simple single/double character tokens (e.g. !, !=) */
162 	case '!':
163 		c0.getnext();
164 		if (c0.get_char() == '=') {
165 			val = "!=";
166 			code = NE_OP;
167 		} else {
168 			C::putback(c0);
169 			val = (char)(code = '!');
170 		}
171 		break;
172 	case '%':
173 		c0.getnext();
174 		if (c0.get_char() == '=') {
175 			val = "%=";
176 			code = MOD_ASSIGN;
177 			break;
178 		}
179 		// Yacc tokens
180 		if (Fchar::is_yacc_file()) {
181 			extern bool parse_yacc_defs;
182 
183 			if (c0.get_char() == '%') {
184 				val = "%%";
185 				code = YMARK;
186 				break;
187 			}
188 			if (c0.get_char() == '{') {
189 				val = "%{";
190 				code = YLCURL;
191 				parse_yacc_defs = false;
192 				break;
193 			}
194 			if (c0.get_char() == '}') {
195 				val = "%}";
196 				code = YRCURL;
197 				parse_yacc_defs = true;
198 				break;
199 			}
200 		}
201 		C::putback(c0);
202 		val = (char)(code = '%');
203 		break;
204 	case '*':
205 		c0.getnext();
206 		if (c0.get_char() == '=') {
207 			val = "*=";
208 			code = MUL_ASSIGN;
209 		} else {
210 			C::putback(c0);
211 			val = (char)(code = '*');
212 		}
213 		break;
214 	case '=':
215 		c0.getnext();
216 		if (c0.get_char() == '=') {
217 			val = "==";
218 			code = EQ_OP;
219 		} else {
220 			C::putback(c0);
221 			val = (char)(code = '=');
222 		}
223 		break;
224 	case '^':
225 		c0.getnext();
226 		if (c0.get_char() == '=') {
227 			val = "^=";
228 			code = XOR_ASSIGN;
229 		} else {
230 			C::putback(c0);
231 			val = (char)(code = '^');
232 		}
233 		break;
234 	case '#':	/* C-preprocessor token only */
235 		// incpp = true;		// Overkill, but good enough
236 		c0.getnext();
237 		if (context == cpp_define && c0.get_char() == '#') {
238 			val = "##";
239 			code = CPP_CONCAT;
240 		} else {
241 			C::putback(c0);
242 			val = (char)(code = '#');
243 		}
244 		break;
245 	/* Operators starting with < or > */
246 	case '>':
247 		c0.getnext();
248 		switch (c0.get_char()) {
249 		case '=':				/* >= */
250 			code = GE_OP;
251 			val = ">=";
252 			break;
253 		case '>':
254 			c0.getnext();
255 			if (c0.get_char() == '=') {	/* >>= */
256 				code = RIGHT_ASSIGN;
257 				val = ">>=";
258 			} else {			/* >> */
259 				C::putback(c0);
260 				code = RIGHT_OP;
261 				val = ">>";
262 			}
263 			break;
264 		default:				/* > */
265 			C::putback(c0);
266 			val = (char)(code = '>');
267 			break;
268 		}
269 		break;
270 	case '<':
271 		if (context == cpp_include) {
272 			// C preprocessor #include <filename>
273 			val = "";
274 			for (;;) {
275 				c0.getnext();
276 				if (c0.get_char() == EOF || c0.get_char() == '>')
277 					break;
278 				val += c0.get_char();
279 			}
280 			code = PATHFNAME;
281 			break;
282 		}
283 		c0.getnext();
284 		switch (c0.get_char()) {
285 		case '=':				/* <= */
286 			code = LE_OP;
287 			val = "<=";
288 			break;
289 		case '<':
290 			c0.getnext();
291 			if (c0.get_char() == '=') {	/* <<= */
292 				code = LEFT_ASSIGN;
293 				val = "<<=";
294 			} else {			/* << */
295 				C::putback(c0);
296 				code = LEFT_OP;
297 				val = "<<";
298 			}
299 			break;
300 		default:				/* < */
301 			C::putback(c0);
302 			val = (char)(code = '<');
303 			break;
304 		}
305 		break;
306 	/* Comments and / operators */
307 	case '/':
308 		c0.getnext();
309 		switch (c0.get_char()) {
310 		case '=':				/* /= */
311 			code = DIV_ASSIGN;
312 			val = "/=";
313 			break;
314 		case '*':				/* Block comment */
315 			// Do not delete comments from expanded macros
316 			if (!C::is_file_source())
317 				goto no_comment;
318 			c0.getnext();
319 			for (;;) {
320 				while (c0.get_char() != '*' && c0.get_char() != EOF) {
321 					c0.getnext();
322 				}
323 				c0.getnext();
324 				if (c0.get_char() == EOF)
325 					/*
326 					 * @error
327 					 * The end of file was reached while
328 					 * processing a block comment
329 					 */
330 					Error::error(E_FATAL, "EOF in comment");
331 				if (c0.get_char() == '/')
332 					break;
333 			}
334 			code = SPACE;
335 			val = " ";
336 			break;
337 		case '/':				/* Line comment */
338 			// Do not delete comments from expanded macros
339 			if (!C::is_file_source())
340 				goto no_comment;
341 		line_comment:
342 			do {
343 				c0.getnext();
344 			} while (c0.get_char() != '\n' && c0.get_char() != EOF);
345 			C::putback(c0);
346 			code = SPACE;
347 			val = " ";
348 			break;
349 		no_comment:
350 			/*
351 			 * Comment in an expanded macro.
352 			 * Could issue a warning here, but Microsoft uses such
353 			 * line comments, so we handle it in pdtoken.cpp
354 			 */
355 		default:				/* / */
356 			C::putback(c0);
357 			val = (char)(code = '/');
358 			break;
359 		}
360 		break;
361 	case '.':	/* . and ... */
362 		follow = base = c0.get_tokid();
363 		c0.getnext();
364 		follow++;
365 		if (isdigit(c0.get_char())) {
366 			update_parts(base, follow, c0);
367 			val = string(".") + (char)(c0.get_char());
368 			if (DP())
369 				cout << "val=[" << val << "]\n";
370 			goto pp_number;
371 		}
372 		if (c0.get_char() != '.') {
373 			C::putback(c0);
374 			val = (char)(code = '.');
375 			break;
376 		}
377 		c1.getnext();
378 		if (c1.get_char() != '.') {
379 			C::putback(c1);
380 			C::putback(c0);
381 			val = (char)(code = '.');
382 			break;
383 		}
384 		code = ELLIPSIS;
385 		val = "...";
386 		break;
387 	/*
388 	 * Convert whitespace into a single token; whitespace is needed
389 	 * by the C preprocessor.
390 	 */
391 	case ' ': case '\t': case '\v': case '\f': case '\r':
392 		do {
393 			c0.getnext();
394 		} while (c0.get_char() != EOF && c0.get_char() != '\n' && isspace(c0.get_char()));
395 		C::putback(c0);
396 		val = " ";
397 		code = SPACE;
398 		break;
399 	/* Could be a long character or string */
400 	case 'L':
401 		c1.getnext();
402 		switch (c1.get_char()) {
403 		case '\'':
404 			goto char_literal;
405 		case '"':
406 			goto string_literal;
407 		default:
408 			C::putback(c1);
409 			goto identifier;
410 		}
411 	case '_': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
412 	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm':
413 	case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't':
414 	case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
415 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
416 	case 'H': case 'I': case 'J': case 'K': case 'M': case 'N': case 'O':
417 	case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V':
418 	case 'W': case 'X': case 'Y': case 'Z':
419 	identifier:
420 		{
421 		val = c0.get_char();
422 		Tokid base = c0.get_tokid();
423 		if (DP()) cout << "Base:" << base << "\n";
424 		Tokid follow = base;
425 		for (;;) {
426 			c0.getnext();
427 			follow++;
428 			if (c0.get_char() == EOF ||
429 		            (!isalnum(c0.get_char()) && c0.get_char() != '_'))
430 		         	break;
431 			update_parts(base, follow, c0);
432 			val += c0.get_char();
433 		}
434 		C::putback(c0);
435 		dequeTpart new_tokids = base.constituents(follow - base);
436 		copy(new_tokids.begin(), new_tokids.end(), back_inserter(parts));
437 		// Later it will become TYPE_NAME, IDENTIFIER, or reserved word
438 		code = IDENTIFIER;
439 		}
440 		break;
441 	case '\'':
442 	char_literal:
443 		n = 0;
444 		val = "";
445 		for (;;) {
446 			c0.getnext();
447 			if (c0.get_char() == '\\') {
448 				// Consume one character after the backslash
449 				// ... to deal with the '\'' problem
450 				val += '\\';
451 				c0.getnext();
452 				if (c0.get_char() == EOF) {
453 					/*
454 					 * @error
455 					 * The end of file was reached while
456 					 * processing a character literal:
457 					 * a single quote was never closed
458 					 */
459 					Error::error(E_ERR, "End of file in character literal");
460 					break;
461 				}
462 				val += c0.get_char();
463 				// We will deal with escapes later
464 				n++;
465 				continue;
466 			}
467 			if (c0.get_char() == EOF || c0.get_char() == '\'')
468 				break;
469 			val += c0.get_char();
470 			n++;
471 		}
472 		code = CHAR_LITERAL;
473 		if (n == 0)
474 			/*
475 			 * @error
476 			 * Character lirerals must include a character
477 			 */
478 			Error::error(E_WARN, "Empty character literal");
479 		if (c0.get_char() == EOF)
480 			Error::error(E_ERR, "End of file in character literal");
481 		break;
482 	case '"':
483 	string_literal:
484 		val = "";
485 		if (context == cpp_include) {
486 			// C preprocessor #include "filename"
487 			for (;;) {
488 				c0.getnext();
489 				if (c0.get_char() == EOF || c0.get_char() == '\n' || c0.get_char() == '"')
490 					break;
491 				val += c0.get_char();
492 			}
493 			code = ABSFNAME;
494 			break;
495 		}
496 		for (;;) {
497 			c0.getnext();
498 			if (c0.get_char() == '\\') {
499 				val += '\\';
500 				// Consume one character after the backslash
501 				c0.getnext();
502 				if (c0.get_char() == EOF || c0.get_char() == '\n')
503 					break;
504 				val += c0.get_char();
505 				// We will deal with escapes later
506 				continue;
507 			}
508 			if (c0.get_char() == EOF || c0.get_char() == '\n' || c0.get_char() == '"')
509 				break;
510 			val += c0.get_char();
511 		}
512 		code = STRING_LITERAL;
513 		if (c0.get_char() == EOF)
514 			/*
515 			 * @error
516 			 * The end of the file was reached while
517 			 * processing a string
518 			 */
519 			Error::error(E_ERR, "End of file in string literal");
520 		if (c0.get_char() == '\n')
521 			/*
522 			 * @error
523 			 * The end of the line was reached while
524 			 * processing a string
525 			 */
526 			Error::error(E_ERR, "End of line in string literal");
527 		break;
528 	/* Various numbers */
529 	case '0': case '1': case '2': case '3': case '4':
530 	case '5': case '6': case '7': case '8': case '9':
531 		val = c0.get_char();
532 		follow = base = c0.get_tokid();
533 	pp_number:
534 		for (;;) {
535 			c0.getnext();
536 			follow++;
537 			if (c0.get_char() == 'e' || c0.get_char() == 'E') {
538 				update_parts(base, follow, c0);
539 				val += c0.get_char();
540 				c0.getnext();
541 				follow++;
542 				if (c0.get_char() == '+' || c0.get_char() == '-') {
543 					update_parts(base, follow, c0);
544 					val += c0.get_char();
545 					continue;
546 				}
547 			}
548 			if (c0.get_char() == EOF ||
549 		            (!isalnum(c0.get_char()) && c0.get_char() != '.' && c0.get_char() != '_'))
550 		         	break;
551 			update_parts(base, follow, c0);
552 			val += c0.get_char();
553 		}
554 		C::putback(c0);
555 		new_tokids = base.constituents(follow - base);
556 		copy(new_tokids.begin(), new_tokids.end(), back_inserter(parts));
557 		code = PP_NUMBER;
558 		break;
559 	default:
560 		val = (char)(code = c0.get_char());
561 	}
562 	Call::process_token(*this);
563 	// For metric counting filter out whitespace
564 	if (code != SPACE && code != '\n')
565 		Metrics::call_metrics(&Metrics::add_pptoken);
566 	if (DP()) cout << "getnext returns: " << *this << "\n";
567 }
568 
569 template <class C>
570 void
getnext()571 Pltoken::getnext()
572 {
573 	getnext_analyze<C>();
574 	if (echo)
575 		cout << get_c_val();
576 }
577 
578 template <class C>
579 void
getnext_nospc()580 Pltoken::getnext_nospc()
581 {
582 	do {
583 		getnext<C>();
584 	} while (code == SPACE);
585 }
586 
#endif // PLTOKEN_
588