1 /*
2  * (c) Thomas Pornin 1999 - 2002
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  * 4. The name of the authors may not be used to endorse or promote
13  *    products derived from this software without specific prior written
14  *    permission.
15  *
16  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
22  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
23  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
24  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
25  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
26  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  */
29 
30 #include "tune.h"
31 #include <stdio.h>
32 #include <string.h>
33 #include <stddef.h>
34 #include <limits.h>
35 #include "ucppi.h"
36 #include "mem.h"
37 #ifdef UCPP_MMAP
38 #include <unistd.h>
39 #include <sys/types.h>
40 #include <sys/mman.h>
41 #endif
42 
/*
 * Character classes for description of the automaton.
 * The characters used for representing classes should not appear
 * explicitly in an automaton rule: init_cppm() always interprets them
 * as the class, never as the literal character.
 *
 * init_cppm() expands each class as follows:
 *   SPC  space, \t, \v, \f (and UNBREAKABLE_SPACE when it is defined
 *        and fits in the table)
 *   ALP  A-Z, a-z and _
 *   NUM  0-9
 *   ANY  every character of the table, plus the end-of-input entry
 *   VCH  the end-of-input entry only
 */
#define SPC	' '	/* whitespace characters */
#define ALP	'Z'	/* A-Z, a-z, _ */
#define NUM	'9'	/* 0-9 */
#define ANY	'Y'	/* any character */
#define VCH	'F'	/* void character (for end of input) */
53 
/*
 * An automaton action is an int: the low eight bits hold a state number
 * or a token type, and up to three modifier bits are set above them.
 *   STO (0x100): the currently read string is a complete token
 *   FRZ (0x200): the currently read character must be kept and read again
 *   PUT (0x400): the currently read character must be added to the string
 *
 * X(v) sets modifier X on v, ttX(v) is non-zero if v carries modifier X,
 * and noMOD(v) strips all modifiers.
 */
#define MOD_MK		0xFF
#define noMOD(x)	((x) & MOD_MK)
#define STO(x)		((x) | 0x100)
#define ttSTO(x)	((x) & 0x100)
#define FRZ(x)		((x) | 0x200)
#define ttFRZ(x)	((x) & 0x200)
#define PUT(x)		((x) | 0x400)
#define ttPUT(x)	((x) & 0x400)
68 
/*
 * Automaton states. Order is important:
 *  -- the values before MSTATE are the real states; each of them owns a
 *     row of cppm[][] and an entry of cppm_vch[], so MSTATE is also the
 *     number of rows of these tables
 *  -- the values after MSTATE are pseudo-states which read_token()
 *     intercepts (it tests for values >= MSTATE) in order to report an
 *     error or to substitute another action
 *  -- S_COMMENT to S_COMMENT5 must remain contiguous, because CMT()
 *     tests membership with a range comparison
 */
enum {
	S_START, S_SPACE, S_BANG, S_STRING, S_STRING2, S_COLON,
	S_SHARP, S_PCT, S_PCT2, S_PCT3, S_AMPER, S_CHAR, S_CHAR2, S_STAR,
	S_PLUS, S_MINUS, S_DOT, S_DOT2, S_SLASH, S_NUMBER, S_NUMBER2, S_LT,
	S_LT2, S_EQ, S_GT, S_GT2, S_CIRC, S_PIPE, S_BACKSLASH,
	S_COMMENT, S_COMMENT2, S_COMMENT3, S_COMMENT4, S_COMMENT5,
	S_NAME, S_NAME_BS, S_LCHAR,
	MSTATE,
	S_ILL, S_DDOT, S_DDSHARP, S_BS, S_ROGUE_BS, S_BEHEAD, S_DECAY,
	S_TRUNC, S_TRUNCC, S_OUCH
};

/* non-zero if x is one of the comment states */
#define CMT(x)		((x) >= S_COMMENT && (x) <= S_COMMENT5)

/* number of input slots (characters or character classes) in one rule */
#define CMCR	2
85 
/*
 * This is the description of the automaton. It is not used "as is"
 * but copied at execution time into a table.
 *
 * Each rule reads: "in state `state', upon reading one of the characters
 * (or character classes) listed in `input', apply `new_state'", where
 * `new_state' is a state number or a token type, possibly combined with
 * the STO(), FRZ() and PUT() modifiers. Unused input slots are 0.
 *
 * init_cppm() applies the rules in the order in which they are listed,
 * and an ANY rule fills the whole row of its state; hence the ANY rule
 * of a state must come first, and the more specific rules which follow
 * override it.
 *
 * The list is terminated by a rule whose first input slot is 0.
 *
 * To my utmost displeasure, there are a few hacks in read_token()
 * (which uses the transformed automaton) about the special handling
 * of slashes, sharps, and the letter L.
 */
static struct machine_state {
	int state;			/* state in which the rule applies */
	unsigned char input[CMCR];	/* characters or classes, 0 if unused */
	int new_state;			/* action: next state or token, with flags */
} cppms[] = {
	/* S_START is the generic beginning state */
	{ S_START,	{ ANY },	S_ILL			},
#ifdef SEMPER_FIDELIS
	{ S_START,	{ SPC },	PUT(S_SPACE)		},
#else
	{ S_START,	{ SPC },	S_SPACE			},
#endif
	{ S_START,	{ '\n' },	STO(NEWLINE)		},
	{ S_START,	{ '!' },	S_BANG			},
	{ S_START,	{ '"' },	PUT(S_STRING)		},
	{ S_START,	{ '#' },	S_SHARP			},
	{ S_START,	{ '%' },	S_PCT			},
	{ S_START,	{ '&' },	S_AMPER			},
	{ S_START,	{ '\'' },	PUT(S_CHAR)		},
	{ S_START,	{ '(' },	STO(LPAR)		},
	{ S_START,	{ ')' },	STO(RPAR)		},
	{ S_START,	{ '*' },	S_STAR			},
	{ S_START,	{ '+' },	S_PLUS			},
	{ S_START,	{ ',' },	STO(COMMA)		},
	{ S_START,	{ '-' },	S_MINUS			},
	{ S_START,	{ '.' },	PUT(S_DOT)		},
#ifdef SEMPER_FIDELIS
	{ S_START,	{ '/' },	PUT(S_SLASH)		},
#else
	{ S_START,	{ '/' },	S_SLASH			},
#endif
	{ S_START,	{ NUM },	PUT(S_NUMBER)		},
	{ S_START,	{ ':' },	S_COLON			},
	{ S_START,	{ ';' },	STO(SEMIC)		},
	{ S_START,	{ '<' },	S_LT			},
	{ S_START,	{ '=' },	S_EQ			},
	{ S_START,	{ '>' },	S_GT			},
	{ S_START,	{ '?' },	STO(QUEST)		},
	{ S_START,	{ ALP },	PUT(S_NAME)		},
	/* must follow the ALP rule, which it overrides for 'L' */
	{ S_START,	{ 'L' },	PUT(S_LCHAR)		},
	{ S_START,	{ '[' },	STO(LBRK)		},
	{ S_START,	{ ']' },	STO(RBRK)		},
	{ S_START,	{ '^' },	S_CIRC			},
	{ S_START,	{ '{' },	STO(LBRA)		},
	{ S_START,	{ '|' },	S_PIPE			},
	{ S_START,	{ '}' },	STO(RBRA)		},
	{ S_START,	{ '~' },	STO(NOT)		},
	{ S_START,	{ '\\' },	S_BACKSLASH		},

	/* after a space */
	{ S_SPACE,	{ ANY },	FRZ(STO(NONE))		},
#ifdef SEMPER_FIDELIS
	{ S_SPACE,	{ SPC },	PUT(S_SPACE)		},
#else
	{ S_SPACE,	{ SPC },	S_SPACE			},
#endif

	/* after a ! */
	{ S_BANG,	{ ANY },	FRZ(STO(LNOT))		},
	{ S_BANG,	{ '=' },	STO(NEQ)		},

	/* after a " */
	{ S_STRING,	{ ANY },	PUT(S_STRING)		},
	{ S_STRING,	{ VCH },	FRZ(S_TRUNC)		},
	{ S_STRING,	{ '\n' },	FRZ(S_BEHEAD)		},
	{ S_STRING,	{ '\\' },	PUT(S_STRING2)		},
	{ S_STRING,	{ '"' },	PUT(STO(STRING))	},

	/* after a \ in a string constant */
	{ S_STRING2,	{ ANY },	PUT(S_STRING)		},
	{ S_STRING2,	{ VCH },	FRZ(S_TRUNC)		},

	/* after a # */
	{ S_SHARP,	{ ANY },	FRZ(STO(SHARP))		},
	{ S_SHARP,	{ '#' },	STO(DSHARP)		},

	/* after a : */
	{ S_COLON,	{ ANY },	FRZ(STO(COLON))		},
	{ S_COLON,	{ '>' },	STO(DIG_RBRK)		},

	/* after a % */
	{ S_PCT,	{ ANY },	FRZ(STO(PCT))		},
	{ S_PCT,	{ '=' },	STO(ASPCT)		},
	{ S_PCT,	{ '>' },	STO(DIG_RBRA)		},
	{ S_PCT,	{ ':' },	S_PCT2			},

	/* after a %: */
	{ S_PCT2,	{ ANY },	FRZ(STO(DIG_SHARP))	},
	{ S_PCT2,	{ '%' },	S_PCT3			},

	/* after a %:% */
	{ S_PCT3,	{ ANY },	FRZ(S_DDSHARP)		},
	{ S_PCT3,	{ ':' },	STO(DIG_DSHARP)		},

	/* after a & */
	{ S_AMPER,	{ ANY },	FRZ(STO(AND))		},
	{ S_AMPER,	{ '=' },	STO(ASAND)		},
	{ S_AMPER,	{ '&' },	STO(LAND)		},

	/* after a ' */
	{ S_CHAR,	{ ANY },	PUT(S_CHAR)		},
	{ S_CHAR,	{ VCH },	FRZ(S_TRUNC)		},
	{ S_CHAR,	{ '\'' },	PUT(STO(CHAR))		},
	{ S_CHAR,	{ '\\' },	PUT(S_CHAR2)		},

	/* after a \ in a character constant
	   useful only for '\'' */
	{ S_CHAR2,	{ ANY },	PUT(S_CHAR)		},
	{ S_CHAR2,	{ VCH },	FRZ(S_TRUNC)		},

	/* after a * */
	{ S_STAR,	{ ANY },	FRZ(STO(STAR))		},
	{ S_STAR,	{ '=' },	STO(ASSTAR)		},

	/* after a + */
	{ S_PLUS,	{ ANY },	FRZ(STO(PLUS))		},
	{ S_PLUS,	{ '+' },	STO(PPLUS)		},
	{ S_PLUS,	{ '=' },	STO(ASPLUS)		},

	/* after a - */
	{ S_MINUS,	{ ANY },	FRZ(STO(MINUS))		},
	{ S_MINUS,	{ '-' },	STO(MMINUS)		},
	{ S_MINUS,	{ '=' },	STO(ASMINUS)		},
	{ S_MINUS,	{ '>' },	STO(ARROW)		},

	/* after a . */
	{ S_DOT,	{ ANY },	FRZ(STO(DOT))		},
	{ S_DOT,	{ NUM },	PUT(S_NUMBER)		},
	{ S_DOT,	{ '.' },	S_DOT2			},

	/* after .. */
	{ S_DOT2,	{ ANY },	FRZ(S_DDOT)		},
	{ S_DOT2,	{ '.' },	STO(MDOTS)		},

	/* after a / */
	{ S_SLASH,	{ ANY },	FRZ(STO(SLASH))		},
	{ S_SLASH,	{ '=' },	STO(ASSLASH)		},
#ifdef SEMPER_FIDELIS
	{ S_SLASH,	{ '*' },	PUT(S_COMMENT)		},
	{ S_SLASH,	{ '/' },	PUT(S_COMMENT5)		},
#else
	{ S_SLASH,	{ '*' },	S_COMMENT		},
	{ S_SLASH,	{ '/' },	S_COMMENT5		},
#endif
	/*
	 * There is a little hack in read_token() to disable
	 * this last rule, if C++ (C99) comments are not enabled.
	 */

	/* after a number */
	{ S_NUMBER,	{ ANY },	FRZ(STO(NUMBER))	},
	{ S_NUMBER,	{ ALP, NUM },	PUT(S_NUMBER)		},
	{ S_NUMBER,	{ '.' },	PUT(S_NUMBER)		},
	{ S_NUMBER,	{ 'E', 'e' },	PUT(S_NUMBER2)		},
	{ S_NUMBER,	{ 'P', 'p' },	PUT(S_NUMBER2)		},

	/* after an exponent letter in a number: a sign is accepted */
	{ S_NUMBER2,	{ ANY },	FRZ(STO(NUMBER))	},
	{ S_NUMBER2,	{ ALP, NUM },	PUT(S_NUMBER)		},
	{ S_NUMBER2,	{ '+', '-' },	PUT(S_NUMBER)		},

	/* after a < */
	{ S_LT,		{ ANY },	FRZ(STO(LT))		},
	{ S_LT,		{ '=' },	STO(LEQ)		},
	{ S_LT,		{ '<' },	S_LT2			},
	{ S_LT,		{ ':' },	STO(DIG_LBRK)		},
	{ S_LT,		{ '%' },	STO(DIG_LBRA)		},

	/* after << */
	{ S_LT2,	{ ANY },	FRZ(STO(LSH))		},
	{ S_LT2,	{ '=' },	STO(ASLSH)		},

	/* after a > */
	{ S_GT,		{ ANY },	FRZ(STO(GT))		},
	{ S_GT,		{ '=' },	STO(GEQ)		},
	{ S_GT,		{ '>' },	S_GT2			},

	/* after >> */
	{ S_GT2,	{ ANY },	FRZ(STO(RSH))		},
	{ S_GT2,	{ '=' },	STO(ASRSH)		},

	/* after a = */
	{ S_EQ,		{ ANY },	FRZ(STO(ASGN))		},
	{ S_EQ,		{ '=' },	STO(SAME)		},
#ifdef CAST_OP
	{ S_EQ,		{ '>' },	STO(CAST)		},
#endif

	/* after a \ */
	{ S_BACKSLASH,	{ ANY },	FRZ(S_BS)		},
	{ S_BACKSLASH,	{ 'U', 'u' },	FRZ(S_NAME_BS)		},

	/* after a letter */
	{ S_NAME,	{ ANY },	FRZ(STO(NAME))		},
	{ S_NAME,	{ ALP, NUM },	PUT(S_NAME)		},
	{ S_NAME,	{ '\\' },	S_NAME_BS		},

	/* after a \ in an identifier */
	{ S_NAME_BS,	{ ANY },	FRZ(S_ROGUE_BS)		},
	{ S_NAME_BS,	{ 'u', 'U' },	PUT(S_NAME)		},

	/* after a L */
	{ S_LCHAR,	{ ANY },	FRZ(S_NAME)		},
	{ S_LCHAR,	{ '"' },	PUT(S_STRING)		},
	{ S_LCHAR,	{ '\'' },	PUT(S_CHAR)		},

	/* after a ^ */
	{ S_CIRC,	{ ANY },	FRZ(STO(CIRC))		},
	{ S_CIRC,	{ '=' },	STO(ASCIRC)		},

	/* after a | */
	{ S_PIPE,	{ ANY },	FRZ(STO(OR))		},
	{ S_PIPE,	{ '=' },	STO(ASOR)		},
	{ S_PIPE,	{ '|' },	STO(LOR)		},

	/* after a / and * */
#ifdef SEMPER_FIDELIS
	{ S_COMMENT,	{ ANY },	PUT(S_COMMENT)		},
	{ S_COMMENT,	{ VCH },	FRZ(S_TRUNCC)		},
	{ S_COMMENT,	{ '*' },	PUT(S_COMMENT2)		},

	/* after a * inside a comment */
	{ S_COMMENT2,	{ ANY },	FRZ(S_COMMENT)		},
	{ S_COMMENT2,	{ VCH },	FRZ(S_TRUNCC)		},
	{ S_COMMENT2,	{ '*' },	PUT(S_COMMENT2)		},
	{ S_COMMENT2,	{ '/' },	STO(PUT(COMMENT))	},

	/* after a // */
	{ S_COMMENT5,	{ ANY },	PUT(S_COMMENT5)		},
	{ S_COMMENT5,	{ VCH },	FRZ(S_DECAY)		},
	{ S_COMMENT5,	{ '\n' },	FRZ(STO(COMMENT))	},
#else
	{ S_COMMENT,	{ ANY },	S_COMMENT		},
	{ S_COMMENT,	{ VCH },	FRZ(S_TRUNCC)		},
	{ S_COMMENT,	{ '*' },	S_COMMENT2		},

	/* after a * inside a comment */
	{ S_COMMENT2,	{ ANY },	FRZ(S_COMMENT)		},
	{ S_COMMENT2,	{ VCH },	FRZ(S_TRUNCC)		},
	{ S_COMMENT2,	{ '*' },	S_COMMENT2		},
	{ S_COMMENT2,	{ '/' },	STO(COMMENT)		},

	/* after a // */
	{ S_COMMENT5,	{ ANY },	S_COMMENT5		},
	{ S_COMMENT5,	{ VCH },	FRZ(S_DECAY)		},
	{ S_COMMENT5,	{ '\n' },	FRZ(STO(COMMENT))	},
#endif

	/* dummy end of machine description */
	{ 0,		{ 0 },		0			}
};
337 
/*
 * cppm is the table used to store the automaton: if we are in state s
 * and we read character c, we apply the action cppm[s][c] (jumping to
 * another state, or emitting a token).
 * cppm_vch is the table for the special virtual character "end of input"
 *
 * Both tables have one row per real state (MSTATE rows) and are filled
 * by init_cppm(); they hold S_OUCH wherever no rule applies.
 */
static int cppm[MSTATE][MAX_CHAR_VAL];
static int cppm_vch[MSTATE];
346 
347 /*
348  * init_cppm() fills cppm[][] with the information stored in cppms[].
349  * It must be called before beginning the lexing process.
350  */
init_cppm(void)351 void init_cppm(void)
352 {
353 	int i, j, k, c;
354 	static unsigned char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
355 	static unsigned char lower[] = "abcdefghijklmnopqrstuvwxyz";
356 	unsigned char *cp;
357 
358 	for (i = 0; i < MSTATE; i ++) {
359 		for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[i][j] = S_OUCH;
360 		cppm_vch[i] = S_OUCH;
361 	}
362 	for (i = 0; cppms[i].input[0]; i ++) for (k = 0; k < CMCR; k ++) {
363 		int s = cppms[i].state;
364 		int ns = cppms[i].new_state;
365 
366 		switch (c = cppms[i].input[k]) {
367 		case 0:
368 			break;
369 		case SPC:
370 			/* see space_char() also */
371 			cppm[s][' '] = ns;
372 			cppm[s]['\t'] = ns;
373 			cppm[s]['\v'] = ns;
374 			cppm[s]['\f'] = ns;
375 #ifdef UNBREAKABLE_SPACE
376 			if (MAX_CHAR_VAL > UNBREAKABLE_SPACE)
377 				cppm[s][UNBREAKABLE_SPACE] = ns;
378 #endif
379 			break;
380 		case ALP:
381 			for (cp = upper; *cp; cp ++) cppm[s][(int)*cp] = ns;
382 			for (cp = lower; *cp; cp ++) cppm[s][(int)*cp] = ns;
383 			cppm[s]['_'] = ns;
384 			break;
385 		case NUM:
386 			for (j = '0'; j <= '9'; j ++) cppm[s][j] = ns;
387 			break;
388 		case ANY:
389 			for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[s][j] = ns;
390 			cppm_vch[s] = ns;
391 			break;
392 		case VCH:
393 			cppm_vch[s] = ns;
394 			break;
395 		default:
396 			cppm[s][c] = ns;
397 			break;
398 		}
399 	}
400 }
401 
402 /*
403  * Make some character as equivalent to a letter for identifiers.
404  */
set_identifier_char(int c)405 void set_identifier_char(int c)
406 {
407 	cppm[S_START][c] = PUT(S_NAME);
408 	cppm[S_NAME][c] = PUT(S_NAME);
409 }
410 
411 /*
412  * Remove the "identifier" status from a character.
413  */
unset_identifier_char(int c)414 void unset_identifier_char(int c)
415 {
416 	cppm[S_START][c] = S_ILL;
417 	cppm[S_NAME][c] = FRZ(STO(NAME));
418 }
419 
/*
 * Tell whether c is one of the characters that the automaton handles
 * as whitespace (the SPC class of init_cppm()): space, horizontal tab,
 * vertical tab, form feed and, when defined, UNBREAKABLE_SPACE.
 * Newline is not part of that set. Returns 1 for whitespace, 0 otherwise.
 */
int space_char(int c)
{
	return c == ' '
		|| c == '\t'
		|| c == '\v'
		|| c == '\f'
#ifdef UNBREAKABLE_SPACE
		|| c == UNBREAKABLE_SPACE
#endif
		;
}
429 
430 #ifndef NO_UCPP_BUF
431 /*
432  * our output buffer is full, flush it
433  */
flush_output(struct lexer_state * ls)434 void flush_output(struct lexer_state *ls)
435 {
436 	size_t x = ls->sbuf, y = 0, z;
437 
438 	if (ls->sbuf == 0) return;
439 	do {
440 		z = fwrite(ls->output_buf + y, 1, x, ls->output);
441 		x -= z;
442 		y += z;
443 	} while (z && x > 0);
444 	if (!y) {
445 		error(ls->line, "could not flush output (disk full ?)");
446 		die();
447 	}
448 	ls->sbuf = 0;
449 }
450 #endif
451 
452 /*
453  * Output one character; flush the buffer if needed.
454  * This function should not be called, except by put_char().
455  */
write_char(struct lexer_state * ls,unsigned char c)456 static inline void write_char(struct lexer_state *ls, unsigned char c)
457 {
458 #ifndef NO_UCPP_BUF
459 	ls->output_buf[ls->sbuf ++] = c;
460 	if (ls->sbuf == OUTPUT_BUF_MEMG) flush_output(ls);
461 #else
462 	if (putc((int)c, ls->output) == EOF) {
463 		error(ls->line, "output write error (disk full ?)");
464 		die();
465 	}
466 #endif
467 	if (c == '\n') {
468 		ls->oline ++;
469 	}
470 }
471 
472 /*
473  * schedule a character for output
474  */
put_char(struct lexer_state * ls,unsigned char c)475 void put_char(struct lexer_state *ls, unsigned char c)
476 {
477 	if (ls->flags & KEEP_OUTPUT) write_char(ls, c);
478 }
479 
/*
 * get next raw input character
 *
 * Returns the character (as a non-negative value when read from a
 * file), or -1 when there is no more input.
 *
 * If ls->input is null, the characters are taken from ls->input_string,
 * between the indexes ls->pbuf and ls->ebuf, without any further
 * processing. Otherwise they are read from the file ls->input, and:
 *  -- with the COPY_LINE flag, the current line is recorded in
 *     ls->copy_line (truncated to COPY_LINE_LENGTH - 1 characters)
 *  -- a '\r' is returned as a '\n', and a '\n' which immediately
 *     follows a '\r' is skipped (ls->macfile remembers the '\r')
 */
static inline int read_char(struct lexer_state *ls)
{
	unsigned char c;

	if (!ls->input) {
		/* string input; note that pbuf is incremented even at the end */
		return ((ls->pbuf ++) < ls->ebuf) ?
			ls->input_string[ls->pbuf - 1] : -1;
	}
	while (1) {
#ifndef NO_UCPP_BUF
		if (ls->pbuf == ls->ebuf) {
			/* the input buffer is exhausted: refill it */
#ifdef UCPP_MMAP
			if (ls->from_mmap) {
				/*
				 * the buffer was a memory mapping: release
				 * it and go back to the saved buffer
				 */
				munmap((void *)ls->input_buf, ls->ebuf);
				ls->from_mmap = 0;
				ls->input_buf = ls->input_buf_sav;
			}
#endif
			ls->ebuf = fread(ls->input_buf, 1,
				INPUT_BUF_MEMG, ls->input);
			ls->pbuf = 0;
		}
		/* fread() returned nothing: end of file, or read error */
		if (ls->ebuf == 0) return -1;
		c = ls->input_buf[ls->pbuf ++];
#else
		int x = getc(ls->input);

		if (x == EOF) return -1;
		c = x;
#endif
		if (ls->flags & COPY_LINE) {
			if (c == '\n') {
				/* end of line: terminate the copy, restart */
				ls->copy_line[ls->cli] = 0;
				ls->cli = 0;
			} else if (ls->cli < (COPY_LINE_LENGTH - 1)) {
				/* extra characters of a long line are dropped */
				ls->copy_line[ls->cli ++] = c;
			}
		}
		if (ls->macfile && c == '\n') {
			/* second half of a "\r\n" pair: already reported */
			ls->macfile = 0;
			continue;
		}
		ls->macfile = 0;
		if (c == '\r') {
			/*
			 * We found a '\r'; we handle it as a newline
			 * and ignore the next newline. This should work
			 * with all combinations of Msdos, MacIntosh and
			 * Unix files on these three platforms. On other
			 * platforms, native file formats are always
			 * supported.
			 */
			ls->macfile = 1;
			c = '\n';
		}
		break;
	}
	return c;
}
542 
543 /*
544  * next_fifo_char(), char_lka1() and char_lka2() give a two character
545  * look-ahead on the input stream; this is needed for trigraphs
546  */
next_fifo_char(struct lexer_state * ls)547 static inline int next_fifo_char(struct lexer_state *ls)
548 {
549 	int c;
550 
551 	if (ls->nlka != 0) {
552 		c = ls->lka[0];
553 		ls->lka[0] = ls->lka[1];
554 		ls->nlka --;
555 	} else c = read_char(ls);
556 	return c;
557 }
558 
char_lka1(struct lexer_state * ls)559 static inline int char_lka1(struct lexer_state *ls)
560 {
561 	if (ls->nlka == 0) {
562 		ls->lka[0] = read_char(ls);
563 		ls->nlka ++;
564 	}
565 	return ls->lka[0];
566 }
567 
char_lka2(struct lexer_state * ls)568 static inline int char_lka2(struct lexer_state *ls)
569 {
570 #ifdef AUDIT
571 	if (ls->nlka == 0) ouch("always in motion future is");
572 #endif
573 	if (ls->nlka == 1) {
574 		ls->lka[1] = read_char(ls);
575 		ls->nlka ++;
576 	}
577 	return ls->lka[1];
578 }
579 
/*
 * The nine trigraphs: the sequence "??" followed by `old' stands for
 * the character `new'.
 */
static struct trigraph {
	int old;
	int new;
} trig[9] = {
	{ .old = '=',  .new = '#'  },
	{ .old = '/',  .new = '\\' },
	{ .old = '\'', .new = '^'  },
	{ .old = '(',  .new = '['  },
	{ .old = ')',  .new = ']'  },
	{ .old = '!',  .new = '|'  },
	{ .old = '<',  .new = '{'  },
	{ .old = '>',  .new = '}'  },
	{ .old = '-',  .new = '~'  }
};
593 
/*
 * Returns the next character, after treatment of trigraphs and terminating
 * backslashes. Return value is -1 if there is no more input.
 *
 * The returned character is not consumed: as long as ls->discard is
 * not set (see discard_char()), the same character, remembered in
 * ls->last, is returned again.
 */
static inline int next_char(struct lexer_state *ls)
{
	int c;

	/* the current character has not been discarded yet */
	if (!ls->discard) return ls->last;
	ls->discard = 0;
	do {
		c = next_fifo_char(ls);
		/* check trigraphs */
		if (c == '?' && char_lka1(ls) == '?'
			&& (ls->flags & HANDLE_TRIGRAPHS)) {
			int i, d;

			d = char_lka2(ls);
			for (i = 0; i < 9; i ++) if (d == trig[i].old) {
				if (ls->flags & WARN_TRIGRAPHS) {
					ls->count_trigraphs ++;
				}
				if (ls->flags & WARN_TRIGRAPHS_MORE) {
					warning(ls->line, "trigraph ?""?%c "
						"encountered", d);
				}
				/* consume the two look-ahead characters */
				next_fifo_char(ls);
				next_fifo_char(ls);
				c = trig[i].new;
				break;
			}
		}
		if (c == '\\' && char_lka1(ls) == '\n') {
			/*
			 * backslash-newline: both characters are dropped,
			 * the line is counted, and we look for the next
			 * character
			 */
			ls->line ++;
			next_fifo_char(ls);
		} else if (c == '\r' && char_lka1(ls) == '\n') {
			/*
			 * "\r\n" is returned as a single newline, whose
			 * line is counted here.
			 * NOTE(review): ls->last is not updated on this
			 * path -- confirm that this is intended.
			 */
			ls->line ++;
			next_fifo_char(ls);
			c = '\n';
			return c;
		} else {
			ls->last = c;
			return c;
		}
	} while (1);
}
640 
/*
 * wrapper for next_char(), to be called from outside
 * (used by #error, #include directives)
 *
 * next_char() is static; this function exports it unchanged, with the
 * same return value (-1 if there is no more input).
 */
int grap_char(struct lexer_state *ls)
{
	return next_char(ls);
}
649 
650 /*
651  * Discard the current character, so that the next call to next_char()
652  * will step into the input stream.
653  */
discard_char(struct lexer_state * ls)654 void discard_char(struct lexer_state *ls)
655 {
656 #ifdef AUDIT
657 	if (ls->discard) ouch("overcollecting garbage");
658 #endif
659 	ls->discard = 1;
660 	ls->utf8 = 0;
661 	if (ls->last == '\n') ls->line ++;
662 }
663 
/*
 * Convert an UTF-8 encoded character to a Universal Character Name
 * using \u (or \U when appropriate).
 *
 * utf8 holds the bytes of the encoded character, packed eight bits per
 * byte with the first byte in the most significant position (this is
 * how read_token() accumulates them); a value without its 0x80 bit is
 * taken as the code point itself.
 * buf receives the zero-terminated result and must have room for 11
 * bytes. The result is the character itself for a code point below 128,
 * \uxxxx for a code point up to 0xffff, and \U00xxxxxx above; the
 * hexadecimal digits are in lower case.
 * Returned value is the length of the result, without the final zero.
 */
static int utf8_to_string(unsigned char buf[], unsigned long utf8)
{
	static const char hex[] = "0123456789abcdef";
	unsigned long val;

	if (utf8 & 0x80UL) {
		unsigned long x1, x2, x3, x4;

		/*
		 * x4 is always a continuation byte (6 bits of payload).
		 * x3 and x2 are either a continuation byte or the lead
		 * byte of a 2-byte (resp. 3-byte) sequence; a lead byte
		 * is recognized by its 0x40 bit and carries 5 (resp. 4)
		 * bits. x1 can only be the lead byte of a 4-byte
		 * sequence, which carries 3 bits.
		 */
		x1 = (utf8 >> 24) & 0x07UL;
		x2 = (utf8 >> 16) & 0x7fUL;
		x3 = (utf8 >> 8) & 0x7fUL;
		x4 = utf8 & 0x3fUL;
		if (x2 & 0x40UL) x2 &= 0x0fUL;
		if (x3 & 0x40UL) x3 &= 0x1fUL;
		/* each of the three lower bytes contributes 6 bits */
		val = x4 | (x3 << 6) | (x2 << 12) | (x1 << 18);
	} else val = utf8;

	if (val < 128) {
		buf[0] = val;
		buf[1] = 0;
		return 1;
	} else if (val <= 0xffffUL) {
		buf[0] = '\\';
		buf[1] = 'u';
		buf[2] = hex[(size_t)(val >> 12)];
		buf[3] = hex[(size_t)((val >> 8) & 0xfU)];
		buf[4] = hex[(size_t)((val >> 4) & 0xfU)];
		buf[5] = hex[(size_t)(val & 0xfU)];
		buf[6] = 0;
		return 6;
	}
	/* val is below 0x200000 here, hence val >> 20 is a valid index */
	buf[0] = '\\';
	buf[1] = 'U';
	buf[2] = '0';
	buf[3] = '0';
	buf[4] = hex[(size_t)(val >> 20)];
	buf[5] = hex[(size_t)((val >> 16) & 0xfU)];
	buf[6] = hex[(size_t)((val >> 12) & 0xfU)];
	buf[7] = hex[(size_t)((val >> 8) & 0xfU)];
	buf[8] = hex[(size_t)((val >> 4) & 0xfU)];
	buf[9] = hex[(size_t)(val & 0xfU)];
	buf[10] = 0;
	return 10;
}
712 
713 /*
714  * Scan the identifier and put it in canonical form:
715  *  -- tranform \U0000xxxx into \uxxxx
716  *  -- inside \u and \U, make letters low case
717  *  -- report (some) incorrect use of UCN
718  */
canonize_id(struct lexer_state * ls,char * id)719 static void canonize_id(struct lexer_state *ls, char *id)
720 {
721 	char *c, *d;
722 
723 	for (c = d = id; *c;) {
724 		if (*c == '\\') {
725 			int i;
726 
727 			if (!*(c + 1)) goto canon_error;
728 			if (*(c + 1) == 'U') {
729 				for (i = 0; i < 8 && *(c + i + 2); i ++);
730 				if (i != 8) goto canon_error;
731 				*(d ++) = '\\';
732 				c += 2;
733 				for (i = 0; i < 4 && *(c + i) == '0'; i ++);
734 				if (i == 4) {
735 					*(d ++) = 'u';
736 					c += 4;
737 				} else {
738 					*(d ++) = 'U';
739 					i = 8;
740 				}
741 				for (; i > 0; i --) {
742 					switch (*c) {
743 					case 'A': *(d ++) = 'a'; break;
744 					case 'B': *(d ++) = 'b'; break;
745 					case 'C': *(d ++) = 'c'; break;
746 					case 'D': *(d ++) = 'd'; break;
747 					case 'E': *(d ++) = 'e'; break;
748 					case 'F': *(d ++) = 'f'; break;
749 					default: *(d ++) = *c; break;
750 					}
751 					c ++;
752 				}
753 			} else if (*(c + 1) == 'u') {
754 				for (i = 0; i < 4 && *(c + i + 2); i ++);
755 				if (i != 4) goto canon_error;
756 				*(d ++) = '\\';
757 				*(d ++) = 'u';
758 				c += 2;
759 				for (; i > 0; i --) {
760 					switch (*c) {
761 					case 'A': *(d ++) = 'a'; break;
762 					case 'B': *(d ++) = 'b'; break;
763 					case 'C': *(d ++) = 'c'; break;
764 					case 'D': *(d ++) = 'd'; break;
765 					case 'E': *(d ++) = 'e'; break;
766 					case 'F': *(d ++) = 'f'; break;
767 					default: *(d ++) = *c; break;
768 					}
769 					c ++;
770 				}
771 			} else goto canon_error;
772 			continue;
773 		}
774 		*(d ++) = *(c ++);
775 	}
776 	*d = 0;
777 	return;
778 
779 canon_error:
780 	for (; *c; *(d ++) = *(c ++));
781 	if (ls->flags & WARN_STANDARD) {
782 		warning(ls->line, "malformed identifier with UCN: '%s'", id);
783 	}
784 	*d = 0;
785 }
786 
/*
 * Run the automaton, in order to get the next token.
 * This function should not be called, except by next_token()
 *
 * The token is stored in ls->ctok (type, line and, for the tokens for
 * which S_TOKEN() is true, zero-terminated name). When ls is not used
 * as a pure lexer (LEXER flag not set), the characters which are read
 * are also echoed to the output through put_char().
 *
 * return value: 1 on error, 2 on end-of-file, 0 otherwise.
 */
static inline int read_token(struct lexer_state *ls)
{
	int cstat = S_START, nstat;	/* current state, selected action */
	size_t ltok = 0;		/* length of the token name so far */
	int c, outc = 0, ucn_in_id = 0;
	int shift_state;	/* UTF-8 continuation bytes still expected */
	unsigned long utf8;	/* bytes of the current UTF-8 character */
	long l = ls->line;	/* line on which the token begins */

	ls->ctok->line = l;
	if (ls->pending_token) {
		/*
		 * a previous call produced two tokens at once (see the
		 * pseudo-states below): deliver the second one
		 */
		if ((ls->ctok->type = ls->pending_token) == BUNCH) {
			ls->ctok->name[0] = '\\';
			ls->ctok->name[1] = 0;
		}
		ls->pending_token = 0;
		return 0;
	}
	/* utf8 and shift_state are used only with the UTF8_SOURCE flag */
	if (ls->flags & UTF8_SOURCE) {
		utf8 = ls->utf8;
		shift_state = 0;
	}
	/* emit newlines until the output line number matches the input */
	if (!(ls->flags & LEXER) && (ls->flags & KEEP_OUTPUT))
		for (; ls->line > ls->oline;) put_char(ls, '\n');
	do {
		c = next_char(ls);
		if (c < 0) {
			/* end of input */
			if ((ls->flags & UTF8_SOURCE) && shift_state) {
				if (ls->flags & WARN_STANDARD)
					warning(ls->line, "truncated UTF-8 "
						"character");
				shift_state = 0;
				utf8 = 0;
			}
			/* no token was begun: this is a clean end of file */
			if (cstat == S_START) return 2;
			nstat = cppm_vch[cstat];
		} else {
			if (ls->flags & UTF8_SOURCE) {
				if (shift_state) {
					/* a continuation byte is expected */
					if ((c & 0xc0) != 0x80) {
						if (ls->flags & WARN_STANDARD)
							warning(ls->line,
								"truncated "
								"UTF-8 "
								"character");
						shift_state = 0;
						utf8 = 0;
						c = '_';
					} else {
						utf8 = (utf8 << 8) | c;
						if (-- shift_state) {
							/*
							 * more bytes to come;
							 * the byte is consumed
							 * without calling
							 * discard_char()
							 */
							ls->discard = 1;
							continue;
						}
						/*
						 * the character is complete:
						 * the automaton sees it as
						 * a '_'
						 */
						c = '_';
					}
				} else if ((c & 0xc0) == 0xc0) {
					/*
					 * lead byte: its upper bits give the
					 * number of continuation bytes
					 */
					if ((c & 0x30) == 0x30) {
						shift_state = 3;
					} else if (c & 0x20) {
						shift_state = 2;
					} else {
						shift_state = 1;
					}
					utf8 = c;
					ls->discard = 1;
					continue;
				} else utf8 = 0;
			}
			/* characters beyond the table use column 0 */
			nstat = cppm[cstat][c < MAX_CHAR_VAL ? c : 0];
		}
#ifdef AUDIT
		if (nstat == S_OUCH) {
			ouch("bad move...");
		}
#endif
		/*
		 * disable C++-like comments
		 */
		if (nstat == S_COMMENT5 && !(ls->flags & CPLUSPLUS_COMMENTS))
			nstat = FRZ(STO(SLASH));

		/*
		 * pseudo-states: report the error, or replace the action
		 * with a real one (possibly queuing a second token in
		 * ls->pending_token)
		 */
		if (noMOD(nstat) >= MSTATE && !ttSTO(nstat))
			switch (noMOD(nstat)) {
		case S_ILL:
			if (ls->flags & CCHARSET) {
				error(ls->line, "illegal character '%c'", c);
				return 1;
			}
			nstat = PUT(STO(BUNCH));
			break;
		case S_BS:
			ls->ctok->name[0] = '\\';
			ltok ++;
			nstat = FRZ(STO(BUNCH));
			if (!(ls->flags & LEXER)) put_char(ls, '\\');
			break;
		case S_ROGUE_BS:
			/* a name, then a lone backslash */
			ls->pending_token = BUNCH;
			nstat = FRZ(STO(NAME));
			break;
		case S_DDOT:
			/* ".." is two dots */
			ls->pending_token = DOT;
			nstat = FRZ(STO(DOT));
			break;
		case S_DDSHARP:
			/* "%:%" is a "%:" and a "%" */
			ls->pending_token = PCT;
			nstat = FRZ(STO(DIG_SHARP));
			break;
		case S_BEHEAD:
			error(l, "unfinished string at end of line");
			return 1;
		case S_DECAY:
			warning(l, "unterminated // comment");
			nstat = FRZ(STO(COMMENT));
			break;
		case S_TRUNC:
			error(l, "truncated token");
			return 1;
		case S_TRUNCC:
			error(l, "truncated comment");
			return 1;
#ifdef AUDIT
		case S_OUCH:
			ouch("machine went out of control");
			break;
#endif
		}
		if (!ttFRZ(nstat)) {
			/* the character is consumed */
			discard_char(ls);
			/*
			 * Echo of the character to the output. outc holds
			 * a character whose output is delayed until the
			 * next character tells what it is part of; -1
			 * stands for a delayed "%:" and -2 for "%:%".
			 */
			if (!(ls->flags & LEXER) && ls->condcomp) {
				int z = ttSTO(nstat) ? S_ILL : noMOD(nstat);

				if (cstat == S_NAME || z == S_NAME
					|| ((CMT(cstat) || CMT(z))
					&& (ls->flags & DISCARD_COMMENTS))) {
					outc = 0;
				} else if (z == S_LCHAR || z == S_SLASH
					|| (z == S_SHARP && ls->ltwnl)
					|| (z == S_PCT && ls->ltwnl)
					|| (z == S_BACKSLASH)) {
					outc = c;
				} else if (z == S_PCT2 && ls->ltwnl) {
					outc = -1;
				} else if (z == S_PCT3 && ls->ltwnl) {
					/* we have %:% but this still might
					   not be a %:%: */
					outc = -2;
				} else {
					if (outc < 0) {
						put_char(ls, '%');
						put_char(ls, ':');
						if (outc == -2)
							put_char(ls, '%');
						outc = 0;
					} else if (outc) {
						put_char(ls, outc);
						outc = 0;
					}
					put_char(ls, c);
				}
			}
		} else if (outc == '/' && !(ls->flags & LEXER)
			&& ls->condcomp) {
			/* this is a hack: we need to dump a pending slash */
			put_char(ls, outc);
			outc = 0;
		}
		if (ttPUT(nstat)) {
			/*
			 * the character is part of the token name; wan()
			 * is defined elsewhere -- presumably it appends
			 * one character, growing the buffer as needed
			 */
			if (cstat == S_NAME_BS) {
				/* the backslash itself was not stored yet */
				ucn_in_id = 1;
				wan(ls->ctok->name, ltok, '\\', ls->tknl);
			}
			if ((ls->flags & UTF8_SOURCE) && utf8) {
				/* store the UTF-8 character as an UCN */
				unsigned char buf[11];
				int i, j;

				for (i = 0, j = utf8_to_string(buf, utf8);
					i < j; i ++)
					wan(ls->ctok->name, ltok, buf[i],
						ls->tknl);
				/* if (j > 1) ucn_in_id = 1; */
			} else wan(ls->ctok->name, ltok,
				(unsigned char)c, ls->tknl);
		}
		if (ttSTO(nstat)) {
			/* the token is complete */
			if (S_TOKEN(noMOD(nstat))) {
				wan(ls->ctok->name, ltok,
					(unsigned char)0, ls->tknl);
			}
			ls->ctok->type = noMOD(nstat);
			break;
		}
		cstat = noMOD(nstat);
	} while (1);
	/* a discarded comment is replaced with a single space */
	if (!(ls->flags & LEXER) && (ls->flags & DISCARD_COMMENTS)
			&& ls->ctok->type == COMMENT) put_char(ls, ' ');
	if (ucn_in_id && ls->ctok->type == NAME)
		canonize_id(ls, ls->ctok->name);
	return 0;
}
994 
995 /*
996  * fills ls->ctok with the next token
997  */
next_token(struct lexer_state * ls)998 int next_token(struct lexer_state *ls)
999 {
1000 	if (ls->flags & READ_AGAIN) {
1001 		ls->flags &= ~READ_AGAIN;
1002 		if (!(ls->flags & LEXER)) {
1003 			char *c = S_TOKEN(ls->ctok->type) ?
1004 				ls->ctok->name : token_name(ls->ctok);
1005 			if (ls->ctok->type == OPT_NONE) {
1006 				ls->ctok->type = NONE;
1007 #ifdef SEMPER_FIDELIS
1008 				ls->ctok->name[0] = ' ';
1009 				ls->ctok->name[1] = 0;
1010 #endif
1011 				put_char(ls, ' ');
1012 			} else if (ls->ctok->type != NAME &&
1013 				!(ls->ltwnl && (ls->ctok->type == SHARP
1014 					|| ls->ctok->type == DIG_SHARP)))
1015 				for (; *c; c ++) put_char(ls, *c);
1016 		}
1017 		return 0;
1018 	}
1019 	return read_token(ls);
1020 }
1021