ucpp/ucpp-31b719e/lexer.c

/*
 * (c) Thomas Pornin 1999 - 2002
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. The name of the authors may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include "tune.h"
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <limits.h>
#include "ucppi.h"
#include "mem.h"
#ifdef UCPP_MMAP
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#endif

/*
 * Character classes for description of the automaton.
 * The characters used for representing classes should not appear
 * explicitely in an automaton rule.
 */
#define SPC	' '	/* whitespace characters */
#define ALP	'Z'	/* A-Z, a-z, _ */
#define NUM	'9'	/* 0-9 */
#define ANY	'Y'	/* any character */
#define VCH	'F'	/* void character (for end of input) */

/*
 * flags and macros to test those flags
 * STO: the currently read string is a complete token
 * PUT: the currently read character must be added to the string
 * FRZ: the currently read character must be kept and read again
 */
#define MOD_MK		255
#define noMOD(x)	((x) & 255)
#define STO(x)		((x) | 256)
#define ttSTO(x)	((x) & 256)
#define FRZ(x)		((x) | 512)
#define ttFRZ(x)	((x) & 512)
#define PUT(x)		((x) | 1024)
#define ttPUT(x)	((x) & 1024)

/* order is important */
enum {
	S_START, S_SPACE, S_BANG, S_STRING, S_STRING2, S_COLON,
	S_SHARP, S_PCT, S_PCT2, S_PCT3, S_AMPER, S_CHAR, S_CHAR2, S_STAR,
	S_PLUS, S_MINUS, S_DOT, S_DOT2, S_SLASH, S_NUMBER, S_NUMBER2, S_LT,
	S_LT2, S_EQ, S_GT, S_GT2, S_CIRC, S_PIPE, S_BACKSLASH,
	S_COMMENT, S_COMMENT2, S_COMMENT3, S_COMMENT4, S_COMMENT5,
	S_NAME, S_NAME_BS, S_LCHAR,
	MSTATE,
	S_ILL, S_DDOT, S_DDSHARP, S_BS, S_ROGUE_BS, S_BEHEAD, S_DECAY,
	S_TRUNC, S_TRUNCC, S_OUCH
};

#define CMT(x)		((x) >= S_COMMENT && (x) <= S_COMMENT5)

#define CMCR	2

/*
 * This is the description of the automaton. It is not used "as is"
 * but copied at execution time into a table.
 *
 * To my utmost displeasure, there are a few hacks in read_token()
 * (which uses the transformed automaton) about the special handling
 * of slashes, sharps, and the letter L.
 */
static struct machine_state {
	int state;
	unsigned char input[CMCR];
	int new_state;
} cppms[] = {
	/* S_START is the generic beginning state */
	{ S_START,	{ ANY },	S_ILL			},
#ifdef SEMPER_FIDELIS
	{ S_START,	{ SPC },	PUT(S_SPACE)		},
#else
	{ S_START,	{ SPC },	S_SPACE			},
#endif
	{ S_START,	{ '\n' },	STO(NEWLINE)		},
	{ S_START,	{ '!' },	S_BANG			},
	{ S_START,	{ '"' },	PUT(S_STRING)		},
	{ S_START,	{ '#' },	S_SHARP			},
	{ S_START,	{ '%' },	S_PCT			},
	{ S_START,	{ '&' },	S_AMPER			},
	{ S_START,	{ '\'' },	PUT(S_CHAR)		},
	{ S_START,	{ '(' },	STO(LPAR)		},
	{ S_START,	{ ')' },	STO(RPAR)		},
	{ S_START,	{ '*' },	S_STAR			},
	{ S_START,	{ '+' },	S_PLUS			},
	{ S_START,	{ ',' },	STO(COMMA)		},
	{ S_START,	{ '-' },	S_MINUS			},
	{ S_START,	{ '.' },	PUT(S_DOT)		},
#ifdef SEMPER_FIDELIS
	{ S_START,	{ '/' },	PUT(S_SLASH)		},
#else
	{ S_START,	{ '/' },	S_SLASH			},
#endif
	{ S_START,	{ NUM },	PUT(S_NUMBER)		},
	{ S_START,	{ ':' },	S_COLON			},
	{ S_START,	{ ';' },	STO(SEMIC)		},
	{ S_START,	{ '<' },	S_LT			},
	{ S_START,	{ '=' },	S_EQ			},
	{ S_START,	{ '>' },	S_GT			},
	{ S_START,	{ '?' },	STO(QUEST)		},
	{ S_START,	{ ALP },	PUT(S_NAME)		},
	{ S_START,	{ 'L' },	PUT(S_LCHAR)		},
	{ S_START,	{ '[' },	STO(LBRK)		},
	{ S_START,	{ ']' },	STO(RBRK)		},
	{ S_START,	{ '^' },	S_CIRC			},
	{ S_START,	{ '{' },	STO(LBRA)		},
	{ S_START,	{ '|' },	S_PIPE			},
	{ S_START,	{ '}' },	STO(RBRA)		},
	{ S_START,	{ '~' },	STO(NOT)		},
	{ S_START,	{ '\\' },	S_BACKSLASH		},

	/* after a space */
	{ S_SPACE,	{ ANY },	FRZ(STO(NONE))		},
#ifdef SEMPER_FIDELIS
	{ S_SPACE,	{ SPC },	PUT(S_SPACE)		},
#else
	{ S_SPACE,	{ SPC },	S_SPACE			},
#endif

	/* after a ! */
	{ S_BANG,	{ ANY },	FRZ(STO(LNOT))		},
	{ S_BANG,	{ '=' },	STO(NEQ)		},

	/* after a " */
	{ S_STRING,	{ ANY },	PUT(S_STRING)		},
	{ S_STRING,	{ VCH },	FRZ(S_TRUNC)		},
	{ S_STRING,	{ '\n' },	FRZ(S_BEHEAD)		},
	{ S_STRING,	{ '\\' },	PUT(S_STRING2)		},
	{ S_STRING,	{ '"' },	PUT(STO(STRING))	},

	{ S_STRING2,	{ ANY },	PUT(S_STRING)		},
	{ S_STRING2,	{ VCH },	FRZ(S_TRUNC)		},

	/* after a # */
	{ S_SHARP,	{ ANY },	FRZ(STO(SHARP))		},
	{ S_SHARP,	{ '#' },	STO(DSHARP)		},

	/* after a : */
	{ S_COLON,	{ ANY },	FRZ(STO(COLON))		},
	{ S_COLON,	{ '>' },	STO(DIG_RBRK)		},

	/* after a % */
	{ S_PCT,	{ ANY },	FRZ(STO(PCT))		},
	{ S_PCT,	{ '=' },	STO(ASPCT)		},
	{ S_PCT,	{ '>' },	STO(DIG_RBRA)		},
	{ S_PCT,	{ ':' },	S_PCT2			},

	/* after a %: */
	{ S_PCT2,	{ ANY },	FRZ(STO(DIG_SHARP))	},
	{ S_PCT2,	{ '%' },	S_PCT3			},

	/* after a %:% */
	{ S_PCT3,	{ ANY },	FRZ(S_DDSHARP)		},
	{ S_PCT3,	{ ':' },	STO(DIG_DSHARP)		},

	/* after a & */
	{ S_AMPER,	{ ANY },	FRZ(STO(AND))		},
	{ S_AMPER,	{ '=' },	STO(ASAND)		},
	{ S_AMPER,	{ '&' },	STO(LAND)		},

	/* after a ' */
	{ S_CHAR,	{ ANY },	PUT(S_CHAR)		},
	{ S_CHAR,	{ VCH },	FRZ(S_TRUNC)		},
	{ S_CHAR,	{ '\'' },	PUT(STO(CHAR))		},
	{ S_CHAR,	{ '\\' },	PUT(S_CHAR2)		},

	/* after a \ in a character constant
	   useful only for '\'' */
	{ S_CHAR2,	{ ANY },	PUT(S_CHAR)		},
	{ S_CHAR2,	{ VCH },	FRZ(S_TRUNC)		},

	/* after a * */
	{ S_STAR,	{ ANY },	FRZ(STO(STAR))		},
	{ S_STAR,	{ '=' },	STO(ASSTAR)		},

	/* after a + */
	{ S_PLUS,	{ ANY },	FRZ(STO(PLUS))		},
	{ S_PLUS,	{ '+' },	STO(PPLUS)		},
	{ S_PLUS,	{ '=' },	STO(ASPLUS)		},

	/* after a - */
	{ S_MINUS,	{ ANY },	FRZ(STO(MINUS))		},
	{ S_MINUS,	{ '-' },	STO(MMINUS)		},
	{ S_MINUS,	{ '=' },	STO(ASMINUS)		},
	{ S_MINUS,	{ '>' },	STO(ARROW)		},

	/* after a . */
	{ S_DOT,	{ ANY },	FRZ(STO(DOT))		},
	{ S_DOT,	{ NUM },	PUT(S_NUMBER)		},
	{ S_DOT,	{ '.' },	S_DOT2			},

	/* after .. */
	{ S_DOT2,	{ ANY },	FRZ(S_DDOT)		},
	{ S_DOT2,	{ '.' },	STO(MDOTS)		},

	/* after a / */
	{ S_SLASH,	{ ANY },	FRZ(STO(SLASH))		},
	{ S_SLASH,	{ '=' },	STO(ASSLASH)		},
#ifdef SEMPER_FIDELIS
	{ S_SLASH,	{ '*' },	PUT(S_COMMENT)		},
	{ S_SLASH,	{ '/' },	PUT(S_COMMENT5)		},
#else
	{ S_SLASH,	{ '*' },	S_COMMENT		},
	{ S_SLASH,	{ '/' },	S_COMMENT5		},
#endif
	/*
	 * There is a little hack in read_token() to disable
	 * this last rule, if C++ (C99) comments are not enabled.
	 */

	/* after a number */
	{ S_NUMBER,	{ ANY },	FRZ(STO(NUMBER))	},
	{ S_NUMBER,	{ ALP, NUM },	PUT(S_NUMBER)		},
	{ S_NUMBER,	{ '.' },	PUT(S_NUMBER)		},
	{ S_NUMBER,	{ 'E', 'e' },	PUT(S_NUMBER2)		},
	{ S_NUMBER,	{ 'P', 'p' },	PUT(S_NUMBER2)		},

	{ S_NUMBER2,	{ ANY },	FRZ(STO(NUMBER))	},
	{ S_NUMBER2,	{ ALP, NUM },	PUT(S_NUMBER)		},
	{ S_NUMBER2,	{ '+', '-' },	PUT(S_NUMBER)		},

	/* after a < */
	{ S_LT,		{ ANY },	FRZ(STO(LT))		},
	{ S_LT,		{ '=' },	STO(LEQ)		},
	{ S_LT,		{ '<' },	S_LT2			},
	{ S_LT,		{ ':' },	STO(DIG_LBRK)		},
	{ S_LT,		{ '%' },	STO(DIG_LBRA)		},

	{ S_LT2,	{ ANY },	FRZ(STO(LSH))		},
	{ S_LT2,	{ '=' },	STO(ASLSH)		},

	/* after a > */
	{ S_GT,		{ ANY },	FRZ(STO(GT))		},
	{ S_GT,		{ '=' },	STO(GEQ)		},
	{ S_GT,		{ '>' },	S_GT2			},

	{ S_GT2,	{ ANY },	FRZ(STO(RSH))		},
	{ S_GT2,	{ '=' },	STO(ASRSH)		},

	/* after a = */
	{ S_EQ,		{ ANY },	FRZ(STO(ASGN))		},
	{ S_EQ,		{ '=' },	STO(SAME)		},
#ifdef CAST_OP
	{ S_EQ,		{ '>' },	STO(CAST)		},
#endif

	/* after a \ */
	{ S_BACKSLASH,	{ ANY },	FRZ(S_BS)		},
	{ S_BACKSLASH,	{ 'U', 'u' },	FRZ(S_NAME_BS)		},

	/* after a letter */
	{ S_NAME,	{ ANY },	FRZ(STO(NAME))		},
	{ S_NAME,	{ ALP, NUM },	PUT(S_NAME)		},
	{ S_NAME,	{ '\\' },	S_NAME_BS		},

	/* after a \ in an identifier */
	{ S_NAME_BS,	{ ANY },	FRZ(S_ROGUE_BS)		},
	{ S_NAME_BS,	{ 'u', 'U' },	PUT(S_NAME)		},

	/* after a L */
	{ S_LCHAR,	{ ANY },	FRZ(S_NAME)		},
	{ S_LCHAR,	{ '"' },	PUT(S_STRING)		},
	{ S_LCHAR,	{ '\'' },	PUT(S_CHAR)		},

	/* after a ^ */
	{ S_CIRC,	{ ANY },	FRZ(STO(CIRC))		},
	{ S_CIRC,	{ '=' },	STO(ASCIRC)		},

	/* after a | */
	{ S_PIPE,	{ ANY },	FRZ(STO(OR))		},
	{ S_PIPE,	{ '=' },	STO(ASOR)		},
	{ S_PIPE,	{ '|' },	STO(LOR)		},

	/* after a / and * */
#ifdef SEMPER_FIDELIS
	{ S_COMMENT,	{ ANY },	PUT(S_COMMENT)		},
	{ S_COMMENT,	{ VCH },	FRZ(S_TRUNCC)		},
	{ S_COMMENT,	{ '*' },	PUT(S_COMMENT2)		},

	{ S_COMMENT2,	{ ANY },	FRZ(S_COMMENT)		},
	{ S_COMMENT2,	{ VCH },	FRZ(S_TRUNCC)		},
	{ S_COMMENT2,	{ '*' },	PUT(S_COMMENT2)		},
	{ S_COMMENT2,	{ '/' },	STO(PUT(COMMENT))	},

	{ S_COMMENT5,	{ ANY },	PUT(S_COMMENT5)		},
	{ S_COMMENT5,	{ VCH },	FRZ(S_DECAY)		},
	{ S_COMMENT5,	{ '\n' },	FRZ(STO(COMMENT))	},
#else
	{ S_COMMENT,	{ ANY },	S_COMMENT		},
	{ S_COMMENT,	{ VCH },	FRZ(S_TRUNCC)		},
	{ S_COMMENT,	{ '*' },	S_COMMENT2		},

	{ S_COMMENT2,	{ ANY },	FRZ(S_COMMENT)		},
	{ S_COMMENT2,	{ VCH },	FRZ(S_TRUNCC)		},
	{ S_COMMENT2,	{ '*' },	S_COMMENT2		},
	{ S_COMMENT2,	{ '/' },	STO(COMMENT)		},

	{ S_COMMENT5,	{ ANY },	S_COMMENT5		},
	{ S_COMMENT5,	{ VCH },	FRZ(S_DECAY)		},
	{ S_COMMENT5,	{ '\n' },	FRZ(STO(COMMENT))	},
#endif

	/* dummy end of machine description */
	{ 0,		{ 0 },		0			}
};

/*
 * cppm is the table used to store the automaton: if we are in state s
 * and we read character c, we apply the action cppm[s][c] (jumping to
 * another state, or emitting a token).
 * cppm_vch is the table for the special virtual character "end of input"
 */
static int cppm[MSTATE][MAX_CHAR_VAL];
static int cppm_vch[MSTATE];

/*
 * init_cppm() fills cppm[][] with the information stored in cppms[].
 * It must be called before beginning the lexing process.
 */
void init_cppm(void)
{
	int i, j, k, c;
	static unsigned char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
	static unsigned char lower[] = "abcdefghijklmnopqrstuvwxyz";
	unsigned char *cp;

	for (i = 0; i < MSTATE; i ++) {
		for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[i][j] = S_OUCH;
		cppm_vch[i] = S_OUCH;
	}
	for (i = 0; cppms[i].input[0]; i ++) for (k = 0; k < CMCR; k ++) {
		int s = cppms[i].state;
		int ns = cppms[i].new_state;

		switch (c = cppms[i].input[k]) {
		case 0:
			break;
		case SPC:
			/* see space_char() also */
			cppm[s][' '] = ns;
			cppm[s]['\t'] = ns;
			cppm[s]['\v'] = ns;
			cppm[s]['\f'] = ns;
#ifdef UNBREAKABLE_SPACE
			if (MAX_CHAR_VAL > UNBREAKABLE_SPACE)
				cppm[s][UNBREAKABLE_SPACE] = ns;
#endif
			break;
		case ALP:
			for (cp = upper; *cp; cp ++) cppm[s][(int)*cp] = ns;
			for (cp = lower; *cp; cp ++) cppm[s][(int)*cp] = ns;
			cppm[s]['_'] = ns;
			break;
		case NUM:
			for (j = '0'; j <= '9'; j ++) cppm[s][j] = ns;
			break;
		case ANY:
			for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[s][j] = ns;
			cppm_vch[s] = ns;
			break;
		case VCH:
			cppm_vch[s] = ns;
			break;
		default:
			cppm[s][c] = ns;
			break;
		}
	}
}

/*
 * Make some character as equivalent to a letter for identifiers.
 */
void set_identifier_char(int c)
{
	cppm[S_START][c] = PUT(S_NAME);
	cppm[S_NAME][c] = PUT(S_NAME);
}

/*
 * Remove the "identifier" status from a character.
 */
void unset_identifier_char(int c)
{
	cppm[S_START][c] = S_ILL;
	cppm[S_NAME][c] = FRZ(STO(NAME));
}

int space_char(int c)
{
	if (c == ' ' || c == '\t' || c == '\v' || c == '\f'
#ifdef UNBREAKABLE_SPACE
		|| c == UNBREAKABLE_SPACE
#endif
		) return 1;
	return 0;
}

#ifndef NO_UCPP_BUF
/*
 * our output buffer is full, flush it
 */
void flush_output(struct lexer_state *ls)
{
	size_t x = ls->sbuf, y = 0, z;

	if (ls->sbuf == 0) return;
	do {
		z = fwrite(ls->output_buf + y, 1, x, ls->output);
		x -= z;
		y += z;
	} while (z && x > 0);
	if (!y) {
		error(ls->line, "could not flush output (disk full ?)");
		die();
	}
	ls->sbuf = 0;
}
#endif

/*
 * Output one character; flush the buffer if needed.
 * This function should not be called, except by put_char().
 */
static inline void write_char(struct lexer_state *ls, unsigned char c)
{
#ifndef NO_UCPP_BUF
	ls->output_buf[ls->sbuf ++] = c;
	if (ls->sbuf == OUTPUT_BUF_MEMG) flush_output(ls);
#else
	if (putc((int)c, ls->output) == EOF) {
		error(ls->line, "output write error (disk full ?)");
		die();
	}
#endif
	if (c == '\n') {
		ls->oline ++;
	}
}

/*
 * schedule a character for output
 */
void put_char(struct lexer_state *ls, unsigned char c)
{
	if (ls->flags & KEEP_OUTPUT) write_char(ls, c);
}

/*
 * get next raw input character
 */
static inline int read_char(struct lexer_state *ls)
{
	unsigned char c;

	if (!ls->input) {
		return ((ls->pbuf ++) < ls->ebuf) ?
			ls->input_string[ls->pbuf - 1] : -1;
	}
	while (1) {
#ifndef NO_UCPP_BUF
		if (ls->pbuf == ls->ebuf) {
#ifdef UCPP_MMAP
			if (ls->from_mmap) {
				munmap((void *)ls->input_buf, ls->ebuf);
				ls->from_mmap = 0;
				ls->input_buf = ls->input_buf_sav;
			}
#endif
			ls->ebuf = fread(ls->input_buf, 1,
				INPUT_BUF_MEMG, ls->input);
			ls->pbuf = 0;
		}
		if (ls->ebuf == 0) return -1;
		c = ls->input_buf[ls->pbuf ++];
#else
		int x = getc(ls->input);

		if (x == EOF) return -1;
		c = x;
#endif
		if (ls->flags & COPY_LINE) {
			if (c == '\n') {
				ls->copy_line[ls->cli] = 0;
				ls->cli = 0;
			} else if (ls->cli < (COPY_LINE_LENGTH - 1)) {
				ls->copy_line[ls->cli ++] = c;
			}
		}
		if (ls->macfile && c == '\n') {
			ls->macfile = 0;
			continue;
		}
		ls->macfile = 0;
		if (c == '\r') {
			/*
			 * We found a '\r'; we handle it as a newline
			 * and ignore the next newline. This should work
			 * with all combinations of Msdos, MacIntosh and
			 * Unix files on these three platforms. On other
			 * platforms, native file formats are always
			 * supported.
			 */
			ls->macfile = 1;
			c = '\n';
		}
		break;
	}
	return c;
}

/*
 * next_fifo_char(), char_lka1() and char_lka2() give a two character
 * look-ahead on the input stream; this is needed for trigraphs
 */
static inline int next_fifo_char(struct lexer_state *ls)
{
	int c;

	if (ls->nlka != 0) {
		c = ls->lka[0];
		ls->lka[0] = ls->lka[1];
		ls->nlka --;
	} else c = read_char(ls);
	return c;
}

static inline int char_lka1(struct lexer_state *ls)
{
	if (ls->nlka == 0) {
		ls->lka[0] = read_char(ls);
		ls->nlka ++;
	}
	return ls->lka[0];
}

static inline int char_lka2(struct lexer_state *ls)
{
#ifdef AUDIT
	if (ls->nlka == 0) ouch("always in motion future is");
#endif
	if (ls->nlka == 1) {
		ls->lka[1] = read_char(ls);
		ls->nlka ++;
	}
	return ls->lka[1];
}

static struct trigraph {
	int old, new;
} trig[9] = {
	{ '=', '#' },
	{ '/', '\\' },
	{ '\'', '^' },
	{ '(', '[' },
	{ ')', ']' },
	{ '!', '|' },
	{ '<', '{' },
	{ '>', '}' },
	{ '-', '~' }
};

/*
 * Returns the next character, after treatment of trigraphs and terminating
 * backslashes. Return value is -1 if there is no more input.
 */
static inline int next_char(struct lexer_state *ls)
{
	int c;

	if (!ls->discard) return ls->last;
	ls->discard = 0;
	do {
		c = next_fifo_char(ls);
		/* check trigraphs */
		if (c == '?' && char_lka1(ls) == '?'
			&& (ls->flags & HANDLE_TRIGRAPHS)) {
			int i, d;

			d = char_lka2(ls);
			for (i = 0; i < 9; i ++) if (d == trig[i].old) {
				if (ls->flags & WARN_TRIGRAPHS) {
					ls->count_trigraphs ++;
				}
				if (ls->flags & WARN_TRIGRAPHS_MORE) {
					warning(ls->line, "trigraph ?""?%c "
						"encountered", d);
				}
				next_fifo_char(ls);
				next_fifo_char(ls);
				c = trig[i].new;
				break;
			}
		}
		if (c == '\\' && char_lka1(ls) == '\n') {
			ls->line ++;
			next_fifo_char(ls);
		} else if (c == '\r' && char_lka1(ls) == '\n') {
			ls->line ++;
			next_fifo_char(ls);
			c = '\n';
			return c;
		} else {
			ls->last = c;
			return c;
		}
	} while (1);
}

/*
 * wrapper for next_char(), to be called from outside
 * (used by #error, #include directives)
 */
int grap_char(struct lexer_state *ls)
{
	return next_char(ls);
}

/*
 * Discard the current character, so that the next call to next_char()
 * will step into the input stream.
 */
void discard_char(struct lexer_state *ls)
{
#ifdef AUDIT
	if (ls->discard) ouch("overcollecting garbage");
#endif
	ls->discard = 1;
	ls->utf8 = 0;
	if (ls->last == '\n') ls->line ++;
}

/*
 * Convert an UTF-8 encoded character to a Universal Character Name
 * using \u (or \U when appropriate).
 */
static int utf8_to_string(unsigned char buf[], unsigned long utf8)
{
	unsigned long val = 0;
	static char hex[16] = "0123456789abcdef";

	if (utf8 & 0x80UL) {
		unsigned long x1, x2, x3, x4;

		x1 = (utf8 >> 24) & 0x7fUL;
		x2 = (utf8 >> 16) & 0x7fUL;
		x3 = (utf8 >> 8) & 0x7fUL;
		x4 = (utf8) & 0x3fUL;
		x1 &= 0x07UL;
		if (x2 & 0x40UL) x2 &= 0x0fUL;
		if (x3 & 0x40UL) x3 &= 0x1fUL;
		val = x4 | (x3 << 6) | (x2 << 12) | (x1 << 16);
	} else val = utf8;
	if (val < 128) {
		buf[0] = val;
		buf[1] = 0;
		return 1;
	} else if (val < 0xffffUL) {
		buf[0] = '\\';
		buf[1] = 'u';
		buf[2] = hex[(size_t)(val >> 12)];
		buf[3] = hex[(size_t)((val >> 8) & 0xfU)];
		buf[4] = hex[(size_t)((val >> 4) & 0xfU)];
		buf[5] = hex[(size_t)(val & 0xfU)];
		buf[6] = 0;
		return 6;
	}
	buf[0] = '\\';
	buf[1] = 'U';
	buf[2] = '0';
	buf[3] = '0';
	buf[4] = hex[(size_t)(val >> 20)];
	buf[5] = hex[(size_t)((val >> 16) & 0xfU)];
	buf[6] = hex[(size_t)((val >> 12) & 0xfU)];
	buf[7] = hex[(size_t)((val >> 8) & 0xfU)];
	buf[8] = hex[(size_t)((val >> 4) & 0xfU)];
	buf[9] = hex[(size_t)(val & 0xfU)];
	buf[10] = 0;
	return 10;
}

/*
 * Scan the identifier and put it in canonical form:
 *  -- tranform \U0000xxxx into \uxxxx
 *  -- inside \u and \U, make letters low case
 *  -- report (some) incorrect use of UCN
 */
static void canonize_id(struct lexer_state *ls, char *id)
{
	char *c, *d;

	for (c = d = id; *c;) {
		if (*c == '\\') {
			int i;

			if (!*(c + 1)) goto canon_error;
			if (*(c + 1) == 'U') {
				for (i = 0; i < 8 && *(c + i + 2); i ++);
				if (i != 8) goto canon_error;
				*(d ++) = '\\';
				c += 2;
				for (i = 0; i < 4 && *(c + i) == '0'; i ++);
				if (i == 4) {
					*(d ++) = 'u';
					c += 4;
				} else {
					*(d ++) = 'U';
					i = 8;
				}
				for (; i > 0; i --) {
					switch (*c) {
					case 'A': *(d ++) = 'a'; break;
					case 'B': *(d ++) = 'b'; break;
					case 'C': *(d ++) = 'c'; break;
					case 'D': *(d ++) = 'd'; break;
					case 'E': *(d ++) = 'e'; break;
					case 'F': *(d ++) = 'f'; break;
					default: *(d ++) = *c; break;
					}
					c ++;
				}
			} else if (*(c + 1) == 'u') {
				for (i = 0; i < 4 && *(c + i + 2); i ++);
				if (i != 4) goto canon_error;
				*(d ++) = '\\';
				*(d ++) = 'u';
				c += 2;
				for (; i > 0; i --) {
					switch (*c) {
					case 'A': *(d ++) = 'a'; break;
					case 'B': *(d ++) = 'b'; break;
					case 'C': *(d ++) = 'c'; break;
					case 'D': *(d ++) = 'd'; break;
					case 'E': *(d ++) = 'e'; break;
					case 'F': *(d ++) = 'f'; break;
					default: *(d ++) = *c; break;
					}
					c ++;
				}
			} else goto canon_error;
			continue;
		}
		*(d ++) = *(c ++);
	}
	*d = 0;
	return;

canon_error:
	for (; *c; *(d ++) = *(c ++));
	if (ls->flags & WARN_STANDARD) {
		warning(ls->line, "malformed identifier with UCN: '%s'", id);
	}
	*d = 0;
}

/*
 * Run the automaton, in order to get the next token.
 * This function should not be called, except by next_token()
 *
 * return value: 1 on error, 2 on end-of-file, 0 otherwise.
 */
static inline int read_token(struct lexer_state *ls)
{
	int cstat = S_START, nstat;
	size_t ltok = 0;
	int c, outc = 0, ucn_in_id = 0;
	int shift_state;
	unsigned long utf8;
	long l = ls->line;

	ls->ctok->line = l;
	if (ls->pending_token) {
		if ((ls->ctok->type = ls->pending_token) == BUNCH) {
			ls->ctok->name[0] = '\\';
			ls->ctok->name[1] = 0;
		}
		ls->pending_token = 0;
		return 0;
	}
	if (ls->flags & UTF8_SOURCE) {
		utf8 = ls->utf8;
		shift_state = 0;
	}
	if (!(ls->flags & LEXER) && (ls->flags & KEEP_OUTPUT))
		for (; ls->line > ls->oline;) put_char(ls, '\n');
	do {
		c = next_char(ls);
		if (c < 0) {
			if ((ls->flags & UTF8_SOURCE) && shift_state) {
				if (ls->flags & WARN_STANDARD)
					warning(ls->line, "truncated UTF-8 "
						"character");
				shift_state = 0;
				utf8 = 0;
			}
			if (cstat == S_START) return 2;
			nstat = cppm_vch[cstat];
		} else {
			if (ls->flags & UTF8_SOURCE) {
				if (shift_state) {
					if ((c & 0xc0) != 0x80) {
						if (ls->flags & WARN_STANDARD)
							warning(ls->line,
								"truncated "
								"UTF-8 "
								"character");
						shift_state = 0;
						utf8 = 0;
						c = '_';
					} else {
						utf8 = (utf8 << 8) | c;
						if (-- shift_state) {
							ls->discard = 1;
							continue;
						}
						c = '_';
					}
				} else if ((c & 0xc0) == 0xc0) {
					if ((c & 0x30) == 0x30) {
						shift_state = 3;
					} else if (c & 0x20) {
						shift_state = 2;
					} else {
						shift_state = 1;
					}
					utf8 = c;
					ls->discard = 1;
					continue;
				} else utf8 = 0;
			}
			nstat = cppm[cstat][c < MAX_CHAR_VAL ? c : 0];
		}
#ifdef AUDIT
		if (nstat == S_OUCH) {
			ouch("bad move...");
		}
#endif
		/*
		 * disable C++-like comments
		 */
		if (nstat == S_COMMENT5 && !(ls->flags & CPLUSPLUS_COMMENTS))
			nstat = FRZ(STO(SLASH));

		if (noMOD(nstat) >= MSTATE && !ttSTO(nstat))
			switch (noMOD(nstat)) {
		case S_ILL:
			if (ls->flags & CCHARSET) {
				error(ls->line, "illegal character '%c'", c);
				return 1;
			}
			nstat = PUT(STO(BUNCH));
			break;
		case S_BS:
			ls->ctok->name[0] = '\\';
			ltok ++;
			nstat = FRZ(STO(BUNCH));
			if (!(ls->flags & LEXER)) put_char(ls, '\\');
			break;
		case S_ROGUE_BS:
			ls->pending_token = BUNCH;
			nstat = FRZ(STO(NAME));
			break;
		case S_DDOT:
			ls->pending_token = DOT;
			nstat = FRZ(STO(DOT));
			break;
		case S_DDSHARP:
			ls->pending_token = PCT;
			nstat = FRZ(STO(DIG_SHARP));
			break;
		case S_BEHEAD:
			error(l, "unfinished string at end of line");
			return 1;
		case S_DECAY:
			warning(l, "unterminated // comment");
			nstat = FRZ(STO(COMMENT));
			break;
		case S_TRUNC:
			error(l, "truncated token");
			return 1;
		case S_TRUNCC:
			error(l, "truncated comment");
			return 1;
#ifdef AUDIT
		case S_OUCH:
			ouch("machine went out of control");
			break;
#endif
		}
		if (!ttFRZ(nstat)) {
			discard_char(ls);
			if (!(ls->flags & LEXER) && ls->condcomp) {
				int z = ttSTO(nstat) ? S_ILL : noMOD(nstat);

				if (cstat == S_NAME || z == S_NAME
					|| ((CMT(cstat) || CMT(z))
					&& (ls->flags & DISCARD_COMMENTS))) {
					outc = 0;
				} else if (z == S_LCHAR || z == S_SLASH
					|| (z == S_SHARP && ls->ltwnl)
					|| (z == S_PCT && ls->ltwnl)
					|| (z == S_BACKSLASH)) {
					outc = c;
				} else if (z == S_PCT2 && ls->ltwnl) {
					outc = -1;
				} else if (z == S_PCT3 && ls->ltwnl) {
					/* we have %:% but this still might
					   not be a %:%: */
					outc = -2;
				} else {
					if (outc < 0) {
						put_char(ls, '%');
						put_char(ls, ':');
						if (outc == -2)
							put_char(ls, '%');
						outc = 0;
					} else if (outc) {
						put_char(ls, outc);
						outc = 0;
					}
					put_char(ls, c);
				}
			}
		} else if (outc == '/' && !(ls->flags & LEXER)
			&& ls->condcomp) {
			/* this is a hack: we need to dump a pending slash */
			put_char(ls, outc);
			outc = 0;
		}
		if (ttPUT(nstat)) {
			if (cstat == S_NAME_BS) {
				ucn_in_id = 1;
				wan(ls->ctok->name, ltok, '\\', ls->tknl);
			}
			if ((ls->flags & UTF8_SOURCE) && utf8) {
				unsigned char buf[11];
				int i, j;

				for (i = 0, j = utf8_to_string(buf, utf8);
					i < j; i ++)
					wan(ls->ctok->name, ltok, buf[i],
						ls->tknl);
				/* if (j > 1) ucn_in_id = 1; */
			} else wan(ls->ctok->name, ltok,
				(unsigned char)c, ls->tknl);
		}
		if (ttSTO(nstat)) {
			if (S_TOKEN(noMOD(nstat))) {
				wan(ls->ctok->name, ltok,
					(unsigned char)0, ls->tknl);
			}
			ls->ctok->type = noMOD(nstat);
			break;
		}
		cstat = noMOD(nstat);
	} while (1);
	if (!(ls->flags & LEXER) && (ls->flags & DISCARD_COMMENTS)
			&& ls->ctok->type == COMMENT) put_char(ls, ' ');
	if (ucn_in_id && ls->ctok->type == NAME)
		canonize_id(ls, ls->ctok->name);
	return 0;
}

/*
 * fills ls->ctok with the next token
 */
int next_token(struct lexer_state *ls)
{
	if (ls->flags & READ_AGAIN) {
		ls->flags &= ~READ_AGAIN;
		if (!(ls->flags & LEXER)) {
			char *c = S_TOKEN(ls->ctok->type) ?
				ls->ctok->name : token_name(ls->ctok);
			if (ls->ctok->type == OPT_NONE) {
				ls->ctok->type = NONE;
#ifdef SEMPER_FIDELIS
				ls->ctok->name[0] = ' ';
				ls->ctok->name[1] = 0;
#endif
				put_char(ls, ' ');
			} else if (ls->ctok->type != NAME &&
				!(ls->ltwnl && (ls->ctok->type == SHARP
					|| ls->ctok->type == DIG_SHARP)))
				for (; *c; c ++) put_char(ls, *c);
		}
		return 0;
	}
	return read_token(ls);
}