/*********************************************************************** * * * This software is part of the ast package * * Copyright (c) 1986-2011 AT&T Intellectual Property * * and is licensed under the * * Eclipse Public License, Version 1.0 * * by AT&T Intellectual Property * * * * A copy of the License is available at * * http://www.eclipse.org/org/documents/epl-v10.html * * (with md5 checksum b35adb5213ca9657e911e9befb180842) * * * * Information and Software Systems Research * * AT&T Research * * Florham Park NJ * * * * Glenn Fowler * * * ***********************************************************************/ #pragma prototyped /* * Glenn Fowler * AT&T Research * * preprocessor and proto lexical analyzer fsm * define PROTOMAIN for standalone proto */ #include "pplib.h" #include "ppfsm.h" /* * lexical FSM encoding * derived from a standalone ansi cpp by Dennis Ritchie * modified for libpp by Glenn Fowler * * fsm[] is initialized from fsminit[]. The encoding is blown out into * fsm[] for time efficiency. When in state state, and one of the * characters in ch arrives, enter nextstate. States >= TERMINAL are * either final, or at least require special action. In fsminit[] there * is a line for each (state, charset, nextstate) triple. Early entries are * overwritten by later ones. C_XXX is the universal set and should * always be first. Some of the fsminit[] entries are templates for * groups of states. The OP entries trigger the state copies. States * above TERMINAL are represented in fsm[] as negative values. S_TOK and * S_TOKB encode the resulting token type in the upper bits. These actions * differ in that S_TOKB has a lookahead char. * * fsm[] has three start states: * * PROTO proto (ANSI -> K&R,C++,ANSI) * QUICK standalone ppcpp() * TOKEN tokenizing pplex() * * If the next state remains the same then the fsm[] transition value is 0. * MAX+1 is a power of 2 so that fsm[state][EOF==MAX+1] actually accesses * fsm[state+1][0] which is ~S_EOB for all states. 
This preserves the * power of 2 fsm[] row size for efficient array indexing. Thanks to * D. G. Korn for the last two observations. The pseudo non-terminal state * fsm[TERMINAL][state+1] is used to differentiate EOB from EOF. * * The bit layout is: * * TERM arg SPLICE next * 15 14-8 7 6-0 */ /* * NOTE: these must be `control' characters for all native codesets * currently ok for {ascii,ebcdic1,ebcdic2,ebcdic3} */ #define C_DEC 001 #define C_EOF 002 #define C_HEX 003 #define C_LET 021 #define C_OCT 022 #define C_XXX 023 #define OP (-1) #define END 0 #define COPY 1 #define copy(t,f) (memcpy(&fsm[t][1],&fsm[f][1],(MAX+1)*sizeof(short)),fsm[TERMINAL][(t)+1]=fsm[TERMINAL][(f)+1]) struct fsminit /* fsm initialization row */ { int state; /* if in this state */ unsigned char ch[4]; /* and see one of these */ int nextstate; /* enter this state if ' }, GT1, }, { TOKEN, { '^' }, CIRC1, }, { TOKEN, { '|' }, OR1, }, { TOKEN, { '(', ')', '[', ']' }, S_CHR, }, { TOKEN, { '{', '}', ',', ';' }, S_CHR, }, { TOKEN, { '~', '?' }, S_CHR, }, /* saw 0, possible oct|hex|dec|dbl constant */ { OCT1, { C_XXX }, BACK(T_DECIMAL), }, { OCT1, { C_LET, C_DEC }, BAD1, }, { OCT1, { C_OCT }, OCT2, }, { OCT1, { 'e', 'E' }, DBL2, }, { OCT1, { 'l', 'L', 'u', 'U' }, QUAL(DEC2), }, { OCT1, { 'x', 'X' }, HEX1, }, { OCT1, { '.' }, DBL1, }, /* saw 0, oct constant */ { OCT2, { C_XXX }, BACK(T_OCTAL), }, { OCT2, { C_LET, C_DEC }, BAD1, }, { OCT2, { C_OCT }, OCT2, }, { OCT2, { 'e', 'E' }, DBL2, }, { OCT2, { 'l', 'L', 'u', 'U' }, QUAL(OCT3), }, { OCT2, { '.' }, DBL1, }, /* oct constant qualifier */ { OCT3, { C_XXX }, BACK(T_OCTAL), }, { OCT3, { C_LET, C_DEC, '.' }, BAD1, }, { OCT3, { 'l', 'L', 'u', 'U' }, QUAL(OCT3), }, /* saw 0 [xX], hex constant */ { HEX1, { C_XXX }, BACK(T_HEXADECIMAL), }, { HEX1, { C_LET }, BAD1, }, { HEX1, { C_HEX }, HEX1, }, { HEX1, { 'e', 'E' }, HEX3, }, { HEX1, { 'l', 'L', 'u', 'U' }, QUAL(HEX2), }, { HEX1, { '.' 
}, HEX4, }, { HEX1, { 'p', 'P' }, HEX5, }, /* hex constant qualifier */ { HEX2, { C_XXX }, BACK(T_HEXADECIMAL), }, { HEX2, { C_LET, C_DEC, '.' }, BAD1, }, { HEX2, { 'l', 'L', 'u', 'U' }, QUAL(HEX2), }, /* hex [eE][-+] botch */ { HEX3, { C_XXX }, BACK(T_HEXADECIMAL), }, { HEX3, { C_LET, '.', '-', '+'},BAD1, }, { HEX3, { C_HEX }, HEX1, }, { HEX3, { 'e', 'E' }, HEX3, }, { HEX3, { 'l', 'L', 'u', 'U' }, QUAL(HEX2), }, /* hex dbl fraction */ { HEX4, { C_XXX }, BACK(T_HEXDOUBLE), }, { HEX4, { C_LET, '.' }, BAD1, }, { HEX4, { C_HEX }, HEX4, }, { HEX4, { 'p', 'P' }, HEX5, }, { HEX4, { 'f', 'F', 'l', 'L' }, QUAL(HEX8), }, /* optional hex dbl exponent sign */ { HEX5, { C_XXX }, BACK(T_INVALID), }, { HEX5, { C_LET, '.' }, BAD1, }, { HEX5, { '+', '-' }, HEX6, }, { HEX5, { C_DEC }, HEX7, }, /* mandatory hex dbl exponent first digit */ { HEX6, { C_XXX }, BACK(T_INVALID), }, { HEX6, { C_LET, '.' }, BAD1, }, { HEX6, { C_DEC }, HEX7, }, /* hex dbl exponent digits */ { HEX7, { C_XXX }, BACK(T_HEXDOUBLE), }, { HEX7, { C_LET, '.' }, BAD1, }, { HEX7, { C_DEC }, HEX7, }, { HEX7, { 'f', 'F', 'l', 'L' }, QUAL(HEX8), }, /* hex dbl constant qualifier */ { HEX8, { C_XXX }, BACK(T_HEXDOUBLE), }, { HEX8, { C_LET, '.' }, BAD1, }, { HEX8, { 'f', 'F', 'l', 'L' }, QUAL(HEX8), }, /* saw , dec constant */ { DEC1, { C_XXX }, BACK(T_DECIMAL), }, { DEC1, { C_LET }, BAD1, }, { DEC1, { C_DEC }, DEC1, }, { DEC1, { 'e', 'E' }, DBL2, }, { DEC1, { 'l', 'L', 'u', 'U' }, QUAL(DEC2), }, { DEC1, { '.' }, DBL1, }, /* dec constant qualifier */ { DEC2, { C_XXX }, BACK(T_DECIMAL), }, { DEC2, { C_LET, C_DEC }, BAD1, }, { DEC2, { 'l', 'L', 'u', 'U' }, QUAL(DEC2), }, /* saw ., operator or dbl constant */ { DOT1, { C_XXX }, S_CHRB, }, { DOT1, { '.' }, DOT2, }, { DOT1, { C_DEC }, DBL1, }, /* dbl fraction */ { DBL1, { C_XXX }, BACK(T_DOUBLE), }, { DBL1, { C_LET, '.' 
}, BAD1, }, { DBL1, { C_DEC }, DBL1, }, { DBL1, { 'e', 'E' }, DBL2, }, { DBL1, { 'f', 'F', 'l', 'L' }, QUAL(DBL5), }, /* optional dbl exponent sign */ { DBL2, { C_XXX }, BACK(T_INVALID), }, { DBL2, { C_LET, '.' }, BAD1, }, { DBL2, { '+', '-' }, DBL3, }, { DBL2, { C_DEC }, DBL4, }, /* mandatory dbl exponent first digit */ { DBL3, { C_XXX }, BACK(T_INVALID), }, { DBL3, { C_LET, '.' }, BAD1, }, { DBL3, { C_DEC }, DBL4, }, /* dbl exponent digits */ { DBL4, { C_XXX }, BACK(T_DOUBLE), }, { DBL4, { C_LET, '.' }, BAD1, }, { DBL4, { C_DEC }, DBL4, }, { DBL4, { 'f', 'F', 'l', 'L' }, QUAL(DBL5), }, /* dbl constant qualifier */ { DBL5, { C_XXX }, BACK(T_DOUBLE), }, { DBL5, { C_LET, '.' }, BAD1, }, { DBL5, { 'f', 'F', 'l', 'L' }, QUAL(DBL5), }, /* saw < starting include header */ { HDR1, { C_XXX }, HDR1, }, { HDR1, { '>', '\n', C_EOF }, S_LITEND, }, /* saw expecting = */ { BIN1, { C_XXX }, S_HUH, }, { BIN1, { ' ', '\t' }, BIN1, }, /* 2-char ops */ { SHARP1, { C_XXX }, S_SHARP, }, { PCT1, { C_XXX }, S_CHRB, }, { PCT1, { '=' }, KEEP(T_MODEQ), }, { AND1, { C_XXX }, S_CHRB, }, { AND1, { '=' }, KEEP(T_ANDEQ), }, { AND1, { '&' }, KEEP(T_ANDAND), }, { STAR1, { C_XXX }, S_CHRB, }, { STAR1, { '=' }, KEEP(T_MPYEQ), }, { STAR1, { '/' }, S_COMMENT, }, { PLUS1, { C_XXX }, S_CHRB, }, { PLUS1, { '=' }, KEEP(T_ADDEQ), }, { PLUS1, { '+' }, KEEP(T_ADDADD), }, { MINUS1, { C_XXX }, S_CHRB, }, { MINUS1, { '=' }, KEEP(T_SUBEQ), }, { MINUS1, { '-' }, KEEP(T_SUBSUB), }, { MINUS1, { '>' }, KEEP(T_PTRMEM), }, { COLON1, { C_XXX }, S_CHRB, }, { COLON1, { '=', '>' }, S_HUH, }, { LT1, { C_XXX }, S_CHRB, }, { LT1, { '=' }, KEEP(T_LE), }, { LT1, { '<' }, LSH1, }, { EQ1, { C_XXX }, S_CHRB, }, { EQ1, { '=' }, KEEP(T_EQ), }, { NOT1, { C_XXX }, S_CHRB, }, { NOT1, { '=' }, KEEP(T_NE), }, { GT1, { C_XXX }, S_CHRB, }, { GT1, { '=' }, KEEP(T_GE), }, { GT1, { '>' }, RSH1, }, { CIRC1, { C_XXX }, S_CHRB, }, { CIRC1, { '=' }, KEEP(T_XOREQ), }, { OR1, { C_XXX }, S_CHRB, }, { OR1, { '=' }, KEEP(T_OREQ), }, { OR1, { '|' }, 
KEEP(T_OROR), },

	/* 3-char ops */

	{ ARROW1, { C_XXX }, BACK(T_PTRMEM), },
	{ ARROW1, { '*' }, KEEP(T_PTRMEMREF), },

	{ LSH1, { C_XXX }, BACK(T_LSHIFT), },
	{ LSH1, { '=' }, KEEP(T_LSHIFTEQ), },

	{ RSH1, { C_XXX }, BACK(T_RSHIFT), },
	{ RSH1, { '=' }, KEEP(T_RSHIFTEQ), },

	/* NOTE(review): the #if matching this #endif is not visible in this part of the file */
#endif

	/* end */

	/*
	 * terminator row: FSM_INIT stops scanning the table at the first
	 * OP row whose nextstate is not COPY
	 */
	{ OP, { 0 }, END, }
};

/*
 * fsm[state][ch] is the transition table filled in by ppfsm(FSM_INIT,0);
 * row TERMINAL is the extra pseudo-state row written by FSM_INIT
 */

short		fsm[TERMINAL+1][MAX+1];

/*
 * trigraph replacement map, filled in by ppfsm(FSM_INIT,0)
 * when PROTOMAIN is not defined
 */

char		trigraph[MAX+1];

/*
 * spl: characters that get the splice treatment in every state
 * aln: the ten digits followed by the identifier letters, so that
 *	&aln[10] (let) skips the digits
 */

#if PROTOMAIN
static char	spl[] = { '\\', '\r', 0 };
static char	aln[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_$@";
#else
static char	spl[] = { MARK, '?', '\\', '\r', CC_sub, 0 };
static char	aln[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_";
#endif

/*
 * character class strings expanded by FSM_INIT for C_LET, C_HEX, C_DEC
 * and C_OCT; hex[] is written high-to-low so that the decimal digits
 * ("9876543210" at hex+12) and octal digits ("76543210" at hex+14)
 * are suffixes of the same array
 */

static char*	let = &aln[10];
static char	hex[] = "fedcbaFEDCBA9876543210";
static char*	dec = &hex[12];
static char*	oct = &hex[14];

/*
 * runtime FSM modifications
 * ppfsm(FSM_INIT,0) must be called first
 *
 * op selects the modification; the meaning of s depends on op:
 *
 *	FSM_INIT		s is ignored on entry and reused as a scratch pointer
 *	FSM_IDADD, FSM_IDDEL	0-terminated list of chars to add to / remove
 *				from the identifier set
 *	FSM_QUOTADD, FSM_QUOTDEL 0-terminated list of chars to add to / remove
 *				from the quote set
 *	FSM_OPSPACE		only tested for null / non-null
 *	FSM_MACRO		0-terminated name; it is temporarily cut at
 *				pp.truncate while the states are updated
 *	FSM_PLUSPLUS,
 *	FSM_COMPATIBILITY	s is not read; pp.option / pp.state decide
 *
 * an op that matches no case leaves fsm[] untouched
 */

void
ppfsm(int op, register char* s)
{
	register int		c;	/* current char, also used as a column index */
	register int		n;	/* transition value being installed */
	register int		i;	/* state (row) index */
	register short*		rp;	/* current fsm[] row */
	register struct fsminit* fp;	/* current fsminit[] row */
#if !PROTOMAIN
	char*			t;
	int			x;	/* char saved by FSM_MACRO truncation, -1 if none */
#endif

	switch (op)
	{

#if !PROTOMAIN

	case FSM_IDADD:
		/*
		 * only chars whose TOKEN transition is currently ~S_HUH may
		 * become identifier chars; each state then gets
		 * IDSTATE() of its '_' transition for the new char
		 */
		while (c = *s++)
			if (!ppisid(c))
			{
				if (fsm[TOKEN][c] == ~S_HUH)
				{
					setid(c);
					for (i = 0; i < TERMINAL; i++)
						fsm[i][c] = IDSTATE(fsm[i]['_']);
				}
				else
					error(2, "%c: cannot add to identifier set", c);
			}
		break;

	case FSM_IDDEL:
		/*
		 * removed identifier chars go back to ~S_HUH in every state
		 */
		while (c = *s++)
			if (ppisid(c))
			{
				clrid(c);
				for (i = 0; i < TERMINAL; i++)
					fsm[i][c] = ~S_HUH;
			}
		break;

#endif

	case FSM_INIT:
		/*
		 * expand fsminit[] into fsm[]; later rows overwrite
		 * earlier ones
		 */
		for (fp = fsminit;; fp++)
		{
			/* values >= TERMINAL are stored complemented (negative) */
			if ((n = fp->nextstate) >= TERMINAL)
				n = ~n;
			if (fp->state == OP)
			{
#if !PROTOMAIN
				switch (n)
				{
				case COPY:
					/*
					 * ch[0] is the source state and
					 * ch[1]..ch[2] the inclusive range
					 * of destination states
					 */
					c = fp->ch[0];
					n = fp->ch[2];
					for (i = fp->ch[1]; i <= n; i++)
						copy(i, c);
					/* next fsminit[] row */
					continue;
				default:
					break;
				}
#endif
				/* any other OP row ends the table */
				break;
			}
			rp = fsm[fp->state];
			/* ch[] holds up to sizeof(ch) entries, 0 ends the list early */
			for (i = 0; i < sizeof(fp->ch) && (c = fp->ch[i]); i++)
			{
				switch (c)
				{
				case C_XXX:
					/* universal set: every char column gets n */
					for (c = 0; c <= MAX; c++)
						rp[c] = n;
					/*FALLTHROUGH*/

				case C_EOF:
					/* row TERMINAL keeps the uncomplemented value for this state */
					fsm[TERMINAL][fp->state+1] = n < 0 ? ~n : n;
					/* next ch[] entry */
					continue;

				case C_LET:
					s = let;
					break;

				case C_HEX:
					s = hex;
					break;

				case C_DEC:
					s = dec;
					break;

				case C_OCT:
					s = oct;
					break;

				default:
					/* ordinary single char */
					rp[c] = n;
					continue;
				}
				/* character class: install n for each member */
				while (c = *s++)
					rp[c] = n;
			}
		}

		/*
		 * install splice special cases
		 * and same non-terminal transitions
		 */

		for (i = 0; i < TERMINAL; i++)
		{
			rp = fsm[i];
			s = spl;
			/*
			 * each splice char (MARK is skipped for rows where
			 * INCOMMENT() holds) gets a negative entry with the
			 * SPLICE bit cleared in the stored value
			 */
			while (c = *s++)
				if (c != MARK || !INCOMMENT(rp))
				{
					if (rp[c] >= 0)
						rp[c] = ~rp[c];
					rp[c] &= ~SPLICE;
				}
			rp[EOB] = ~S_EOB;
			/* a transition back to the same state is encoded as 0 */
			for (c = 0; c <= MAX; c++)
				if (rp[c] == i)
					rp[c] = 0;
		}
		fsm[TERMINAL][0] = ~S_EOB;

#if !PROTOMAIN

		/*
		 * default character types
		 */

		s = let;
		while (c = *s++)
			setid(c);
		s = dec;
		while (c = *s++)
			setdig(c);
		s = spl;
		/* do/while: setsplice() is also applied to the terminating 0 */
		do setsplice(c = *s++); while (c);

		/*
		 * trigraph map
		 * indexed by the char that follows the two question marks,
		 * the value is the replacement char
		 */

		trigraph['='] = '#';
		trigraph['('] = '[';
		trigraph['/'] = '\\';
		trigraph[')'] = ']';
		trigraph['\''] = '^';
		trigraph['<'] = '{';
		trigraph['!'] = '|';
		trigraph['>'] = '}';
		trigraph['-'] = '~';
#endif
		break;

#if !PROTOMAIN

	case FSM_PLUSPLUS:
		/*
		 * switch the C++ specific transitions on or off
		 * according to pp.option & PLUSPLUS
		 */
		if (pp.option & PLUSPLUS)
		{
			fsm[COLON1][':'] = ~KEEP(T_SCOPE);
			fsm[DOT1]['*'] = ~KEEP(T_DOTREF);
			fsm[MINUS1]['>'] = ARROW1;
			fsm[COM1]['/'] = COM5;
			/*
			 * '%', '<' and ':' become splice chars in every
			 * row for which neither INCOMMENT() nor INQUOTE()
			 * holds
			 * NOTE(review): presumably for digraph handling --
			 * confirm against the lexer
			 */
			t = "%<:";
			for (i = 0; i < TERMINAL; i++)
			{
				rp = fsm[i];
				if (!INCOMMENT(rp) && !INQUOTE(rp))
				{
					s = t;
					while (c = *s++)
					{
						/*
						 * make the entry negative; 0 means
						 * "stay in state i" so it becomes ~i
						 */
						if (rp[c] > 0)
							rp[c] = ~rp[c];
						else if (!rp[c])
							rp[c] = ~i;
						rp[c] &= ~SPLICE;
					}
				}
			}
			s = t;
			while (c = *s++)
				setsplice(c);
		}
		else
		{
			fsm[COLON1][':'] = ~S_CHRB;
			fsm[DOT1]['*'] = ~S_CHRB;
			fsm[MINUS1]['>'] = ~KEEP(T_PTRMEM);
			/* COM5 stays reachable from COM1 on '/' when PLUSCOMMENT is set */
			fsm[COM1]['/'] = (pp.option & PLUSCOMMENT) ? COM5 : ~S_CHRB;
		}
		break;

#if COMPATIBLE

	case FSM_COMPATIBILITY:
		/*
		 * pp.state & COMPATIBILITY selects how e/E are treated in
		 * HEX1 and QNUM and how u/U are treated in QNUM
		 */
		if (pp.state & COMPATIBILITY)
		{
			fsm[HEX1]['e'] = HEX1;
			fsm[HEX1]['E'] = HEX1;
			fsm[QNUM]['e'] = QNUM;
			fsm[QNUM]['E'] = QNUM;
			fsm[QNUM]['u'] = ~QUAL(QNUM);
			fsm[QNUM]['U'] = ~QUAL(QNUM);
		}
		else
		{
			fsm[HEX1]['e'] = HEX3;
			fsm[HEX1]['E'] = HEX3;
			fsm[QNUM]['e'] = QEXP;
			fsm[QNUM]['E'] = QEXP;
			fsm[QNUM]['u'] = QNUM;
			fsm[QNUM]['U'] = QNUM;
		}
		break;

#endif

	case FSM_QUOTADD:
		/*
		 * a char whose TOKEN transition is ~S_HUH gets the same
		 * transitions as '"' in every state
		 */
		while (c = *s++)
			if (fsm[TOKEN][c] == ~S_HUH)
				for (i = 0; i < TERMINAL; i++)
					fsm[i][c] = fsm[i]['"'];
			else
				error(2, "%c: cannot add to quote set", c);
		break;

	case FSM_QUOTDEL:
		/*
		 * a char (other than '"' itself) whose TOKEN transition
		 * matches that of '"' gets the transitions of '_' in
		 * every state
		 */
		while (c = *s++)
			if (c != '"' && fsm[TOKEN][c] == fsm[TOKEN]['"'])
				for (i = 0; i < TERMINAL; i++)
					fsm[i][c] = fsm[i]['_'];
		break;

	case FSM_OPSPACE:
		/*
		 * s is only a flag here: non-null sends space and tab seen
		 * in the operator states below to BIN1, null installs
		 * ~S_CHRB (or the BACK() shift token for LSH1 and RSH1)
		 */
		n = s ? BIN1 : ~S_CHRB;
		fsm[COM1][' '] = fsm[COM1]['\t'] = n;
		fsm[AND1][' '] = fsm[AND1]['\t'] = n;
		fsm[STAR1][' '] = fsm[STAR1]['\t'] = n;
		fsm[PCT1][' '] = fsm[PCT1]['\t'] = n;
		fsm[PLUS1][' '] = fsm[PLUS1]['\t'] = n;
		fsm[MINUS1][' '] = fsm[MINUS1]['\t'] = n;
		fsm[CIRC1][' '] = fsm[CIRC1]['\t'] = n;
		fsm[OR1][' '] = fsm[OR1]['\t'] = n;
		fsm[LSH1][' '] = fsm[LSH1]['\t'] = s ? BIN1 : ~BACK(T_LSHIFT);
		fsm[RSH1][' '] = fsm[RSH1]['\t'] = s ? BIN1 : ~BACK(T_RSHIFT);
		break;

	case FSM_MACRO:
		/*
		 * thread the chars of the name s through the MAC0.. / HIT0..
		 * states, starting from the QUICK, QCOM and QTOK rows
		 * NOTE(review): presumably this lets the QUICK scanner
		 * recognize s as a macro name -- confirm against the
		 * users of these states
		 */
		if (pp.truncate && strlen(s) >= pp.truncate)
		{
			/* cut the name at pp.truncate, remembering the overwritten char */
			x = s[pp.truncate];
			s[pp.truncate] = 0;
		}
		else
			x = -1;
		/* the start state is MAC0 for a leading 'L', MAC0+1 otherwise */
		i = MAC0 + ((c = *s++) != 'L');
		if ((n = fsm[QUICK][c]) != (i + NMAC))
		{
			n = i;
			/* single char name */
			if (!*s)
				n += NMAC;
		}
		if (fsm[QUICK][c] != n)
			fsm[QUICK][c] = fsm[QCOM][c] = fsm[QTOK][c] = n;
		/*
		 * NOTE(review): when the (possibly truncated) name is a
		 * single char this block is skipped, so a char saved in x
		 * is not written back to s -- confirm whether that can
		 * happen in practice
		 */
		if (c = *s++)
		{
			/*
			 * on each pass i is the state the current char c
			 * leaves and n the state it enters; the loop exits
			 * with c the last char and s at the terminating 0
			 */
			for (;;)
			{
				if ((i = n) < HIT0)
				{
					if (n < MACN)
						n++;
					if (!*s)
					{
						n += NMAC;
						break;
					}
					/* entries already >= HIT0 are left alone */
					if (fsm[i][c] < HIT0)
						fsm[i][c] = n;
					if (fsm[i + NMAC][c] < HIT0)
						fsm[i + NMAC][c] = n;
				}
				else
				{
					if (n < HITN)
						n++;
					if (!*s)
						break;
					if (fsm[i][c] < HIT0)
					{
						n -= NMAC;
						fsm[i][c] = n;
					}
				}
				c = *s++;
			}
			/*
			 * NOTE(review): a saved char with a negative value
			 * (plain char signed) also fails this test and is
			 * not restored -- confirm
			 */
			if (x >= 0)
			{
				/* s is at the cut position: restore the saved char */
				*s = x;
				/* truncated name: HITN loops on every ppisidig() char */
				for (n = CHAR_MIN; n <= CHAR_MAX; n++)
					if (ppisidig(n))
						fsm[HITN][n] = HITN;
				n = HITN;
			}
			/* final transition, only ever raised, never lowered */
			if (fsm[i][c] < n)
				fsm[i][c] = n;
			if (i < HIT0 && fsm[i + NMAC][c] < n)
				fsm[i + NMAC][c] = n;
		}
		break;

#endif

	}
}

#if !PROTOMAIN

/*
 * file buffer refill
 * c is current input char
 *
 * on return pp.in->nextchr points at the new 0-terminated input and
 * pp.in->flags has IN_newline and IN_eof updated; when IN_eof was
 * already set on entry nothing is read and nextchr is just backed up
 * by one char
 */

void
refill(register int c)
{
	if (pp.in->flags & IN_eof)
	{
		/* already at eof: no read, c == 0 takes the eof path below */
		pp.in->nextchr--;
		c = 0;
	}
	else
	{
		/*
		 * restart at buffer + PPBAKSIZ with the current char stored
		 * in the byte just before the new data; c becomes the read count
		 */
		*((pp.in->nextchr = pp.in->buffer + PPBAKSIZ) - 1) = c;
		c =
#if PROTOTYPE
		(pp.in->flags & IN_prototype) ? pppread(pp.in->nextchr) :
#endif
		read(pp.in->fd, pp.in->nextchr, PPBUFSIZ);
	}
	if (c > 0)
	{
		/* IN_newline records whether the data ends with a newline */
		if (pp.in->nextchr[c - 1] == '\n')
			pp.in->flags |= IN_newline;
		else
			pp.in->flags &= ~IN_newline;
#if PROTOTYPE
		if (!(pp.in->flags & IN_prototype))
#endif
		/* a short read on an IN_regular input is taken as eof: close now */
		if (c < PPBUFSIZ && (pp.in->flags & IN_regular))
		{
			pp.in->flags |= IN_eof;
			close(pp.in->fd);
			pp.in->fd = -1;
		}
	}
	else
	{
		if (c < 0)
		{
			error(ERROR_SYSTEM|3, "read error");
			c = 0;
		}
		else if ((pp.in->flags ^ pp.in->prev->flags) & IN_c)
		{
			/*
			 * IN_c differs from the previous input level:
			 * toggle it and supply "}\n" as the final input;
			 * ket[3] receives the terminating 0 stored below
			 */
			static char	ket[] = { 0, '}', '\n', 0 };

			pp.in->flags ^= IN_c;
			pp.in->nextchr = ket + 1;
			c = 2;
		}
		pp.in->flags |= IN_eof;
	}
#if CHECKPOINT
	pp.in->buflen = c;
#endif
	/* 0-terminate the new data */
	pp.in->nextchr[c] = 0;
	/* trace at most the first 32 chars of the new data */
	debug((-7, "refill(\"%s\") = %d = \"%-.*s%s\"", error_info.file, c, (c > 32 ? 32 : c), pp.in->nextchr, c > 32 ? "..." : ""));
	/* test bit 0x0080 dumps the whole new data to sfstderr */
	if (pp.test & 0x0080)
		sfprintf(sfstderr, "===== refill(\"%s\") = %d =====\n%s\n===== eob(\"%s\") =====\n", error_info.file, c, pp.in->nextchr, error_info.file);
}

#endif