xref: /dragonfly/contrib/awk/b.c (revision ed569bc2)
14b588458SPeter Avalos /****************************************************************
24b588458SPeter Avalos Copyright (C) Lucent Technologies 1997
34b588458SPeter Avalos All Rights Reserved
44b588458SPeter Avalos 
54b588458SPeter Avalos Permission to use, copy, modify, and distribute this software and
64b588458SPeter Avalos its documentation for any purpose and without fee is hereby
74b588458SPeter Avalos granted, provided that the above copyright notice appear in all
84b588458SPeter Avalos copies and that both that the copyright notice and this
94b588458SPeter Avalos permission notice and warranty disclaimer appear in supporting
104b588458SPeter Avalos documentation, and that the name Lucent Technologies or any of
114b588458SPeter Avalos its entities not be used in advertising or publicity pertaining
124b588458SPeter Avalos to distribution of the software without specific, written prior
134b588458SPeter Avalos permission.
144b588458SPeter Avalos 
154b588458SPeter Avalos LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
164b588458SPeter Avalos INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
174b588458SPeter Avalos IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
184b588458SPeter Avalos SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
194b588458SPeter Avalos WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
204b588458SPeter Avalos IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
214b588458SPeter Avalos ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
224b588458SPeter Avalos THIS SOFTWARE.
234b588458SPeter Avalos ****************************************************************/
244b588458SPeter Avalos 
254b588458SPeter Avalos /* lasciate ogne speranza, voi ch'intrate. */
264b588458SPeter Avalos 
274b588458SPeter Avalos #define	DEBUG
284b588458SPeter Avalos 
294b588458SPeter Avalos #include <ctype.h>
301d48fce0SDaniel Fojt #include <limits.h>
314b588458SPeter Avalos #include <stdio.h>
324b588458SPeter Avalos #include <string.h>
334b588458SPeter Avalos #include <stdlib.h>
344b588458SPeter Avalos #include "awk.h"
3548f09a05SAntonio Huete Jimenez #include "awkgram.tab.h"
364b588458SPeter Avalos 
374b588458SPeter Avalos #define MAXLIN 22
384b588458SPeter Avalos 
394b588458SPeter Avalos #define type(v)		(v)->nobj	/* badly overloaded here */
404b588458SPeter Avalos #define info(v)		(v)->ntype	/* badly overloaded here */
414b588458SPeter Avalos #define left(v)		(v)->narg[0]
424b588458SPeter Avalos #define right(v)	(v)->narg[1]
434b588458SPeter Avalos #define parent(v)	(v)->nnext
444b588458SPeter Avalos 
454b588458SPeter Avalos #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
464b588458SPeter Avalos #define ELEAF	case EMPTYRE:		/* empty string in regexp */
474b588458SPeter Avalos #define UNARY	case STAR: case PLUS: case QUEST:
484b588458SPeter Avalos 
494b588458SPeter Avalos /* encoding in tree Nodes:
504b588458SPeter Avalos 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
514b588458SPeter Avalos 		left is index, right contains value or pointer to value
524b588458SPeter Avalos 	unary (STAR, PLUS, QUEST): left is child, right is null
534b588458SPeter Avalos 	binary (CAT, OR): left and right are children
544b588458SPeter Avalos 	parent contains pointer to parent
554b588458SPeter Avalos */
564b588458SPeter Avalos 
574b588458SPeter Avalos 
584b588458SPeter Avalos int	*setvec;
594b588458SPeter Avalos int	*tmpset;
604b588458SPeter Avalos int	maxsetvec = 0;
614b588458SPeter Avalos 
624b588458SPeter Avalos int	rtok;		/* next token in current re */
634b588458SPeter Avalos int	rlxval;
641d48fce0SDaniel Fojt static const uschar	*rlxstr;
651d48fce0SDaniel Fojt static const uschar	*prestr;	/* current position in current re */
661d48fce0SDaniel Fojt static const uschar	*lastre;	/* origin of last re */
671d48fce0SDaniel Fojt static const uschar	*lastatom;	/* origin of last Atom */
681d48fce0SDaniel Fojt static const uschar	*starttok;
691d48fce0SDaniel Fojt static const uschar 	*basestr;	/* starts with original, replaced during
701d48fce0SDaniel Fojt 				   repetition processing */
711d48fce0SDaniel Fojt static const uschar 	*firstbasestr;
724b588458SPeter Avalos 
734b588458SPeter Avalos static	int setcnt;
744b588458SPeter Avalos static	int poscnt;
754b588458SPeter Avalos 
761d48fce0SDaniel Fojt const char	*patbeg;
774b588458SPeter Avalos int	patlen;
784b588458SPeter Avalos 
791d48fce0SDaniel Fojt #define	NFA	128	/* cache this many dynamic fa's */
804b588458SPeter Avalos fa	*fatab[NFA];
814b588458SPeter Avalos int	nfatab	= 0;	/* entries in fatab */
824b588458SPeter Avalos 
83*ed569bc2SAaron LI extern int u8_nextlen(const char *s);
84*ed569bc2SAaron LI 
85*ed569bc2SAaron LI 
86*ed569bc2SAaron LI /* utf-8 mechanism:
87*ed569bc2SAaron LI 
88*ed569bc2SAaron LI    For most of Awk, utf-8 strings just "work", since they look like
89*ed569bc2SAaron LI    null-terminated sequences of 8-bit bytes.
90*ed569bc2SAaron LI 
91*ed569bc2SAaron LI    Functions like length(), index(), and substr() have to operate
92*ed569bc2SAaron LI    in units of utf-8 characters.  The u8_* functions in run.c
93*ed569bc2SAaron LI    handle this.
94*ed569bc2SAaron LI 
95*ed569bc2SAaron LI    Regular expressions are more complicated, since the basic
96*ed569bc2SAaron LI    mechanism of the goto table used 8-bit byte indices into the
97*ed569bc2SAaron LI    gototab entries to compute the next state.  Unicode is a lot
98*ed569bc2SAaron LI    bigger, so the gototab entries are now structs with a character
99*ed569bc2SAaron LI    and a next state. These are sorted by code point and binary
100*ed569bc2SAaron LI    searched.
101*ed569bc2SAaron LI 
102*ed569bc2SAaron LI    Throughout the RE mechanism in b.c, utf-8 characters are
103*ed569bc2SAaron LI    converted to their utf-32 value.  This mostly shows up in
104*ed569bc2SAaron LI    cclenter, which expands character class ranges like a-z and now
105*ed569bc2SAaron LI    alpha-omega.  The size of a gototab array is still about 256.
106*ed569bc2SAaron LI    This should be dynamic, but for now things work ok for a single
107*ed569bc2SAaron LI    code page of Unicode, which is the most likely case.
108*ed569bc2SAaron LI 
109*ed569bc2SAaron LI    The code changes are localized in run.c and b.c.  I have added a
110*ed569bc2SAaron LI    handful of functions to somewhat better hide the implementation,
111*ed569bc2SAaron LI    but a lot more could be done.
112*ed569bc2SAaron LI 
113*ed569bc2SAaron LI  */
114*ed569bc2SAaron LI 
115*ed569bc2SAaron LI static int entry_cmp(const void *l, const void *r);
116*ed569bc2SAaron LI static int get_gototab(fa*, int, int);
117*ed569bc2SAaron LI static int set_gototab(fa*, int, int, int);
118*ed569bc2SAaron LI static void clear_gototab(fa*, int);
119*ed569bc2SAaron LI extern int u8_rune(int *, const char *);
120*ed569bc2SAaron LI 
1211d48fce0SDaniel Fojt static int *
intalloc(size_t n,const char * f)1221d48fce0SDaniel Fojt intalloc(size_t n, const char *f)
1231d48fce0SDaniel Fojt {
12448f09a05SAntonio Huete Jimenez 	int *p = (int *) calloc(n, sizeof(int));
1251d48fce0SDaniel Fojt 	if (p == NULL)
1261d48fce0SDaniel Fojt 		overflo(f);
1271d48fce0SDaniel Fojt 	return p;
1281d48fce0SDaniel Fojt }
1291d48fce0SDaniel Fojt 
1301d48fce0SDaniel Fojt static void
resizesetvec(const char * f)1311d48fce0SDaniel Fojt resizesetvec(const char *f)
1321d48fce0SDaniel Fojt {
1331d48fce0SDaniel Fojt 	if (maxsetvec == 0)
1341d48fce0SDaniel Fojt 		maxsetvec = MAXLIN;
1351d48fce0SDaniel Fojt 	else
1361d48fce0SDaniel Fojt 		maxsetvec *= 4;
13748f09a05SAntonio Huete Jimenez 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
13848f09a05SAntonio Huete Jimenez 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
1391d48fce0SDaniel Fojt 	if (setvec == NULL || tmpset == NULL)
1401d48fce0SDaniel Fojt 		overflo(f);
1411d48fce0SDaniel Fojt }
1421d48fce0SDaniel Fojt 
1431d48fce0SDaniel Fojt static void
resize_state(fa * f,int state)1441d48fce0SDaniel Fojt resize_state(fa *f, int state)
1451d48fce0SDaniel Fojt {
146*ed569bc2SAaron LI 	gtt *p;
14748f09a05SAntonio Huete Jimenez 	uschar *p2;
14848f09a05SAntonio Huete Jimenez 	int **p3;
1491d48fce0SDaniel Fojt 	int i, new_count;
1501d48fce0SDaniel Fojt 
1511d48fce0SDaniel Fojt 	if (++state < f->state_count)
1521d48fce0SDaniel Fojt 		return;
1531d48fce0SDaniel Fojt 
1541d48fce0SDaniel Fojt 	new_count = state + 10; /* needs to be tuned */
1551d48fce0SDaniel Fojt 
156*ed569bc2SAaron LI 	p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
1571d48fce0SDaniel Fojt 	if (p == NULL)
1581d48fce0SDaniel Fojt 		goto out;
1591d48fce0SDaniel Fojt 	f->gototab = p;
1601d48fce0SDaniel Fojt 
16148f09a05SAntonio Huete Jimenez 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
16248f09a05SAntonio Huete Jimenez 	if (p2 == NULL)
1631d48fce0SDaniel Fojt 		goto out;
16448f09a05SAntonio Huete Jimenez 	f->out = p2;
1651d48fce0SDaniel Fojt 
16648f09a05SAntonio Huete Jimenez 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
16748f09a05SAntonio Huete Jimenez 	if (p3 == NULL)
1681d48fce0SDaniel Fojt 		goto out;
16948f09a05SAntonio Huete Jimenez 	f->posns = p3;
1701d48fce0SDaniel Fojt 
1711d48fce0SDaniel Fojt 	for (i = f->state_count; i < new_count; ++i) {
172*ed569bc2SAaron LI 		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173*ed569bc2SAaron LI 		if (f->gototab[i].entries == NULL)
1741d48fce0SDaniel Fojt 			goto out;
175*ed569bc2SAaron LI 		f->gototab[i].allocated = NCHARS;
176*ed569bc2SAaron LI 		f->gototab[i].inuse = 0;
1771d48fce0SDaniel Fojt 		f->out[i] = 0;
1781d48fce0SDaniel Fojt 		f->posns[i] = NULL;
1791d48fce0SDaniel Fojt 	}
1801d48fce0SDaniel Fojt 	f->state_count = new_count;
1811d48fce0SDaniel Fojt 	return;
1821d48fce0SDaniel Fojt out:
1831d48fce0SDaniel Fojt 	overflo(__func__);
1841d48fce0SDaniel Fojt }
1851d48fce0SDaniel Fojt 
makedfa(const char * s,bool anchor)1861d48fce0SDaniel Fojt fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
1874b588458SPeter Avalos {
1884b588458SPeter Avalos 	int i, use, nuse;
1894b588458SPeter Avalos 	fa *pfa;
1904b588458SPeter Avalos 	static int now = 1;
1914b588458SPeter Avalos 
1921d48fce0SDaniel Fojt 	if (setvec == NULL) {	/* first time through any RE */
1931d48fce0SDaniel Fojt 		resizesetvec(__func__);
1944b588458SPeter Avalos 	}
1954b588458SPeter Avalos 
1961d48fce0SDaniel Fojt 	if (compile_time != RUNNING)	/* a constant for sure */
1974b588458SPeter Avalos 		return mkdfa(s, anchor);
1984b588458SPeter Avalos 	for (i = 0; i < nfatab; i++)	/* is it there already? */
1994b588458SPeter Avalos 		if (fatab[i]->anchor == anchor
2004b588458SPeter Avalos 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
2014b588458SPeter Avalos 			fatab[i]->use = now++;
2024b588458SPeter Avalos 			return fatab[i];
2034b588458SPeter Avalos 		}
2044b588458SPeter Avalos 	pfa = mkdfa(s, anchor);
2054b588458SPeter Avalos 	if (nfatab < NFA) {	/* room for another */
2064b588458SPeter Avalos 		fatab[nfatab] = pfa;
2074b588458SPeter Avalos 		fatab[nfatab]->use = now++;
2084b588458SPeter Avalos 		nfatab++;
2094b588458SPeter Avalos 		return pfa;
2104b588458SPeter Avalos 	}
2114b588458SPeter Avalos 	use = fatab[0]->use;	/* replace least-recently used */
2124b588458SPeter Avalos 	nuse = 0;
2134b588458SPeter Avalos 	for (i = 1; i < nfatab; i++)
2144b588458SPeter Avalos 		if (fatab[i]->use < use) {
2154b588458SPeter Avalos 			use = fatab[i]->use;
2164b588458SPeter Avalos 			nuse = i;
2174b588458SPeter Avalos 		}
2184b588458SPeter Avalos 	freefa(fatab[nuse]);
2194b588458SPeter Avalos 	fatab[nuse] = pfa;
2204b588458SPeter Avalos 	pfa->use = now++;
2214b588458SPeter Avalos 	return pfa;
2224b588458SPeter Avalos }
2234b588458SPeter Avalos 
mkdfa(const char * s,bool anchor)2241d48fce0SDaniel Fojt fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
2251d48fce0SDaniel Fojt 				/* anchor = true for anchored matches, else false */
2264b588458SPeter Avalos {
2274b588458SPeter Avalos 	Node *p, *p1;
2284b588458SPeter Avalos 	fa *f;
2294b588458SPeter Avalos 
2301d48fce0SDaniel Fojt 	firstbasestr = (const uschar *) s;
2311d48fce0SDaniel Fojt 	basestr = firstbasestr;
2324b588458SPeter Avalos 	p = reparse(s);
2334b588458SPeter Avalos 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
2344b588458SPeter Avalos 		/* put ALL STAR in front of reg.  exp. */
2354b588458SPeter Avalos 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
2364b588458SPeter Avalos 		/* put FINAL after reg.  exp. */
2374b588458SPeter Avalos 
2384b588458SPeter Avalos 	poscnt = 0;
2394b588458SPeter Avalos 	penter(p1);	/* enter parent pointers and leaf indices */
24048f09a05SAntonio Huete Jimenez 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
2411d48fce0SDaniel Fojt 		overflo(__func__);
2424b588458SPeter Avalos 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
2434b588458SPeter Avalos 	cfoll(f, p1);	/* set up follow sets */
2444b588458SPeter Avalos 	freetr(p1);
2451d48fce0SDaniel Fojt 	resize_state(f, 1);
2461d48fce0SDaniel Fojt 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
2471d48fce0SDaniel Fojt 	f->posns[1] = intalloc(1, __func__);
2484b588458SPeter Avalos 	*f->posns[1] = 0;
2494b588458SPeter Avalos 	f->initstat = makeinit(f, anchor);
2504b588458SPeter Avalos 	f->anchor = anchor;
2514b588458SPeter Avalos 	f->restr = (uschar *) tostring(s);
2521d48fce0SDaniel Fojt 	if (firstbasestr != basestr) {
2531d48fce0SDaniel Fojt 		if (basestr)
2541d48fce0SDaniel Fojt 			xfree(basestr);
2551d48fce0SDaniel Fojt 	}
2564b588458SPeter Avalos 	return f;
2574b588458SPeter Avalos }
2584b588458SPeter Avalos 
makeinit(fa * f,bool anchor)2591d48fce0SDaniel Fojt int makeinit(fa *f, bool anchor)
2604b588458SPeter Avalos {
2614b588458SPeter Avalos 	int i, k;
2624b588458SPeter Avalos 
2634b588458SPeter Avalos 	f->curstat = 2;
2644b588458SPeter Avalos 	f->out[2] = 0;
2654b588458SPeter Avalos 	k = *(f->re[0].lfollow);
2664b588458SPeter Avalos 	xfree(f->posns[2]);
2671d48fce0SDaniel Fojt 	f->posns[2] = intalloc(k + 1,  __func__);
2684b588458SPeter Avalos 	for (i = 0; i <= k; i++) {
2694b588458SPeter Avalos 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
2704b588458SPeter Avalos 	}
2714b588458SPeter Avalos 	if ((f->posns[2])[1] == f->accept)
2724b588458SPeter Avalos 		f->out[2] = 1;
273*ed569bc2SAaron LI 	clear_gototab(f, 2);
2744b588458SPeter Avalos 	f->curstat = cgoto(f, 2, HAT);
2754b588458SPeter Avalos 	if (anchor) {
2764b588458SPeter Avalos 		*f->posns[2] = k-1;	/* leave out position 0 */
2774b588458SPeter Avalos 		for (i = 0; i < k; i++) {
2784b588458SPeter Avalos 			(f->posns[0])[i] = (f->posns[2])[i];
2794b588458SPeter Avalos 		}
2804b588458SPeter Avalos 
2814b588458SPeter Avalos 		f->out[0] = f->out[2];
2824b588458SPeter Avalos 		if (f->curstat != 2)
2834b588458SPeter Avalos 			--(*f->posns[f->curstat]);
2844b588458SPeter Avalos 	}
2854b588458SPeter Avalos 	return f->curstat;
2864b588458SPeter Avalos }
2874b588458SPeter Avalos 
penter(Node * p)2884b588458SPeter Avalos void penter(Node *p)	/* set up parent pointers and leaf indices */
2894b588458SPeter Avalos {
2904b588458SPeter Avalos 	switch (type(p)) {
2914b588458SPeter Avalos 	ELEAF
2924b588458SPeter Avalos 	LEAF
2934b588458SPeter Avalos 		info(p) = poscnt;
2944b588458SPeter Avalos 		poscnt++;
2954b588458SPeter Avalos 		break;
2964b588458SPeter Avalos 	UNARY
2974b588458SPeter Avalos 		penter(left(p));
2984b588458SPeter Avalos 		parent(left(p)) = p;
2994b588458SPeter Avalos 		break;
3004b588458SPeter Avalos 	case CAT:
3014b588458SPeter Avalos 	case OR:
3024b588458SPeter Avalos 		penter(left(p));
3034b588458SPeter Avalos 		penter(right(p));
3044b588458SPeter Avalos 		parent(left(p)) = p;
3054b588458SPeter Avalos 		parent(right(p)) = p;
3064b588458SPeter Avalos 		break;
3071d48fce0SDaniel Fojt 	case ZERO:
3081d48fce0SDaniel Fojt 		break;
3094b588458SPeter Avalos 	default:	/* can't happen */
3104b588458SPeter Avalos 		FATAL("can't happen: unknown type %d in penter", type(p));
3114b588458SPeter Avalos 		break;
3124b588458SPeter Avalos 	}
3134b588458SPeter Avalos }
3144b588458SPeter Avalos 
freetr(Node * p)3154b588458SPeter Avalos void freetr(Node *p)	/* free parse tree */
3164b588458SPeter Avalos {
3174b588458SPeter Avalos 	switch (type(p)) {
3184b588458SPeter Avalos 	ELEAF
3194b588458SPeter Avalos 	LEAF
3204b588458SPeter Avalos 		xfree(p);
3214b588458SPeter Avalos 		break;
3224b588458SPeter Avalos 	UNARY
3231d48fce0SDaniel Fojt 	case ZERO:
3244b588458SPeter Avalos 		freetr(left(p));
3254b588458SPeter Avalos 		xfree(p);
3264b588458SPeter Avalos 		break;
3274b588458SPeter Avalos 	case CAT:
3284b588458SPeter Avalos 	case OR:
3294b588458SPeter Avalos 		freetr(left(p));
3304b588458SPeter Avalos 		freetr(right(p));
3314b588458SPeter Avalos 		xfree(p);
3324b588458SPeter Avalos 		break;
3334b588458SPeter Avalos 	default:	/* can't happen */
3344b588458SPeter Avalos 		FATAL("can't happen: unknown type %d in freetr", type(p));
3354b588458SPeter Avalos 		break;
3364b588458SPeter Avalos 	}
3374b588458SPeter Avalos }
3384b588458SPeter Avalos 
3394b588458SPeter Avalos /* in the parsing of regular expressions, metacharacters like . have */
3404b588458SPeter Avalos /* to be seen literally;  \056 is not a metacharacter. */
3414b588458SPeter Avalos 
hexstr(const uschar ** pp,int max)342*ed569bc2SAaron LI int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
3434b588458SPeter Avalos {			/* only pick up one 8-bit byte (2 chars) */
3441d48fce0SDaniel Fojt 	const uschar *p;
3454b588458SPeter Avalos 	int n = 0;
3464b588458SPeter Avalos 	int i;
3474b588458SPeter Avalos 
348*ed569bc2SAaron LI 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349*ed569bc2SAaron LI 		if (isdigit((int) *p))
3504b588458SPeter Avalos 			n = 16 * n + *p - '0';
3514b588458SPeter Avalos 		else if (*p >= 'a' && *p <= 'f')
3524b588458SPeter Avalos 			n = 16 * n + *p - 'a' + 10;
3534b588458SPeter Avalos 		else if (*p >= 'A' && *p <= 'F')
3544b588458SPeter Avalos 			n = 16 * n + *p - 'A' + 10;
3554b588458SPeter Avalos 	}
3561d48fce0SDaniel Fojt 	*pp = p;
3574b588458SPeter Avalos 	return n;
3584b588458SPeter Avalos }
3594b588458SPeter Avalos 
360*ed569bc2SAaron LI 
361*ed569bc2SAaron LI 
3624b588458SPeter Avalos #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
3634b588458SPeter Avalos 
quoted(const uschar ** pp)3641d48fce0SDaniel Fojt int quoted(const uschar **pp)	/* pick up next thing after a \\ */
3654b588458SPeter Avalos 			/* and increment *pp */
3664b588458SPeter Avalos {
3671d48fce0SDaniel Fojt 	const uschar *p = *pp;
3684b588458SPeter Avalos 	int c;
3694b588458SPeter Avalos 
370*ed569bc2SAaron LI /* BUG: should advance by utf-8 char even if makes no sense */
371*ed569bc2SAaron LI 
372*ed569bc2SAaron LI 	if ((c = *p++) == 't') {
3734b588458SPeter Avalos 		c = '\t';
374*ed569bc2SAaron LI 	} else if (c == 'n') {
3754b588458SPeter Avalos 		c = '\n';
376*ed569bc2SAaron LI 	} else if (c == 'f') {
3774b588458SPeter Avalos 		c = '\f';
378*ed569bc2SAaron LI 	} else if (c == 'r') {
3794b588458SPeter Avalos 		c = '\r';
380*ed569bc2SAaron LI 	} else if (c == 'b') {
3814b588458SPeter Avalos 		c = '\b';
382*ed569bc2SAaron LI 	} else if (c == 'v') {
3831d48fce0SDaniel Fojt 		c = '\v';
384*ed569bc2SAaron LI 	} else if (c == 'a') {
3851d48fce0SDaniel Fojt 		c = '\a';
386*ed569bc2SAaron LI 	} else if (c == '\\') {
3874b588458SPeter Avalos 		c = '\\';
388*ed569bc2SAaron LI 	} else if (c == 'x') {	/* 2 hex digits follow */
389*ed569bc2SAaron LI 		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
390*ed569bc2SAaron LI 	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
391*ed569bc2SAaron LI 		c = hexstr(&p, 8);
3924b588458SPeter Avalos 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
3934b588458SPeter Avalos 		int n = c - '0';
3944b588458SPeter Avalos 		if (isoctdigit(*p)) {
3954b588458SPeter Avalos 			n = 8 * n + *p++ - '0';
3964b588458SPeter Avalos 			if (isoctdigit(*p))
3974b588458SPeter Avalos 				n = 8 * n + *p++ - '0';
3984b588458SPeter Avalos 		}
3994b588458SPeter Avalos 		c = n;
4004b588458SPeter Avalos 	} /* else */
4014b588458SPeter Avalos 		/* c = c; */
4024b588458SPeter Avalos 	*pp = p;
4034b588458SPeter Avalos 	return c;
4044b588458SPeter Avalos }
4054b588458SPeter Avalos 
cclenter(const char * argp)406*ed569bc2SAaron LI int *cclenter(const char *argp)	/* add a character class */
4074b588458SPeter Avalos {
4084b588458SPeter Avalos 	int i, c, c2;
409*ed569bc2SAaron LI 	int n;
410*ed569bc2SAaron LI 	const uschar *p = (const uschar *) argp;
411*ed569bc2SAaron LI 	int *bp, *retp;
412*ed569bc2SAaron LI 	static int *buf = NULL;
4134b588458SPeter Avalos 	static int bufsz = 100;
4144b588458SPeter Avalos 
415*ed569bc2SAaron LI 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
4164b588458SPeter Avalos 		FATAL("out of space for character class [%.10s...] 1", p);
4174b588458SPeter Avalos 	bp = buf;
418*ed569bc2SAaron LI 	for (i = 0; *p != 0; ) {
419*ed569bc2SAaron LI 		n = u8_rune(&c, (const char *) p);
420*ed569bc2SAaron LI 		p += n;
4214b588458SPeter Avalos 		if (c == '\\') {
422b12bae18SSascha Wildner 			c = quoted(&p);
4234b588458SPeter Avalos 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
4244b588458SPeter Avalos 			if (*p != 0) {
4254b588458SPeter Avalos 				c = bp[-1];
426*ed569bc2SAaron LI 				/* c2 = *p++; */
427*ed569bc2SAaron LI 				n = u8_rune(&c2, (const char *) p);
428*ed569bc2SAaron LI 				p += n;
4294b588458SPeter Avalos 				if (c2 == '\\')
430*ed569bc2SAaron LI 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
4314b588458SPeter Avalos 				if (c > c2) {	/* empty; ignore */
4324b588458SPeter Avalos 					bp--;
4334b588458SPeter Avalos 					i--;
4344b588458SPeter Avalos 					continue;
4354b588458SPeter Avalos 				}
4364b588458SPeter Avalos 				while (c < c2) {
437*ed569bc2SAaron LI 					if (i >= bufsz) {
438*ed569bc2SAaron LI 						bufsz *= 2;
439*ed569bc2SAaron LI 						buf = (int *) realloc(buf, bufsz * sizeof(int));
440*ed569bc2SAaron LI 						if (buf == NULL)
4414b588458SPeter Avalos 							FATAL("out of space for character class [%.10s...] 2", p);
442*ed569bc2SAaron LI 						bp = buf + i;
443*ed569bc2SAaron LI 					}
4444b588458SPeter Avalos 					*bp++ = ++c;
4454b588458SPeter Avalos 					i++;
4464b588458SPeter Avalos 				}
4474b588458SPeter Avalos 				continue;
4484b588458SPeter Avalos 			}
4494b588458SPeter Avalos 		}
450*ed569bc2SAaron LI 		if (i >= bufsz) {
451*ed569bc2SAaron LI 			bufsz *= 2;
452*ed569bc2SAaron LI 			buf = (int *) realloc(buf, bufsz * sizeof(int));
453*ed569bc2SAaron LI 			if (buf == NULL)
454*ed569bc2SAaron LI 				FATAL("out of space for character class [%.10s...] 2", p);
455*ed569bc2SAaron LI 			bp = buf + i;
456*ed569bc2SAaron LI 		}
4574b588458SPeter Avalos 		*bp++ = c;
4584b588458SPeter Avalos 		i++;
4594b588458SPeter Avalos 	}
4604b588458SPeter Avalos 	*bp = 0;
461*ed569bc2SAaron LI 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
462*ed569bc2SAaron LI 	/* xfree(op);  BUG: what are we freeing here? */
463*ed569bc2SAaron LI 	retp = (int *) calloc(bp-buf+1, sizeof(int));
464*ed569bc2SAaron LI 	for (i = 0; i < bp-buf+1; i++)
465*ed569bc2SAaron LI 		retp[i] = buf[i];
466*ed569bc2SAaron LI 	return retp;
4674b588458SPeter Avalos }
4684b588458SPeter Avalos 
overflo(const char * s)4694b588458SPeter Avalos void overflo(const char *s)
4704b588458SPeter Avalos {
4711d48fce0SDaniel Fojt 	FATAL("regular expression too big: out of space in %.30s...", s);
4724b588458SPeter Avalos }
4734b588458SPeter Avalos 
cfoll(fa * f,Node * v)4744b588458SPeter Avalos void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
4754b588458SPeter Avalos {
4764b588458SPeter Avalos 	int i;
4774b588458SPeter Avalos 	int *p;
4784b588458SPeter Avalos 
4794b588458SPeter Avalos 	switch (type(v)) {
4804b588458SPeter Avalos 	ELEAF
4814b588458SPeter Avalos 	LEAF
4824b588458SPeter Avalos 		f->re[info(v)].ltype = type(v);
4834b588458SPeter Avalos 		f->re[info(v)].lval.np = right(v);
4844b588458SPeter Avalos 		while (f->accept >= maxsetvec) {	/* guessing here! */
4851d48fce0SDaniel Fojt 			resizesetvec(__func__);
4864b588458SPeter Avalos 		}
4874b588458SPeter Avalos 		for (i = 0; i <= f->accept; i++)
4884b588458SPeter Avalos 			setvec[i] = 0;
4894b588458SPeter Avalos 		setcnt = 0;
4904b588458SPeter Avalos 		follow(v);	/* computes setvec and setcnt */
4911d48fce0SDaniel Fojt 		p = intalloc(setcnt + 1, __func__);
4924b588458SPeter Avalos 		f->re[info(v)].lfollow = p;
4934b588458SPeter Avalos 		*p = setcnt;
4944b588458SPeter Avalos 		for (i = f->accept; i >= 0; i--)
4954b588458SPeter Avalos 			if (setvec[i] == 1)
4964b588458SPeter Avalos 				*++p = i;
4974b588458SPeter Avalos 		break;
4984b588458SPeter Avalos 	UNARY
4994b588458SPeter Avalos 		cfoll(f,left(v));
5004b588458SPeter Avalos 		break;
5014b588458SPeter Avalos 	case CAT:
5024b588458SPeter Avalos 	case OR:
5034b588458SPeter Avalos 		cfoll(f,left(v));
5044b588458SPeter Avalos 		cfoll(f,right(v));
5054b588458SPeter Avalos 		break;
5061d48fce0SDaniel Fojt 	case ZERO:
5071d48fce0SDaniel Fojt 		break;
5084b588458SPeter Avalos 	default:	/* can't happen */
5094b588458SPeter Avalos 		FATAL("can't happen: unknown type %d in cfoll", type(v));
5104b588458SPeter Avalos 	}
5114b588458SPeter Avalos }
5124b588458SPeter Avalos 
first(Node * p)5134b588458SPeter Avalos int first(Node *p)	/* collects initially active leaves of p into setvec */
5144b588458SPeter Avalos 			/* returns 0 if p matches empty string */
5154b588458SPeter Avalos {
5164b588458SPeter Avalos 	int b, lp;
5174b588458SPeter Avalos 
5184b588458SPeter Avalos 	switch (type(p)) {
5194b588458SPeter Avalos 	ELEAF
5204b588458SPeter Avalos 	LEAF
5214b588458SPeter Avalos 		lp = info(p);	/* look for high-water mark of subscripts */
5224b588458SPeter Avalos 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
5231d48fce0SDaniel Fojt 			resizesetvec(__func__);
5244b588458SPeter Avalos 		}
5254b588458SPeter Avalos 		if (type(p) == EMPTYRE) {
5264b588458SPeter Avalos 			setvec[lp] = 0;
5274b588458SPeter Avalos 			return(0);
5284b588458SPeter Avalos 		}
5294b588458SPeter Avalos 		if (setvec[lp] != 1) {
5304b588458SPeter Avalos 			setvec[lp] = 1;
5314b588458SPeter Avalos 			setcnt++;
5324b588458SPeter Avalos 		}
533*ed569bc2SAaron LI 		if (type(p) == CCL && (*(int *) right(p)) == 0)
5344b588458SPeter Avalos 			return(0);		/* empty CCL */
5351d48fce0SDaniel Fojt 		return(1);
5364b588458SPeter Avalos 	case PLUS:
5371d48fce0SDaniel Fojt 		if (first(left(p)) == 0)
5381d48fce0SDaniel Fojt 			return(0);
5394b588458SPeter Avalos 		return(1);
5404b588458SPeter Avalos 	case STAR:
5414b588458SPeter Avalos 	case QUEST:
5424b588458SPeter Avalos 		first(left(p));
5434b588458SPeter Avalos 		return(0);
5444b588458SPeter Avalos 	case CAT:
5454b588458SPeter Avalos 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
5464b588458SPeter Avalos 		return(1);
5474b588458SPeter Avalos 	case OR:
5484b588458SPeter Avalos 		b = first(right(p));
5494b588458SPeter Avalos 		if (first(left(p)) == 0 || b == 0) return(0);
5504b588458SPeter Avalos 		return(1);
5511d48fce0SDaniel Fojt 	case ZERO:
5521d48fce0SDaniel Fojt 		return 0;
5534b588458SPeter Avalos 	}
5544b588458SPeter Avalos 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
5554b588458SPeter Avalos 	return(-1);
5564b588458SPeter Avalos }
5574b588458SPeter Avalos 
follow(Node * v)5584b588458SPeter Avalos void follow(Node *v)	/* collects leaves that can follow v into setvec */
5594b588458SPeter Avalos {
5604b588458SPeter Avalos 	Node *p;
5614b588458SPeter Avalos 
5624b588458SPeter Avalos 	if (type(v) == FINAL)
5634b588458SPeter Avalos 		return;
5644b588458SPeter Avalos 	p = parent(v);
5654b588458SPeter Avalos 	switch (type(p)) {
5664b588458SPeter Avalos 	case STAR:
5674b588458SPeter Avalos 	case PLUS:
5684b588458SPeter Avalos 		first(v);
5694b588458SPeter Avalos 		follow(p);
5704b588458SPeter Avalos 		return;
5714b588458SPeter Avalos 
5724b588458SPeter Avalos 	case OR:
5734b588458SPeter Avalos 	case QUEST:
5744b588458SPeter Avalos 		follow(p);
5754b588458SPeter Avalos 		return;
5764b588458SPeter Avalos 
5774b588458SPeter Avalos 	case CAT:
5784b588458SPeter Avalos 		if (v == left(p)) {	/* v is left child of p */
5794b588458SPeter Avalos 			if (first(right(p)) == 0) {
5804b588458SPeter Avalos 				follow(p);
5814b588458SPeter Avalos 				return;
5824b588458SPeter Avalos 			}
5834b588458SPeter Avalos 		} else		/* v is right child */
5844b588458SPeter Avalos 			follow(p);
5854b588458SPeter Avalos 		return;
5864b588458SPeter Avalos 	}
5874b588458SPeter Avalos }
5884b588458SPeter Avalos 
member(int c,int * sarg)589*ed569bc2SAaron LI int member(int c, int *sarg)	/* is c in s? */
5904b588458SPeter Avalos {
591*ed569bc2SAaron LI 	int *s = (int *) sarg;
5924b588458SPeter Avalos 
5934b588458SPeter Avalos 	while (*s)
5944b588458SPeter Avalos 		if (c == *s++)
5954b588458SPeter Avalos 			return(1);
5964b588458SPeter Avalos 	return(0);
5974b588458SPeter Avalos }
5984b588458SPeter Avalos 
resize_gototab(fa * f,int state)599*ed569bc2SAaron LI static void resize_gototab(fa *f, int state)
600*ed569bc2SAaron LI {
601*ed569bc2SAaron LI 	size_t new_size = f->gototab[state].allocated * 2;
602*ed569bc2SAaron LI 	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
603*ed569bc2SAaron LI 	if (p == NULL)
604*ed569bc2SAaron LI 		overflo(__func__);
605*ed569bc2SAaron LI 
606*ed569bc2SAaron LI 	// need to initialized the new memory to zero
607*ed569bc2SAaron LI 	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
608*ed569bc2SAaron LI 	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
609*ed569bc2SAaron LI 
610*ed569bc2SAaron LI 	f->gototab[state].allocated = new_size;			// update gototab info
611*ed569bc2SAaron LI 	f->gototab[state].entries = p;
612*ed569bc2SAaron LI }
613*ed569bc2SAaron LI 
get_gototab(fa * f,int state,int ch)614*ed569bc2SAaron LI static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
615*ed569bc2SAaron LI {
616*ed569bc2SAaron LI 	gtte key;
617*ed569bc2SAaron LI 	gtte *item;
618*ed569bc2SAaron LI 
619*ed569bc2SAaron LI 	key.ch = ch;
620*ed569bc2SAaron LI 	key.state = 0;	/* irrelevant */
621*ed569bc2SAaron LI 	item = (gtte *) bsearch(& key, f->gototab[state].entries,
622*ed569bc2SAaron LI 			f->gototab[state].inuse, sizeof(gtte),
623*ed569bc2SAaron LI 			entry_cmp);
624*ed569bc2SAaron LI 
625*ed569bc2SAaron LI 	if (item == NULL)
626*ed569bc2SAaron LI 		return 0;
627*ed569bc2SAaron LI 	else
628*ed569bc2SAaron LI 		return item->state;
629*ed569bc2SAaron LI }
630*ed569bc2SAaron LI 
entry_cmp(const void * l,const void * r)631*ed569bc2SAaron LI static int entry_cmp(const void *l, const void *r)
632*ed569bc2SAaron LI {
633*ed569bc2SAaron LI 	const gtte *left, *right;
634*ed569bc2SAaron LI 
635*ed569bc2SAaron LI 	left = (const gtte *) l;
636*ed569bc2SAaron LI 	right = (const gtte *) r;
637*ed569bc2SAaron LI 
638*ed569bc2SAaron LI 	return left->ch - right->ch;
639*ed569bc2SAaron LI }
640*ed569bc2SAaron LI 
set_gototab(fa * f,int state,int ch,int val)641*ed569bc2SAaron LI static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
642*ed569bc2SAaron LI {
643*ed569bc2SAaron LI 	if (f->gototab[state].inuse == 0) {
644*ed569bc2SAaron LI 		f->gototab[state].entries[0].ch = ch;
645*ed569bc2SAaron LI 		f->gototab[state].entries[0].state = val;
646*ed569bc2SAaron LI 		f->gototab[state].inuse++;
647*ed569bc2SAaron LI 		return val;
648*ed569bc2SAaron LI 	} else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
649*ed569bc2SAaron LI 		// not seen yet, insert and return
650*ed569bc2SAaron LI 		gtt *tab = & f->gototab[state];
651*ed569bc2SAaron LI 		if (tab->inuse + 1 >= tab->allocated)
652*ed569bc2SAaron LI 			resize_gototab(f, state);
653*ed569bc2SAaron LI 
654*ed569bc2SAaron LI 		f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch;
655*ed569bc2SAaron LI 		f->gototab[state].entries[f->gototab[state].inuse-1].state = val;
656*ed569bc2SAaron LI 		f->gototab[state].inuse++;
657*ed569bc2SAaron LI 		return val;
658*ed569bc2SAaron LI 	} else {
659*ed569bc2SAaron LI 		// maybe we have it, maybe we don't
660*ed569bc2SAaron LI 		gtte key;
661*ed569bc2SAaron LI 		gtte *item;
662*ed569bc2SAaron LI 
663*ed569bc2SAaron LI 		key.ch = ch;
664*ed569bc2SAaron LI 		key.state = 0;	/* irrelevant */
665*ed569bc2SAaron LI 		item = (gtte *) bsearch(& key, f->gototab[state].entries,
666*ed569bc2SAaron LI 				f->gototab[state].inuse, sizeof(gtte),
667*ed569bc2SAaron LI 				entry_cmp);
668*ed569bc2SAaron LI 
669*ed569bc2SAaron LI 		if (item != NULL) {
670*ed569bc2SAaron LI 			// we have it, update state and return
671*ed569bc2SAaron LI 			item->state = val;
672*ed569bc2SAaron LI 			return item->state;
673*ed569bc2SAaron LI 		}
674*ed569bc2SAaron LI 		// otherwise, fall through to insert and reallocate.
675*ed569bc2SAaron LI 	}
676*ed569bc2SAaron LI 
677*ed569bc2SAaron LI 	gtt *tab = & f->gototab[state];
678*ed569bc2SAaron LI 	if (tab->inuse + 1 >= tab->allocated)
679*ed569bc2SAaron LI 		resize_gototab(f, state);
680*ed569bc2SAaron LI 	++tab->inuse;
681*ed569bc2SAaron LI 	f->gototab[state].entries[tab->inuse].ch = ch;
682*ed569bc2SAaron LI 	f->gototab[state].entries[tab->inuse].state = val;
683*ed569bc2SAaron LI 
684*ed569bc2SAaron LI 	qsort(f->gototab[state].entries,
685*ed569bc2SAaron LI 		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
686*ed569bc2SAaron LI 
687*ed569bc2SAaron LI 	return val; /* not used anywhere at the moment */
688*ed569bc2SAaron LI }
689*ed569bc2SAaron LI 
clear_gototab(fa * f,int state)690*ed569bc2SAaron LI static void clear_gototab(fa *f, int state)
691*ed569bc2SAaron LI {
692*ed569bc2SAaron LI 	memset(f->gototab[state].entries, 0,
693*ed569bc2SAaron LI 		f->gototab[state].allocated * sizeof(gtte));
694*ed569bc2SAaron LI 	f->gototab[state].inuse = 0;
695*ed569bc2SAaron LI }
696*ed569bc2SAaron LI 
match(fa * f,const char * p0)6974b588458SPeter Avalos int match(fa *f, const char *p0)	/* shortest match ? */
6984b588458SPeter Avalos {
6994b588458SPeter Avalos 	int s, ns;
700*ed569bc2SAaron LI 	int n;
701*ed569bc2SAaron LI 	int rune;
7021d48fce0SDaniel Fojt 	const uschar *p = (const uschar *) p0;
7034b588458SPeter Avalos 
704*ed569bc2SAaron LI 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
705*ed569bc2SAaron LI 
7061d48fce0SDaniel Fojt 	s = f->initstat;
7071d48fce0SDaniel Fojt 	assert (s < f->state_count);
7081d48fce0SDaniel Fojt 
7094b588458SPeter Avalos 	if (f->out[s])
7104b588458SPeter Avalos 		return(1);
7114b588458SPeter Avalos 	do {
7124b588458SPeter Avalos 		/* assert(*p < NCHARS); */
713*ed569bc2SAaron LI 		n = u8_rune(&rune, (const char *) p);
714*ed569bc2SAaron LI 		if ((ns = get_gototab(f, s, rune)) != 0)
7154b588458SPeter Avalos 			s = ns;
7164b588458SPeter Avalos 		else
717*ed569bc2SAaron LI 			s = cgoto(f, s, rune);
7184b588458SPeter Avalos 		if (f->out[s])
7194b588458SPeter Avalos 			return(1);
720*ed569bc2SAaron LI 		if (*p == 0)
721*ed569bc2SAaron LI 			break;
722*ed569bc2SAaron LI 		p += n;
723*ed569bc2SAaron LI 	} while (1);  /* was *p++ != 0 */
7244b588458SPeter Avalos 	return(0);
7254b588458SPeter Avalos }
7264b588458SPeter Avalos 
pmatch(fa * f,const char * p0)7274b588458SPeter Avalos int pmatch(fa *f, const char *p0)	/* longest match, for sub */
7284b588458SPeter Avalos {
7294b588458SPeter Avalos 	int s, ns;
730*ed569bc2SAaron LI 	int n;
731*ed569bc2SAaron LI 	int rune;
7321d48fce0SDaniel Fojt 	const uschar *p = (const uschar *) p0;
7331d48fce0SDaniel Fojt 	const uschar *q;
7344b588458SPeter Avalos 
7354b588458SPeter Avalos 	s = f->initstat;
7361d48fce0SDaniel Fojt 	assert(s < f->state_count);
7371d48fce0SDaniel Fojt 
7381d48fce0SDaniel Fojt 	patbeg = (const char *)p;
7394b588458SPeter Avalos 	patlen = -1;
7404b588458SPeter Avalos 	do {
7414b588458SPeter Avalos 		q = p;
7424b588458SPeter Avalos 		do {
7434b588458SPeter Avalos 			if (f->out[s])		/* final state */
7444b588458SPeter Avalos 				patlen = q-p;
7454b588458SPeter Avalos 			/* assert(*q < NCHARS); */
746*ed569bc2SAaron LI 			n = u8_rune(&rune, (const char *) q);
747*ed569bc2SAaron LI 			if ((ns = get_gototab(f, s, rune)) != 0)
7484b588458SPeter Avalos 				s = ns;
7494b588458SPeter Avalos 			else
750*ed569bc2SAaron LI 				s = cgoto(f, s, rune);
7511d48fce0SDaniel Fojt 
7521d48fce0SDaniel Fojt 			assert(s < f->state_count);
7531d48fce0SDaniel Fojt 
7544b588458SPeter Avalos 			if (s == 1) {	/* no transition */
7554b588458SPeter Avalos 				if (patlen >= 0) {
7561d48fce0SDaniel Fojt 					patbeg = (const char *) p;
7574b588458SPeter Avalos 					return(1);
7584b588458SPeter Avalos 				}
7594b588458SPeter Avalos 				else
7604b588458SPeter Avalos 					goto nextin;	/* no match */
7614b588458SPeter Avalos 			}
762*ed569bc2SAaron LI 			if (*q == 0)
763*ed569bc2SAaron LI 				break;
764*ed569bc2SAaron LI 			q += n;
765*ed569bc2SAaron LI 		} while (1);
766*ed569bc2SAaron LI 		q++;  /* was *q++ */
7674b588458SPeter Avalos 		if (f->out[s])
7684b588458SPeter Avalos 			patlen = q-p-1;	/* don't count $ */
7694b588458SPeter Avalos 		if (patlen >= 0) {
7701d48fce0SDaniel Fojt 			patbeg = (const char *) p;
7714b588458SPeter Avalos 			return(1);
7724b588458SPeter Avalos 		}
7734b588458SPeter Avalos 	nextin:
7744b588458SPeter Avalos 		s = 2;
775*ed569bc2SAaron LI 		if (*p == 0)
776*ed569bc2SAaron LI 			break;
777*ed569bc2SAaron LI 		n = u8_rune(&rune, (const char *) p);
778*ed569bc2SAaron LI 		p += n;
779*ed569bc2SAaron LI 	} while (1); /* was *p++ */
7804b588458SPeter Avalos 	return (0);
7814b588458SPeter Avalos }
7824b588458SPeter Avalos 
nematch(fa * f,const char * p0)7834b588458SPeter Avalos int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
7844b588458SPeter Avalos {
7854b588458SPeter Avalos 	int s, ns;
786*ed569bc2SAaron LI         int n;
787*ed569bc2SAaron LI         int rune;
7881d48fce0SDaniel Fojt 	const uschar *p = (const uschar *) p0;
7891d48fce0SDaniel Fojt 	const uschar *q;
7904b588458SPeter Avalos 
7914b588458SPeter Avalos 	s = f->initstat;
7921d48fce0SDaniel Fojt 	assert(s < f->state_count);
7931d48fce0SDaniel Fojt 
7941d48fce0SDaniel Fojt 	patbeg = (const char *)p;
7954b588458SPeter Avalos 	patlen = -1;
7964b588458SPeter Avalos 	while (*p) {
7974b588458SPeter Avalos 		q = p;
7984b588458SPeter Avalos 		do {
7994b588458SPeter Avalos 			if (f->out[s])		/* final state */
8004b588458SPeter Avalos 				patlen = q-p;
8014b588458SPeter Avalos 			/* assert(*q < NCHARS); */
802*ed569bc2SAaron LI 			n = u8_rune(&rune, (const char *) q);
803*ed569bc2SAaron LI 			if ((ns = get_gototab(f, s, rune)) != 0)
8044b588458SPeter Avalos 				s = ns;
8054b588458SPeter Avalos 			else
806*ed569bc2SAaron LI 				s = cgoto(f, s, rune);
8074b588458SPeter Avalos 			if (s == 1) {	/* no transition */
8084b588458SPeter Avalos 				if (patlen > 0) {
8091d48fce0SDaniel Fojt 					patbeg = (const char *) p;
8104b588458SPeter Avalos 					return(1);
8114b588458SPeter Avalos 				} else
8124b588458SPeter Avalos 					goto nnextin;	/* no nonempty match */
8134b588458SPeter Avalos 			}
814*ed569bc2SAaron LI 			if (*q == 0)
815*ed569bc2SAaron LI 				break;
816*ed569bc2SAaron LI 			q += n;
817*ed569bc2SAaron LI 		} while (1);
818*ed569bc2SAaron LI 		q++;
8194b588458SPeter Avalos 		if (f->out[s])
8204b588458SPeter Avalos 			patlen = q-p-1;	/* don't count $ */
8214b588458SPeter Avalos 		if (patlen > 0 ) {
8221d48fce0SDaniel Fojt 			patbeg = (const char *) p;
8234b588458SPeter Avalos 			return(1);
8244b588458SPeter Avalos 		}
8254b588458SPeter Avalos 	nnextin:
8264b588458SPeter Avalos 		s = 2;
8274b588458SPeter Avalos 		p++;
8284b588458SPeter Avalos 	}
8294b588458SPeter Avalos 	return (0);
8304b588458SPeter Avalos }
8314b588458SPeter Avalos 
8321d48fce0SDaniel Fojt 
833*ed569bc2SAaron LI #define MAX_UTF_BYTES	4	// UTF-8 is up to 4 bytes long
834*ed569bc2SAaron LI 
8351d48fce0SDaniel Fojt /*
8361d48fce0SDaniel Fojt  * NAME
8371d48fce0SDaniel Fojt  *     fnematch
8381d48fce0SDaniel Fojt  *
8391d48fce0SDaniel Fojt  * DESCRIPTION
8401d48fce0SDaniel Fojt  *     A stream-fed version of nematch which transfers characters to a
8411d48fce0SDaniel Fojt  *     null-terminated buffer. All characters up to and including the last
8421d48fce0SDaniel Fojt  *     character of the matching text or EOF are placed in the buffer. If
8431d48fce0SDaniel Fojt  *     a match is found, patbeg and patlen are set appropriately.
8441d48fce0SDaniel Fojt  *
8451d48fce0SDaniel Fojt  * RETURN VALUES
8461d48fce0SDaniel Fojt  *     false    No match found.
8471d48fce0SDaniel Fojt  *     true     Match found.
8481d48fce0SDaniel Fojt  */
8491d48fce0SDaniel Fojt 
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)8501d48fce0SDaniel Fojt bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
8511d48fce0SDaniel Fojt {
852*ed569bc2SAaron LI 	char *i, *j, *k, *buf = *pbuf;
8531d48fce0SDaniel Fojt 	int bufsize = *pbufsize;
854*ed569bc2SAaron LI 	int c, n, ns, s;
8551d48fce0SDaniel Fojt 
8561d48fce0SDaniel Fojt 	s = pfa->initstat;
8571d48fce0SDaniel Fojt 	patlen = 0;
8581d48fce0SDaniel Fojt 
8591d48fce0SDaniel Fojt 	/*
860*ed569bc2SAaron LI 	 * buf <= i <= j <= k <= buf+bufsize
8611d48fce0SDaniel Fojt 	 *
8621d48fce0SDaniel Fojt 	 * i: origin of active substring
8631d48fce0SDaniel Fojt 	 * j: current character
864*ed569bc2SAaron LI 	 * k: destination of the next getc
8651d48fce0SDaniel Fojt 	 */
8661d48fce0SDaniel Fojt 
867*ed569bc2SAaron LI 	i = j = k = buf;
868*ed569bc2SAaron LI 
869*ed569bc2SAaron LI 	do {
870*ed569bc2SAaron LI 		/*
871*ed569bc2SAaron LI 		 * Call u8_rune with at least MAX_UTF_BYTES ahead in
872*ed569bc2SAaron LI 		 * the buffer until EOF interferes.
873*ed569bc2SAaron LI 		 */
874*ed569bc2SAaron LI 		if (k - j < MAX_UTF_BYTES) {
875*ed569bc2SAaron LI 			if (k + MAX_UTF_BYTES > buf + bufsize) {
876*ed569bc2SAaron LI 				adjbuf((char **) &buf, &bufsize,
877*ed569bc2SAaron LI 				    bufsize + MAX_UTF_BYTES,
878*ed569bc2SAaron LI 				    quantum, 0, "fnematch");
879*ed569bc2SAaron LI 			}
880*ed569bc2SAaron LI 			for (n = MAX_UTF_BYTES ; n > 0; n--) {
881*ed569bc2SAaron LI 				*k++ = (c = getc(f)) != EOF ? c : 0;
882*ed569bc2SAaron LI 				if (c == EOF) {
883*ed569bc2SAaron LI 					if (ferror(f))
884*ed569bc2SAaron LI 						FATAL("fnematch: getc error");
885*ed569bc2SAaron LI 					break;
886*ed569bc2SAaron LI 				}
887*ed569bc2SAaron LI 			}
888*ed569bc2SAaron LI 		}
889*ed569bc2SAaron LI 
890*ed569bc2SAaron LI 		j += u8_rune(&c, j);
891*ed569bc2SAaron LI 
892*ed569bc2SAaron LI 		if ((ns = get_gototab(pfa, s, c)) != 0)
8931d48fce0SDaniel Fojt 			s = ns;
8941d48fce0SDaniel Fojt 		else
8951d48fce0SDaniel Fojt 			s = cgoto(pfa, s, c);
8961d48fce0SDaniel Fojt 
8971d48fce0SDaniel Fojt 		if (pfa->out[s]) {	/* final state */
898*ed569bc2SAaron LI 			patbeg = i;
899*ed569bc2SAaron LI 			patlen = j - i;
9001d48fce0SDaniel Fojt 			if (c == 0)	/* don't count $ */
9011d48fce0SDaniel Fojt 				patlen--;
9021d48fce0SDaniel Fojt 		}
903*ed569bc2SAaron LI 
904*ed569bc2SAaron LI 		if (c && s != 1)
905*ed569bc2SAaron LI 			continue;  /* origin i still viable, next j */
906*ed569bc2SAaron LI 		if (patlen)
907*ed569bc2SAaron LI 			break;     /* best match found */
908*ed569bc2SAaron LI 
909*ed569bc2SAaron LI 		/* no match at origin i, next i and start over */
910*ed569bc2SAaron LI 		i += u8_rune(&c, i);
911*ed569bc2SAaron LI 		if (c == 0)
912*ed569bc2SAaron LI 			break;    /* no match */
913*ed569bc2SAaron LI 		j = i;
9141d48fce0SDaniel Fojt 		s = 2;
915*ed569bc2SAaron LI 	} while (1);
9161d48fce0SDaniel Fojt 
9171d48fce0SDaniel Fojt 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
9181d48fce0SDaniel Fojt 	*pbuf = buf;
9191d48fce0SDaniel Fojt 	*pbufsize = bufsize;
9201d48fce0SDaniel Fojt 
9211d48fce0SDaniel Fojt 	if (patlen) {
9221d48fce0SDaniel Fojt 		/*
9231d48fce0SDaniel Fojt 		 * Under no circumstances is the last character fed to
9241d48fce0SDaniel Fojt 		 * the automaton part of the match. It is EOF's nullbyte,
9251d48fce0SDaniel Fojt 		 * or it sent the automaton into a state with no further
9261d48fce0SDaniel Fojt 		 * transitions available (s==1), or both. Room for a
9271d48fce0SDaniel Fojt 		 * terminating nullbyte is guaranteed.
9281d48fce0SDaniel Fojt 		 *
9291d48fce0SDaniel Fojt 		 * ungetc any chars after the end of matching text
9301d48fce0SDaniel Fojt 		 * (except for EOF's nullbyte, if present) and null
9311d48fce0SDaniel Fojt 		 * terminate the buffer.
9321d48fce0SDaniel Fojt 		 */
9331d48fce0SDaniel Fojt 		do
934*ed569bc2SAaron LI 			if (*--k && ungetc(*k, f) == EOF)
935*ed569bc2SAaron LI 				FATAL("unable to ungetc '%c'", *k);
936*ed569bc2SAaron LI 		while (k > patbeg + patlen);
937*ed569bc2SAaron LI 		*k = '\0';
9381d48fce0SDaniel Fojt 		return true;
9391d48fce0SDaniel Fojt 	}
9401d48fce0SDaniel Fojt 	else
9411d48fce0SDaniel Fojt 		return false;
9421d48fce0SDaniel Fojt }
9431d48fce0SDaniel Fojt 
reparse(const char * p)9444b588458SPeter Avalos Node *reparse(const char *p)	/* parses regular expression pointed to by p */
9454b588458SPeter Avalos {			/* uses relex() to scan regular expression */
9464b588458SPeter Avalos 	Node *np;
9474b588458SPeter Avalos 
948e5e686a0SDaniel Fojt 	DPRINTF("reparse <%s>\n", p);
9491d48fce0SDaniel Fojt 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
9504b588458SPeter Avalos 	rtok = relex();
9514b588458SPeter Avalos 	/* GNU compatibility: an empty regexp matches anything */
9524b588458SPeter Avalos 	if (rtok == '\0') {
9534b588458SPeter Avalos 		/* FATAL("empty regular expression"); previous */
9544b588458SPeter Avalos 		return(op2(EMPTYRE, NIL, NIL));
9554b588458SPeter Avalos 	}
9564b588458SPeter Avalos 	np = regexp();
9574b588458SPeter Avalos 	if (rtok != '\0')
9584b588458SPeter Avalos 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
9594b588458SPeter Avalos 	return(np);
9604b588458SPeter Avalos }
9614b588458SPeter Avalos 
regexp(void)9624b588458SPeter Avalos Node *regexp(void)	/* top-level parse of reg expr */
9634b588458SPeter Avalos {
9644b588458SPeter Avalos 	return (alt(concat(primary())));
9654b588458SPeter Avalos }
9664b588458SPeter Avalos 
primary(void)9674b588458SPeter Avalos Node *primary(void)
9684b588458SPeter Avalos {
9694b588458SPeter Avalos 	Node *np;
9701d48fce0SDaniel Fojt 	int savelastatom;
9714b588458SPeter Avalos 
9724b588458SPeter Avalos 	switch (rtok) {
9734b588458SPeter Avalos 	case CHAR:
9741d48fce0SDaniel Fojt 		lastatom = starttok;
9754b588458SPeter Avalos 		np = op2(CHAR, NIL, itonp(rlxval));
9764b588458SPeter Avalos 		rtok = relex();
9774b588458SPeter Avalos 		return (unary(np));
9784b588458SPeter Avalos 	case ALL:
9794b588458SPeter Avalos 		rtok = relex();
9804b588458SPeter Avalos 		return (unary(op2(ALL, NIL, NIL)));
9814b588458SPeter Avalos 	case EMPTYRE:
9824b588458SPeter Avalos 		rtok = relex();
9831d48fce0SDaniel Fojt 		return (unary(op2(EMPTYRE, NIL, NIL)));
9844b588458SPeter Avalos 	case DOT:
9851d48fce0SDaniel Fojt 		lastatom = starttok;
9864b588458SPeter Avalos 		rtok = relex();
9874b588458SPeter Avalos 		return (unary(op2(DOT, NIL, NIL)));
9884b588458SPeter Avalos 	case CCL:
9891d48fce0SDaniel Fojt 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
9901d48fce0SDaniel Fojt 		lastatom = starttok;
9914b588458SPeter Avalos 		rtok = relex();
9924b588458SPeter Avalos 		return (unary(np));
9934b588458SPeter Avalos 	case NCCL:
9941d48fce0SDaniel Fojt 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
9951d48fce0SDaniel Fojt 		lastatom = starttok;
9964b588458SPeter Avalos 		rtok = relex();
9974b588458SPeter Avalos 		return (unary(np));
9984b588458SPeter Avalos 	case '^':
9994b588458SPeter Avalos 		rtok = relex();
10004b588458SPeter Avalos 		return (unary(op2(CHAR, NIL, itonp(HAT))));
10014b588458SPeter Avalos 	case '$':
10024b588458SPeter Avalos 		rtok = relex();
10034b588458SPeter Avalos 		return (unary(op2(CHAR, NIL, NIL)));
10044b588458SPeter Avalos 	case '(':
10051d48fce0SDaniel Fojt 		lastatom = starttok;
10061d48fce0SDaniel Fojt 		savelastatom = starttok - basestr; /* Retain over recursion */
10074b588458SPeter Avalos 		rtok = relex();
10084b588458SPeter Avalos 		if (rtok == ')') {	/* special pleading for () */
10094b588458SPeter Avalos 			rtok = relex();
1010*ed569bc2SAaron LI 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
10114b588458SPeter Avalos 		}
10124b588458SPeter Avalos 		np = regexp();
10134b588458SPeter Avalos 		if (rtok == ')') {
10141d48fce0SDaniel Fojt 			lastatom = basestr + savelastatom; /* Restore */
10154b588458SPeter Avalos 			rtok = relex();
10164b588458SPeter Avalos 			return (unary(np));
10174b588458SPeter Avalos 		}
10184b588458SPeter Avalos 		else
10194b588458SPeter Avalos 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
10204b588458SPeter Avalos 	default:
10214b588458SPeter Avalos 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
10224b588458SPeter Avalos 	}
10234b588458SPeter Avalos 	return 0;	/*NOTREACHED*/
10244b588458SPeter Avalos }
10254b588458SPeter Avalos 
concat(Node * np)10264b588458SPeter Avalos Node *concat(Node *np)
10274b588458SPeter Avalos {
10284b588458SPeter Avalos 	switch (rtok) {
10291d48fce0SDaniel Fojt 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
10304b588458SPeter Avalos 		return (concat(op2(CAT, np, primary())));
10311d48fce0SDaniel Fojt 	case EMPTYRE:
10321d48fce0SDaniel Fojt 		rtok = relex();
1033*ed569bc2SAaron LI 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
10341d48fce0SDaniel Fojt 				primary())));
10354b588458SPeter Avalos 	}
10364b588458SPeter Avalos 	return (np);
10374b588458SPeter Avalos }
10384b588458SPeter Avalos 
alt(Node * np)10394b588458SPeter Avalos Node *alt(Node *np)
10404b588458SPeter Avalos {
10414b588458SPeter Avalos 	if (rtok == OR) {
10424b588458SPeter Avalos 		rtok = relex();
10434b588458SPeter Avalos 		return (alt(op2(OR, np, concat(primary()))));
10444b588458SPeter Avalos 	}
10454b588458SPeter Avalos 	return (np);
10464b588458SPeter Avalos }
10474b588458SPeter Avalos 
unary(Node * np)10484b588458SPeter Avalos Node *unary(Node *np)
10494b588458SPeter Avalos {
10504b588458SPeter Avalos 	switch (rtok) {
10514b588458SPeter Avalos 	case STAR:
10524b588458SPeter Avalos 		rtok = relex();
10534b588458SPeter Avalos 		return (unary(op2(STAR, np, NIL)));
10544b588458SPeter Avalos 	case PLUS:
10554b588458SPeter Avalos 		rtok = relex();
10564b588458SPeter Avalos 		return (unary(op2(PLUS, np, NIL)));
10574b588458SPeter Avalos 	case QUEST:
10584b588458SPeter Avalos 		rtok = relex();
10594b588458SPeter Avalos 		return (unary(op2(QUEST, np, NIL)));
10601d48fce0SDaniel Fojt 	case ZERO:
10611d48fce0SDaniel Fojt 		rtok = relex();
10621d48fce0SDaniel Fojt 		return (unary(op2(ZERO, np, NIL)));
10634b588458SPeter Avalos 	default:
10644b588458SPeter Avalos 		return (np);
10654b588458SPeter Avalos 	}
10664b588458SPeter Avalos }
10674b588458SPeter Avalos 
10684b588458SPeter Avalos /*
10694b588458SPeter Avalos  * Character class definitions conformant to the POSIX locale as
10704b588458SPeter Avalos  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
10714b588458SPeter Avalos  * and operating character sets are both ASCII (ISO646) or supersets
10724b588458SPeter Avalos  * thereof.
10734b588458SPeter Avalos  *
10744b588458SPeter Avalos  * Note that to avoid overflowing the temporary buffer used in
10754b588458SPeter Avalos  * relex(), the expanded character class (prior to range expansion)
10764b588458SPeter Avalos  * must be less than twice the size of their full name.
10774b588458SPeter Avalos  */
10784b588458SPeter Avalos 
10794b588458SPeter Avalos /* Because isblank doesn't show up in any of the header files on any
10804b588458SPeter Avalos  * system i use, it's defined here.  if some other locale has a richer
10814b588458SPeter Avalos  * definition of "blank", define HAS_ISBLANK and provide your own
10824b588458SPeter Avalos  * version.
10834b588458SPeter Avalos  * the parentheses here are an attempt to find a path through the maze
10844b588458SPeter Avalos  * of macro definition and/or function and/or version provided.  thanks
10854b588458SPeter Avalos  * to nelson beebe for the suggestion; let's see if it works everywhere.
10864b588458SPeter Avalos  */
10874b588458SPeter Avalos 
10884b588458SPeter Avalos /* #define HAS_ISBLANK */
10894b588458SPeter Avalos #ifndef HAS_ISBLANK
10904b588458SPeter Avalos 
10914b588458SPeter Avalos int (xisblank)(int c)
10924b588458SPeter Avalos {
10934b588458SPeter Avalos 	return c==' ' || c=='\t';
10944b588458SPeter Avalos }
10954b588458SPeter Avalos 
10964b588458SPeter Avalos #endif
10974b588458SPeter Avalos 
10981d48fce0SDaniel Fojt static const struct charclass {
10994b588458SPeter Avalos 	const char *cc_name;
11004b588458SPeter Avalos 	int cc_namelen;
11014b588458SPeter Avalos 	int (*cc_func)(int);
11024b588458SPeter Avalos } charclasses[] = {
11034b588458SPeter Avalos 	{ "alnum",	5,	isalnum },
11044b588458SPeter Avalos 	{ "alpha",	5,	isalpha },
11050020174dSPeter Avalos #ifndef HAS_ISBLANK
11061d48fce0SDaniel Fojt 	{ "blank",	5,	xisblank },
11070020174dSPeter Avalos #else
11080020174dSPeter Avalos 	{ "blank",	5,	isblank },
11090020174dSPeter Avalos #endif
11104b588458SPeter Avalos 	{ "cntrl",	5,	iscntrl },
11114b588458SPeter Avalos 	{ "digit",	5,	isdigit },
11124b588458SPeter Avalos 	{ "graph",	5,	isgraph },
11134b588458SPeter Avalos 	{ "lower",	5,	islower },
11144b588458SPeter Avalos 	{ "print",	5,	isprint },
11154b588458SPeter Avalos 	{ "punct",	5,	ispunct },
11164b588458SPeter Avalos 	{ "space",	5,	isspace },
11174b588458SPeter Avalos 	{ "upper",	5,	isupper },
11184b588458SPeter Avalos 	{ "xdigit",	6,	isxdigit },
11194b588458SPeter Avalos 	{ NULL,		0,	NULL },
11204b588458SPeter Avalos };
11214b588458SPeter Avalos 
11221d48fce0SDaniel Fojt #define REPEAT_SIMPLE		0
11231d48fce0SDaniel Fojt #define REPEAT_PLUS_APPENDED	1
11241d48fce0SDaniel Fojt #define REPEAT_WITH_Q		2
11251d48fce0SDaniel Fojt #define REPEAT_ZERO		3
11261d48fce0SDaniel Fojt 
11271d48fce0SDaniel Fojt static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)11281d48fce0SDaniel Fojt replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
11291d48fce0SDaniel Fojt 	       int atomlen, int firstnum, int secondnum, int special_case)
11301d48fce0SDaniel Fojt {
11311d48fce0SDaniel Fojt 	int i, j;
11321d48fce0SDaniel Fojt 	uschar *buf = 0;
11331d48fce0SDaniel Fojt 	int ret = 1;
11341d48fce0SDaniel Fojt 	int init_q = (firstnum == 0);		/* first added char will be ? */
11351d48fce0SDaniel Fojt 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
11361d48fce0SDaniel Fojt 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
11371d48fce0SDaniel Fojt 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
11381d48fce0SDaniel Fojt 	int size = prefix_length +  suffix_length;
11391d48fce0SDaniel Fojt 
11401d48fce0SDaniel Fojt 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
11411d48fce0SDaniel Fojt 		size += atomlen*(firstnum-1);
11421d48fce0SDaniel Fojt 	}
11431d48fce0SDaniel Fojt 
11441d48fce0SDaniel Fojt 	/* Adjust size of buffer for special cases */
11451d48fce0SDaniel Fojt 	if (special_case == REPEAT_PLUS_APPENDED) {
11461d48fce0SDaniel Fojt 		size++;		/* for the final + */
11471d48fce0SDaniel Fojt 	} else if (special_case == REPEAT_WITH_Q) {
114848f09a05SAntonio Huete Jimenez 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
11491d48fce0SDaniel Fojt 	} else if (special_case == REPEAT_ZERO) {
11501d48fce0SDaniel Fojt 		size += 2;	/* just a null ERE: () */
11511d48fce0SDaniel Fojt 	}
115248f09a05SAntonio Huete Jimenez 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
11531d48fce0SDaniel Fojt 		FATAL("out of space in reg expr %.10s..", lastre);
11541d48fce0SDaniel Fojt 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
11551d48fce0SDaniel Fojt 	j = prefix_length;
11561d48fce0SDaniel Fojt 	if (special_case == REPEAT_ZERO) {
11571d48fce0SDaniel Fojt 		j -= atomlen;
11581d48fce0SDaniel Fojt 		buf[j++] = '(';
11591d48fce0SDaniel Fojt 		buf[j++] = ')';
11601d48fce0SDaniel Fojt 	}
11611d48fce0SDaniel Fojt 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
11621d48fce0SDaniel Fojt 		memcpy(&buf[j], atom, atomlen);
11631d48fce0SDaniel Fojt 		j += atomlen;
11641d48fce0SDaniel Fojt 	}
11651d48fce0SDaniel Fojt 	if (special_case == REPEAT_PLUS_APPENDED) {
11661d48fce0SDaniel Fojt 		buf[j++] = '+';
11671d48fce0SDaniel Fojt 	} else if (special_case == REPEAT_WITH_Q) {
11681d48fce0SDaniel Fojt 		if (init_q)
11691d48fce0SDaniel Fojt 			buf[j++] = '?';
11701d48fce0SDaniel Fojt 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
11711d48fce0SDaniel Fojt 			memcpy(&buf[j], atom, atomlen);
11721d48fce0SDaniel Fojt 			j += atomlen;
11731d48fce0SDaniel Fojt 			buf[j++] = '?';
11741d48fce0SDaniel Fojt 		}
11751d48fce0SDaniel Fojt 	}
11761d48fce0SDaniel Fojt 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
117748f09a05SAntonio Huete Jimenez 	j += suffix_length;
117848f09a05SAntonio Huete Jimenez 	buf[j] = '\0';
11791d48fce0SDaniel Fojt 	/* free old basestr */
11801d48fce0SDaniel Fojt 	if (firstbasestr != basestr) {
11811d48fce0SDaniel Fojt 		if (basestr)
11821d48fce0SDaniel Fojt 			xfree(basestr);
11831d48fce0SDaniel Fojt 	}
11841d48fce0SDaniel Fojt 	basestr = buf;
11851d48fce0SDaniel Fojt 	prestr  = buf + prefix_length;
11861d48fce0SDaniel Fojt 	if (special_case == REPEAT_ZERO) {
11871d48fce0SDaniel Fojt 		prestr  -= atomlen;
11881d48fce0SDaniel Fojt 		ret++;
11891d48fce0SDaniel Fojt 	}
11901d48fce0SDaniel Fojt 	return ret;
11911d48fce0SDaniel Fojt }
11921d48fce0SDaniel Fojt 
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)11931d48fce0SDaniel Fojt static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
11941d48fce0SDaniel Fojt 		  int atomlen, int firstnum, int secondnum)
11951d48fce0SDaniel Fojt {
11961d48fce0SDaniel Fojt 	/*
11971d48fce0SDaniel Fojt 	   In general, the repetition specifier or "bound" is replaced here
11981d48fce0SDaniel Fojt 	   by an equivalent ERE string, repeating the immediately previous atom
11991d48fce0SDaniel Fojt 	   and appending ? and + as needed. Note that the first copy of the
12001d48fce0SDaniel Fojt 	   atom is left in place, except in the special_case of a zero-repeat
12011d48fce0SDaniel Fojt 	   (i.e., {0}).
12021d48fce0SDaniel Fojt 	 */
12031d48fce0SDaniel Fojt 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
12041d48fce0SDaniel Fojt 		if (firstnum < 2) {
12051d48fce0SDaniel Fojt 			/* 0 or 1: should be handled before you get here */
12061d48fce0SDaniel Fojt 			FATAL("internal error");
12071d48fce0SDaniel Fojt 		} else {
12081d48fce0SDaniel Fojt 			return replace_repeat(reptok, reptoklen, atom, atomlen,
12091d48fce0SDaniel Fojt 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
12101d48fce0SDaniel Fojt 		}
12111d48fce0SDaniel Fojt 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
12121d48fce0SDaniel Fojt 		if (firstnum == 0) {	/* {0} or {0,0} */
12131d48fce0SDaniel Fojt 			/* This case is unusual because the resulting
12141d48fce0SDaniel Fojt 			   replacement string might actually be SMALLER than
12151d48fce0SDaniel Fojt 			   the original ERE */
12161d48fce0SDaniel Fojt 			return replace_repeat(reptok, reptoklen, atom, atomlen,
12171d48fce0SDaniel Fojt 					firstnum, secondnum, REPEAT_ZERO);
12181d48fce0SDaniel Fojt 		} else {		/* (firstnum >= 1) */
12191d48fce0SDaniel Fojt 			return replace_repeat(reptok, reptoklen, atom, atomlen,
12201d48fce0SDaniel Fojt 					firstnum, secondnum, REPEAT_SIMPLE);
12211d48fce0SDaniel Fojt 		}
12221d48fce0SDaniel Fojt 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
12231d48fce0SDaniel Fojt 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
12241d48fce0SDaniel Fojt 		return replace_repeat(reptok, reptoklen, atom, atomlen,
12251d48fce0SDaniel Fojt 					firstnum, secondnum, REPEAT_WITH_Q);
12261d48fce0SDaniel Fojt 	} else {	/* Error - shouldn't be here (n>m) */
12271d48fce0SDaniel Fojt 		FATAL("internal error");
12281d48fce0SDaniel Fojt 	}
12291d48fce0SDaniel Fojt 	return 0;
12301d48fce0SDaniel Fojt }
12314b588458SPeter Avalos 
relex(void)12324b588458SPeter Avalos int relex(void)		/* lexical analyzer for reparse */
12334b588458SPeter Avalos {
12344b588458SPeter Avalos 	int c, n;
12354b588458SPeter Avalos 	int cflag;
12361d48fce0SDaniel Fojt 	static uschar *buf = NULL;
12374b588458SPeter Avalos 	static int bufsz = 100;
12384b588458SPeter Avalos 	uschar *bp;
12391d48fce0SDaniel Fojt 	const struct charclass *cc;
12404b588458SPeter Avalos 	int i;
12411d48fce0SDaniel Fojt 	int num, m;
12421d48fce0SDaniel Fojt 	bool commafound, digitfound;
12431d48fce0SDaniel Fojt 	const uschar *startreptok;
12441d48fce0SDaniel Fojt 	static int parens = 0;
12451d48fce0SDaniel Fojt 
12461d48fce0SDaniel Fojt rescan:
12471d48fce0SDaniel Fojt 	starttok = prestr;
12484b588458SPeter Avalos 
1249*ed569bc2SAaron LI 	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1250*ed569bc2SAaron LI 		prestr += n;
1251*ed569bc2SAaron LI 		starttok = prestr;
1252*ed569bc2SAaron LI 		return CHAR;
1253*ed569bc2SAaron LI 	}
1254*ed569bc2SAaron LI 
12554b588458SPeter Avalos 	switch (c = *prestr++) {
12564b588458SPeter Avalos 	case '|': return OR;
12574b588458SPeter Avalos 	case '*': return STAR;
12584b588458SPeter Avalos 	case '+': return PLUS;
12594b588458SPeter Avalos 	case '?': return QUEST;
12604b588458SPeter Avalos 	case '.': return DOT;
12614b588458SPeter Avalos 	case '\0': prestr--; return '\0';
12624b588458SPeter Avalos 	case '^':
12634b588458SPeter Avalos 	case '$':
12644b588458SPeter Avalos 		return c;
12651d48fce0SDaniel Fojt 	case '(':
12661d48fce0SDaniel Fojt 		parens++;
12671d48fce0SDaniel Fojt 		return c;
12681d48fce0SDaniel Fojt 	case ')':
12691d48fce0SDaniel Fojt 		if (parens) {
12701d48fce0SDaniel Fojt 			parens--;
12711d48fce0SDaniel Fojt 			return c;
12721d48fce0SDaniel Fojt 		}
12731d48fce0SDaniel Fojt 		/* unmatched close parenthesis; per POSIX, treat as literal */
12741d48fce0SDaniel Fojt 		rlxval = c;
12751d48fce0SDaniel Fojt 		return CHAR;
12764b588458SPeter Avalos 	case '\\':
1277b12bae18SSascha Wildner 		rlxval = quoted(&prestr);
12784b588458SPeter Avalos 		return CHAR;
12794b588458SPeter Avalos 	default:
12804b588458SPeter Avalos 		rlxval = c;
12814b588458SPeter Avalos 		return CHAR;
12824b588458SPeter Avalos 	case '[':
128348f09a05SAntonio Huete Jimenez 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
12844b588458SPeter Avalos 			FATAL("out of space in reg expr %.10s..", lastre);
12854b588458SPeter Avalos 		bp = buf;
12864b588458SPeter Avalos 		if (*prestr == '^') {
12874b588458SPeter Avalos 			cflag = 1;
12884b588458SPeter Avalos 			prestr++;
12894b588458SPeter Avalos 		}
12904b588458SPeter Avalos 		else
12914b588458SPeter Avalos 			cflag = 0;
1292*ed569bc2SAaron LI 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
12934b588458SPeter Avalos 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
12944b588458SPeter Avalos 			FATAL("out of space for reg expr %.10s...", lastre);
12954b588458SPeter Avalos 		for (; ; ) {
1296*ed569bc2SAaron LI 			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1297*ed569bc2SAaron LI 				for (i = 0; i < n; i++)
1298*ed569bc2SAaron LI 					*bp++ = *prestr++;
1299*ed569bc2SAaron LI 				continue;
1300*ed569bc2SAaron LI 			}
13014b588458SPeter Avalos 			if ((c = *prestr++) == '\\') {
13024b588458SPeter Avalos 				*bp++ = '\\';
13034b588458SPeter Avalos 				if ((c = *prestr++) == '\0')
13044b588458SPeter Avalos 					FATAL("nonterminated character class %.20s...", lastre);
13054b588458SPeter Avalos 				*bp++ = c;
13064b588458SPeter Avalos 			/* } else if (c == '\n') { */
13074b588458SPeter Avalos 			/* 	FATAL("newline in character class %.20s...", lastre); */
13084b588458SPeter Avalos 			} else if (c == '[' && *prestr == ':') {
13094b588458SPeter Avalos 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
13104b588458SPeter Avalos 				for (cc = charclasses; cc->cc_name; cc++)
13114b588458SPeter Avalos 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
13124b588458SPeter Avalos 						break;
13134b588458SPeter Avalos 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
13144b588458SPeter Avalos 				    prestr[2 + cc->cc_namelen] == ']') {
13154b588458SPeter Avalos 					prestr += cc->cc_namelen + 3;
13161d48fce0SDaniel Fojt 					/*
13171d48fce0SDaniel Fojt 					 * BUG: We begin at 1, instead of 0, since we
13181d48fce0SDaniel Fojt 					 * would otherwise prematurely terminate the
13191d48fce0SDaniel Fojt 					 * string for classes like [[:cntrl:]]. This
13201d48fce0SDaniel Fojt 					 * means that we can't match the NUL character,
13211d48fce0SDaniel Fojt 					 * not without first adapting the entire
13221d48fce0SDaniel Fojt 					 * program to track each string's length.
13231d48fce0SDaniel Fojt 					 */
13241d48fce0SDaniel Fojt 					for (i = 1; i <= UCHAR_MAX; i++) {
132548f09a05SAntonio Huete Jimenez 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
13264b588458SPeter Avalos 						    FATAL("out of space for reg expr %.10s...", lastre);
13274b588458SPeter Avalos 						if (cc->cc_func(i)) {
1328e5e686a0SDaniel Fojt 							/* escape backslash */
1329e5e686a0SDaniel Fojt 							if (i == '\\') {
1330e5e686a0SDaniel Fojt 								*bp++ = '\\';
1331e5e686a0SDaniel Fojt 								n++;
1332e5e686a0SDaniel Fojt 							}
1333e5e686a0SDaniel Fojt 
13344b588458SPeter Avalos 							*bp++ = i;
13354b588458SPeter Avalos 							n++;
13364b588458SPeter Avalos 						}
13374b588458SPeter Avalos 					}
13384b588458SPeter Avalos 				} else
13394b588458SPeter Avalos 					*bp++ = c;
13401d48fce0SDaniel Fojt 			} else if (c == '[' && *prestr == '.') {
13411d48fce0SDaniel Fojt 				char collate_char;
13421d48fce0SDaniel Fojt 				prestr++;
13431d48fce0SDaniel Fojt 				collate_char = *prestr++;
13441d48fce0SDaniel Fojt 				if (*prestr == '.' && prestr[1] == ']') {
13451d48fce0SDaniel Fojt 					prestr += 2;
13461d48fce0SDaniel Fojt 					/* Found it: map via locale TBD: for
13471d48fce0SDaniel Fojt 					   now, simply return this char.  This
13481d48fce0SDaniel Fojt 					   is sufficient to pass conformance
13491d48fce0SDaniel Fojt 					   test awk.ex 156
13501d48fce0SDaniel Fojt 					 */
13511d48fce0SDaniel Fojt 					if (*prestr == ']') {
13521d48fce0SDaniel Fojt 						prestr++;
13531d48fce0SDaniel Fojt 						rlxval = collate_char;
13541d48fce0SDaniel Fojt 						return CHAR;
13551d48fce0SDaniel Fojt 					}
13561d48fce0SDaniel Fojt 				}
13571d48fce0SDaniel Fojt 			} else if (c == '[' && *prestr == '=') {
13581d48fce0SDaniel Fojt 				char equiv_char;
13591d48fce0SDaniel Fojt 				prestr++;
13601d48fce0SDaniel Fojt 				equiv_char = *prestr++;
13611d48fce0SDaniel Fojt 				if (*prestr == '=' && prestr[1] == ']') {
13621d48fce0SDaniel Fojt 					prestr += 2;
13631d48fce0SDaniel Fojt 					/* Found it: map via locale TBD: for now
13641d48fce0SDaniel Fojt 					   simply return this char. This is
13651d48fce0SDaniel Fojt 					   sufficient to pass conformance test
13661d48fce0SDaniel Fojt 					   awk.ex 156
13671d48fce0SDaniel Fojt 					 */
13681d48fce0SDaniel Fojt 					if (*prestr == ']') {
13691d48fce0SDaniel Fojt 						prestr++;
13701d48fce0SDaniel Fojt 						rlxval = equiv_char;
13711d48fce0SDaniel Fojt 						return CHAR;
13721d48fce0SDaniel Fojt 					}
13731d48fce0SDaniel Fojt 				}
13744b588458SPeter Avalos 			} else if (c == '\0') {
13754b588458SPeter Avalos 				FATAL("nonterminated character class %.20s", lastre);
13764b588458SPeter Avalos 			} else if (bp == buf) {	/* 1st char is special */
13774b588458SPeter Avalos 				*bp++ = c;
13784b588458SPeter Avalos 			} else if (c == ']') {
13794b588458SPeter Avalos 				*bp++ = 0;
13804b588458SPeter Avalos 				rlxstr = (uschar *) tostring((char *) buf);
13814b588458SPeter Avalos 				if (cflag == 0)
13824b588458SPeter Avalos 					return CCL;
13834b588458SPeter Avalos 				else
13844b588458SPeter Avalos 					return NCCL;
13854b588458SPeter Avalos 			} else
13864b588458SPeter Avalos 				*bp++ = c;
13874b588458SPeter Avalos 		}
13881d48fce0SDaniel Fojt 		break;
13891d48fce0SDaniel Fojt 	case '{':
1390*ed569bc2SAaron LI 		if (isdigit((int) *(prestr))) {
13911d48fce0SDaniel Fojt 			num = 0;	/* Process as a repetition */
13921d48fce0SDaniel Fojt 			n = -1; m = -1;
13931d48fce0SDaniel Fojt 			commafound = false;
13941d48fce0SDaniel Fojt 			digitfound = false;
13951d48fce0SDaniel Fojt 			startreptok = prestr-1;
13961d48fce0SDaniel Fojt 			/* Remember start of previous atom here ? */
13971d48fce0SDaniel Fojt 		} else {        	/* just a { char, not a repetition */
13981d48fce0SDaniel Fojt 			rlxval = c;
13991d48fce0SDaniel Fojt 			return CHAR;
14001d48fce0SDaniel Fojt                 }
14011d48fce0SDaniel Fojt 		for (; ; ) {
14021d48fce0SDaniel Fojt 			if ((c = *prestr++) == '}') {
14031d48fce0SDaniel Fojt 				if (commafound) {
14041d48fce0SDaniel Fojt 					if (digitfound) { /* {n,m} */
14051d48fce0SDaniel Fojt 						m = num;
14061d48fce0SDaniel Fojt 						if (m < n)
14071d48fce0SDaniel Fojt 							FATAL("illegal repetition expression: class %.20s",
14081d48fce0SDaniel Fojt 								lastre);
14091d48fce0SDaniel Fojt 						if (n == 0 && m == 1) {
14101d48fce0SDaniel Fojt 							return QUEST;
14111d48fce0SDaniel Fojt 						}
14121d48fce0SDaniel Fojt 					} else {	/* {n,} */
14131d48fce0SDaniel Fojt 						if (n == 0)
14141d48fce0SDaniel Fojt 							return STAR;
14151d48fce0SDaniel Fojt 						else if (n == 1)
14161d48fce0SDaniel Fojt 							return PLUS;
14171d48fce0SDaniel Fojt 					}
14181d48fce0SDaniel Fojt 				} else {
14191d48fce0SDaniel Fojt 					if (digitfound) { /* {n} same as {n,n} */
14201d48fce0SDaniel Fojt 						n = num;
14211d48fce0SDaniel Fojt 						m = num;
14221d48fce0SDaniel Fojt 					} else {	/* {} */
14231d48fce0SDaniel Fojt 						FATAL("illegal repetition expression: class %.20s",
14241d48fce0SDaniel Fojt 							lastre);
14251d48fce0SDaniel Fojt 					}
14261d48fce0SDaniel Fojt 				}
14271d48fce0SDaniel Fojt 				if (repeat(starttok, prestr-starttok, lastatom,
14281d48fce0SDaniel Fojt 					   startreptok - lastatom, n, m) > 0) {
14291d48fce0SDaniel Fojt 					if (n == 0 && m == 0) {
14301d48fce0SDaniel Fojt 						return ZERO;
14311d48fce0SDaniel Fojt 					}
14321d48fce0SDaniel Fojt 					/* must rescan input for next token */
14331d48fce0SDaniel Fojt 					goto rescan;
14341d48fce0SDaniel Fojt 				}
14351d48fce0SDaniel Fojt 				/* Failed to replace: eat up {...} characters
14361d48fce0SDaniel Fojt 				   and treat like just PLUS */
14371d48fce0SDaniel Fojt 				return PLUS;
14381d48fce0SDaniel Fojt 			} else if (c == '\0') {
14391d48fce0SDaniel Fojt 				FATAL("nonterminated character class %.20s",
14401d48fce0SDaniel Fojt 					lastre);
14411d48fce0SDaniel Fojt 			} else if (isdigit(c)) {
14421d48fce0SDaniel Fojt 				num = 10 * num + c - '0';
14431d48fce0SDaniel Fojt 				digitfound = true;
14441d48fce0SDaniel Fojt 			} else if (c == ',') {
14451d48fce0SDaniel Fojt 				if (commafound)
14461d48fce0SDaniel Fojt 					FATAL("illegal repetition expression: class %.20s",
14471d48fce0SDaniel Fojt 						lastre);
14481d48fce0SDaniel Fojt 				/* looking for {n,} or {n,m} */
14491d48fce0SDaniel Fojt 				commafound = true;
14501d48fce0SDaniel Fojt 				n = num;
14511d48fce0SDaniel Fojt 				digitfound = false; /* reset */
14521d48fce0SDaniel Fojt 				num = 0;
14531d48fce0SDaniel Fojt 			} else {
14541d48fce0SDaniel Fojt 				FATAL("illegal repetition expression: class %.20s",
14551d48fce0SDaniel Fojt 					lastre);
14561d48fce0SDaniel Fojt 			}
14571d48fce0SDaniel Fojt 		}
14581d48fce0SDaniel Fojt 		break;
14594b588458SPeter Avalos 	}
14604b588458SPeter Avalos }
14614b588458SPeter Avalos 
cgoto(fa * f,int s,int c)14624b588458SPeter Avalos int cgoto(fa *f, int s, int c)
14634b588458SPeter Avalos {
14644b588458SPeter Avalos 	int *p, *q;
14651d48fce0SDaniel Fojt 	int i, j, k;
14664b588458SPeter Avalos 
1467*ed569bc2SAaron LI 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
14684b588458SPeter Avalos 	while (f->accept >= maxsetvec) {	/* guessing here! */
14691d48fce0SDaniel Fojt 		resizesetvec(__func__);
14704b588458SPeter Avalos 	}
14714b588458SPeter Avalos 	for (i = 0; i <= f->accept; i++)
14724b588458SPeter Avalos 		setvec[i] = 0;
14734b588458SPeter Avalos 	setcnt = 0;
14741d48fce0SDaniel Fojt 	resize_state(f, s);
14754b588458SPeter Avalos 	/* compute positions of gototab[s,c] into setvec */
14764b588458SPeter Avalos 	p = f->posns[s];
14774b588458SPeter Avalos 	for (i = 1; i <= *p; i++) {
14784b588458SPeter Avalos 		if ((k = f->re[p[i]].ltype) != FINAL) {
14794b588458SPeter Avalos 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
14804b588458SPeter Avalos 			 || (k == DOT && c != 0 && c != HAT)
14814b588458SPeter Avalos 			 || (k == ALL && c != 0)
14824b588458SPeter Avalos 			 || (k == EMPTYRE && c != 0)
1483*ed569bc2SAaron LI 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1484*ed569bc2SAaron LI 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
14854b588458SPeter Avalos 				q = f->re[p[i]].lfollow;
14864b588458SPeter Avalos 				for (j = 1; j <= *q; j++) {
14874b588458SPeter Avalos 					if (q[j] >= maxsetvec) {
14881d48fce0SDaniel Fojt 						resizesetvec(__func__);
14894b588458SPeter Avalos 					}
14904b588458SPeter Avalos 					if (setvec[q[j]] == 0) {
14914b588458SPeter Avalos 						setcnt++;
14924b588458SPeter Avalos 						setvec[q[j]] = 1;
14934b588458SPeter Avalos 					}
14944b588458SPeter Avalos 				}
14954b588458SPeter Avalos 			}
14964b588458SPeter Avalos 		}
14974b588458SPeter Avalos 	}
14984b588458SPeter Avalos 	/* determine if setvec is a previous state */
14994b588458SPeter Avalos 	tmpset[0] = setcnt;
15004b588458SPeter Avalos 	j = 1;
15014b588458SPeter Avalos 	for (i = f->accept; i >= 0; i--)
15024b588458SPeter Avalos 		if (setvec[i]) {
15034b588458SPeter Avalos 			tmpset[j++] = i;
15044b588458SPeter Avalos 		}
15051d48fce0SDaniel Fojt 	resize_state(f, f->curstat > s ? f->curstat : s);
15064b588458SPeter Avalos 	/* tmpset == previous state? */
15074b588458SPeter Avalos 	for (i = 1; i <= f->curstat; i++) {
15084b588458SPeter Avalos 		p = f->posns[i];
15094b588458SPeter Avalos 		if ((k = tmpset[0]) != p[0])
15104b588458SPeter Avalos 			goto different;
15114b588458SPeter Avalos 		for (j = 1; j <= k; j++)
15124b588458SPeter Avalos 			if (tmpset[j] != p[j])
15134b588458SPeter Avalos 				goto different;
15144b588458SPeter Avalos 		/* setvec is state i */
15151d48fce0SDaniel Fojt 		if (c != HAT)
1516*ed569bc2SAaron LI 			set_gototab(f, s, c, i);
15174b588458SPeter Avalos 		return i;
15184b588458SPeter Avalos 	  different:;
15194b588458SPeter Avalos 	}
15204b588458SPeter Avalos 
15214b588458SPeter Avalos 	/* add tmpset to current set of states */
15224b588458SPeter Avalos 	++(f->curstat);
15231d48fce0SDaniel Fojt 	resize_state(f, f->curstat);
1524*ed569bc2SAaron LI 	clear_gototab(f, f->curstat);
15254b588458SPeter Avalos 	xfree(f->posns[f->curstat]);
15261d48fce0SDaniel Fojt 	p = intalloc(setcnt + 1, __func__);
15274b588458SPeter Avalos 
15284b588458SPeter Avalos 	f->posns[f->curstat] = p;
15291d48fce0SDaniel Fojt 	if (c != HAT)
1530*ed569bc2SAaron LI 		set_gototab(f, s, c, f->curstat);
15314b588458SPeter Avalos 	for (i = 0; i <= setcnt; i++)
15324b588458SPeter Avalos 		p[i] = tmpset[i];
15334b588458SPeter Avalos 	if (setvec[f->accept])
15344b588458SPeter Avalos 		f->out[f->curstat] = 1;
15354b588458SPeter Avalos 	else
15364b588458SPeter Avalos 		f->out[f->curstat] = 0;
15374b588458SPeter Avalos 	return f->curstat;
15384b588458SPeter Avalos }
15394b588458SPeter Avalos 
15404b588458SPeter Avalos 
freefa(fa * f)15414b588458SPeter Avalos void freefa(fa *f)	/* free a finite automaton */
15424b588458SPeter Avalos {
15434b588458SPeter Avalos 	int i;
15444b588458SPeter Avalos 
15454b588458SPeter Avalos 	if (f == NULL)
15464b588458SPeter Avalos 		return;
15471d48fce0SDaniel Fojt 	for (i = 0; i < f->state_count; i++)
1548*ed569bc2SAaron LI 		xfree(f->gototab[i].entries);
1549*ed569bc2SAaron LI 	xfree(f->gototab);
15504b588458SPeter Avalos 	for (i = 0; i <= f->curstat; i++)
15514b588458SPeter Avalos 		xfree(f->posns[i]);
15524b588458SPeter Avalos 	for (i = 0; i <= f->accept; i++) {
15534b588458SPeter Avalos 		xfree(f->re[i].lfollow);
15544b588458SPeter Avalos 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
15551d48fce0SDaniel Fojt 			xfree(f->re[i].lval.np);
15564b588458SPeter Avalos 	}
15574b588458SPeter Avalos 	xfree(f->restr);
15581d48fce0SDaniel Fojt 	xfree(f->out);
15591d48fce0SDaniel Fojt 	xfree(f->posns);
15601d48fce0SDaniel Fojt 	xfree(f->gototab);
15614b588458SPeter Avalos 	xfree(f);
15624b588458SPeter Avalos }
1563