1/*-
2 * This code is derived from OpenBSD's libc/regex, original license follows:
3 *
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	@(#)engine.c	8.5 (Berkeley) 3/20/94
36 */
37
38/*
39 * The matching engine and friends.  This file is #included by regexec.c
40 * after suitable #defines of a variety of macros used herein, so that
41 * different state representations can be used without duplicating masses
42 * of code.
43 */
44
45#ifdef SNAMES
46#define	matcher	smatcher
47#define	fast	sfast
48#define	slow	sslow
49#define	dissect	sdissect
50#define	backref	sbackref
51#define	step	sstep
52#define	print	sprint
53#define	at	sat
54#define	match	smat
55#define	nope	snope
56#define step_back	sstep_back
57#endif
58#ifdef LNAMES
59#define	matcher	lmatcher
60#define	fast	lfast
61#define	slow	lslow
62#define	dissect	ldissect
63#define	backref	lbackref
64#define	step	lstep
65#define	print	lprint
66#define	at	lat
67#define	match	lmat
68#define	nope	lnope
69#define step_back	lstep_back
70#endif
71
72/* another structure passed up and down to avoid zillions of parameters */
73struct match {
74	struct re_guts *g;
75	int eflags;
76	llvm_regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
77	const char *offp;		/* offsets work from here */
78	const char *beginp;		/* start of string -- virtual NUL precedes */
79	const char *endp;		/* end of string -- virtual NUL here */
80	const char *coldp;		/* can be no match starting before here */
81	const char **lastpos;		/* [nplus+1] */
82	STATEVARS;
83	states st;		/* current states */
84	states fresh;		/* states for a fresh start */
85	states tmp;		/* temporary */
86	states empty;		/* empty set of states */
87};
88
89static int matcher(struct re_guts *, const char *, size_t,
90                   llvm_regmatch_t[], int);
91static const char *dissect(struct match *, const char *, const char *, sopno,
92                           sopno);
93static const char *backref(struct match *, const char *, const char *, sopno,
94                           sopno, sopno, int);
95static const char *fast(struct match *, const char *, const char *, sopno, sopno);
96static const char *slow(struct match *, const char *, const char *, sopno, sopno);
97static states step(struct re_guts *, sopno, sopno, states, int, states);
98#define MAX_RECURSION	100
99#define	BOL	(OUT+1)
100#define	EOL	(BOL+1)
101#define	BOLEOL	(BOL+2)
102#define	NOTHING	(BOL+3)
103#define	BOW	(BOL+4)
104#define	EOW	(BOL+5)
105#define	CODEMAX	(BOL+5)		/* highest code used */
106#define	NONCHAR(c)	((c) > CHAR_MAX)
107#define	NNONCHAR	(CODEMAX-CHAR_MAX)
108#ifdef REDEBUG
109static void print(struct match *, char *, states, int, FILE *);
110#endif
111#ifdef REDEBUG
112static void at(struct match *, char *, char *, char *, sopno, sopno);
113#endif
114#ifdef REDEBUG
115static char *pchar(int);
116#endif
117
118#ifdef REDEBUG
119#define	SP(t, s, c)	print(m, t, s, c, stdout)
120#define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
121#define	NOTE(str)	{ if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
122static int nope = 0;
123#else
124#define	SP(t, s, c)	/* nothing */
125#define	AT(t, p1, p2, s1, s2)	/* nothing */
126#define	NOTE(s)	/* nothing */
127#endif
128
129/*
130 - matcher - the actual matching engine
131 */
132static int			/* 0 success, REG_NOMATCH failure */
133matcher(struct re_guts *g, const char *string, size_t nmatch,
134        llvm_regmatch_t pmatch[],
135    int eflags)
136{
137	const char *endp;
138	size_t i;
139	struct match mv;
140	struct match *m = &mv;
141	const char *dp;
142	const sopno gf = g->firststate+1;	/* +1 for OEND */
143	const sopno gl = g->laststate;
144	const char *start;
145	const char *stop;
146
147	/* simplify the situation where possible */
148	if (g->cflags&REG_NOSUB)
149		nmatch = 0;
150	if (eflags&REG_STARTEND) {
151		start = string + pmatch[0].rm_so;
152		stop = string + pmatch[0].rm_eo;
153	} else {
154		start = string;
155		stop = start + strlen(start);
156	}
157	if (stop < start)
158		return(REG_INVARG);
159
160	/* prescreening; this does wonders for this rather slow code */
161	if (g->must != NULL) {
162		for (dp = start; dp < stop; dp++)
163			if (*dp == g->must[0] && stop - dp >= g->mlen &&
164				memcmp(dp, g->must, (size_t)g->mlen) == 0)
165				break;
166		if (dp == stop)		/* we didn't find g->must */
167			return(REG_NOMATCH);
168	}
169
170	/* match struct setup */
171	m->g = g;
172	m->eflags = eflags;
173	m->pmatch = NULL;
174	m->lastpos = NULL;
175	m->offp = string;
176	m->beginp = start;
177	m->endp = stop;
178	STATESETUP(m, 4);
179	SETUP(m->st);
180	SETUP(m->fresh);
181	SETUP(m->tmp);
182	SETUP(m->empty);
183	CLEAR(m->empty);
184
185	/* this loop does only one repetition except for backrefs */
186	for (;;) {
187		endp = fast(m, start, stop, gf, gl);
188		if (endp == NULL) {		/* a miss */
189			free(m->pmatch);
190			free((void*)m->lastpos);
191			STATETEARDOWN(m);
192			return(REG_NOMATCH);
193		}
194		if (nmatch == 0 && !g->backrefs)
195			break;		/* no further info needed */
196
197		/* where? */
198		assert(m->coldp != NULL);
199		for (;;) {
200			NOTE("finding start");
201			endp = slow(m, m->coldp, stop, gf, gl);
202			if (endp != NULL)
203				break;
204			assert(m->coldp < m->endp);
205			m->coldp++;
206		}
207		if (nmatch == 1 && !g->backrefs)
208			break;		/* no further info needed */
209
210		/* oh my, they want the subexpressions... */
211		if (m->pmatch == NULL)
212			m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
213							sizeof(llvm_regmatch_t));
214		if (m->pmatch == NULL) {
215			STATETEARDOWN(m);
216			return(REG_ESPACE);
217		}
218		for (i = 1; i <= m->g->nsub; i++)
219			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
220		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
221			NOTE("dissecting");
222			dp = dissect(m, m->coldp, endp, gf, gl);
223		} else {
224			if (g->nplus > 0 && m->lastpos == NULL)
225				m->lastpos = (const char **)malloc((g->nplus+1) *
226							sizeof(char *));
227			if (g->nplus > 0 && m->lastpos == NULL) {
228				free(m->pmatch);
229				STATETEARDOWN(m);
230				return(REG_ESPACE);
231			}
232			NOTE("backref dissect");
233			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
234		}
235		if (dp != NULL)
236			break;
237
238		/* uh-oh... we couldn't find a subexpression-level match */
239		assert(g->backrefs);	/* must be back references doing it */
240		assert(g->nplus == 0 || m->lastpos != NULL);
241		for (;;) {
242			if (dp != NULL || endp <= m->coldp)
243				break;		/* defeat */
244			NOTE("backoff");
245			endp = slow(m, m->coldp, endp-1, gf, gl);
246			if (endp == NULL)
247				break;		/* defeat */
248			/* try it on a shorter possibility */
249#ifndef NDEBUG
250			for (i = 1; i <= m->g->nsub; i++) {
251				assert(m->pmatch[i].rm_so == -1);
252				assert(m->pmatch[i].rm_eo == -1);
253			}
254#endif
255			NOTE("backoff dissect");
256			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
257		}
258		assert(dp == NULL || dp == endp);
259		if (dp != NULL)		/* found a shorter one */
260			break;
261
262		/* despite initial appearances, there is no match here */
263		NOTE("false alarm");
264		if (m->coldp == stop)
265			break;
266		start = m->coldp + 1;	/* recycle starting later */
267	}
268
269	/* fill in the details if requested */
270	if (nmatch > 0) {
271		pmatch[0].rm_so = m->coldp - m->offp;
272		pmatch[0].rm_eo = endp - m->offp;
273	}
274	if (nmatch > 1) {
275		assert(m->pmatch != NULL);
276		for (i = 1; i < nmatch; i++)
277			if (i <= m->g->nsub)
278				pmatch[i] = m->pmatch[i];
279			else {
280				pmatch[i].rm_so = -1;
281				pmatch[i].rm_eo = -1;
282			}
283	}
284
285	if (m->pmatch != NULL)
286		free((char *)m->pmatch);
287	if (m->lastpos != NULL)
288		free((char *)m->lastpos);
289	STATETEARDOWN(m);
290	return(0);
291}
292
293/* Step back from "stop" to a position where the strip startst..stopst might
294 * match. This can always conservatively return "stop - 1", but may return an
295 * earlier position if matches at later positions are impossible. */
296static const char *
297step_back(struct re_guts *g, const char *start, const char *stop, sopno startst,
298          sopno stopst)
299{
300	/* Always step back at least one character. */
301	assert(stop > start);
302	const char *res = stop - 1;
303
304	/* Check whether the strip startst..stropst starts with a fixed character,
305	 * ignoring any closing parentheses. If not, return a conservative result. */
306	for (;;) {
307		if (startst >= stopst)
308			return res;
309		if (OP(g->strip[startst]) != ORPAREN)
310			break;
311		startst++;
312	}
313	if (OP(g->strip[startst]) != OCHAR)
314		return res;
315
316	/* Find the character that starts the following match. */
317	char ch = OPND(g->strip[startst]);
318	for (; res != start; --res) {
319		if (*res == ch)
320			break;
321	}
322	return res;
323}
324
325/*
326 - dissect - figure out what matched what, no back references
327 */
328static const char *			/* == stop (success) always */
329dissect(struct match *m, const char *start, const char *stop, sopno startst,
330        sopno stopst)
331{
332	int i;
333	sopno ss;	/* start sop of current subRE */
334	sopno es;	/* end sop of current subRE */
335	const char *sp;	/* start of string matched by it */
336	const char *stp;	/* string matched by it cannot pass here */
337	const char *rest;	/* start of rest of string */
338	const char *tail;	/* string unmatched by rest of RE */
339	sopno ssub;	/* start sop of subsubRE */
340	sopno esub;	/* end sop of subsubRE */
341	const char *ssp;	/* start of string matched by subsubRE */
342	const char *sep;	/* end of string matched by subsubRE */
343	const char *oldssp;	/* previous ssp */
344
345	AT("diss", start, stop, startst, stopst);
346	sp = start;
347	for (ss = startst; ss < stopst; ss = es) {
348		/* identify end of subRE */
349		es = ss;
350		switch (OP(m->g->strip[es])) {
351		case OPLUS_:
352		case OQUEST_:
353			es += OPND(m->g->strip[es]);
354			break;
355		case OCH_:
356			while (OP(m->g->strip[es]) != O_CH)
357				es += OPND(m->g->strip[es]);
358			break;
359		}
360		es++;
361
362		/* figure out what it matched */
363		switch (OP(m->g->strip[ss])) {
364		case OEND:
365			assert(nope);
366			break;
367		case OCHAR:
368			sp++;
369			break;
370		case OBOL:
371		case OEOL:
372		case OBOW:
373		case OEOW:
374			break;
375		case OANY:
376		case OANYOF:
377			sp++;
378			break;
379		case OBACK_:
380		case O_BACK:
381			assert(nope);
382			break;
383		/* cases where length of match is hard to find */
384		case OQUEST_:
385			stp = stop;
386			for (;;) {
387				/* how long could this one be? */
388				rest = slow(m, sp, stp, ss, es);
389				assert(rest != NULL);	/* it did match */
390				/* could the rest match the rest? */
391				tail = slow(m, rest, stop, es, stopst);
392				if (tail == stop)
393					break;		/* yes! */
394				/* no -- try a shorter match for this one */
395				stp = step_back(m->g, sp, rest, es, stopst);
396				assert(stp >= sp);	/* it did work */
397			}
398			ssub = ss + 1;
399			esub = es - 1;
400			/* did innards match? */
401			if (slow(m, sp, rest, ssub, esub) != NULL) {
402				const char *dp = dissect(m, sp, rest, ssub, esub);
403				(void)dp; /* avoid warning if assertions off */
404				assert(dp == rest);
405			} else		/* no */
406				assert(sp == rest);
407			sp = rest;
408			break;
409		case OPLUS_:
410			stp = stop;
411			for (;;) {
412				/* how long could this one be? */
413				rest = slow(m, sp, stp, ss, es);
414				assert(rest != NULL);	/* it did match */
415				/* could the rest match the rest? */
416				tail = slow(m, rest, stop, es, stopst);
417				if (tail == stop)
418					break;		/* yes! */
419				/* no -- try a shorter match for this one */
420				stp = step_back(m->g, sp, rest, es, stopst);
421				assert(stp >= sp);	/* it did work */
422			}
423			ssub = ss + 1;
424			esub = es - 1;
425			ssp = sp;
426			oldssp = ssp;
427			for (;;) {	/* find last match of innards */
428				sep = slow(m, ssp, rest, ssub, esub);
429				if (sep == NULL || sep == ssp)
430					break;	/* failed or matched null */
431				oldssp = ssp;	/* on to next try */
432				ssp = sep;
433			}
434			if (sep == NULL) {
435				/* last successful match */
436				sep = ssp;
437				ssp = oldssp;
438			}
439			assert(sep == rest);	/* must exhaust substring */
440			assert(slow(m, ssp, sep, ssub, esub) == rest);
441			{
442				const char *dp = dissect(m, ssp, sep, ssub, esub);
443				(void)dp; /* avoid warning if assertions off */
444				assert(dp == sep);
445			}
446			sp = rest;
447			break;
448		case OCH_:
449			stp = stop;
450			for (;;) {
451				/* how long could this one be? */
452				rest = slow(m, sp, stp, ss, es);
453				assert(rest != NULL);	/* it did match */
454				/* could the rest match the rest? */
455				tail = slow(m, rest, stop, es, stopst);
456				if (tail == stop)
457					break;		/* yes! */
458				/* no -- try a shorter match for this one */
459				stp = rest - 1;
460				assert(stp >= sp);	/* it did work */
461			}
462			ssub = ss + 1;
463			esub = ss + OPND(m->g->strip[ss]) - 1;
464			assert(OP(m->g->strip[esub]) == OOR1);
465			for (;;) {	/* find first matching branch */
466				if (slow(m, sp, rest, ssub, esub) == rest)
467					break;	/* it matched all of it */
468				/* that one missed, try next one */
469				assert(OP(m->g->strip[esub]) == OOR1);
470				esub++;
471				assert(OP(m->g->strip[esub]) == OOR2);
472				ssub = esub + 1;
473				esub += OPND(m->g->strip[esub]);
474				if (OP(m->g->strip[esub]) == OOR2)
475					esub--;
476				else
477					assert(OP(m->g->strip[esub]) == O_CH);
478			}
479			{
480				const char *dp = dissect(m, sp, rest, ssub, esub);
481				(void)dp; /* avoid warning if assertions off */
482				assert(dp == rest);
483			}
484			sp = rest;
485			break;
486		case O_PLUS:
487		case O_QUEST:
488		case OOR1:
489		case OOR2:
490		case O_CH:
491			assert(nope);
492			break;
493		case OLPAREN:
494			i = OPND(m->g->strip[ss]);
495			assert(0 < i && i <= m->g->nsub);
496			m->pmatch[i].rm_so = sp - m->offp;
497			break;
498		case ORPAREN:
499			i = OPND(m->g->strip[ss]);
500			assert(0 < i && i <= m->g->nsub);
501			m->pmatch[i].rm_eo = sp - m->offp;
502			break;
503		default:		/* uh oh */
504			assert(nope);
505			break;
506		}
507	}
508
509	assert(sp == stop);
510	return(sp);
511}
512
513/*
514 - backref - figure out what matched what, figuring in back references
515 */
516static const char *			/* == stop (success) or NULL (failure) */
517backref(struct match *m, const char *start, const char *stop, sopno startst,
518        sopno stopst, sopno lev, int rec)			/* PLUS nesting level */
519{
520	int i;
521	sopno ss;	/* start sop of current subRE */
522	const char *sp;	/* start of string matched by it */
523	sopno ssub;	/* start sop of subsubRE */
524	sopno esub;	/* end sop of subsubRE */
525	const char *ssp;	/* start of string matched by subsubRE */
526	const char *dp;
527	size_t len;
528	int hard;
529	sop s;
530	llvm_regoff_t offsave;
531	cset *cs;
532
533	AT("back", start, stop, startst, stopst);
534	sp = start;
535
536	/* get as far as we can with easy stuff */
537	hard = 0;
538	for (ss = startst; !hard && ss < stopst; ss++)
539		switch (OP(s = m->g->strip[ss])) {
540		case OCHAR:
541			if (sp == stop || *sp++ != (char)OPND(s))
542				return(NULL);
543			break;
544		case OANY:
545			if (sp == stop)
546				return(NULL);
547			sp++;
548			break;
549		case OANYOF:
550			cs = &m->g->sets[OPND(s)];
551			if (sp == stop || !CHIN(cs, *sp++))
552				return(NULL);
553			break;
554		case OBOL:
555			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
556					(sp < m->endp && *(sp-1) == '\n' &&
557						(m->g->cflags&REG_NEWLINE)) )
558				{ /* yes */ }
559			else
560				return(NULL);
561			break;
562		case OEOL:
563			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
564					(sp < m->endp && *sp == '\n' &&
565						(m->g->cflags&REG_NEWLINE)) )
566				{ /* yes */ }
567			else
568				return(NULL);
569			break;
570		case OBOW:
571			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
572					(sp < m->endp && *(sp-1) == '\n' &&
573						(m->g->cflags&REG_NEWLINE)) ||
574					(sp > m->beginp &&
575							!ISWORD(*(sp-1))) ) &&
576					(sp < m->endp && ISWORD(*sp)) )
577				{ /* yes */ }
578			else
579				return(NULL);
580			break;
581		case OEOW:
582			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
583					(sp < m->endp && *sp == '\n' &&
584						(m->g->cflags&REG_NEWLINE)) ||
585					(sp < m->endp && !ISWORD(*sp)) ) &&
586					(sp > m->beginp && ISWORD(*(sp-1))) )
587				{ /* yes */ }
588			else
589				return(NULL);
590			break;
591		case O_QUEST:
592			break;
593		case OOR1:	/* matches null but needs to skip */
594			ss++;
595			s = m->g->strip[ss];
596			do {
597				assert(OP(s) == OOR2);
598				ss += OPND(s);
599			} while (OP(s = m->g->strip[ss]) != O_CH);
600			/* note that the ss++ gets us past the O_CH */
601			break;
602		default:	/* have to make a choice */
603			hard = 1;
604			break;
605		}
606	if (!hard) {		/* that was it! */
607		if (sp != stop)
608			return(NULL);
609		return(sp);
610	}
611	ss--;			/* adjust for the for's final increment */
612
613	/* the hard stuff */
614	AT("hard", sp, stop, ss, stopst);
615	s = m->g->strip[ss];
616	switch (OP(s)) {
617	case OBACK_:		/* the vilest depths */
618		i = OPND(s);
619		assert(0 < i && i <= m->g->nsub);
620		if (m->pmatch[i].rm_eo == -1)
621			return(NULL);
622		assert(m->pmatch[i].rm_so != -1);
623		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
624		if (len == 0 && rec++ > MAX_RECURSION)
625			return(NULL);
626		assert(stop - m->beginp >= len);
627		if (sp > stop - len)
628			return(NULL);	/* not enough left to match */
629		ssp = m->offp + m->pmatch[i].rm_so;
630		if (memcmp(sp, ssp, len) != 0)
631			return(NULL);
632		while (m->g->strip[ss] != SOP(O_BACK, i))
633			ss++;
634		return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
635		break;
636	case OQUEST_:		/* to null or not */
637		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
638		if (dp != NULL)
639			return(dp);	/* not */
640		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
641		break;
642	case OPLUS_:
643		assert(m->lastpos != NULL);
644		assert(lev+1 <= m->g->nplus);
645		m->lastpos[lev+1] = sp;
646		return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
647		break;
648	case O_PLUS:
649		if (sp == m->lastpos[lev])	/* last pass matched null */
650			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
651		/* try another pass */
652		m->lastpos[lev] = sp;
653		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
654		if (dp == NULL)
655			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
656		else
657			return(dp);
658		break;
659	case OCH_:		/* find the right one, if any */
660		ssub = ss + 1;
661		esub = ss + OPND(s) - 1;
662		assert(OP(m->g->strip[esub]) == OOR1);
663		for (;;) {	/* find first matching branch */
664			dp = backref(m, sp, stop, ssub, esub, lev, rec);
665			if (dp != NULL)
666				return(dp);
667			/* that one missed, try next one */
668			if (OP(m->g->strip[esub]) == O_CH)
669				return(NULL);	/* there is none */
670			esub++;
671			assert(OP(m->g->strip[esub]) == OOR2);
672			ssub = esub + 1;
673			esub += OPND(m->g->strip[esub]);
674			if (OP(m->g->strip[esub]) == OOR2)
675				esub--;
676			else
677				assert(OP(m->g->strip[esub]) == O_CH);
678		}
679		break;
680	case OLPAREN:		/* must undo assignment if rest fails */
681		i = OPND(s);
682		assert(0 < i && i <= m->g->nsub);
683		offsave = m->pmatch[i].rm_so;
684		m->pmatch[i].rm_so = sp - m->offp;
685		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
686		if (dp != NULL)
687			return(dp);
688		m->pmatch[i].rm_so = offsave;
689		return(NULL);
690		break;
691	case ORPAREN:		/* must undo assignment if rest fails */
692		i = OPND(s);
693		assert(0 < i && i <= m->g->nsub);
694		offsave = m->pmatch[i].rm_eo;
695		m->pmatch[i].rm_eo = sp - m->offp;
696		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
697		if (dp != NULL)
698			return(dp);
699		m->pmatch[i].rm_eo = offsave;
700		return(NULL);
701		break;
702	default:		/* uh oh */
703		assert(nope);
704		break;
705	}
706
707	/* "can't happen" */
708	assert(nope);
709	/* NOTREACHED */
710        return NULL;
711}
712
713/*
714 - fast - step through the string at top speed
715 */
716static const char *			/* where tentative match ended, or NULL */
717fast(struct match *m, const char *start, const char *stop, sopno startst,
718     sopno stopst)
719{
720	states st = m->st;
721	states fresh = m->fresh;
722	states tmp = m->tmp;
723	const char *p = start;
724	int c = (start == m->beginp) ? OUT : *(start-1);
725	int lastc;	/* previous c */
726	int flagch;
727	int i;
728	const char *coldp;	/* last p after which no match was underway */
729
730	CLEAR(st);
731	SET1(st, startst);
732	st = step(m->g, startst, stopst, st, NOTHING, st);
733	ASSIGN(fresh, st);
734	SP("start", st, *p);
735	coldp = NULL;
736	for (;;) {
737		/* next character */
738		lastc = c;
739		c = (p == m->endp) ? OUT : *p;
740		if (EQ(st, fresh))
741			coldp = p;
742
743		/* is there an EOL and/or BOL between lastc and c? */
744		flagch = '\0';
745		i = 0;
746		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
747				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
748			flagch = BOL;
749			i = m->g->nbol;
750		}
751		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
752				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
753			flagch = (flagch == BOL) ? BOLEOL : EOL;
754			i += m->g->neol;
755		}
756		if (i != 0) {
757			for (; i > 0; i--)
758				st = step(m->g, startst, stopst, st, flagch, st);
759			SP("boleol", st, c);
760		}
761
762		/* how about a word boundary? */
763		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
764					(c != OUT && ISWORD(c)) ) {
765			flagch = BOW;
766		}
767		if ( (lastc != OUT && ISWORD(lastc)) &&
768				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
769			flagch = EOW;
770		}
771		if (flagch == BOW || flagch == EOW) {
772			st = step(m->g, startst, stopst, st, flagch, st);
773			SP("boweow", st, c);
774		}
775
776		/* are we done? */
777		if (ISSET(st, stopst) || p == stop)
778			break;		/* NOTE BREAK OUT */
779
780		/* no, we must deal with this character */
781		ASSIGN(tmp, st);
782		ASSIGN(st, fresh);
783		assert(c != OUT);
784		st = step(m->g, startst, stopst, tmp, c, st);
785		SP("aft", st, c);
786		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
787		p++;
788	}
789
790	assert(coldp != NULL);
791	m->coldp = coldp;
792	if (ISSET(st, stopst))
793		return(p+1);
794	else
795		return(NULL);
796}
797
798/*
799 - slow - step through the string more deliberately
800 */
801static const char *			/* where it ended */
802slow(struct match *m, const char *start, const char *stop, sopno startst,
803     sopno stopst)
804{
805	states st = m->st;
806	states empty = m->empty;
807	states tmp = m->tmp;
808	const char *p = start;
809	int c = (start == m->beginp) ? OUT : *(start-1);
810	int lastc;	/* previous c */
811	int flagch;
812	int i;
813	const char *matchp;	/* last p at which a match ended */
814
815	AT("slow", start, stop, startst, stopst);
816	CLEAR(st);
817	SET1(st, startst);
818	SP("sstart", st, *p);
819	st = step(m->g, startst, stopst, st, NOTHING, st);
820	matchp = NULL;
821	for (;;) {
822		/* next character */
823		lastc = c;
824		c = (p == m->endp) ? OUT : *p;
825
826		/* is there an EOL and/or BOL between lastc and c? */
827		flagch = '\0';
828		i = 0;
829		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
830				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
831			flagch = BOL;
832			i = m->g->nbol;
833		}
834		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
835				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
836			flagch = (flagch == BOL) ? BOLEOL : EOL;
837			i += m->g->neol;
838		}
839		if (i != 0) {
840			for (; i > 0; i--)
841				st = step(m->g, startst, stopst, st, flagch, st);
842			SP("sboleol", st, c);
843		}
844
845		/* how about a word boundary? */
846		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
847					(c != OUT && ISWORD(c)) ) {
848			flagch = BOW;
849		}
850		if ( (lastc != OUT && ISWORD(lastc)) &&
851				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
852			flagch = EOW;
853		}
854		if (flagch == BOW || flagch == EOW) {
855			st = step(m->g, startst, stopst, st, flagch, st);
856			SP("sboweow", st, c);
857		}
858
859		/* are we done? */
860		if (ISSET(st, stopst))
861			matchp = p;
862		if (EQ(st, empty) || p == stop)
863			break;		/* NOTE BREAK OUT */
864
865		/* no, we must deal with this character */
866		ASSIGN(tmp, st);
867		ASSIGN(st, empty);
868		assert(c != OUT);
869		st = step(m->g, startst, stopst, tmp, c, st);
870		SP("saft", st, c);
871		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
872		p++;
873	}
874
875	return(matchp);
876}
877
878
879/*
880 - step - map set of states reachable before char to set reachable after
881 */
882static states
883step(struct re_guts *g,
884    sopno start,		/* start state within strip */
885    sopno stop,			/* state after stop state within strip */
886    states bef,			/* states reachable before */
887    int ch,			/* character or NONCHAR code */
888    states aft)			/* states already known reachable after */
889{
890	cset *cs;
891	sop s;
892	sopno pc;
893	onestate here;		/* note, macros know this name */
894	sopno look;
895	int i;
896
897	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
898		s = g->strip[pc];
899		switch (OP(s)) {
900		case OEND:
901			assert(pc == stop-1);
902			break;
903		case OCHAR:
904			/* only characters can match */
905			assert(!NONCHAR(ch) || ch != (char)OPND(s));
906			if (ch == (char)OPND(s))
907				FWD(aft, bef, 1);
908			break;
909		case OBOL:
910			if (ch == BOL || ch == BOLEOL)
911				FWD(aft, bef, 1);
912			break;
913		case OEOL:
914			if (ch == EOL || ch == BOLEOL)
915				FWD(aft, bef, 1);
916			break;
917		case OBOW:
918			if (ch == BOW)
919				FWD(aft, bef, 1);
920			break;
921		case OEOW:
922			if (ch == EOW)
923				FWD(aft, bef, 1);
924			break;
925		case OANY:
926			if (!NONCHAR(ch))
927				FWD(aft, bef, 1);
928			break;
929		case OANYOF:
930			cs = &g->sets[OPND(s)];
931			if (!NONCHAR(ch) && CHIN(cs, ch))
932				FWD(aft, bef, 1);
933			break;
934		case OBACK_:		/* ignored here */
935		case O_BACK:
936			FWD(aft, aft, 1);
937			break;
938		case OPLUS_:		/* forward, this is just an empty */
939			FWD(aft, aft, 1);
940			break;
941		case O_PLUS:		/* both forward and back */
942			FWD(aft, aft, 1);
943			i = ISSETBACK(aft, OPND(s));
944			BACK(aft, aft, OPND(s));
945			if (!i && ISSETBACK(aft, OPND(s))) {
946				/* oho, must reconsider loop body */
947				pc -= OPND(s) + 1;
948				INIT(here, pc);
949			}
950			break;
951		case OQUEST_:		/* two branches, both forward */
952			FWD(aft, aft, 1);
953			FWD(aft, aft, OPND(s));
954			break;
955		case O_QUEST:		/* just an empty */
956			FWD(aft, aft, 1);
957			break;
958		case OLPAREN:		/* not significant here */
959		case ORPAREN:
960			FWD(aft, aft, 1);
961			break;
962		case OCH_:		/* mark the first two branches */
963			FWD(aft, aft, 1);
964			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
965			FWD(aft, aft, OPND(s));
966			break;
967		case OOR1:		/* done a branch, find the O_CH */
968			if (ISSTATEIN(aft, here)) {
969				for (look = 1;
970						OP(s = g->strip[pc+look]) != O_CH;
971						look += OPND(s))
972					assert(OP(s) == OOR2);
973				FWD(aft, aft, look);
974			}
975			break;
976		case OOR2:		/* propagate OCH_'s marking */
977			FWD(aft, aft, 1);
978			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
979				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
980				FWD(aft, aft, OPND(s));
981			}
982			break;
983		case O_CH:		/* just empty */
984			FWD(aft, aft, 1);
985			break;
986		default:		/* ooooops... */
987			assert(nope);
988			break;
989		}
990	}
991
992	return(aft);
993}
994
995#ifdef REDEBUG
996/*
997 - print - print a set of states
998 */
999static void
1000print(struct match *m, char *caption, states st, int ch, FILE *d)
1001{
1002	struct re_guts *g = m->g;
1003	int i;
1004	int first = 1;
1005
1006	if (!(m->eflags&REG_TRACE))
1007		return;
1008
1009	(void)fprintf(d, "%s", caption);
1010	if (ch != '\0')
1011		(void)fprintf(d, " %s", pchar(ch));
1012	for (i = 0; i < g->nstates; i++)
1013		if (ISSET(st, i)) {
1014			(void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1015			first = 0;
1016		}
1017	(void)fprintf(d, "\n");
1018}
1019
1020/*
1021 - at - print current situation
1022 */
1023static void
1024at(struct match *m, char *title, char *start, char *stop, sopno startst,
1025    sopno stopst)
1026{
1027	if (!(m->eflags&REG_TRACE))
1028		return;
1029
1030	(void)printf("%s %s-", title, pchar(*start));
1031	(void)printf("%s ", pchar(*stop));
1032	(void)printf("%ld-%ld\n", (long)startst, (long)stopst);
1033}
1034
1035#ifndef PCHARDONE
1036#define	PCHARDONE	/* never again */
1037/*
1038 - pchar - make a character printable
1039 *
1040 * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1041 * duplicate here avoids having a debugging-capable regexec.o tied to
1042 * a matching debug.o, and this is convenient.  It all disappears in
1043 * the non-debug compilation anyway, so it doesn't matter much.
1044 */
1045static char *			/* -> representation */
1046pchar(int ch)
1047{
1048	static char pbuf[10];
1049
1050	if (isPrint(ch) || ch == ' ')
1051		(void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1052	else
1053		(void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1054	return(pbuf);
1055}
1056#endif
1057#endif
1058
1059#undef	matcher
1060#undef	fast
1061#undef	slow
1062#undef	dissect
1063#undef	backref
1064#undef	step
1065#undef	print
1066#undef	at
1067#undef	match
1068#undef	nope
1069#undef	step_back
1070