xref: /netbsd/lib/libc/regex/engine.c (revision c4a72b64)
1 /*	$NetBSD: engine.c,v 1.14 2002/07/22 12:56:17 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5  * Copyright (c) 1992, 1993, 1994
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Henry Spencer.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  *	@(#)engine.c	8.5 (Berkeley) 3/20/94
40  */
41 
42 /*
43  * The matching engine and friends.  This file is #included by regexec.c
44  * after suitable #defines of a variety of macros used herein, so that
45  * different state representations can be used without duplicating masses
46  * of code.
47  */
48 
49 #ifdef SNAMES
50 #define	matcher	smatcher
51 #define	fast	sfast
52 #define	slow	sslow
53 #define	dissect	sdissect
54 #define	backref	sbackref
55 #define	step	sstep
56 #define	print	sprint
57 #define	at	sat
58 #define	match	smat
59 #define	nope	snope
60 #endif
61 #ifdef LNAMES
62 #define	matcher	lmatcher
63 #define	fast	lfast
64 #define	slow	lslow
65 #define	dissect	ldissect
66 #define	backref	lbackref
67 #define	step	lstep
68 #define	print	lprint
69 #define	at	lat
70 #define	match	lmat
71 #define	nope	lnope
72 #endif
73 
74 /* another structure passed up and down to avoid zillions of parameters */
75 struct match {
76 	struct re_guts *g;
77 	int eflags;
78 	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
79 	char *offp;		/* offsets work from here */
80 	char *beginp;		/* start of string -- virtual NUL precedes */
81 	char *endp;		/* end of string -- virtual NUL here */
82 	char *coldp;		/* can be no match starting before here */
83 	char **lastpos;		/* [nplus+1] */
84 	STATEVARS;
85 	states st;		/* current states */
86 	states fresh;		/* states for a fresh start */
87 	states tmp;		/* temporary */
88 	states empty;		/* empty set of states */
89 };
90 
91 /* ========= begin header generated by ./mkh ========= */
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
95 
96 /* === engine.c === */
97 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
98 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
99 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
100 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
101 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
102 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
103 #define	BOL	(OUT+1)
104 #define	EOL	(BOL+1)
105 #define	BOLEOL	(BOL+2)
106 #define	NOTHING	(BOL+3)
107 #define	BOW	(BOL+4)
108 #define	EOW	(BOL+5)
109 #define	CODEMAX	(BOL+5)		/* highest code used */
110 #define	NONCHAR(c)	((c) > CHAR_MAX)
111 #define	NNONCHAR	(CODEMAX-CHAR_MAX)
112 #ifdef REDEBUG
113 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
114 #endif
115 #ifdef REDEBUG
116 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
117 #endif
118 #ifdef REDEBUG
119 static char *pchar __P((int ch));
120 #endif
121 
122 #ifdef __cplusplus
123 }
124 #endif
125 /* ========= end header generated by ./mkh ========= */
126 
127 #ifdef REDEBUG
128 #define	SP(t, s, c)	print(m, t, s, c, stdout)
129 #define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
130 #define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
131 static int nope = 0;
132 #else
133 #define	SP(t, s, c)	/* nothing */
134 #define	AT(t, p1, p2, s1, s2)	/* nothing */
135 #define	NOTE(s)	/* nothing */
136 #endif
137 
138 /*
139  - matcher - the actual matching engine
140  == static int matcher(struct re_guts *g, char *string, \
141  ==	size_t nmatch, regmatch_t pmatch[], int eflags);
142  */
143 static int			/* 0 success, REG_NOMATCH failure */
144 matcher(g, string, nmatch, pmatch, eflags)
145 struct re_guts *g;
146 char *string;
147 size_t nmatch;
148 regmatch_t pmatch[];
149 int eflags;
150 {
151 	char *endp;
152 	int i;
153 	struct match mv;
154 	struct match *m = &mv;
155 	char *dp;
156 	const sopno gf = g->firststate+1;	/* +1 for OEND */
157 	const sopno gl = g->laststate;
158 	char *start;
159 	char *stop;
160 	int error = 0;
161 
162 	_DIAGASSERT(g != NULL);
163 	_DIAGASSERT(string != NULL);
164 	/* pmatch checked below */
165 
166 	/* simplify the situation where possible */
167 	if (g->cflags&REG_NOSUB)
168 		nmatch = 0;
169 	if (eflags&REG_STARTEND) {
170 		_DIAGASSERT(pmatch != NULL);
171 		start = string + (size_t)pmatch[0].rm_so;
172 		stop = string + (size_t)pmatch[0].rm_eo;
173 	} else {
174 		start = string;
175 		stop = start + strlen(start);
176 	}
177 	if (stop < start)
178 		return(REG_INVARG);
179 
180 	/* prescreening; this does wonders for this rather slow code */
181 	if (g->must != NULL) {
182 		for (dp = start; dp < stop; dp++)
183 			if (*dp == g->must[0] && stop - dp >= g->mlen &&
184 				memcmp(dp, g->must, (size_t)g->mlen) == 0)
185 				break;
186 		if (dp == stop)		/* we didn't find g->must */
187 			return(REG_NOMATCH);
188 	}
189 
190 	/* match struct setup */
191 	m->g = g;
192 	m->eflags = eflags;
193 	m->pmatch = NULL;
194 	m->lastpos = NULL;
195 	m->offp = string;
196 	m->beginp = start;
197 	m->endp = stop;
198 	STATESETUP(m, 4);
199 	SETUP(m->st);
200 	SETUP(m->fresh);
201 	SETUP(m->tmp);
202 	SETUP(m->empty);
203 	CLEAR(m->empty);
204 
205 	/* this loop does only one repetition except for backrefs */
206 	for (;;) {
207 		endp = fast(m, start, stop, gf, gl);
208 		if (endp == NULL) {		/* a miss */
209 			error = REG_NOMATCH;
210 			goto done;
211 		}
212 		if (nmatch == 0 && !g->backrefs)
213 			break;		/* no further info needed */
214 
215 		/* where? */
216 		assert(m->coldp != NULL);
217 		for (;;) {
218 			NOTE("finding start");
219 			endp = slow(m, m->coldp, stop, gf, gl);
220 			if (endp != NULL)
221 				break;
222 			assert(m->coldp < m->endp);
223 			m->coldp++;
224 		}
225 		if (nmatch == 1 && !g->backrefs)
226 			break;		/* no further info needed */
227 
228 		/* oh my, he wants the subexpressions... */
229 		if (m->pmatch == NULL)
230 			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
231 							sizeof(regmatch_t));
232 		if (m->pmatch == NULL) {
233 			error = REG_ESPACE;
234 			goto done;
235 		}
236 		for (i = 1; i <= m->g->nsub; i++)
237 			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1;
238 		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
239 			NOTE("dissecting");
240 			dp = dissect(m, m->coldp, endp, gf, gl);
241 		} else {
242 			if (g->nplus > 0 && m->lastpos == NULL)
243 				m->lastpos = (char **)malloc((g->nplus+1) *
244 							sizeof(char *));
245 			if (g->nplus > 0 && m->lastpos == NULL) {
246 				error = REG_ESPACE;
247 				goto done;
248 			}
249 			NOTE("backref dissect");
250 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
251 		}
252 		if (dp != NULL)
253 			break;
254 
255 		/* uh-oh... we couldn't find a subexpression-level match */
256 		assert(g->backrefs);	/* must be back references doing it */
257 		assert(g->nplus == 0 || m->lastpos != NULL);
258 		for (;;) {
259 			if (dp != NULL || endp <= m->coldp)
260 				break;		/* defeat */
261 			NOTE("backoff");
262 			endp = slow(m, m->coldp, endp-1, gf, gl);
263 			if (endp == NULL)
264 				break;		/* defeat */
265 			/* try it on a shorter possibility */
266 #ifndef NDEBUG
267 			for (i = 1; i <= m->g->nsub; i++) {
268 				assert(m->pmatch[i].rm_so == (regoff_t)-1);
269 				assert(m->pmatch[i].rm_eo == (regoff_t)-1);
270 			}
271 #endif
272 			NOTE("backoff dissect");
273 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
274 		}
275 		assert(dp == NULL || dp == endp);
276 		if (dp != NULL)		/* found a shorter one */
277 			break;
278 
279 		/* despite initial appearances, there is no match here */
280 		NOTE("false alarm");
281 		start = m->coldp + 1;	/* recycle starting later */
282 		assert(start <= stop);
283 	}
284 
285 	/* fill in the details if requested */
286 	if (nmatch > 0) {
287 		_DIAGASSERT(pmatch != NULL);
288 		pmatch[0].rm_so = m->coldp - m->offp;
289 		pmatch[0].rm_eo = endp - m->offp;
290 	}
291 	if (nmatch > 1) {
292 		assert(m->pmatch != NULL);
293 		for (i = 1; i < nmatch; i++)
294 			if (i <= m->g->nsub)
295 				pmatch[i] = m->pmatch[i];
296 			else {
297 				pmatch[i].rm_so = (regoff_t)-1;
298 				pmatch[i].rm_eo = (regoff_t)-1;
299 			}
300 	}
301 
302 done:
303 	if (m->pmatch != NULL) {
304 		free(m->pmatch);
305 		m->pmatch = NULL;
306 	}
307 	if (m->lastpos != NULL) {
308 		free(m->lastpos);
309 		m->lastpos = NULL;
310 	}
311 	STATETEARDOWN(m);
312 	return error;
313 }
314 
315 /*
316  - dissect - figure out what matched what, no back references
317  == static char *dissect(struct match *m, char *start, \
318  ==	char *stop, sopno startst, sopno stopst);
319  */
320 static char *			/* == stop (success) always */
321 dissect(m, start, stop, startst, stopst)
322 struct match *m;
323 char *start;
324 char *stop;
325 sopno startst;
326 sopno stopst;
327 {
328 	int i;
329 	sopno ss;	/* start sop of current subRE */
330 	sopno es;	/* end sop of current subRE */
331 	char *sp;	/* start of string matched by it */
332 	char *stp;	/* string matched by it cannot pass here */
333 	char *rest;	/* start of rest of string */
334 	char *tail;	/* string unmatched by rest of RE */
335 	sopno ssub;	/* start sop of subsubRE */
336 	sopno esub;	/* end sop of subsubRE */
337 	char *ssp;	/* start of string matched by subsubRE */
338 	char *sep;	/* end of string matched by subsubRE */
339 	char *oldssp;	/* previous ssp */
340 #ifndef NDEBUG
341 	char *dp;
342 #endif
343 
344 	_DIAGASSERT(m != NULL);
345 	_DIAGASSERT(start != NULL);
346 	_DIAGASSERT(stop != NULL);
347 
348 	AT("diss", start, stop, startst, stopst);
349 	sp = start;
350 	for (ss = startst; ss < stopst; ss = es) {
351 		/* identify end of subRE */
352 		es = ss;
353 		switch (OP(m->g->strip[es])) {
354 		case OPLUS_:
355 		case OQUEST_:
356 			es += OPND(m->g->strip[es]);
357 			break;
358 		case OCH_:
359 			while (OP(m->g->strip[es]) != O_CH)
360 				es += OPND(m->g->strip[es]);
361 			break;
362 		}
363 		es++;
364 
365 		/* figure out what it matched */
366 		switch (OP(m->g->strip[ss])) {
367 		case OEND:
368 			assert(nope);
369 			break;
370 		case OCHAR:
371 			sp++;
372 			break;
373 		case OBOL:
374 		case OEOL:
375 		case OBOW:
376 		case OEOW:
377 			break;
378 		case OANY:
379 		case OANYOF:
380 			sp++;
381 			break;
382 		case OBACK_:
383 		case O_BACK:
384 			assert(nope);
385 			break;
386 		/* cases where length of match is hard to find */
387 		case OQUEST_:
388 			stp = stop;
389 			for (;;) {
390 				/* how long could this one be? */
391 				rest = slow(m, sp, stp, ss, es);
392 				assert(rest != NULL);	/* it did match */
393 				/* could the rest match the rest? */
394 				tail = slow(m, rest, stop, es, stopst);
395 				if (tail == stop)
396 					break;		/* yes! */
397 				/* no -- try a shorter match for this one */
398 				stp = rest - 1;
399 				assert(stp >= sp);	/* it did work */
400 			}
401 			ssub = ss + 1;
402 			esub = es - 1;
403 			/* did innards match? */
404 			if (slow(m, sp, rest, ssub, esub) != NULL) {
405 #ifdef NDEBUG
406 				(void)
407 #else
408 				dp =
409 #endif
410 				    dissect(m, sp, rest, ssub, esub);
411 				assert(dp == rest);
412 			} else		/* no */
413 				assert(sp == rest);
414 			sp = rest;
415 			break;
416 		case OPLUS_:
417 			stp = stop;
418 			for (;;) {
419 				/* how long could this one be? */
420 				rest = slow(m, sp, stp, ss, es);
421 				assert(rest != NULL);	/* it did match */
422 				/* could the rest match the rest? */
423 				tail = slow(m, rest, stop, es, stopst);
424 				if (tail == stop)
425 					break;		/* yes! */
426 				/* no -- try a shorter match for this one */
427 				stp = rest - 1;
428 				assert(stp >= sp);	/* it did work */
429 			}
430 			ssub = ss + 1;
431 			esub = es - 1;
432 			ssp = sp;
433 			oldssp = ssp;
434 			for (;;) {	/* find last match of innards */
435 				sep = slow(m, ssp, rest, ssub, esub);
436 				if (sep == NULL || sep == ssp)
437 					break;	/* failed or matched null */
438 				oldssp = ssp;	/* on to next try */
439 				ssp = sep;
440 			}
441 			if (sep == NULL) {
442 				/* last successful match */
443 				sep = ssp;
444 				ssp = oldssp;
445 			}
446 			assert(sep == rest);	/* must exhaust substring */
447 			assert(slow(m, ssp, sep, ssub, esub) == rest);
448 #ifdef NDEBUG
449 			(void)
450 #else
451 			dp =
452 #endif
453 			    dissect(m, ssp, sep, ssub, esub);
454 			assert(dp == sep);
455 			sp = rest;
456 			break;
457 		case OCH_:
458 			stp = stop;
459 			for (;;) {
460 				/* how long could this one be? */
461 				rest = slow(m, sp, stp, ss, es);
462 				assert(rest != NULL);	/* it did match */
463 				/* could the rest match the rest? */
464 				tail = slow(m, rest, stop, es, stopst);
465 				if (tail == stop)
466 					break;		/* yes! */
467 				/* no -- try a shorter match for this one */
468 				stp = rest - 1;
469 				assert(stp >= sp);	/* it did work */
470 			}
471 			ssub = ss + 1;
472 			esub = ss + OPND(m->g->strip[ss]) - 1;
473 			assert(OP(m->g->strip[esub]) == OOR1);
474 			for (;;) {	/* find first matching branch */
475 				if (slow(m, sp, rest, ssub, esub) == rest)
476 					break;	/* it matched all of it */
477 				/* that one missed, try next one */
478 				assert(OP(m->g->strip[esub]) == OOR1);
479 				esub++;
480 				assert(OP(m->g->strip[esub]) == OOR2);
481 				ssub = esub + 1;
482 				esub += OPND(m->g->strip[esub]);
483 				if (OP(m->g->strip[esub]) == OOR2)
484 					esub--;
485 				else
486 					assert(OP(m->g->strip[esub]) == O_CH);
487 			}
488 #ifdef NDEBUG
489 			(void)
490 #else
491 			dp =
492 #endif
493 			    dissect(m, sp, rest, ssub, esub);
494 			assert(dp == rest);
495 			sp = rest;
496 			break;
497 		case O_PLUS:
498 		case O_QUEST:
499 		case OOR1:
500 		case OOR2:
501 		case O_CH:
502 			assert(nope);
503 			break;
504 		case OLPAREN:
505 			i = OPND(m->g->strip[ss]);
506 			assert(0 < i && i <= m->g->nsub);
507 			m->pmatch[i].rm_so = sp - m->offp;
508 			break;
509 		case ORPAREN:
510 			i = OPND(m->g->strip[ss]);
511 			assert(0 < i && i <= m->g->nsub);
512 			m->pmatch[i].rm_eo = sp - m->offp;
513 			break;
514 		default:		/* uh oh */
515 			assert(nope);
516 			break;
517 		}
518 	}
519 
520 	assert(sp == stop);
521 	return(sp);
522 }
523 
524 /*
525  - backref - figure out what matched what, figuring in back references
526  == static char *backref(struct match *m, char *start, \
527  ==	char *stop, sopno startst, sopno stopst, sopno lev);
528  */
529 static char *			/* == stop (success) or NULL (failure) */
530 backref(m, start, stop, startst, stopst, lev)
531 struct match *m;
532 char *start;
533 char *stop;
534 sopno startst;
535 sopno stopst;
536 sopno lev;			/* PLUS nesting level */
537 {
538 	int i;
539 	sopno ss;	/* start sop of current subRE */
540 	char *sp;	/* start of string matched by it */
541 	sopno ssub;	/* start sop of subsubRE */
542 	sopno esub;	/* end sop of subsubRE */
543 	char *ssp;	/* start of string matched by subsubRE */
544 	char *dp;
545 	size_t len;
546 	int hard;
547 	sop s;
548 	regoff_t offsave;
549 	cset *cs;
550 
551 	_DIAGASSERT(m != NULL);
552 	_DIAGASSERT(start != NULL);
553 	_DIAGASSERT(stop != NULL);
554 
555 	AT("back", start, stop, startst, stopst);
556 	sp = start;
557 
558 	/* get as far as we can with easy stuff */
559 	hard = 0;
560 	for (ss = startst; !hard && ss < stopst; ss++)
561 		switch (OP(s = m->g->strip[ss])) {
562 		case OCHAR:
563 			if (sp == stop || *sp++ != (char)OPND(s))
564 				return(NULL);
565 			break;
566 		case OANY:
567 			if (sp == stop)
568 				return(NULL);
569 			sp++;
570 			break;
571 		case OANYOF:
572 			cs = &m->g->sets[OPND(s)];
573 			if (sp == stop || !CHIN(cs, *sp++))
574 				return(NULL);
575 			break;
576 		case OBOL:
577 			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
578 					(sp < m->endp && *(sp-1) == '\n' &&
579 						(m->g->cflags&REG_NEWLINE)) )
580 				{ /* yes */ }
581 			else
582 				return(NULL);
583 			break;
584 		case OEOL:
585 			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
586 					(sp < m->endp && *sp == '\n' &&
587 						(m->g->cflags&REG_NEWLINE)) )
588 				{ /* yes */ }
589 			else
590 				return(NULL);
591 			break;
592 		case OBOW:
593 			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
594 					(sp < m->endp && *(sp-1) == '\n' &&
595 						(m->g->cflags&REG_NEWLINE)) ||
596 					(sp > m->beginp &&
597 							!ISWORD(*(sp-1))) ) &&
598 					(sp < m->endp && ISWORD(*sp)) )
599 				{ /* yes */ }
600 			else
601 				return(NULL);
602 			break;
603 		case OEOW:
604 			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
605 					(sp < m->endp && *sp == '\n' &&
606 						(m->g->cflags&REG_NEWLINE)) ||
607 					(sp < m->endp && !ISWORD(*sp)) ) &&
608 					(sp > m->beginp && ISWORD(*(sp-1))) )
609 				{ /* yes */ }
610 			else
611 				return(NULL);
612 			break;
613 		case O_QUEST:
614 			break;
615 		case OOR1:	/* matches null but needs to skip */
616 			ss++;
617 			s = m->g->strip[ss];
618 			do {
619 				assert(OP(s) == OOR2);
620 				ss += OPND(s);
621 			} while (OP(s = m->g->strip[ss]) != O_CH);
622 			/* note that the ss++ gets us past the O_CH */
623 			break;
624 		default:	/* have to make a choice */
625 			hard = 1;
626 			break;
627 		}
628 	if (!hard) {		/* that was it! */
629 		if (sp != stop)
630 			return(NULL);
631 		return(sp);
632 	}
633 	ss--;			/* adjust for the for's final increment */
634 
635 	/* the hard stuff */
636 	AT("hard", sp, stop, ss, stopst);
637 	s = m->g->strip[ss];
638 	switch (OP(s)) {
639 	case OBACK_:		/* the vilest depths */
640 		i = OPND(s);
641 		assert(0 < i && i <= m->g->nsub);
642 		if (m->pmatch[i].rm_eo == (regoff_t)-1)
643 			return(NULL);
644 		assert(m->pmatch[i].rm_so != (regoff_t)-1);
645 		len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so);
646 		assert(stop - m->beginp >= len);
647 		if (sp > stop - len)
648 			return(NULL);	/* not enough left to match */
649 		ssp = m->offp + (size_t)m->pmatch[i].rm_so;
650 		if (memcmp(sp, ssp, len) != 0)
651 			return(NULL);
652 		while (m->g->strip[ss] != SOP(O_BACK, i))
653 			ss++;
654 		return(backref(m, sp+len, stop, ss+1, stopst, lev));
655 
656 	case OQUEST_:		/* to null or not */
657 		dp = backref(m, sp, stop, ss+1, stopst, lev);
658 		if (dp != NULL)
659 			return(dp);	/* not */
660 		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
661 
662 	case OPLUS_:
663 		assert(m->lastpos != NULL);
664 		assert(lev+1 <= m->g->nplus);
665 		m->lastpos[lev+1] = sp;
666 		return(backref(m, sp, stop, ss+1, stopst, lev+1));
667 
668 	case O_PLUS:
669 		if (sp == m->lastpos[lev])	/* last pass matched null */
670 			return(backref(m, sp, stop, ss+1, stopst, lev-1));
671 		/* try another pass */
672 		m->lastpos[lev] = sp;
673 		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
674 		if (dp == NULL)
675 			dp = backref(m, sp, stop, ss+1, stopst, lev-1);
676 		return(dp);
677 
678 	case OCH_:		/* find the right one, if any */
679 		ssub = ss + 1;
680 		esub = ss + OPND(s) - 1;
681 		assert(OP(m->g->strip[esub]) == OOR1);
682 		for (;;) {	/* find first matching branch */
683 			dp = backref(m, sp, stop, ssub, esub, lev);
684 			if (dp != NULL)
685 				return(dp);
686 			/* that one missed, try next one */
687 			if (OP(m->g->strip[esub]) == O_CH)
688 				return(NULL);	/* there is none */
689 			esub++;
690 			assert(OP(m->g->strip[esub]) == OOR2);
691 			ssub = esub + 1;
692 			esub += OPND(m->g->strip[esub]);
693 			if (OP(m->g->strip[esub]) == OOR2)
694 				esub--;
695 			else
696 				assert(OP(m->g->strip[esub]) == O_CH);
697 		}
698 
699 	case OLPAREN:		/* must undo assignment if rest fails */
700 		i = OPND(s);
701 		assert(0 < i && i <= m->g->nsub);
702 		offsave = m->pmatch[i].rm_so;
703 		m->pmatch[i].rm_so = sp - m->offp;
704 		dp = backref(m, sp, stop, ss+1, stopst, lev);
705 		if (dp != NULL)
706 			return(dp);
707 		m->pmatch[i].rm_so = offsave;
708 		return(NULL);
709 
710 	case ORPAREN:		/* must undo assignment if rest fails */
711 		i = OPND(s);
712 		assert(0 < i && i <= m->g->nsub);
713 		offsave = m->pmatch[i].rm_eo;
714 		m->pmatch[i].rm_eo = sp - m->offp;
715 		dp = backref(m, sp, stop, ss+1, stopst, lev);
716 		if (dp != NULL)
717 			return(dp);
718 		m->pmatch[i].rm_eo = offsave;
719 		return(NULL);
720 
721 	default:		/* uh oh */
722 		assert(nope);
723 		break;
724 	}
725 
726 	/* "can't happen" */
727 	assert(nope);
728 	/* NOTREACHED */
729 	return NULL;
730 }
731 
732 /*
733  - fast - step through the string at top speed
734  == static char *fast(struct match *m, char *start, \
735  ==	char *stop, sopno startst, sopno stopst);
736  */
737 static char *			/* where tentative match ended, or NULL */
738 fast(m, start, stop, startst, stopst)
739 struct match *m;
740 char *start;
741 char *stop;
742 sopno startst;
743 sopno stopst;
744 {
745 	states st = m->st;
746 	states fresh = m->fresh;
747 	states tmp = m->tmp;
748 	char *p = start;
749 	int c = (start == m->beginp) ? OUT : *(start-1);
750 	int lastc;	/* previous c */
751 	int flagch;
752 	int i;
753 	char *coldp;	/* last p after which no match was underway */
754 
755 	_DIAGASSERT(m != NULL);
756 	_DIAGASSERT(start != NULL);
757 	_DIAGASSERT(stop != NULL);
758 
759 	CLEAR(st);
760 	SET1(st, startst);
761 	st = step(m->g, startst, stopst, st, NOTHING, st);
762 	ASSIGN(fresh, st);
763 	SP("start", st, *p);
764 	coldp = NULL;
765 	for (;;) {
766 		/* next character */
767 		lastc = c;
768 		c = (p == m->endp) ? OUT : *p;
769 		if (EQ(st, fresh))
770 			coldp = p;
771 
772 		/* is there an EOL and/or BOL between lastc and c? */
773 		flagch = '\0';
774 		i = 0;
775 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
776 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
777 			flagch = BOL;
778 			i = m->g->nbol;
779 		}
780 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
781 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
782 			flagch = (flagch == BOL) ? BOLEOL : EOL;
783 			i += m->g->neol;
784 		}
785 		if (i != 0) {
786 			for (; i > 0; i--)
787 				st = step(m->g, startst, stopst, st, flagch, st);
788 			SP("boleol", st, c);
789 		}
790 
791 		/* how about a word boundary? */
792 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
793 					(c != OUT && ISWORD(c)) ) {
794 			flagch = BOW;
795 		}
796 		if ( (lastc != OUT && ISWORD(lastc)) &&
797 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
798 			flagch = EOW;
799 		}
800 		if (flagch == BOW || flagch == EOW) {
801 			st = step(m->g, startst, stopst, st, flagch, st);
802 			SP("boweow", st, c);
803 		}
804 
805 		/* are we done? */
806 		if (ISSET(st, stopst) || p == stop)
807 			break;		/* NOTE BREAK OUT */
808 
809 		/* no, we must deal with this character */
810 		ASSIGN(tmp, st);
811 		ASSIGN(st, fresh);
812 		assert(c != OUT);
813 		st = step(m->g, startst, stopst, tmp, c, st);
814 		SP("aft", st, c);
815 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
816 		p++;
817 	}
818 
819 	assert(coldp != NULL);
820 	m->coldp = coldp;
821 	if (ISSET(st, stopst))
822 		return(p+1);
823 	else
824 		return(NULL);
825 }
826 
827 /*
828  - slow - step through the string more deliberately
829  == static char *slow(struct match *m, char *start, \
830  ==	char *stop, sopno startst, sopno stopst);
831  */
832 static char *			/* where it ended */
833 slow(m, start, stop, startst, stopst)
834 struct match *m;
835 char *start;
836 char *stop;
837 sopno startst;
838 sopno stopst;
839 {
840 	states st = m->st;
841 	states empty = m->empty;
842 	states tmp = m->tmp;
843 	char *p = start;
844 	int c = (start == m->beginp) ? OUT : *(start-1);
845 	int lastc;	/* previous c */
846 	int flagch;
847 	int i;
848 	char *matchp;	/* last p at which a match ended */
849 
850 	_DIAGASSERT(m != NULL);
851 	_DIAGASSERT(start != NULL);
852 	_DIAGASSERT(stop != NULL);
853 
854 	AT("slow", start, stop, startst, stopst);
855 	CLEAR(st);
856 	SET1(st, startst);
857 	SP("sstart", st, *p);
858 	st = step(m->g, startst, stopst, st, NOTHING, st);
859 	matchp = NULL;
860 	for (;;) {
861 		/* next character */
862 		lastc = c;
863 		c = (p == m->endp) ? OUT : *p;
864 
865 		/* is there an EOL and/or BOL between lastc and c? */
866 		flagch = '\0';
867 		i = 0;
868 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
869 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
870 			flagch = BOL;
871 			i = m->g->nbol;
872 		}
873 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
874 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
875 			flagch = (flagch == BOL) ? BOLEOL : EOL;
876 			i += m->g->neol;
877 		}
878 		if (i != 0) {
879 			for (; i > 0; i--)
880 				st = step(m->g, startst, stopst, st, flagch, st);
881 			SP("sboleol", st, c);
882 		}
883 
884 		/* how about a word boundary? */
885 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
886 					(c != OUT && ISWORD(c)) ) {
887 			flagch = BOW;
888 		}
889 		if ( (lastc != OUT && ISWORD(lastc)) &&
890 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
891 			flagch = EOW;
892 		}
893 		if (flagch == BOW || flagch == EOW) {
894 			st = step(m->g, startst, stopst, st, flagch, st);
895 			SP("sboweow", st, c);
896 		}
897 
898 		/* are we done? */
899 		if (ISSET(st, stopst))
900 			matchp = p;
901 		if (EQ(st, empty) || p == stop)
902 			break;		/* NOTE BREAK OUT */
903 
904 		/* no, we must deal with this character */
905 		ASSIGN(tmp, st);
906 		ASSIGN(st, empty);
907 		assert(c != OUT);
908 		st = step(m->g, startst, stopst, tmp, c, st);
909 		SP("saft", st, c);
910 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
911 		p++;
912 	}
913 
914 	return(matchp);
915 }
916 
917 
918 /*
919  - step - map set of states reachable before char to set reachable after
920  == static states step(struct re_guts *g, sopno start, sopno stop, \
921  ==	states bef, int ch, states aft);
922  == #define	BOL	(OUT+1)
923  == #define	EOL	(BOL+1)
924  == #define	BOLEOL	(BOL+2)
925  == #define	NOTHING	(BOL+3)
926  == #define	BOW	(BOL+4)
927  == #define	EOW	(BOL+5)
928  == #define	CODEMAX	(BOL+5)		// highest code used
929  == #define	NONCHAR(c)	((c) > CHAR_MAX)
930  == #define	NNONCHAR	(CODEMAX-CHAR_MAX)
931  */
932 static states
933 step(g, start, stop, bef, ch, aft)
934 struct re_guts *g;
935 sopno start;			/* start state within strip */
936 sopno stop;			/* state after stop state within strip */
937 states bef;		/* states reachable before */
938 int ch;				/* character or NONCHAR code */
939 states aft;		/* states already known reachable after */
940 {
941 	cset *cs;
942 	sop s;
943 	sopno pc;
944 	onestate here;		/* note, macros know this name */
945 	sopno look;
946 	int i;
947 
948 	_DIAGASSERT(g != NULL);
949 
950 	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
951 		s = g->strip[pc];
952 		switch (OP(s)) {
953 		case OEND:
954 			assert(pc == stop-1);
955 			break;
956 		case OCHAR:
957 			/* only characters can match */
958 			assert(!NONCHAR(ch) || ch != (char)OPND(s));
959 			if (ch == (char)OPND(s))
960 				FWD(aft, bef, 1);
961 			break;
962 		case OBOL:
963 			if (ch == BOL || ch == BOLEOL)
964 				FWD(aft, bef, 1);
965 			break;
966 		case OEOL:
967 			if (ch == EOL || ch == BOLEOL)
968 				FWD(aft, bef, 1);
969 			break;
970 		case OBOW:
971 			if (ch == BOW)
972 				FWD(aft, bef, 1);
973 			break;
974 		case OEOW:
975 			if (ch == EOW)
976 				FWD(aft, bef, 1);
977 			break;
978 		case OANY:
979 			if (!NONCHAR(ch))
980 				FWD(aft, bef, 1);
981 			break;
982 		case OANYOF:
983 			cs = &g->sets[OPND(s)];
984 			if (!NONCHAR(ch) && CHIN(cs, ch))
985 				FWD(aft, bef, 1);
986 			break;
987 		case OBACK_:		/* ignored here */
988 		case O_BACK:
989 			FWD(aft, aft, 1);
990 			break;
991 		case OPLUS_:		/* forward, this is just an empty */
992 			FWD(aft, aft, 1);
993 			break;
994 		case O_PLUS:		/* both forward and back */
995 			FWD(aft, aft, 1);
996 			i = ISSETBACK(aft, OPND(s));
997 			BACK(aft, aft, OPND(s));
998 			if (!i && ISSETBACK(aft, OPND(s))) {
999 				/* oho, must reconsider loop body */
1000 				pc -= OPND(s) + 1;
1001 				INIT(here, pc);
1002 			}
1003 			break;
1004 		case OQUEST_:		/* two branches, both forward */
1005 			FWD(aft, aft, 1);
1006 			FWD(aft, aft, OPND(s));
1007 			break;
1008 		case O_QUEST:		/* just an empty */
1009 			FWD(aft, aft, 1);
1010 			break;
1011 		case OLPAREN:		/* not significant here */
1012 		case ORPAREN:
1013 			FWD(aft, aft, 1);
1014 			break;
1015 		case OCH_:		/* mark the first two branches */
1016 			FWD(aft, aft, 1);
1017 			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1018 			FWD(aft, aft, OPND(s));
1019 			break;
1020 		case OOR1:		/* done a branch, find the O_CH */
1021 			if (ISSTATEIN(aft, here)) {
1022 				for (look = 1;
1023 						OP(s = g->strip[pc+look]) != O_CH;
1024 						look += OPND(s))
1025 					assert(OP(s) == OOR2);
1026 				FWD(aft, aft, look);
1027 			}
1028 			break;
1029 		case OOR2:		/* propagate OCH_'s marking */
1030 			FWD(aft, aft, 1);
1031 			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1032 				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1033 				FWD(aft, aft, OPND(s));
1034 			}
1035 			break;
1036 		case O_CH:		/* just empty */
1037 			FWD(aft, aft, 1);
1038 			break;
1039 		default:		/* ooooops... */
1040 			assert(nope);
1041 			break;
1042 		}
1043 	}
1044 
1045 	return(aft);
1046 }
1047 
1048 #ifdef REDEBUG
1049 /*
1050  - print - print a set of states
1051  == #ifdef REDEBUG
1052  == static void print(struct match *m, char *caption, states st, \
1053  ==	int ch, FILE *d);
1054  == #endif
1055  */
1056 static void
1057 print(m, caption, st, ch, d)
1058 struct match *m;
1059 char *caption;
1060 states st;
1061 int ch;
1062 FILE *d;
1063 {
1064 	struct re_guts *g = m->g;
1065 	int i;
1066 	int first = 1;
1067 
1068 	_DIAGASSERT(m != NULL);
1069 	_DIAGASSERT(caption != NULL);
1070 
1071 	if (!(m->eflags&REG_TRACE))
1072 		return;
1073 
1074 	_DIAGASSERT(d != NULL);
1075 
1076 	fprintf(d, "%s", caption);
1077 	if (ch != '\0')
1078 		fprintf(d, " %s", pchar(ch));
1079 	for (i = 0; i < g->nstates; i++)
1080 		if (ISSET(st, i)) {
1081 			fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1082 			first = 0;
1083 		}
1084 	fprintf(d, "\n");
1085 }
1086 
1087 /*
1088  - at - print current situation
1089  == #ifdef REDEBUG
1090  == static void at(struct match *m, char *title, char *start, char *stop, \
1091  ==						sopno startst, sopno stopst);
1092  == #endif
1093  */
1094 static void
1095 at(m, title, start, stop, startst, stopst)
1096 struct match *m;
1097 char *title;
1098 char *start;
1099 char *stop;
1100 sopno startst;
1101 sopno stopst;
1102 {
1103 
1104 	_DIAGASSERT(m != NULL);
1105 	_DIAGASSERT(title != NULL);
1106 	_DIAGASSERT(start != NULL);
1107 	_DIAGASSERT(stop != NULL);
1108 
1109 	if (!(m->eflags&REG_TRACE))
1110 		return;
1111 
1112 	printf("%s %s-", title, pchar(*start));
1113 	printf("%s ", pchar(*stop));
1114 	printf("%ld-%ld\n", (long)startst, (long)stopst);
1115 }
1116 
1117 #ifndef PCHARDONE
1118 #define	PCHARDONE	/* never again */
1119 /*
1120  - pchar - make a character printable
1121  == #ifdef REDEBUG
1122  == static char *pchar(int ch);
1123  == #endif
1124  *
1125  * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1126  * duplicate here avoids having a debugging-capable regexec.o tied to
1127  * a matching debug.o, and this is convenient.  It all disappears in
1128  * the non-debug compilation anyway, so it doesn't matter much.
1129  */
1130 static char *			/* -> representation */
1131 pchar(ch)
1132 int ch;
1133 {
1134 	static char pbuf[10];
1135 
1136 	if (isprint(ch) || ch == ' ')
1137 		(void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1138 	else
1139 		(void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1140 	return(pbuf);
1141 }
1142 #endif
1143 #endif
1144 
1145 #undef	matcher
1146 #undef	fast
1147 #undef	slow
1148 #undef	dissect
1149 #undef	backref
1150 #undef	step
1151 #undef	print
1152 #undef	at
1153 #undef	match
1154 #undef	nope
1155