1 /*
2  * re_*exec and friends - match REs
3  *
4  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
5  *
6  * Development of this software was funded, in part, by Cray Research Inc.,
7  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8  * Corporation, none of whom are responsible for the results.  The author
9  * thanks all of them.
10  *
11  * Redistribution and use in source and binary forms -- with or without
12  * modification -- are permitted for any purpose, provided that
13  * redistributions in source form retain this entire copyright notice and
14  * indicate the origin and nature of any modifications.
15  *
16  * I'd appreciate being given credit for this package in the documentation
17  * of software which uses it, but that is not a requirement.
18  *
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
22  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  */
31 
32 #include "regguts.h"
33 
34 
35 
36 /* lazy-DFA representation */
37 struct arcp {			/* "pointer" to an outarc */
38 	struct sset *ss;
39 	color co;
40 };
41 
42 struct sset {			/* state set */
43 	unsigned *states;	/* pointer to bitvector */
44 	unsigned hash;		/* hash of bitvector */
45 #		define	HASH(bv, nw)	(((nw) == 1) ? *(bv) : hash(bv, nw))
46 #	define	HIT(h,bv,ss,nw)	((ss)->hash == (h) && ((nw) == 1 || \
47 		memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
48 	int flags;
49 #		define	STARTER		01	/* the initial state set */
50 #		define	POSTSTATE	02	/* includes the goal state */
51 #		define	LOCKED		04	/* locked in cache */
52 #		define	NOPROGRESS	010	/* zero-progress state set */
53 	struct arcp ins;	/* chain of inarcs pointing here */
54 	chr *lastseen;		/* last entered on arrival here */
55 	struct sset **outs;	/* outarc vector indexed by color */
56 	struct arcp *inchain;	/* chain-pointer vector for outarcs */
57 };
58 
59 struct dfa {
60 	int nssets;		/* size of cache */
61 	int nssused;		/* how many entries occupied yet */
62 	int nstates;		/* number of states */
63 	int ncolors;		/* length of outarc and inchain vectors */
64 	int wordsper;		/* length of state-set bitvectors */
65 	struct sset *ssets;	/* state-set cache */
66 	unsigned *statesarea;	/* bitvector storage */
67 	unsigned *work;		/* pointer to work area within statesarea */
68 	struct sset **outsarea;	/* outarc-vector storage */
69 	struct arcp *incarea;	/* inchain storage */
70 	struct cnfa *cnfa;
71 	struct colormap *cm;
72 	chr *lastpost;		/* location of last cache-flushed success */
73 	chr *lastnopr;		/* location of last cache-flushed NOPROGRESS */
74 	struct sset *search;	/* replacement-search-pointer memory */
75 	int cptsmalloced;	/* were the areas individually malloced? */
76 	char *mallocarea;	/* self, or master malloced area, or NULL */
77 };
78 
79 #define	WORK	1		/* number of work bitvectors needed */
80 
81 /* setup for non-malloc allocation for small cases */
82 #define	FEWSTATES	20	/* must be less than UBITS */
83 #define	FEWCOLORS	15
84 struct smalldfa {
85 	struct dfa dfa;
86 	struct sset ssets[FEWSTATES*2];
87 	unsigned statesarea[FEWSTATES*2 + WORK];
88 	struct sset *outsarea[FEWSTATES*2 * FEWCOLORS];
89 	struct arcp incarea[FEWSTATES*2 * FEWCOLORS];
90 };
91 #define	DOMALLOC	((struct smalldfa *)NULL)	/* force malloc */
92 
93 
94 
95 /* internal variables, bundled for easy passing around */
96 struct vars {
97 	regex_t *re;
98 	struct guts *g;
99 	int eflags;		/* copies of arguments */
100 	size_t nmatch;
101 	regmatch_t *pmatch;
102 	rm_detail_t *details;
103 	chr *start;		/* start of string */
104 	chr *stop;		/* just past end of string */
105 	int err;		/* error code if any (0 none) */
106 	regoff_t *mem;		/* memory vector for backtracking */
107 	struct smalldfa dfa1;
108 	struct smalldfa dfa2;
109 };
110 #define	VISERR(vv)	((vv)->err != 0)	/* have we seen an error yet? */
111 #define	ISERR()	VISERR(v)
112 #define	VERR(vv,e)	(((vv)->err) ? (vv)->err : ((vv)->err = (e)))
113 #define	ERR(e)	VERR(v, e)		/* record an error */
114 #define	NOERR()	{if (ISERR()) return v->err;}	/* if error seen, return it */
115 #define	OFF(p)	((p) - v->start)
116 #define	LOFF(p)	((long)OFF(p))
117 
118 
119 
120 /*
121  * forward declarations
122  */
123 /* =====^!^===== begin forwards =====^!^===== */
124 /* automatically gathered by fwd; do not hand-edit */
125 /* === regexec.c === */
126 int exec _ANSI_ARGS_((regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int));
127 static int find _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *));
128 static int cfind _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *));
129 static int cfindloop _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **));
130 static VOID zapsubs _ANSI_ARGS_((regmatch_t *, size_t));
131 static VOID zapmem _ANSI_ARGS_((struct vars *, struct subre *));
132 static VOID subset _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
133 static int dissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
134 static int condissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
135 static int altdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
136 static int cdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
137 static int ccondissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
138 static int crevdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
139 static int cbrdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
140 static int caltdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *));
141 /* === rege_dfa.c === */
142 static chr *longest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, int *));
143 static chr *shortest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *));
144 static chr *lastcold _ANSI_ARGS_((struct vars *, struct dfa *));
145 static struct dfa *newdfa _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *, struct smalldfa *));
146 static VOID freedfa _ANSI_ARGS_((struct dfa *));
147 static unsigned hash _ANSI_ARGS_((unsigned *, int));
148 static struct sset *initialize _ANSI_ARGS_((struct vars *, struct dfa *, chr *));
149 static struct sset *miss _ANSI_ARGS_((struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *));
150 static int lacon _ANSI_ARGS_((struct vars *, struct cnfa *, chr *, pcolor));
151 static struct sset *getvacant _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *));
152 static struct sset *pickss _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *));
153 /* automatically gathered by fwd; do not hand-edit */
154 /* =====^!^===== end forwards =====^!^===== */
155 
156 
157 
158 /*
159  - exec - match regular expression
160  ^ int exec(regex_t *, CONST chr *, size_t, rm_detail_t *,
161  ^					size_t, regmatch_t [], int);
162  */
163 int
exec(re,string,len,details,nmatch,pmatch,flags)164 exec(re, string, len, details, nmatch, pmatch, flags)
165 regex_t *re;
166 CONST chr *string;
167 size_t len;
168 rm_detail_t *details;
169 size_t nmatch;
170 regmatch_t pmatch[];
171 int flags;
172 {
173 	struct vars var;
174 	register struct vars *v = &var;
175 	int st;
176 	size_t n;
177 	int backref;
178 #	define	LOCALMAT	20
179 	regmatch_t mat[LOCALMAT];
180 #	define	LOCALMEM	40
181 	regoff_t mem[LOCALMEM];
182 
183 	/* sanity checks */
184 	if (re == NULL || string == NULL || re->re_magic != REMAGIC)
185 		return REG_INVARG;
186 	if (re->re_csize != sizeof(chr))
187 		return REG_MIXED;
188 
189 	/* setup */
190 	v->re = re;
191 	v->g = (struct guts *)re->re_guts;
192 	if ((v->g->cflags&REG_EXPECT) && details == NULL)
193 		return REG_INVARG;
194 	if (v->g->info&REG_UIMPOSSIBLE)
195 		return REG_NOMATCH;
196 	backref = (v->g->info&REG_UBACKREF) ? 1 : 0;
197 	v->eflags = flags;
198 	if (v->g->cflags&REG_NOSUB)
199 		nmatch = 0;		/* override client */
200 	v->nmatch = nmatch;
201 	if (backref) {
202 		/* need work area */
203 		if (v->g->nsub + 1 <= LOCALMAT)
204 			v->pmatch = mat;
205 		else
206 			v->pmatch = (regmatch_t *)MALLOC((v->g->nsub + 1) *
207 							sizeof(regmatch_t));
208 		if (v->pmatch == NULL)
209 			return REG_ESPACE;
210 		v->nmatch = v->g->nsub + 1;
211 	} else
212 		v->pmatch = pmatch;
213 	v->details = details;
214 	v->start = (chr *)string;
215 	v->stop = (chr *)string + len;
216 	v->err = 0;
217 	if (backref) {
218 		/* need retry memory */
219 		assert(v->g->ntree >= 0);
220 		n = (size_t)v->g->ntree;
221 		if (n <= LOCALMEM)
222 			v->mem = mem;
223 		else
224 			v->mem = (regoff_t *)MALLOC(n*sizeof(regoff_t));
225 		if (v->mem == NULL) {
226 			if (v->pmatch != pmatch && v->pmatch != mat)
227 				FREE(v->pmatch);
228 			return REG_ESPACE;
229 		}
230 	} else
231 		v->mem = NULL;
232 
233 	/* do it */
234 	assert(v->g->tree != NULL);
235 	if (backref)
236 		st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
237 	else
238 		st = find(v, &v->g->tree->cnfa, &v->g->cmap);
239 
240 	/* copy (portion of) match vector over if necessary */
241 	if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
242 		zapsubs(pmatch, nmatch);
243 		n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
244 		memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t));
245 	}
246 
247 	/* clean up */
248 	if (v->pmatch != pmatch && v->pmatch != mat)
249 		FREE(v->pmatch);
250 	if (v->mem != NULL && v->mem != mem)
251 		FREE(v->mem);
252 	return st;
253 }
254 
255 /*
256  - find - find a match for the main NFA (no-complications case)
257  ^ static int find(struct vars *, struct cnfa *, struct colormap *);
258  */
259 static int
find(v,cnfa,cm)260 find(v, cnfa, cm)
261 struct vars *v;
262 struct cnfa *cnfa;
263 struct colormap *cm;
264 {
265 	struct dfa *s;
266 	struct dfa *d;
267 	chr *begin;
268 	chr *end = NULL;
269 	chr *cold;
270 	chr *open;		/* open and close of range of possible starts */
271 	chr *close;
272 	int hitend;
273 	int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0;
274 
275 	/* first, a shot with the search RE */
276 	s = newdfa(v, &v->g->search, cm, &v->dfa1);
277 	assert(!(ISERR() && s != NULL));
278 	NOERR();
279 	MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
280 	cold = NULL;
281 	close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *)NULL);
282 	freedfa(s);
283 	NOERR();
284 	if (v->g->cflags&REG_EXPECT) {
285 		assert(v->details != NULL);
286 		if (cold != NULL)
287 			v->details->rm_extend.rm_so = OFF(cold);
288 		else
289 			v->details->rm_extend.rm_so = OFF(v->stop);
290 		v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
291 	}
292 	if (close == NULL)		/* not found */
293 		return REG_NOMATCH;
294 	if (v->nmatch == 0)		/* found, don't need exact location */
295 		return REG_OKAY;
296 
297 	/* find starting point and match */
298 	assert(cold != NULL);
299 	open = cold;
300 	cold = NULL;
301 	MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
302 	d = newdfa(v, cnfa, cm, &v->dfa1);
303 	assert(!(ISERR() && d != NULL));
304 	NOERR();
305 	for (begin = open; begin <= close; begin++) {
306 		MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
307 		if (shorter)
308 			end = shortest(v, d, begin, begin, v->stop,
309 							(chr **)NULL, &hitend);
310 		else
311 			end = longest(v, d, begin, v->stop, &hitend);
312 		NOERR();
313 		if (hitend && cold == NULL)
314 			cold = begin;
315 		if (end != NULL)
316 			break;		/* NOTE BREAK OUT */
317 	}
318 	assert(end != NULL);		/* search RE succeeded so loop should */
319 	freedfa(d);
320 
321 	/* and pin down details */
322 	assert(v->nmatch > 0);
323 	v->pmatch[0].rm_so = OFF(begin);
324 	v->pmatch[0].rm_eo = OFF(end);
325 	if (v->g->cflags&REG_EXPECT) {
326 		if (cold != NULL)
327 			v->details->rm_extend.rm_so = OFF(cold);
328 		else
329 			v->details->rm_extend.rm_so = OFF(v->stop);
330 		v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
331 	}
332 	if (v->nmatch == 1)		/* no need for submatches */
333 		return REG_OKAY;
334 
335 	/* submatches */
336 	zapsubs(v->pmatch, v->nmatch);
337 	return dissect(v, v->g->tree, begin, end);
338 }
339 
340 /*
341  - cfind - find a match for the main NFA (with complications)
342  ^ static int cfind(struct vars *, struct cnfa *, struct colormap *);
343  */
344 static int
cfind(v,cnfa,cm)345 cfind(v, cnfa, cm)
346 struct vars *v;
347 struct cnfa *cnfa;
348 struct colormap *cm;
349 {
350 	struct dfa *s;
351 	struct dfa *d;
352 	chr *cold;
353 	int ret;
354 
355 	s = newdfa(v, &v->g->search, cm, &v->dfa1);
356 	NOERR();
357 	d = newdfa(v, cnfa, cm, &v->dfa2);
358 	if (ISERR()) {
359 		assert(d == NULL);
360 		freedfa(s);
361 		return v->err;
362 	}
363 
364 	ret = cfindloop(v, cnfa, cm, d, s, &cold);
365 
366 	freedfa(d);
367 	freedfa(s);
368 	NOERR();
369 	if (v->g->cflags&REG_EXPECT) {
370 		assert(v->details != NULL);
371 		if (cold != NULL)
372 			v->details->rm_extend.rm_so = OFF(cold);
373 		else
374 			v->details->rm_extend.rm_so = OFF(v->stop);
375 		v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
376 	}
377 	return ret;
378 }
379 
380 /*
381  - cfindloop - the heart of cfind
382  ^ static int cfindloop(struct vars *, struct cnfa *, struct colormap *,
383  ^	struct dfa *, struct dfa *, chr **);
384  */
385 static int
cfindloop(v,cnfa,cm,d,s,coldp)386 cfindloop(v, cnfa, cm, d, s, coldp)
387 struct vars *v;
388 struct cnfa *cnfa;
389 struct colormap *cm;
390 struct dfa *d;
391 struct dfa *s;
392 chr **coldp;			/* where to put coldstart pointer */
393 {
394 	chr *begin;
395 	chr *end;
396 	chr *cold;
397 	chr *open;		/* open and close of range of possible starts */
398 	chr *close;
399 	chr *estart;
400 	chr *estop;
401 	int er;
402 	int shorter = v->g->tree->flags&SHORTER;
403 	int hitend;
404 
405 	assert(d != NULL && s != NULL);
406 	cold = NULL;
407 	close = v->start;
408 	do {
409 		MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
410 		close = shortest(v, s, close, close, v->stop, &cold, (int *)NULL);
411 		if (close == NULL)
412 			break;				/* NOTE BREAK */
413 		assert(cold != NULL);
414 		open = cold;
415 		cold = NULL;
416 		MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
417 		for (begin = open; begin <= close; begin++) {
418 			MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
419 			estart = begin;
420 			estop = v->stop;
421 			for (;;) {
422 				if (shorter)
423 					end = shortest(v, d, begin, estart,
424 						estop, (chr **)NULL, &hitend);
425 				else
426 					end = longest(v, d, begin, estop,
427 								&hitend);
428 				if (hitend && cold == NULL)
429 					cold = begin;
430 				if (end == NULL)
431 					break;		/* NOTE BREAK OUT */
432 				MDEBUG(("tentative end %ld\n", LOFF(end)));
433 				zapsubs(v->pmatch, v->nmatch);
434 				zapmem(v, v->g->tree);
435 				er = cdissect(v, v->g->tree, begin, end);
436 				if (er == REG_OKAY) {
437 					if (v->nmatch > 0) {
438 						v->pmatch[0].rm_so = OFF(begin);
439 						v->pmatch[0].rm_eo = OFF(end);
440 					}
441 					*coldp = cold;
442 					return REG_OKAY;
443 				}
444 				if (er != REG_NOMATCH) {
445 					ERR(er);
446 					return er;
447 				}
448 				if ((shorter) ? end == estop : end == begin) {
449 					/* no point in trying again */
450 					*coldp = cold;
451 					return REG_NOMATCH;
452 				}
453 				/* go around and try again */
454 				if (shorter)
455 					estart = end + 1;
456 				else
457 					estop = end - 1;
458 			}
459 		}
460 	} while (close < v->stop);
461 
462 	*coldp = cold;
463 	return REG_NOMATCH;
464 }
465 
466 /*
467  - zapsubs - initialize the subexpression matches to "no match"
468  ^ static VOID zapsubs(regmatch_t *, size_t);
469  */
470 static VOID
zapsubs(p,n)471 zapsubs(p, n)
472 regmatch_t *p;
473 size_t n;
474 {
475 	size_t i;
476 
477 	for (i = n-1; i > 0; i--) {
478 		p[i].rm_so = -1;
479 		p[i].rm_eo = -1;
480 	}
481 }
482 
483 /*
484  - zapmem - initialize the retry memory of a subtree to zeros
485  ^ static VOID zapmem(struct vars *, struct subre *);
486  */
487 static VOID
zapmem(v,t)488 zapmem(v, t)
489 struct vars *v;
490 struct subre *t;
491 {
492 	if (t == NULL)
493 		return;
494 
495 	assert(v->mem != NULL);
496 	v->mem[t->retry] = 0;
497 	if (t->op == '(') {
498 		assert(t->subno > 0);
499 		v->pmatch[t->subno].rm_so = -1;
500 		v->pmatch[t->subno].rm_eo = -1;
501 	}
502 
503 	if (t->left != NULL)
504 		zapmem(v, t->left);
505 	if (t->right != NULL)
506 		zapmem(v, t->right);
507 }
508 
509 /*
510  - subset - set any subexpression relevant to a successful subre
511  ^ static VOID subset(struct vars *, struct subre *, chr *, chr *);
512  */
513 static VOID
subset(v,sub,begin,end)514 subset(v, sub, begin, end)
515 struct vars *v;
516 struct subre *sub;
517 chr *begin;
518 chr *end;
519 {
520 	int n = sub->subno;
521 
522 	assert(n > 0);
523 	if ((size_t)n >= v->nmatch)
524 		return;
525 
526 	MDEBUG(("setting %d\n", n));
527 	v->pmatch[n].rm_so = OFF(begin);
528 	v->pmatch[n].rm_eo = OFF(end);
529 }
530 
531 /*
532  - dissect - determine subexpression matches (uncomplicated case)
533  ^ static int dissect(struct vars *, struct subre *, chr *, chr *);
534  */
535 static int			/* regexec return code */
dissect(v,t,begin,end)536 dissect(v, t, begin, end)
537 struct vars *v;
538 struct subre *t;
539 chr *begin;			/* beginning of relevant substring */
540 chr *end;			/* end of same */
541 {
542 	assert(t != NULL);
543 	MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end)));
544 
545 	switch (t->op) {
546 	case '=':		/* terminal node */
547 		assert(t->left == NULL && t->right == NULL);
548 		return REG_OKAY;	/* no action, parent did the work */
549 		break;
550 	case '|':		/* alternation */
551 		assert(t->left != NULL);
552 		return altdissect(v, t, begin, end);
553 		break;
554 	case 'b':		/* back ref -- shouldn't be calling us! */
555 		return REG_ASSERT;
556 		break;
557 	case '.':		/* concatenation */
558 		assert(t->left != NULL && t->right != NULL);
559 		return condissect(v, t, begin, end);
560 		break;
561 	case '(':		/* capturing */
562 		assert(t->left != NULL && t->right == NULL);
563 		assert(t->subno > 0);
564 		subset(v, t, begin, end);
565 		return dissect(v, t->left, begin, end);
566 		break;
567 	default:
568 		return REG_ASSERT;
569 		break;
570 	}
571 }
572 
573 /*
574  - condissect - determine concatenation subexpression matches (uncomplicated)
575  ^ static int condissect(struct vars *, struct subre *, chr *, chr *);
576  */
577 static int			/* regexec return code */
condissect(v,t,begin,end)578 condissect(v, t, begin, end)
579 struct vars *v;
580 struct subre *t;
581 chr *begin;			/* beginning of relevant substring */
582 chr *end;			/* end of same */
583 {
584 	struct dfa *d;
585 	struct dfa *d2;
586 	chr *mid;
587 	int i;
588 	int shorter = (t->left->flags&SHORTER) ? 1 : 0;
589 	chr *stop = (shorter) ? end : begin;
590 
591 	assert(t->op == '.');
592 	assert(t->left != NULL && t->left->cnfa.nstates > 0);
593 	assert(t->right != NULL && t->right->cnfa.nstates > 0);
594 
595 	d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
596 	NOERR();
597 	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2);
598 	if (ISERR()) {
599 		assert(d2 == NULL);
600 		freedfa(d);
601 		return v->err;
602 	}
603 
604 	/* pick a tentative midpoint */
605 	if (shorter)
606 		mid = shortest(v, d, begin, begin, end, (chr **)NULL,
607 								(int *)NULL);
608 	else
609 		mid = longest(v, d, begin, end, (int *)NULL);
610 	if (mid == NULL) {
611 		freedfa(d);
612 		freedfa(d2);
613 		return REG_ASSERT;
614 	}
615 	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
616 
617 	/* iterate until satisfaction or failure */
618 	while (longest(v, d2, mid, end, (int *)NULL) != end) {
619 		/* that midpoint didn't work, find a new one */
620 		if (mid == stop) {
621 			/* all possibilities exhausted! */
622 			MDEBUG(("no midpoint!\n"));
623 			freedfa(d);
624 			freedfa(d2);
625 			return REG_ASSERT;
626 		}
627 		if (shorter)
628 			mid = shortest(v, d, begin, mid+1, end, (chr **)NULL,
629 								(int *)NULL);
630 		else
631 			mid = longest(v, d, begin, mid-1, (int *)NULL);
632 		if (mid == NULL) {
633 			/* failed to find a new one! */
634 			MDEBUG(("failed midpoint!\n"));
635 			freedfa(d);
636 			freedfa(d2);
637 			return REG_ASSERT;
638 		}
639 		MDEBUG(("new midpoint %ld\n", LOFF(mid)));
640 	}
641 
642 	/* satisfaction */
643 	MDEBUG(("successful\n"));
644 	freedfa(d);
645 	freedfa(d2);
646 	i = dissect(v, t->left, begin, mid);
647 	if (i != REG_OKAY)
648 		return i;
649 	return dissect(v, t->right, mid, end);
650 }
651 
652 /*
653  - altdissect - determine alternative subexpression matches (uncomplicated)
654  ^ static int altdissect(struct vars *, struct subre *, chr *, chr *);
655  */
656 static int			/* regexec return code */
altdissect(v,t,begin,end)657 altdissect(v, t, begin, end)
658 struct vars *v;
659 struct subre *t;
660 chr *begin;			/* beginning of relevant substring */
661 chr *end;			/* end of same */
662 {
663 	struct dfa *d;
664 	int i;
665 
666 	assert(t != NULL);
667 	assert(t->op == '|');
668 
669 	for (i = 0; t != NULL; t = t->right, i++) {
670 		MDEBUG(("trying %dth\n", i));
671 		assert(t->left != NULL && t->left->cnfa.nstates > 0);
672 		d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
673 		if (ISERR())
674 			return v->err;
675 		if (longest(v, d, begin, end, (int *)NULL) == end) {
676 			MDEBUG(("success\n"));
677 			freedfa(d);
678 			return dissect(v, t->left, begin, end);
679 		}
680 		freedfa(d);
681 	}
682 	return REG_ASSERT;	/* none of them matched?!? */
683 }
684 
685 /*
686  - cdissect - determine subexpression matches (with complications)
687  * The retry memory stores the offset of the trial midpoint from begin,
688  * plus 1 so that 0 uniquely means "clean slate".
689  ^ static int cdissect(struct vars *, struct subre *, chr *, chr *);
690  */
691 static int			/* regexec return code */
cdissect(v,t,begin,end)692 cdissect(v, t, begin, end)
693 struct vars *v;
694 struct subre *t;
695 chr *begin;			/* beginning of relevant substring */
696 chr *end;			/* end of same */
697 {
698 	int er;
699 
700 	assert(t != NULL);
701 	MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
702 
703 	switch (t->op) {
704 	case '=':		/* terminal node */
705 		assert(t->left == NULL && t->right == NULL);
706 		return REG_OKAY;	/* no action, parent did the work */
707 		break;
708 	case '|':		/* alternation */
709 		assert(t->left != NULL);
710 		return caltdissect(v, t, begin, end);
711 		break;
712 	case 'b':		/* back ref -- shouldn't be calling us! */
713 		assert(t->left == NULL && t->right == NULL);
714 		return cbrdissect(v, t, begin, end);
715 		break;
716 	case '.':		/* concatenation */
717 		assert(t->left != NULL && t->right != NULL);
718 		return ccondissect(v, t, begin, end);
719 		break;
720 	case '(':		/* capturing */
721 		assert(t->left != NULL && t->right == NULL);
722 		assert(t->subno > 0);
723 		er = cdissect(v, t->left, begin, end);
724 		if (er == REG_OKAY)
725 			subset(v, t, begin, end);
726 		return er;
727 		break;
728 	default:
729 		return REG_ASSERT;
730 		break;
731 	}
732 }
733 
734 /*
735  - ccondissect - concatenation subexpression matches (with complications)
736  * The retry memory stores the offset of the trial midpoint from begin,
737  * plus 1 so that 0 uniquely means "clean slate".
738  ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *);
739  */
740 static int			/* regexec return code */
ccondissect(v,t,begin,end)741 ccondissect(v, t, begin, end)
742 struct vars *v;
743 struct subre *t;
744 chr *begin;			/* beginning of relevant substring */
745 chr *end;			/* end of same */
746 {
747 	struct dfa *d;
748 	struct dfa *d2;
749 	chr *mid;
750 	int er;
751 
752 	assert(t->op == '.');
753 	assert(t->left != NULL && t->left->cnfa.nstates > 0);
754 	assert(t->right != NULL && t->right->cnfa.nstates > 0);
755 
756 	if (t->left->flags&SHORTER)		/* reverse scan */
757 		return crevdissect(v, t, begin, end);
758 
759 	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
760 	if (ISERR())
761 		return v->err;
762 	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
763 	if (ISERR()) {
764 		freedfa(d);
765 		return v->err;
766 	}
767 	MDEBUG(("cconcat %d\n", t->retry));
768 
769 	/* pick a tentative midpoint */
770 	if (v->mem[t->retry] == 0) {
771 		mid = longest(v, d, begin, end, (int *)NULL);
772 		if (mid == NULL) {
773 			freedfa(d);
774 			freedfa(d2);
775 			return REG_NOMATCH;
776 		}
777 		MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
778 		v->mem[t->retry] = (mid - begin) + 1;
779 	} else {
780 		mid = begin + (v->mem[t->retry] - 1);
781 		MDEBUG(("working midpoint %ld\n", LOFF(mid)));
782 	}
783 
784 	/* iterate until satisfaction or failure */
785 	for (;;) {
786 		/* try this midpoint on for size */
787 		er = cdissect(v, t->left, begin, mid);
788 		if (er == REG_OKAY &&
789 				longest(v, d2, mid, end, (int *)NULL) == end &&
790 				(er = cdissect(v, t->right, mid, end)) ==
791 								REG_OKAY)
792 			break;			/* NOTE BREAK OUT */
793 		if (er != REG_OKAY && er != REG_NOMATCH) {
794 			freedfa(d);
795 			freedfa(d2);
796 			return er;
797 		}
798 
799 		/* that midpoint didn't work, find a new one */
800 		if (mid == begin) {
801 			/* all possibilities exhausted */
802 			MDEBUG(("%d no midpoint\n", t->retry));
803 			freedfa(d);
804 			freedfa(d2);
805 			return REG_NOMATCH;
806 		}
807 		mid = longest(v, d, begin, mid-1, (int *)NULL);
808 		if (mid == NULL) {
809 			/* failed to find a new one */
810 			MDEBUG(("%d failed midpoint\n", t->retry));
811 			freedfa(d);
812 			freedfa(d2);
813 			return REG_NOMATCH;
814 		}
815 		MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
816 		v->mem[t->retry] = (mid - begin) + 1;
817 		zapmem(v, t->left);
818 		zapmem(v, t->right);
819 	}
820 
821 	/* satisfaction */
822 	MDEBUG(("successful\n"));
823 	freedfa(d);
824 	freedfa(d2);
825 	return REG_OKAY;
826 }
827 
828 /*
829  - crevdissect - determine backref shortest-first subexpression matches
830  * The retry memory stores the offset of the trial midpoint from begin,
831  * plus 1 so that 0 uniquely means "clean slate".
832  ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *);
833  */
834 static int			/* regexec return code */
crevdissect(v,t,begin,end)835 crevdissect(v, t, begin, end)
836 struct vars *v;
837 struct subre *t;
838 chr *begin;			/* beginning of relevant substring */
839 chr *end;			/* end of same */
840 {
841 	struct dfa *d;
842 	struct dfa *d2;
843 	chr *mid;
844 	int er;
845 
846 	assert(t->op == '.');
847 	assert(t->left != NULL && t->left->cnfa.nstates > 0);
848 	assert(t->right != NULL && t->right->cnfa.nstates > 0);
849 	assert(t->left->flags&SHORTER);
850 
851 	/* concatenation -- need to split the substring between parts */
852 	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
853 	if (ISERR())
854 		return v->err;
855 	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
856 	if (ISERR()) {
857 		freedfa(d);
858 		return v->err;
859 	}
860 	MDEBUG(("crev %d\n", t->retry));
861 
862 	/* pick a tentative midpoint */
863 	if (v->mem[t->retry] == 0) {
864 		mid = shortest(v, d, begin, begin, end, (chr **)NULL, (int *)NULL);
865 		if (mid == NULL) {
866 			freedfa(d);
867 			freedfa(d2);
868 			return REG_NOMATCH;
869 		}
870 		MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
871 		v->mem[t->retry] = (mid - begin) + 1;
872 	} else {
873 		mid = begin + (v->mem[t->retry] - 1);
874 		MDEBUG(("working midpoint %ld\n", LOFF(mid)));
875 	}
876 
877 	/* iterate until satisfaction or failure */
878 	for (;;) {
879 		/* try this midpoint on for size */
880 		er = cdissect(v, t->left, begin, mid);
881 		if (er == REG_OKAY &&
882 				longest(v, d2, mid, end, (int *)NULL) == end &&
883 				(er = cdissect(v, t->right, mid, end)) ==
884 								REG_OKAY)
885 			break;			/* NOTE BREAK OUT */
886 		if (er != REG_OKAY && er != REG_NOMATCH) {
887 			freedfa(d);
888 			freedfa(d2);
889 			return er;
890 		}
891 
892 		/* that midpoint didn't work, find a new one */
893 		if (mid == end) {
894 			/* all possibilities exhausted */
895 			MDEBUG(("%d no midpoint\n", t->retry));
896 			freedfa(d);
897 			freedfa(d2);
898 			return REG_NOMATCH;
899 		}
900 		mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, (int *)NULL);
901 		if (mid == NULL) {
902 			/* failed to find a new one */
903 			MDEBUG(("%d failed midpoint\n", t->retry));
904 			freedfa(d);
905 			freedfa(d2);
906 			return REG_NOMATCH;
907 		}
908 		MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
909 		v->mem[t->retry] = (mid - begin) + 1;
910 		zapmem(v, t->left);
911 		zapmem(v, t->right);
912 	}
913 
914 	/* satisfaction */
915 	MDEBUG(("successful\n"));
916 	freedfa(d);
917 	freedfa(d2);
918 	return REG_OKAY;
919 }
920 
921 /*
922  - cbrdissect - determine backref subexpression matches
923  ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
924  */
925 static int			/* regexec return code */
cbrdissect(v,t,begin,end)926 cbrdissect(v, t, begin, end)
927 struct vars *v;
928 struct subre *t;
929 chr *begin;			/* beginning of relevant substring */
930 chr *end;			/* end of same */
931 {
932 	int i;
933 	int n = t->subno;
934 	size_t len;
935 	chr *paren;
936 	chr *p;
937 	chr *stop;
938 	int min = t->min;
939 	int max = t->max;
940 
941 	assert(t != NULL);
942 	assert(t->op == 'b');
943 	assert(n >= 0);
944 	assert((size_t)n < v->nmatch);
945 
946 	MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
947 
948 	if (v->pmatch[n].rm_so == -1)
949 		return REG_NOMATCH;
950 	paren = v->start + v->pmatch[n].rm_so;
951 	len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
952 
953 	/* no room to maneuver -- retries are pointless */
954 	if (v->mem[t->retry])
955 		return REG_NOMATCH;
956 	v->mem[t->retry] = 1;
957 
958 	/* special-case zero-length string */
959 	if (len == 0) {
960 		if (begin == end)
961 			return REG_OKAY;
962 		return REG_NOMATCH;
963 	}
964 
965 	/* and too-short string */
966 	assert(end >= begin);
967 	if ((size_t)(end - begin) < len)
968 		return REG_NOMATCH;
969 	stop = end - len;
970 
971 	/* count occurrences */
972 	i = 0;
973 	for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) {
974 		if ((*v->g->compare)(paren, p, len) != 0)
975 				break;
976 		i++;
977 	}
978 	MDEBUG(("cbackref found %d\n", i));
979 
980 	/* and sort it out */
981 	if (p != end)			/* didn't consume all of it */
982 		return REG_NOMATCH;
983 	if (min <= i && (i <= max || max == INFINITY))
984 		return REG_OKAY;
985 	return REG_NOMATCH;		/* out of range */
986 }
987 
988 /*
989  - caltdissect - determine alternative subexpression matches (w. complications)
990  ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *);
991  */
992 static int			/* regexec return code */
caltdissect(v,t,begin,end)993 caltdissect(v, t, begin, end)
994 struct vars *v;
995 struct subre *t;
996 chr *begin;			/* beginning of relevant substring */
997 chr *end;			/* end of same */
998 {
999 	struct dfa *d;
1000 	int er;
1001 #	define	UNTRIED	0	/* not yet tried at all */
1002 #	define	TRYING	1	/* top matched, trying submatches */
1003 #	define	TRIED	2	/* top didn't match or submatches exhausted */
1004 
1005 	if (t == NULL)
1006 		return REG_NOMATCH;
1007 	assert(t->op == '|');
1008 	if (v->mem[t->retry] == TRIED)
1009 		return caltdissect(v, t->right, begin, end);
1010 
1011 	MDEBUG(("calt n%d\n", t->retry));
1012 	assert(t->left != NULL);
1013 
1014 	if (v->mem[t->retry] == UNTRIED) {
1015 		d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
1016 		if (ISERR())
1017 			return v->err;
1018 		if (longest(v, d, begin, end, (int *)NULL) != end) {
1019 			freedfa(d);
1020 			v->mem[t->retry] = TRIED;
1021 			return caltdissect(v, t->right, begin, end);
1022 		}
1023 		freedfa(d);
1024 		MDEBUG(("calt matched\n"));
1025 		v->mem[t->retry] = TRYING;
1026 	}
1027 
1028 	er = cdissect(v, t->left, begin, end);
1029 	if (er != REG_NOMATCH)
1030 		return er;
1031 
1032 	v->mem[t->retry] = TRIED;
1033 	return caltdissect(v, t->right, begin, end);
1034 }
1035 
1036 
1037 
1038 #include "rege_dfa.c"
1039