1 /*
2  * re_*exec and friends - match REs
3  *
4  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
5  *
6  * Development of this software was funded, in part, by Cray Research Inc.,
7  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8  * Corporation, none of whom are responsible for the results.  The author
9  * thanks all of them.
10  *
11  * Redistribution and use in source and binary forms -- with or without
12  * modification -- are permitted for any purpose, provided that
13  * redistributions in source form retain this entire copyright notice and
14  * indicate the origin and nature of any modifications.
15  *
16  * I'd appreciate being given credit for this package in the documentation of
17  * software which uses it, but that is not a requirement.
18  *
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
22  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include "regguts.h"
32 
33 /*
34  * Lazy-DFA representation.
35  */
36 
37 struct arcp {			/* "pointer" to an outarc */
38     struct sset *ss;
39     color co;
40 };
41 
42 struct sset {			/* state set */
43     unsigned *states;		/* pointer to bitvector */
44     unsigned hash;		/* hash of bitvector */
45 #define	HASH(bv, nw)	(((nw) == 1) ? *(bv) : hash(bv, nw))
46 #define	HIT(h,bv,ss,nw)	((ss)->hash == (h) && ((nw) == 1 || \
47 	memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
48     int flags;
49 #define	STARTER		01	/* the initial state set */
50 #define	POSTSTATE	02	/* includes the goal state */
51 #define	LOCKED		04	/* locked in cache */
52 #define	NOPROGRESS	010	/* zero-progress state set */
53     struct arcp ins;		/* chain of inarcs pointing here */
54     chr *lastseen;		/* last entered on arrival here */
55     struct sset **outs;		/* outarc vector indexed by color */
56     struct arcp *inchain;	/* chain-pointer vector for outarcs */
57 };
58 
59 struct dfa {
60     int nssets;			/* size of cache */
61     int nssused;		/* how many entries occupied yet */
62     int nstates;		/* number of states */
63     int ncolors;		/* length of outarc and inchain vectors */
64     int wordsper;		/* length of state-set bitvectors */
65     struct sset *ssets;		/* state-set cache */
66     unsigned *statesarea;	/* bitvector storage */
67     unsigned *work;		/* pointer to work area within statesarea */
68     struct sset **outsarea;	/* outarc-vector storage */
69     struct arcp *incarea;	/* inchain storage */
70     struct cnfa *cnfa;
71     struct colormap *cm;
72     chr *lastpost;		/* location of last cache-flushed success */
73     chr *lastnopr;		/* location of last cache-flushed NOPROGRESS */
74     struct sset *search;	/* replacement-search-pointer memory */
75     int cptsmalloced;		/* were the areas individually malloced? */
76     char *mallocarea;		/* self, or master malloced area, or NULL */
77 };
78 
79 #define	WORK	1		/* number of work bitvectors needed */
80 
81 /*
82  * Setup for non-malloc allocation for small cases.
83  */
84 
85 #define	FEWSTATES	20	/* must be less than UBITS */
86 #define	FEWCOLORS	15
87 struct smalldfa {
88     struct dfa dfa;
89     struct sset ssets[FEWSTATES*2];
90     unsigned statesarea[FEWSTATES*2 + WORK];
91     struct sset *outsarea[FEWSTATES*2 * FEWCOLORS];
92     struct arcp incarea[FEWSTATES*2 * FEWCOLORS];
93 };
94 #define	DOMALLOC	((struct smalldfa *)NULL)	/* force malloc */
95 
96 /*
97  * Internal variables, bundled for easy passing around.
98  */
99 
100 struct vars {
101     regex_t *re;
102     struct guts *g;
103     int eflags;			/* copies of arguments */
104     size_t nmatch;
105     regmatch_t *pmatch;
106     rm_detail_t *details;
107     chr *start;			/* start of string */
108     chr *stop;			/* just past end of string */
109     int err;			/* error code if any (0 none) */
110     regoff_t *mem;		/* memory vector for backtracking */
111     struct smalldfa dfa1;
112     struct smalldfa dfa2;
113 };
114 #define	VISERR(vv) ((vv)->err != 0)	/* have we seen an error yet? */
115 #define	ISERR()	VISERR(v)
116 #define	VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e)))
117 #define	ERR(e)	VERR(v, e)	/* record an error */
118 #define	NOERR()	{if (ISERR()) return v->err;}	/* if error seen, return it */
119 #define	OFF(p)	((p) - v->start)
120 #define	LOFF(p)	((long)OFF(p))
121 
122 /*
123  * forward declarations
124  */
125 /* =====^!^===== begin forwards =====^!^===== */
126 /* automatically gathered by fwd; do not hand-edit */
127 /* === regexec.c === */
128 int exec(regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int);
129 static int find(struct vars *, struct cnfa *, struct colormap *);
130 static int cfind(struct vars *, struct cnfa *, struct colormap *);
131 static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
132 static VOID zapsubs(regmatch_t *, size_t);
133 static VOID zapmem(struct vars *, struct subre *);
134 static VOID subset(struct vars *, struct subre *, chr *, chr *);
135 static int dissect(struct vars *, struct subre *, chr *, chr *);
136 static int condissect(struct vars *, struct subre *, chr *, chr *);
137 static int altdissect(struct vars *, struct subre *, chr *, chr *);
138 static int cdissect(struct vars *, struct subre *, chr *, chr *);
139 static int ccondissect(struct vars *, struct subre *, chr *, chr *);
140 static int crevdissect(struct vars *, struct subre *, chr *, chr *);
141 static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
142 static int caltdissect(struct vars *, struct subre *, chr *, chr *);
143 /* === rege_dfa.c === */
144 static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
145 static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
146 static chr *lastcold(struct vars *, struct dfa *);
147 static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
148 static VOID freedfa(struct dfa *);
149 static unsigned hash(unsigned *, int);
150 static struct sset *initialize(struct vars *, struct dfa *, chr *);
151 static struct sset *miss(struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *);
152 static int lacon(struct vars *, struct cnfa *, chr *, pcolor);
153 static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
154 static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
155 /* automatically gathered by fwd; do not hand-edit */
156 /* =====^!^===== end forwards =====^!^===== */
157 
158 /*
159  - exec - match regular expression
160  ^ int exec(regex_t *, CONST chr *, size_t, rm_detail_t *,
161  ^					size_t, regmatch_t [], int);
162  */
163 int
exec(regex_t * re,CONST chr * string,size_t len,rm_detail_t * details,size_t nmatch,regmatch_t pmatch[],int flags)164 exec(
165     regex_t *re,
166     CONST chr *string,
167     size_t len,
168     rm_detail_t *details,
169     size_t nmatch,
170     regmatch_t pmatch[],
171     int flags)
172 {
173     AllocVars(v);
174     int st;
175     size_t n;
176     int backref;
177 #define	LOCALMAT	20
178     regmatch_t mat[LOCALMAT];
179 #define	LOCALMEM	40
180     regoff_t mem[LOCALMEM];
181 
182     /*
183      * Sanity checks.
184      */
185 
186     if (re == NULL || string == NULL || re->re_magic != REMAGIC) {
187 	FreeVars(v);
188 	return REG_INVARG;
189     }
190     if (re->re_csize != sizeof(chr)) {
191 	FreeVars(v);
192 	return REG_MIXED;
193     }
194 
195     /*
196      * Setup.
197      */
198 
199     v->re = re;
200     v->g = (struct guts *)re->re_guts;
201     if ((v->g->cflags&REG_EXPECT) && details == NULL) {
202 	FreeVars(v);
203 	return REG_INVARG;
204     }
205     if (v->g->info&REG_UIMPOSSIBLE) {
206 	FreeVars(v);
207 	return REG_NOMATCH;
208     }
209     backref = (v->g->info&REG_UBACKREF) ? 1 : 0;
210     v->eflags = flags;
211     if (v->g->cflags&REG_NOSUB) {
212 	nmatch = 0;		/* override client */
213     }
214     v->nmatch = nmatch;
215     if (backref) {
216 	/*
217 	 * Need work area.
218 	 */
219 
220 	if (v->g->nsub + 1 <= LOCALMAT) {
221 	    v->pmatch = mat;
222 	} else {
223 	    v->pmatch = (regmatch_t *)
224 		    MALLOC((v->g->nsub + 1) * sizeof(regmatch_t));
225 	}
226 	if (v->pmatch == NULL) {
227 	    FreeVars(v);
228 	    return REG_ESPACE;
229 	}
230 	v->nmatch = v->g->nsub + 1;
231     } else {
232 	v->pmatch = pmatch;
233     }
234     v->details = details;
235     v->start = (chr *)string;
236     v->stop = (chr *)string + len;
237     v->err = 0;
238     if (backref) {
239 	/*
240 	 * Need retry memory.
241 	 */
242 
243 	assert(v->g->ntree >= 0);
244 	n = (size_t)v->g->ntree;
245 	if (n <= LOCALMEM) {
246 	    v->mem = mem;
247 	} else {
248 	    v->mem = (regoff_t *) MALLOC(n*sizeof(regoff_t));
249 	}
250 	if (v->mem == NULL) {
251 	    if (v->pmatch != pmatch && v->pmatch != mat) {
252 		FREE(v->pmatch);
253 	    }
254 	    FreeVars(v);
255 	    return REG_ESPACE;
256 	}
257     } else {
258 	v->mem = NULL;
259     }
260 
261     /*
262      * Do it.
263      */
264 
265     assert(v->g->tree != NULL);
266     if (backref) {
267 	st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
268     } else {
269 	st = find(v, &v->g->tree->cnfa, &v->g->cmap);
270     }
271 
272     /*
273      * Copy (portion of) match vector over if necessary.
274      */
275 
276     if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
277 	zapsubs(pmatch, nmatch);
278 	n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
279 	memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t));
280     }
281 
282     /*
283      * Clean up.
284      */
285 
286     if (v->pmatch != pmatch && v->pmatch != mat) {
287 	FREE(v->pmatch);
288     }
289     if (v->mem != NULL && v->mem != mem) {
290 	FREE(v->mem);
291     }
292     FreeVars(v);
293     return st;
294 }
295 
296 /*
297  - find - find a match for the main NFA (no-complications case)
298  ^ static int find(struct vars *, struct cnfa *, struct colormap *);
299  */
300 static int
find(struct vars * v,struct cnfa * cnfa,struct colormap * cm)301 find(
302     struct vars *v,
303     struct cnfa *cnfa,
304     struct colormap *cm)
305 {
306     struct dfa *s;
307     struct dfa *d;
308     chr *begin;
309     chr *end = NULL;
310     chr *cold;
311     chr *open;			/* Open and close of range of possible
312 				 * starts */
313     chr *close;
314     int hitend;
315     int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0;
316 
317     /*
318      * First, a shot with the search RE.
319      */
320 
321     s = newdfa(v, &v->g->search, cm, &v->dfa1);
322     assert(!(ISERR() && s != NULL));
323     NOERR();
324     MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
325     cold = NULL;
326     close = shortest(v, s, v->start, v->start, v->stop, &cold, NULL);
327     freedfa(s);
328     NOERR();
329     if (v->g->cflags&REG_EXPECT) {
330 	assert(v->details != NULL);
331 	if (cold != NULL) {
332 	    v->details->rm_extend.rm_so = OFF(cold);
333 	} else {
334 	    v->details->rm_extend.rm_so = OFF(v->stop);
335 	}
336 	v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
337     }
338     if (close == NULL) {	/* not found */
339 	return REG_NOMATCH;
340     }
341     if (v->nmatch == 0) {	/* found, don't need exact location */
342 	return REG_OKAY;
343     }
344 
345     /*
346      * Find starting point and match.
347      */
348 
349     assert(cold != NULL);
350     open = cold;
351     cold = NULL;
352     MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
353     d = newdfa(v, cnfa, cm, &v->dfa1);
354     assert(!(ISERR() && d != NULL));
355     NOERR();
356     for (begin = open; begin <= close; begin++) {
357 	MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
358 	if (shorter) {
359 	    end = shortest(v, d, begin, begin, v->stop, NULL, &hitend);
360 	} else {
361 	    end = longest(v, d, begin, v->stop, &hitend);
362 	}
363 	NOERR();
364 	if (hitend && cold == NULL) {
365 	    cold = begin;
366 	}
367 	if (end != NULL) {
368 	    break;		/* NOTE BREAK OUT */
369 	}
370     }
371     assert(end != NULL);	/* search RE succeeded so loop should */
372     freedfa(d);
373 
374     /*
375      * And pin down details.
376      */
377 
378     assert(v->nmatch > 0);
379     v->pmatch[0].rm_so = OFF(begin);
380     v->pmatch[0].rm_eo = OFF(end);
381     if (v->g->cflags&REG_EXPECT) {
382 	if (cold != NULL) {
383 	    v->details->rm_extend.rm_so = OFF(cold);
384 	} else {
385 	    v->details->rm_extend.rm_so = OFF(v->stop);
386 	}
387 	v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
388     }
389     if (v->nmatch == 1) {	/* no need for submatches */
390 	return REG_OKAY;
391     }
392 
393     /*
394      * Submatches.
395      */
396 
397     zapsubs(v->pmatch, v->nmatch);
398     return dissect(v, v->g->tree, begin, end);
399 }
400 
401 /*
402  - cfind - find a match for the main NFA (with complications)
403  ^ static int cfind(struct vars *, struct cnfa *, struct colormap *);
404  */
405 static int
cfind(struct vars * v,struct cnfa * cnfa,struct colormap * cm)406 cfind(
407     struct vars *v,
408     struct cnfa *cnfa,
409     struct colormap *cm)
410 {
411     struct dfa *s;
412     struct dfa *d;
413     chr *cold = NULL; /* silence gcc 4 warning */
414     int ret;
415 
416     s = newdfa(v, &v->g->search, cm, &v->dfa1);
417     NOERR();
418     d = newdfa(v, cnfa, cm, &v->dfa2);
419     if (ISERR()) {
420 	assert(d == NULL);
421 	freedfa(s);
422 	return v->err;
423     }
424 
425     ret = cfindloop(v, cnfa, cm, d, s, &cold);
426 
427     freedfa(d);
428     freedfa(s);
429     NOERR();
430     if (v->g->cflags&REG_EXPECT) {
431 	assert(v->details != NULL);
432 	if (cold != NULL) {
433 	    v->details->rm_extend.rm_so = OFF(cold);
434 	} else {
435 	    v->details->rm_extend.rm_so = OFF(v->stop);
436 	}
437 	v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
438     }
439     return ret;
440 }
441 
442 /*
443  - cfindloop - the heart of cfind
444  ^ static int cfindloop(struct vars *, struct cnfa *, struct colormap *,
445  ^	struct dfa *, struct dfa *, chr **);
446  */
447 static int
cfindloop(struct vars * v,struct cnfa * cnfa,struct colormap * cm,struct dfa * d,struct dfa * s,chr ** coldp)448 cfindloop(
449     struct vars *v,
450     struct cnfa *cnfa,
451     struct colormap *cm,
452     struct dfa *d,
453     struct dfa *s,
454     chr **coldp)		/* where to put coldstart pointer */
455 {
456     chr *begin;
457     chr *end;
458     chr *cold;
459     chr *open;			/* Open and close of range of possible
460 				 * starts */
461     chr *close;
462     chr *estart;
463     chr *estop;
464     int er;
465     int shorter = v->g->tree->flags&SHORTER;
466     int hitend;
467 
468     assert(d != NULL && s != NULL);
469     cold = NULL;
470     close = v->start;
471     do {
472 	MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
473 	close = shortest(v, s, close, close, v->stop, &cold, NULL);
474 	if (close == NULL) {
475 	    break;		/* NOTE BREAK */
476 	}
477 	assert(cold != NULL);
478 	open = cold;
479 	cold = NULL;
480 	MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
481 	for (begin = open; begin <= close; begin++) {
482 	    MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
483 	    estart = begin;
484 	    estop = v->stop;
485 	    for (;;) {
486 		if (shorter) {
487 		    end = shortest(v, d, begin, estart, estop, NULL, &hitend);
488 		} else {
489 		    end = longest(v, d, begin, estop, &hitend);
490 		}
491 		if (hitend && cold == NULL) {
492 		    cold = begin;
493 		}
494 		if (end == NULL) {
495 		    break;	/* NOTE BREAK OUT */
496 		}
497 
498 		MDEBUG(("tentative end %ld\n", LOFF(end)));
499 		zapsubs(v->pmatch, v->nmatch);
500 		zapmem(v, v->g->tree);
501 		er = cdissect(v, v->g->tree, begin, end);
502 		if (er == REG_OKAY) {
503 		    if (v->nmatch > 0) {
504 			v->pmatch[0].rm_so = OFF(begin);
505 			v->pmatch[0].rm_eo = OFF(end);
506 		    }
507 		    *coldp = cold;
508 		    return REG_OKAY;
509 		}
510 		if (er != REG_NOMATCH) {
511 		    ERR(er);
512 		    return er;
513 		}
514 		if ((shorter) ? end == estop : end == begin) {
515 		    break;
516 		}
517 
518 		/*
519 		 * Go around and try again
520 		 */
521 
522 		if (shorter) {
523 		    estart = end + 1;
524 		} else {
525 		    estop = end - 1;
526 		}
527 	    }
528 	}
529     } while (close < v->stop);
530 
531     *coldp = cold;
532     return REG_NOMATCH;
533 }
534 
535 /*
536  - zapsubs - initialize the subexpression matches to "no match"
537  ^ static VOID zapsubs(regmatch_t *, size_t);
538  */
539 static void
zapsubs(regmatch_t * p,size_t n)540 zapsubs(
541     regmatch_t *p,
542     size_t n)
543 {
544     size_t i;
545 
546     for (i = n-1; i > 0; i--) {
547 	p[i].rm_so = -1;
548 	p[i].rm_eo = -1;
549     }
550 }
551 
552 /*
553  - zapmem - initialize the retry memory of a subtree to zeros
554  ^ static VOID zapmem(struct vars *, struct subre *);
555  */
556 static void
zapmem(struct vars * v,struct subre * t)557 zapmem(
558     struct vars *v,
559     struct subre *t)
560 {
561     if (t == NULL) {
562 	return;
563     }
564 
565     assert(v->mem != NULL);
566     v->mem[t->retry] = 0;
567     if (t->op == '(') {
568 	assert(t->subno > 0);
569 	v->pmatch[t->subno].rm_so = -1;
570 		v->pmatch[t->subno].rm_eo = -1;
571     }
572 
573     if (t->left != NULL) {
574 	zapmem(v, t->left);
575     }
576     if (t->right != NULL) {
577 	zapmem(v, t->right);
578     }
579 }
580 
581 /*
582  - subset - set any subexpression relevant to a successful subre
583  ^ static VOID subset(struct vars *, struct subre *, chr *, chr *);
584  */
585 static void
subset(struct vars * v,struct subre * sub,chr * begin,chr * end)586 subset(
587     struct vars *v,
588     struct subre *sub,
589     chr *begin,
590     chr *end)
591 {
592     int n = sub->subno;
593 
594     assert(n > 0);
595     if ((size_t)n >= v->nmatch) {
596 	return;
597     }
598 
599     MDEBUG(("setting %d\n", n));
600     v->pmatch[n].rm_so = OFF(begin);
601     v->pmatch[n].rm_eo = OFF(end);
602 }
603 
604 /*
605  - dissect - determine subexpression matches (uncomplicated case)
606  ^ static int dissect(struct vars *, struct subre *, chr *, chr *);
607  */
608 static int			/* regexec return code */
dissect(struct vars * v,struct subre * t,chr * begin,chr * end)609 dissect(
610     struct vars *v,
611     struct subre *t,
612     chr *begin,			/* beginning of relevant substring */
613     chr *end)			/* end of same */
614 {
615     assert(t != NULL);
616     MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end)));
617 
618     switch (t->op) {
619     case '=':			/* terminal node */
620 	assert(t->left == NULL && t->right == NULL);
621 	return REG_OKAY;	/* no action, parent did the work */
622     case '|':			/* alternation */
623 	assert(t->left != NULL);
624 	return altdissect(v, t, begin, end);
625     case 'b':			/* back ref -- shouldn't be calling us! */
626 	return REG_ASSERT;
627     case '.':			/* concatenation */
628 	assert(t->left != NULL && t->right != NULL);
629 	return condissect(v, t, begin, end);
630     case '(':			/* capturing */
631 	assert(t->left != NULL && t->right == NULL);
632 	assert(t->subno > 0);
633 	subset(v, t, begin, end);
634 	return dissect(v, t->left, begin, end);
635     default:
636 	return REG_ASSERT;
637     }
638 }
639 
640 /*
641  - condissect - determine concatenation subexpression matches (uncomplicated)
642  ^ static int condissect(struct vars *, struct subre *, chr *, chr *);
643  */
644 static int			/* regexec return code */
condissect(struct vars * v,struct subre * t,chr * begin,chr * end)645 condissect(
646     struct vars *v,
647     struct subre *t,
648     chr *begin,			/* beginning of relevant substring */
649     chr *end)			/* end of same */
650 {
651     struct dfa *d;
652     struct dfa *d2;
653     chr *mid;
654     int i;
655     int shorter = (t->left->flags&SHORTER) ? 1 : 0;
656     chr *stop = (shorter) ? end : begin;
657 
658     assert(t->op == '.');
659     assert(t->left != NULL && t->left->cnfa.nstates > 0);
660     assert(t->right != NULL && t->right->cnfa.nstates > 0);
661 
662     d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
663     NOERR();
664     d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2);
665     if (ISERR()) {
666 	assert(d2 == NULL);
667 	freedfa(d);
668 	return v->err;
669     }
670 
671     /*
672      * Pick a tentative midpoint.
673      */
674 
675     if (shorter) {
676 	mid = shortest(v, d, begin, begin, end, NULL, NULL);
677     } else {
678 	mid = longest(v, d, begin, end, NULL);
679     }
680     if (mid == NULL) {
681 	freedfa(d);
682 	freedfa(d2);
683 	return REG_ASSERT;
684     }
685     MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
686 
687     /*
688      * Iterate until satisfaction or failure.
689      */
690 
691     while (longest(v, d2, mid, end, NULL) != end) {
692 	/*
693 	 * That midpoint didn't work, find a new one.
694 	 */
695 
696 	if (mid == stop) {
697 	    /*
698 	     * All possibilities exhausted!
699 	     */
700 
701 	    MDEBUG(("no midpoint!\n"));
702 	    freedfa(d);
703 	    freedfa(d2);
704 	    return REG_ASSERT;
705 	}
706 	if (shorter) {
707 	    mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
708 	} else {
709 	    mid = longest(v, d, begin, mid-1, NULL);
710 	}
711 	if (mid == NULL) {
712 	    /*
713 	     * Failed to find a new one!
714 	     */
715 
716 	    MDEBUG(("failed midpoint!\n"));
717 	    freedfa(d);
718 	    freedfa(d2);
719 	    return REG_ASSERT;
720 	}
721 	MDEBUG(("new midpoint %ld\n", LOFF(mid)));
722     }
723 
724     /*
725      * Satisfaction.
726      */
727 
728     MDEBUG(("successful\n"));
729     freedfa(d);
730     freedfa(d2);
731     i = dissect(v, t->left, begin, mid);
732     if (i != REG_OKAY) {
733 	return i;
734     }
735     return dissect(v, t->right, mid, end);
736 }
737 
738 /*
739  - altdissect - determine alternative subexpression matches (uncomplicated)
740  ^ static int altdissect(struct vars *, struct subre *, chr *, chr *);
741  */
742 static int			/* regexec return code */
altdissect(struct vars * v,struct subre * t,chr * begin,chr * end)743 altdissect(
744     struct vars *v,
745     struct subre *t,
746     chr *begin,			/* beginning of relevant substring */
747     chr *end)			/* end of same */
748 {
749     struct dfa *d;
750     int i;
751 
752     assert(t != NULL);
753     assert(t->op == '|');
754 
755     for (i = 0; t != NULL; t = t->right, i++) {
756 	MDEBUG(("trying %dth\n", i));
757 	assert(t->left != NULL && t->left->cnfa.nstates > 0);
758 	d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
759 	if (ISERR()) {
760 	    return v->err;
761 	}
762 	if (longest(v, d, begin, end, NULL) == end) {
763 	    MDEBUG(("success\n"));
764 	    freedfa(d);
765 	    return dissect(v, t->left, begin, end);
766 	}
767 	freedfa(d);
768     }
769     return REG_ASSERT;		/* none of them matched?!? */
770 }
771 
772 /*
773  - cdissect - determine subexpression matches (with complications)
774  * The retry memory stores the offset of the trial midpoint from begin, plus 1
775  * so that 0 uniquely means "clean slate".
776  ^ static int cdissect(struct vars *, struct subre *, chr *, chr *);
777  */
778 static int			/* regexec return code */
cdissect(struct vars * v,struct subre * t,chr * begin,chr * end)779 cdissect(
780     struct vars *v,
781     struct subre *t,
782     chr *begin,			/* beginning of relevant substring */
783     chr *end)			/* end of same */
784 {
785     int er;
786 
787     assert(t != NULL);
788     MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
789 
790     switch (t->op) {
791     case '=':			/* terminal node */
792 	assert(t->left == NULL && t->right == NULL);
793 	return REG_OKAY;	/* no action, parent did the work */
794     case '|':			/* alternation */
795 	assert(t->left != NULL);
796 	return caltdissect(v, t, begin, end);
797     case 'b':			/* back ref -- shouldn't be calling us! */
798 	assert(t->left == NULL && t->right == NULL);
799 	return cbrdissect(v, t, begin, end);
800     case '.':			/* concatenation */
801 	assert(t->left != NULL && t->right != NULL);
802 	return ccondissect(v, t, begin, end);
803     case '(':			/* capturing */
804 	assert(t->left != NULL && t->right == NULL);
805 	assert(t->subno > 0);
806 	er = cdissect(v, t->left, begin, end);
807 	if (er == REG_OKAY) {
808 	    subset(v, t, begin, end);
809 	}
810 	return er;
811     default:
812 	return REG_ASSERT;
813     }
814 }
815 
816 /*
817  - ccondissect - concatenation subexpression matches (with complications)
818  * The retry memory stores the offset of the trial midpoint from begin, plus 1
819  * so that 0 uniquely means "clean slate".
820  ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *);
821  */
822 static int			/* regexec return code */
ccondissect(struct vars * v,struct subre * t,chr * begin,chr * end)823 ccondissect(
824     struct vars *v,
825     struct subre *t,
826     chr *begin,			/* beginning of relevant substring */
827     chr *end)			/* end of same */
828 {
829     struct dfa *d, *d2;
830     chr *mid;
831     int er;
832 
833     assert(t->op == '.');
834     assert(t->left != NULL && t->left->cnfa.nstates > 0);
835     assert(t->right != NULL && t->right->cnfa.nstates > 0);
836 
837     if (t->left->flags&SHORTER) { /* reverse scan */
838 	return crevdissect(v, t, begin, end);
839     }
840 
841     d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
842     if (ISERR()) {
843 	return v->err;
844     }
845     d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
846     if (ISERR()) {
847 	freedfa(d);
848 	return v->err;
849     }
850     MDEBUG(("cconcat %d\n", t->retry));
851 
852     /*
853      * Pick a tentative midpoint.
854      */
855 
856     if (v->mem[t->retry] == 0) {
857 	mid = longest(v, d, begin, end, NULL);
858 	if (mid == NULL) {
859 	    freedfa(d);
860 	    freedfa(d2);
861 	    return REG_NOMATCH;
862 	}
863 	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
864 	v->mem[t->retry] = (mid - begin) + 1;
865     } else {
866 	mid = begin + (v->mem[t->retry] - 1);
867 	MDEBUG(("working midpoint %ld\n", LOFF(mid)));
868     }
869 
870     /*
871      * Iterate until satisfaction or failure.
872      */
873 
874     for (;;) {
875 	/*
876 	 * Try this midpoint on for size.
877 	 */
878 
879 	if (longest(v, d2, mid, end, NULL) == end) {
880 	    er = cdissect(v, t->left, begin, mid);
881 	    if (er == REG_OKAY) {
882 		er = cdissect(v, t->right, mid, end);
883 		if (er == REG_OKAY) {
884 		    /*
885 		     * Satisfaction.
886 		     */
887 
888 		    MDEBUG(("successful\n"));
889 		    freedfa(d);
890 		    freedfa(d2);
891 		    return REG_OKAY;
892 		}
893 	    }
894 	    if ((er != REG_OKAY) && (er != REG_NOMATCH)) {
895 		freedfa(d);
896 		freedfa(d2);
897 		return er;
898 	    }
899 	}
900 
901 	/*
902 	 * That midpoint didn't work, find a new one.
903 	 */
904 
905 	if (mid == begin) {
906 	    /*
907 	     * All possibilities exhausted.
908 	     */
909 
910 	    MDEBUG(("%d no midpoint\n", t->retry));
911 	    freedfa(d);
912 	    freedfa(d2);
913 	    return REG_NOMATCH;
914 	}
915 	mid = longest(v, d, begin, mid-1, NULL);
916 	if (mid == NULL) {
917 	    /*
918 	     * Failed to find a new one.
919 	     */
920 
921 	    MDEBUG(("%d failed midpoint\n", t->retry));
922 	    freedfa(d);
923 	    freedfa(d2);
924 	    return REG_NOMATCH;
925 	}
926 	MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
927 	v->mem[t->retry] = (mid - begin) + 1;
928 	zapmem(v, t->left);
929 	zapmem(v, t->right);
930     }
931 }
932 
933 /*
934  - crevdissect - determine backref shortest-first subexpression matches
935  * The retry memory stores the offset of the trial midpoint from begin, plus 1
936  * so that 0 uniquely means "clean slate".
937  ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *);
938  */
939 static int			/* regexec return code */
crevdissect(struct vars * v,struct subre * t,chr * begin,chr * end)940 crevdissect(
941     struct vars *v,
942     struct subre *t,
943     chr *begin,			/* beginning of relevant substring */
944     chr *end)			/* end of same */
945 {
946     struct dfa *d;
947     struct dfa *d2;
948     chr *mid;
949     int er;
950 
951     assert(t->op == '.');
952     assert(t->left != NULL && t->left->cnfa.nstates > 0);
953     assert(t->right != NULL && t->right->cnfa.nstates > 0);
954     assert(t->left->flags&SHORTER);
955 
956     /*
957      * Concatenation -- need to split the substring between parts.
958      */
959 
960     d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
961     if (ISERR()) {
962 	return v->err;
963     }
964     d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
965     if (ISERR()) {
966 	freedfa(d);
967 	return v->err;
968     }
969     MDEBUG(("crev %d\n", t->retry));
970 
971     /*
972      * Pick a tentative midpoint.
973      */
974 
975     if (v->mem[t->retry] == 0) {
976 	mid = shortest(v, d, begin, begin, end, NULL, NULL);
977 	if (mid == NULL) {
978 	    freedfa(d);
979 	    freedfa(d2);
980 	    return REG_NOMATCH;
981 	}
982 	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
983 	v->mem[t->retry] = (mid - begin) + 1;
984     } else {
985 	mid = begin + (v->mem[t->retry] - 1);
986 	MDEBUG(("working midpoint %ld\n", LOFF(mid)));
987     }
988 
989     /*
990      * Iterate until satisfaction or failure.
991      */
992 
993     for (;;) {
994 	/*
995 	 * Try this midpoint on for size.
996 	 */
997 
998 	if (longest(v, d2, mid, end, NULL) == end) {
999 	    er = cdissect(v, t->left, begin, mid);
1000 	    if (er == REG_OKAY) {
1001 		er = cdissect(v, t->right, mid, end);
1002 		if (er == REG_OKAY) {
1003 		    /*
1004 		     * Satisfaction.
1005 		     */
1006 
1007 		    MDEBUG(("successful\n"));
1008 		    freedfa(d);
1009 		    freedfa(d2);
1010 		    return REG_OKAY;
1011 		}
1012 	    }
1013 	    if (er != REG_OKAY && er != REG_NOMATCH) {
1014 		freedfa(d);
1015 		freedfa(d2);
1016 		return er;
1017 	    }
1018 	}
1019 
1020 	/*
1021 	 * That midpoint didn't work, find a new one.
1022 	 */
1023 
1024 	if (mid == end) {
1025 	    /*
1026 	     * All possibilities exhausted.
1027 	     */
1028 
1029 	    MDEBUG(("%d no midpoint\n", t->retry));
1030 	    freedfa(d);
1031 	    freedfa(d2);
1032 	    return REG_NOMATCH;
1033 	}
1034 	mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
1035 	if (mid == NULL) {
1036 	    /*
1037 	     * Failed to find a new one.
1038 	     */
1039 
1040 	    MDEBUG(("%d failed midpoint\n", t->retry));
1041 	    freedfa(d);
1042 	    freedfa(d2);
1043 	    return REG_NOMATCH;
1044 	}
1045 	MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
1046 	v->mem[t->retry] = (mid - begin) + 1;
1047 	zapmem(v, t->left);
1048 	zapmem(v, t->right);
1049     }
1050 }
1051 
1052 /*
1053  - cbrdissect - determine backref subexpression matches
1054  ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
1055  */
1056 static int			/* regexec return code */
cbrdissect(struct vars * v,struct subre * t,chr * begin,chr * end)1057 cbrdissect(
1058     struct vars *v,
1059     struct subre *t,
1060     chr *begin,			/* beginning of relevant substring */
1061     chr *end)			/* end of same */
1062 {
1063     int i;
1064     int n = t->subno;
1065     size_t len;
1066     chr *paren;
1067     chr *p;
1068     chr *stop;
1069     int min = t->min;
1070     int max = t->max;
1071 
1072     assert(t != NULL);
1073     assert(t->op == 'b');
1074     assert(n >= 0);
1075     assert((size_t)n < v->nmatch);
1076 
1077     MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
1078 
1079     if (v->pmatch[n].rm_so == -1) {
1080 	return REG_NOMATCH;
1081     }
1082     paren = v->start + v->pmatch[n].rm_so;
1083     len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
1084 
1085     /*
1086      * No room to maneuver -- retries are pointless.
1087      */
1088 
1089     if (v->mem[t->retry]) {
1090 	return REG_NOMATCH;
1091     }
1092     v->mem[t->retry] = 1;
1093 
1094     /*
1095      * Special-case zero-length string.
1096      */
1097 
1098     if (len == 0) {
1099 	if (begin == end) {
1100 	    return REG_OKAY;
1101 	}
1102 	return REG_NOMATCH;
1103     }
1104 
1105     /*
1106      * And too-short string.
1107      */
1108 
1109     assert(end >= begin);
1110     if ((size_t)(end - begin) < len) {
1111 	return REG_NOMATCH;
1112     }
1113     stop = end - len;
1114 
1115     /*
1116      * Count occurrences.
1117      */
1118 
1119     i = 0;
1120     for (p = begin; p <= stop && (i < max || max == DUPINF); p += len) {
1121 	if ((*v->g->compare)(paren, p, len) != 0) {
1122 	    break;
1123 	}
1124 	i++;
1125     }
1126     MDEBUG(("cbackref found %d\n", i));
1127 
1128     /*
1129      * And sort it out.
1130      */
1131 
1132     if (p != end) {		/* didn't consume all of it */
1133 	return REG_NOMATCH;
1134     }
1135     if (min <= i && (i <= max || max == DUPINF)) {
1136 	return REG_OKAY;
1137     }
1138     return REG_NOMATCH;		/* out of range */
1139 }
1140 
1141 /*
1142  - caltdissect - determine alternative subexpression matches (w. complications)
1143  ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *);
1144  */
1145 static int			/* regexec return code */
caltdissect(struct vars * v,struct subre * t,chr * begin,chr * end)1146 caltdissect(
1147     struct vars *v,
1148     struct subre *t,
1149     chr *begin,			/* beginning of relevant substring */
1150     chr *end)			/* end of same */
1151 {
1152     struct dfa *d;
1153     int er;
1154 #define	UNTRIED	0		/* not yet tried at all */
1155 #define	TRYING	1		/* top matched, trying submatches */
1156 #define	TRIED	2		/* top didn't match or submatches exhausted */
1157 
1158     if (t == NULL) {
1159 	return REG_NOMATCH;
1160     }
1161     assert(t->op == '|');
1162     if (v->mem[t->retry] == TRIED) {
1163 	return caltdissect(v, t->right, begin, end);
1164     }
1165 
1166     MDEBUG(("calt n%d\n", t->retry));
1167     assert(t->left != NULL);
1168 
1169     if (v->mem[t->retry] == UNTRIED) {
1170 	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
1171 	if (ISERR()) {
1172 	    return v->err;
1173 	}
1174 	if (longest(v, d, begin, end, NULL) != end) {
1175 	    freedfa(d);
1176 	    v->mem[t->retry] = TRIED;
1177 	    return caltdissect(v, t->right, begin, end);
1178 	}
1179 	freedfa(d);
1180 	MDEBUG(("calt matched\n"));
1181 	v->mem[t->retry] = TRYING;
1182     }
1183 
1184     er = cdissect(v, t->left, begin, end);
1185     if (er != REG_NOMATCH) {
1186 	return er;
1187     }
1188 
1189     v->mem[t->retry] = TRIED;
1190     return caltdissect(v, t->right, begin, end);
1191 }
1192 
1193 #include "rege_dfa.c"
1194 
1195 /*
1196  * Local Variables:
1197  * mode: c
1198  * c-basic-offset: 4
1199  * fill-column: 78
1200  * End:
1201  */
1202