1 /*
2  * re_*exec and friends - match REs
3  *
4  * Copyright © 1998, 1999 Henry Spencer.  All rights reserved.
5  *
6  * Development of this software was funded, in part, by Cray Research Inc.,
7  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8  * Corporation, none of whom are responsible for the results.  The author
9  * thanks all of them.
10  *
11  * Redistribution and use in source and binary forms -- with or without
12  * modification -- are permitted for any purpose, provided that
13  * redistributions in source form retain this entire copyright notice and
14  * indicate the origin and nature of any modifications.
15  *
16  * I'd appreciate being given credit for this package in the documentation of
17  * software which uses it, but that is not a requirement.
18  *
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
22  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include "regguts.h"
32 
33 /*
34  * Lazy-DFA representation.
35  */
36 
37 struct arcp {			/* "pointer" to an outarc */
38     struct sset *ss;
39     color co;
40 };
41 
42 struct sset {			/* state set */
43     unsigned *states;		/* pointer to bitvector */
44     unsigned hash;		/* hash of bitvector */
45 #define	HASH(bv, nw)	(((nw) == 1) ? *(bv) : hash(bv, nw))
46 #define	HIT(h,bv,ss,nw)	((ss)->hash == (h) && ((nw) == 1 || \
47 	memcmp((void*)(bv), (void*)((ss)->states), (nw)*sizeof(unsigned)) == 0))
48     int flags;
49 #define	STARTER		01	/* the initial state set */
50 #define	POSTSTATE	02	/* includes the goal state */
51 #define	LOCKED		04	/* locked in cache */
52 #define	NOPROGRESS	010	/* zero-progress state set */
53     struct arcp ins;		/* chain of inarcs pointing here */
54     chr *lastseen;		/* last entered on arrival here */
55     struct sset **outs;		/* outarc vector indexed by color */
56     struct arcp *inchain;	/* chain-pointer vector for outarcs */
57 };
58 
59 struct dfa {
60     int nssets;			/* size of cache */
61     int nssused;		/* how many entries occupied yet */
62     int nstates;		/* number of states */
63     int ncolors;		/* length of outarc and inchain vectors */
64     int wordsper;		/* length of state-set bitvectors */
65     struct sset *ssets;		/* state-set cache */
66     unsigned *statesarea;	/* bitvector storage */
67     unsigned *work;		/* pointer to work area within statesarea */
68     struct sset **outsarea;	/* outarc-vector storage */
69     struct arcp *incarea;	/* inchain storage */
70     struct cnfa *cnfa;
71     struct colormap *cm;
72     chr *lastpost;		/* location of last cache-flushed success */
73     chr *lastnopr;		/* location of last cache-flushed NOPROGRESS */
74     struct sset *search;	/* replacement-search-pointer memory */
75     int cptsmalloced;		/* were the areas individually malloced? */
76     char *mallocarea;		/* self, or malloced area, or NULL */
77 };
78 
79 #define	WORK	1		/* number of work bitvectors needed */
80 
81 /*
82  * Setup for non-malloc allocation for small cases.
83  */
84 
85 #define	FEWSTATES	20	/* must be less than UBITS */
86 #define	FEWCOLORS	15
87 struct smalldfa {
88     struct dfa dfa;
89     struct sset ssets[FEWSTATES*2];
90     unsigned statesarea[FEWSTATES*2 + WORK];
91     struct sset *outsarea[FEWSTATES*2 * FEWCOLORS];
92     struct arcp incarea[FEWSTATES*2 * FEWCOLORS];
93 };
94 
95 /*
96  * Internal variables, bundled for easy passing around.
97  */
98 
99 struct vars {
100     regex_t *re;
101     struct guts *g;
102     int eflags;			/* copies of arguments */
103     size_t nmatch;
104     regmatch_t *pmatch;
105     rm_detail_t *details;
106     chr *start;			/* start of string */
107     chr *stop;			/* just past end of string */
108     int err;			/* error code if any (0 none) */
109     struct dfa **subdfas;	/* per-subre DFAs */
110     struct smalldfa dfa1;
111     struct smalldfa dfa2;
112 };
113 #define	VISERR(vv) ((vv)->err != 0)	/* have we seen an error yet? */
114 #define	ISERR()	VISERR(v)
115 #define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e)))
116 #define	ERR(e)	VERR(v, e)	/* record an error */
117 #define	NOERR()	{if (ISERR()) return v->err;}	/* if error seen, return it */
118 #define	OFF(p)	((p) - v->start)
119 #define	LOFF(p)	((long)OFF(p))
120 
121 /*
122  * forward declarations
123  */
124 /* =====^!^===== begin forwards =====^!^===== */
125 /* automatically gathered by fwd; do not hand-edit */
126 /* === regexec.c === */
127 int exec(regex_t *, const chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int);
128 static struct dfa *getsubdfa(struct vars *, struct subre *);
129 static int simpleFind(struct vars *const, struct cnfa *const, struct colormap *const);
130 static int complicatedFind(struct vars *const, struct cnfa *const, struct colormap *const);
131 static int complicatedFindLoop(struct vars *const, struct dfa *const, struct dfa *const, chr **const);
132 static void zapallsubs(regmatch_t *const, const size_t);
133 static void zaptreesubs(struct vars *const, struct subre *const);
134 static void subset(struct vars *const, struct subre *const, chr *const, chr *const);
135 static int cdissect(struct vars *, struct subre *, chr *, chr *);
136 static int ccondissect(struct vars *, struct subre *, chr *, chr *);
137 static int crevcondissect(struct vars *, struct subre *, chr *, chr *);
138 static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
139 static int caltdissect(struct vars *, struct subre *, chr *, chr *);
140 static int citerdissect(struct vars *, struct subre *, chr *, chr *);
141 static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
142 /* === rege_dfa.c === */
143 static chr *longest(struct vars *const, struct dfa *const, chr *const, chr *const, int *const);
144 static chr *shortest(struct vars *const, struct dfa *const, chr *const, chr *const, chr *const, chr **const, int *const);
145 static chr *lastCold(struct vars *const, struct dfa *const);
146 static struct dfa *newDFA(struct vars *const, struct cnfa *const, struct colormap *const, struct smalldfa *);
147 static void freeDFA(struct dfa *const);
148 static unsigned hash(unsigned *const, const int);
149 static struct sset *initialize(struct vars *const, struct dfa *const, chr *const);
150 static struct sset *miss(struct vars *const, struct dfa *const, struct sset *const, const pcolor, chr *const, chr *const);
151 static int checkLAConstraint(struct vars *const, struct cnfa *const, chr *const, const pcolor);
152 static struct sset *getVacantSS(struct vars *const, struct dfa *const, chr *const, chr *const);
153 static struct sset *pickNextSS(struct vars *const, struct dfa *const, chr *const, chr *const);
154 /* automatically gathered by fwd; do not hand-edit */
155 /* =====^!^===== end forwards =====^!^===== */
156 
157 /*
158  - exec - match regular expression
159  ^ int exec(regex_t *, const chr *, size_t, rm_detail_t *,
160  ^					size_t, regmatch_t [], int);
161  */
162 int
exec(regex_t * re,const chr * string,size_t len,rm_detail_t * details,size_t nmatch,regmatch_t pmatch[],int flags)163 exec(
164     regex_t *re,
165     const chr *string,
166     size_t len,
167     rm_detail_t *details,
168     size_t nmatch,
169     regmatch_t pmatch[],
170     int flags)
171 {
172     AllocVars(v);
173     int st, backref;
174     int n;
175     int i;
176 #define	LOCALMAT	20
177     regmatch_t mat[LOCALMAT];
178 #define LOCALDFAS	40
179     struct dfa *subdfas[LOCALDFAS];
180 
181     /*
182      * Sanity checks.
183      */
184 
185     if (re == NULL || string == NULL || re->re_magic != REMAGIC) {
186 	FreeVars(v);
187 	return REG_INVARG;
188     }
189     if (re->re_csize != sizeof(chr)) {
190 	FreeVars(v);
191 	return REG_MIXED;
192     }
193 
194     /*
195      * Setup.
196      */
197 
198     v->re = re;
199     v->g = (struct guts *)re->re_guts;
200     if ((v->g->cflags&REG_EXPECT) && details == NULL) {
201 	FreeVars(v);
202 	return REG_INVARG;
203     }
204     if (v->g->info&REG_UIMPOSSIBLE) {
205 	FreeVars(v);
206 	return REG_NOMATCH;
207     }
208     backref = (v->g->info&REG_UBACKREF) ? 1 : 0;
209     v->eflags = flags;
210     if (v->g->cflags&REG_NOSUB) {
211 	nmatch = 0;		/* override client */
212     }
213     v->nmatch = nmatch;
214     if (backref) {
215 	/*
216 	 * Need work area.
217 	 */
218 
219 	if (v->g->nsub + 1 <= LOCALMAT) {
220 	    v->pmatch = mat;
221 	} else {
222 	    v->pmatch = (regmatch_t *)
223 		    MALLOC((v->g->nsub + 1) * sizeof(regmatch_t));
224 	}
225 	if (v->pmatch == NULL) {
226 	    FreeVars(v);
227 	    return REG_ESPACE;
228 	}
229 	v->nmatch = v->g->nsub + 1;
230     } else {
231 	v->pmatch = pmatch;
232     }
233     v->details = details;
234     v->start = (chr *)string;
235     v->stop = (chr *)string + len;
236     v->err = 0;
237     assert(v->g->ntree >= 0);
238     n = v->g->ntree;
239     if (n <= LOCALDFAS)
240 	v->subdfas = subdfas;
241     else
242 	v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
243     if (v->subdfas == NULL) {
244 	if (v->pmatch != pmatch && v->pmatch != mat)
245 	    FREE(v->pmatch);
246 	FreeVars(v);
247 	return REG_ESPACE;
248     }
249     for (i = 0; i < n; i++)
250 	v->subdfas[i] = NULL;
251 
252     /*
253      * Do it.
254      */
255 
256     assert(v->g->tree != NULL);
257     if (backref) {
258 	st = complicatedFind(v, &v->g->tree->cnfa, &v->g->cmap);
259     } else {
260 	st = simpleFind(v, &v->g->tree->cnfa, &v->g->cmap);
261     }
262 
263     /*
264      * Copy (portion of) match vector over if necessary.
265      */
266 
267     if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
268 	zapallsubs(pmatch, nmatch);
269 	n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
270 	memcpy((void*)(pmatch), (void*)(v->pmatch), n*sizeof(regmatch_t));
271     }
272 
273     /*
274      * Clean up.
275      */
276 
277     if (v->pmatch != pmatch && v->pmatch != mat) {
278 	FREE(v->pmatch);
279     }
280     n = v->g->ntree;
281     for (i = 0; i < n; i++) {
282 	if (v->subdfas[i] != NULL)
283 	    freeDFA(v->subdfas[i]);
284     }
285     if (v->subdfas != subdfas)
286 	FREE(v->subdfas);
287     FreeVars(v);
288     return st;
289 }
290 
291 /*
292  - getsubdfa - create or re-fetch the DFA for a subre node
293  * We only need to create the DFA once per overall regex execution.
294  * The DFA will be freed by the cleanup step in exec().
295  */
296 static struct dfa *
getsubdfa(struct vars * v,struct subre * t)297 getsubdfa(struct vars * v,
298 	  struct subre * t)
299 {
300     if (v->subdfas[t->id] == NULL) {
301 	v->subdfas[t->id] = newDFA(v, &t->cnfa, &v->g->cmap, NULL);
302 	if (ISERR())
303 	    return NULL;
304     }
305     return v->subdfas[t->id];
306 }
307 
308 /*
309  - simpleFind - find a match for the main NFA (no-complications case)
310  ^ static int simpleFind(struct vars *, struct cnfa *, struct colormap *);
311  */
312 static int
simpleFind(struct vars * const v,struct cnfa * const cnfa,struct colormap * const cm)313 simpleFind(
314     struct vars *const v,
315     struct cnfa *const cnfa,
316     struct colormap *const cm)
317 {
318     struct dfa *s, *d;
319     chr *begin, *end = NULL;
320     chr *cold;
321     chr *open, *close;		/* Open and close of range of possible
322 				 * starts */
323     int hitend;
324     int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0;
325 
326     /*
327      * First, a shot with the search RE.
328      */
329 
330     s = newDFA(v, &v->g->search, cm, &v->dfa1);
331     assert(!(ISERR() && s != NULL));
332     NOERR();
333     MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
334     cold = NULL;
335     close = shortest(v, s, v->start, v->start, v->stop, &cold, NULL);
336     freeDFA(s);
337     NOERR();
338     if (v->g->cflags&REG_EXPECT) {
339 	assert(v->details != NULL);
340 	if (cold != NULL) {
341 	    v->details->rm_extend.rm_so = OFF(cold);
342 	} else {
343 	    v->details->rm_extend.rm_so = OFF(v->stop);
344 	}
345 	v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
346     }
347     if (close == NULL) {	/* not found */
348 	return REG_NOMATCH;
349     }
350     if (v->nmatch == 0) {	/* found, don't need exact location */
351 	return REG_OKAY;
352     }
353 
354     /*
355      * Find starting point and match.
356      */
357 
358     assert(cold != NULL);
359     open = cold;
360     cold = NULL;
361     MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
362     d = newDFA(v, cnfa, cm, &v->dfa1);
363     assert(!(ISERR() && d != NULL));
364     NOERR();
365     for (begin = open; begin <= close; begin++) {
366 	MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
367 	if (shorter) {
368 	    end = shortest(v, d, begin, begin, v->stop, NULL, &hitend);
369 	} else {
370 	    end = longest(v, d, begin, v->stop, &hitend);
371 	}
372 	if (ISERR()) {
373 	    freeDFA(d);
374 	    return v->err;
375 	}
376 	if (hitend && cold == NULL) {
377 	    cold = begin;
378 	}
379 	if (end != NULL) {
380 	    break;		/* NOTE BREAK OUT */
381 	}
382     }
383     assert(end != NULL);	/* search RE succeeded so loop should */
384     freeDFA(d);
385 
386     /*
387      * And pin down details.
388      */
389 
390     assert(v->nmatch > 0);
391     v->pmatch[0].rm_so = OFF(begin);
392     v->pmatch[0].rm_eo = OFF(end);
393     if (v->g->cflags&REG_EXPECT) {
394 	if (cold != NULL) {
395 	    v->details->rm_extend.rm_so = OFF(cold);
396 	} else {
397 	    v->details->rm_extend.rm_so = OFF(v->stop);
398 	}
399 	v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
400     }
401     if (v->nmatch == 1) {	/* no need for submatches */
402 	return REG_OKAY;
403     }
404 
405     /*
406      * Find submatches.
407      */
408 
409     zapallsubs(v->pmatch, v->nmatch);
410     return cdissect(v, v->g->tree, begin, end);
411 }
412 
413 /*
414  - complicatedFind - find a match for the main NFA (with complications)
415  ^ static int complicatedFind(struct vars *, struct cnfa *, struct colormap *);
416  */
417 static int
complicatedFind(struct vars * const v,struct cnfa * const cnfa,struct colormap * const cm)418 complicatedFind(
419     struct vars *const v,
420     struct cnfa *const cnfa,
421     struct colormap *const cm)
422 {
423     struct dfa *s, *d;
424     chr *cold = NULL; /* silence gcc 4 warning */
425     int ret;
426 
427     s = newDFA(v, &v->g->search, cm, &v->dfa1);
428     NOERR();
429     d = newDFA(v, cnfa, cm, &v->dfa2);
430     if (ISERR()) {
431 	assert(d == NULL);
432 	freeDFA(s);
433 	return v->err;
434     }
435 
436     ret = complicatedFindLoop(v, d, s, &cold);
437 
438     freeDFA(d);
439     freeDFA(s);
440     NOERR();
441     if (v->g->cflags&REG_EXPECT) {
442 	assert(v->details != NULL);
443 	if (cold != NULL) {
444 	    v->details->rm_extend.rm_so = OFF(cold);
445 	} else {
446 	    v->details->rm_extend.rm_so = OFF(v->stop);
447 	}
448 	v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
449     }
450     return ret;
451 }
452 
453 /*
454  - complicatedFindLoop - the heart of complicatedFind
455  ^ static int complicatedFindLoop(struct vars *,
456  ^	struct dfa *, struct dfa *, chr **);
457  */
458 static int
complicatedFindLoop(struct vars * const v,struct dfa * const d,struct dfa * const s,chr ** const coldp)459 complicatedFindLoop(
460     struct vars *const v,
461     struct dfa *const d,
462     struct dfa *const s,
463     chr **const coldp)		/* where to put coldstart pointer */
464 {
465     chr *begin, *end;
466     chr *cold;
467     chr *open, *close;		/* Open and close of range of possible
468 				 * starts */
469     chr *estart, *estop;
470     int er, hitend;
471     int shorter = v->g->tree->flags&SHORTER;
472 
473     assert(d != NULL && s != NULL);
474     cold = NULL;
475     close = v->start;
476     do {
477 	MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
478 	close = shortest(v, s, close, close, v->stop, &cold, NULL);
479 	if (close == NULL) {
480 	    break;		/* NOTE BREAK */
481 	}
482 	assert(cold != NULL);
483 	open = cold;
484 	cold = NULL;
485 	MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
486 	for (begin = open; begin <= close; begin++) {
487 	    MDEBUG(("\ncomplicatedFind trying at %ld\n", LOFF(begin)));
488 	    estart = begin;
489 	    estop = v->stop;
490 	    for (;;) {
491 		if (shorter) {
492 		    end = shortest(v, d, begin, estart, estop, NULL, &hitend);
493 		} else {
494 		    end = longest(v, d, begin, estop, &hitend);
495 		}
496 		if (hitend && cold == NULL) {
497 		    cold = begin;
498 		}
499 		if (end == NULL) {
500 		    break;	/* NOTE BREAK OUT */
501 		}
502 
503 		MDEBUG(("tentative end %ld\n", LOFF(end)));
504 		zapallsubs(v->pmatch, v->nmatch);
505 		er = cdissect(v, v->g->tree, begin, end);
506 		if (er == REG_OKAY) {
507 		    if (v->nmatch > 0) {
508 			v->pmatch[0].rm_so = OFF(begin);
509 			v->pmatch[0].rm_eo = OFF(end);
510 		    }
511 		    *coldp = cold;
512 		    return REG_OKAY;
513 		}
514 		if (er != REG_NOMATCH) {
515 		    ERR(er);
516 		    *coldp = cold;
517 		    return er;
518 		}
519 		if ((shorter) ? end == estop : end == begin) {
520 		    break;
521 		}
522 
523 		/*
524 		 * Go around and try again
525 		 */
526 
527 		if (shorter) {
528 		    estart = end + 1;
529 		} else {
530 		    estop = end - 1;
531 		}
532 	    }
533 	}
534     } while (close < v->stop);
535 
536     *coldp = cold;
537     return REG_NOMATCH;
538 }
539 
540 /*
541  - zapallsubs - initialize all subexpression matches to "no match"
542  ^ static void zapallsubs(regmatch_t *, size_t);
543  */
544 static void
zapallsubs(regmatch_t * const p,const size_t n)545 zapallsubs(
546     regmatch_t *const p,
547     const size_t n)
548 {
549     size_t i;
550 
551     for (i = n-1; i > 0; i--) {
552 	p[i].rm_so = -1;
553 	p[i].rm_eo = -1;
554     }
555 }
556 
557 /*
558  - zaptreesubs - initialize subexpressions within subtree to "no match"
559  ^ static void zaptreesubs(struct vars *, struct subre *);
560  */
561 static void
zaptreesubs(struct vars * const v,struct subre * const t)562 zaptreesubs(
563     struct vars *const v,
564     struct subre *const t)
565 {
566     if (t->op == '(') {
567 	int n = t->subno;
568 	assert(n > 0);
569 	if ((size_t) n < v->nmatch) {
570 	    v->pmatch[n].rm_so = -1;
571 	    v->pmatch[n].rm_eo = -1;
572 	}
573     }
574 
575     if (t->left != NULL) {
576 	zaptreesubs(v, t->left);
577     }
578     if (t->right != NULL) {
579 	zaptreesubs(v, t->right);
580     }
581 }
582 
583 /*
584  - subset - set subexpression match data for a successful subre
585  ^ static void subset(struct vars *, struct subre *, chr *, chr *);
586  */
587 static void
subset(struct vars * const v,struct subre * const sub,chr * const begin,chr * const end)588 subset(
589     struct vars *const v,
590     struct subre *const sub,
591     chr *const begin,
592     chr *const end)
593 {
594     int n = sub->subno;
595 
596     assert(n > 0);
597     if ((size_t)n >= v->nmatch) {
598 	return;
599     }
600 
601     MDEBUG(("setting %d\n", n));
602     v->pmatch[n].rm_so = OFF(begin);
603     v->pmatch[n].rm_eo = OFF(end);
604 }
605 
606 /*
607  - cdissect - check backrefs and determine subexpression matches
608  * cdissect recursively processes a subre tree to check matching of backrefs
609  * and/or identify submatch boundaries for capture nodes.  The proposed match
610  * runs from "begin" to "end" (not including "end"), and we are basically
611  * "dissecting" it to see where the submatches are.
612  * Before calling any level of cdissect, the caller must have run the node's
613  * DFA and found that the proposed substring satisfies the DFA.  (We make
614  * the caller do that because in concatenation and iteration nodes, it's
615  * much faster to check all the substrings against the child DFAs before we
616  * recurse.)  Also, caller must have cleared subexpression match data via
617  * zaptreesubs (or zapallsubs at the top level).
618  ^ static int cdissect(struct vars *, struct subre *, chr *, chr *);
619  */
620 static int			/* regexec return code */
cdissect(struct vars * v,struct subre * t,chr * begin,chr * end)621 cdissect(
622     struct vars *v,
623     struct subre *t,
624     chr *begin,		/* beginning of relevant substring */
625     chr *end)		/* end of same */
626 {
627     int er;
628 
629     assert(t != NULL);
630     MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
631 
632     switch (t->op) {
633     case '=':			/* terminal node */
634 	assert(t->left == NULL && t->right == NULL);
635 	er = REG_OKAY;		/* no action, parent did the work */
636 	break;
637     case 'b':			/* back reference */
638 	assert(t->left == NULL && t->right == NULL);
639 	er = cbrdissect(v, t, begin, end);
640 	break;
641     case '.':			/* concatenation */
642 	assert(t->left != NULL && t->right != NULL);
643 	if (t->left->flags & SHORTER) /* reverse scan */
644 	    er = crevcondissect(v, t, begin, end);
645 	else
646 	    er = ccondissect(v, t, begin, end);
647 	break;
648     case '|':			/* alternation */
649 	assert(t->left != NULL);
650 	er = caltdissect(v, t, begin, end);
651 	break;
652     case '*':			/* iteration */
653 	assert(t->left != NULL);
654 	if (t->left->flags & SHORTER) /* reverse scan */
655 	    er = creviterdissect(v, t, begin, end);
656 	else
657 	    er = citerdissect(v, t, begin, end);
658 	break;
659     case '(':			/* capturing */
660 	assert(t->left != NULL && t->right == NULL);
661 	assert(t->subno > 0);
662 	er = cdissect(v, t->left, begin, end);
663 	if (er == REG_OKAY) {
664 	    subset(v, t, begin, end);
665 	}
666 	break;
667     default:
668 	er = REG_ASSERT;
669 	break;
670     }
671 
672     /*
673      * We should never have a match failure unless backrefs lurk below;
674      * otherwise, either caller failed to check the DFA, or there's some
675      * inconsistency between the DFA and the node's innards.
676      */
677     assert(er != REG_NOMATCH || (t->flags & BACKR));
678 
679     return er;
680 }
681 
682 /*
683  - ccondissect - dissect match for concatenation node
684  ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *);
685  */
686 static int			/* regexec return code */
ccondissect(struct vars * v,struct subre * t,chr * begin,chr * end)687 ccondissect(
688     struct vars *v,
689     struct subre *t,
690     chr *begin,		/* beginning of relevant substring */
691     chr *end)		/* end of same */
692 {
693     struct dfa *d, *d2;
694     chr *mid;
695 
696     assert(t->op == '.');
697     assert(t->left != NULL && t->left->cnfa.nstates > 0);
698     assert(t->right != NULL && t->right->cnfa.nstates > 0);
699     assert(!(t->left->flags & SHORTER));
700 
701     d = getsubdfa(v, t->left);
702     NOERR();
703     d2 = getsubdfa(v, t->right);
704     NOERR();
705 
706     MDEBUG(("cConcat %d\n", t->id));
707 
708     /*
709      * Pick a tentative midpoint.
710      */
711     mid = longest(v, d, begin, end, (int *) NULL);
712     if (mid == NULL) {
713 	return REG_NOMATCH;
714     }
715     MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
716 
717     /*
718      * Iterate until satisfaction or failure.
719      */
720 
721     for (;;) {
722 	/*
723 	 * Try this midpoint on for size.
724 	 */
725 
726 	if (longest(v, d2, mid, end, NULL) == end) {
727 	    int er = cdissect(v, t->left, begin, mid);
728 
729 	    if (er == REG_OKAY) {
730 		er = cdissect(v, t->right, mid, end);
731 		if (er == REG_OKAY) {
732 		    /*
733 		     * Satisfaction.
734 		     */
735 
736 		    MDEBUG(("successful\n"));
737 		    return REG_OKAY;
738 		}
739 	    }
740 	    if (er != REG_NOMATCH) {
741 		return er;
742 	    }
743 	}
744 
745 	/*
746 	 * That midpoint didn't work, find a new one.
747 	 */
748 
749 	if (mid == begin) {
750 	    /*
751 	     * All possibilities exhausted.
752 	     */
753 
754 	    MDEBUG(("%d no midpoint\n", t->id));
755 	    return REG_NOMATCH;
756 	}
757 	mid = longest(v, d, begin, mid-1, NULL);
758 	if (mid == NULL) {
759 	    /*
760 	     * Failed to find a new one.
761 	     */
762 
763 	    MDEBUG(("%d failed midpoint\n", t->id));
764 	    return REG_NOMATCH;
765 	}
766 	MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
767 	zaptreesubs(v, t->left);
768 	zaptreesubs(v, t->right);
769     }
770 }
771 
772 /*
773  - crevcondissect - dissect match for concatenation node, shortest-first
774  ^ static int crevcondissect(struct vars *, struct subre *, chr *, chr *);
775  */
776 static int			/* regexec return code */
crevcondissect(struct vars * v,struct subre * t,chr * begin,chr * end)777 crevcondissect(
778     struct vars *v,
779     struct subre *t,
780     chr *begin,		/* beginning of relevant substring */
781     chr *end)		/* end of same */
782 {
783     struct dfa *d, *d2;
784     chr *mid;
785 
786     assert(t->op == '.');
787     assert(t->left != NULL && t->left->cnfa.nstates > 0);
788     assert(t->right != NULL && t->right->cnfa.nstates > 0);
789     assert(t->left->flags&SHORTER);
790 
791     d = getsubdfa(v, t->left);
792     NOERR();
793     d2 = getsubdfa(v, t->right);
794     NOERR();
795 
796     MDEBUG(("crevcon %d\n", t->id));
797 
798     /*
799      * Pick a tentative midpoint.
800      */
801 
802     mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
803     if (mid == NULL) {
804 	return REG_NOMATCH;
805     }
806     MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
807 
808     /*
809      * Iterate until satisfaction or failure.
810      */
811 
812     for (;;) {
813 	/*
814 	 * Try this midpoint on for size.
815 	 */
816 
817 	if (longest(v, d2, mid, end, NULL) == end) {
818 	    int er = cdissect(v, t->left, begin, mid);
819 
820 	    if (er == REG_OKAY) {
821 		er = cdissect(v, t->right, mid, end);
822 		if (er == REG_OKAY) {
823 		    /*
824 		     * Satisfaction.
825 		     */
826 
827 		    MDEBUG(("successful\n"));
828 		    return REG_OKAY;
829 		}
830 	    }
831 	    if (er != REG_NOMATCH) {
832 		return er;
833 	    }
834 	}
835 
836 	/*
837 	 * That midpoint didn't work, find a new one.
838 	 */
839 
840 	if (mid == end) {
841 	    /*
842 	     * All possibilities exhausted.
843 	     */
844 
845 	    MDEBUG(("%d no midpoint\n", t->id));
846 	    return REG_NOMATCH;
847 	}
848 	mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
849 	if (mid == NULL) {
850 	    /*
851 	     * Failed to find a new one.
852 	     */
853 
854 	    MDEBUG(("%d failed midpoint\n", t->id));
855 	    return REG_NOMATCH;
856 	}
857 	MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
858 	zaptreesubs(v, t->left);
859 	zaptreesubs(v, t->right);
860     }
861 }
862 
863 /*
864  - cbrdissect - dissect match for backref node
865  ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
866  */
867 static int			/* regexec return code */
cbrdissect(struct vars * v,struct subre * t,chr * begin,chr * end)868 cbrdissect(
869     struct vars *v,
870     struct subre *t,
871     chr *begin,		/* beginning of relevant substring */
872     chr *end)		/* end of same */
873 {
874     int n = t->subno, min = t->min, max = t->max;
875     size_t numreps;
876     size_t tlen;
877     size_t brlen;
878     chr *brstring;
879     chr *p;
880 
881     assert(t != NULL);
882     assert(t->op == 'b');
883     assert(n >= 0);
884     assert((size_t)n < v->nmatch);
885 
886     MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));
887 
888     /* get the backreferenced string */
889     if (v->pmatch[n].rm_so == TCL_INDEX_NONE) {
890 	return REG_NOMATCH;
891     }
892     brstring = v->start + v->pmatch[n].rm_so;
893     brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
894 
895     /* special cases for zero-length strings */
896     if (brlen == 0) {
897 	/*
898 	 * matches only if target is zero length, but any number of
899 	 * repetitions can be considered to be present
900 	 */
901 	if (begin == end && min <= max) {
902 	    MDEBUG(("cbackref matched trivially\n"));
903 	    return REG_OKAY;
904 	}
905 	return REG_NOMATCH;
906     }
907     if (begin == end) {
908 	/* matches only if zero repetitions are okay */
909 	if (min == 0) {
910 	    MDEBUG(("cbackref matched trivially\n"));
911 	    return REG_OKAY;
912 	}
913 	return REG_NOMATCH;
914     }
915 
916     /*
917      * check target length to see if it could possibly be an allowed number of
918      * repetitions of brstring
919      */
920 
921     assert(end > begin);
922     tlen = end - begin;
923     if (tlen % brlen != 0)
924 	return REG_NOMATCH;
925     numreps = tlen / brlen;
926     if (numreps < (size_t)min || (numreps > (size_t)max && max != DUPINF))
927 	return REG_NOMATCH;
928 
929     /* okay, compare the actual string contents */
930     p = begin;
931     while (numreps-- > 0) {
932 	if ((*v->g->compare) (brstring, p, brlen) != 0)
933 	    return REG_NOMATCH;
934 	p += brlen;
935     }
936 
937     MDEBUG(("cbackref matched\n"));
938     return REG_OKAY;
939 }
940 
941 /*
942  - caltdissect - dissect match for alternation node
943  ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *);
944  */
945 static int			/* regexec return code */
caltdissect(struct vars * v,struct subre * t,chr * begin,chr * end)946 caltdissect(
947     struct vars *v,
948     struct subre *t,
949     chr *begin,		/* beginning of relevant substring */
950     chr *end)		/* end of same */
951 {
952     struct dfa *d;
953     int er;
954 
955     /* We loop, rather than tail-recurse, to handle a chain of alternatives */
956     while (t != NULL) {
957 	assert(t->op == '|');
958 	assert(t->left != NULL && t->left->cnfa.nstates > 0);
959 
960 	MDEBUG(("calt n%d\n", t->id));
961 
962 	d = getsubdfa(v, t->left);
963 	NOERR();
964 	if (longest(v, d, begin, end, (int *) NULL) == end) {
965 	    MDEBUG(("calt matched\n"));
966 	    er = cdissect(v, t->left, begin, end);
967 	    if (er != REG_NOMATCH) {
968 		return er;
969 	    }
970 	}
971 
972 	t = t->right;
973     }
974 
975     return REG_NOMATCH;
976 }
977 
978 /*
979  - citerdissect - dissect match for iteration node
980  ^ static int citerdissect(struct vars *, struct subre *, chr *, chr *);
981  */
982 static int			/* regexec return code */
citerdissect(struct vars * v,struct subre * t,chr * begin,chr * end)983 citerdissect(struct vars * v,
984 	     struct subre * t,
985 	     chr *begin,	/* beginning of relevant substring */
986 	     chr *end)		/* end of same */
987 {
988     struct dfa *d;
989     chr	  **endpts;
990     chr	   *limit;
991     int		min_matches;
992     size_t	max_matches;
993     int		nverified;
994     int		k;
995     int		i;
996     int		er;
997 
998     assert(t->op == '*');
999     assert(t->left != NULL && t->left->cnfa.nstates > 0);
1000     assert(!(t->left->flags & SHORTER));
1001     assert(begin <= end);
1002 
1003     /*
1004      * If zero matches are allowed, and target string is empty, just declare
1005      * victory.  OTOH, if target string isn't empty, zero matches can't work
1006      * so we pretend the min is 1.
1007      */
1008     min_matches = t->min;
1009     if (min_matches <= 0) {
1010 	if (begin == end)
1011 	    return REG_OKAY;
1012 	min_matches = 1;
1013     }
1014 
1015     /*
1016      * We need workspace to track the endpoints of each sub-match.  Normally
1017      * we consider only nonzero-length sub-matches, so there can be at most
1018      * end-begin of them.  However, if min is larger than that, we will also
1019      * consider zero-length sub-matches in order to find enough matches.
1020      *
1021      * For convenience, endpts[0] contains the "begin" pointer and we store
1022      * sub-match endpoints in endpts[1..max_matches].
1023      */
1024     max_matches = end - begin;
1025     if (max_matches > (size_t)t->max && t->max != DUPINF)
1026 	max_matches = t->max;
1027     if (max_matches < (size_t)min_matches)
1028 	max_matches = min_matches;
1029     endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1030     if (endpts == NULL)
1031 	return REG_ESPACE;
1032     endpts[0] = begin;
1033 
1034     d = getsubdfa(v, t->left);
1035     if (ISERR()) {
1036 	FREE(endpts);
1037 	return v->err;
1038     }
1039     MDEBUG(("citer %d\n", t->id));
1040 
1041     /*
1042      * Our strategy is to first find a set of sub-match endpoints that are
1043      * valid according to the child node's DFA, and then recursively dissect
1044      * each sub-match to confirm validity.  If any validity check fails,
1045      * backtrack the last sub-match and try again.  And, when we next try for
1046      * a validity check, we need not recheck any successfully verified
1047      * sub-matches that we didn't move the endpoints of.  nverified remembers
1048      * how many sub-matches are currently known okay.
1049      */
1050 
1051     /* initialize to consider first sub-match */
1052     nverified = 0;
1053     k = 1;
1054     limit = end;
1055 
1056     /* iterate until satisfaction or failure */
1057     while (k > 0) {
1058 	/* try to find an endpoint for the k'th sub-match */
1059 	endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
1060 	if (endpts[k] == NULL) {
1061 	    /* no match possible, so see if we can shorten previous one */
1062 	    k--;
1063 	    goto backtrack;
1064 	}
1065 	MDEBUG(("%d: working endpoint %d: %ld\n",
1066 		t->id, k, LOFF(endpts[k])));
1067 
1068 	/* k'th sub-match can no longer be considered verified */
1069 	if (nverified >= k)
1070 	    nverified = k - 1;
1071 
1072 	if (endpts[k] != end) {
1073 	    /* haven't reached end yet, try another iteration if allowed */
1074 	    if ((size_t)k >= max_matches) {
1075 		/* must try to shorten some previous match */
1076 		k--;
1077 		goto backtrack;
1078 	    }
1079 
1080 	    /* reject zero-length match unless necessary to achieve min */
1081 	    if (endpts[k] == endpts[k - 1] &&
1082 		(k >= min_matches || min_matches - k < end - endpts[k]))
1083 		goto backtrack;
1084 
1085 	    k++;
1086 	    limit = end;
1087 	    continue;
1088 	}
1089 
1090 	/*
1091 	 * We've identified a way to divide the string into k sub-matches
1092 	 * that works so far as the child DFA can tell.  If k is an allowed
1093 	 * number of matches, start the slow part: recurse to verify each
1094 	 * sub-match.  We always have k <= max_matches, needn't check that.
1095 	 */
1096 	if (k < min_matches)
1097 	    goto backtrack;
1098 
1099 	MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1100 
1101 	for (i = nverified + 1; i <= k; i++) {
1102 	    zaptreesubs(v, t->left);
1103 	    er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
1104 	    if (er == REG_OKAY) {
1105 		nverified = i;
1106 		continue;
1107 	    }
1108 	    if (er == REG_NOMATCH)
1109 		break;
1110 	    /* oops, something failed */
1111 	    FREE(endpts);
1112 	    return er;
1113 	}
1114 
1115 	if (i > k) {
1116 	    /* satisfaction */
1117 	    MDEBUG(("%d successful\n", t->id));
1118 	    FREE(endpts);
1119 	    return REG_OKAY;
1120 	}
1121 
1122 	/* match failed to verify, so backtrack */
1123 
1124     backtrack:
1125 	/*
1126 	 * Must consider shorter versions of the current sub-match.  However,
1127 	 * we'll only ask for a zero-length match if necessary.
1128 	 */
1129 	while (k > 0) {
1130 	    chr	   *prev_end = endpts[k - 1];
1131 
1132 	    if (endpts[k] > prev_end) {
1133 		limit = endpts[k] - 1;
1134 		if (limit > prev_end ||
1135 		    (k < min_matches && min_matches - k >= end - prev_end)) {
1136 		    /* break out of backtrack loop, continue the outer one */
1137 		    break;
1138 		}
1139 	    }
1140 	    /* can't shorten k'th sub-match any more, consider previous one */
1141 	    k--;
1142 	}
1143     }
1144 
1145     /* all possibilities exhausted */
1146     MDEBUG(("%d failed\n", t->id));
1147     FREE(endpts);
1148     return REG_NOMATCH;
1149 }
1150 
1151 /*
1152  - creviterdissect - dissect match for iteration node, shortest-first
1153  ^ static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
1154  */
1155 static int			/* regexec return code */
creviterdissect(struct vars * v,struct subre * t,chr * begin,chr * end)1156 creviterdissect(struct vars * v,
1157 		struct subre * t,
1158 		chr *begin,	/* beginning of relevant substring */
1159 		chr *end)	/* end of same */
1160 {
1161     struct dfa *d;
1162     chr	  **endpts;
1163     chr	   *limit;
1164     int		min_matches;
1165     size_t	max_matches;
1166     int		nverified;
1167     int		k;
1168     int		i;
1169     int		er;
1170 
1171     assert(t->op == '*');
1172     assert(t->left != NULL && t->left->cnfa.nstates > 0);
1173     assert(t->left->flags & SHORTER);
1174     assert(begin <= end);
1175 
1176     /*
1177      * If zero matches are allowed, and target string is empty, just declare
1178      * victory.  OTOH, if target string isn't empty, zero matches can't work
1179      * so we pretend the min is 1.
1180      */
1181     min_matches = t->min;
1182     if (min_matches <= 0) {
1183 	if (begin == end)
1184 	    return REG_OKAY;
1185 	min_matches = 1;
1186     }
1187 
1188     /*
1189      * We need workspace to track the endpoints of each sub-match.  Normally
1190      * we consider only nonzero-length sub-matches, so there can be at most
1191      * end-begin of them.  However, if min is larger than that, we will also
1192      * consider zero-length sub-matches in order to find enough matches.
1193      *
1194      * For convenience, endpts[0] contains the "begin" pointer and we store
1195      * sub-match endpoints in endpts[1..max_matches].
1196      */
1197     max_matches = end - begin;
1198     if (max_matches > (size_t)t->max && t->max != DUPINF)
1199 	max_matches = t->max;
1200     if (max_matches < (size_t)min_matches)
1201 	max_matches = min_matches;
1202     endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1203     if (endpts == NULL)
1204 	return REG_ESPACE;
1205     endpts[0] = begin;
1206 
1207     d = getsubdfa(v, t->left);
1208     if (ISERR()) {
1209 	FREE(endpts);
1210 	return v->err;
1211     }
1212     MDEBUG(("creviter %d\n", t->id));
1213 
1214     /*
1215      * Our strategy is to first find a set of sub-match endpoints that are
1216      * valid according to the child node's DFA, and then recursively dissect
1217      * each sub-match to confirm validity.  If any validity check fails,
1218      * backtrack the last sub-match and try again.  And, when we next try for
1219      * a validity check, we need not recheck any successfully verified
1220      * sub-matches that we didn't move the endpoints of.  nverified remembers
1221      * how many sub-matches are currently known okay.
1222      */
1223 
1224     /* initialize to consider first sub-match */
1225     nverified = 0;
1226     k = 1;
1227     limit = begin;
1228 
1229     /* iterate until satisfaction or failure */
1230     while (k > 0) {
1231 	/* disallow zero-length match unless necessary to achieve min */
1232 	if (limit == endpts[k - 1] &&
1233 	    limit != end &&
1234 	    (k >= min_matches || min_matches - k < end - limit))
1235 	    limit++;
1236 
1237 	/* if this is the last allowed sub-match, it must reach to the end */
1238 	if ((size_t)k >= max_matches)
1239 	    limit = end;
1240 
1241 	/* try to find an endpoint for the k'th sub-match */
1242 	endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
1243 			     (chr **) NULL, (int *) NULL);
1244 	if (endpts[k] == NULL) {
1245 	    /* no match possible, so see if we can lengthen previous one */
1246 	    k--;
1247 	    goto backtrack;
1248 	}
1249 	MDEBUG(("%d: working endpoint %d: %ld\n",
1250 		t->id, k, LOFF(endpts[k])));
1251 
1252 	/* k'th sub-match can no longer be considered verified */
1253 	if (nverified >= k)
1254 	    nverified = k - 1;
1255 
1256 	if (endpts[k] != end) {
1257 	    /* haven't reached end yet, try another iteration if allowed */
1258 	    if ((size_t)k >= max_matches) {
1259 		/* must try to lengthen some previous match */
1260 		k--;
1261 		goto backtrack;
1262 	    }
1263 
1264 	    k++;
1265 	    limit = endpts[k - 1];
1266 	    continue;
1267 	}
1268 
1269 	/*
1270 	 * We've identified a way to divide the string into k sub-matches
1271 	 * that works so far as the child DFA can tell.  If k is an allowed
1272 	 * number of matches, start the slow part: recurse to verify each
1273 	 * sub-match.  We always have k <= max_matches, needn't check that.
1274 	 */
1275 	if (k < min_matches)
1276 	    goto backtrack;
1277 
1278 	MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1279 
1280 	for (i = nverified + 1; i <= k; i++) {
1281 	    zaptreesubs(v, t->left);
1282 	    er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
1283 	    if (er == REG_OKAY) {
1284 		nverified = i;
1285 		continue;
1286 	    }
1287 	    if (er == REG_NOMATCH)
1288 		break;
1289 	    /* oops, something failed */
1290 	    FREE(endpts);
1291 	    return er;
1292 	}
1293 
1294 	if (i > k) {
1295 	    /* satisfaction */
1296 	    MDEBUG(("%d successful\n", t->id));
1297 	    FREE(endpts);
1298 	    return REG_OKAY;
1299 	}
1300 
1301 	/* match failed to verify, so backtrack */
1302 
1303     backtrack:
1304 	/*
1305 	 * Must consider longer versions of the current sub-match.
1306 	 */
1307 	while (k > 0) {
1308 	    if (endpts[k] < end) {
1309 		limit = endpts[k] + 1;
1310 		/* break out of backtrack loop, continue the outer one */
1311 		break;
1312 	    }
1313 	    /* can't lengthen k'th sub-match any more, consider previous one */
1314 	    k--;
1315 	}
1316     }
1317 
1318     /* all possibilities exhausted */
1319     MDEBUG(("%d failed\n", t->id));
1320     FREE(endpts);
1321     return REG_NOMATCH;
1322 }
1323 
1324 #include "rege_dfa.c"
1325 
1326 /*
1327  * Local Variables:
1328  * mode: c
1329  * c-basic-offset: 4
1330  * fill-column: 78
1331  * End:
1332  */
1333