1 /*
2 * re_*exec and friends - match REs
3 *
4 * Copyright © 1998, 1999 Henry Spencer. All rights reserved.
5 *
6 * Development of this software was funded, in part, by Cray Research Inc.,
7 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 * Corporation, none of whom are responsible for the results. The author
9 * thanks all of them.
10 *
11 * Redistribution and use in source and binary forms -- with or without
12 * modification -- are permitted for any purpose, provided that
13 * redistributions in source form retain this entire copyright notice and
14 * indicate the origin and nature of any modifications.
15 *
16 * I'd appreciate being given credit for this package in the documentation of
17 * software which uses it, but that is not a requirement.
18 *
19 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "regguts.h"
32
33 /*
34 * Lazy-DFA representation.
35 */
36
37 struct arcp { /* "pointer" to an outarc */
38 struct sset *ss;
39 color co;
40 };
41
42 struct sset { /* state set */
43 unsigned *states; /* pointer to bitvector */
44 unsigned hash; /* hash of bitvector */
45 #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw))
46 #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
47 memcmp((void*)(bv), (void*)((ss)->states), (nw)*sizeof(unsigned)) == 0))
48 int flags;
49 #define STARTER 01 /* the initial state set */
50 #define POSTSTATE 02 /* includes the goal state */
51 #define LOCKED 04 /* locked in cache */
52 #define NOPROGRESS 010 /* zero-progress state set */
53 struct arcp ins; /* chain of inarcs pointing here */
54 chr *lastseen; /* last entered on arrival here */
55 struct sset **outs; /* outarc vector indexed by color */
56 struct arcp *inchain; /* chain-pointer vector for outarcs */
57 };
58
59 struct dfa {
60 int nssets; /* size of cache */
61 int nssused; /* how many entries occupied yet */
62 int nstates; /* number of states */
63 int ncolors; /* length of outarc and inchain vectors */
64 int wordsper; /* length of state-set bitvectors */
65 struct sset *ssets; /* state-set cache */
66 unsigned *statesarea; /* bitvector storage */
67 unsigned *work; /* pointer to work area within statesarea */
68 struct sset **outsarea; /* outarc-vector storage */
69 struct arcp *incarea; /* inchain storage */
70 struct cnfa *cnfa;
71 struct colormap *cm;
72 chr *lastpost; /* location of last cache-flushed success */
73 chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
74 struct sset *search; /* replacement-search-pointer memory */
75 int cptsmalloced; /* were the areas individually malloced? */
76 char *mallocarea; /* self, or malloced area, or NULL */
77 };
78
79 #define WORK 1 /* number of work bitvectors needed */
80
81 /*
82 * Setup for non-malloc allocation for small cases.
83 */
84
85 #define FEWSTATES 20 /* must be less than UBITS */
86 #define FEWCOLORS 15
87 struct smalldfa {
88 struct dfa dfa;
89 struct sset ssets[FEWSTATES*2];
90 unsigned statesarea[FEWSTATES*2 + WORK];
91 struct sset *outsarea[FEWSTATES*2 * FEWCOLORS];
92 struct arcp incarea[FEWSTATES*2 * FEWCOLORS];
93 };
94
95 /*
96 * Internal variables, bundled for easy passing around.
97 */
98
99 struct vars {
100 regex_t *re;
101 struct guts *g;
102 int eflags; /* copies of arguments */
103 size_t nmatch;
104 regmatch_t *pmatch;
105 rm_detail_t *details;
106 chr *start; /* start of string */
107 chr *stop; /* just past end of string */
108 int err; /* error code if any (0 none) */
109 struct dfa **subdfas; /* per-subre DFAs */
110 struct smalldfa dfa1;
111 struct smalldfa dfa2;
112 };
113 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
114 #define ISERR() VISERR(v)
115 #define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e)))
116 #define ERR(e) VERR(v, e) /* record an error */
117 #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */
118 #define OFF(p) ((p) - v->start)
119 #define LOFF(p) ((long)OFF(p))
120
121 /*
122 * forward declarations
123 */
124 /* =====^!^===== begin forwards =====^!^===== */
125 /* automatically gathered by fwd; do not hand-edit */
126 /* === regexec.c === */
127 int exec(regex_t *, const chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int);
128 static struct dfa *getsubdfa(struct vars *, struct subre *);
129 static int simpleFind(struct vars *const, struct cnfa *const, struct colormap *const);
130 static int complicatedFind(struct vars *const, struct cnfa *const, struct colormap *const);
131 static int complicatedFindLoop(struct vars *const, struct dfa *const, struct dfa *const, chr **const);
132 static void zapallsubs(regmatch_t *const, const size_t);
133 static void zaptreesubs(struct vars *const, struct subre *const);
134 static void subset(struct vars *const, struct subre *const, chr *const, chr *const);
135 static int cdissect(struct vars *, struct subre *, chr *, chr *);
136 static int ccondissect(struct vars *, struct subre *, chr *, chr *);
137 static int crevcondissect(struct vars *, struct subre *, chr *, chr *);
138 static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
139 static int caltdissect(struct vars *, struct subre *, chr *, chr *);
140 static int citerdissect(struct vars *, struct subre *, chr *, chr *);
141 static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
142 /* === rege_dfa.c === */
143 static chr *longest(struct vars *const, struct dfa *const, chr *const, chr *const, int *const);
144 static chr *shortest(struct vars *const, struct dfa *const, chr *const, chr *const, chr *const, chr **const, int *const);
145 static chr *lastCold(struct vars *const, struct dfa *const);
146 static struct dfa *newDFA(struct vars *const, struct cnfa *const, struct colormap *const, struct smalldfa *);
147 static void freeDFA(struct dfa *const);
148 static unsigned hash(unsigned *const, const int);
149 static struct sset *initialize(struct vars *const, struct dfa *const, chr *const);
150 static struct sset *miss(struct vars *const, struct dfa *const, struct sset *const, const pcolor, chr *const, chr *const);
151 static int checkLAConstraint(struct vars *const, struct cnfa *const, chr *const, const pcolor);
152 static struct sset *getVacantSS(struct vars *const, struct dfa *const, chr *const, chr *const);
153 static struct sset *pickNextSS(struct vars *const, struct dfa *const, chr *const, chr *const);
154 /* automatically gathered by fwd; do not hand-edit */
155 /* =====^!^===== end forwards =====^!^===== */
156
157 /*
158 - exec - match regular expression
159 ^ int exec(regex_t *, const chr *, size_t, rm_detail_t *,
160 ^ size_t, regmatch_t [], int);
161 */
162 int
exec(regex_t * re,const chr * string,size_t len,rm_detail_t * details,size_t nmatch,regmatch_t pmatch[],int flags)163 exec(
164 regex_t *re,
165 const chr *string,
166 size_t len,
167 rm_detail_t *details,
168 size_t nmatch,
169 regmatch_t pmatch[],
170 int flags)
171 {
172 AllocVars(v);
173 int st, backref;
174 int n;
175 int i;
176 #define LOCALMAT 20
177 regmatch_t mat[LOCALMAT];
178 #define LOCALDFAS 40
179 struct dfa *subdfas[LOCALDFAS];
180
181 /*
182 * Sanity checks.
183 */
184
185 if (re == NULL || string == NULL || re->re_magic != REMAGIC) {
186 FreeVars(v);
187 return REG_INVARG;
188 }
189 if (re->re_csize != sizeof(chr)) {
190 FreeVars(v);
191 return REG_MIXED;
192 }
193
194 /*
195 * Setup.
196 */
197
198 v->re = re;
199 v->g = (struct guts *)re->re_guts;
200 if ((v->g->cflags®_EXPECT) && details == NULL) {
201 FreeVars(v);
202 return REG_INVARG;
203 }
204 if (v->g->info®_UIMPOSSIBLE) {
205 FreeVars(v);
206 return REG_NOMATCH;
207 }
208 backref = (v->g->info®_UBACKREF) ? 1 : 0;
209 v->eflags = flags;
210 if (v->g->cflags®_NOSUB) {
211 nmatch = 0; /* override client */
212 }
213 v->nmatch = nmatch;
214 if (backref) {
215 /*
216 * Need work area.
217 */
218
219 if (v->g->nsub + 1 <= LOCALMAT) {
220 v->pmatch = mat;
221 } else {
222 v->pmatch = (regmatch_t *)
223 MALLOC((v->g->nsub + 1) * sizeof(regmatch_t));
224 }
225 if (v->pmatch == NULL) {
226 FreeVars(v);
227 return REG_ESPACE;
228 }
229 v->nmatch = v->g->nsub + 1;
230 } else {
231 v->pmatch = pmatch;
232 }
233 v->details = details;
234 v->start = (chr *)string;
235 v->stop = (chr *)string + len;
236 v->err = 0;
237 assert(v->g->ntree >= 0);
238 n = v->g->ntree;
239 if (n <= LOCALDFAS)
240 v->subdfas = subdfas;
241 else
242 v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
243 if (v->subdfas == NULL) {
244 if (v->pmatch != pmatch && v->pmatch != mat)
245 FREE(v->pmatch);
246 FreeVars(v);
247 return REG_ESPACE;
248 }
249 for (i = 0; i < n; i++)
250 v->subdfas[i] = NULL;
251
252 /*
253 * Do it.
254 */
255
256 assert(v->g->tree != NULL);
257 if (backref) {
258 st = complicatedFind(v, &v->g->tree->cnfa, &v->g->cmap);
259 } else {
260 st = simpleFind(v, &v->g->tree->cnfa, &v->g->cmap);
261 }
262
263 /*
264 * Copy (portion of) match vector over if necessary.
265 */
266
267 if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
268 zapallsubs(pmatch, nmatch);
269 n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
270 memcpy((void*)(pmatch), (void*)(v->pmatch), n*sizeof(regmatch_t));
271 }
272
273 /*
274 * Clean up.
275 */
276
277 if (v->pmatch != pmatch && v->pmatch != mat) {
278 FREE(v->pmatch);
279 }
280 n = v->g->ntree;
281 for (i = 0; i < n; i++) {
282 if (v->subdfas[i] != NULL)
283 freeDFA(v->subdfas[i]);
284 }
285 if (v->subdfas != subdfas)
286 FREE(v->subdfas);
287 FreeVars(v);
288 return st;
289 }
290
291 /*
292 - getsubdfa - create or re-fetch the DFA for a subre node
293 * We only need to create the DFA once per overall regex execution.
294 * The DFA will be freed by the cleanup step in exec().
295 */
296 static struct dfa *
getsubdfa(struct vars * v,struct subre * t)297 getsubdfa(struct vars * v,
298 struct subre * t)
299 {
300 if (v->subdfas[t->id] == NULL) {
301 v->subdfas[t->id] = newDFA(v, &t->cnfa, &v->g->cmap, NULL);
302 if (ISERR())
303 return NULL;
304 }
305 return v->subdfas[t->id];
306 }
307
308 /*
309 - simpleFind - find a match for the main NFA (no-complications case)
310 ^ static int simpleFind(struct vars *, struct cnfa *, struct colormap *);
311 */
312 static int
simpleFind(struct vars * const v,struct cnfa * const cnfa,struct colormap * const cm)313 simpleFind(
314 struct vars *const v,
315 struct cnfa *const cnfa,
316 struct colormap *const cm)
317 {
318 struct dfa *s, *d;
319 chr *begin, *end = NULL;
320 chr *cold;
321 chr *open, *close; /* Open and close of range of possible
322 * starts */
323 int hitend;
324 int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0;
325
326 /*
327 * First, a shot with the search RE.
328 */
329
330 s = newDFA(v, &v->g->search, cm, &v->dfa1);
331 assert(!(ISERR() && s != NULL));
332 NOERR();
333 MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
334 cold = NULL;
335 close = shortest(v, s, v->start, v->start, v->stop, &cold, NULL);
336 freeDFA(s);
337 NOERR();
338 if (v->g->cflags®_EXPECT) {
339 assert(v->details != NULL);
340 if (cold != NULL) {
341 v->details->rm_extend.rm_so = OFF(cold);
342 } else {
343 v->details->rm_extend.rm_so = OFF(v->stop);
344 }
345 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
346 }
347 if (close == NULL) { /* not found */
348 return REG_NOMATCH;
349 }
350 if (v->nmatch == 0) { /* found, don't need exact location */
351 return REG_OKAY;
352 }
353
354 /*
355 * Find starting point and match.
356 */
357
358 assert(cold != NULL);
359 open = cold;
360 cold = NULL;
361 MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
362 d = newDFA(v, cnfa, cm, &v->dfa1);
363 assert(!(ISERR() && d != NULL));
364 NOERR();
365 for (begin = open; begin <= close; begin++) {
366 MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
367 if (shorter) {
368 end = shortest(v, d, begin, begin, v->stop, NULL, &hitend);
369 } else {
370 end = longest(v, d, begin, v->stop, &hitend);
371 }
372 if (ISERR()) {
373 freeDFA(d);
374 return v->err;
375 }
376 if (hitend && cold == NULL) {
377 cold = begin;
378 }
379 if (end != NULL) {
380 break; /* NOTE BREAK OUT */
381 }
382 }
383 assert(end != NULL); /* search RE succeeded so loop should */
384 freeDFA(d);
385
386 /*
387 * And pin down details.
388 */
389
390 assert(v->nmatch > 0);
391 v->pmatch[0].rm_so = OFF(begin);
392 v->pmatch[0].rm_eo = OFF(end);
393 if (v->g->cflags®_EXPECT) {
394 if (cold != NULL) {
395 v->details->rm_extend.rm_so = OFF(cold);
396 } else {
397 v->details->rm_extend.rm_so = OFF(v->stop);
398 }
399 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
400 }
401 if (v->nmatch == 1) { /* no need for submatches */
402 return REG_OKAY;
403 }
404
405 /*
406 * Find submatches.
407 */
408
409 zapallsubs(v->pmatch, v->nmatch);
410 return cdissect(v, v->g->tree, begin, end);
411 }
412
413 /*
414 - complicatedFind - find a match for the main NFA (with complications)
415 ^ static int complicatedFind(struct vars *, struct cnfa *, struct colormap *);
416 */
417 static int
complicatedFind(struct vars * const v,struct cnfa * const cnfa,struct colormap * const cm)418 complicatedFind(
419 struct vars *const v,
420 struct cnfa *const cnfa,
421 struct colormap *const cm)
422 {
423 struct dfa *s, *d;
424 chr *cold = NULL; /* silence gcc 4 warning */
425 int ret;
426
427 s = newDFA(v, &v->g->search, cm, &v->dfa1);
428 NOERR();
429 d = newDFA(v, cnfa, cm, &v->dfa2);
430 if (ISERR()) {
431 assert(d == NULL);
432 freeDFA(s);
433 return v->err;
434 }
435
436 ret = complicatedFindLoop(v, d, s, &cold);
437
438 freeDFA(d);
439 freeDFA(s);
440 NOERR();
441 if (v->g->cflags®_EXPECT) {
442 assert(v->details != NULL);
443 if (cold != NULL) {
444 v->details->rm_extend.rm_so = OFF(cold);
445 } else {
446 v->details->rm_extend.rm_so = OFF(v->stop);
447 }
448 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
449 }
450 return ret;
451 }
452
453 /*
454 - complicatedFindLoop - the heart of complicatedFind
455 ^ static int complicatedFindLoop(struct vars *,
456 ^ struct dfa *, struct dfa *, chr **);
457 */
458 static int
complicatedFindLoop(struct vars * const v,struct dfa * const d,struct dfa * const s,chr ** const coldp)459 complicatedFindLoop(
460 struct vars *const v,
461 struct dfa *const d,
462 struct dfa *const s,
463 chr **const coldp) /* where to put coldstart pointer */
464 {
465 chr *begin, *end;
466 chr *cold;
467 chr *open, *close; /* Open and close of range of possible
468 * starts */
469 chr *estart, *estop;
470 int er, hitend;
471 int shorter = v->g->tree->flags&SHORTER;
472
473 assert(d != NULL && s != NULL);
474 cold = NULL;
475 close = v->start;
476 do {
477 MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
478 close = shortest(v, s, close, close, v->stop, &cold, NULL);
479 if (close == NULL) {
480 break; /* NOTE BREAK */
481 }
482 assert(cold != NULL);
483 open = cold;
484 cold = NULL;
485 MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
486 for (begin = open; begin <= close; begin++) {
487 MDEBUG(("\ncomplicatedFind trying at %ld\n", LOFF(begin)));
488 estart = begin;
489 estop = v->stop;
490 for (;;) {
491 if (shorter) {
492 end = shortest(v, d, begin, estart, estop, NULL, &hitend);
493 } else {
494 end = longest(v, d, begin, estop, &hitend);
495 }
496 if (hitend && cold == NULL) {
497 cold = begin;
498 }
499 if (end == NULL) {
500 break; /* NOTE BREAK OUT */
501 }
502
503 MDEBUG(("tentative end %ld\n", LOFF(end)));
504 zapallsubs(v->pmatch, v->nmatch);
505 er = cdissect(v, v->g->tree, begin, end);
506 if (er == REG_OKAY) {
507 if (v->nmatch > 0) {
508 v->pmatch[0].rm_so = OFF(begin);
509 v->pmatch[0].rm_eo = OFF(end);
510 }
511 *coldp = cold;
512 return REG_OKAY;
513 }
514 if (er != REG_NOMATCH) {
515 ERR(er);
516 *coldp = cold;
517 return er;
518 }
519 if ((shorter) ? end == estop : end == begin) {
520 break;
521 }
522
523 /*
524 * Go around and try again
525 */
526
527 if (shorter) {
528 estart = end + 1;
529 } else {
530 estop = end - 1;
531 }
532 }
533 }
534 } while (close < v->stop);
535
536 *coldp = cold;
537 return REG_NOMATCH;
538 }
539
540 /*
541 - zapallsubs - initialize all subexpression matches to "no match"
542 ^ static void zapallsubs(regmatch_t *, size_t);
543 */
544 static void
zapallsubs(regmatch_t * const p,const size_t n)545 zapallsubs(
546 regmatch_t *const p,
547 const size_t n)
548 {
549 size_t i;
550
551 for (i = n-1; i > 0; i--) {
552 p[i].rm_so = -1;
553 p[i].rm_eo = -1;
554 }
555 }
556
557 /*
558 - zaptreesubs - initialize subexpressions within subtree to "no match"
559 ^ static void zaptreesubs(struct vars *, struct subre *);
560 */
561 static void
zaptreesubs(struct vars * const v,struct subre * const t)562 zaptreesubs(
563 struct vars *const v,
564 struct subre *const t)
565 {
566 if (t->op == '(') {
567 int n = t->subno;
568 assert(n > 0);
569 if ((size_t) n < v->nmatch) {
570 v->pmatch[n].rm_so = -1;
571 v->pmatch[n].rm_eo = -1;
572 }
573 }
574
575 if (t->left != NULL) {
576 zaptreesubs(v, t->left);
577 }
578 if (t->right != NULL) {
579 zaptreesubs(v, t->right);
580 }
581 }
582
583 /*
584 - subset - set subexpression match data for a successful subre
585 ^ static void subset(struct vars *, struct subre *, chr *, chr *);
586 */
587 static void
subset(struct vars * const v,struct subre * const sub,chr * const begin,chr * const end)588 subset(
589 struct vars *const v,
590 struct subre *const sub,
591 chr *const begin,
592 chr *const end)
593 {
594 int n = sub->subno;
595
596 assert(n > 0);
597 if ((size_t)n >= v->nmatch) {
598 return;
599 }
600
601 MDEBUG(("setting %d\n", n));
602 v->pmatch[n].rm_so = OFF(begin);
603 v->pmatch[n].rm_eo = OFF(end);
604 }
605
606 /*
607 - cdissect - check backrefs and determine subexpression matches
608 * cdissect recursively processes a subre tree to check matching of backrefs
609 * and/or identify submatch boundaries for capture nodes. The proposed match
610 * runs from "begin" to "end" (not including "end"), and we are basically
611 * "dissecting" it to see where the submatches are.
612 * Before calling any level of cdissect, the caller must have run the node's
613 * DFA and found that the proposed substring satisfies the DFA. (We make
614 * the caller do that because in concatenation and iteration nodes, it's
615 * much faster to check all the substrings against the child DFAs before we
616 * recurse.) Also, caller must have cleared subexpression match data via
617 * zaptreesubs (or zapallsubs at the top level).
618 ^ static int cdissect(struct vars *, struct subre *, chr *, chr *);
619 */
620 static int /* regexec return code */
cdissect(struct vars * v,struct subre * t,chr * begin,chr * end)621 cdissect(
622 struct vars *v,
623 struct subre *t,
624 chr *begin, /* beginning of relevant substring */
625 chr *end) /* end of same */
626 {
627 int er;
628
629 assert(t != NULL);
630 MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
631
632 switch (t->op) {
633 case '=': /* terminal node */
634 assert(t->left == NULL && t->right == NULL);
635 er = REG_OKAY; /* no action, parent did the work */
636 break;
637 case 'b': /* back reference */
638 assert(t->left == NULL && t->right == NULL);
639 er = cbrdissect(v, t, begin, end);
640 break;
641 case '.': /* concatenation */
642 assert(t->left != NULL && t->right != NULL);
643 if (t->left->flags & SHORTER) /* reverse scan */
644 er = crevcondissect(v, t, begin, end);
645 else
646 er = ccondissect(v, t, begin, end);
647 break;
648 case '|': /* alternation */
649 assert(t->left != NULL);
650 er = caltdissect(v, t, begin, end);
651 break;
652 case '*': /* iteration */
653 assert(t->left != NULL);
654 if (t->left->flags & SHORTER) /* reverse scan */
655 er = creviterdissect(v, t, begin, end);
656 else
657 er = citerdissect(v, t, begin, end);
658 break;
659 case '(': /* capturing */
660 assert(t->left != NULL && t->right == NULL);
661 assert(t->subno > 0);
662 er = cdissect(v, t->left, begin, end);
663 if (er == REG_OKAY) {
664 subset(v, t, begin, end);
665 }
666 break;
667 default:
668 er = REG_ASSERT;
669 break;
670 }
671
672 /*
673 * We should never have a match failure unless backrefs lurk below;
674 * otherwise, either caller failed to check the DFA, or there's some
675 * inconsistency between the DFA and the node's innards.
676 */
677 assert(er != REG_NOMATCH || (t->flags & BACKR));
678
679 return er;
680 }
681
682 /*
683 - ccondissect - dissect match for concatenation node
684 ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *);
685 */
686 static int /* regexec return code */
ccondissect(struct vars * v,struct subre * t,chr * begin,chr * end)687 ccondissect(
688 struct vars *v,
689 struct subre *t,
690 chr *begin, /* beginning of relevant substring */
691 chr *end) /* end of same */
692 {
693 struct dfa *d, *d2;
694 chr *mid;
695
696 assert(t->op == '.');
697 assert(t->left != NULL && t->left->cnfa.nstates > 0);
698 assert(t->right != NULL && t->right->cnfa.nstates > 0);
699 assert(!(t->left->flags & SHORTER));
700
701 d = getsubdfa(v, t->left);
702 NOERR();
703 d2 = getsubdfa(v, t->right);
704 NOERR();
705
706 MDEBUG(("cConcat %d\n", t->id));
707
708 /*
709 * Pick a tentative midpoint.
710 */
711 mid = longest(v, d, begin, end, (int *) NULL);
712 if (mid == NULL) {
713 return REG_NOMATCH;
714 }
715 MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
716
717 /*
718 * Iterate until satisfaction or failure.
719 */
720
721 for (;;) {
722 /*
723 * Try this midpoint on for size.
724 */
725
726 if (longest(v, d2, mid, end, NULL) == end) {
727 int er = cdissect(v, t->left, begin, mid);
728
729 if (er == REG_OKAY) {
730 er = cdissect(v, t->right, mid, end);
731 if (er == REG_OKAY) {
732 /*
733 * Satisfaction.
734 */
735
736 MDEBUG(("successful\n"));
737 return REG_OKAY;
738 }
739 }
740 if (er != REG_NOMATCH) {
741 return er;
742 }
743 }
744
745 /*
746 * That midpoint didn't work, find a new one.
747 */
748
749 if (mid == begin) {
750 /*
751 * All possibilities exhausted.
752 */
753
754 MDEBUG(("%d no midpoint\n", t->id));
755 return REG_NOMATCH;
756 }
757 mid = longest(v, d, begin, mid-1, NULL);
758 if (mid == NULL) {
759 /*
760 * Failed to find a new one.
761 */
762
763 MDEBUG(("%d failed midpoint\n", t->id));
764 return REG_NOMATCH;
765 }
766 MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
767 zaptreesubs(v, t->left);
768 zaptreesubs(v, t->right);
769 }
770 }
771
772 /*
773 - crevcondissect - dissect match for concatenation node, shortest-first
774 ^ static int crevcondissect(struct vars *, struct subre *, chr *, chr *);
775 */
776 static int /* regexec return code */
crevcondissect(struct vars * v,struct subre * t,chr * begin,chr * end)777 crevcondissect(
778 struct vars *v,
779 struct subre *t,
780 chr *begin, /* beginning of relevant substring */
781 chr *end) /* end of same */
782 {
783 struct dfa *d, *d2;
784 chr *mid;
785
786 assert(t->op == '.');
787 assert(t->left != NULL && t->left->cnfa.nstates > 0);
788 assert(t->right != NULL && t->right->cnfa.nstates > 0);
789 assert(t->left->flags&SHORTER);
790
791 d = getsubdfa(v, t->left);
792 NOERR();
793 d2 = getsubdfa(v, t->right);
794 NOERR();
795
796 MDEBUG(("crevcon %d\n", t->id));
797
798 /*
799 * Pick a tentative midpoint.
800 */
801
802 mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
803 if (mid == NULL) {
804 return REG_NOMATCH;
805 }
806 MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
807
808 /*
809 * Iterate until satisfaction or failure.
810 */
811
812 for (;;) {
813 /*
814 * Try this midpoint on for size.
815 */
816
817 if (longest(v, d2, mid, end, NULL) == end) {
818 int er = cdissect(v, t->left, begin, mid);
819
820 if (er == REG_OKAY) {
821 er = cdissect(v, t->right, mid, end);
822 if (er == REG_OKAY) {
823 /*
824 * Satisfaction.
825 */
826
827 MDEBUG(("successful\n"));
828 return REG_OKAY;
829 }
830 }
831 if (er != REG_NOMATCH) {
832 return er;
833 }
834 }
835
836 /*
837 * That midpoint didn't work, find a new one.
838 */
839
840 if (mid == end) {
841 /*
842 * All possibilities exhausted.
843 */
844
845 MDEBUG(("%d no midpoint\n", t->id));
846 return REG_NOMATCH;
847 }
848 mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
849 if (mid == NULL) {
850 /*
851 * Failed to find a new one.
852 */
853
854 MDEBUG(("%d failed midpoint\n", t->id));
855 return REG_NOMATCH;
856 }
857 MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
858 zaptreesubs(v, t->left);
859 zaptreesubs(v, t->right);
860 }
861 }
862
863 /*
864 - cbrdissect - dissect match for backref node
865 ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
866 */
867 static int /* regexec return code */
cbrdissect(struct vars * v,struct subre * t,chr * begin,chr * end)868 cbrdissect(
869 struct vars *v,
870 struct subre *t,
871 chr *begin, /* beginning of relevant substring */
872 chr *end) /* end of same */
873 {
874 int n = t->subno, min = t->min, max = t->max;
875 size_t numreps;
876 size_t tlen;
877 size_t brlen;
878 chr *brstring;
879 chr *p;
880
881 assert(t != NULL);
882 assert(t->op == 'b');
883 assert(n >= 0);
884 assert((size_t)n < v->nmatch);
885
886 MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));
887
888 /* get the backreferenced string */
889 if (v->pmatch[n].rm_so == TCL_INDEX_NONE) {
890 return REG_NOMATCH;
891 }
892 brstring = v->start + v->pmatch[n].rm_so;
893 brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
894
895 /* special cases for zero-length strings */
896 if (brlen == 0) {
897 /*
898 * matches only if target is zero length, but any number of
899 * repetitions can be considered to be present
900 */
901 if (begin == end && min <= max) {
902 MDEBUG(("cbackref matched trivially\n"));
903 return REG_OKAY;
904 }
905 return REG_NOMATCH;
906 }
907 if (begin == end) {
908 /* matches only if zero repetitions are okay */
909 if (min == 0) {
910 MDEBUG(("cbackref matched trivially\n"));
911 return REG_OKAY;
912 }
913 return REG_NOMATCH;
914 }
915
916 /*
917 * check target length to see if it could possibly be an allowed number of
918 * repetitions of brstring
919 */
920
921 assert(end > begin);
922 tlen = end - begin;
923 if (tlen % brlen != 0)
924 return REG_NOMATCH;
925 numreps = tlen / brlen;
926 if (numreps < (size_t)min || (numreps > (size_t)max && max != DUPINF))
927 return REG_NOMATCH;
928
929 /* okay, compare the actual string contents */
930 p = begin;
931 while (numreps-- > 0) {
932 if ((*v->g->compare) (brstring, p, brlen) != 0)
933 return REG_NOMATCH;
934 p += brlen;
935 }
936
937 MDEBUG(("cbackref matched\n"));
938 return REG_OKAY;
939 }
940
941 /*
942 - caltdissect - dissect match for alternation node
943 ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *);
944 */
945 static int /* regexec return code */
caltdissect(struct vars * v,struct subre * t,chr * begin,chr * end)946 caltdissect(
947 struct vars *v,
948 struct subre *t,
949 chr *begin, /* beginning of relevant substring */
950 chr *end) /* end of same */
951 {
952 struct dfa *d;
953 int er;
954
955 /* We loop, rather than tail-recurse, to handle a chain of alternatives */
956 while (t != NULL) {
957 assert(t->op == '|');
958 assert(t->left != NULL && t->left->cnfa.nstates > 0);
959
960 MDEBUG(("calt n%d\n", t->id));
961
962 d = getsubdfa(v, t->left);
963 NOERR();
964 if (longest(v, d, begin, end, (int *) NULL) == end) {
965 MDEBUG(("calt matched\n"));
966 er = cdissect(v, t->left, begin, end);
967 if (er != REG_NOMATCH) {
968 return er;
969 }
970 }
971
972 t = t->right;
973 }
974
975 return REG_NOMATCH;
976 }
977
978 /*
979 - citerdissect - dissect match for iteration node
980 ^ static int citerdissect(struct vars *, struct subre *, chr *, chr *);
981 */
982 static int /* regexec return code */
citerdissect(struct vars * v,struct subre * t,chr * begin,chr * end)983 citerdissect(struct vars * v,
984 struct subre * t,
985 chr *begin, /* beginning of relevant substring */
986 chr *end) /* end of same */
987 {
988 struct dfa *d;
989 chr **endpts;
990 chr *limit;
991 int min_matches;
992 size_t max_matches;
993 int nverified;
994 int k;
995 int i;
996 int er;
997
998 assert(t->op == '*');
999 assert(t->left != NULL && t->left->cnfa.nstates > 0);
1000 assert(!(t->left->flags & SHORTER));
1001 assert(begin <= end);
1002
1003 /*
1004 * If zero matches are allowed, and target string is empty, just declare
1005 * victory. OTOH, if target string isn't empty, zero matches can't work
1006 * so we pretend the min is 1.
1007 */
1008 min_matches = t->min;
1009 if (min_matches <= 0) {
1010 if (begin == end)
1011 return REG_OKAY;
1012 min_matches = 1;
1013 }
1014
1015 /*
1016 * We need workspace to track the endpoints of each sub-match. Normally
1017 * we consider only nonzero-length sub-matches, so there can be at most
1018 * end-begin of them. However, if min is larger than that, we will also
1019 * consider zero-length sub-matches in order to find enough matches.
1020 *
1021 * For convenience, endpts[0] contains the "begin" pointer and we store
1022 * sub-match endpoints in endpts[1..max_matches].
1023 */
1024 max_matches = end - begin;
1025 if (max_matches > (size_t)t->max && t->max != DUPINF)
1026 max_matches = t->max;
1027 if (max_matches < (size_t)min_matches)
1028 max_matches = min_matches;
1029 endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1030 if (endpts == NULL)
1031 return REG_ESPACE;
1032 endpts[0] = begin;
1033
1034 d = getsubdfa(v, t->left);
1035 if (ISERR()) {
1036 FREE(endpts);
1037 return v->err;
1038 }
1039 MDEBUG(("citer %d\n", t->id));
1040
1041 /*
1042 * Our strategy is to first find a set of sub-match endpoints that are
1043 * valid according to the child node's DFA, and then recursively dissect
1044 * each sub-match to confirm validity. If any validity check fails,
1045 * backtrack the last sub-match and try again. And, when we next try for
1046 * a validity check, we need not recheck any successfully verified
1047 * sub-matches that we didn't move the endpoints of. nverified remembers
1048 * how many sub-matches are currently known okay.
1049 */
1050
1051 /* initialize to consider first sub-match */
1052 nverified = 0;
1053 k = 1;
1054 limit = end;
1055
1056 /* iterate until satisfaction or failure */
1057 while (k > 0) {
1058 /* try to find an endpoint for the k'th sub-match */
1059 endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
1060 if (endpts[k] == NULL) {
1061 /* no match possible, so see if we can shorten previous one */
1062 k--;
1063 goto backtrack;
1064 }
1065 MDEBUG(("%d: working endpoint %d: %ld\n",
1066 t->id, k, LOFF(endpts[k])));
1067
1068 /* k'th sub-match can no longer be considered verified */
1069 if (nverified >= k)
1070 nverified = k - 1;
1071
1072 if (endpts[k] != end) {
1073 /* haven't reached end yet, try another iteration if allowed */
1074 if ((size_t)k >= max_matches) {
1075 /* must try to shorten some previous match */
1076 k--;
1077 goto backtrack;
1078 }
1079
1080 /* reject zero-length match unless necessary to achieve min */
1081 if (endpts[k] == endpts[k - 1] &&
1082 (k >= min_matches || min_matches - k < end - endpts[k]))
1083 goto backtrack;
1084
1085 k++;
1086 limit = end;
1087 continue;
1088 }
1089
1090 /*
1091 * We've identified a way to divide the string into k sub-matches
1092 * that works so far as the child DFA can tell. If k is an allowed
1093 * number of matches, start the slow part: recurse to verify each
1094 * sub-match. We always have k <= max_matches, needn't check that.
1095 */
1096 if (k < min_matches)
1097 goto backtrack;
1098
1099 MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1100
1101 for (i = nverified + 1; i <= k; i++) {
1102 zaptreesubs(v, t->left);
1103 er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
1104 if (er == REG_OKAY) {
1105 nverified = i;
1106 continue;
1107 }
1108 if (er == REG_NOMATCH)
1109 break;
1110 /* oops, something failed */
1111 FREE(endpts);
1112 return er;
1113 }
1114
1115 if (i > k) {
1116 /* satisfaction */
1117 MDEBUG(("%d successful\n", t->id));
1118 FREE(endpts);
1119 return REG_OKAY;
1120 }
1121
1122 /* match failed to verify, so backtrack */
1123
1124 backtrack:
1125 /*
1126 * Must consider shorter versions of the current sub-match. However,
1127 * we'll only ask for a zero-length match if necessary.
1128 */
1129 while (k > 0) {
1130 chr *prev_end = endpts[k - 1];
1131
1132 if (endpts[k] > prev_end) {
1133 limit = endpts[k] - 1;
1134 if (limit > prev_end ||
1135 (k < min_matches && min_matches - k >= end - prev_end)) {
1136 /* break out of backtrack loop, continue the outer one */
1137 break;
1138 }
1139 }
1140 /* can't shorten k'th sub-match any more, consider previous one */
1141 k--;
1142 }
1143 }
1144
1145 /* all possibilities exhausted */
1146 MDEBUG(("%d failed\n", t->id));
1147 FREE(endpts);
1148 return REG_NOMATCH;
1149 }
1150
1151 /*
1152 - creviterdissect - dissect match for iteration node, shortest-first
1153 ^ static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
1154 */
1155 static int /* regexec return code */
creviterdissect(struct vars * v,struct subre * t,chr * begin,chr * end)1156 creviterdissect(struct vars * v,
1157 struct subre * t,
1158 chr *begin, /* beginning of relevant substring */
1159 chr *end) /* end of same */
1160 {
1161 struct dfa *d;
1162 chr **endpts;
1163 chr *limit;
1164 int min_matches;
1165 size_t max_matches;
1166 int nverified;
1167 int k;
1168 int i;
1169 int er;
1170
1171 assert(t->op == '*');
1172 assert(t->left != NULL && t->left->cnfa.nstates > 0);
1173 assert(t->left->flags & SHORTER);
1174 assert(begin <= end);
1175
1176 /*
1177 * If zero matches are allowed, and target string is empty, just declare
1178 * victory. OTOH, if target string isn't empty, zero matches can't work
1179 * so we pretend the min is 1.
1180 */
1181 min_matches = t->min;
1182 if (min_matches <= 0) {
1183 if (begin == end)
1184 return REG_OKAY;
1185 min_matches = 1;
1186 }
1187
1188 /*
1189 * We need workspace to track the endpoints of each sub-match. Normally
1190 * we consider only nonzero-length sub-matches, so there can be at most
1191 * end-begin of them. However, if min is larger than that, we will also
1192 * consider zero-length sub-matches in order to find enough matches.
1193 *
1194 * For convenience, endpts[0] contains the "begin" pointer and we store
1195 * sub-match endpoints in endpts[1..max_matches].
1196 */
1197 max_matches = end - begin;
1198 if (max_matches > (size_t)t->max && t->max != DUPINF)
1199 max_matches = t->max;
1200 if (max_matches < (size_t)min_matches)
1201 max_matches = min_matches;
1202 endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1203 if (endpts == NULL)
1204 return REG_ESPACE;
1205 endpts[0] = begin;
1206
1207 d = getsubdfa(v, t->left);
1208 if (ISERR()) {
1209 FREE(endpts);
1210 return v->err;
1211 }
1212 MDEBUG(("creviter %d\n", t->id));
1213
1214 /*
1215 * Our strategy is to first find a set of sub-match endpoints that are
1216 * valid according to the child node's DFA, and then recursively dissect
1217 * each sub-match to confirm validity. If any validity check fails,
1218 * backtrack the last sub-match and try again. And, when we next try for
1219 * a validity check, we need not recheck any successfully verified
1220 * sub-matches that we didn't move the endpoints of. nverified remembers
1221 * how many sub-matches are currently known okay.
1222 */
1223
1224 /* initialize to consider first sub-match */
1225 nverified = 0;
1226 k = 1;
1227 limit = begin;
1228
1229 /* iterate until satisfaction or failure */
1230 while (k > 0) {
1231 /* disallow zero-length match unless necessary to achieve min */
1232 if (limit == endpts[k - 1] &&
1233 limit != end &&
1234 (k >= min_matches || min_matches - k < end - limit))
1235 limit++;
1236
1237 /* if this is the last allowed sub-match, it must reach to the end */
1238 if ((size_t)k >= max_matches)
1239 limit = end;
1240
1241 /* try to find an endpoint for the k'th sub-match */
1242 endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
1243 (chr **) NULL, (int *) NULL);
1244 if (endpts[k] == NULL) {
1245 /* no match possible, so see if we can lengthen previous one */
1246 k--;
1247 goto backtrack;
1248 }
1249 MDEBUG(("%d: working endpoint %d: %ld\n",
1250 t->id, k, LOFF(endpts[k])));
1251
1252 /* k'th sub-match can no longer be considered verified */
1253 if (nverified >= k)
1254 nverified = k - 1;
1255
1256 if (endpts[k] != end) {
1257 /* haven't reached end yet, try another iteration if allowed */
1258 if ((size_t)k >= max_matches) {
1259 /* must try to lengthen some previous match */
1260 k--;
1261 goto backtrack;
1262 }
1263
1264 k++;
1265 limit = endpts[k - 1];
1266 continue;
1267 }
1268
1269 /*
1270 * We've identified a way to divide the string into k sub-matches
1271 * that works so far as the child DFA can tell. If k is an allowed
1272 * number of matches, start the slow part: recurse to verify each
1273 * sub-match. We always have k <= max_matches, needn't check that.
1274 */
1275 if (k < min_matches)
1276 goto backtrack;
1277
1278 MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1279
1280 for (i = nverified + 1; i <= k; i++) {
1281 zaptreesubs(v, t->left);
1282 er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
1283 if (er == REG_OKAY) {
1284 nverified = i;
1285 continue;
1286 }
1287 if (er == REG_NOMATCH)
1288 break;
1289 /* oops, something failed */
1290 FREE(endpts);
1291 return er;
1292 }
1293
1294 if (i > k) {
1295 /* satisfaction */
1296 MDEBUG(("%d successful\n", t->id));
1297 FREE(endpts);
1298 return REG_OKAY;
1299 }
1300
1301 /* match failed to verify, so backtrack */
1302
1303 backtrack:
1304 /*
1305 * Must consider longer versions of the current sub-match.
1306 */
1307 while (k > 0) {
1308 if (endpts[k] < end) {
1309 limit = endpts[k] + 1;
1310 /* break out of backtrack loop, continue the outer one */
1311 break;
1312 }
1313 /* can't lengthen k'th sub-match any more, consider previous one */
1314 k--;
1315 }
1316 }
1317
1318 /* all possibilities exhausted */
1319 MDEBUG(("%d failed\n", t->id));
1320 FREE(endpts);
1321 return REG_NOMATCH;
1322 }
1323
1324 #include "rege_dfa.c"
1325
1326 /*
1327 * Local Variables:
1328 * mode: c
1329 * c-basic-offset: 4
1330 * fill-column: 78
1331 * End:
1332 */
1333