1 /*
2 * re_*exec and friends - match REs
3 *
4 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
5 *
6 * Development of this software was funded, in part, by Cray Research Inc.,
7 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 * Corporation, none of whom are responsible for the results. The author
9 * thanks all of them.
10 *
11 * Redistribution and use in source and binary forms -- with or without
12 * modification -- are permitted for any purpose, provided that
13 * redistributions in source form retain this entire copyright notice and
14 * indicate the origin and nature of any modifications.
15 *
16 * I'd appreciate being given credit for this package in the documentation of
17 * software which uses it, but that is not a requirement.
18 *
19 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "regguts.h"
32
33 /*
34 * Lazy-DFA representation.
35 */
36
37 struct arcp { /* "pointer" to an outarc */
38 struct sset *ss;
39 color co;
40 };
41
42 struct sset { /* state set */
43 unsigned *states; /* pointer to bitvector */
44 unsigned hash; /* hash of bitvector */
45 #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw))
46 #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
47 memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
48 int flags;
49 #define STARTER 01 /* the initial state set */
50 #define POSTSTATE 02 /* includes the goal state */
51 #define LOCKED 04 /* locked in cache */
52 #define NOPROGRESS 010 /* zero-progress state set */
53 struct arcp ins; /* chain of inarcs pointing here */
54 chr *lastseen; /* last entered on arrival here */
55 struct sset **outs; /* outarc vector indexed by color */
56 struct arcp *inchain; /* chain-pointer vector for outarcs */
57 };
58
59 struct dfa {
60 int nssets; /* size of cache */
61 int nssused; /* how many entries occupied yet */
62 int nstates; /* number of states */
63 int ncolors; /* length of outarc and inchain vectors */
64 int wordsper; /* length of state-set bitvectors */
65 struct sset *ssets; /* state-set cache */
66 unsigned *statesarea; /* bitvector storage */
67 unsigned *work; /* pointer to work area within statesarea */
68 struct sset **outsarea; /* outarc-vector storage */
69 struct arcp *incarea; /* inchain storage */
70 struct cnfa *cnfa;
71 struct colormap *cm;
72 chr *lastpost; /* location of last cache-flushed success */
73 chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
74 struct sset *search; /* replacement-search-pointer memory */
75 int cptsmalloced; /* were the areas individually malloced? */
76 char *mallocarea; /* self, or master malloced area, or NULL */
77 };
78
79 #define WORK 1 /* number of work bitvectors needed */
80
81 /*
82 * Setup for non-malloc allocation for small cases.
83 */
84
85 #define FEWSTATES 20 /* must be less than UBITS */
86 #define FEWCOLORS 15
87 struct smalldfa {
88 struct dfa dfa;
89 struct sset ssets[FEWSTATES*2];
90 unsigned statesarea[FEWSTATES*2 + WORK];
91 struct sset *outsarea[FEWSTATES*2 * FEWCOLORS];
92 struct arcp incarea[FEWSTATES*2 * FEWCOLORS];
93 };
94 #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */
95
96 /*
97 * Internal variables, bundled for easy passing around.
98 */
99
100 struct vars {
101 regex_t *re;
102 struct guts *g;
103 int eflags; /* copies of arguments */
104 size_t nmatch;
105 regmatch_t *pmatch;
106 rm_detail_t *details;
107 chr *start; /* start of string */
108 chr *stop; /* just past end of string */
109 int err; /* error code if any (0 none) */
110 regoff_t *mem; /* memory vector for backtracking */
111 struct smalldfa dfa1;
112 struct smalldfa dfa2;
113 };
114 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
115 #define ISERR() VISERR(v)
116 #define VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e)))
117 #define ERR(e) VERR(v, e) /* record an error */
118 #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */
119 #define OFF(p) ((p) - v->start)
120 #define LOFF(p) ((long)OFF(p))
121
122 /*
123 * forward declarations
124 */
125 /* =====^!^===== begin forwards =====^!^===== */
126 /* automatically gathered by fwd; do not hand-edit */
127 /* === regexec.c === */
128 int exec(regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int);
129 static int find(struct vars *, struct cnfa *, struct colormap *);
130 static int cfind(struct vars *, struct cnfa *, struct colormap *);
131 static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
132 static VOID zapsubs(regmatch_t *, size_t);
133 static VOID zapmem(struct vars *, struct subre *);
134 static VOID subset(struct vars *, struct subre *, chr *, chr *);
135 static int dissect(struct vars *, struct subre *, chr *, chr *);
136 static int condissect(struct vars *, struct subre *, chr *, chr *);
137 static int altdissect(struct vars *, struct subre *, chr *, chr *);
138 static int cdissect(struct vars *, struct subre *, chr *, chr *);
139 static int ccondissect(struct vars *, struct subre *, chr *, chr *);
140 static int crevdissect(struct vars *, struct subre *, chr *, chr *);
141 static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
142 static int caltdissect(struct vars *, struct subre *, chr *, chr *);
143 /* === rege_dfa.c === */
144 static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
145 static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
146 static chr *lastcold(struct vars *, struct dfa *);
147 static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
148 static VOID freedfa(struct dfa *);
149 static unsigned hash(unsigned *, int);
150 static struct sset *initialize(struct vars *, struct dfa *, chr *);
151 static struct sset *miss(struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *);
152 static int lacon(struct vars *, struct cnfa *, chr *, pcolor);
153 static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
154 static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
155 /* automatically gathered by fwd; do not hand-edit */
156 /* =====^!^===== end forwards =====^!^===== */
157
158 /*
159 - exec - match regular expression
160 ^ int exec(regex_t *, CONST chr *, size_t, rm_detail_t *,
161 ^ size_t, regmatch_t [], int);
162 */
163 int
exec(regex_t * re,CONST chr * string,size_t len,rm_detail_t * details,size_t nmatch,regmatch_t pmatch[],int flags)164 exec(
165 regex_t *re,
166 CONST chr *string,
167 size_t len,
168 rm_detail_t *details,
169 size_t nmatch,
170 regmatch_t pmatch[],
171 int flags)
172 {
173 AllocVars(v);
174 int st;
175 size_t n;
176 int backref;
177 #define LOCALMAT 20
178 regmatch_t mat[LOCALMAT];
179 #define LOCALMEM 40
180 regoff_t mem[LOCALMEM];
181
182 /*
183 * Sanity checks.
184 */
185
186 if (re == NULL || string == NULL || re->re_magic != REMAGIC) {
187 FreeVars(v);
188 return REG_INVARG;
189 }
190 if (re->re_csize != sizeof(chr)) {
191 FreeVars(v);
192 return REG_MIXED;
193 }
194
195 /*
196 * Setup.
197 */
198
199 v->re = re;
200 v->g = (struct guts *)re->re_guts;
201 if ((v->g->cflags®_EXPECT) && details == NULL) {
202 FreeVars(v);
203 return REG_INVARG;
204 }
205 if (v->g->info®_UIMPOSSIBLE) {
206 FreeVars(v);
207 return REG_NOMATCH;
208 }
209 backref = (v->g->info®_UBACKREF) ? 1 : 0;
210 v->eflags = flags;
211 if (v->g->cflags®_NOSUB) {
212 nmatch = 0; /* override client */
213 }
214 v->nmatch = nmatch;
215 if (backref) {
216 /*
217 * Need work area.
218 */
219
220 if (v->g->nsub + 1 <= LOCALMAT) {
221 v->pmatch = mat;
222 } else {
223 v->pmatch = (regmatch_t *)
224 MALLOC((v->g->nsub + 1) * sizeof(regmatch_t));
225 }
226 if (v->pmatch == NULL) {
227 FreeVars(v);
228 return REG_ESPACE;
229 }
230 v->nmatch = v->g->nsub + 1;
231 } else {
232 v->pmatch = pmatch;
233 }
234 v->details = details;
235 v->start = (chr *)string;
236 v->stop = (chr *)string + len;
237 v->err = 0;
238 if (backref) {
239 /*
240 * Need retry memory.
241 */
242
243 assert(v->g->ntree >= 0);
244 n = (size_t)v->g->ntree;
245 if (n <= LOCALMEM) {
246 v->mem = mem;
247 } else {
248 v->mem = (regoff_t *) MALLOC(n*sizeof(regoff_t));
249 }
250 if (v->mem == NULL) {
251 if (v->pmatch != pmatch && v->pmatch != mat) {
252 FREE(v->pmatch);
253 }
254 FreeVars(v);
255 return REG_ESPACE;
256 }
257 } else {
258 v->mem = NULL;
259 }
260
261 /*
262 * Do it.
263 */
264
265 assert(v->g->tree != NULL);
266 if (backref) {
267 st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
268 } else {
269 st = find(v, &v->g->tree->cnfa, &v->g->cmap);
270 }
271
272 /*
273 * Copy (portion of) match vector over if necessary.
274 */
275
276 if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
277 zapsubs(pmatch, nmatch);
278 n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
279 memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t));
280 }
281
282 /*
283 * Clean up.
284 */
285
286 if (v->pmatch != pmatch && v->pmatch != mat) {
287 FREE(v->pmatch);
288 }
289 if (v->mem != NULL && v->mem != mem) {
290 FREE(v->mem);
291 }
292 FreeVars(v);
293 return st;
294 }
295
296 /*
297 - find - find a match for the main NFA (no-complications case)
298 ^ static int find(struct vars *, struct cnfa *, struct colormap *);
299 */
300 static int
find(struct vars * v,struct cnfa * cnfa,struct colormap * cm)301 find(
302 struct vars *v,
303 struct cnfa *cnfa,
304 struct colormap *cm)
305 {
306 struct dfa *s;
307 struct dfa *d;
308 chr *begin;
309 chr *end = NULL;
310 chr *cold;
311 chr *open; /* Open and close of range of possible
312 * starts */
313 chr *close;
314 int hitend;
315 int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0;
316
317 /*
318 * First, a shot with the search RE.
319 */
320
321 s = newdfa(v, &v->g->search, cm, &v->dfa1);
322 assert(!(ISERR() && s != NULL));
323 NOERR();
324 MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
325 cold = NULL;
326 close = shortest(v, s, v->start, v->start, v->stop, &cold, NULL);
327 freedfa(s);
328 NOERR();
329 if (v->g->cflags®_EXPECT) {
330 assert(v->details != NULL);
331 if (cold != NULL) {
332 v->details->rm_extend.rm_so = OFF(cold);
333 } else {
334 v->details->rm_extend.rm_so = OFF(v->stop);
335 }
336 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
337 }
338 if (close == NULL) { /* not found */
339 return REG_NOMATCH;
340 }
341 if (v->nmatch == 0) { /* found, don't need exact location */
342 return REG_OKAY;
343 }
344
345 /*
346 * Find starting point and match.
347 */
348
349 assert(cold != NULL);
350 open = cold;
351 cold = NULL;
352 MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
353 d = newdfa(v, cnfa, cm, &v->dfa1);
354 assert(!(ISERR() && d != NULL));
355 NOERR();
356 for (begin = open; begin <= close; begin++) {
357 MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
358 if (shorter) {
359 end = shortest(v, d, begin, begin, v->stop, NULL, &hitend);
360 } else {
361 end = longest(v, d, begin, v->stop, &hitend);
362 }
363 NOERR();
364 if (hitend && cold == NULL) {
365 cold = begin;
366 }
367 if (end != NULL) {
368 break; /* NOTE BREAK OUT */
369 }
370 }
371 assert(end != NULL); /* search RE succeeded so loop should */
372 freedfa(d);
373
374 /*
375 * And pin down details.
376 */
377
378 assert(v->nmatch > 0);
379 v->pmatch[0].rm_so = OFF(begin);
380 v->pmatch[0].rm_eo = OFF(end);
381 if (v->g->cflags®_EXPECT) {
382 if (cold != NULL) {
383 v->details->rm_extend.rm_so = OFF(cold);
384 } else {
385 v->details->rm_extend.rm_so = OFF(v->stop);
386 }
387 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
388 }
389 if (v->nmatch == 1) { /* no need for submatches */
390 return REG_OKAY;
391 }
392
393 /*
394 * Submatches.
395 */
396
397 zapsubs(v->pmatch, v->nmatch);
398 return dissect(v, v->g->tree, begin, end);
399 }
400
401 /*
402 - cfind - find a match for the main NFA (with complications)
403 ^ static int cfind(struct vars *, struct cnfa *, struct colormap *);
404 */
405 static int
cfind(struct vars * v,struct cnfa * cnfa,struct colormap * cm)406 cfind(
407 struct vars *v,
408 struct cnfa *cnfa,
409 struct colormap *cm)
410 {
411 struct dfa *s;
412 struct dfa *d;
413 chr *cold = NULL; /* silence gcc 4 warning */
414 int ret;
415
416 s = newdfa(v, &v->g->search, cm, &v->dfa1);
417 NOERR();
418 d = newdfa(v, cnfa, cm, &v->dfa2);
419 if (ISERR()) {
420 assert(d == NULL);
421 freedfa(s);
422 return v->err;
423 }
424
425 ret = cfindloop(v, cnfa, cm, d, s, &cold);
426
427 freedfa(d);
428 freedfa(s);
429 NOERR();
430 if (v->g->cflags®_EXPECT) {
431 assert(v->details != NULL);
432 if (cold != NULL) {
433 v->details->rm_extend.rm_so = OFF(cold);
434 } else {
435 v->details->rm_extend.rm_so = OFF(v->stop);
436 }
437 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
438 }
439 return ret;
440 }
441
442 /*
443 - cfindloop - the heart of cfind
444 ^ static int cfindloop(struct vars *, struct cnfa *, struct colormap *,
445 ^ struct dfa *, struct dfa *, chr **);
446 */
447 static int
cfindloop(struct vars * v,struct cnfa * cnfa,struct colormap * cm,struct dfa * d,struct dfa * s,chr ** coldp)448 cfindloop(
449 struct vars *v,
450 struct cnfa *cnfa,
451 struct colormap *cm,
452 struct dfa *d,
453 struct dfa *s,
454 chr **coldp) /* where to put coldstart pointer */
455 {
456 chr *begin;
457 chr *end;
458 chr *cold;
459 chr *open; /* Open and close of range of possible
460 * starts */
461 chr *close;
462 chr *estart;
463 chr *estop;
464 int er;
465 int shorter = v->g->tree->flags&SHORTER;
466 int hitend;
467
468 assert(d != NULL && s != NULL);
469 cold = NULL;
470 close = v->start;
471 do {
472 MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
473 close = shortest(v, s, close, close, v->stop, &cold, NULL);
474 if (close == NULL) {
475 break; /* NOTE BREAK */
476 }
477 assert(cold != NULL);
478 open = cold;
479 cold = NULL;
480 MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
481 for (begin = open; begin <= close; begin++) {
482 MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
483 estart = begin;
484 estop = v->stop;
485 for (;;) {
486 if (shorter) {
487 end = shortest(v, d, begin, estart, estop, NULL, &hitend);
488 } else {
489 end = longest(v, d, begin, estop, &hitend);
490 }
491 if (hitend && cold == NULL) {
492 cold = begin;
493 }
494 if (end == NULL) {
495 break; /* NOTE BREAK OUT */
496 }
497
498 MDEBUG(("tentative end %ld\n", LOFF(end)));
499 zapsubs(v->pmatch, v->nmatch);
500 zapmem(v, v->g->tree);
501 er = cdissect(v, v->g->tree, begin, end);
502 if (er == REG_OKAY) {
503 if (v->nmatch > 0) {
504 v->pmatch[0].rm_so = OFF(begin);
505 v->pmatch[0].rm_eo = OFF(end);
506 }
507 *coldp = cold;
508 return REG_OKAY;
509 }
510 if (er != REG_NOMATCH) {
511 ERR(er);
512 return er;
513 }
514 if ((shorter) ? end == estop : end == begin) {
515 break;
516 }
517
518 /*
519 * Go around and try again
520 */
521
522 if (shorter) {
523 estart = end + 1;
524 } else {
525 estop = end - 1;
526 }
527 }
528 }
529 } while (close < v->stop);
530
531 *coldp = cold;
532 return REG_NOMATCH;
533 }
534
535 /*
536 - zapsubs - initialize the subexpression matches to "no match"
537 ^ static VOID zapsubs(regmatch_t *, size_t);
538 */
539 static void
zapsubs(regmatch_t * p,size_t n)540 zapsubs(
541 regmatch_t *p,
542 size_t n)
543 {
544 size_t i;
545
546 for (i = n-1; i > 0; i--) {
547 p[i].rm_so = -1;
548 p[i].rm_eo = -1;
549 }
550 }
551
552 /*
553 - zapmem - initialize the retry memory of a subtree to zeros
554 ^ static VOID zapmem(struct vars *, struct subre *);
555 */
556 static void
zapmem(struct vars * v,struct subre * t)557 zapmem(
558 struct vars *v,
559 struct subre *t)
560 {
561 if (t == NULL) {
562 return;
563 }
564
565 assert(v->mem != NULL);
566 v->mem[t->retry] = 0;
567 if (t->op == '(') {
568 assert(t->subno > 0);
569 v->pmatch[t->subno].rm_so = -1;
570 v->pmatch[t->subno].rm_eo = -1;
571 }
572
573 if (t->left != NULL) {
574 zapmem(v, t->left);
575 }
576 if (t->right != NULL) {
577 zapmem(v, t->right);
578 }
579 }
580
581 /*
582 - subset - set any subexpression relevant to a successful subre
583 ^ static VOID subset(struct vars *, struct subre *, chr *, chr *);
584 */
585 static void
subset(struct vars * v,struct subre * sub,chr * begin,chr * end)586 subset(
587 struct vars *v,
588 struct subre *sub,
589 chr *begin,
590 chr *end)
591 {
592 int n = sub->subno;
593
594 assert(n > 0);
595 if ((size_t)n >= v->nmatch) {
596 return;
597 }
598
599 MDEBUG(("setting %d\n", n));
600 v->pmatch[n].rm_so = OFF(begin);
601 v->pmatch[n].rm_eo = OFF(end);
602 }
603
604 /*
605 - dissect - determine subexpression matches (uncomplicated case)
606 ^ static int dissect(struct vars *, struct subre *, chr *, chr *);
607 */
608 static int /* regexec return code */
dissect(struct vars * v,struct subre * t,chr * begin,chr * end)609 dissect(
610 struct vars *v,
611 struct subre *t,
612 chr *begin, /* beginning of relevant substring */
613 chr *end) /* end of same */
614 {
615 assert(t != NULL);
616 MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end)));
617
618 switch (t->op) {
619 case '=': /* terminal node */
620 assert(t->left == NULL && t->right == NULL);
621 return REG_OKAY; /* no action, parent did the work */
622 case '|': /* alternation */
623 assert(t->left != NULL);
624 return altdissect(v, t, begin, end);
625 case 'b': /* back ref -- shouldn't be calling us! */
626 return REG_ASSERT;
627 case '.': /* concatenation */
628 assert(t->left != NULL && t->right != NULL);
629 return condissect(v, t, begin, end);
630 case '(': /* capturing */
631 assert(t->left != NULL && t->right == NULL);
632 assert(t->subno > 0);
633 subset(v, t, begin, end);
634 return dissect(v, t->left, begin, end);
635 default:
636 return REG_ASSERT;
637 }
638 }
639
640 /*
641 - condissect - determine concatenation subexpression matches (uncomplicated)
642 ^ static int condissect(struct vars *, struct subre *, chr *, chr *);
643 */
644 static int /* regexec return code */
condissect(struct vars * v,struct subre * t,chr * begin,chr * end)645 condissect(
646 struct vars *v,
647 struct subre *t,
648 chr *begin, /* beginning of relevant substring */
649 chr *end) /* end of same */
650 {
651 struct dfa *d;
652 struct dfa *d2;
653 chr *mid;
654 int i;
655 int shorter = (t->left->flags&SHORTER) ? 1 : 0;
656 chr *stop = (shorter) ? end : begin;
657
658 assert(t->op == '.');
659 assert(t->left != NULL && t->left->cnfa.nstates > 0);
660 assert(t->right != NULL && t->right->cnfa.nstates > 0);
661
662 d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
663 NOERR();
664 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2);
665 if (ISERR()) {
666 assert(d2 == NULL);
667 freedfa(d);
668 return v->err;
669 }
670
671 /*
672 * Pick a tentative midpoint.
673 */
674
675 if (shorter) {
676 mid = shortest(v, d, begin, begin, end, NULL, NULL);
677 } else {
678 mid = longest(v, d, begin, end, NULL);
679 }
680 if (mid == NULL) {
681 freedfa(d);
682 freedfa(d2);
683 return REG_ASSERT;
684 }
685 MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
686
687 /*
688 * Iterate until satisfaction or failure.
689 */
690
691 while (longest(v, d2, mid, end, NULL) != end) {
692 /*
693 * That midpoint didn't work, find a new one.
694 */
695
696 if (mid == stop) {
697 /*
698 * All possibilities exhausted!
699 */
700
701 MDEBUG(("no midpoint!\n"));
702 freedfa(d);
703 freedfa(d2);
704 return REG_ASSERT;
705 }
706 if (shorter) {
707 mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
708 } else {
709 mid = longest(v, d, begin, mid-1, NULL);
710 }
711 if (mid == NULL) {
712 /*
713 * Failed to find a new one!
714 */
715
716 MDEBUG(("failed midpoint!\n"));
717 freedfa(d);
718 freedfa(d2);
719 return REG_ASSERT;
720 }
721 MDEBUG(("new midpoint %ld\n", LOFF(mid)));
722 }
723
724 /*
725 * Satisfaction.
726 */
727
728 MDEBUG(("successful\n"));
729 freedfa(d);
730 freedfa(d2);
731 i = dissect(v, t->left, begin, mid);
732 if (i != REG_OKAY) {
733 return i;
734 }
735 return dissect(v, t->right, mid, end);
736 }
737
738 /*
739 - altdissect - determine alternative subexpression matches (uncomplicated)
740 ^ static int altdissect(struct vars *, struct subre *, chr *, chr *);
741 */
742 static int /* regexec return code */
altdissect(struct vars * v,struct subre * t,chr * begin,chr * end)743 altdissect(
744 struct vars *v,
745 struct subre *t,
746 chr *begin, /* beginning of relevant substring */
747 chr *end) /* end of same */
748 {
749 struct dfa *d;
750 int i;
751
752 assert(t != NULL);
753 assert(t->op == '|');
754
755 for (i = 0; t != NULL; t = t->right, i++) {
756 MDEBUG(("trying %dth\n", i));
757 assert(t->left != NULL && t->left->cnfa.nstates > 0);
758 d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
759 if (ISERR()) {
760 return v->err;
761 }
762 if (longest(v, d, begin, end, NULL) == end) {
763 MDEBUG(("success\n"));
764 freedfa(d);
765 return dissect(v, t->left, begin, end);
766 }
767 freedfa(d);
768 }
769 return REG_ASSERT; /* none of them matched?!? */
770 }
771
772 /*
773 - cdissect - determine subexpression matches (with complications)
774 * The retry memory stores the offset of the trial midpoint from begin, plus 1
775 * so that 0 uniquely means "clean slate".
776 ^ static int cdissect(struct vars *, struct subre *, chr *, chr *);
777 */
778 static int /* regexec return code */
cdissect(struct vars * v,struct subre * t,chr * begin,chr * end)779 cdissect(
780 struct vars *v,
781 struct subre *t,
782 chr *begin, /* beginning of relevant substring */
783 chr *end) /* end of same */
784 {
785 int er;
786
787 assert(t != NULL);
788 MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
789
790 switch (t->op) {
791 case '=': /* terminal node */
792 assert(t->left == NULL && t->right == NULL);
793 return REG_OKAY; /* no action, parent did the work */
794 case '|': /* alternation */
795 assert(t->left != NULL);
796 return caltdissect(v, t, begin, end);
797 case 'b': /* back ref -- shouldn't be calling us! */
798 assert(t->left == NULL && t->right == NULL);
799 return cbrdissect(v, t, begin, end);
800 case '.': /* concatenation */
801 assert(t->left != NULL && t->right != NULL);
802 return ccondissect(v, t, begin, end);
803 case '(': /* capturing */
804 assert(t->left != NULL && t->right == NULL);
805 assert(t->subno > 0);
806 er = cdissect(v, t->left, begin, end);
807 if (er == REG_OKAY) {
808 subset(v, t, begin, end);
809 }
810 return er;
811 default:
812 return REG_ASSERT;
813 }
814 }
815
816 /*
817 - ccondissect - concatenation subexpression matches (with complications)
818 * The retry memory stores the offset of the trial midpoint from begin, plus 1
819 * so that 0 uniquely means "clean slate".
820 ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *);
821 */
822 static int /* regexec return code */
ccondissect(struct vars * v,struct subre * t,chr * begin,chr * end)823 ccondissect(
824 struct vars *v,
825 struct subre *t,
826 chr *begin, /* beginning of relevant substring */
827 chr *end) /* end of same */
828 {
829 struct dfa *d, *d2;
830 chr *mid;
831 int er;
832
833 assert(t->op == '.');
834 assert(t->left != NULL && t->left->cnfa.nstates > 0);
835 assert(t->right != NULL && t->right->cnfa.nstates > 0);
836
837 if (t->left->flags&SHORTER) { /* reverse scan */
838 return crevdissect(v, t, begin, end);
839 }
840
841 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
842 if (ISERR()) {
843 return v->err;
844 }
845 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
846 if (ISERR()) {
847 freedfa(d);
848 return v->err;
849 }
850 MDEBUG(("cconcat %d\n", t->retry));
851
852 /*
853 * Pick a tentative midpoint.
854 */
855
856 if (v->mem[t->retry] == 0) {
857 mid = longest(v, d, begin, end, NULL);
858 if (mid == NULL) {
859 freedfa(d);
860 freedfa(d2);
861 return REG_NOMATCH;
862 }
863 MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
864 v->mem[t->retry] = (mid - begin) + 1;
865 } else {
866 mid = begin + (v->mem[t->retry] - 1);
867 MDEBUG(("working midpoint %ld\n", LOFF(mid)));
868 }
869
870 /*
871 * Iterate until satisfaction or failure.
872 */
873
874 for (;;) {
875 /*
876 * Try this midpoint on for size.
877 */
878
879 if (longest(v, d2, mid, end, NULL) == end) {
880 er = cdissect(v, t->left, begin, mid);
881 if (er == REG_OKAY) {
882 er = cdissect(v, t->right, mid, end);
883 if (er == REG_OKAY) {
884 /*
885 * Satisfaction.
886 */
887
888 MDEBUG(("successful\n"));
889 freedfa(d);
890 freedfa(d2);
891 return REG_OKAY;
892 }
893 }
894 if ((er != REG_OKAY) && (er != REG_NOMATCH)) {
895 freedfa(d);
896 freedfa(d2);
897 return er;
898 }
899 }
900
901 /*
902 * That midpoint didn't work, find a new one.
903 */
904
905 if (mid == begin) {
906 /*
907 * All possibilities exhausted.
908 */
909
910 MDEBUG(("%d no midpoint\n", t->retry));
911 freedfa(d);
912 freedfa(d2);
913 return REG_NOMATCH;
914 }
915 mid = longest(v, d, begin, mid-1, NULL);
916 if (mid == NULL) {
917 /*
918 * Failed to find a new one.
919 */
920
921 MDEBUG(("%d failed midpoint\n", t->retry));
922 freedfa(d);
923 freedfa(d2);
924 return REG_NOMATCH;
925 }
926 MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
927 v->mem[t->retry] = (mid - begin) + 1;
928 zapmem(v, t->left);
929 zapmem(v, t->right);
930 }
931 }
932
933 /*
934 - crevdissect - determine backref shortest-first subexpression matches
935 * The retry memory stores the offset of the trial midpoint from begin, plus 1
936 * so that 0 uniquely means "clean slate".
937 ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *);
938 */
939 static int /* regexec return code */
crevdissect(struct vars * v,struct subre * t,chr * begin,chr * end)940 crevdissect(
941 struct vars *v,
942 struct subre *t,
943 chr *begin, /* beginning of relevant substring */
944 chr *end) /* end of same */
945 {
946 struct dfa *d;
947 struct dfa *d2;
948 chr *mid;
949 int er;
950
951 assert(t->op == '.');
952 assert(t->left != NULL && t->left->cnfa.nstates > 0);
953 assert(t->right != NULL && t->right->cnfa.nstates > 0);
954 assert(t->left->flags&SHORTER);
955
956 /*
957 * Concatenation -- need to split the substring between parts.
958 */
959
960 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
961 if (ISERR()) {
962 return v->err;
963 }
964 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
965 if (ISERR()) {
966 freedfa(d);
967 return v->err;
968 }
969 MDEBUG(("crev %d\n", t->retry));
970
971 /*
972 * Pick a tentative midpoint.
973 */
974
975 if (v->mem[t->retry] == 0) {
976 mid = shortest(v, d, begin, begin, end, NULL, NULL);
977 if (mid == NULL) {
978 freedfa(d);
979 freedfa(d2);
980 return REG_NOMATCH;
981 }
982 MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
983 v->mem[t->retry] = (mid - begin) + 1;
984 } else {
985 mid = begin + (v->mem[t->retry] - 1);
986 MDEBUG(("working midpoint %ld\n", LOFF(mid)));
987 }
988
989 /*
990 * Iterate until satisfaction or failure.
991 */
992
993 for (;;) {
994 /*
995 * Try this midpoint on for size.
996 */
997
998 if (longest(v, d2, mid, end, NULL) == end) {
999 er = cdissect(v, t->left, begin, mid);
1000 if (er == REG_OKAY) {
1001 er = cdissect(v, t->right, mid, end);
1002 if (er == REG_OKAY) {
1003 /*
1004 * Satisfaction.
1005 */
1006
1007 MDEBUG(("successful\n"));
1008 freedfa(d);
1009 freedfa(d2);
1010 return REG_OKAY;
1011 }
1012 }
1013 if (er != REG_OKAY && er != REG_NOMATCH) {
1014 freedfa(d);
1015 freedfa(d2);
1016 return er;
1017 }
1018 }
1019
1020 /*
1021 * That midpoint didn't work, find a new one.
1022 */
1023
1024 if (mid == end) {
1025 /*
1026 * All possibilities exhausted.
1027 */
1028
1029 MDEBUG(("%d no midpoint\n", t->retry));
1030 freedfa(d);
1031 freedfa(d2);
1032 return REG_NOMATCH;
1033 }
1034 mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
1035 if (mid == NULL) {
1036 /*
1037 * Failed to find a new one.
1038 */
1039
1040 MDEBUG(("%d failed midpoint\n", t->retry));
1041 freedfa(d);
1042 freedfa(d2);
1043 return REG_NOMATCH;
1044 }
1045 MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
1046 v->mem[t->retry] = (mid - begin) + 1;
1047 zapmem(v, t->left);
1048 zapmem(v, t->right);
1049 }
1050 }
1051
1052 /*
1053 - cbrdissect - determine backref subexpression matches
1054 ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
1055 */
1056 static int /* regexec return code */
cbrdissect(struct vars * v,struct subre * t,chr * begin,chr * end)1057 cbrdissect(
1058 struct vars *v,
1059 struct subre *t,
1060 chr *begin, /* beginning of relevant substring */
1061 chr *end) /* end of same */
1062 {
1063 int i;
1064 int n = t->subno;
1065 size_t len;
1066 chr *paren;
1067 chr *p;
1068 chr *stop;
1069 int min = t->min;
1070 int max = t->max;
1071
1072 assert(t != NULL);
1073 assert(t->op == 'b');
1074 assert(n >= 0);
1075 assert((size_t)n < v->nmatch);
1076
1077 MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
1078
1079 if (v->pmatch[n].rm_so == -1) {
1080 return REG_NOMATCH;
1081 }
1082 paren = v->start + v->pmatch[n].rm_so;
1083 len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
1084
1085 /*
1086 * No room to maneuver -- retries are pointless.
1087 */
1088
1089 if (v->mem[t->retry]) {
1090 return REG_NOMATCH;
1091 }
1092 v->mem[t->retry] = 1;
1093
1094 /*
1095 * Special-case zero-length string.
1096 */
1097
1098 if (len == 0) {
1099 if (begin == end) {
1100 return REG_OKAY;
1101 }
1102 return REG_NOMATCH;
1103 }
1104
1105 /*
1106 * And too-short string.
1107 */
1108
1109 assert(end >= begin);
1110 if ((size_t)(end - begin) < len) {
1111 return REG_NOMATCH;
1112 }
1113 stop = end - len;
1114
1115 /*
1116 * Count occurrences.
1117 */
1118
1119 i = 0;
1120 for (p = begin; p <= stop && (i < max || max == DUPINF); p += len) {
1121 if ((*v->g->compare)(paren, p, len) != 0) {
1122 break;
1123 }
1124 i++;
1125 }
1126 MDEBUG(("cbackref found %d\n", i));
1127
1128 /*
1129 * And sort it out.
1130 */
1131
1132 if (p != end) { /* didn't consume all of it */
1133 return REG_NOMATCH;
1134 }
1135 if (min <= i && (i <= max || max == DUPINF)) {
1136 return REG_OKAY;
1137 }
1138 return REG_NOMATCH; /* out of range */
1139 }
1140
1141 /*
1142 - caltdissect - determine alternative subexpression matches (w. complications)
1143 ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *);
1144 */
1145 static int /* regexec return code */
caltdissect(struct vars * v,struct subre * t,chr * begin,chr * end)1146 caltdissect(
1147 struct vars *v,
1148 struct subre *t,
1149 chr *begin, /* beginning of relevant substring */
1150 chr *end) /* end of same */
1151 {
1152 struct dfa *d;
1153 int er;
1154 #define UNTRIED 0 /* not yet tried at all */
1155 #define TRYING 1 /* top matched, trying submatches */
1156 #define TRIED 2 /* top didn't match or submatches exhausted */
1157
1158 if (t == NULL) {
1159 return REG_NOMATCH;
1160 }
1161 assert(t->op == '|');
1162 if (v->mem[t->retry] == TRIED) {
1163 return caltdissect(v, t->right, begin, end);
1164 }
1165
1166 MDEBUG(("calt n%d\n", t->retry));
1167 assert(t->left != NULL);
1168
1169 if (v->mem[t->retry] == UNTRIED) {
1170 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
1171 if (ISERR()) {
1172 return v->err;
1173 }
1174 if (longest(v, d, begin, end, NULL) != end) {
1175 freedfa(d);
1176 v->mem[t->retry] = TRIED;
1177 return caltdissect(v, t->right, begin, end);
1178 }
1179 freedfa(d);
1180 MDEBUG(("calt matched\n"));
1181 v->mem[t->retry] = TRYING;
1182 }
1183
1184 er = cdissect(v, t->left, begin, end);
1185 if (er != REG_NOMATCH) {
1186 return er;
1187 }
1188
1189 v->mem[t->retry] = TRIED;
1190 return caltdissect(v, t->right, begin, end);
1191 }
1192
1193 #include "rege_dfa.c"
1194
1195 /*
1196 * Local Variables:
1197 * mode: c
1198 * c-basic-offset: 4
1199 * fill-column: 78
1200 * End:
1201 */
1202