1 /*
2 * The matching engine and friends. This file is #included by regexec.c
3 * after suitable #defines of a variety of macros used herein, so that
4 * different state representations can be used without duplicating masses
5 * of code.
6 */
7
8 #ifdef SNAMES
9 #define matcher smatcher
10 #define fast sfast
11 #define slow sslow
12 #define dissect sdissect
13 #define backref sbackref
14 #define step sstep
15 #define print sprint
16 #define at sat
17 #define match smat
18 #endif
19 #ifdef LNAMES
20 #define matcher lmatcher
21 #define fast lfast
22 #define slow lslow
23 #define dissect ldissect
24 #define backref lbackref
25 #define step lstep
26 #define print lprint
27 #define at lat
28 #define match lmat
29 #endif
30
31 /* another structure passed up and down to avoid zillions of parameters */
32 struct match {
33 struct re_guts *g;
34 int eflags;
35 my_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
36 char *offp; /* offsets work from here */
37 char *beginp; /* start of string -- virtual NUL precedes */
38 char *endp; /* end of string -- virtual NUL here */
39 char *coldp; /* can be no match starting before here */
40 char **lastpos; /* [nplus+1] */
41 STATEVARS;
42 states st; /* current states */
43 states fresh; /* states for a fresh start */
44 states tmp; /* temporary */
45 states empty; /* empty set of states */
46 };
47
48 #include "engine.ih"
49
50 #ifdef REDEBUG
51 #define SP(t, s, c) print(m, t, s, c, stdout)
52 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
53 #define NOTE(str) { if (m->eflags&MY_REG_TRACE) printf("=%s\n", (str)); }
54 #else
55 #define SP(t, s, c) /* nothing */
56 #define AT(t, p1, p2, s1, s2) /* nothing */
57 #define NOTE(s) /* nothing */
58 #endif
59
60 /*
61 - matcher - the actual matching engine
62 == static int matcher(register struct re_guts *g, char *string, \
63 == size_t nmatch, regmatch_t pmatch[], int eflags);
64 */
65 static int /* 0 success, MY_REG_NOMATCH failure */
matcher(charset,g,str,nmatch,pmatch,eflags)66 matcher(charset,g, str, nmatch, pmatch, eflags)
67 const CHARSET_INFO *charset;
68 struct re_guts *g;
69 char *str;
70 size_t nmatch;
71 my_regmatch_t pmatch[];
72 int eflags;
73 {
74 char *endp;
75 uint i;
76 struct match mv;
77 struct match *m = &mv;
78 char *dp;
79 const sopno gf = g->firststate+1; /* +1 for OEND */
80 const sopno gl = g->laststate;
81 char *start;
82 char *stop;
83
84 /* simplify the situation where possible */
85 if (g->cflags&MY_REG_NOSUB)
86 nmatch = 0;
87 if (eflags&MY_REG_STARTEND) {
88 start = str + pmatch[0].rm_so;
89 stop = str + pmatch[0].rm_eo;
90 } else {
91 start = str;
92 stop = start + strlen(start);
93 }
94 if (stop < start)
95 return(MY_REG_INVARG);
96
97 /* prescreening; this does wonders for this rather slow code */
98 if (g->must != NULL) {
99 for (dp = start; dp < stop; dp++)
100 if (*dp == g->must[0] && stop - dp >= g->mlen &&
101 memcmp(dp, g->must, (size_t)g->mlen) == 0)
102 break;
103 if (dp == stop) /* we didn't find g->must */
104 return(MY_REG_NOMATCH);
105 }
106
107 /* match struct setup */
108 m->g = g;
109 m->eflags = eflags;
110 m->pmatch = NULL;
111 m->lastpos = NULL;
112 m->offp = str;
113 m->beginp = start;
114 m->endp = stop;
115 STATESETUP(m, 4);
116 SETUP(m->st);
117 SETUP(m->fresh);
118 SETUP(m->tmp);
119 SETUP(m->empty);
120 CLEAR(m->empty);
121
122 /* this loop does only one repetition except for backrefs */
123 for (;;) {
124 endp = fast(charset, m, start, stop, gf, gl);
125 if (endp == NULL) { /* a miss */
126 if (m->pmatch != NULL)
127 free((char *)m->pmatch);
128 if (m->lastpos != NULL)
129 free((char *)m->lastpos);
130 STATETEARDOWN(m);
131 return(MY_REG_NOMATCH);
132 }
133 if (nmatch == 0 && !g->backrefs)
134 break; /* no further info needed */
135
136 /* where? */
137 assert(m->coldp != NULL);
138 for (;;) {
139 NOTE("finding start");
140 endp = slow(charset, m, m->coldp, stop, gf, gl);
141 if (endp != NULL)
142 break;
143 assert(m->coldp < m->endp);
144 m->coldp++;
145 }
146 if (nmatch == 1 && !g->backrefs)
147 break; /* no further info needed */
148
149 /* oh my, he wants the subexpressions... */
150 if (m->pmatch == NULL)
151 m->pmatch = (my_regmatch_t *)malloc((m->g->nsub + 1) *
152 sizeof(my_regmatch_t));
153 if (m->pmatch == NULL) {
154 if (m->lastpos != NULL)
155 free((char *)m->lastpos);
156 STATETEARDOWN(m);
157 return(MY_REG_ESPACE);
158 }
159 for (i = 1; i <= m->g->nsub; i++)
160 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
161 if (!g->backrefs && !(m->eflags&MY_REG_BACKR)) {
162 NOTE("dissecting");
163 dp = dissect(charset, m, m->coldp, endp, gf, gl);
164 } else {
165 if (g->nplus > 0 && m->lastpos == NULL)
166 m->lastpos = (char **)malloc((g->nplus+1) *
167 sizeof(char *));
168 if (g->nplus > 0 && m->lastpos == NULL) {
169 free(m->pmatch);
170 STATETEARDOWN(m);
171 return(MY_REG_ESPACE);
172 }
173 NOTE("backref dissect");
174 dp = backref(charset, m, m->coldp, endp, gf, gl, (sopno)0);
175 }
176 if (dp != NULL)
177 break;
178
179 /* uh-oh... we couldn't find a subexpression-level match */
180 assert(g->backrefs); /* must be back references doing it */
181 assert(g->nplus == 0 || m->lastpos != NULL);
182 for (;;) {
183 if (dp != NULL || endp <= m->coldp)
184 break; /* defeat */
185 NOTE("backoff");
186 endp = slow(charset, m, m->coldp, endp-1, gf, gl);
187 if (endp == NULL)
188 break; /* defeat */
189 /* try it on a shorter possibility */
190 #ifndef NDEBUG
191 for (i = 1; i <= m->g->nsub; i++) {
192 assert(m->pmatch[i].rm_so == -1);
193 assert(m->pmatch[i].rm_eo == -1);
194 }
195 #endif
196 NOTE("backoff dissect");
197 dp = backref(charset, m, m->coldp, endp, gf, gl, (sopno)0);
198 }
199 assert(dp == NULL || dp == endp);
200 if (dp != NULL) /* found a shorter one */
201 break;
202
203 /* despite initial appearances, there is no match here */
204 NOTE("false alarm");
205 start = m->coldp + 1; /* recycle starting later */
206 assert(start <= stop);
207 }
208
209 /* fill in the details if requested */
210 if (nmatch > 0) {
211 pmatch[0].rm_so = m->coldp - m->offp;
212 pmatch[0].rm_eo = endp - m->offp;
213 }
214 if (nmatch > 1) {
215 assert(m->pmatch != NULL);
216 for (i = 1; i < nmatch; i++)
217 if (i <= m->g->nsub)
218 pmatch[i] = m->pmatch[i];
219 else {
220 pmatch[i].rm_so = -1;
221 pmatch[i].rm_eo = -1;
222 }
223 }
224
225 if (m->pmatch != NULL)
226 free((char *)m->pmatch);
227 if (m->lastpos != NULL)
228 free((char *)m->lastpos);
229 STATETEARDOWN(m);
230 return(0);
231 }
232
233 /*
234 - dissect - figure out what matched what, no back references
235 == static char *dissect(register struct match *m, char *start, \
236 == char *stop, sopno startst, sopno stopst);
237 */
238 static char * /* == stop (success) always */
dissect(charset,m,start,stop,startst,stopst)239 dissect(charset, m, start, stop, startst, stopst)
240 const CHARSET_INFO *charset;
241 struct match *m;
242 char *start;
243 char *stop;
244 sopno startst;
245 sopno stopst;
246 {
247 uint i;
248 sopno ss; /* start sop of current subRE */
249 sopno es; /* end sop of current subRE */
250 char *sp; /* start of string matched by it */
251 char *stp; /* string matched by it cannot pass here */
252 char *rest; /* start of rest of string */
253 char *tail; /* string unmatched by rest of RE */
254 sopno ssub; /* start sop of subsubRE */
255 sopno esub; /* end sop of subsubRE */
256 char *ssp; /* start of string matched by subsubRE */
257 char *sep; /* end of string matched by subsubRE */
258 char *oldssp; /* previous ssp */
259
260 AT("diss", start, stop, startst, stopst);
261 sp = start;
262 for (ss = startst; ss < stopst; ss = es) {
263 /* identify end of subRE */
264 es = ss;
265 switch (OP(m->g->strip[es])) {
266 case OPLUS_:
267 case OQUEST_:
268 es += OPND(m->g->strip[es]);
269 break;
270 case OCH_:
271 while (OP(m->g->strip[es]) != O_CH)
272 es += OPND(m->g->strip[es]);
273 break;
274 }
275 es++;
276
277 /* figure out what it matched */
278 switch (OP(m->g->strip[ss])) {
279 case OEND:
280 assert(nope);
281 break;
282 case OCHAR:
283 sp++;
284 break;
285 case OBOL:
286 case OEOL:
287 case OBOW:
288 case OEOW:
289 break;
290 case OANY:
291 case OANYOF:
292 sp++;
293 break;
294 case OBACK_:
295 case O_BACK:
296 assert(nope);
297 break;
298 /* cases where length of match is hard to find */
299 case OQUEST_:
300 stp = stop;
301 for (;;) {
302 /* how long could this one be? */
303 rest = slow(charset, m, sp, stp, ss, es);
304 assert(rest != NULL); /* it did match */
305 /* could the rest match the rest? */
306 tail = slow(charset, m, rest, stop, es, stopst);
307 if (tail == stop)
308 break; /* yes! */
309 /* no -- try a shorter match for this one */
310 stp = rest - 1;
311 assert(stp >= sp); /* it did work */
312 }
313 ssub = ss + 1;
314 esub = es - 1;
315 /* did innards match? */
316 if (slow(charset, m, sp, rest, ssub, esub) != NULL)
317 sp = dissect(charset, m, sp, rest, ssub, esub);
318 assert(sp == rest);
319 sp = rest;
320 break;
321 case OPLUS_:
322 stp = stop;
323 for (;;) {
324 /* how long could this one be? */
325 rest = slow(charset, m, sp, stp, ss, es);
326 assert(rest != NULL); /* it did match */
327 /* could the rest match the rest? */
328 tail = slow(charset, m, rest, stop, es, stopst);
329 if (tail == stop)
330 break; /* yes! */
331 /* no -- try a shorter match for this one */
332 stp = rest - 1;
333 assert(stp >= sp); /* it did work */
334 }
335 ssub = ss + 1;
336 esub = es - 1;
337 ssp = sp;
338 oldssp = ssp;
339 for (;;) { /* find last match of innards */
340 sep = slow(charset, m, ssp, rest, ssub, esub);
341 if (sep == NULL || sep == ssp)
342 break; /* failed or matched null */
343 oldssp = ssp; /* on to next try */
344 ssp = sep;
345 }
346 if (sep == NULL) {
347 /* last successful match */
348 sep = ssp;
349 ssp = oldssp;
350 }
351 assert(sep == rest); /* must exhaust substring */
352 assert(slow(charset, m, ssp, sep, ssub, esub) == rest);
353 sp = dissect(charset, m, ssp, sep, ssub, esub);
354 assert(sp == sep);
355 sp = rest;
356 break;
357 case OCH_:
358 stp = stop;
359 for (;;) {
360 /* how long could this one be? */
361 rest = slow(charset, m, sp, stp, ss, es);
362 assert(rest != NULL); /* it did match */
363 /* could the rest match the rest? */
364 tail = slow(charset, m, rest, stop, es, stopst);
365 if (tail == stop)
366 break; /* yes! */
367 /* no -- try a shorter match for this one */
368 stp = rest - 1;
369 assert(stp >= sp); /* it did work */
370 }
371 ssub = ss + 1;
372 esub = ss + OPND(m->g->strip[ss]) - 1;
373 assert(OP(m->g->strip[esub]) == OOR1);
374 for (;;) { /* find first matching branch */
375 if (slow(charset, m, sp, rest, ssub, esub) == rest)
376 break; /* it matched all of it */
377 /* that one missed, try next one */
378 assert(OP(m->g->strip[esub]) == OOR1);
379 esub++;
380 assert(OP(m->g->strip[esub]) == OOR2);
381 ssub = esub + 1;
382 esub += OPND(m->g->strip[esub]);
383 if (OP(m->g->strip[esub]) == OOR2)
384 esub--;
385 else
386 assert(OP(m->g->strip[esub]) == O_CH);
387 }
388 sp = dissect(charset, m, sp, rest, ssub, esub);
389 assert(sp == rest);
390 sp = rest;
391 break;
392 case O_PLUS:
393 case O_QUEST:
394 case OOR1:
395 case OOR2:
396 case O_CH:
397 assert(nope);
398 break;
399 case OLPAREN:
400 i = OPND(m->g->strip[ss]);
401 assert(0 < i && i <= m->g->nsub);
402 m->pmatch[i].rm_so = sp - m->offp;
403 break;
404 case ORPAREN:
405 i = OPND(m->g->strip[ss]);
406 assert(0 < i && i <= m->g->nsub);
407 m->pmatch[i].rm_eo = sp - m->offp;
408 break;
409 default: /* uh oh */
410 assert(nope);
411 break;
412 }
413 }
414
415 assert(sp == stop);
416 return(sp);
417 }
418
419 /*
420 - backref - figure out what matched what, figuring in back references
421 == static char *backref(register struct match *m, char *start, \
422 == char *stop, sopno startst, sopno stopst, sopno lev);
423 */
424 static char * /* == stop (success) or NULL (failure) */
backref(charset,m,start,stop,startst,stopst,lev)425 backref(charset,m, start, stop, startst, stopst, lev)
426 const CHARSET_INFO *charset;
427 struct match *m;
428 char *start;
429 char *stop;
430 sopno startst;
431 sopno stopst;
432 sopno lev; /* PLUS nesting level */
433 {
434 uint i;
435 sopno ss; /* start sop of current subRE */
436 char *sp; /* start of string matched by it */
437 sopno ssub; /* start sop of subsubRE */
438 sopno esub; /* end sop of subsubRE */
439 char *ssp; /* start of string matched by subsubRE */
440 char *dp;
441 size_t len;
442 int hard;
443 sop s;
444 my_regoff_t offsave;
445 cset *cs;
446
447 AT("back", start, stop, startst, stopst);
448 sp = start;
449
450 /* get as far as we can with easy stuff */
451 hard = 0;
452 for (ss = startst; !hard && ss < stopst; ss++)
453 switch (OP(s = m->g->strip[ss])) {
454 case OCHAR:
455 if (sp == stop || *sp++ != (char)OPND(s))
456 return(NULL);
457 break;
458 case OANY:
459 if (sp == stop)
460 return(NULL);
461 sp++;
462 break;
463 case OANYOF:
464 cs = &m->g->sets[OPND(s)];
465 if (sp == stop || !CHIN(cs, *sp++))
466 return(NULL);
467 break;
468 case OBOL:
469 if ( (sp == m->beginp && !(m->eflags&MY_REG_NOTBOL)) ||
470 (sp < m->endp && *(sp-1) == '\n' &&
471 (m->g->cflags&MY_REG_NEWLINE)) )
472 { /* yes */ }
473 else
474 return(NULL);
475 break;
476 case OEOL:
477 if ( (sp == m->endp && !(m->eflags&MY_REG_NOTEOL)) ||
478 (sp < m->endp && *sp == '\n' &&
479 (m->g->cflags&MY_REG_NEWLINE)) )
480 { /* yes */ }
481 else
482 return(NULL);
483 break;
484 case OBOW:
485 if (( (sp == m->beginp && !(m->eflags&MY_REG_NOTBOL)) ||
486 (sp < m->endp && *(sp-1) == '\n' &&
487 (m->g->cflags&MY_REG_NEWLINE)) ||
488 (sp > m->beginp &&
489 !ISWORD(charset,*(sp-1))) ) &&
490 (sp < m->endp && ISWORD(charset,*sp)) )
491 { /* yes */ }
492 else
493 return(NULL);
494 break;
495 case OEOW:
496 if (( (sp == m->endp && !(m->eflags&MY_REG_NOTEOL)) ||
497 (sp < m->endp && *sp == '\n' &&
498 (m->g->cflags&MY_REG_NEWLINE)) ||
499 (sp < m->endp && !ISWORD(charset,*sp)) ) &&
500 (sp > m->beginp && ISWORD(charset,*(sp-1))) )
501 { /* yes */ }
502 else
503 return(NULL);
504 break;
505 case O_QUEST:
506 break;
507 case OOR1: /* matches null but needs to skip */
508 ss++;
509 s = m->g->strip[ss];
510 do {
511 assert(OP(s) == OOR2);
512 ss += OPND(s);
513 } while (OP(s = m->g->strip[ss]) != O_CH);
514 /* note that the ss++ gets us past the O_CH */
515 break;
516 default: /* have to make a choice */
517 hard = 1;
518 break;
519 }
520 if (!hard) { /* that was it! */
521 if (sp != stop)
522 return(NULL);
523 return(sp);
524 }
525 ss--; /* adjust for the for's final increment */
526
527 /* the hard stuff */
528 AT("hard", sp, stop, ss, stopst);
529 s = m->g->strip[ss];
530 switch (OP(s)) {
531 case OBACK_: /* the vilest depths */
532 i = OPND(s);
533 assert(0 < i && i <= m->g->nsub);
534 if (m->pmatch[i].rm_eo == -1)
535 return(NULL);
536 assert(m->pmatch[i].rm_so != -1);
537 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
538 assert((size_t) (stop - m->beginp) >= len);
539 if (sp > stop - len)
540 return(NULL); /* not enough left to match */
541 ssp = m->offp + m->pmatch[i].rm_so;
542 if (memcmp(sp, ssp, len) != 0)
543 return(NULL);
544 while (m->g->strip[ss] != SOP(O_BACK, i))
545 ss++;
546 return(backref(charset, m, sp+len, stop, ss+1, stopst, lev));
547 break;
548 case OQUEST_: /* to null or not */
549 dp = backref(charset, m, sp, stop, ss+1, stopst, lev);
550 if (dp != NULL)
551 return(dp); /* not */
552 return(backref(charset, m, sp, stop, ss+OPND(s)+1, stopst, lev));
553 break;
554 case OPLUS_:
555 assert(m->lastpos != NULL);
556 assert(lev+1 <= m->g->nplus);
557 m->lastpos[lev+1] = sp;
558 return(backref(charset, m, sp, stop, ss+1, stopst, lev+1));
559 break;
560 case O_PLUS:
561 if (sp == m->lastpos[lev]) /* last pass matched null */
562 return(backref(charset, m, sp, stop, ss+1, stopst, lev-1));
563 /* try another pass */
564 m->lastpos[lev] = sp;
565 dp = backref(charset, m, sp, stop, ss-OPND(s)+1, stopst, lev);
566 if (dp == NULL)
567 return(backref(charset, m, sp, stop, ss+1, stopst, lev-1));
568 else
569 return(dp);
570 break;
571 case OCH_: /* find the right one, if any */
572 ssub = ss + 1;
573 esub = ss + OPND(s) - 1;
574 assert(OP(m->g->strip[esub]) == OOR1);
575 for (;;) { /* find first matching branch */
576 dp = backref(charset, m, sp, stop, ssub, esub, lev);
577 if (dp != NULL)
578 return(dp);
579 /* that one missed, try next one */
580 if (OP(m->g->strip[esub]) == O_CH)
581 return(NULL); /* there is none */
582 esub++;
583 assert(OP(m->g->strip[esub]) == OOR2);
584 ssub = esub + 1;
585 esub += OPND(m->g->strip[esub]);
586 if (OP(m->g->strip[esub]) == OOR2)
587 esub--;
588 else
589 assert(OP(m->g->strip[esub]) == O_CH);
590 }
591 break;
592 case OLPAREN: /* must undo assignment if rest fails */
593 i = OPND(s);
594 assert(0 < i && i <= m->g->nsub);
595 offsave = m->pmatch[i].rm_so;
596 m->pmatch[i].rm_so = sp - m->offp;
597 dp = backref(charset, m, sp, stop, ss+1, stopst, lev);
598 if (dp != NULL)
599 return(dp);
600 m->pmatch[i].rm_so = offsave;
601 return(NULL);
602 break;
603 case ORPAREN: /* must undo assignment if rest fails */
604 i = OPND(s);
605 assert(0 < i && i <= m->g->nsub);
606 offsave = m->pmatch[i].rm_eo;
607 m->pmatch[i].rm_eo = sp - m->offp;
608 dp = backref(charset, m, sp, stop, ss+1, stopst, lev);
609 if (dp != NULL)
610 return(dp);
611 m->pmatch[i].rm_eo = offsave;
612 return(NULL);
613 break;
614 default: /* uh oh */
615 assert(nope);
616 break;
617 }
618
619 /* "can't happen" */
620 assert(nope);
621 /* NOTREACHED */
622 return 0; /* Keep gcc happy */
623 }
624
625 /*
626 - fast - step through the string at top speed
627 == static char *fast(register struct match *m, char *start, \
628 == char *stop, sopno startst, sopno stopst);
629 */
630 static char * /* where tentative match ended, or NULL */
fast(charset,m,start,stop,startst,stopst)631 fast(charset, m, start, stop, startst, stopst)
632 const CHARSET_INFO *charset;
633 struct match *m;
634 char *start;
635 char *stop;
636 sopno startst;
637 sopno stopst;
638 {
639 states st = m->st;
640 states fresh = m->fresh;
641 states tmp = m->tmp;
642 char *p = start;
643 int c = (start == m->beginp) ? OUT : *(start-1);
644 int lastc; /* previous c */
645 int flagch;
646 int i;
647 char *coldp; /* last p after which no match was underway */
648
649 CLEAR(st);
650 SET1(st, startst);
651 st = step(m->g, startst, stopst, st, NOTHING, st);
652 ASSIGN(fresh, st);
653 SP("start", st, *p);
654 coldp = NULL;
655 for (;;) {
656 /* next character */
657 lastc = c;
658 c = (p == m->endp) ? OUT : *p;
659 if (EQ(st, fresh))
660 coldp = p;
661
662 /* is there an EOL and/or BOL between lastc and c? */
663 flagch = '\0';
664 i = 0;
665 if ( (lastc == '\n' && m->g->cflags&MY_REG_NEWLINE) ||
666 (lastc == OUT && !(m->eflags&MY_REG_NOTBOL)) ) {
667 flagch = BOL;
668 i = m->g->nbol;
669 }
670 if ( (c == '\n' && m->g->cflags&MY_REG_NEWLINE) ||
671 (c == OUT && !(m->eflags&MY_REG_NOTEOL)) ) {
672 flagch = (flagch == BOL) ? BOLEOL : EOL;
673 i += m->g->neol;
674 }
675 if (i != 0) {
676 for (; i > 0; i--)
677 st = step(m->g, startst, stopst, st, flagch, st);
678 SP("boleol", st, c);
679 }
680
681 /* how about a word boundary? */
682 if ( (flagch == BOL || (lastc != OUT && !ISWORD(charset,lastc))) &&
683 (c != OUT && ISWORD(charset,c)) ) {
684 flagch = BOW;
685 }
686 if ( (lastc != OUT && ISWORD(charset,lastc)) &&
687 (flagch == EOL || (c != OUT && !ISWORD(charset,c))) ) {
688 flagch = EOW;
689 }
690 if (flagch == BOW || flagch == EOW) {
691 st = step(m->g, startst, stopst, st, flagch, st);
692 SP("boweow", st, c);
693 }
694
695 /* are we done? */
696 if (ISSET(st, stopst) || p == stop)
697 break; /* NOTE BREAK OUT */
698
699 /* no, we must deal with this character */
700 ASSIGN(tmp, st);
701 ASSIGN(st, fresh);
702 assert(c != OUT);
703 st = step(m->g, startst, stopst, tmp, c, st);
704 SP("aft", st, c);
705 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
706 p++;
707 }
708
709 assert(coldp != NULL);
710 m->coldp = coldp;
711 if (ISSET(st, stopst))
712 return(p+1);
713 else
714 return(NULL);
715 }
716
717 /*
718 - slow - step through the string more deliberately
719 == static char *slow(register struct match *m, char *start, \
720 == char *stop, sopno startst, sopno stopst);
721 */
722 static char * /* where it ended */
slow(charset,m,start,stop,startst,stopst)723 slow(charset, m, start, stop, startst, stopst)
724 const CHARSET_INFO *charset;
725 struct match *m;
726 char *start;
727 char *stop;
728 sopno startst;
729 sopno stopst;
730 {
731 states st = m->st;
732 states empty = m->empty;
733 states tmp = m->tmp;
734 char *p = start;
735 int c = (start == m->beginp) ? OUT : *(start-1);
736 int lastc; /* previous c */
737 int flagch;
738 int i;
739 char *matchp; /* last p at which a match ended */
740
741 AT("slow", start, stop, startst, stopst);
742 CLEAR(st);
743 SET1(st, startst);
744 SP("sstart", st, *p);
745 st = step(m->g, startst, stopst, st, NOTHING, st);
746 matchp = NULL;
747 for (;;) {
748 /* next character */
749 lastc = c;
750 c = (p == m->endp) ? OUT : *p;
751
752 /* is there an EOL and/or BOL between lastc and c? */
753 flagch = '\0';
754 i = 0;
755 if ( (lastc == '\n' && m->g->cflags&MY_REG_NEWLINE) ||
756 (lastc == OUT && !(m->eflags&MY_REG_NOTBOL)) ) {
757 flagch = BOL;
758 i = m->g->nbol;
759 }
760 if ( (c == '\n' && m->g->cflags&MY_REG_NEWLINE) ||
761 (c == OUT && !(m->eflags&MY_REG_NOTEOL)) ) {
762 flagch = (flagch == BOL) ? BOLEOL : EOL;
763 i += m->g->neol;
764 }
765 if (i != 0) {
766 for (; i > 0; i--)
767 st = step(m->g, startst, stopst, st, flagch, st);
768 SP("sboleol", st, c);
769 }
770
771 /* how about a word boundary? */
772 if ( (flagch == BOL || (lastc != OUT && !ISWORD(charset,lastc))) &&
773 (c != OUT && ISWORD(charset,c)) ) {
774 flagch = BOW;
775 }
776 if ( (lastc != OUT && ISWORD(charset,lastc)) &&
777 (flagch == EOL || (c != OUT && !ISWORD(charset,c))) ) {
778 flagch = EOW;
779 }
780 if (flagch == BOW || flagch == EOW) {
781 st = step(m->g, startst, stopst, st, flagch, st);
782 SP("sboweow", st, c);
783 }
784
785 /* are we done? */
786 if (ISSET(st, stopst))
787 matchp = p;
788 if (EQ(st, empty) || p == stop)
789 break; /* NOTE BREAK OUT */
790
791 /* no, we must deal with this character */
792 ASSIGN(tmp, st);
793 ASSIGN(st, empty);
794 assert(c != OUT);
795 st = step(m->g, startst, stopst, tmp, c, st);
796 SP("saft", st, c);
797 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
798 p++;
799 }
800
801 return(matchp);
802 }
803
804
805 /*
806 - step - map set of states reachable before char to set reachable after
807 == static states step(register struct re_guts *g, sopno start, sopno stop, \
808 == register states bef, int ch, register states aft);
809 == #define BOL (OUT+1)
810 == #define EOL (BOL+1)
811 == #define BOLEOL (BOL+2)
812 == #define NOTHING (BOL+3)
813 == #define BOW (BOL+4)
814 == #define EOW (BOL+5)
815 == #define CODEMAX (BOL+5) // highest code used
816 == #define NONCHAR(c) ((c) > CHAR_MAX)
817 == #define NNONCHAR (CODEMAX-CHAR_MAX)
818 */
819 static states
step(g,start,stop,bef,ch,aft)820 step(g, start, stop, bef, ch, aft)
821 struct re_guts *g;
822 sopno start; /* start state within strip */
823 sopno stop; /* state after stop state within strip */
824 states bef; /* states reachable before */
825 int ch; /* character or NONCHAR code */
826 states aft; /* states already known reachable after */
827 {
828 cset *cs;
829 sop s;
830 sopno pc;
831 onestate here; /* note, macros know this name */
832 sopno look;
833 onestate i; /* Changed from int by Monty */
834
835 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
836 s = g->strip[pc];
837 switch (OP(s)) {
838 case OEND:
839 assert(pc == stop-1);
840 break;
841 case OCHAR:
842 /* only characters can match */
843 assert(!NONCHAR(ch) || ch != (char)OPND(s));
844 if (ch == (char)OPND(s))
845 FWD(aft, bef, 1);
846 break;
847 case OBOL:
848 if (ch == BOL || ch == BOLEOL)
849 FWD(aft, bef, 1);
850 break;
851 case OEOL:
852 if (ch == EOL || ch == BOLEOL)
853 FWD(aft, bef, 1);
854 break;
855 case OBOW:
856 if (ch == BOW)
857 FWD(aft, bef, 1);
858 break;
859 case OEOW:
860 if (ch == EOW)
861 FWD(aft, bef, 1);
862 break;
863 case OANY:
864 if (!NONCHAR(ch))
865 FWD(aft, bef, 1);
866 break;
867 case OANYOF:
868 cs = &g->sets[OPND(s)];
869 if (!NONCHAR(ch) && CHIN(cs, ch))
870 FWD(aft, bef, 1);
871 break;
872 case OBACK_: /* ignored here */
873 case O_BACK:
874 FWD(aft, aft, 1);
875 break;
876 case OPLUS_: /* forward, this is just an empty */
877 FWD(aft, aft, 1);
878 break;
879 case O_PLUS: /* both forward and back */
880 FWD(aft, aft, 1);
881 i = ISSETBACK(aft, OPND(s));
882 BACK(aft, aft, OPND(s));
883 if (!i && ISSETBACK(aft, OPND(s))) {
884 /* oho, must reconsider loop body */
885 pc -= OPND(s) + 1;
886 INIT(here, pc);
887 }
888 break;
889 case OQUEST_: /* two branches, both forward */
890 FWD(aft, aft, 1);
891 FWD(aft, aft, OPND(s));
892 break;
893 case O_QUEST: /* just an empty */
894 FWD(aft, aft, 1);
895 break;
896 case OLPAREN: /* not significant here */
897 case ORPAREN:
898 FWD(aft, aft, 1);
899 break;
900 case OCH_: /* mark the first two branches */
901 FWD(aft, aft, 1);
902 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
903 FWD(aft, aft, OPND(s));
904 break;
905 case OOR1: /* done a branch, find the O_CH */
906 if (ISSTATEIN(aft, here)) {
907 for (look = 1;
908 OP(s = g->strip[pc+look]) != O_CH;
909 look += OPND(s))
910 assert(OP(s) == OOR2);
911 FWD(aft, aft, look);
912 }
913 break;
914 case OOR2: /* propagate OCH_'s marking */
915 FWD(aft, aft, 1);
916 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
917 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
918 FWD(aft, aft, OPND(s));
919 }
920 break;
921 case O_CH: /* just empty */
922 FWD(aft, aft, 1);
923 break;
924 default: /* ooooops... */
925 assert(nope);
926 break;
927 }
928 }
929
930 return(aft);
931 }
932
933 #ifdef REDEBUG
934 /*
935 - print - print a set of states
936 == #ifdef REDEBUG
937 == static void print(struct match *m, char *caption, states st, \
938 == int ch, FILE *d);
939 == #endif
940 */
941 static void
print(m,caption,st,ch,d)942 print(m, caption, st, ch, d)
943 struct match *m;
944 char *caption;
945 states st;
946 int ch;
947 FILE *d;
948 {
949 struct re_guts *g = m->g;
950 int i;
951 int first = 1;
952 char buf[10];
953
954 if (!(m->eflags&MY_REG_TRACE))
955 return;
956
957 fprintf(d, "%s", caption);
958 if (ch != '\0')
959 fprintf(d, " %s", printchar(ch,buf));
960 for (i = 0; i < g->nstates; i++)
961 if (ISSET(st, i)) {
962 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
963 first = 0;
964 }
965 fprintf(d, "\n");
966 }
967
968 /*
969 - at - print current situation
970 == #ifdef REDEBUG
971 == static void at(struct match *m, char *title, char *start, char *stop, \
972 == sopno startst, sopno stopst);
973 == #endif
974 */
975 static void
at(m,title,start,stop,startst,stopst)976 at(m, title, start, stop, startst, stopst)
977 struct match *m;
978 char *title;
979 char *start;
980 char *stop;
981 sopno startst;
982 sopno stopst;
983 {
984 char buf[10];
985 if (!(m->eflags&MY_REG_TRACE))
986 return;
987
988 printf("%s %s-", title, printchar(*start,buf));
989 printf("%s ", printchar(*stop,buf));
990 printf("%ld-%ld\n", (long)startst, (long)stopst,buf);
991 }
992
993 #ifndef PCHARDONE
994 #define PCHARDONE /* never again */
995 /*
996 - printchar - make a character printable
997 == #ifdef REDEBUG
998 == static char *printchar(int ch);
999 == #endif
1000 *
1001 * Is this identical to regchar() over in debug.c? Well, yes. But a
1002 * duplicate here avoids having a debugging-capable regexec.o tied to
1003 * a matching debug.o, and this is convenient. It all disappears in
1004 * the non-debug compilation anyway, so it doesn't matter much.
1005 */
1006 static char * /* -> representation */
printchar(ch,pbuf)1007 printchar(ch,pbuf)
1008 int ch;
1009 char *pbuf;
1010 {
1011 if (isprint(ch) || ch == ' ')
1012 sprintf(pbuf, "%c", ch);
1013 else
1014 sprintf(pbuf, "\\%o", ch);
1015 return(pbuf);
1016 }
1017 #endif
1018 #endif
1019
1020 #undef matcher
1021 #undef fast
1022 #undef slow
1023 #undef dissect
1024 #undef backref
1025 #undef step
1026 #undef print
1027 #undef at
1028 #undef match
1029