1 /*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * %sccs.include.redist.c%
10 *
11 * @(#)engine.c 8.5 (Berkeley) 03/20/94
12 */
13
14 /*
15 * The matching engine and friends. This file is #included by regexec.c
16 * after suitable #defines of a variety of macros used herein, so that
17 * different state representations can be used without duplicating masses
18 * of code.
19 */
20
21 #ifdef SNAMES
22 #define matcher smatcher
23 #define fast sfast
24 #define slow sslow
25 #define dissect sdissect
26 #define backref sbackref
27 #define step sstep
28 #define print sprint
29 #define at sat
30 #define match smat
31 #endif
32 #ifdef LNAMES
33 #define matcher lmatcher
34 #define fast lfast
35 #define slow lslow
36 #define dissect ldissect
37 #define backref lbackref
38 #define step lstep
39 #define print lprint
40 #define at lat
41 #define match lmat
42 #endif
43
44 /* another structure passed up and down to avoid zillions of parameters */
45 struct match {
46 struct re_guts *g;
47 int eflags;
48 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
49 char *offp; /* offsets work from here */
50 char *beginp; /* start of string -- virtual NUL precedes */
51 char *endp; /* end of string -- virtual NUL here */
52 char *coldp; /* can be no match starting before here */
53 char **lastpos; /* [nplus+1] */
54 STATEVARS;
55 states st; /* current states */
56 states fresh; /* states for a fresh start */
57 states tmp; /* temporary */
58 states empty; /* empty set of states */
59 };
60
61 /* ========= begin header generated by ./mkh ========= */
62 #ifdef __cplusplus
63 extern "C" {
64 #endif
65
66 /* === engine.c === */
67 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
68 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
69 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
70 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
71 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
72 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
73 #define BOL (OUT+1)
74 #define EOL (BOL+1)
75 #define BOLEOL (BOL+2)
76 #define NOTHING (BOL+3)
77 #define BOW (BOL+4)
78 #define EOW (BOL+5)
79 #define CODEMAX (BOL+5) /* highest code used */
80 #define NONCHAR(c) ((c) > CHAR_MAX)
81 #define NNONCHAR (CODEMAX-CHAR_MAX)
82 #ifdef REDEBUG
83 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
84 #endif
85 #ifdef REDEBUG
86 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
87 #endif
88 #ifdef REDEBUG
89 static char *pchar __P((int ch));
90 #endif
91
92 #ifdef __cplusplus
93 }
94 #endif
95 /* ========= end header generated by ./mkh ========= */
96
97 #ifdef REDEBUG
98 #define SP(t, s, c) print(m, t, s, c, stdout)
99 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
100 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
101 #else
102 #define SP(t, s, c) /* nothing */
103 #define AT(t, p1, p2, s1, s2) /* nothing */
104 #define NOTE(s) /* nothing */
105 #endif
106
107 /*
108 - matcher - the actual matching engine
109 == static int matcher(register struct re_guts *g, char *string, \
110 == size_t nmatch, regmatch_t pmatch[], int eflags);
111 */
112 static int /* 0 success, REG_NOMATCH failure */
matcher(g,string,nmatch,pmatch,eflags)113 matcher(g, string, nmatch, pmatch, eflags)
114 register struct re_guts *g;
115 char *string;
116 size_t nmatch;
117 regmatch_t pmatch[];
118 int eflags;
119 {
120 register char *endp;
121 register int i;
122 struct match mv;
123 register struct match *m = &mv;
124 register char *dp;
125 const register sopno gf = g->firststate+1; /* +1 for OEND */
126 const register sopno gl = g->laststate;
127 char *start;
128 char *stop;
129
130 /* simplify the situation where possible */
131 if (g->cflags®_NOSUB)
132 nmatch = 0;
133 if (eflags®_STARTEND) {
134 start = string + pmatch[0].rm_so;
135 stop = string + pmatch[0].rm_eo;
136 } else {
137 start = string;
138 stop = start + strlen(start);
139 }
140 if (stop < start)
141 return(REG_INVARG);
142
143 /* prescreening; this does wonders for this rather slow code */
144 if (g->must != NULL) {
145 for (dp = start; dp < stop; dp++)
146 if (*dp == g->must[0] && stop - dp >= g->mlen &&
147 memcmp(dp, g->must, (size_t)g->mlen) == 0)
148 break;
149 if (dp == stop) /* we didn't find g->must */
150 return(REG_NOMATCH);
151 }
152
153 /* match struct setup */
154 m->g = g;
155 m->eflags = eflags;
156 m->pmatch = NULL;
157 m->lastpos = NULL;
158 m->offp = string;
159 m->beginp = start;
160 m->endp = stop;
161 STATESETUP(m, 4);
162 SETUP(m->st);
163 SETUP(m->fresh);
164 SETUP(m->tmp);
165 SETUP(m->empty);
166 CLEAR(m->empty);
167
168 /* this loop does only one repetition except for backrefs */
169 for (;;) {
170 endp = fast(m, start, stop, gf, gl);
171 if (endp == NULL) { /* a miss */
172 STATETEARDOWN(m);
173 return(REG_NOMATCH);
174 }
175 if (nmatch == 0 && !g->backrefs)
176 break; /* no further info needed */
177
178 /* where? */
179 assert(m->coldp != NULL);
180 for (;;) {
181 NOTE("finding start");
182 endp = slow(m, m->coldp, stop, gf, gl);
183 if (endp != NULL)
184 break;
185 assert(m->coldp < m->endp);
186 m->coldp++;
187 }
188 if (nmatch == 1 && !g->backrefs)
189 break; /* no further info needed */
190
191 /* oh my, he wants the subexpressions... */
192 if (m->pmatch == NULL)
193 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
194 sizeof(regmatch_t));
195 if (m->pmatch == NULL) {
196 STATETEARDOWN(m);
197 return(REG_ESPACE);
198 }
199 for (i = 1; i <= m->g->nsub; i++)
200 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
201 if (!g->backrefs && !(m->eflags®_BACKR)) {
202 NOTE("dissecting");
203 dp = dissect(m, m->coldp, endp, gf, gl);
204 } else {
205 if (g->nplus > 0 && m->lastpos == NULL)
206 m->lastpos = (char **)malloc((g->nplus+1) *
207 sizeof(char *));
208 if (g->nplus > 0 && m->lastpos == NULL) {
209 free(m->pmatch);
210 STATETEARDOWN(m);
211 return(REG_ESPACE);
212 }
213 NOTE("backref dissect");
214 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
215 }
216 if (dp != NULL)
217 break;
218
219 /* uh-oh... we couldn't find a subexpression-level match */
220 assert(g->backrefs); /* must be back references doing it */
221 assert(g->nplus == 0 || m->lastpos != NULL);
222 for (;;) {
223 if (dp != NULL || endp <= m->coldp)
224 break; /* defeat */
225 NOTE("backoff");
226 endp = slow(m, m->coldp, endp-1, gf, gl);
227 if (endp == NULL)
228 break; /* defeat */
229 /* try it on a shorter possibility */
230 #ifndef NDEBUG
231 for (i = 1; i <= m->g->nsub; i++) {
232 assert(m->pmatch[i].rm_so == -1);
233 assert(m->pmatch[i].rm_eo == -1);
234 }
235 #endif
236 NOTE("backoff dissect");
237 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
238 }
239 assert(dp == NULL || dp == endp);
240 if (dp != NULL) /* found a shorter one */
241 break;
242
243 /* despite initial appearances, there is no match here */
244 NOTE("false alarm");
245 start = m->coldp + 1; /* recycle starting later */
246 assert(start <= stop);
247 }
248
249 /* fill in the details if requested */
250 if (nmatch > 0) {
251 pmatch[0].rm_so = m->coldp - m->offp;
252 pmatch[0].rm_eo = endp - m->offp;
253 }
254 if (nmatch > 1) {
255 assert(m->pmatch != NULL);
256 for (i = 1; i < nmatch; i++)
257 if (i <= m->g->nsub)
258 pmatch[i] = m->pmatch[i];
259 else {
260 pmatch[i].rm_so = -1;
261 pmatch[i].rm_eo = -1;
262 }
263 }
264
265 if (m->pmatch != NULL)
266 free((char *)m->pmatch);
267 if (m->lastpos != NULL)
268 free((char *)m->lastpos);
269 STATETEARDOWN(m);
270 return(0);
271 }
272
273 /*
274 - dissect - figure out what matched what, no back references
275 == static char *dissect(register struct match *m, char *start, \
276 == char *stop, sopno startst, sopno stopst);
277 */
278 static char * /* == stop (success) always */
dissect(m,start,stop,startst,stopst)279 dissect(m, start, stop, startst, stopst)
280 register struct match *m;
281 char *start;
282 char *stop;
283 sopno startst;
284 sopno stopst;
285 {
286 register int i;
287 register sopno ss; /* start sop of current subRE */
288 register sopno es; /* end sop of current subRE */
289 register char *sp; /* start of string matched by it */
290 register char *stp; /* string matched by it cannot pass here */
291 register char *rest; /* start of rest of string */
292 register char *tail; /* string unmatched by rest of RE */
293 register sopno ssub; /* start sop of subsubRE */
294 register sopno esub; /* end sop of subsubRE */
295 register char *ssp; /* start of string matched by subsubRE */
296 register char *sep; /* end of string matched by subsubRE */
297 register char *oldssp; /* previous ssp */
298 register char *dp;
299
300 AT("diss", start, stop, startst, stopst);
301 sp = start;
302 for (ss = startst; ss < stopst; ss = es) {
303 /* identify end of subRE */
304 es = ss;
305 switch (OP(m->g->strip[es])) {
306 case OPLUS_:
307 case OQUEST_:
308 es += OPND(m->g->strip[es]);
309 break;
310 case OCH_:
311 while (OP(m->g->strip[es]) != O_CH)
312 es += OPND(m->g->strip[es]);
313 break;
314 }
315 es++;
316
317 /* figure out what it matched */
318 switch (OP(m->g->strip[ss])) {
319 case OEND:
320 assert(nope);
321 break;
322 case OCHAR:
323 sp++;
324 break;
325 case OBOL:
326 case OEOL:
327 case OBOW:
328 case OEOW:
329 break;
330 case OANY:
331 case OANYOF:
332 sp++;
333 break;
334 case OBACK_:
335 case O_BACK:
336 assert(nope);
337 break;
338 /* cases where length of match is hard to find */
339 case OQUEST_:
340 stp = stop;
341 for (;;) {
342 /* how long could this one be? */
343 rest = slow(m, sp, stp, ss, es);
344 assert(rest != NULL); /* it did match */
345 /* could the rest match the rest? */
346 tail = slow(m, rest, stop, es, stopst);
347 if (tail == stop)
348 break; /* yes! */
349 /* no -- try a shorter match for this one */
350 stp = rest - 1;
351 assert(stp >= sp); /* it did work */
352 }
353 ssub = ss + 1;
354 esub = es - 1;
355 /* did innards match? */
356 if (slow(m, sp, rest, ssub, esub) != NULL) {
357 dp = dissect(m, sp, rest, ssub, esub);
358 assert(dp == rest);
359 } else /* no */
360 assert(sp == rest);
361 sp = rest;
362 break;
363 case OPLUS_:
364 stp = stop;
365 for (;;) {
366 /* how long could this one be? */
367 rest = slow(m, sp, stp, ss, es);
368 assert(rest != NULL); /* it did match */
369 /* could the rest match the rest? */
370 tail = slow(m, rest, stop, es, stopst);
371 if (tail == stop)
372 break; /* yes! */
373 /* no -- try a shorter match for this one */
374 stp = rest - 1;
375 assert(stp >= sp); /* it did work */
376 }
377 ssub = ss + 1;
378 esub = es - 1;
379 ssp = sp;
380 oldssp = ssp;
381 for (;;) { /* find last match of innards */
382 sep = slow(m, ssp, rest, ssub, esub);
383 if (sep == NULL || sep == ssp)
384 break; /* failed or matched null */
385 oldssp = ssp; /* on to next try */
386 ssp = sep;
387 }
388 if (sep == NULL) {
389 /* last successful match */
390 sep = ssp;
391 ssp = oldssp;
392 }
393 assert(sep == rest); /* must exhaust substring */
394 assert(slow(m, ssp, sep, ssub, esub) == rest);
395 dp = dissect(m, ssp, sep, ssub, esub);
396 assert(dp == sep);
397 sp = rest;
398 break;
399 case OCH_:
400 stp = stop;
401 for (;;) {
402 /* how long could this one be? */
403 rest = slow(m, sp, stp, ss, es);
404 assert(rest != NULL); /* it did match */
405 /* could the rest match the rest? */
406 tail = slow(m, rest, stop, es, stopst);
407 if (tail == stop)
408 break; /* yes! */
409 /* no -- try a shorter match for this one */
410 stp = rest - 1;
411 assert(stp >= sp); /* it did work */
412 }
413 ssub = ss + 1;
414 esub = ss + OPND(m->g->strip[ss]) - 1;
415 assert(OP(m->g->strip[esub]) == OOR1);
416 for (;;) { /* find first matching branch */
417 if (slow(m, sp, rest, ssub, esub) == rest)
418 break; /* it matched all of it */
419 /* that one missed, try next one */
420 assert(OP(m->g->strip[esub]) == OOR1);
421 esub++;
422 assert(OP(m->g->strip[esub]) == OOR2);
423 ssub = esub + 1;
424 esub += OPND(m->g->strip[esub]);
425 if (OP(m->g->strip[esub]) == OOR2)
426 esub--;
427 else
428 assert(OP(m->g->strip[esub]) == O_CH);
429 }
430 dp = dissect(m, sp, rest, ssub, esub);
431 assert(dp == rest);
432 sp = rest;
433 break;
434 case O_PLUS:
435 case O_QUEST:
436 case OOR1:
437 case OOR2:
438 case O_CH:
439 assert(nope);
440 break;
441 case OLPAREN:
442 i = OPND(m->g->strip[ss]);
443 assert(0 < i && i <= m->g->nsub);
444 m->pmatch[i].rm_so = sp - m->offp;
445 break;
446 case ORPAREN:
447 i = OPND(m->g->strip[ss]);
448 assert(0 < i && i <= m->g->nsub);
449 m->pmatch[i].rm_eo = sp - m->offp;
450 break;
451 default: /* uh oh */
452 assert(nope);
453 break;
454 }
455 }
456
457 assert(sp == stop);
458 return(sp);
459 }
460
461 /*
462 - backref - figure out what matched what, figuring in back references
463 == static char *backref(register struct match *m, char *start, \
464 == char *stop, sopno startst, sopno stopst, sopno lev);
465 */
466 static char * /* == stop (success) or NULL (failure) */
backref(m,start,stop,startst,stopst,lev)467 backref(m, start, stop, startst, stopst, lev)
468 register struct match *m;
469 char *start;
470 char *stop;
471 sopno startst;
472 sopno stopst;
473 sopno lev; /* PLUS nesting level */
474 {
475 register int i;
476 register sopno ss; /* start sop of current subRE */
477 register char *sp; /* start of string matched by it */
478 register sopno ssub; /* start sop of subsubRE */
479 register sopno esub; /* end sop of subsubRE */
480 register char *ssp; /* start of string matched by subsubRE */
481 register char *dp;
482 register size_t len;
483 register int hard;
484 register sop s;
485 register regoff_t offsave;
486 register cset *cs;
487
488 AT("back", start, stop, startst, stopst);
489 sp = start;
490
491 /* get as far as we can with easy stuff */
492 hard = 0;
493 for (ss = startst; !hard && ss < stopst; ss++)
494 switch (OP(s = m->g->strip[ss])) {
495 case OCHAR:
496 if (sp == stop || *sp++ != (char)OPND(s))
497 return(NULL);
498 break;
499 case OANY:
500 if (sp == stop)
501 return(NULL);
502 sp++;
503 break;
504 case OANYOF:
505 cs = &m->g->sets[OPND(s)];
506 if (sp == stop || !CHIN(cs, *sp++))
507 return(NULL);
508 break;
509 case OBOL:
510 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
511 (sp < m->endp && *(sp-1) == '\n' &&
512 (m->g->cflags®_NEWLINE)) )
513 { /* yes */ }
514 else
515 return(NULL);
516 break;
517 case OEOL:
518 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
519 (sp < m->endp && *sp == '\n' &&
520 (m->g->cflags®_NEWLINE)) )
521 { /* yes */ }
522 else
523 return(NULL);
524 break;
525 case OBOW:
526 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
527 (sp < m->endp && *(sp-1) == '\n' &&
528 (m->g->cflags®_NEWLINE)) ||
529 (sp > m->beginp &&
530 !ISWORD(*(sp-1))) ) &&
531 (sp < m->endp && ISWORD(*sp)) )
532 { /* yes */ }
533 else
534 return(NULL);
535 break;
536 case OEOW:
537 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
538 (sp < m->endp && *sp == '\n' &&
539 (m->g->cflags®_NEWLINE)) ||
540 (sp < m->endp && !ISWORD(*sp)) ) &&
541 (sp > m->beginp && ISWORD(*(sp-1))) )
542 { /* yes */ }
543 else
544 return(NULL);
545 break;
546 case O_QUEST:
547 break;
548 case OOR1: /* matches null but needs to skip */
549 ss++;
550 s = m->g->strip[ss];
551 do {
552 assert(OP(s) == OOR2);
553 ss += OPND(s);
554 } while (OP(s = m->g->strip[ss]) != O_CH);
555 /* note that the ss++ gets us past the O_CH */
556 break;
557 default: /* have to make a choice */
558 hard = 1;
559 break;
560 }
561 if (!hard) { /* that was it! */
562 if (sp != stop)
563 return(NULL);
564 return(sp);
565 }
566 ss--; /* adjust for the for's final increment */
567
568 /* the hard stuff */
569 AT("hard", sp, stop, ss, stopst);
570 s = m->g->strip[ss];
571 switch (OP(s)) {
572 case OBACK_: /* the vilest depths */
573 i = OPND(s);
574 assert(0 < i && i <= m->g->nsub);
575 if (m->pmatch[i].rm_eo == -1)
576 return(NULL);
577 assert(m->pmatch[i].rm_so != -1);
578 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
579 assert(stop - m->beginp >= len);
580 if (sp > stop - len)
581 return(NULL); /* not enough left to match */
582 ssp = m->offp + m->pmatch[i].rm_so;
583 if (memcmp(sp, ssp, len) != 0)
584 return(NULL);
585 while (m->g->strip[ss] != SOP(O_BACK, i))
586 ss++;
587 return(backref(m, sp+len, stop, ss+1, stopst, lev));
588 break;
589 case OQUEST_: /* to null or not */
590 dp = backref(m, sp, stop, ss+1, stopst, lev);
591 if (dp != NULL)
592 return(dp); /* not */
593 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
594 break;
595 case OPLUS_:
596 assert(m->lastpos != NULL);
597 assert(lev+1 <= m->g->nplus);
598 m->lastpos[lev+1] = sp;
599 return(backref(m, sp, stop, ss+1, stopst, lev+1));
600 break;
601 case O_PLUS:
602 if (sp == m->lastpos[lev]) /* last pass matched null */
603 return(backref(m, sp, stop, ss+1, stopst, lev-1));
604 /* try another pass */
605 m->lastpos[lev] = sp;
606 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
607 if (dp == NULL)
608 return(backref(m, sp, stop, ss+1, stopst, lev-1));
609 else
610 return(dp);
611 break;
612 case OCH_: /* find the right one, if any */
613 ssub = ss + 1;
614 esub = ss + OPND(s) - 1;
615 assert(OP(m->g->strip[esub]) == OOR1);
616 for (;;) { /* find first matching branch */
617 dp = backref(m, sp, stop, ssub, esub, lev);
618 if (dp != NULL)
619 return(dp);
620 /* that one missed, try next one */
621 if (OP(m->g->strip[esub]) == O_CH)
622 return(NULL); /* there is none */
623 esub++;
624 assert(OP(m->g->strip[esub]) == OOR2);
625 ssub = esub + 1;
626 esub += OPND(m->g->strip[esub]);
627 if (OP(m->g->strip[esub]) == OOR2)
628 esub--;
629 else
630 assert(OP(m->g->strip[esub]) == O_CH);
631 }
632 break;
633 case OLPAREN: /* must undo assignment if rest fails */
634 i = OPND(s);
635 assert(0 < i && i <= m->g->nsub);
636 offsave = m->pmatch[i].rm_so;
637 m->pmatch[i].rm_so = sp - m->offp;
638 dp = backref(m, sp, stop, ss+1, stopst, lev);
639 if (dp != NULL)
640 return(dp);
641 m->pmatch[i].rm_so = offsave;
642 return(NULL);
643 break;
644 case ORPAREN: /* must undo assignment if rest fails */
645 i = OPND(s);
646 assert(0 < i && i <= m->g->nsub);
647 offsave = m->pmatch[i].rm_eo;
648 m->pmatch[i].rm_eo = sp - m->offp;
649 dp = backref(m, sp, stop, ss+1, stopst, lev);
650 if (dp != NULL)
651 return(dp);
652 m->pmatch[i].rm_eo = offsave;
653 return(NULL);
654 break;
655 default: /* uh oh */
656 assert(nope);
657 break;
658 }
659
660 /* "can't happen" */
661 assert(nope);
662 /* NOTREACHED */
663 }
664
665 /*
666 - fast - step through the string at top speed
667 == static char *fast(register struct match *m, char *start, \
668 == char *stop, sopno startst, sopno stopst);
669 */
670 static char * /* where tentative match ended, or NULL */
fast(m,start,stop,startst,stopst)671 fast(m, start, stop, startst, stopst)
672 register struct match *m;
673 char *start;
674 char *stop;
675 sopno startst;
676 sopno stopst;
677 {
678 register states st = m->st;
679 register states fresh = m->fresh;
680 register states tmp = m->tmp;
681 register char *p = start;
682 register int c = (start == m->beginp) ? OUT : *(start-1);
683 register int lastc; /* previous c */
684 register int flagch;
685 register int i;
686 register char *coldp; /* last p after which no match was underway */
687
688 CLEAR(st);
689 SET1(st, startst);
690 st = step(m->g, startst, stopst, st, NOTHING, st);
691 ASSIGN(fresh, st);
692 SP("start", st, *p);
693 coldp = NULL;
694 for (;;) {
695 /* next character */
696 lastc = c;
697 c = (p == m->endp) ? OUT : *p;
698 if (EQ(st, fresh))
699 coldp = p;
700
701 /* is there an EOL and/or BOL between lastc and c? */
702 flagch = '\0';
703 i = 0;
704 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
705 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
706 flagch = BOL;
707 i = m->g->nbol;
708 }
709 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
710 (c == OUT && !(m->eflags®_NOTEOL)) ) {
711 flagch = (flagch == BOL) ? BOLEOL : EOL;
712 i += m->g->neol;
713 }
714 if (i != 0) {
715 for (; i > 0; i--)
716 st = step(m->g, startst, stopst, st, flagch, st);
717 SP("boleol", st, c);
718 }
719
720 /* how about a word boundary? */
721 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
722 (c != OUT && ISWORD(c)) ) {
723 flagch = BOW;
724 }
725 if ( (lastc != OUT && ISWORD(lastc)) &&
726 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
727 flagch = EOW;
728 }
729 if (flagch == BOW || flagch == EOW) {
730 st = step(m->g, startst, stopst, st, flagch, st);
731 SP("boweow", st, c);
732 }
733
734 /* are we done? */
735 if (ISSET(st, stopst) || p == stop)
736 break; /* NOTE BREAK OUT */
737
738 /* no, we must deal with this character */
739 ASSIGN(tmp, st);
740 ASSIGN(st, fresh);
741 assert(c != OUT);
742 st = step(m->g, startst, stopst, tmp, c, st);
743 SP("aft", st, c);
744 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
745 p++;
746 }
747
748 assert(coldp != NULL);
749 m->coldp = coldp;
750 if (ISSET(st, stopst))
751 return(p+1);
752 else
753 return(NULL);
754 }
755
756 /*
757 - slow - step through the string more deliberately
758 == static char *slow(register struct match *m, char *start, \
759 == char *stop, sopno startst, sopno stopst);
760 */
761 static char * /* where it ended */
slow(m,start,stop,startst,stopst)762 slow(m, start, stop, startst, stopst)
763 register struct match *m;
764 char *start;
765 char *stop;
766 sopno startst;
767 sopno stopst;
768 {
769 register states st = m->st;
770 register states empty = m->empty;
771 register states tmp = m->tmp;
772 register char *p = start;
773 register int c = (start == m->beginp) ? OUT : *(start-1);
774 register int lastc; /* previous c */
775 register int flagch;
776 register int i;
777 register char *matchp; /* last p at which a match ended */
778
779 AT("slow", start, stop, startst, stopst);
780 CLEAR(st);
781 SET1(st, startst);
782 SP("sstart", st, *p);
783 st = step(m->g, startst, stopst, st, NOTHING, st);
784 matchp = NULL;
785 for (;;) {
786 /* next character */
787 lastc = c;
788 c = (p == m->endp) ? OUT : *p;
789
790 /* is there an EOL and/or BOL between lastc and c? */
791 flagch = '\0';
792 i = 0;
793 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
794 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
795 flagch = BOL;
796 i = m->g->nbol;
797 }
798 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
799 (c == OUT && !(m->eflags®_NOTEOL)) ) {
800 flagch = (flagch == BOL) ? BOLEOL : EOL;
801 i += m->g->neol;
802 }
803 if (i != 0) {
804 for (; i > 0; i--)
805 st = step(m->g, startst, stopst, st, flagch, st);
806 SP("sboleol", st, c);
807 }
808
809 /* how about a word boundary? */
810 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
811 (c != OUT && ISWORD(c)) ) {
812 flagch = BOW;
813 }
814 if ( (lastc != OUT && ISWORD(lastc)) &&
815 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
816 flagch = EOW;
817 }
818 if (flagch == BOW || flagch == EOW) {
819 st = step(m->g, startst, stopst, st, flagch, st);
820 SP("sboweow", st, c);
821 }
822
823 /* are we done? */
824 if (ISSET(st, stopst))
825 matchp = p;
826 if (EQ(st, empty) || p == stop)
827 break; /* NOTE BREAK OUT */
828
829 /* no, we must deal with this character */
830 ASSIGN(tmp, st);
831 ASSIGN(st, empty);
832 assert(c != OUT);
833 st = step(m->g, startst, stopst, tmp, c, st);
834 SP("saft", st, c);
835 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
836 p++;
837 }
838
839 return(matchp);
840 }
841
842
843 /*
844 - step - map set of states reachable before char to set reachable after
845 == static states step(register struct re_guts *g, sopno start, sopno stop, \
846 == register states bef, int ch, register states aft);
847 == #define BOL (OUT+1)
848 == #define EOL (BOL+1)
849 == #define BOLEOL (BOL+2)
850 == #define NOTHING (BOL+3)
851 == #define BOW (BOL+4)
852 == #define EOW (BOL+5)
853 == #define CODEMAX (BOL+5) // highest code used
854 == #define NONCHAR(c) ((c) > CHAR_MAX)
855 == #define NNONCHAR (CODEMAX-CHAR_MAX)
856 */
857 static states
step(g,start,stop,bef,ch,aft)858 step(g, start, stop, bef, ch, aft)
859 register struct re_guts *g;
860 sopno start; /* start state within strip */
861 sopno stop; /* state after stop state within strip */
862 register states bef; /* states reachable before */
863 int ch; /* character or NONCHAR code */
864 register states aft; /* states already known reachable after */
865 {
866 register cset *cs;
867 register sop s;
868 register sopno pc;
869 register onestate here; /* note, macros know this name */
870 register sopno look;
871 register int i;
872
873 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
874 s = g->strip[pc];
875 switch (OP(s)) {
876 case OEND:
877 assert(pc == stop-1);
878 break;
879 case OCHAR:
880 /* only characters can match */
881 assert(!NONCHAR(ch) || ch != (char)OPND(s));
882 if (ch == (char)OPND(s))
883 FWD(aft, bef, 1);
884 break;
885 case OBOL:
886 if (ch == BOL || ch == BOLEOL)
887 FWD(aft, bef, 1);
888 break;
889 case OEOL:
890 if (ch == EOL || ch == BOLEOL)
891 FWD(aft, bef, 1);
892 break;
893 case OBOW:
894 if (ch == BOW)
895 FWD(aft, bef, 1);
896 break;
897 case OEOW:
898 if (ch == EOW)
899 FWD(aft, bef, 1);
900 break;
901 case OANY:
902 if (!NONCHAR(ch))
903 FWD(aft, bef, 1);
904 break;
905 case OANYOF:
906 cs = &g->sets[OPND(s)];
907 if (!NONCHAR(ch) && CHIN(cs, ch))
908 FWD(aft, bef, 1);
909 break;
910 case OBACK_: /* ignored here */
911 case O_BACK:
912 FWD(aft, aft, 1);
913 break;
914 case OPLUS_: /* forward, this is just an empty */
915 FWD(aft, aft, 1);
916 break;
917 case O_PLUS: /* both forward and back */
918 FWD(aft, aft, 1);
919 i = ISSETBACK(aft, OPND(s));
920 BACK(aft, aft, OPND(s));
921 if (!i && ISSETBACK(aft, OPND(s))) {
922 /* oho, must reconsider loop body */
923 pc -= OPND(s) + 1;
924 INIT(here, pc);
925 }
926 break;
927 case OQUEST_: /* two branches, both forward */
928 FWD(aft, aft, 1);
929 FWD(aft, aft, OPND(s));
930 break;
931 case O_QUEST: /* just an empty */
932 FWD(aft, aft, 1);
933 break;
934 case OLPAREN: /* not significant here */
935 case ORPAREN:
936 FWD(aft, aft, 1);
937 break;
938 case OCH_: /* mark the first two branches */
939 FWD(aft, aft, 1);
940 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
941 FWD(aft, aft, OPND(s));
942 break;
943 case OOR1: /* done a branch, find the O_CH */
944 if (ISSTATEIN(aft, here)) {
945 for (look = 1;
946 OP(s = g->strip[pc+look]) != O_CH;
947 look += OPND(s))
948 assert(OP(s) == OOR2);
949 FWD(aft, aft, look);
950 }
951 break;
952 case OOR2: /* propagate OCH_'s marking */
953 FWD(aft, aft, 1);
954 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
955 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
956 FWD(aft, aft, OPND(s));
957 }
958 break;
959 case O_CH: /* just empty */
960 FWD(aft, aft, 1);
961 break;
962 default: /* ooooops... */
963 assert(nope);
964 break;
965 }
966 }
967
968 return(aft);
969 }
970
971 #ifdef REDEBUG
972 /*
973 - print - print a set of states
974 == #ifdef REDEBUG
975 == static void print(struct match *m, char *caption, states st, \
976 == int ch, FILE *d);
977 == #endif
978 */
979 static void
print(m,caption,st,ch,d)980 print(m, caption, st, ch, d)
981 struct match *m;
982 char *caption;
983 states st;
984 int ch;
985 FILE *d;
986 {
987 register struct re_guts *g = m->g;
988 register int i;
989 register int first = 1;
990
991 if (!(m->eflags®_TRACE))
992 return;
993
994 fprintf(d, "%s", caption);
995 if (ch != '\0')
996 fprintf(d, " %s", pchar(ch));
997 for (i = 0; i < g->nstates; i++)
998 if (ISSET(st, i)) {
999 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1000 first = 0;
1001 }
1002 fprintf(d, "\n");
1003 }
1004
1005 /*
1006 - at - print current situation
1007 == #ifdef REDEBUG
1008 == static void at(struct match *m, char *title, char *start, char *stop, \
1009 == sopno startst, sopno stopst);
1010 == #endif
1011 */
1012 static void
at(m,title,start,stop,startst,stopst)1013 at(m, title, start, stop, startst, stopst)
1014 struct match *m;
1015 char *title;
1016 char *start;
1017 char *stop;
1018 sopno startst;
1019 sopno stopst;
1020 {
1021 if (!(m->eflags®_TRACE))
1022 return;
1023
1024 printf("%s %s-", title, pchar(*start));
1025 printf("%s ", pchar(*stop));
1026 printf("%ld-%ld\n", (long)startst, (long)stopst);
1027 }
1028
1029 #ifndef PCHARDONE
1030 #define PCHARDONE /* never again */
1031 /*
1032 - pchar - make a character printable
1033 == #ifdef REDEBUG
1034 == static char *pchar(int ch);
1035 == #endif
1036 *
1037 * Is this identical to regchar() over in debug.c? Well, yes. But a
1038 * duplicate here avoids having a debugging-capable regexec.o tied to
1039 * a matching debug.o, and this is convenient. It all disappears in
1040 * the non-debug compilation anyway, so it doesn't matter much.
1041 */
1042 static char * /* -> representation */
pchar(ch)1043 pchar(ch)
1044 int ch;
1045 {
1046 static char pbuf[10];
1047
1048 if (isprint(ch) || ch == ' ')
1049 sprintf(pbuf, "%c", ch);
1050 else
1051 sprintf(pbuf, "\\%o", ch);
1052 return(pbuf);
1053 }
1054 #endif
1055 #endif
1056
1057 #undef matcher
1058 #undef fast
1059 #undef slow
1060 #undef dissect
1061 #undef backref
1062 #undef step
1063 #undef print
1064 #undef at
1065 #undef match
1066