1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * %sccs.include.redist.c% 10 * 11 * @(#)engine.c 8.5 (Berkeley) 03/20/94 12 */ 13 14 /* 15 * The matching engine and friends. This file is #included by regexec.c 16 * after suitable #defines of a variety of macros used herein, so that 17 * different state representations can be used without duplicating masses 18 * of code. 19 */ 20 21 #ifdef SNAMES 22 #define matcher smatcher 23 #define fast sfast 24 #define slow sslow 25 #define dissect sdissect 26 #define backref sbackref 27 #define step sstep 28 #define print sprint 29 #define at sat 30 #define match smat 31 #endif 32 #ifdef LNAMES 33 #define matcher lmatcher 34 #define fast lfast 35 #define slow lslow 36 #define dissect ldissect 37 #define backref lbackref 38 #define step lstep 39 #define print lprint 40 #define at lat 41 #define match lmat 42 #endif 43 44 /* another structure passed up and down to avoid zillions of parameters */ 45 struct match { 46 struct re_guts *g; 47 int eflags; 48 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 49 char *offp; /* offsets work from here */ 50 char *beginp; /* start of string -- virtual NUL precedes */ 51 char *endp; /* end of string -- virtual NUL here */ 52 char *coldp; /* can be no match starting before here */ 53 char **lastpos; /* [nplus+1] */ 54 STATEVARS; 55 states st; /* current states */ 56 states fresh; /* states for a fresh start */ 57 states tmp; /* temporary */ 58 states empty; /* empty set of states */ 59 }; 60 61 /* ========= begin header generated by ./mkh ========= */ 62 #ifdef __cplusplus 63 extern "C" { 64 #endif 65 66 /* === engine.c === */ 67 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags)); 68 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 69 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev)); 70 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 71 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 72 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft)); 73 #define BOL (OUT+1) 74 #define EOL (BOL+1) 75 #define BOLEOL (BOL+2) 76 #define NOTHING (BOL+3) 77 #define BOW (BOL+4) 78 #define EOW (BOL+5) 79 #define CODEMAX (BOL+5) /* highest code used */ 80 #define NONCHAR(c) ((c) > CHAR_MAX) 81 #define NNONCHAR (CODEMAX-CHAR_MAX) 82 #ifdef REDEBUG 83 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d)); 84 #endif 85 #ifdef REDEBUG 86 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)); 87 #endif 88 #ifdef REDEBUG 89 static char *pchar __P((int ch)); 90 #endif 91 92 #ifdef __cplusplus 93 } 94 #endif 95 /* ========= end header generated by ./mkh ========= */ 96 97 #ifdef REDEBUG 98 #define SP(t, s, c) print(m, t, s, c, stdout) 99 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 100 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 101 #else 102 #define SP(t, s, c) /* nothing */ 103 #define AT(t, p1, p2, s1, s2) /* nothing */ 104 #define NOTE(s) /* nothing */ 105 #endif 106 107 /* 108 - matcher - the actual matching engine 109 == static int matcher(register struct re_guts *g, char *string, \ 110 == size_t nmatch, regmatch_t pmatch[], int eflags); 111 */ 112 static int /* 0 success, REG_NOMATCH failure */ 113 matcher(g, string, nmatch, pmatch, eflags) 114 register struct re_guts *g; 115 char *string; 116 size_t nmatch; 117 regmatch_t pmatch[]; 118 int eflags; 119 { 120 register char *endp; 121 register int i; 122 struct match mv; 123 register struct match *m = &mv; 124 register char *dp; 125 const register sopno gf = g->firststate+1; /* +1 for OEND */ 126 const register sopno gl = g->laststate; 127 char *start; 128 char *stop; 129 130 /* simplify the situation where possible */ 131 if (g->cflags®_NOSUB) 132 nmatch = 0; 133 if (eflags®_STARTEND) { 134 start = string + pmatch[0].rm_so; 135 stop = string + pmatch[0].rm_eo; 136 } else { 137 start = string; 138 stop = start + strlen(start); 139 } 140 if (stop < start) 141 return(REG_INVARG); 142 143 /* prescreening; this does wonders for this rather slow code */ 144 if (g->must != NULL) { 145 for (dp = start; dp < stop; dp++) 146 if (*dp == g->must[0] && stop - dp >= g->mlen && 147 memcmp(dp, g->must, (size_t)g->mlen) == 0) 148 break; 149 if (dp == stop) /* we didn't find g->must */ 150 return(REG_NOMATCH); 151 } 152 153 /* match struct setup */ 154 m->g = g; 155 m->eflags = eflags; 156 m->pmatch = NULL; 157 m->lastpos = NULL; 158 m->offp = string; 159 m->beginp = start; 160 m->endp = stop; 161 STATESETUP(m, 4); 162 SETUP(m->st); 163 SETUP(m->fresh); 164 SETUP(m->tmp); 165 SETUP(m->empty); 166 CLEAR(m->empty); 167 168 /* this loop does only one repetition except for backrefs */ 169 for (;;) { 170 endp = fast(m, start, stop, gf, gl); 171 if (endp == NULL) { /* a miss */ 172 STATETEARDOWN(m); 173 return(REG_NOMATCH); 174 } 175 if (nmatch == 0 && !g->backrefs) 176 break; /* no further info needed */ 177 178 /* where? */ 179 assert(m->coldp != NULL); 180 for (;;) { 181 NOTE("finding start"); 182 endp = slow(m, m->coldp, stop, gf, gl); 183 if (endp != NULL) 184 break; 185 assert(m->coldp < m->endp); 186 m->coldp++; 187 } 188 if (nmatch == 1 && !g->backrefs) 189 break; /* no further info needed */ 190 191 /* oh my, he wants the subexpressions... */ 192 if (m->pmatch == NULL) 193 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 194 sizeof(regmatch_t)); 195 if (m->pmatch == NULL) { 196 STATETEARDOWN(m); 197 return(REG_ESPACE); 198 } 199 for (i = 1; i <= m->g->nsub; i++) 200 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 201 if (!g->backrefs && !(m->eflags®_BACKR)) { 202 NOTE("dissecting"); 203 dp = dissect(m, m->coldp, endp, gf, gl); 204 } else { 205 if (g->nplus > 0 && m->lastpos == NULL) 206 m->lastpos = (char **)malloc((g->nplus+1) * 207 sizeof(char *)); 208 if (g->nplus > 0 && m->lastpos == NULL) { 209 free(m->pmatch); 210 STATETEARDOWN(m); 211 return(REG_ESPACE); 212 } 213 NOTE("backref dissect"); 214 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 215 } 216 if (dp != NULL) 217 break; 218 219 /* uh-oh... we couldn't find a subexpression-level match */ 220 assert(g->backrefs); /* must be back references doing it */ 221 assert(g->nplus == 0 || m->lastpos != NULL); 222 for (;;) { 223 if (dp != NULL || endp <= m->coldp) 224 break; /* defeat */ 225 NOTE("backoff"); 226 endp = slow(m, m->coldp, endp-1, gf, gl); 227 if (endp == NULL) 228 break; /* defeat */ 229 /* try it on a shorter possibility */ 230 #ifndef NDEBUG 231 for (i = 1; i <= m->g->nsub; i++) { 232 assert(m->pmatch[i].rm_so == -1); 233 assert(m->pmatch[i].rm_eo == -1); 234 } 235 #endif 236 NOTE("backoff dissect"); 237 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 238 } 239 assert(dp == NULL || dp == endp); 240 if (dp != NULL) /* found a shorter one */ 241 break; 242 243 /* despite initial appearances, there is no match here */ 244 NOTE("false alarm"); 245 start = m->coldp + 1; /* recycle starting later */ 246 assert(start <= stop); 247 } 248 249 /* fill in the details if requested */ 250 if (nmatch > 0) { 251 pmatch[0].rm_so = m->coldp - m->offp; 252 pmatch[0].rm_eo = endp - m->offp; 253 } 254 if (nmatch > 1) { 255 assert(m->pmatch != NULL); 256 for (i = 1; i < nmatch; i++) 257 if (i <= m->g->nsub) 258 pmatch[i] = m->pmatch[i]; 259 else { 260 pmatch[i].rm_so = -1; 261 pmatch[i].rm_eo = -1; 262 } 263 } 264 265 if (m->pmatch != NULL) 266 free((char *)m->pmatch); 267 if (m->lastpos != NULL) 268 free((char *)m->lastpos); 269 STATETEARDOWN(m); 270 return(0); 271 } 272 273 /* 274 - dissect - figure out what matched what, no back references 275 == static char *dissect(register struct match *m, char *start, \ 276 == char *stop, sopno startst, sopno stopst); 277 */ 278 static char * /* == stop (success) always */ 279 dissect(m, start, stop, startst, stopst) 280 register struct match *m; 281 char *start; 282 char *stop; 283 sopno startst; 284 sopno stopst; 285 { 286 register int i; 287 register sopno ss; /* start sop of current subRE */ 288 register sopno es; /* end sop of current subRE */ 289 register char *sp; /* start of string matched by it */ 290 register char *stp; /* string matched by it cannot pass here */ 291 register char *rest; /* start of rest of string */ 292 register char *tail; /* string unmatched by rest of RE */ 293 register sopno ssub; /* start sop of subsubRE */ 294 register sopno esub; /* end sop of subsubRE */ 295 register char *ssp; /* start of string matched by subsubRE */ 296 register char *sep; /* end of string matched by subsubRE */ 297 register char *oldssp; /* previous ssp */ 298 register char *dp; 299 300 AT("diss", start, stop, startst, stopst); 301 sp = start; 302 for (ss = startst; ss < stopst; ss = es) { 303 /* identify end of subRE */ 304 es = ss; 305 switch (OP(m->g->strip[es])) { 306 case OPLUS_: 307 case OQUEST_: 308 es += OPND(m->g->strip[es]); 309 break; 310 case OCH_: 311 while (OP(m->g->strip[es]) != O_CH) 312 es += OPND(m->g->strip[es]); 313 break; 314 } 315 es++; 316 317 /* figure out what it matched */ 318 switch (OP(m->g->strip[ss])) { 319 case OEND: 320 assert(nope); 321 break; 322 case OCHAR: 323 sp++; 324 break; 325 case OBOL: 326 case OEOL: 327 case OBOW: 328 case OEOW: 329 break; 330 case OANY: 331 case OANYOF: 332 sp++; 333 break; 334 case OBACK_: 335 case O_BACK: 336 assert(nope); 337 break; 338 /* cases where length of match is hard to find */ 339 case OQUEST_: 340 stp = stop; 341 for (;;) { 342 /* how long could this one be? */ 343 rest = slow(m, sp, stp, ss, es); 344 assert(rest != NULL); /* it did match */ 345 /* could the rest match the rest? */ 346 tail = slow(m, rest, stop, es, stopst); 347 if (tail == stop) 348 break; /* yes! */ 349 /* no -- try a shorter match for this one */ 350 stp = rest - 1; 351 assert(stp >= sp); /* it did work */ 352 } 353 ssub = ss + 1; 354 esub = es - 1; 355 /* did innards match? */ 356 if (slow(m, sp, rest, ssub, esub) != NULL) { 357 dp = dissect(m, sp, rest, ssub, esub); 358 assert(dp == rest); 359 } else /* no */ 360 assert(sp == rest); 361 sp = rest; 362 break; 363 case OPLUS_: 364 stp = stop; 365 for (;;) { 366 /* how long could this one be? */ 367 rest = slow(m, sp, stp, ss, es); 368 assert(rest != NULL); /* it did match */ 369 /* could the rest match the rest? */ 370 tail = slow(m, rest, stop, es, stopst); 371 if (tail == stop) 372 break; /* yes! */ 373 /* no -- try a shorter match for this one */ 374 stp = rest - 1; 375 assert(stp >= sp); /* it did work */ 376 } 377 ssub = ss + 1; 378 esub = es - 1; 379 ssp = sp; 380 oldssp = ssp; 381 for (;;) { /* find last match of innards */ 382 sep = slow(m, ssp, rest, ssub, esub); 383 if (sep == NULL || sep == ssp) 384 break; /* failed or matched null */ 385 oldssp = ssp; /* on to next try */ 386 ssp = sep; 387 } 388 if (sep == NULL) { 389 /* last successful match */ 390 sep = ssp; 391 ssp = oldssp; 392 } 393 assert(sep == rest); /* must exhaust substring */ 394 assert(slow(m, ssp, sep, ssub, esub) == rest); 395 dp = dissect(m, ssp, sep, ssub, esub); 396 assert(dp == sep); 397 sp = rest; 398 break; 399 case OCH_: 400 stp = stop; 401 for (;;) { 402 /* how long could this one be? */ 403 rest = slow(m, sp, stp, ss, es); 404 assert(rest != NULL); /* it did match */ 405 /* could the rest match the rest? */ 406 tail = slow(m, rest, stop, es, stopst); 407 if (tail == stop) 408 break; /* yes! */ 409 /* no -- try a shorter match for this one */ 410 stp = rest - 1; 411 assert(stp >= sp); /* it did work */ 412 } 413 ssub = ss + 1; 414 esub = ss + OPND(m->g->strip[ss]) - 1; 415 assert(OP(m->g->strip[esub]) == OOR1); 416 for (;;) { /* find first matching branch */ 417 if (slow(m, sp, rest, ssub, esub) == rest) 418 break; /* it matched all of it */ 419 /* that one missed, try next one */ 420 assert(OP(m->g->strip[esub]) == OOR1); 421 esub++; 422 assert(OP(m->g->strip[esub]) == OOR2); 423 ssub = esub + 1; 424 esub += OPND(m->g->strip[esub]); 425 if (OP(m->g->strip[esub]) == OOR2) 426 esub--; 427 else 428 assert(OP(m->g->strip[esub]) == O_CH); 429 } 430 dp = dissect(m, sp, rest, ssub, esub); 431 assert(dp == rest); 432 sp = rest; 433 break; 434 case O_PLUS: 435 case O_QUEST: 436 case OOR1: 437 case OOR2: 438 case O_CH: 439 assert(nope); 440 break; 441 case OLPAREN: 442 i = OPND(m->g->strip[ss]); 443 assert(0 < i && i <= m->g->nsub); 444 m->pmatch[i].rm_so = sp - m->offp; 445 break; 446 case ORPAREN: 447 i = OPND(m->g->strip[ss]); 448 assert(0 < i && i <= m->g->nsub); 449 m->pmatch[i].rm_eo = sp - m->offp; 450 break; 451 default: /* uh oh */ 452 assert(nope); 453 break; 454 } 455 } 456 457 assert(sp == stop); 458 return(sp); 459 } 460 461 /* 462 - backref - figure out what matched what, figuring in back references 463 == static char *backref(register struct match *m, char *start, \ 464 == char *stop, sopno startst, sopno stopst, sopno lev); 465 */ 466 static char * /* == stop (success) or NULL (failure) */ 467 backref(m, start, stop, startst, stopst, lev) 468 register struct match *m; 469 char *start; 470 char *stop; 471 sopno startst; 472 sopno stopst; 473 sopno lev; /* PLUS nesting level */ 474 { 475 register int i; 476 register sopno ss; /* start sop of current subRE */ 477 register char *sp; /* start of string matched by it */ 478 register sopno ssub; /* start sop of subsubRE */ 479 register sopno esub; /* end sop of subsubRE */ 480 register char *ssp; /* start of string matched by subsubRE */ 481 register char *dp; 482 register size_t len; 483 register int hard; 484 register sop s; 485 register regoff_t offsave; 486 register cset *cs; 487 488 AT("back", start, stop, startst, stopst); 489 sp = start; 490 491 /* get as far as we can with easy stuff */ 492 hard = 0; 493 for (ss = startst; !hard && ss < stopst; ss++) 494 switch (OP(s = m->g->strip[ss])) { 495 case OCHAR: 496 if (sp == stop || *sp++ != (char)OPND(s)) 497 return(NULL); 498 break; 499 case OANY: 500 if (sp == stop) 501 return(NULL); 502 sp++; 503 break; 504 case OANYOF: 505 cs = &m->g->sets[OPND(s)]; 506 if (sp == stop || !CHIN(cs, *sp++)) 507 return(NULL); 508 break; 509 case OBOL: 510 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 511 (sp < m->endp && *(sp-1) == '\n' && 512 (m->g->cflags®_NEWLINE)) ) 513 { /* yes */ } 514 else 515 return(NULL); 516 break; 517 case OEOL: 518 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 519 (sp < m->endp && *sp == '\n' && 520 (m->g->cflags®_NEWLINE)) ) 521 { /* yes */ } 522 else 523 return(NULL); 524 break; 525 case OBOW: 526 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 527 (sp < m->endp && *(sp-1) == '\n' && 528 (m->g->cflags®_NEWLINE)) || 529 (sp > m->beginp && 530 !ISWORD(*(sp-1))) ) && 531 (sp < m->endp && ISWORD(*sp)) ) 532 { /* yes */ } 533 else 534 return(NULL); 535 break; 536 case OEOW: 537 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 538 (sp < m->endp && *sp == '\n' && 539 (m->g->cflags®_NEWLINE)) || 540 (sp < m->endp && !ISWORD(*sp)) ) && 541 (sp > m->beginp && ISWORD(*(sp-1))) ) 542 { /* yes */ } 543 else 544 return(NULL); 545 break; 546 case O_QUEST: 547 break; 548 case OOR1: /* matches null but needs to skip */ 549 ss++; 550 s = m->g->strip[ss]; 551 do { 552 assert(OP(s) == OOR2); 553 ss += OPND(s); 554 } while (OP(s = m->g->strip[ss]) != O_CH); 555 /* note that the ss++ gets us past the O_CH */ 556 break; 557 default: /* have to make a choice */ 558 hard = 1; 559 break; 560 } 561 if (!hard) { /* that was it! */ 562 if (sp != stop) 563 return(NULL); 564 return(sp); 565 } 566 ss--; /* adjust for the for's final increment */ 567 568 /* the hard stuff */ 569 AT("hard", sp, stop, ss, stopst); 570 s = m->g->strip[ss]; 571 switch (OP(s)) { 572 case OBACK_: /* the vilest depths */ 573 i = OPND(s); 574 assert(0 < i && i <= m->g->nsub); 575 if (m->pmatch[i].rm_eo == -1) 576 return(NULL); 577 assert(m->pmatch[i].rm_so != -1); 578 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 579 assert(stop - m->beginp >= len); 580 if (sp > stop - len) 581 return(NULL); /* not enough left to match */ 582 ssp = m->offp + m->pmatch[i].rm_so; 583 if (memcmp(sp, ssp, len) != 0) 584 return(NULL); 585 while (m->g->strip[ss] != SOP(O_BACK, i)) 586 ss++; 587 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 588 break; 589 case OQUEST_: /* to null or not */ 590 dp = backref(m, sp, stop, ss+1, stopst, lev); 591 if (dp != NULL) 592 return(dp); /* not */ 593 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 594 break; 595 case OPLUS_: 596 assert(m->lastpos != NULL); 597 assert(lev+1 <= m->g->nplus); 598 m->lastpos[lev+1] = sp; 599 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 600 break; 601 case O_PLUS: 602 if (sp == m->lastpos[lev]) /* last pass matched null */ 603 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 604 /* try another pass */ 605 m->lastpos[lev] = sp; 606 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 607 if (dp == NULL) 608 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 609 else 610 return(dp); 611 break; 612 case OCH_: /* find the right one, if any */ 613 ssub = ss + 1; 614 esub = ss + OPND(s) - 1; 615 assert(OP(m->g->strip[esub]) == OOR1); 616 for (;;) { /* find first matching branch */ 617 dp = backref(m, sp, stop, ssub, esub, lev); 618 if (dp != NULL) 619 return(dp); 620 /* that one missed, try next one */ 621 if (OP(m->g->strip[esub]) == O_CH) 622 return(NULL); /* there is none */ 623 esub++; 624 assert(OP(m->g->strip[esub]) == OOR2); 625 ssub = esub + 1; 626 esub += OPND(m->g->strip[esub]); 627 if (OP(m->g->strip[esub]) == OOR2) 628 esub--; 629 else 630 assert(OP(m->g->strip[esub]) == O_CH); 631 } 632 break; 633 case OLPAREN: /* must undo assignment if rest fails */ 634 i = OPND(s); 635 assert(0 < i && i <= m->g->nsub); 636 offsave = m->pmatch[i].rm_so; 637 m->pmatch[i].rm_so = sp - m->offp; 638 dp = backref(m, sp, stop, ss+1, stopst, lev); 639 if (dp != NULL) 640 return(dp); 641 m->pmatch[i].rm_so = offsave; 642 return(NULL); 643 break; 644 case ORPAREN: /* must undo assignment if rest fails */ 645 i = OPND(s); 646 assert(0 < i && i <= m->g->nsub); 647 offsave = m->pmatch[i].rm_eo; 648 m->pmatch[i].rm_eo = sp - m->offp; 649 dp = backref(m, sp, stop, ss+1, stopst, lev); 650 if (dp != NULL) 651 return(dp); 652 m->pmatch[i].rm_eo = offsave; 653 return(NULL); 654 break; 655 default: /* uh oh */ 656 assert(nope); 657 break; 658 } 659 660 /* "can't happen" */ 661 assert(nope); 662 /* NOTREACHED */ 663 } 664 665 /* 666 - fast - step through the string at top speed 667 == static char *fast(register struct match *m, char *start, \ 668 == char *stop, sopno startst, sopno stopst); 669 */ 670 static char * /* where tentative match ended, or NULL */ 671 fast(m, start, stop, startst, stopst) 672 register struct match *m; 673 char *start; 674 char *stop; 675 sopno startst; 676 sopno stopst; 677 { 678 register states st = m->st; 679 register states fresh = m->fresh; 680 register states tmp = m->tmp; 681 register char *p = start; 682 register int c = (start == m->beginp) ? OUT : *(start-1); 683 register int lastc; /* previous c */ 684 register int flagch; 685 register int i; 686 register char *coldp; /* last p after which no match was underway */ 687 688 CLEAR(st); 689 SET1(st, startst); 690 st = step(m->g, startst, stopst, st, NOTHING, st); 691 ASSIGN(fresh, st); 692 SP("start", st, *p); 693 coldp = NULL; 694 for (;;) { 695 /* next character */ 696 lastc = c; 697 c = (p == m->endp) ? OUT : *p; 698 if (EQ(st, fresh)) 699 coldp = p; 700 701 /* is there an EOL and/or BOL between lastc and c? */ 702 flagch = '\0'; 703 i = 0; 704 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 705 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 706 flagch = BOL; 707 i = m->g->nbol; 708 } 709 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 710 (c == OUT && !(m->eflags®_NOTEOL)) ) { 711 flagch = (flagch == BOL) ? BOLEOL : EOL; 712 i += m->g->neol; 713 } 714 if (i != 0) { 715 for (; i > 0; i--) 716 st = step(m->g, startst, stopst, st, flagch, st); 717 SP("boleol", st, c); 718 } 719 720 /* how about a word boundary? */ 721 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 722 (c != OUT && ISWORD(c)) ) { 723 flagch = BOW; 724 } 725 if ( (lastc != OUT && ISWORD(lastc)) && 726 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 727 flagch = EOW; 728 } 729 if (flagch == BOW || flagch == EOW) { 730 st = step(m->g, startst, stopst, st, flagch, st); 731 SP("boweow", st, c); 732 } 733 734 /* are we done? */ 735 if (ISSET(st, stopst) || p == stop) 736 break; /* NOTE BREAK OUT */ 737 738 /* no, we must deal with this character */ 739 ASSIGN(tmp, st); 740 ASSIGN(st, fresh); 741 assert(c != OUT); 742 st = step(m->g, startst, stopst, tmp, c, st); 743 SP("aft", st, c); 744 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 745 p++; 746 } 747 748 assert(coldp != NULL); 749 m->coldp = coldp; 750 if (ISSET(st, stopst)) 751 return(p+1); 752 else 753 return(NULL); 754 } 755 756 /* 757 - slow - step through the string more deliberately 758 == static char *slow(register struct match *m, char *start, \ 759 == char *stop, sopno startst, sopno stopst); 760 */ 761 static char * /* where it ended */ 762 slow(m, start, stop, startst, stopst) 763 register struct match *m; 764 char *start; 765 char *stop; 766 sopno startst; 767 sopno stopst; 768 { 769 register states st = m->st; 770 register states empty = m->empty; 771 register states tmp = m->tmp; 772 register char *p = start; 773 register int c = (start == m->beginp) ? OUT : *(start-1); 774 register int lastc; /* previous c */ 775 register int flagch; 776 register int i; 777 register char *matchp; /* last p at which a match ended */ 778 779 AT("slow", start, stop, startst, stopst); 780 CLEAR(st); 781 SET1(st, startst); 782 SP("sstart", st, *p); 783 st = step(m->g, startst, stopst, st, NOTHING, st); 784 matchp = NULL; 785 for (;;) { 786 /* next character */ 787 lastc = c; 788 c = (p == m->endp) ? OUT : *p; 789 790 /* is there an EOL and/or BOL between lastc and c? */ 791 flagch = '\0'; 792 i = 0; 793 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 794 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 795 flagch = BOL; 796 i = m->g->nbol; 797 } 798 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 799 (c == OUT && !(m->eflags®_NOTEOL)) ) { 800 flagch = (flagch == BOL) ? BOLEOL : EOL; 801 i += m->g->neol; 802 } 803 if (i != 0) { 804 for (; i > 0; i--) 805 st = step(m->g, startst, stopst, st, flagch, st); 806 SP("sboleol", st, c); 807 } 808 809 /* how about a word boundary? */ 810 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 811 (c != OUT && ISWORD(c)) ) { 812 flagch = BOW; 813 } 814 if ( (lastc != OUT && ISWORD(lastc)) && 815 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 816 flagch = EOW; 817 } 818 if (flagch == BOW || flagch == EOW) { 819 st = step(m->g, startst, stopst, st, flagch, st); 820 SP("sboweow", st, c); 821 } 822 823 /* are we done? */ 824 if (ISSET(st, stopst)) 825 matchp = p; 826 if (EQ(st, empty) || p == stop) 827 break; /* NOTE BREAK OUT */ 828 829 /* no, we must deal with this character */ 830 ASSIGN(tmp, st); 831 ASSIGN(st, empty); 832 assert(c != OUT); 833 st = step(m->g, startst, stopst, tmp, c, st); 834 SP("saft", st, c); 835 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 836 p++; 837 } 838 839 return(matchp); 840 } 841 842 843 /* 844 - step - map set of states reachable before char to set reachable after 845 == static states step(register struct re_guts *g, sopno start, sopno stop, \ 846 == register states bef, int ch, register states aft); 847 == #define BOL (OUT+1) 848 == #define EOL (BOL+1) 849 == #define BOLEOL (BOL+2) 850 == #define NOTHING (BOL+3) 851 == #define BOW (BOL+4) 852 == #define EOW (BOL+5) 853 == #define CODEMAX (BOL+5) // highest code used 854 == #define NONCHAR(c) ((c) > CHAR_MAX) 855 == #define NNONCHAR (CODEMAX-CHAR_MAX) 856 */ 857 static states 858 step(g, start, stop, bef, ch, aft) 859 register struct re_guts *g; 860 sopno start; /* start state within strip */ 861 sopno stop; /* state after stop state within strip */ 862 register states bef; /* states reachable before */ 863 int ch; /* character or NONCHAR code */ 864 register states aft; /* states already known reachable after */ 865 { 866 register cset *cs; 867 register sop s; 868 register sopno pc; 869 register onestate here; /* note, macros know this name */ 870 register sopno look; 871 register int i; 872 873 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 874 s = g->strip[pc]; 875 switch (OP(s)) { 876 case OEND: 877 assert(pc == stop-1); 878 break; 879 case OCHAR: 880 /* only characters can match */ 881 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 882 if (ch == (char)OPND(s)) 883 FWD(aft, bef, 1); 884 break; 885 case OBOL: 886 if (ch == BOL || ch == BOLEOL) 887 FWD(aft, bef, 1); 888 break; 889 case OEOL: 890 if (ch == EOL || ch == BOLEOL) 891 FWD(aft, bef, 1); 892 break; 893 case OBOW: 894 if (ch == BOW) 895 FWD(aft, bef, 1); 896 break; 897 case OEOW: 898 if (ch == EOW) 899 FWD(aft, bef, 1); 900 break; 901 case OANY: 902 if (!NONCHAR(ch)) 903 FWD(aft, bef, 1); 904 break; 905 case OANYOF: 906 cs = &g->sets[OPND(s)]; 907 if (!NONCHAR(ch) && CHIN(cs, ch)) 908 FWD(aft, bef, 1); 909 break; 910 case OBACK_: /* ignored here */ 911 case O_BACK: 912 FWD(aft, aft, 1); 913 break; 914 case OPLUS_: /* forward, this is just an empty */ 915 FWD(aft, aft, 1); 916 break; 917 case O_PLUS: /* both forward and back */ 918 FWD(aft, aft, 1); 919 i = ISSETBACK(aft, OPND(s)); 920 BACK(aft, aft, OPND(s)); 921 if (!i && ISSETBACK(aft, OPND(s))) { 922 /* oho, must reconsider loop body */ 923 pc -= OPND(s) + 1; 924 INIT(here, pc); 925 } 926 break; 927 case OQUEST_: /* two branches, both forward */ 928 FWD(aft, aft, 1); 929 FWD(aft, aft, OPND(s)); 930 break; 931 case O_QUEST: /* just an empty */ 932 FWD(aft, aft, 1); 933 break; 934 case OLPAREN: /* not significant here */ 935 case ORPAREN: 936 FWD(aft, aft, 1); 937 break; 938 case OCH_: /* mark the first two branches */ 939 FWD(aft, aft, 1); 940 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 941 FWD(aft, aft, OPND(s)); 942 break; 943 case OOR1: /* done a branch, find the O_CH */ 944 if (ISSTATEIN(aft, here)) { 945 for (look = 1; 946 OP(s = g->strip[pc+look]) != O_CH; 947 look += OPND(s)) 948 assert(OP(s) == OOR2); 949 FWD(aft, aft, look); 950 } 951 break; 952 case OOR2: /* propagate OCH_'s marking */ 953 FWD(aft, aft, 1); 954 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 955 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 956 FWD(aft, aft, OPND(s)); 957 } 958 break; 959 case O_CH: /* just empty */ 960 FWD(aft, aft, 1); 961 break; 962 default: /* ooooops... */ 963 assert(nope); 964 break; 965 } 966 } 967 968 return(aft); 969 } 970 971 #ifdef REDEBUG 972 /* 973 - print - print a set of states 974 == #ifdef REDEBUG 975 == static void print(struct match *m, char *caption, states st, \ 976 == int ch, FILE *d); 977 == #endif 978 */ 979 static void 980 print(m, caption, st, ch, d) 981 struct match *m; 982 char *caption; 983 states st; 984 int ch; 985 FILE *d; 986 { 987 register struct re_guts *g = m->g; 988 register int i; 989 register int first = 1; 990 991 if (!(m->eflags®_TRACE)) 992 return; 993 994 fprintf(d, "%s", caption); 995 if (ch != '\0') 996 fprintf(d, " %s", pchar(ch)); 997 for (i = 0; i < g->nstates; i++) 998 if (ISSET(st, i)) { 999 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1000 first = 0; 1001 } 1002 fprintf(d, "\n"); 1003 } 1004 1005 /* 1006 - at - print current situation 1007 == #ifdef REDEBUG 1008 == static void at(struct match *m, char *title, char *start, char *stop, \ 1009 == sopno startst, sopno stopst); 1010 == #endif 1011 */ 1012 static void 1013 at(m, title, start, stop, startst, stopst) 1014 struct match *m; 1015 char *title; 1016 char *start; 1017 char *stop; 1018 sopno startst; 1019 sopno stopst; 1020 { 1021 if (!(m->eflags®_TRACE)) 1022 return; 1023 1024 printf("%s %s-", title, pchar(*start)); 1025 printf("%s ", pchar(*stop)); 1026 printf("%ld-%ld\n", (long)startst, (long)stopst); 1027 } 1028 1029 #ifndef PCHARDONE 1030 #define PCHARDONE /* never again */ 1031 /* 1032 - pchar - make a character printable 1033 == #ifdef REDEBUG 1034 == static char *pchar(int ch); 1035 == #endif 1036 * 1037 * Is this identical to regchar() over in debug.c? Well, yes. But a 1038 * duplicate here avoids having a debugging-capable regexec.o tied to 1039 * a matching debug.o, and this is convenient. It all disappears in 1040 * the non-debug compilation anyway, so it doesn't matter much. 1041 */ 1042 static char * /* -> representation */ 1043 pchar(ch) 1044 int ch; 1045 { 1046 static char pbuf[10]; 1047 1048 if (isprint(ch) || ch == ' ') 1049 sprintf(pbuf, "%c", ch); 1050 else 1051 sprintf(pbuf, "\\%o", ch); 1052 return(pbuf); 1053 } 1054 #endif 1055 #endif 1056 1057 #undef matcher 1058 #undef fast 1059 #undef slow 1060 #undef dissect 1061 #undef backref 1062 #undef step 1063 #undef print 1064 #undef at 1065 #undef match 1066