1 /*- 2 * Copyright (c) 1992 Henry Spencer. 3 * Copyright (c) 1992, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer of the University of Toronto. 8 * 9 * %sccs.include.redist.c% 10 * 11 * @(#)engine.c 8.1 (Berkeley) 06/04/93 12 */ 13 14 /* 15 * The matching engine and friends. This file is #included by regexec.c 16 * after suitable #defines of a variety of macros used herein, so that 17 * different state representations can be used without duplicating masses 18 * of code. 19 */ 20 21 #ifdef SNAMES 22 #define matcher smatcher 23 #define fast sfast 24 #define slow sslow 25 #define dissect sdissect 26 #define backref sbackref 27 #define step sstep 28 #define print sprint 29 #define at sat 30 #define match smat 31 #endif 32 #ifdef LNAMES 33 #define matcher lmatcher 34 #define fast lfast 35 #define slow lslow 36 #define dissect ldissect 37 #define backref lbackref 38 #define step lstep 39 #define print lprint 40 #define at lat 41 #define match lmat 42 #endif 43 44 /* another structure passed up and down to avoid zillions of parameters */ 45 struct match { 46 struct re_guts *g; 47 int eflags; 48 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 49 char *offp; /* offsets work from here */ 50 char *beginp; /* start of string -- virtual NUL precedes */ 51 char *endp; /* end of string -- virtual NUL here */ 52 char *coldp; /* can be no match starting before here */ 53 char **lastpos; /* [nplus+1] */ 54 STATEVARS; 55 states st; /* current states */ 56 states fresh; /* states for a fresh start */ 57 states tmp; /* temporary */ 58 states empty; /* empty set of states */ 59 }; 60 61 static int matcher(/*register struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags*/); 62 static char *dissect(/*register struct match *m, char *start, char *stop, sopno startst, sopno stopst*/); 63 static char *backref(/*register struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev*/); 64 static char *fast(/*register struct match *m, char *start, char *stop, sopno startst, sopno stopst*/); 65 static char *slow(/*register struct match *m, char *start, char *stop, sopno startst, sopno stopst*/); 66 static states step(/*register struct re_guts *g, int start, int stop, register states bef, int ch, register states aft*/); 67 #define BOL (OUT+1) 68 #define EOL (BOL+1) 69 #define BOLEOL (BOL+2) 70 #define NOTHING (BOL+3) 71 #define BOW (BOL+4) 72 #define EOW (BOL+5) 73 #define CODEMAX (BOL+5) /* highest code used */ 74 #define NONCHAR(c) ((c) > CHAR_MAX) 75 #define NNONCHAR (CODEMAX-CHAR_MAX) 76 #ifdef REDEBUG 77 static void print(/*struct match *m, char *caption, states st, int ch, FILE *d*/); 78 #endif 79 #ifdef REDEBUG 80 static void at(/*struct match *m, char *title, char *start, char *stop, sopno startst, stopno stopst*/); 81 #endif 82 #ifdef REDEBUG 83 static char *pchar(/*int ch*/); 84 #endif 85 86 #ifdef REDEBUG 87 #define SP(t, s, c) print(m, t, s, c, stdout) 88 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 89 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 90 #else 91 #define SP(t, s, c) /* nothing */ 92 #define AT(t, p1, p2, s1, s2) /* nothing */ 93 #define NOTE(s) /* nothing */ 94 #endif 95 96 /* 97 - matcher - the actual matching engine 98 == static int matcher(register struct re_guts *g, char *string, \ 99 == size_t nmatch, regmatch_t pmatch[], int eflags); 100 */ 101 static int /* 0 success, REG_NOMATCH failure */ 102 matcher(g, string, nmatch, pmatch, eflags) 103 register struct re_guts *g; 104 char *string; 105 size_t nmatch; 106 regmatch_t pmatch[]; 107 int eflags; 108 { 109 register char *endp; 110 register int i; 111 struct match mv; 112 register struct match *m = &mv; 113 register char *dp; 114 const register sopno gf = g->firststate+1; /* +1 for OEND */ 115 const register sopno gl = g->laststate; 116 char *start; 117 char *stop; 118 119 /* simplify the situation where possible */ 120 if (g->cflags®_NOSUB) 121 nmatch = 0; 122 if (eflags®_STARTEND) { 123 start = string + pmatch[0].rm_so; 124 stop = string + pmatch[0].rm_eo; 125 } else { 126 start = string; 127 stop = start + strlen(start); 128 } 129 if (stop < start) 130 return(REG_INVARG); 131 132 /* prescreening; this does wonders for this rather slow code */ 133 if (g->must != NULL) { 134 for (dp = start; dp < stop; dp++) 135 if (*dp == g->must[0] && stop - dp >= g->mlen && 136 memcmp(dp, g->must, (size_t)g->mlen) == 0) 137 break; 138 if (dp == stop) /* we didn't find g->must */ 139 return(REG_NOMATCH); 140 } 141 142 /* match struct setup */ 143 m->g = g; 144 m->eflags = eflags; 145 m->pmatch = NULL; 146 m->lastpos = NULL; 147 m->offp = string; 148 m->beginp = start; 149 m->endp = stop; 150 STATESETUP(m, 4); 151 SETUP(m->st); 152 SETUP(m->fresh); 153 SETUP(m->tmp); 154 SETUP(m->empty); 155 CLEAR(m->empty); 156 157 /* this loop does only one repetition except for backrefs */ 158 for (;;) { 159 endp = fast(m, start, stop, gf, gl); 160 if (endp == NULL) { /* a miss */ 161 STATETEARDOWN(m); 162 return(REG_NOMATCH); 163 } 164 if (nmatch == 0 && !g->backrefs) 165 break; /* no further info needed */ 166 167 /* where? */ 168 assert(m->coldp != NULL); 169 for (;;) { 170 NOTE("finding start"); 171 endp = slow(m, m->coldp, stop, gf, gl); 172 if (endp != NULL) 173 break; 174 assert(m->coldp < m->endp); 175 m->coldp++; 176 } 177 if (nmatch == 1 && !g->backrefs) 178 break; /* no further info needed */ 179 180 /* oh my, he wants the subexpressions... */ 181 if (m->pmatch == NULL) 182 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 183 sizeof(regmatch_t)); 184 if (m->pmatch == NULL) { 185 STATETEARDOWN(m); 186 return(REG_ESPACE); 187 } 188 for (i = 1; i <= m->g->nsub; i++) 189 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 190 if (!g->backrefs && !(m->eflags®_BACKR)) { 191 NOTE("dissecting"); 192 dp = dissect(m, m->coldp, endp, gf, gl); 193 } else { 194 if (g->nplus > 0 && m->lastpos == NULL) 195 m->lastpos = (char **)malloc((g->nplus+1) * 196 sizeof(char *)); 197 if (g->nplus > 0 && m->lastpos == NULL) { 198 free(m->pmatch); 199 STATETEARDOWN(m); 200 return(REG_ESPACE); 201 } 202 NOTE("backref dissect"); 203 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 204 } 205 if (dp != NULL) 206 break; 207 208 /* uh-oh... we couldn't find a subexpression-level match */ 209 assert(g->backrefs); /* must be back references doing it */ 210 assert(g->nplus == 0 || m->lastpos != NULL); 211 for (;;) { 212 if (dp != NULL || endp <= m->coldp) 213 break; /* defeat */ 214 NOTE("backoff"); 215 endp = slow(m, m->coldp, endp-1, gf, gl); 216 if (endp == NULL) 217 break; /* defeat */ 218 /* try it on a shorter possibility */ 219 #ifndef NDEBUG 220 for (i = 1; i <= m->g->nsub; i++) { 221 assert(m->pmatch[i].rm_so == -1); 222 assert(m->pmatch[i].rm_eo == -1); 223 } 224 #endif 225 NOTE("backoff dissect"); 226 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 227 } 228 assert(dp == NULL || dp == endp); 229 if (dp != NULL) /* found a shorter one */ 230 break; 231 232 /* despite initial appearances, there is no match here */ 233 NOTE("false alarm"); 234 start = m->coldp + 1; /* recycle starting later */ 235 assert(start <= stop); 236 } 237 238 /* fill in the details if requested */ 239 if (nmatch > 0) { 240 pmatch[0].rm_so = m->coldp - m->offp; 241 pmatch[0].rm_eo = endp - m->offp; 242 } 243 if (nmatch > 1) { 244 assert(m->pmatch != NULL); 245 for (i = 1; i < nmatch; i++) 246 if (i <= m->g->nsub) 247 pmatch[i] = m->pmatch[i]; 248 else { 249 pmatch[i].rm_so = -1; 250 pmatch[i].rm_eo = -1; 251 } 252 } 253 254 if (m->pmatch != NULL) 255 free((char *)m->pmatch); 256 if (m->lastpos != NULL) 257 free((char *)m->lastpos); 258 STATETEARDOWN(m); 259 return(0); 260 } 261 262 /* 263 - dissect - figure out what matched what, no back references 264 == static char *dissect(register struct match *m, char *start, \ 265 == char *stop, sopno startst, sopno stopst); 266 */ 267 static char * /* == stop (success) always */ 268 dissect(m, start, stop, startst, stopst) 269 register struct match *m; 270 char *start; 271 char *stop; 272 sopno startst; 273 sopno stopst; 274 { 275 register int i; 276 register sopno ss; /* start sop of current subRE */ 277 register sopno es; /* end sop of current subRE */ 278 register char *sp; /* start of string matched by it */ 279 register char *stp; /* string matched by it cannot pass here */ 280 register char *rest; /* start of rest of string */ 281 register char *tail; /* string unmatched by rest of RE */ 282 register sopno ssub; /* start sop of subsubRE */ 283 register sopno esub; /* end sop of subsubRE */ 284 register char *ssp; /* start of string matched by subsubRE */ 285 register char *sep; /* end of string matched by subsubRE */ 286 register char *oldssp; /* previous ssp */ 287 register char *dp; 288 289 AT("diss", start, stop, startst, stopst); 290 sp = start; 291 for (ss = startst; ss < stopst; ss = es) { 292 /* identify end of subRE */ 293 es = ss; 294 switch (OP(m->g->strip[es])) { 295 case OPLUS_: 296 case OQUEST_: 297 es += OPND(m->g->strip[es]); 298 break; 299 case OCH_: 300 while (OP(m->g->strip[es]) != O_CH) 301 es += OPND(m->g->strip[es]); 302 break; 303 } 304 es++; 305 306 /* figure out what it matched */ 307 switch (OP(m->g->strip[ss])) { 308 case OEND: 309 assert(nope); 310 break; 311 case OCHAR: 312 sp++; 313 break; 314 case OBOL: 315 case OEOL: 316 case OBOW: 317 case OEOW: 318 break; 319 case OANY: 320 case OANYOF: 321 sp++; 322 break; 323 case OBACK_: 324 case O_BACK: 325 assert(nope); 326 break; 327 /* cases where length of match is hard to find */ 328 case OQUEST_: 329 stp = stop; 330 for (;;) { 331 /* how long could this one be? */ 332 rest = slow(m, sp, stp, ss, es); 333 assert(rest != NULL); /* it did match */ 334 /* could the rest match the rest? */ 335 tail = slow(m, rest, stop, es, stopst); 336 if (tail == stop) 337 break; /* yes! */ 338 /* no -- try a shorter match for this one */ 339 stp = rest - 1; 340 assert(stp >= sp); /* it did work */ 341 } 342 ssub = ss + 1; 343 esub = es - 1; 344 /* did innards match? */ 345 if (slow(m, sp, rest, ssub, esub) != NULL) { 346 dp = dissect(m, sp, rest, ssub, esub); 347 assert(dp == rest); 348 } else /* no */ 349 assert(sp == rest); 350 sp = rest; 351 break; 352 case OPLUS_: 353 stp = stop; 354 for (;;) { 355 /* how long could this one be? */ 356 rest = slow(m, sp, stp, ss, es); 357 assert(rest != NULL); /* it did match */ 358 /* could the rest match the rest? */ 359 tail = slow(m, rest, stop, es, stopst); 360 if (tail == stop) 361 break; /* yes! */ 362 /* no -- try a shorter match for this one */ 363 stp = rest - 1; 364 assert(stp >= sp); /* it did work */ 365 } 366 ssub = ss + 1; 367 esub = es - 1; 368 ssp = sp; 369 oldssp = ssp; 370 for (;;) { /* find last match of innards */ 371 sep = slow(m, ssp, rest, ssub, esub); 372 if (sep == NULL || sep == ssp) 373 break; /* failed or matched null */ 374 oldssp = ssp; /* on to next try */ 375 ssp = sep; 376 } 377 if (sep == NULL) { 378 /* last successful match */ 379 sep = ssp; 380 ssp = oldssp; 381 } 382 assert(sep == rest); /* must exhaust substring */ 383 assert(slow(m, ssp, sep, ssub, esub) == rest); 384 dp = dissect(m, ssp, sep, ssub, esub); 385 assert(dp == sep); 386 sp = rest; 387 break; 388 case OCH_: 389 stp = stop; 390 for (;;) { 391 /* how long could this one be? */ 392 rest = slow(m, sp, stp, ss, es); 393 assert(rest != NULL); /* it did match */ 394 /* could the rest match the rest? */ 395 tail = slow(m, rest, stop, es, stopst); 396 if (tail == stop) 397 break; /* yes! */ 398 /* no -- try a shorter match for this one */ 399 stp = rest - 1; 400 assert(stp >= sp); /* it did work */ 401 } 402 ssub = ss + 1; 403 esub = ss + OPND(m->g->strip[ss]) - 1; 404 assert(OP(m->g->strip[esub]) == OOR1); 405 for (;;) { /* find first matching branch */ 406 if (slow(m, sp, rest, ssub, esub) == rest) 407 break; /* it matched all of it */ 408 /* that one missed, try next one */ 409 assert(OP(m->g->strip[esub]) == OOR1); 410 esub++; 411 assert(OP(m->g->strip[esub]) == OOR2); 412 ssub = esub + 1; 413 esub += OPND(m->g->strip[esub]); 414 if (OP(m->g->strip[esub]) == OOR2) 415 esub--; 416 else 417 assert(OP(m->g->strip[esub]) == O_CH); 418 } 419 dp = dissect(m, sp, rest, ssub, esub); 420 assert(dp == rest); 421 sp = rest; 422 break; 423 case O_PLUS: 424 case O_QUEST: 425 case OOR1: 426 case OOR2: 427 case O_CH: 428 assert(nope); 429 break; 430 case OLPAREN: 431 i = OPND(m->g->strip[ss]); 432 m->pmatch[i].rm_so = sp - m->offp; 433 break; 434 case ORPAREN: 435 i = OPND(m->g->strip[ss]); 436 m->pmatch[i].rm_eo = sp - m->offp; 437 break; 438 default: /* uh oh */ 439 assert(nope); 440 break; 441 } 442 } 443 444 assert(sp == stop); 445 return(sp); 446 } 447 448 /* 449 - backref - figure out what matched what, figuring in back references 450 == static char *backref(register struct match *m, char *start, \ 451 == char *stop, sopno startst, sopno stopst, sopno lev); 452 */ 453 static char * /* == stop (success) or NULL (failure) */ 454 backref(m, start, stop, startst, stopst, lev) 455 register struct match *m; 456 char *start; 457 char *stop; 458 sopno startst; 459 sopno stopst; 460 sopno lev; /* PLUS nesting level */ 461 { 462 register int i; 463 register sopno ss; /* start sop of current subRE */ 464 register char *sp; /* start of string matched by it */ 465 register sopno ssub; /* start sop of subsubRE */ 466 register sopno esub; /* end sop of subsubRE */ 467 register char *ssp; /* start of string matched by subsubRE */ 468 register char *dp; 469 register size_t len; 470 register int hard; 471 register sop s; 472 register regoff_t offsave; 473 register cset *cs; 474 475 AT("back", start, stop, startst, stopst); 476 sp = start; 477 478 /* get as far as we can with easy stuff */ 479 hard = 0; 480 for (ss = startst; !hard && ss < stopst; ss++) 481 switch (OP(s = m->g->strip[ss])) { 482 case OCHAR: 483 if (sp == stop || *sp++ != (char)OPND(s)) 484 return(NULL); 485 break; 486 case OANY: 487 if (sp == stop) 488 return(NULL); 489 sp++; 490 break; 491 case OANYOF: 492 cs = &m->g->sets[OPND(s)]; 493 if (sp == stop || !CHIN(cs, *sp++)) 494 return(NULL); 495 break; 496 case OBOL: 497 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 498 (sp < m->endp && *(sp-1) == '\n' && 499 (m->g->cflags®_NEWLINE)) ) 500 { /* yes */ } 501 else 502 return(NULL); 503 break; 504 case OEOL: 505 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 506 (sp < m->endp && *sp == '\n' && 507 (m->g->cflags®_NEWLINE)) ) 508 { /* yes */ } 509 else 510 return(NULL); 511 break; 512 case OBOW: 513 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 514 (sp < m->endp && *(sp-1) == '\n' && 515 (m->g->cflags®_NEWLINE)) || 516 (sp > m->beginp && 517 !isalnum(*(sp-1))) ) && 518 (sp < m->endp && isalnum(*sp)) ) 519 { /* yes */ } 520 else 521 return(NULL); 522 break; 523 case OEOW: 524 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 525 (sp < m->endp && *sp == '\n' && 526 (m->g->cflags®_NEWLINE)) || 527 (sp < m->endp && !isalnum(*sp)) ) && 528 (sp > m->beginp && isalnum(*(sp-1))) ) 529 { /* yes */ } 530 else 531 return(NULL); 532 break; 533 case O_QUEST: 534 break; 535 case OOR1: /* matches null but needs to skip */ 536 ss++; 537 s = m->g->strip[ss]; 538 do { 539 assert(OP(s) == OOR2); 540 ss += OPND(s); 541 } while (OP(s = m->g->strip[ss]) != O_CH); 542 /* note that the ss++ gets us past the O_CH */ 543 break; 544 default: /* have to make a choice */ 545 hard = 1; 546 break; 547 } 548 if (!hard) { /* that was it! */ 549 if (sp != stop) 550 return(NULL); 551 return(sp); 552 } 553 ss--; /* adjust for the for's final increment */ 554 555 /* the hard stuff */ 556 AT("hard", sp, stop, ss, stopst); 557 s = m->g->strip[ss]; 558 switch (OP(s)) { 559 case OBACK_: /* the vilest depths */ 560 i = OPND(s); 561 if (m->pmatch[i].rm_eo == -1) 562 return(NULL); 563 assert(m->pmatch[i].rm_so != -1); 564 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 565 assert(stop - m->beginp >= len); 566 if (sp > stop - len) 567 return(NULL); /* not enough left to match */ 568 ssp = m->offp + m->pmatch[i].rm_so; 569 if (memcmp(sp, ssp, len) != 0) 570 return(NULL); 571 while (m->g->strip[ss] != SOP(O_BACK, i)) 572 ss++; 573 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 574 break; 575 case OQUEST_: /* to null or not */ 576 dp = backref(m, sp, stop, ss+1, stopst, lev); 577 if (dp != NULL) 578 return(dp); /* not */ 579 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 580 break; 581 case OPLUS_: 582 assert(m->lastpos != NULL); 583 assert(lev+1 <= m->g->nplus); 584 m->lastpos[lev+1] = sp; 585 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 586 break; 587 case O_PLUS: 588 if (sp == m->lastpos[lev]) /* last pass matched null */ 589 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 590 /* try another pass */ 591 m->lastpos[lev] = sp; 592 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 593 if (dp == NULL) 594 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 595 else 596 return(dp); 597 break; 598 case OCH_: /* find the right one, if any */ 599 ssub = ss + 1; 600 esub = ss + OPND(s) - 1; 601 assert(OP(m->g->strip[esub]) == OOR1); 602 for (;;) { /* find first matching branch */ 603 dp = backref(m, sp, stop, ssub, esub, lev); 604 if (dp != NULL) 605 return(dp); 606 /* that one missed, try next one */ 607 if (OP(m->g->strip[esub]) == O_CH) 608 return(NULL); /* there is none */ 609 esub++; 610 assert(OP(m->g->strip[esub]) == OOR2); 611 ssub = esub + 1; 612 esub += OPND(m->g->strip[esub]); 613 if (OP(m->g->strip[esub]) == OOR2) 614 esub--; 615 else 616 assert(OP(m->g->strip[esub]) == O_CH); 617 } 618 break; 619 case OLPAREN: /* must undo assignment if rest fails */ 620 i = OPND(s); 621 offsave = m->pmatch[i].rm_so; 622 m->pmatch[i].rm_so = sp - m->offp; 623 dp = backref(m, sp, stop, ss+1, stopst, lev); 624 if (dp != NULL) 625 return(dp); 626 m->pmatch[i].rm_so = offsave; 627 return(NULL); 628 break; 629 case ORPAREN: /* must undo assignment if rest fails */ 630 i = OPND(s); 631 offsave = m->pmatch[i].rm_eo; 632 m->pmatch[i].rm_eo = sp - m->offp; 633 dp = backref(m, sp, stop, ss+1, stopst, lev); 634 if (dp != NULL) 635 return(dp); 636 m->pmatch[i].rm_eo = offsave; 637 return(NULL); 638 break; 639 default: /* uh oh */ 640 assert(nope); 641 break; 642 } 643 644 /* "can't happen" */ 645 assert(nope); 646 /* NOTREACHED */ 647 } 648 649 /* 650 - fast - step through the string at top speed 651 == static char *fast(register struct match *m, char *start, \ 652 == char *stop, sopno startst, sopno stopst); 653 */ 654 static char * /* where tentative match ended, or NULL */ 655 fast(m, start, stop, startst, stopst) 656 register struct match *m; 657 char *start; 658 char *stop; 659 sopno startst; 660 sopno stopst; 661 { 662 register states st = m->st; 663 register states fresh = m->fresh; 664 register states tmp = m->tmp; 665 register char *p = start; 666 register int c = (start == m->beginp) ? OUT : *(start-1); 667 register int lastc; /* previous c */ 668 register int flagch; 669 register int i; 670 register char *coldp; /* last p after which no match was underway */ 671 672 CLEAR(st); 673 SET1(st, startst); 674 st = step(m->g, startst, stopst, st, NOTHING, st); 675 ASSIGN(fresh, st); 676 SP("start", st, *p); 677 coldp = NULL; 678 for (;;) { 679 /* next character */ 680 lastc = c; 681 c = (p == m->endp) ? OUT : *p; 682 if (EQ(st, fresh)) 683 coldp = p; 684 685 /* is there an EOL and/or BOL between lastc and c? */ 686 flagch = '\0'; 687 i = 0; 688 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 689 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 690 flagch = BOL; 691 i = m->g->nbol; 692 } 693 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 694 (c == OUT && !(m->eflags®_NOTEOL)) ) { 695 flagch = (flagch == BOL) ? BOLEOL : EOL; 696 i += m->g->neol; 697 } 698 if (i != 0) { 699 for (; i > 0; i--) 700 st = step(m->g, startst, stopst, st, flagch, st); 701 SP("boleol", st, c); 702 } 703 704 /* how about a word boundary? */ 705 if ( (flagch == BOL || (lastc != OUT && !isalnum(lastc))) && 706 (c != OUT && isalnum(c)) ) { 707 flagch = BOW; 708 } 709 if ( (lastc != OUT && isalnum(lastc)) && 710 (flagch == EOL || (c != OUT && !isalnum(c))) ) { 711 flagch = EOW; 712 } 713 if (flagch == BOW || flagch == EOW) { 714 st = step(m->g, startst, stopst, st, flagch, st); 715 SP("boweow", st, c); 716 } 717 718 /* are we done? */ 719 if (ISSET(st, stopst) || p == stop) 720 break; /* NOTE BREAK OUT */ 721 722 /* no, we must deal with this character */ 723 ASSIGN(tmp, st); 724 ASSIGN(st, fresh); 725 assert(c != OUT); 726 st = step(m->g, startst, stopst, tmp, c, st); 727 SP("aft", st, c); 728 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 729 p++; 730 } 731 732 assert(coldp != NULL); 733 m->coldp = coldp; 734 if (ISSET(st, stopst)) 735 return(p+1); 736 else 737 return(NULL); 738 } 739 740 /* 741 - slow - step through the string more deliberately 742 == static char *slow(register struct match *m, char *start, \ 743 == char *stop, sopno startst, sopno stopst); 744 */ 745 static char * /* where it ended */ 746 slow(m, start, stop, startst, stopst) 747 register struct match *m; 748 char *start; 749 char *stop; 750 sopno startst; 751 sopno stopst; 752 { 753 register states st = m->st; 754 register states empty = m->empty; 755 register states tmp = m->tmp; 756 register char *p = start; 757 register int c = (start == m->beginp) ? OUT : *(start-1); 758 register int lastc; /* previous c */ 759 register int flagch; 760 register int i; 761 register char *matchp; /* last p at which a match ended */ 762 763 AT("slow", start, stop, startst, stopst); 764 CLEAR(st); 765 SET1(st, startst); 766 SP("sstart", st, *p); 767 st = step(m->g, startst, stopst, st, NOTHING, st); 768 matchp = NULL; 769 for (;;) { 770 /* next character */ 771 lastc = c; 772 c = (p == m->endp) ? OUT : *p; 773 774 /* is there an EOL and/or BOL between lastc and c? */ 775 flagch = '\0'; 776 i = 0; 777 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 778 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 779 flagch = BOL; 780 i = m->g->nbol; 781 } 782 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 783 (c == OUT && !(m->eflags®_NOTEOL)) ) { 784 flagch = (flagch == BOL) ? BOLEOL : EOL; 785 i += m->g->neol; 786 } 787 if (i != 0) { 788 for (; i > 0; i--) 789 st = step(m->g, startst, stopst, st, flagch, st); 790 SP("sboleol", st, c); 791 } 792 793 /* how about a word boundary? */ 794 if ( (flagch == BOL || (lastc != OUT && !isalnum(lastc))) && 795 (c != OUT && isalnum(c)) ) { 796 flagch = BOW; 797 } 798 if ( (lastc != OUT && isalnum(lastc)) && 799 (flagch == EOL || (c != OUT && !isalnum(c))) ) { 800 flagch = EOW; 801 } 802 if (flagch == BOW || flagch == EOW) { 803 st = step(m->g, startst, stopst, st, flagch, st); 804 SP("sboweow", st, c); 805 } 806 807 /* are we done? */ 808 if (ISSET(st, stopst)) 809 matchp = p; 810 if (EQ(st, empty) || p == stop) 811 break; /* NOTE BREAK OUT */ 812 813 /* no, we must deal with this character */ 814 ASSIGN(tmp, st); 815 ASSIGN(st, empty); 816 assert(c != OUT); 817 st = step(m->g, startst, stopst, tmp, c, st); 818 SP("saft", st, c); 819 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 820 p++; 821 } 822 823 return(matchp); 824 } 825 826 827 /* 828 - step - map set of states reachable before char to set reachable after 829 == static states step(register struct re_guts *g, int start, int stop, \ 830 == register states bef, int ch, register states aft); 831 == #define BOL (OUT+1) 832 == #define EOL (BOL+1) 833 == #define BOLEOL (BOL+2) 834 == #define NOTHING (BOL+3) 835 == #define BOW (BOL+4) 836 == #define EOW (BOL+5) 837 == #define CODEMAX (BOL+5) // highest code used 838 == #define NONCHAR(c) ((c) > CHAR_MAX) 839 == #define NNONCHAR (CODEMAX-CHAR_MAX) 840 */ 841 static states 842 step(g, start, stop, bef, ch, aft) 843 register struct re_guts *g; 844 int start; /* start state within strip */ 845 int stop; /* state after stop state within strip */ 846 register states bef; /* states reachable before */ 847 int ch; /* character or NONCHAR code */ 848 register states aft; /* states already known reachable after */ 849 { 850 register cset *cs; 851 register sop s; 852 register sopno pc; 853 register onestate here; /* note, macros know this name */ 854 register sopno look; 855 register int i; 856 857 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 858 s = g->strip[pc]; 859 switch (OP(s)) { 860 case OEND: 861 assert(pc == stop-1); 862 break; 863 case OCHAR: 864 /* only characters can match */ 865 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 866 if (ch == (char)OPND(s)) 867 FWD(aft, bef, 1); 868 break; 869 case OBOL: 870 if (ch == BOL || ch == BOLEOL) 871 FWD(aft, bef, 1); 872 break; 873 case OEOL: 874 if (ch == EOL || ch == BOLEOL) 875 FWD(aft, bef, 1); 876 break; 877 case OBOW: 878 if (ch == BOW) 879 FWD(aft, bef, 1); 880 break; 881 case OEOW: 882 if (ch == EOW) 883 FWD(aft, bef, 1); 884 break; 885 case OANY: 886 if (!NONCHAR(ch)) 887 FWD(aft, bef, 1); 888 break; 889 case OANYOF: 890 cs = &g->sets[OPND(s)]; 891 if (!NONCHAR(ch) && CHIN(cs, ch)) 892 FWD(aft, bef, 1); 893 break; 894 case OBACK_: /* ignored here */ 895 case O_BACK: 896 FWD(aft, aft, 1); 897 break; 898 case OPLUS_: /* forward, this is just an empty */ 899 FWD(aft, aft, 1); 900 break; 901 case O_PLUS: /* both forward and back */ 902 FWD(aft, aft, 1); 903 i = ISSETBACK(aft, OPND(s)); 904 BACK(aft, aft, OPND(s)); 905 if (!i && ISSETBACK(aft, OPND(s))) { 906 /* oho, must reconsider loop body */ 907 pc -= OPND(s) + 1; 908 INIT(here, pc); 909 } 910 break; 911 case OQUEST_: /* two branches, both forward */ 912 FWD(aft, aft, 1); 913 FWD(aft, aft, OPND(s)); 914 break; 915 case O_QUEST: /* just an empty */ 916 FWD(aft, aft, 1); 917 break; 918 case OLPAREN: /* not significant here */ 919 case ORPAREN: 920 FWD(aft, aft, 1); 921 break; 922 case OCH_: /* mark the first two branches */ 923 FWD(aft, aft, 1); 924 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 925 FWD(aft, aft, OPND(s)); 926 break; 927 case OOR1: /* done a branch, find the O_CH */ 928 if (ISSTATEIN(aft, here)) { 929 for (look = 1; 930 OP(s = g->strip[pc+look]) != O_CH; 931 look += OPND(s)) 932 assert(OP(s) == OOR2); 933 FWD(aft, aft, look); 934 } 935 break; 936 case OOR2: /* propagate OCH_'s marking */ 937 FWD(aft, aft, 1); 938 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 939 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 940 FWD(aft, aft, OPND(s)); 941 } 942 break; 943 case O_CH: /* just empty */ 944 FWD(aft, aft, 1); 945 break; 946 default: /* ooooops... */ 947 assert(nope); 948 break; 949 } 950 } 951 952 return(aft); 953 } 954 955 #ifdef REDEBUG 956 /* 957 - print - print a set of states 958 == #ifdef REDEBUG 959 == static void print(struct match *m, char *caption, states st, \ 960 == int ch, FILE *d); 961 == #endif 962 */ 963 static void 964 print(m, caption, st, ch, d) 965 struct match *m; 966 char *caption; 967 states st; 968 int ch; 969 FILE *d; 970 { 971 register struct re_guts *g = m->g; 972 register int i; 973 register int first = 1; 974 975 if (!(m->eflags®_TRACE)) 976 return; 977 978 fprintf(d, "%s", caption); 979 if (ch != '\0') 980 fprintf(d, " %s", pchar(ch)); 981 for (i = 0; i < g->nstates; i++) 982 if (ISSET(st, i)) { 983 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 984 first = 0; 985 } 986 fprintf(d, "\n"); 987 } 988 989 /* 990 - at - print current situation 991 == #ifdef REDEBUG 992 == static void at(struct match *m, char *title, char *start, char *stop, \ 993 == sopno startst, stopno stopst); 994 == #endif 995 */ 996 static void 997 at(m, title, start, stop, startst, stopst) 998 struct match *m; 999 char *title; 1000 char *start; 1001 char *stop; 1002 sopno startst; 1003 sopno stopst; 1004 { 1005 if (!(m->eflags®_TRACE)) 1006 return; 1007 1008 printf("%s %s-", title, pchar(*start)); 1009 printf("%s ", pchar(*stop)); 1010 printf("%ld-%ld\n", (long)startst, (long)stopst); 1011 } 1012 1013 #ifndef PCHARDONE 1014 #define PCHARDONE /* never again */ 1015 /* 1016 - pchar - make a character printable 1017 == #ifdef REDEBUG 1018 == static char *pchar(int ch); 1019 == #endif 1020 * 1021 * Is this identical to regchar() over in debug.c? Well, yes. But a 1022 * duplicate here avoids having a debugging-capable regexec.o tied to 1023 * a matching debug.o, and this is convenient. It all disappears in 1024 * the non-debug compilation anyway, so it doesn't matter much. 1025 */ 1026 static char * /* -> representation */ 1027 pchar(ch) 1028 int ch; 1029 { 1030 static char pbuf[10]; 1031 1032 if (isprint(ch) || ch == ' ') 1033 sprintf(pbuf, "%c", ch); 1034 else 1035 sprintf(pbuf, "\\%o", ch); 1036 return(pbuf); 1037 } 1038 #endif 1039 #endif 1040 1041 #undef matcher 1042 #undef fast 1043 #undef slow 1044 #undef dissect 1045 #undef backref 1046 #undef step 1047 #undef print 1048 #undef at 1049 #undef match 1050