1 /* $OpenBSD: engine.c,v 1.25 2018/07/11 12:38:46 martijn Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38 /* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45 #ifdef SNAMES 46 #define matcher smatcher 47 #define fast sfast 48 #define slow sslow 49 #define dissect sdissect 50 #define backref sbackref 51 #define step sstep 52 #define print sprint 53 #define at sat 54 #define match smat 55 #define nope snope 56 #endif 57 #ifdef LNAMES 58 #define matcher lmatcher 59 #define fast lfast 60 #define slow lslow 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #define nope lnope 68 #endif 69 70 /* another structure passed up and down to avoid zillions of parameters */ 71 struct match { 72 struct re_guts *g; 73 int eflags; 74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 75 const char *offp; /* offsets work from here */ 76 const char *beginp; /* start of string -- virtual NUL precedes */ 77 const char *endp; /* end of string -- virtual NUL here */ 78 const char *coldp; /* can be no match starting before here */ 79 const char **lastpos; /* [nplus+1] */ 80 STATEVARS; 81 states st; /* current states */ 82 states fresh; /* states for a fresh start */ 83 states tmp; /* temporary */ 84 states empty; /* empty set of states */ 85 }; 86 87 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int); 88 static const char *dissect(struct match *, const char *, const char *, sopno, 89 sopno); 90 static const char *backref(struct match *, const char *, const char *, sopno, 91 sopno, sopno, int); 92 static const char *fast(struct match *, const char *, const char *, sopno, 93 sopno); 94 static const char *slow(struct match *, const char *, const char *, sopno, 95 sopno); 96 static states step(struct re_guts *, sopno, sopno, states, int, states); 97 #define MAX_RECURSION 100 98 #define BOL (OUT+1) 99 #define EOL (BOL+1) 100 #define BOLEOL (BOL+2) 101 #define NOTHING (BOL+3) 102 #define BOW (BOL+4) 103 #define EOW (BOL+5) 104 /* update nonchars[] array below when adding fake chars here */ 105 #define CODEMAX (BOL+5) /* highest code used */ 106 #define NONCHAR(c) ((c) > CHAR_MAX) 107 #define NNONCHAR (CODEMAX-CHAR_MAX) 108 #ifdef REDEBUG 109 static void print(struct match *, const char *, states, int, FILE *); 110 #endif 111 #ifdef REDEBUG 112 static void at(struct match *, const char *, const char *, const char *, 113 sopno, sopno); 114 #endif 115 #ifdef REDEBUG 116 static const char *pchar(int); 117 #endif 118 119 #ifdef REDEBUG 120 #define SP(t, s, c) print(m, t, s, c, stdout) 121 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 122 #define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 123 static int nope = 0; 124 #else 125 #define SP(t, s, c) /* nothing */ 126 #define AT(t, p1, p2, s1, s2) /* nothing */ 127 #define NOTE(s) /* nothing */ 128 #endif 129 130 /* 131 - matcher - the actual matching engine 132 */ 133 static int /* 0 success, REG_NOMATCH failure */ 134 matcher(struct re_guts *g, const char *string, size_t nmatch, 135 regmatch_t pmatch[], int eflags) 136 { 137 const char *endp; 138 int i; 139 struct match mv; 140 struct match *m = &mv; 141 const char *dp; 142 const sopno gf = g->firststate+1; /* +1 for OEND */ 143 const sopno gl = g->laststate; 144 const char *start; 145 const char *stop; 146 147 /* simplify the situation where possible */ 148 if (g->cflags®_NOSUB) 149 nmatch = 0; 150 if (eflags®_STARTEND) { 151 start = string + pmatch[0].rm_so; 152 stop = string + pmatch[0].rm_eo; 153 } else { 154 start = string; 155 stop = start + strlen(start); 156 } 157 if (stop < start) 158 return(REG_INVARG); 159 160 /* prescreening; this does wonders for this rather slow code */ 161 if (g->must != NULL) { 162 for (dp = start; dp < stop; dp++) 163 if (*dp == g->must[0] && stop - dp >= g->mlen && 164 memcmp(dp, g->must, g->mlen) == 0) 165 break; 166 if (dp == stop) /* we didn't find g->must */ 167 return(REG_NOMATCH); 168 } 169 170 /* match struct setup */ 171 m->g = g; 172 m->eflags = eflags; 173 m->pmatch = NULL; 174 m->lastpos = NULL; 175 m->offp = string; 176 m->beginp = start; 177 m->endp = stop; 178 STATESETUP(m, 4); 179 SETUP(m->st); 180 SETUP(m->fresh); 181 SETUP(m->tmp); 182 SETUP(m->empty); 183 CLEAR(m->empty); 184 185 /* this loop does only one repetition except for backrefs */ 186 for (;;) { 187 endp = fast(m, start, stop, gf, gl); 188 if (endp == NULL) { /* a miss */ 189 free(m->pmatch); 190 free(m->lastpos); 191 STATETEARDOWN(m); 192 return(REG_NOMATCH); 193 } 194 if (nmatch == 0 && !g->backrefs) 195 break; /* no further info needed */ 196 197 /* where? */ 198 assert(m->coldp != NULL); 199 for (;;) { 200 NOTE("finding start"); 201 endp = slow(m, m->coldp, stop, gf, gl); 202 if (endp != NULL) 203 break; 204 assert(m->coldp < m->endp); 205 m->coldp++; 206 } 207 if (nmatch == 1 && !g->backrefs) 208 break; /* no further info needed */ 209 210 /* oh my, he wants the subexpressions... */ 211 if (m->pmatch == NULL) 212 m->pmatch = reallocarray(NULL, m->g->nsub + 1, 213 sizeof(regmatch_t)); 214 if (m->pmatch == NULL) { 215 STATETEARDOWN(m); 216 return(REG_ESPACE); 217 } 218 for (i = 1; i <= m->g->nsub; i++) 219 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 220 if (!g->backrefs && !(m->eflags®_BACKR)) { 221 NOTE("dissecting"); 222 dp = dissect(m, m->coldp, endp, gf, gl); 223 } else { 224 if (g->nplus > 0 && m->lastpos == NULL) 225 m->lastpos = reallocarray(NULL, 226 g->nplus+1, sizeof(char *)); 227 if (g->nplus > 0 && m->lastpos == NULL) { 228 free(m->pmatch); 229 STATETEARDOWN(m); 230 return(REG_ESPACE); 231 } 232 NOTE("backref dissect"); 233 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 234 } 235 if (dp != NULL) 236 break; 237 238 /* uh-oh... we couldn't find a subexpression-level match */ 239 assert(g->backrefs); /* must be back references doing it */ 240 assert(g->nplus == 0 || m->lastpos != NULL); 241 for (;;) { 242 if (dp != NULL || endp <= m->coldp) 243 break; /* defeat */ 244 NOTE("backoff"); 245 endp = slow(m, m->coldp, endp-1, gf, gl); 246 if (endp == NULL) 247 break; /* defeat */ 248 /* try it on a shorter possibility */ 249 #ifndef NDEBUG 250 for (i = 1; i <= m->g->nsub; i++) { 251 assert(m->pmatch[i].rm_so == -1); 252 assert(m->pmatch[i].rm_eo == -1); 253 } 254 #endif 255 NOTE("backoff dissect"); 256 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 257 } 258 assert(dp == NULL || dp == endp); 259 if (dp != NULL) /* found a shorter one */ 260 break; 261 262 /* despite initial appearances, there is no match here */ 263 NOTE("false alarm"); 264 if (m->coldp == stop) 265 break; 266 start = m->coldp + 1; /* recycle starting later */ 267 } 268 269 /* fill in the details if requested */ 270 if (nmatch > 0) { 271 pmatch[0].rm_so = m->coldp - m->offp; 272 pmatch[0].rm_eo = endp - m->offp; 273 } 274 if (nmatch > 1) { 275 assert(m->pmatch != NULL); 276 for (i = 1; i < nmatch; i++) 277 if (i <= m->g->nsub) 278 pmatch[i] = m->pmatch[i]; 279 else { 280 pmatch[i].rm_so = -1; 281 pmatch[i].rm_eo = -1; 282 } 283 } 284 285 free(m->pmatch); 286 free(m->lastpos); 287 STATETEARDOWN(m); 288 return(0); 289 } 290 291 /* 292 - dissect - figure out what matched what, no back references 293 */ 294 static const char * /* == stop (success) always */ 295 dissect(struct match *m, const char *start, const char *stop, sopno startst, 296 sopno stopst) 297 { 298 int i; 299 sopno ss; /* start sop of current subRE */ 300 sopno es; /* end sop of current subRE */ 301 const char *sp; /* start of string matched by it */ 302 const char *stp; /* string matched by it cannot pass here */ 303 const char *rest; /* start of rest of string */ 304 const char *tail; /* string unmatched by rest of RE */ 305 sopno ssub; /* start sop of subsubRE */ 306 sopno esub; /* end sop of subsubRE */ 307 const char *ssp; /* start of string matched by subsubRE */ 308 const char *sep; /* end of string matched by subsubRE */ 309 const char *oldssp; /* previous ssp */ 310 const char *dp; 311 312 AT("diss", start, stop, startst, stopst); 313 sp = start; 314 for (ss = startst; ss < stopst; ss = es) { 315 /* identify end of subRE */ 316 es = ss; 317 switch (OP(m->g->strip[es])) { 318 case OPLUS_: 319 case OQUEST_: 320 es += OPND(m->g->strip[es]); 321 break; 322 case OCH_: 323 while (OP(m->g->strip[es]) != O_CH) 324 es += OPND(m->g->strip[es]); 325 break; 326 } 327 es++; 328 329 /* figure out what it matched */ 330 switch (OP(m->g->strip[ss])) { 331 case OEND: 332 assert(nope); 333 break; 334 case OCHAR: 335 sp++; 336 break; 337 case OBOL: 338 case OEOL: 339 case OBOW: 340 case OEOW: 341 break; 342 case OANY: 343 case OANYOF: 344 sp++; 345 break; 346 case OBACK_: 347 case O_BACK: 348 assert(nope); 349 break; 350 /* cases where length of match is hard to find */ 351 case OQUEST_: 352 stp = stop; 353 for (;;) { 354 /* how long could this one be? */ 355 rest = slow(m, sp, stp, ss, es); 356 assert(rest != NULL); /* it did match */ 357 /* could the rest match the rest? */ 358 tail = slow(m, rest, stop, es, stopst); 359 if (tail == stop) 360 break; /* yes! */ 361 /* no -- try a shorter match for this one */ 362 stp = rest - 1; 363 assert(stp >= sp); /* it did work */ 364 } 365 ssub = ss + 1; 366 esub = es - 1; 367 /* did innards match? */ 368 if (slow(m, sp, rest, ssub, esub) != NULL) { 369 dp = dissect(m, sp, rest, ssub, esub); 370 assert(dp == rest); 371 } else /* no */ 372 assert(sp == rest); 373 sp = rest; 374 break; 375 case OPLUS_: 376 stp = stop; 377 for (;;) { 378 /* how long could this one be? */ 379 rest = slow(m, sp, stp, ss, es); 380 assert(rest != NULL); /* it did match */ 381 /* could the rest match the rest? */ 382 tail = slow(m, rest, stop, es, stopst); 383 if (tail == stop) 384 break; /* yes! */ 385 /* no -- try a shorter match for this one */ 386 stp = rest - 1; 387 assert(stp >= sp); /* it did work */ 388 } 389 ssub = ss + 1; 390 esub = es - 1; 391 ssp = sp; 392 oldssp = ssp; 393 for (;;) { /* find last match of innards */ 394 sep = slow(m, ssp, rest, ssub, esub); 395 if (sep == NULL || sep == ssp) 396 break; /* failed or matched null */ 397 oldssp = ssp; /* on to next try */ 398 ssp = sep; 399 } 400 if (sep == NULL) { 401 /* last successful match */ 402 sep = ssp; 403 ssp = oldssp; 404 } 405 assert(sep == rest); /* must exhaust substring */ 406 assert(slow(m, ssp, sep, ssub, esub) == rest); 407 dp = dissect(m, ssp, sep, ssub, esub); 408 assert(dp == sep); 409 sp = rest; 410 break; 411 case OCH_: 412 stp = stop; 413 for (;;) { 414 /* how long could this one be? */ 415 rest = slow(m, sp, stp, ss, es); 416 assert(rest != NULL); /* it did match */ 417 /* could the rest match the rest? */ 418 tail = slow(m, rest, stop, es, stopst); 419 if (tail == stop) 420 break; /* yes! */ 421 /* no -- try a shorter match for this one */ 422 stp = rest - 1; 423 assert(stp >= sp); /* it did work */ 424 } 425 ssub = ss + 1; 426 esub = ss + OPND(m->g->strip[ss]) - 1; 427 assert(OP(m->g->strip[esub]) == OOR1); 428 for (;;) { /* find first matching branch */ 429 if (slow(m, sp, rest, ssub, esub) == rest) 430 break; /* it matched all of it */ 431 /* that one missed, try next one */ 432 assert(OP(m->g->strip[esub]) == OOR1); 433 esub++; 434 assert(OP(m->g->strip[esub]) == OOR2); 435 ssub = esub + 1; 436 esub += OPND(m->g->strip[esub]); 437 if (OP(m->g->strip[esub]) == OOR2) 438 esub--; 439 else 440 assert(OP(m->g->strip[esub]) == O_CH); 441 } 442 dp = dissect(m, sp, rest, ssub, esub); 443 assert(dp == rest); 444 sp = rest; 445 break; 446 case O_PLUS: 447 case O_QUEST: 448 case OOR1: 449 case OOR2: 450 case O_CH: 451 assert(nope); 452 break; 453 case OLPAREN: 454 i = OPND(m->g->strip[ss]); 455 assert(0 < i && i <= m->g->nsub); 456 m->pmatch[i].rm_so = sp - m->offp; 457 break; 458 case ORPAREN: 459 i = OPND(m->g->strip[ss]); 460 assert(0 < i && i <= m->g->nsub); 461 m->pmatch[i].rm_eo = sp - m->offp; 462 break; 463 default: /* uh oh */ 464 assert(nope); 465 break; 466 } 467 } 468 469 assert(sp == stop); 470 return(sp); 471 } 472 473 /* 474 - backref - figure out what matched what, figuring in back references 475 */ 476 static const char * /* == stop (success) or NULL (failure) */ 477 backref(struct match *m, const char *start, const char *stop, sopno startst, 478 sopno stopst, sopno lev, int rec) /* PLUS nesting level */ 479 { 480 int i; 481 sopno ss; /* start sop of current subRE */ 482 const char *sp; /* start of string matched by it */ 483 sopno ssub; /* start sop of subsubRE */ 484 sopno esub; /* end sop of subsubRE */ 485 const char *ssp;/* start of string matched by subsubRE */ 486 const char *dp; 487 size_t len; 488 int hard; 489 sop s; 490 regoff_t offsave; 491 cset *cs; 492 493 AT("back", start, stop, startst, stopst); 494 sp = start; 495 496 /* get as far as we can with easy stuff */ 497 hard = 0; 498 for (ss = startst; !hard && ss < stopst; ss++) 499 switch (OP(s = m->g->strip[ss])) { 500 case OCHAR: 501 if (sp == stop || *sp++ != (char)OPND(s)) 502 return(NULL); 503 break; 504 case OANY: 505 if (sp == stop) 506 return(NULL); 507 sp++; 508 break; 509 case OANYOF: 510 cs = &m->g->sets[OPND(s)]; 511 if (sp == stop || !CHIN(cs, *sp++)) 512 return(NULL); 513 break; 514 case OBOL: 515 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 516 (sp > m->offp && sp < m->endp && 517 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 518 { /* yes */ } 519 else 520 return(NULL); 521 break; 522 case OEOL: 523 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 524 (sp < m->endp && *sp == '\n' && 525 (m->g->cflags®_NEWLINE)) ) 526 { /* yes */ } 527 else 528 return(NULL); 529 break; 530 case OBOW: 531 if (sp < m->endp && ISWORD(*sp) && 532 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 533 (sp > m->offp && !ISWORD(*(sp-1))))) 534 { /* yes */ } 535 else 536 return(NULL); 537 break; 538 case OEOW: 539 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 540 (sp < m->endp && *sp == '\n' && 541 (m->g->cflags®_NEWLINE)) || 542 (sp < m->endp && !ISWORD(*sp)) ) && 543 (sp > m->beginp && ISWORD(*(sp-1))) ) 544 { /* yes */ } 545 else 546 return(NULL); 547 break; 548 case O_QUEST: 549 break; 550 case OOR1: /* matches null but needs to skip */ 551 ss++; 552 s = m->g->strip[ss]; 553 do { 554 assert(OP(s) == OOR2); 555 ss += OPND(s); 556 } while (OP(s = m->g->strip[ss]) != O_CH); 557 /* note that the ss++ gets us past the O_CH */ 558 break; 559 default: /* have to make a choice */ 560 hard = 1; 561 break; 562 } 563 if (!hard) { /* that was it! */ 564 if (sp != stop) 565 return(NULL); 566 return(sp); 567 } 568 ss--; /* adjust for the for's final increment */ 569 570 /* the hard stuff */ 571 AT("hard", sp, stop, ss, stopst); 572 s = m->g->strip[ss]; 573 switch (OP(s)) { 574 case OBACK_: /* the vilest depths */ 575 i = OPND(s); 576 assert(0 < i && i <= m->g->nsub); 577 if (m->pmatch[i].rm_eo == -1) 578 return(NULL); 579 assert(m->pmatch[i].rm_so != -1); 580 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 581 if (len == 0 && rec++ > MAX_RECURSION) 582 return(NULL); 583 assert(stop - m->beginp >= len); 584 if (sp > stop - len) 585 return(NULL); /* not enough left to match */ 586 ssp = m->offp + m->pmatch[i].rm_so; 587 if (memcmp(sp, ssp, len) != 0) 588 return(NULL); 589 while (m->g->strip[ss] != SOP(O_BACK, i)) 590 ss++; 591 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 592 break; 593 case OQUEST_: /* to null or not */ 594 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 595 if (dp != NULL) 596 return(dp); /* not */ 597 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 598 break; 599 case OPLUS_: 600 assert(m->lastpos != NULL); 601 assert(lev+1 <= m->g->nplus); 602 m->lastpos[lev+1] = sp; 603 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 604 break; 605 case O_PLUS: 606 if (sp == m->lastpos[lev]) /* last pass matched null */ 607 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 608 /* try another pass */ 609 m->lastpos[lev] = sp; 610 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 611 if (dp == NULL) 612 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 613 else 614 return(dp); 615 break; 616 case OCH_: /* find the right one, if any */ 617 ssub = ss + 1; 618 esub = ss + OPND(s) - 1; 619 assert(OP(m->g->strip[esub]) == OOR1); 620 for (;;) { /* find first matching branch */ 621 dp = backref(m, sp, stop, ssub, esub, lev, rec); 622 if (dp != NULL) 623 return(dp); 624 /* that one missed, try next one */ 625 if (OP(m->g->strip[esub]) == O_CH) 626 return(NULL); /* there is none */ 627 esub++; 628 assert(OP(m->g->strip[esub]) == OOR2); 629 ssub = esub + 1; 630 esub += OPND(m->g->strip[esub]); 631 if (OP(m->g->strip[esub]) == OOR2) 632 esub--; 633 else 634 assert(OP(m->g->strip[esub]) == O_CH); 635 } 636 break; 637 case OLPAREN: /* must undo assignment if rest fails */ 638 i = OPND(s); 639 assert(0 < i && i <= m->g->nsub); 640 offsave = m->pmatch[i].rm_so; 641 m->pmatch[i].rm_so = sp - m->offp; 642 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 643 if (dp != NULL) 644 return(dp); 645 m->pmatch[i].rm_so = offsave; 646 return(NULL); 647 break; 648 case ORPAREN: /* must undo assignment if rest fails */ 649 i = OPND(s); 650 assert(0 < i && i <= m->g->nsub); 651 offsave = m->pmatch[i].rm_eo; 652 m->pmatch[i].rm_eo = sp - m->offp; 653 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 654 if (dp != NULL) 655 return(dp); 656 m->pmatch[i].rm_eo = offsave; 657 return(NULL); 658 break; 659 default: /* uh oh */ 660 assert(nope); 661 break; 662 } 663 664 /* "can't happen" */ 665 assert(nope); 666 /* NOTREACHED */ 667 return NULL; 668 } 669 670 /* 671 - fast - step through the string at top speed 672 */ 673 static const char * /* where tentative match ended, or NULL */ 674 fast(struct match *m, const char *start, const char *stop, sopno startst, 675 sopno stopst) 676 { 677 states st = m->st; 678 states fresh = m->fresh; 679 states tmp = m->tmp; 680 const char *p = start; 681 int c; 682 int lastc; /* previous c */ 683 int flagch; 684 int i; 685 const char *coldp; /* last p after which no match was underway */ 686 687 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 688 c = OUT; 689 else 690 c = *(start-1); 691 692 CLEAR(st); 693 SET1(st, startst); 694 st = step(m->g, startst, stopst, st, NOTHING, st); 695 ASSIGN(fresh, st); 696 SP("start", st, *p); 697 coldp = NULL; 698 for (;;) { 699 /* next character */ 700 lastc = c; 701 c = (p == m->endp) ? OUT : *p; 702 if (EQ(st, fresh)) 703 coldp = p; 704 705 /* is there an EOL and/or BOL between lastc and c? */ 706 flagch = '\0'; 707 i = 0; 708 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 709 (lastc == OUT && !(m->eflags®_NOTBOL))) { 710 flagch = BOL; 711 i = m->g->nbol; 712 } 713 if ((c == '\n' && m->g->cflags®_NEWLINE) || 714 (c == OUT && !(m->eflags®_NOTEOL)) ) { 715 flagch = (flagch == BOL) ? BOLEOL : EOL; 716 i += m->g->neol; 717 } 718 if (i != 0) { 719 for (; i > 0; i--) 720 st = step(m->g, startst, stopst, 721 st, flagch, st); 722 SP("boleol", st, c); 723 } 724 725 /* how about a word boundary? */ 726 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 727 (c != OUT && ISWORD(c))) 728 flagch = BOW; 729 if ((lastc != OUT && ISWORD(lastc)) && 730 (flagch == EOL || (c != OUT && !ISWORD(c)))) 731 flagch = EOW; 732 if (flagch == BOW || flagch == EOW) { 733 st = step(m->g, startst, stopst, st, flagch, st); 734 SP("boweow", st, c); 735 } 736 737 /* are we done? */ 738 if (ISSET(st, stopst) || p == stop) 739 break; /* NOTE BREAK OUT */ 740 741 /* no, we must deal with this character */ 742 ASSIGN(tmp, st); 743 ASSIGN(st, fresh); 744 assert(c != OUT); 745 st = step(m->g, startst, stopst, tmp, c, st); 746 SP("aft", st, c); 747 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 748 p++; 749 } 750 751 assert(coldp != NULL); 752 m->coldp = coldp; 753 if (ISSET(st, stopst)) 754 return(p+1); 755 else 756 return(NULL); 757 } 758 759 /* 760 - slow - step through the string more deliberately 761 */ 762 static const char * /* where it ended */ 763 slow(struct match *m, const char *start, const char *stop, sopno startst, 764 sopno stopst) 765 { 766 states st = m->st; 767 states empty = m->empty; 768 states tmp = m->tmp; 769 const char *p = start; 770 int c; 771 int lastc; /* previous c */ 772 int flagch; 773 int i; 774 const char *matchp; /* last p at which a match ended */ 775 776 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 777 c = OUT; 778 else 779 c = *(start-1); 780 781 AT("slow", start, stop, startst, stopst); 782 CLEAR(st); 783 SET1(st, startst); 784 SP("sstart", st, *p); 785 st = step(m->g, startst, stopst, st, NOTHING, st); 786 matchp = NULL; 787 for (;;) { 788 /* next character */ 789 lastc = c; 790 c = (p == m->endp) ? OUT : *p; 791 792 /* is there an EOL and/or BOL between lastc and c? */ 793 flagch = '\0'; 794 i = 0; 795 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 796 (lastc == OUT && !(m->eflags®_NOTBOL))) { 797 flagch = BOL; 798 i = m->g->nbol; 799 } 800 if ((c == '\n' && m->g->cflags®_NEWLINE) || 801 (c == OUT && !(m->eflags®_NOTEOL))) { 802 flagch = (flagch == BOL) ? BOLEOL : EOL; 803 i += m->g->neol; 804 } 805 if (i != 0) { 806 for (; i > 0; i--) 807 st = step(m->g, startst, stopst, 808 st, flagch, st); 809 SP("sboleol", st, c); 810 } 811 812 /* how about a word boundary? */ 813 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 814 (c != OUT && ISWORD(c))) 815 flagch = BOW; 816 if ((lastc != OUT && ISWORD(lastc)) && 817 (flagch == EOL || (c != OUT && !ISWORD(c)))) 818 flagch = EOW; 819 if (flagch == BOW || flagch == EOW) { 820 st = step(m->g, startst, stopst, st, flagch, st); 821 SP("sboweow", st, c); 822 } 823 824 /* are we done? */ 825 if (ISSET(st, stopst)) 826 matchp = p; 827 if (EQ(st, empty) || p == stop) 828 break; /* NOTE BREAK OUT */ 829 830 /* no, we must deal with this character */ 831 ASSIGN(tmp, st); 832 ASSIGN(st, empty); 833 assert(c != OUT); 834 st = step(m->g, startst, stopst, tmp, c, st); 835 SP("saft", st, c); 836 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 837 p++; 838 } 839 840 return(matchp); 841 } 842 843 844 /* 845 - step - map set of states reachable before char to set reachable after 846 */ 847 static states 848 step(struct re_guts *g, 849 sopno start, /* start state within strip */ 850 sopno stop, /* state after stop state within strip */ 851 states bef, /* states reachable before */ 852 int ch, /* character or NONCHAR code */ 853 states aft) /* states already known reachable after */ 854 { 855 cset *cs; 856 sop s; 857 sopno pc; 858 onestate here; /* note, macros know this name */ 859 sopno look; 860 int i; 861 862 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 863 s = g->strip[pc]; 864 switch (OP(s)) { 865 case OEND: 866 assert(pc == stop-1); 867 break; 868 case OCHAR: 869 /* only characters can match */ 870 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 871 if (ch == (char)OPND(s)) 872 FWD(aft, bef, 1); 873 break; 874 case OBOL: 875 if (ch == BOL || ch == BOLEOL) 876 FWD(aft, bef, 1); 877 break; 878 case OEOL: 879 if (ch == EOL || ch == BOLEOL) 880 FWD(aft, bef, 1); 881 break; 882 case OBOW: 883 if (ch == BOW) 884 FWD(aft, bef, 1); 885 break; 886 case OEOW: 887 if (ch == EOW) 888 FWD(aft, bef, 1); 889 break; 890 case OANY: 891 if (!NONCHAR(ch)) 892 FWD(aft, bef, 1); 893 break; 894 case OANYOF: 895 cs = &g->sets[OPND(s)]; 896 if (!NONCHAR(ch) && CHIN(cs, ch)) 897 FWD(aft, bef, 1); 898 break; 899 case OBACK_: /* ignored here */ 900 case O_BACK: 901 FWD(aft, aft, 1); 902 break; 903 case OPLUS_: /* forward, this is just an empty */ 904 FWD(aft, aft, 1); 905 break; 906 case O_PLUS: /* both forward and back */ 907 FWD(aft, aft, 1); 908 i = ISSETBACK(aft, OPND(s)); 909 BACK(aft, aft, OPND(s)); 910 if (!i && ISSETBACK(aft, OPND(s))) { 911 /* oho, must reconsider loop body */ 912 pc -= OPND(s) + 1; 913 INIT(here, pc); 914 } 915 break; 916 case OQUEST_: /* two branches, both forward */ 917 FWD(aft, aft, 1); 918 FWD(aft, aft, OPND(s)); 919 break; 920 case O_QUEST: /* just an empty */ 921 FWD(aft, aft, 1); 922 break; 923 case OLPAREN: /* not significant here */ 924 case ORPAREN: 925 FWD(aft, aft, 1); 926 break; 927 case OCH_: /* mark the first two branches */ 928 FWD(aft, aft, 1); 929 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 930 FWD(aft, aft, OPND(s)); 931 break; 932 case OOR1: /* done a branch, find the O_CH */ 933 if (ISSTATEIN(aft, here)) { 934 for (look = 1; 935 OP(s = g->strip[pc+look]) != O_CH; 936 look += OPND(s)) 937 assert(OP(s) == OOR2); 938 FWD(aft, aft, look); 939 } 940 break; 941 case OOR2: /* propagate OCH_'s marking */ 942 FWD(aft, aft, 1); 943 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 944 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 945 FWD(aft, aft, OPND(s)); 946 } 947 break; 948 case O_CH: /* just empty */ 949 FWD(aft, aft, 1); 950 break; 951 default: /* ooooops... */ 952 assert(nope); 953 break; 954 } 955 } 956 957 return(aft); 958 } 959 960 #ifdef REDEBUG 961 /* 962 - print - print a set of states 963 */ 964 static void 965 print(struct match *m, const char *caption, states st, int ch, FILE *d) 966 { 967 struct re_guts *g = m->g; 968 int i; 969 int first = 1; 970 971 if (!(m->eflags®_TRACE)) 972 return; 973 974 (void)fprintf(d, "%s", caption); 975 (void)fprintf(d, " %s", pchar(ch)); 976 for (i = 0; i < g->nstates; i++) { 977 if (ISSET(st, i)) { 978 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 979 first = 0; 980 } 981 } 982 (void)fprintf(d, "\n"); 983 } 984 985 /* 986 - at - print current situation 987 */ 988 static void 989 at(struct match *m, const char *title, const char *start, const char *stop, 990 sopno startst, sopno stopst) 991 { 992 if (!(m->eflags®_TRACE)) 993 return; 994 995 (void)printf("%s %s-", title, pchar(*start)); 996 (void)printf("%s ", pchar(*stop)); 997 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 998 } 999 1000 #ifndef PCHARDONE 1001 #define PCHARDONE /* never again */ 1002 static const char *nonchars[] = 1003 { "OUT", "BOL", "EOL", "BOLEOL", "NOTHING", "BOW", "EOW" }; 1004 #define PNONCHAR(c) \ 1005 ((c) - OUT < (sizeof(nonchars)/sizeof(nonchars[0])) \ 1006 ? nonchars[(c) - OUT] : "invalid") 1007 1008 /* 1009 - pchar - make a character printable 1010 * 1011 * Is this identical to regchar() over in debug.c? Well, yes. But a 1012 * duplicate here avoids having a debugging-capable regexec.o tied to 1013 * a matching debug.o, and this is convenient. It all disappears in 1014 * the non-debug compilation anyway, so it doesn't matter much. 1015 */ 1016 static const char * /* -> representation */ 1017 pchar(int ch) 1018 { 1019 static char pbuf[10]; 1020 1021 if (NONCHAR(ch)) { 1022 if (ch - OUT < (sizeof(nonchars)/sizeof(nonchars[0]))) 1023 return nonchars[ch - OUT]; 1024 return "invalid"; 1025 } 1026 if (isprint((unsigned char)ch) || ch == ' ') 1027 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1028 else 1029 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1030 return(pbuf); 1031 } 1032 #endif 1033 #endif 1034 1035 #undef matcher 1036 #undef fast 1037 #undef slow 1038 #undef dissect 1039 #undef backref 1040 #undef step 1041 #undef print 1042 #undef at 1043 #undef match 1044 #undef nope 1045