1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 * 21 * Beware that some of this code is subtly aware of the way operator 22 * precedence is structured in regular expressions. Serious changes in 23 * regular-expression syntax might require a total rethink. 24 */ 25 #include <stdio.h> 26 #ifdef UNIX 27 #ifdef USG 28 #include <string.h> 29 #else 30 #include <strings.h> 31 #endif 32 #else 33 #include <string.h> 34 #endif 35 #include "regexp.h" 36 #include "regmagic.h" 37 #include "proto.h" 38 39 /* 40 * The "internal use only" fields in regexp.h are present to pass info from 41 * compile to execute that permits the execute phase to run lots faster on 42 * simple cases. They are: 43 * 44 * regstart char that must begin a match; '\0' if none obvious 45 * reganch is the match anchored (at beginning-of-line only)? 46 * regmust string (pointer into program) that match must include, or NULL 47 * regmlen length of regmust string 48 * 49 * Regstart and reganch permit very fast decisions on suitable starting points 50 * for a match, cutting down the work a lot. Regmust permits fast rejection 51 * of lines that cannot possibly match. The regmust tests are costly enough 52 * that regcomp() supplies a regmust only if the r.e. contains something 53 * potentially expensive (at present, the only such thing detected is * or + 54 * at the start of the r.e., which can involve a lot of backup). Regmlen is 55 * supplied because the test in regexec() needs it and regcomp() is computing 56 * it anyway. 57 */ 58 59 /* 60 * Structure for regexp "program". This is essentially a linear encoding 61 * of a nondeterministic finite-state machine (aka syntax charts or 62 * "railroad normal form" in parsing technology). Each node is an opcode 63 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 64 * all nodes except BRANCH implement concatenation; a "next" pointer with 65 * a BRANCH on both ends of it is connecting two alternatives. (Here we 66 * have one of the subtle syntax dependencies: an individual BRANCH (as 67 * opposed to a collection of them) is never concatenated with anything 68 * because of operator precedence.) The operand of some types of node is 69 * a literal string; for others, it is a node leading into a sub-FSM. In 70 * particular, the operand of a BRANCH node is the first node of the branch. 71 * (NB this is *not* a tree structure: the tail of the branch connects 72 * to the thing following the set of BRANCHes.) The opcodes are: 73 */ 74 75 /* definition number opnd? meaning */ 76 #define END 0 /* no End of program. */ 77 #define BOL 1 /* no Match "" at beginning of line. */ 78 #define EOL 2 /* no Match "" at end of line. */ 79 #define ANY 3 /* no Match any one character. */ 80 #define ANYOF 4 /* str Match any character in this string. */ 81 #define ANYBUT 5 /* str Match any character not in this string. */ 82 #define BRANCH 6 /* node Match this alternative, or the next... */ 83 #define BACK 7 /* no Match "", "next" ptr points backward. */ 84 #define EXACTLY 8 /* str Match this string. */ 85 #define NOTHING 9 /* no Match empty string. */ 86 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 87 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 88 #define OPEN 20 /* no Mark this point in input as start of #n. */ 89 /* OPEN+1 is number 1, etc. */ 90 #define CLOSE 30 /* no Analogous to OPEN. */ 91 92 /* 93 * Opcode notes: 94 * 95 * BRANCH The set of branches constituting a single choice are hooked 96 * together with their "next" pointers, since precedence prevents 97 * anything being concatenated to any individual branch. The 98 * "next" pointer of the last BRANCH in a choice points to the 99 * thing following the whole choice. This is also where the 100 * final "next" pointer of each individual branch points; each 101 * branch starts with the operand node of a BRANCH node. 102 * 103 * BACK Normal "next" pointers all implicitly point forward; BACK 104 * exists to make loop structures possible. 105 * 106 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 107 * BRANCH structures using BACK. Simple cases (one character 108 * per match) are implemented with STAR and PLUS for speed 109 * and to minimize recursive plunges. 110 * 111 * OPEN,CLOSE ...are numbered at compile time. 112 */ 113 114 /* 115 * A node is one char of opcode followed by two chars of "next" pointer. 116 * "Next" pointers are stored as two 8-bit pieces, high order first. The 117 * value is a positive offset from the opcode of the node containing it. 118 * An operand, if any, simply follows the node. (Note that much of the 119 * code generation knows about this implicit relationship.) 120 * 121 * Using two bytes for the "next" pointer is vast overkill for most things, 122 * but allows patterns to get big without disasters. 123 */ 124 #define OP(p) (*(p)) 125 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 126 #define OPERAND(p) ((p) + 3) 127 128 /* 129 * See regmagic.h for one further detail of program structure. 130 */ 131 132 133 /* 134 * Utility definitions. 135 */ 136 #ifndef CHARBITS 137 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 138 #else 139 #define UCHARAT(p) ((int)*(p)&CHARBITS) 140 #endif 141 142 #define FAIL(m) { regerror(m); return(NULL); } 143 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 144 #define META "^$.[()|?+*\\" 145 146 /* 147 * Flags to be passed up and down. 148 */ 149 #define HASWIDTH 01 /* Known never to match null string. */ 150 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 151 #define SPSTART 04 /* Starts with * or +. */ 152 #define WORST 0 /* Worst case. */ 153 #define STATIC 154 /* 155 * Global work variables for regcomp(). 156 */ 157 158 STATIC char *regparse; /* Input-scan pointer. */ 159 STATIC int regnpar; /* () count. */ 160 STATIC unsigned char regdummy; 161 STATIC unsigned char *regcode; /* Code-emit pointer; ®dummy = don't. */ 162 STATIC long regsize; /* Code size. */ 163 164 /* 165 * Forward declarations for regcomp()'s friends. 166 */ 167 STATIC unsigned char *reg(int paren, int *flagp ); 168 STATIC unsigned char *regbranch(int *flagp ); 169 STATIC unsigned char *regpiece(int *flagp ); 170 STATIC unsigned char *regatom(int *flagp ); 171 STATIC unsigned char *regnode(int op ); 172 STATIC void regc(int b ); 173 STATIC void reginsert(int op, unsigned char *opnd ); 174 STATIC void regtail(unsigned char *p, unsigned char *val ); 175 STATIC void regoptail(unsigned char *p, unsigned char *val ); 176 STATIC int regtry(regexp *prog, unsigned char *string ); 177 STATIC int regmatch(unsigned char *prog ); 178 STATIC int regrepeat(unsigned char *p ); 179 STATIC unsigned char *regnext(unsigned char *p ); 180 STATIC unsigned char *regprop(unsigned char *op ); 181 182 #ifdef STRCSPN 183 STATIC int strcspn(unsigned char *s1, unsigned char *s2 ); 184 #endif 185 186 /* 187 - regcomp - compile a regular expression into internal code 188 * 189 * We can't allocate space until we know how big the compiled form will be, 190 * but we can't compile it (and thus know how big it is) until we've got a 191 * place to put the code. So we cheat: we compile it twice, once with code 192 * generation turned off and size counting turned on, and once "for real". 193 * This also means that we don't allocate space until we are sure that the 194 * thing really will compile successfully, and we never have to move the 195 * code and thus invalidate pointers into it. (Note that it has to be in 196 * one piece because free() must be able to free it all.) 197 * 198 * Beware that the optimization-preparation code in here knows about some 199 * of the structure of the compiled regexp. 200 */ 201 regexp * 202 regcomp(exp) 203 char *exp; 204 { 205 register regexp *r; 206 register unsigned char *scan; 207 register unsigned char *longest; 208 register int len; 209 int flags; 210 211 if (exp == NULL) 212 FAIL("NULL argument"); 213 214 /* First pass: determine size, legality. */ 215 regparse = exp; 216 regnpar = 1; 217 regsize = 0L; 218 regcode = ®dummy; 219 regc(MAGIC); 220 if (reg(0, &flags) == NULL) 221 return(NULL); 222 223 /* Small enough for pointer-storage convention? */ 224 if (regsize >= 32767L) /* Probably could be 65535L. */ 225 FAIL("regexp too big"); 226 227 /* Allocate space. */ 228 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 229 if (r == NULL) 230 FAIL("out of space"); 231 232 /* Second pass: emit code. */ 233 regparse = exp; 234 regnpar = 1; 235 regcode = r->program; 236 regc(MAGIC); 237 if (reg(0, &flags) == NULL) 238 return(NULL); 239 240 /* Dig out information for optimizations. */ 241 r->regstart = '\0'; /* Worst-case defaults. */ 242 r->reganch = 0; 243 r->regmust = NULL; 244 r->regmlen = 0; 245 scan = r->program+1; /* First BRANCH. */ 246 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 247 scan = OPERAND(scan); 248 249 /* Starting-point info. */ 250 if (OP(scan) == EXACTLY) 251 r->regstart = *OPERAND(scan); 252 else if (OP(scan) == BOL) 253 r->reganch++; 254 255 /* 256 * If there's something expensive in the r.e., find the 257 * longest literal string that must appear and make it the 258 * regmust. Resolve ties in favor of later strings, since 259 * the regstart check works with the beginning of the r.e. 260 * and avoiding duplication strengthens checking. Not a 261 * strong reason, but sufficient in the absence of others. 262 */ 263 if (flags&SPSTART) { 264 longest = NULL; 265 len = 0; 266 for (; scan != NULL; scan = regnext(scan)) 267 if (OP(scan) == EXACTLY 268 && strlen((char *)OPERAND(scan)) >= len) { 269 longest = OPERAND(scan); 270 len = strlen((char *)OPERAND(scan)); 271 } 272 r->regmust = longest; 273 r->regmlen = len; 274 } 275 } 276 277 return(r); 278 } 279 280 /* 281 - reg - regular expression, i.e. main body or parenthesized thing 282 * 283 * Caller must absorb opening parenthesis. 284 * 285 * Combining parenthesis handling with the base level of regular expression 286 * is a trifle forced, but the need to tie the tails of the branches to what 287 * follows makes it hard to avoid. 288 */ 289 STATIC unsigned char * 290 reg(paren, flagp) 291 int paren; /* Parenthesized? */ 292 int *flagp; 293 { 294 register unsigned char *ret; 295 register unsigned char *br; 296 register unsigned char *ender; 297 register int parno; 298 int flags; 299 300 *flagp = HASWIDTH; /* Tentatively. */ 301 302 /* Make an OPEN node, if parenthesized. */ 303 if (paren) { 304 if (regnpar >= NSUBEXP) 305 FAIL("too many ()"); 306 parno = regnpar; 307 regnpar++; 308 ret = regnode(OPEN+parno); 309 } else 310 ret = NULL; 311 312 /* Pick up the branches, linking them together. */ 313 br = regbranch(&flags); 314 if (br == NULL) 315 return(NULL); 316 if (ret != NULL) 317 regtail(ret, br); /* OPEN -> first. */ 318 else 319 ret = br; 320 if (!(flags&HASWIDTH)) 321 *flagp &= ~HASWIDTH; 322 *flagp |= flags&SPSTART; 323 while (*regparse == '|') { 324 regparse++; 325 br = regbranch(&flags); 326 if (br == NULL) 327 return(NULL); 328 regtail(ret, br); /* BRANCH -> BRANCH. */ 329 if (!(flags&HASWIDTH)) 330 *flagp &= ~HASWIDTH; 331 *flagp |= flags&SPSTART; 332 } 333 334 /* Make a closing node, and hook it on the end. */ 335 ender = regnode((paren) ? CLOSE+parno : END); 336 regtail(ret, ender); 337 338 /* Hook the tails of the branches to the closing node. */ 339 for (br = ret; br != NULL; br = regnext(br)) 340 regoptail(br, ender); 341 342 /* Check for proper termination. */ 343 if (paren && *regparse++ != ')') { 344 FAIL("unmatched ()"); 345 } else if (!paren && *regparse != '\0') { 346 if (*regparse == ')') { 347 FAIL("unmatched ()"); 348 } else 349 FAIL("junk on end"); /* "Can't happen". */ 350 /* NOTREACHED */ 351 } 352 353 return(ret); 354 } 355 356 /* 357 - regbranch - one alternative of an | operator 358 * 359 * Implements the concatenation operator. 360 */ 361 STATIC unsigned char * 362 regbranch(flagp) 363 int *flagp; 364 { 365 register unsigned char *ret; 366 register unsigned char *chain; 367 register unsigned char *latest; 368 int flags; 369 370 *flagp = WORST; /* Tentatively. */ 371 372 ret = regnode(BRANCH); 373 chain = NULL; 374 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { 375 latest = regpiece(&flags); 376 if (latest == NULL) 377 return(NULL); 378 *flagp |= flags&HASWIDTH; 379 if (chain == NULL) /* First piece. */ 380 *flagp |= flags&SPSTART; 381 else 382 regtail(chain, latest); 383 chain = latest; 384 } 385 if (chain == NULL) /* Loop ran zero times. */ 386 (void) regnode(NOTHING); 387 388 return(ret); 389 } 390 391 /* 392 - regpiece - something followed by possible [*+?] 393 * 394 * Note that the branching code sequences used for ? and the general cases 395 * of * and + are somewhat optimized: they use the same NOTHING node as 396 * both the endmarker for their branch list and the body of the last branch. 397 * It might seem that this node could be dispensed with entirely, but the 398 * endmarker role is not redundant. 399 */ 400 STATIC unsigned char * 401 regpiece(flagp) 402 int *flagp; 403 { 404 register unsigned char *ret; 405 register unsigned char op; 406 register unsigned char *next; 407 int flags; 408 409 ret = regatom(&flags); 410 if (ret == NULL) 411 return(NULL); 412 413 op = *regparse; 414 if (!ISMULT(op)) { 415 *flagp = flags; 416 return(ret); 417 } 418 419 if (!(flags&HASWIDTH) && op != '?') 420 FAIL("*+ operand could be empty"); 421 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 422 423 if (op == '*' && (flags&SIMPLE)) 424 reginsert(STAR, ret); 425 else if (op == '*') { 426 /* Emit x* as (x&|), where & means "self". */ 427 reginsert(BRANCH, ret); /* Either x */ 428 regoptail(ret, regnode(BACK)); /* and loop */ 429 regoptail(ret, ret); /* back */ 430 regtail(ret, regnode(BRANCH)); /* or */ 431 regtail(ret, regnode(NOTHING)); /* null. */ 432 } else if (op == '+' && (flags&SIMPLE)) 433 reginsert(PLUS, ret); 434 else if (op == '+') { 435 /* Emit x+ as x(&|), where & means "self". */ 436 next = regnode(BRANCH); /* Either */ 437 regtail(ret, next); 438 regtail(regnode(BACK), ret); /* loop back */ 439 regtail(next, regnode(BRANCH)); /* or */ 440 regtail(ret, regnode(NOTHING)); /* null. */ 441 } else if (op == '?') { 442 /* Emit x? as (x|) */ 443 reginsert(BRANCH, ret); /* Either x */ 444 regtail(ret, regnode(BRANCH)); /* or */ 445 next = regnode(NOTHING); /* null. */ 446 regtail(ret, next); 447 regoptail(ret, next); 448 } 449 regparse++; 450 if (ISMULT(*regparse)) 451 FAIL("nested *?+"); 452 453 return(ret); 454 } 455 456 /* 457 - regatom - the lowest level 458 * 459 * Optimization: gobbles an entire sequence of ordinary characters so that 460 * it can turn them into a single node, which is smaller to store and 461 * faster to run. Backslashed characters are exceptions, each becoming a 462 * separate node; the code is simpler that way and it's not worth fixing. 463 */ 464 STATIC unsigned char * 465 regatom(flagp) 466 int *flagp; 467 { 468 register unsigned char *ret; 469 int flags; 470 471 *flagp = WORST; /* Tentatively. */ 472 473 switch (*regparse++) { 474 case '^': 475 ret = regnode(BOL); 476 break; 477 case '$': 478 ret = regnode(EOL); 479 break; 480 case '.': 481 ret = regnode(ANY); 482 *flagp |= HASWIDTH|SIMPLE; 483 break; 484 case '[': { 485 register int class; 486 register int classend; 487 488 if (*regparse == '^') { /* Complement of range. */ 489 ret = regnode(ANYBUT); 490 regparse++; 491 } else 492 ret = regnode(ANYOF); 493 if (*regparse == ']' || *regparse == '-') 494 regc(*regparse++); 495 while (*regparse != '\0' && *regparse != ']') { 496 if (*regparse == '-') { 497 regparse++; 498 if (*regparse == ']' || *regparse == '\0') 499 regc('-'); 500 else { 501 class = UCHARAT(regparse-2)+1; 502 classend = UCHARAT(regparse); 503 if (class > classend+1) 504 FAIL("invalid [] range"); 505 for (; class <= classend; class++) 506 regc(class); 507 regparse++; 508 } 509 } else 510 regc(*regparse++); 511 } 512 regc('\0'); 513 if (*regparse != ']') 514 FAIL("unmatched []"); 515 regparse++; 516 *flagp |= HASWIDTH|SIMPLE; 517 } 518 break; 519 case '(': 520 ret = reg(1, &flags); 521 if (ret == NULL) 522 return(NULL); 523 *flagp |= flags&(HASWIDTH|SPSTART); 524 break; 525 case '\0': 526 case '|': 527 case ')': 528 FAIL("internal urp"); /* Supposed to be caught earlier. */ 529 break; 530 case '?': 531 case '+': 532 case '*': 533 FAIL("?+* follows nothing"); 534 break; 535 case '\\': 536 if (*regparse == '\0') 537 FAIL("trailing \\"); 538 ret = regnode(EXACTLY); 539 regc(*regparse++); 540 regc('\0'); 541 *flagp |= HASWIDTH|SIMPLE; 542 break; 543 default: { 544 register int len; 545 register unsigned char ender; 546 547 regparse--; 548 #ifdef STRCSPN 549 len = strcspn(regparse, (unsigned char *)META); 550 #else 551 len = strcspn((char *)regparse, META); 552 #endif 553 if (len <= 0) 554 FAIL("internal disaster"); 555 ender = *(regparse+len); 556 if (len > 1 && ISMULT(ender)) 557 len--; /* Back off clear of ?+* operand. */ 558 *flagp |= HASWIDTH; 559 if (len == 1) 560 *flagp |= SIMPLE; 561 ret = regnode(EXACTLY); 562 while (len > 0) { 563 regc(*regparse++); 564 len--; 565 } 566 regc('\0'); 567 } 568 break; 569 } 570 571 return(ret); 572 } 573 574 /* 575 - regnode - emit a node 576 */ 577 STATIC unsigned char * /* Location. */ 578 regnode(iop) 579 int iop; 580 { 581 register unsigned char *ret; 582 register unsigned char *ptr; 583 unsigned char op; 584 585 op = (unsigned char) iop; 586 ret = regcode; 587 if (ret == ®dummy) { 588 regsize += 3; 589 return(ret); 590 } 591 592 ptr = ret; 593 *ptr++ = op; 594 *ptr++ = '\0'; /* Null "next" pointer. */ 595 *ptr++ = '\0'; 596 regcode = ptr; 597 598 return(ret); 599 } 600 601 /* 602 - regc - emit (if appropriate) a byte of code 603 */ 604 STATIC void 605 regc(ib) 606 int ib; 607 { 608 unsigned char b; 609 610 b = (unsigned char) ib; 611 if (regcode != ®dummy) 612 *regcode++ = b; 613 else 614 regsize++; 615 } 616 617 /* 618 - reginsert - insert an operator in front of already-emitted operand 619 * 620 * Means relocating the operand. 621 */ 622 STATIC void 623 reginsert(iop, opnd) 624 int iop; 625 unsigned char *opnd; 626 { 627 register unsigned char *src; 628 register unsigned char *dst; 629 register unsigned char *place; 630 unsigned char op; 631 632 op = (unsigned char) iop; 633 if (regcode == ®dummy) { 634 regsize += 3; 635 return; 636 } 637 638 src = regcode; 639 regcode += 3; 640 dst = regcode; 641 while (src > opnd) 642 *--dst = *--src; 643 644 place = opnd; /* Op node, where operand used to be. */ 645 *place++ = op; 646 *place++ = '\0'; 647 *place++ = '\0'; 648 } 649 650 /* 651 - regtail - set the next-pointer at the end of a node chain 652 */ 653 STATIC void 654 regtail(p, val) 655 unsigned char *p; 656 unsigned char *val; 657 { 658 register unsigned char *scan; 659 register unsigned char *temp; 660 register int offset; 661 662 if (p == ®dummy) 663 return; 664 665 /* Find last node. */ 666 scan = p; 667 for (;;) { 668 temp = regnext(scan); 669 if (temp == NULL) 670 break; 671 scan = temp; 672 } 673 674 if (OP(scan) == BACK) 675 offset = scan - val; 676 else 677 offset = val - scan; 678 *(scan+1) = (offset>>8)&0377; 679 *(scan+2) = offset&0377; 680 } 681 682 /* 683 - regoptail - regtail on operand of first argument; nop if operandless 684 */ 685 STATIC void 686 regoptail(p, val) 687 unsigned char *p; 688 unsigned char *val; 689 { 690 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 691 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 692 return; 693 regtail(OPERAND(p), val); 694 } 695 696 /* 697 * regexec and friends 698 */ 699 700 /* 701 * Global work variables for regexec(). 702 */ 703 STATIC unsigned char *reginput; /* String-input pointer. */ 704 STATIC unsigned char *regbol; /* Beginning of input, for ^ check. */ 705 STATIC unsigned char **regstartp; /* Pointer to startp array. */ 706 STATIC unsigned char **regendp; /* Ditto for endp. */ 707 708 #ifdef DEBUG 709 int regnarrate = 0; 710 void regdump(); 711 STATIC unsigned char *regprop(); 712 #endif 713 714 /* 715 - regexec - match a regexp against a string 716 */ 717 int 718 regexec(prog, string) 719 register regexp *prog; 720 register unsigned char *string; 721 { 722 register unsigned char *s; 723 #ifndef STDLIB 724 extern char *strchr(); 725 #endif 726 727 /* Be paranoid... */ 728 if (prog == NULL || string == NULL) { 729 regerror("NULL parameter"); 730 return(0); 731 } 732 733 /* Check validity of program. */ 734 if (UCHARAT(prog->program) != MAGIC) { 735 regerror("corrupted program"); 736 return(0); 737 } 738 739 /* If there is a "must appear" string, look for it. */ 740 if (prog->regmust != NULL) { 741 s = string; 742 while ((s = (unsigned char *)strchr((char *)s,prog->regmust[0])) 743 != NULL) { 744 if (strncmp((char *)s, (char *)prog->regmust, 745 prog->regmlen) 746 == 0) 747 break; /* Found it. */ 748 s++; 749 } 750 if (s == NULL) /* Not present. */ 751 return(0); 752 } 753 754 /* Mark beginning of line for ^ . */ 755 regbol = string; 756 757 /* Simplest case: anchored match need be tried only once. */ 758 if (prog->reganch) 759 return(regtry(prog, string)); 760 761 /* Messy cases: unanchored match. */ 762 s = string; 763 if (prog->regstart != '\0') 764 /* We know what char it must start with. */ 765 while ((s = (unsigned char *)strchr((char *)s, prog->regstart)) 766 != NULL) { 767 if (regtry(prog, s)) 768 return(1); 769 s++; 770 } 771 else 772 /* We don't -- general case. */ 773 do { 774 if (regtry(prog, s)) 775 return(1); 776 } while (*s++ != '\0'); 777 778 /* Failure. */ 779 return(0); 780 } 781 782 /* 783 - regtry - try match at specific point 784 */ 785 STATIC int /* 0 failure, 1 success */ 786 regtry(prog, string) 787 regexp *prog; 788 unsigned char *string; 789 { 790 register int i; 791 register unsigned char **sp; 792 register unsigned char **ep; 793 794 reginput = string; 795 regstartp = prog->startp; 796 regendp = prog->endp; 797 798 sp = prog->startp; 799 ep = prog->endp; 800 for (i = NSUBEXP; i > 0; i--) { 801 *sp++ = NULL; 802 *ep++ = NULL; 803 } 804 if (regmatch(prog->program + 1)) { 805 prog->startp[0] = string; 806 prog->endp[0] = reginput; 807 return(1); 808 } else 809 return(0); 810 } 811 812 /* 813 - regmatch - main matching routine 814 * 815 * Conceptually the strategy is simple: check to see whether the current 816 * node matches, call self recursively to see whether the rest matches, 817 * and then act accordingly. In practice we make some effort to avoid 818 * recursion, in particular by going through "ordinary" nodes (that don't 819 * need to know whether the rest of the match failed) by a loop instead of 820 * by recursion. 821 */ 822 STATIC int /* 0 failure, 1 success */ 823 regmatch(prog) 824 unsigned char *prog; 825 { 826 register unsigned char *scan; /* Current node. */ 827 unsigned char *next; /* Next node. */ 828 #ifndef STDLIB 829 extern char *strchr(); 830 #endif 831 832 scan = prog; 833 #ifdef DEBUG 834 if (scan != NULL && regnarrate) 835 fprintf(stderr, "%s(\n", regprop(scan)); 836 #endif 837 while (scan != NULL) { 838 #ifdef DEBUG 839 if (regnarrate) 840 fprintf(stderr, "%s...\n", regprop(scan)); 841 #endif 842 next = regnext(scan); 843 844 switch (OP(scan)) { 845 case BOL: 846 if (reginput != regbol) 847 return(0); 848 break; 849 case EOL: 850 if (*reginput != '\0') 851 return(0); 852 break; 853 case ANY: 854 if (*reginput == '\0') 855 return(0); 856 reginput++; 857 break; 858 case EXACTLY: { 859 register int len; 860 register unsigned char *opnd; 861 862 opnd = OPERAND(scan); 863 /* Inline the first character, for speed. */ 864 if (*opnd != *reginput) 865 return(0); 866 len = strlen((char *)opnd); 867 if (len > 1 && strncmp((char *)opnd, 868 (char *)reginput, len) 869 != 0) 870 return(0); 871 reginput += len; 872 } 873 break; 874 case ANYOF: 875 if (*reginput == '\0' 876 || strchr((char *)OPERAND(scan), *reginput) == NULL) 877 return(0); 878 reginput++; 879 break; 880 case ANYBUT: 881 if (*reginput == '\0' 882 || strchr((char *)OPERAND(scan), *reginput) != NULL) 883 return(0); 884 reginput++; 885 break; 886 case NOTHING: 887 break; 888 case BACK: 889 break; 890 case OPEN+1: 891 case OPEN+2: 892 case OPEN+3: 893 case OPEN+4: 894 case OPEN+5: 895 case OPEN+6: 896 case OPEN+7: 897 case OPEN+8: 898 case OPEN+9: { 899 register int no; 900 register unsigned char *save; 901 902 no = OP(scan) - OPEN; 903 save = reginput; 904 905 if (regmatch(next)) { 906 /* 907 * Don't set startp if some later 908 * invocation of the same parentheses 909 * already has. 910 */ 911 if (regstartp[no] == NULL) 912 regstartp[no] = save; 913 return(1); 914 } else 915 return(0); 916 } 917 break; 918 case CLOSE+1: 919 case CLOSE+2: 920 case CLOSE+3: 921 case CLOSE+4: 922 case CLOSE+5: 923 case CLOSE+6: 924 case CLOSE+7: 925 case CLOSE+8: 926 case CLOSE+9: { 927 register int no; 928 register unsigned char *save; 929 930 no = OP(scan) - CLOSE; 931 save = reginput; 932 933 if (regmatch(next)) { 934 /* 935 * Don't set endp if some later 936 * invocation of the same parentheses 937 * already has. 938 */ 939 if (regendp[no] == NULL) 940 regendp[no] = save; 941 return(1); 942 } else 943 return(0); 944 } 945 break; 946 case BRANCH: { 947 register unsigned char *save; 948 949 if (OP(next) != BRANCH) /* No choice. */ 950 next = OPERAND(scan); /* Avoid recursion. */ 951 else { 952 do { 953 save = reginput; 954 if (regmatch(OPERAND(scan))) 955 return(1); 956 reginput = save; 957 scan = regnext(scan); 958 } while (scan != NULL && OP(scan) == BRANCH); 959 return(0); 960 /* NOTREACHED */ 961 } 962 } 963 break; 964 case STAR: 965 case PLUS: { 966 register unsigned char nextch; 967 register int no; 968 register unsigned char *save; 969 register int min; 970 971 /* 972 * Lookahead to avoid useless match attempts 973 * when we know what character comes next. 974 */ 975 nextch = '\0'; 976 if (OP(next) == EXACTLY) 977 nextch = *OPERAND(next); 978 min = (OP(scan) == STAR) ? 0 : 1; 979 save = reginput; 980 no = regrepeat(OPERAND(scan)); 981 while (no >= min) { 982 /* If it could work, try it. */ 983 if (nextch == '\0' || *reginput == nextch) 984 if (regmatch(next)) 985 return(1); 986 /* Couldn't or didn't -- back up. */ 987 no--; 988 reginput = save + no; 989 } 990 return(0); 991 } 992 break; 993 case END: 994 return(1); /* Success! */ 995 break; 996 default: 997 regerror("memory corruption"); 998 return(0); 999 break; 1000 } 1001 1002 scan = next; 1003 } 1004 1005 /* 1006 * We get here only if there's trouble -- normally "case END" is 1007 * the terminating point. 1008 */ 1009 regerror("corrupted pointers"); 1010 return(0); 1011 } 1012 1013 /* 1014 - regrepeat - repeatedly match something simple, report how many 1015 */ 1016 STATIC int 1017 regrepeat(p) 1018 unsigned char *p; 1019 { 1020 register int count = 0; 1021 register unsigned char *scan; 1022 register unsigned char *opnd; 1023 1024 scan = reginput; 1025 opnd = OPERAND(p); 1026 switch (OP(p)) { 1027 case ANY: 1028 count = strlen((char *)scan); 1029 scan += count; 1030 break; 1031 case EXACTLY: 1032 while (*opnd == *scan) { 1033 count++; 1034 scan++; 1035 } 1036 break; 1037 case ANYOF: 1038 while (*scan != '\0' && strchr((char *)opnd, *scan) != NULL) { 1039 count++; 1040 scan++; 1041 } 1042 break; 1043 case ANYBUT: 1044 while (*scan != '\0' && strchr((char *)opnd, *scan) == NULL) { 1045 count++; 1046 scan++; 1047 } 1048 break; 1049 default: /* Oh dear. Called inappropriately. */ 1050 regerror("internal foulup"); 1051 count = 0; /* Best compromise. */ 1052 break; 1053 } 1054 reginput = scan; 1055 1056 return(count); 1057 } 1058 1059 /* 1060 - regnext - dig the "next" pointer out of a node 1061 */ 1062 STATIC unsigned char * 1063 regnext(p) 1064 register unsigned char *p; 1065 { 1066 register int offset; 1067 1068 if (p == ®dummy) 1069 return(NULL); 1070 1071 offset = NEXT(p); 1072 if (offset == 0) 1073 return(NULL); 1074 1075 if (OP(p) == BACK) 1076 return(p-offset); 1077 else 1078 return(p+offset); 1079 } 1080 1081 #ifdef DEBUG 1082 1083 STATIC unsigned char *regprop(); 1084 1085 /* 1086 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1087 */ 1088 void 1089 regdump(r) 1090 regexp *r; 1091 { 1092 register unsigned char *s; 1093 register unsigned char op = EXACTLY; /* Arbitrary non-END op. */ 1094 register unsigned char *next; 1095 extern char *strchr(); 1096 1097 1098 s = r->program + 1; 1099 while (op != END) { /* While that wasn't END last time... */ 1100 op = OP(s); 1101 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1102 next = regnext(s); 1103 if (next == NULL) /* Next ptr. */ 1104 printf("(0)"); 1105 else 1106 printf("(%d)", (s-r->program)+(next-s)); 1107 s += 3; 1108 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1109 /* Literal string, where present. */ 1110 while (*s != '\0') { 1111 putchar(*s); 1112 s++; 1113 } 1114 s++; 1115 } 1116 putchar('\n'); 1117 } 1118 1119 /* Header fields of interest. */ 1120 if (r->regstart != '\0') 1121 printf("start `%c' ", r->regstart); 1122 if (r->reganch) 1123 printf("anchored "); 1124 if (r->regmust != NULL) 1125 printf("must have \"%s\"", r->regmust); 1126 printf("\n"); 1127 } 1128 1129 /* 1130 - regprop - printable representation of opcode 1131 */ 1132 STATIC unsigned char * 1133 regprop(op) 1134 unsigned char *op; 1135 { 1136 register unsigned char *p; 1137 STATIC unsigned char buf[50]; 1138 1139 (void) strcpy(buf, ":"); 1140 1141 switch (OP(op)) { 1142 case BOL: 1143 p = "BOL"; 1144 break; 1145 case EOL: 1146 p = "EOL"; 1147 break; 1148 case ANY: 1149 p = "ANY"; 1150 break; 1151 case ANYOF: 1152 p = "ANYOF"; 1153 break; 1154 case ANYBUT: 1155 p = "ANYBUT"; 1156 break; 1157 case BRANCH: 1158 p = "BRANCH"; 1159 break; 1160 case EXACTLY: 1161 p = "EXACTLY"; 1162 break; 1163 case NOTHING: 1164 p = "NOTHING"; 1165 break; 1166 case BACK: 1167 p = "BACK"; 1168 break; 1169 case END: 1170 p = "END"; 1171 break; 1172 case OPEN+1: 1173 case OPEN+2: 1174 case OPEN+3: 1175 case OPEN+4: 1176 case OPEN+5: 1177 case OPEN+6: 1178 case OPEN+7: 1179 case OPEN+8: 1180 case OPEN+9: 1181 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1182 p = NULL; 1183 break; 1184 case CLOSE+1: 1185 case CLOSE+2: 1186 case CLOSE+3: 1187 case CLOSE+4: 1188 case CLOSE+5: 1189 case CLOSE+6: 1190 case CLOSE+7: 1191 case CLOSE+8: 1192 case CLOSE+9: 1193 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1194 p = NULL; 1195 break; 1196 case STAR: 1197 p = "STAR"; 1198 break; 1199 case PLUS: 1200 p = "PLUS"; 1201 break; 1202 default: 1203 regerror("corrupted opcode"); 1204 break; 1205 } 1206 if (p != NULL) 1207 (void) strcat(buf, p); 1208 return(buf); 1209 } 1210 #endif 1211 1212 /* 1213 * The following is provided for those people who do not have strcspn() in 1214 * their C libraries. They should get off their butts and do something 1215 * about it; at least one public-domain implementation of those (highly 1216 * useful) string routines has been published on Usenet. 1217 */ 1218 #ifdef STRCSPN 1219 /* 1220 * strcspn - find length of initial segment of s1 consisting entirely 1221 * of characters not from s2 1222 */ 1223 1224 STATIC int 1225 strcspn(s1, s2) 1226 unsigned char *s1; 1227 unsigned char *s2; 1228 { 1229 register unsigned char *scan1; 1230 register unsigned char *scan2; 1231 register int count; 1232 1233 count = 0; 1234 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1235 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1236 if (*scan1 == *scan2++) 1237 return(count); 1238 count++; 1239 } 1240 return(count); 1241 } 1242 #endif 1243