1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22 *** to assist in implementing egrep. 23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25 *** as in BSD grep and ex. 26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30 * 31 * Beware that some of this code is subtly aware of the way operator 32 * precedence is structured in regular expressions. Serious changes in 33 * regular-expression syntax might require a total rethink. 34 */ 35 #include <stdio.h> 36 #include <ctype.h> 37 #include <regexp.h> 38 #include "regmagic.h" 39 40 /* 41 * The "internal use only" fields in regexp.h are present to pass info from 42 * compile to execute that permits the execute phase to run lots faster on 43 * simple cases. They are: 44 * 45 * regstart char that must begin a match; '\0' if none obvious 46 * reganch is the match anchored (at beginning-of-line only)? 47 * regmust string (pointer into program) that match must include, or NULL 48 * regmlen length of regmust string 49 * 50 * Regstart and reganch permit very fast decisions on suitable starting points 51 * for a match, cutting down the work a lot. Regmust permits fast rejection 52 * of lines that cannot possibly match. The regmust tests are costly enough 53 * that regcomp() supplies a regmust only if the r.e. contains something 54 * potentially expensive (at present, the only such thing detected is * or + 55 * at the start of the r.e., which can involve a lot of backup). Regmlen is 56 * supplied because the test in regexec() needs it and regcomp() is computing 57 * it anyway. 58 */ 59 60 /* 61 * Structure for regexp "program". This is essentially a linear encoding 62 * of a nondeterministic finite-state machine (aka syntax charts or 63 * "railroad normal form" in parsing technology). Each node is an opcode 64 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 65 * all nodes except BRANCH implement concatenation; a "next" pointer with 66 * a BRANCH on both ends of it is connecting two alternatives. (Here we 67 * have one of the subtle syntax dependencies: an individual BRANCH (as 68 * opposed to a collection of them) is never concatenated with anything 69 * because of operator precedence.) The operand of some types of node is 70 * a literal string; for others, it is a node leading into a sub-FSM. In 71 * particular, the operand of a BRANCH node is the first node of the branch. 72 * (NB this is *not* a tree structure: the tail of the branch connects 73 * to the thing following the set of BRANCHes.) The opcodes are: 74 */ 75 76 /* definition number opnd? meaning */ 77 #define END 0 /* no End of program. */ 78 #define BOL 1 /* no Match "" at beginning of line. */ 79 #define EOL 2 /* no Match "" at end of line. */ 80 #define ANY 3 /* no Match any one character. */ 81 #define ANYOF 4 /* str Match any character in this string. */ 82 #define ANYBUT 5 /* str Match any character not in this string. */ 83 #define BRANCH 6 /* node Match this alternative, or the next... */ 84 #define BACK 7 /* no Match "", "next" ptr points backward. */ 85 #define EXACTLY 8 /* str Match this string. */ 86 #define NOTHING 9 /* no Match empty string. */ 87 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 88 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 89 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 90 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 91 #define OPEN 20 /* no Mark this point in input as start of #n. */ 92 /* OPEN+1 is number 1, etc. */ 93 #define CLOSE 30 /* no Analogous to OPEN. */ 94 95 /* 96 * Opcode notes: 97 * 98 * BRANCH The set of branches constituting a single choice are hooked 99 * together with their "next" pointers, since precedence prevents 100 * anything being concatenated to any individual branch. The 101 * "next" pointer of the last BRANCH in a choice points to the 102 * thing following the whole choice. This is also where the 103 * final "next" pointer of each individual branch points; each 104 * branch starts with the operand node of a BRANCH node. 105 * 106 * BACK Normal "next" pointers all implicitly point forward; BACK 107 * exists to make loop structures possible. 108 * 109 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 110 * BRANCH structures using BACK. Simple cases (one character 111 * per match) are implemented with STAR and PLUS for speed 112 * and to minimize recursive plunges. 113 * 114 * OPEN,CLOSE ...are numbered at compile time. 115 */ 116 117 /* 118 * A node is one char of opcode followed by two chars of "next" pointer. 119 * "Next" pointers are stored as two 8-bit pieces, high order first. The 120 * value is a positive offset from the opcode of the node containing it. 121 * An operand, if any, simply follows the node. (Note that much of the 122 * code generation knows about this implicit relationship.) 123 * 124 * Using two bytes for the "next" pointer is vast overkill for most things, 125 * but allows patterns to get big without disasters. 126 */ 127 #define OP(p) (*(p)) 128 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 129 #define OPERAND(p) ((p) + 3) 130 131 /* 132 * See regmagic.h for one further detail of program structure. 133 */ 134 135 136 /* 137 * Utility definitions. 138 */ 139 #ifndef CHARBITS 140 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 141 #else 142 #define UCHARAT(p) ((int)*(p)&CHARBITS) 143 #endif 144 145 #define FAIL(m) { regerror(m); return(NULL); } 146 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 147 148 /* 149 * Flags to be passed up and down. 150 */ 151 #define HASWIDTH 01 /* Known never to match null string. */ 152 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 153 #define SPSTART 04 /* Starts with * or +. */ 154 #define WORST 0 /* Worst case. */ 155 156 /* 157 * Global work variables for regcomp(). 158 */ 159 static char *regparse; /* Input-scan pointer. */ 160 static int regnpar; /* () count. */ 161 static char regdummy; 162 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 163 static long regsize; /* Code size. */ 164 165 /* 166 * Forward declarations for regcomp()'s friends. 167 */ 168 #ifndef STATIC 169 #define STATIC static 170 #endif 171 STATIC char *reg(); 172 STATIC char *regbranch(); 173 STATIC char *regpiece(); 174 STATIC char *regatom(); 175 STATIC char *regnode(); 176 STATIC char *regnext(); 177 STATIC void regc(); 178 STATIC void reginsert(); 179 STATIC void regtail(); 180 STATIC void regoptail(); 181 #ifdef STRCSPN 182 STATIC int strcspn(); 183 #endif 184 185 /* 186 - regcomp - compile a regular expression into internal code 187 * 188 * We can't allocate space until we know how big the compiled form will be, 189 * but we can't compile it (and thus know how big it is) until we've got a 190 * place to put the code. So we cheat: we compile it twice, once with code 191 * generation turned off and size counting turned on, and once "for real". 192 * This also means that we don't allocate space until we are sure that the 193 * thing really will compile successfully, and we never have to move the 194 * code and thus invalidate pointers into it. (Note that it has to be in 195 * one piece because free() must be able to free it all.) 196 * 197 * Beware that the optimization-preparation code in here knows about some 198 * of the structure of the compiled regexp. 199 */ 200 regexp * 201 regcomp(exp) 202 char *exp; 203 { 204 register regexp *r; 205 register char *scan; 206 register char *longest; 207 register int len; 208 int flags; 209 extern char *malloc(); 210 211 if (exp == NULL) 212 FAIL("NULL argument"); 213 214 /* First pass: determine size, legality. */ 215 if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ 216 regparse = exp; 217 regnpar = 1; 218 regsize = 0L; 219 regcode = ®dummy; 220 regc(MAGIC); 221 if (reg(0, &flags) == NULL) 222 return(NULL); 223 224 /* Small enough for pointer-storage convention? */ 225 if (regsize >= 32767L) /* Probably could be 65535L. */ 226 FAIL("regexp too big"); 227 228 /* Allocate space. */ 229 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 230 if (r == NULL) 231 FAIL("out of space"); 232 233 /* Second pass: emit code. */ 234 regparse = exp; 235 regnpar = 1; 236 regcode = r->program; 237 regc(MAGIC); 238 if (reg(0, &flags) == NULL) 239 return(NULL); 240 241 /* Dig out information for optimizations. */ 242 r->regstart = '\0'; /* Worst-case defaults. */ 243 r->reganch = 0; 244 r->regmust = NULL; 245 r->regmlen = 0; 246 scan = r->program+1; /* First BRANCH. */ 247 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 248 scan = OPERAND(scan); 249 250 /* Starting-point info. */ 251 if (OP(scan) == EXACTLY) 252 r->regstart = *OPERAND(scan); 253 else if (OP(scan) == BOL) 254 r->reganch++; 255 256 /* 257 * If there's something expensive in the r.e., find the 258 * longest literal string that must appear and make it the 259 * regmust. Resolve ties in favor of later strings, since 260 * the regstart check works with the beginning of the r.e. 261 * and avoiding duplication strengthens checking. Not a 262 * strong reason, but sufficient in the absence of others. 263 */ 264 if (flags&SPSTART) { 265 longest = NULL; 266 len = 0; 267 for (; scan != NULL; scan = regnext(scan)) 268 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 269 longest = OPERAND(scan); 270 len = strlen(OPERAND(scan)); 271 } 272 r->regmust = longest; 273 r->regmlen = len; 274 } 275 } 276 277 return(r); 278 } 279 280 /* 281 - reg - regular expression, i.e. main body or parenthesized thing 282 * 283 * Caller must absorb opening parenthesis. 284 * 285 * Combining parenthesis handling with the base level of regular expression 286 * is a trifle forced, but the need to tie the tails of the branches to what 287 * follows makes it hard to avoid. 288 */ 289 static char * 290 reg(paren, flagp) 291 int paren; /* Parenthesized? */ 292 int *flagp; 293 { 294 register char *ret; 295 register char *br; 296 register char *ender; 297 register int parno; 298 int flags; 299 300 *flagp = HASWIDTH; /* Tentatively. */ 301 302 /* Make an OPEN node, if parenthesized. */ 303 if (paren) { 304 if (regnpar >= NSUBEXP) 305 FAIL("too many ()"); 306 parno = regnpar; 307 regnpar++; 308 ret = regnode(OPEN+parno); 309 } else 310 ret = NULL; 311 312 /* Pick up the branches, linking them together. */ 313 br = regbranch(&flags); 314 if (br == NULL) 315 return(NULL); 316 if (ret != NULL) 317 regtail(ret, br); /* OPEN -> first. */ 318 else 319 ret = br; 320 if (!(flags&HASWIDTH)) 321 *flagp &= ~HASWIDTH; 322 *flagp |= flags&SPSTART; 323 while (*regparse == '|' || *regparse == '\n') { 324 regparse++; 325 br = regbranch(&flags); 326 if (br == NULL) 327 return(NULL); 328 regtail(ret, br); /* BRANCH -> BRANCH. */ 329 if (!(flags&HASWIDTH)) 330 *flagp &= ~HASWIDTH; 331 *flagp |= flags&SPSTART; 332 } 333 334 /* Make a closing node, and hook it on the end. */ 335 ender = regnode((paren) ? CLOSE+parno : END); 336 regtail(ret, ender); 337 338 /* Hook the tails of the branches to the closing node. */ 339 for (br = ret; br != NULL; br = regnext(br)) 340 regoptail(br, ender); 341 342 /* Check for proper termination. */ 343 if (paren && *regparse++ != ')') { 344 FAIL("unmatched ()"); 345 } else if (!paren && *regparse != '\0') { 346 if (*regparse == ')') { 347 FAIL("unmatched ()"); 348 } else 349 FAIL("junk on end"); /* "Can't happen". */ 350 /* NOTREACHED */ 351 } 352 353 return(ret); 354 } 355 356 /* 357 - regbranch - one alternative of an | operator 358 * 359 * Implements the concatenation operator. 360 */ 361 static char * 362 regbranch(flagp) 363 int *flagp; 364 { 365 register char *ret; 366 register char *chain; 367 register char *latest; 368 int flags; 369 370 *flagp = WORST; /* Tentatively. */ 371 372 ret = regnode(BRANCH); 373 chain = NULL; 374 while (*regparse != '\0' && *regparse != ')' && 375 *regparse != '\n' && *regparse != '|') { 376 latest = regpiece(&flags); 377 if (latest == NULL) 378 return(NULL); 379 *flagp |= flags&HASWIDTH; 380 if (chain == NULL) /* First piece. */ 381 *flagp |= flags&SPSTART; 382 else 383 regtail(chain, latest); 384 chain = latest; 385 } 386 if (chain == NULL) /* Loop ran zero times. */ 387 (void) regnode(NOTHING); 388 389 return(ret); 390 } 391 392 /* 393 - regpiece - something followed by possible [*+?] 394 * 395 * Note that the branching code sequences used for ? and the general cases 396 * of * and + are somewhat optimized: they use the same NOTHING node as 397 * both the endmarker for their branch list and the body of the last branch. 398 * It might seem that this node could be dispensed with entirely, but the 399 * endmarker role is not redundant. 400 */ 401 static char * 402 regpiece(flagp) 403 int *flagp; 404 { 405 register char *ret; 406 register char op; 407 register char *next; 408 int flags; 409 410 ret = regatom(&flags); 411 if (ret == NULL) 412 return(NULL); 413 414 op = *regparse; 415 if (!ISMULT(op)) { 416 *flagp = flags; 417 return(ret); 418 } 419 420 if (!(flags&HASWIDTH) && op != '?') 421 FAIL("*+ operand could be empty"); 422 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 423 424 if (op == '*' && (flags&SIMPLE)) 425 reginsert(STAR, ret); 426 else if (op == '*') { 427 /* Emit x* as (x&|), where & means "self". */ 428 reginsert(BRANCH, ret); /* Either x */ 429 regoptail(ret, regnode(BACK)); /* and loop */ 430 regoptail(ret, ret); /* back */ 431 regtail(ret, regnode(BRANCH)); /* or */ 432 regtail(ret, regnode(NOTHING)); /* null. */ 433 } else if (op == '+' && (flags&SIMPLE)) 434 reginsert(PLUS, ret); 435 else if (op == '+') { 436 /* Emit x+ as x(&|), where & means "self". */ 437 next = regnode(BRANCH); /* Either */ 438 regtail(ret, next); 439 regtail(regnode(BACK), ret); /* loop back */ 440 regtail(next, regnode(BRANCH)); /* or */ 441 regtail(ret, regnode(NOTHING)); /* null. */ 442 } else if (op == '?') { 443 /* Emit x? as (x|) */ 444 reginsert(BRANCH, ret); /* Either x */ 445 regtail(ret, regnode(BRANCH)); /* or */ 446 next = regnode(NOTHING); /* null. */ 447 regtail(ret, next); 448 regoptail(ret, next); 449 } 450 regparse++; 451 if (ISMULT(*regparse)) 452 FAIL("nested *?+"); 453 454 return(ret); 455 } 456 457 /* 458 - regatom - the lowest level 459 * 460 * Optimization: gobbles an entire sequence of ordinary characters so that 461 * it can turn them into a single node, which is smaller to store and 462 * faster to run. Backslashed characters are exceptions, each becoming a 463 * separate node; the code is simpler that way and it's not worth fixing. 464 */ 465 static char * 466 regatom(flagp) 467 int *flagp; 468 { 469 register char *ret; 470 int flags; 471 472 *flagp = WORST; /* Tentatively. */ 473 474 switch (*regparse++) { 475 /* FIXME: these chars only have meaning at beg/end of pat? */ 476 case '^': 477 ret = regnode(BOL); 478 break; 479 case '$': 480 ret = regnode(EOL); 481 break; 482 case '.': 483 ret = regnode(ANY); 484 *flagp |= HASWIDTH|SIMPLE; 485 break; 486 case '[': { 487 register int class; 488 register int classend; 489 490 if (*regparse == '^') { /* Complement of range. */ 491 ret = regnode(ANYBUT); 492 regparse++; 493 } else 494 ret = regnode(ANYOF); 495 if (*regparse == ']' || *regparse == '-') 496 regc(*regparse++); 497 while (*regparse != '\0' && *regparse != ']') { 498 if (*regparse == '-') { 499 regparse++; 500 if (*regparse == ']' || *regparse == '\0') 501 regc('-'); 502 else { 503 class = UCHARAT(regparse-2)+1; 504 classend = UCHARAT(regparse); 505 if (class > classend+1) 506 FAIL("invalid [] range"); 507 for (; class <= classend; class++) 508 regc(class); 509 regparse++; 510 } 511 } else 512 regc(*regparse++); 513 } 514 regc('\0'); 515 if (*regparse != ']') 516 FAIL("unmatched []"); 517 regparse++; 518 *flagp |= HASWIDTH|SIMPLE; 519 } 520 break; 521 case '(': 522 ret = reg(1, &flags); 523 if (ret == NULL) 524 return(NULL); 525 *flagp |= flags&(HASWIDTH|SPSTART); 526 break; 527 case '\0': 528 case '|': 529 case '\n': 530 case ')': 531 FAIL("internal urp"); /* Supposed to be caught earlier. */ 532 break; 533 case '?': 534 case '+': 535 case '*': 536 FAIL("?+* follows nothing"); 537 break; 538 case '\\': 539 switch (*regparse++) { 540 case '\0': 541 FAIL("trailing \\"); 542 break; 543 case '<': 544 ret = regnode(WORDA); 545 break; 546 case '>': 547 ret = regnode(WORDZ); 548 break; 549 /* FIXME: Someday handle \1, \2, ... */ 550 default: 551 /* Handle general quoted chars in exact-match routine */ 552 goto de_fault; 553 } 554 break; 555 de_fault: 556 default: 557 /* 558 * Encode a string of characters to be matched exactly. 559 * 560 * This is a bit tricky due to quoted chars and due to 561 * '*', '+', and '?' taking the SINGLE char previous 562 * as their operand. 563 * 564 * On entry, the char at regparse[-1] is going to go 565 * into the string, no matter what it is. (It could be 566 * following a \ if we are entered from the '\' case.) 567 * 568 * Basic idea is to pick up a good char in ch and 569 * examine the next char. If it's *+? then we twiddle. 570 * If it's \ then we frozzle. If it's other magic char 571 * we push ch and terminate the string. If none of the 572 * above, we push ch on the string and go around again. 573 * 574 * regprev is used to remember where "the current char" 575 * starts in the string, if due to a *+? we need to back 576 * up and put the current char in a separate, 1-char, string. 577 * When regprev is NULL, ch is the only char in the 578 * string; this is used in *+? handling, and in setting 579 * flags |= SIMPLE at the end. 580 */ 581 { 582 char *regprev; 583 register char ch; 584 585 regparse--; /* Look at cur char */ 586 ret = regnode(EXACTLY); 587 for ( regprev = 0 ; ; ) { 588 ch = *regparse++; /* Get current char */ 589 switch (*regparse) { /* look at next one */ 590 591 default: 592 regc(ch); /* Add cur to string */ 593 break; 594 595 case '.': case '[': case '(': 596 case ')': case '|': case '\n': 597 case '$': case '^': 598 case '\0': 599 /* FIXME, $ and ^ should not always be magic */ 600 magic: 601 regc(ch); /* dump cur char */ 602 goto done; /* and we are done */ 603 604 case '?': case '+': case '*': 605 if (!regprev) /* If just ch in str, */ 606 goto magic; /* use it */ 607 /* End mult-char string one early */ 608 regparse = regprev; /* Back up parse */ 609 goto done; 610 611 case '\\': 612 regc(ch); /* Cur char OK */ 613 switch (regparse[1]){ /* Look after \ */ 614 case '\0': 615 case '<': 616 case '>': 617 /* FIXME: Someday handle \1, \2, ... */ 618 goto done; /* Not quoted */ 619 default: 620 /* Backup point is \, scan * point is after it. */ 621 regprev = regparse; 622 regparse++; 623 continue; /* NOT break; */ 624 } 625 } 626 regprev = regparse; /* Set backup point */ 627 } 628 done: 629 regc('\0'); 630 *flagp |= HASWIDTH; 631 if (!regprev) /* One char? */ 632 *flagp |= SIMPLE; 633 } 634 break; 635 } 636 637 return(ret); 638 } 639 640 /* 641 - regnode - emit a node 642 */ 643 static char * /* Location. */ 644 regnode(op) 645 char op; 646 { 647 register char *ret; 648 register char *ptr; 649 650 ret = regcode; 651 if (ret == ®dummy) { 652 regsize += 3; 653 return(ret); 654 } 655 656 ptr = ret; 657 *ptr++ = op; 658 *ptr++ = '\0'; /* Null "next" pointer. */ 659 *ptr++ = '\0'; 660 regcode = ptr; 661 662 return(ret); 663 } 664 665 /* 666 - regc - emit (if appropriate) a byte of code 667 */ 668 static void 669 regc(b) 670 char b; 671 { 672 if (regcode != ®dummy) 673 *regcode++ = b; 674 else 675 regsize++; 676 } 677 678 /* 679 - reginsert - insert an operator in front of already-emitted operand 680 * 681 * Means relocating the operand. 682 */ 683 static void 684 reginsert(op, opnd) 685 char op; 686 char *opnd; 687 { 688 register char *src; 689 register char *dst; 690 register char *place; 691 692 if (regcode == ®dummy) { 693 regsize += 3; 694 return; 695 } 696 697 src = regcode; 698 regcode += 3; 699 dst = regcode; 700 while (src > opnd) 701 *--dst = *--src; 702 703 place = opnd; /* Op node, where operand used to be. */ 704 *place++ = op; 705 *place++ = '\0'; 706 *place++ = '\0'; 707 } 708 709 /* 710 - regtail - set the next-pointer at the end of a node chain 711 */ 712 static void 713 regtail(p, val) 714 char *p; 715 char *val; 716 { 717 register char *scan; 718 register char *temp; 719 register int offset; 720 721 if (p == ®dummy) 722 return; 723 724 /* Find last node. */ 725 scan = p; 726 for (;;) { 727 temp = regnext(scan); 728 if (temp == NULL) 729 break; 730 scan = temp; 731 } 732 733 if (OP(scan) == BACK) 734 offset = scan - val; 735 else 736 offset = val - scan; 737 *(scan+1) = (offset>>8)&0377; 738 *(scan+2) = offset&0377; 739 } 740 741 /* 742 - regoptail - regtail on operand of first argument; nop if operandless 743 */ 744 static void 745 regoptail(p, val) 746 char *p; 747 char *val; 748 { 749 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 750 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 751 return; 752 regtail(OPERAND(p), val); 753 } 754 755 /* 756 * regexec and friends 757 */ 758 759 /* 760 * Global work variables for regexec(). 761 */ 762 static char *reginput; /* String-input pointer. */ 763 static char *regbol; /* Beginning of input, for ^ check. */ 764 static char **regstartp; /* Pointer to startp array. */ 765 static char **regendp; /* Ditto for endp. */ 766 767 /* 768 * Forwards. 769 */ 770 STATIC int regtry(); 771 STATIC int regmatch(); 772 STATIC int regrepeat(); 773 774 #ifdef DEBUG 775 int regnarrate = 0; 776 void regdump(); 777 STATIC char *regprop(); 778 #endif 779 780 /* 781 - regexec - match a regexp against a string 782 */ 783 int 784 regexec(prog, string) 785 register regexp *prog; 786 register char *string; 787 { 788 register char *s; 789 extern char *strchr(); 790 791 /* Be paranoid... */ 792 if (prog == NULL || string == NULL) { 793 regerror("NULL parameter"); 794 return(0); 795 } 796 797 /* Check validity of program. */ 798 if (UCHARAT(prog->program) != MAGIC) { 799 regerror("corrupted program"); 800 return(0); 801 } 802 803 /* If there is a "must appear" string, look for it. */ 804 if (prog->regmust != NULL) { 805 s = string; 806 while ((s = strchr(s, prog->regmust[0])) != NULL) { 807 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 808 break; /* Found it. */ 809 s++; 810 } 811 if (s == NULL) /* Not present. */ 812 return(0); 813 } 814 815 /* Mark beginning of line for ^ . */ 816 regbol = string; 817 818 /* Simplest case: anchored match need be tried only once. */ 819 if (prog->reganch) 820 return(regtry(prog, string)); 821 822 /* Messy cases: unanchored match. */ 823 s = string; 824 if (prog->regstart != '\0') 825 /* We know what char it must start with. */ 826 while ((s = strchr(s, prog->regstart)) != NULL) { 827 if (regtry(prog, s)) 828 return(1); 829 s++; 830 } 831 else 832 /* We don't -- general case. */ 833 do { 834 if (regtry(prog, s)) 835 return(1); 836 } while (*s++ != '\0'); 837 838 /* Failure. */ 839 return(0); 840 } 841 842 /* 843 - regtry - try match at specific point 844 */ 845 static int /* 0 failure, 1 success */ 846 regtry(prog, string) 847 regexp *prog; 848 char *string; 849 { 850 register int i; 851 register char **sp; 852 register char **ep; 853 854 reginput = string; 855 regstartp = prog->startp; 856 regendp = prog->endp; 857 858 sp = prog->startp; 859 ep = prog->endp; 860 for (i = NSUBEXP; i > 0; i--) { 861 *sp++ = NULL; 862 *ep++ = NULL; 863 } 864 if (regmatch(prog->program + 1)) { 865 prog->startp[0] = string; 866 prog->endp[0] = reginput; 867 return(1); 868 } else 869 return(0); 870 } 871 872 /* 873 - regmatch - main matching routine 874 * 875 * Conceptually the strategy is simple: check to see whether the current 876 * node matches, call self recursively to see whether the rest matches, 877 * and then act accordingly. In practice we make some effort to avoid 878 * recursion, in particular by going through "ordinary" nodes (that don't 879 * need to know whether the rest of the match failed) by a loop instead of 880 * by recursion. 881 */ 882 static int /* 0 failure, 1 success */ 883 regmatch(prog) 884 char *prog; 885 { 886 register char *scan; /* Current node. */ 887 char *next; /* Next node. */ 888 extern char *strchr(); 889 890 scan = prog; 891 #ifdef DEBUG 892 if (scan != NULL && regnarrate) 893 fprintf(stderr, "%s(\n", regprop(scan)); 894 #endif 895 while (scan != NULL) { 896 #ifdef DEBUG 897 if (regnarrate) 898 fprintf(stderr, "%s...\n", regprop(scan)); 899 #endif 900 next = regnext(scan); 901 902 switch (OP(scan)) { 903 case BOL: 904 if (reginput != regbol) 905 return(0); 906 break; 907 case EOL: 908 if (*reginput != '\0') 909 return(0); 910 break; 911 case WORDA: 912 /* Must be looking at a letter, digit, or _ */ 913 if ((!isalnum(*reginput)) && *reginput != '_') 914 return(0); 915 /* Prev must be BOL or nonword */ 916 if (reginput > regbol && 917 (isalnum(reginput[-1]) || reginput[-1] == '_')) 918 return(0); 919 break; 920 case WORDZ: 921 /* Must be looking at non letter, digit, or _ */ 922 if (isalnum(*reginput) || *reginput == '_') 923 return(0); 924 /* We don't care what the previous char was */ 925 break; 926 case ANY: 927 if (*reginput == '\0') 928 return(0); 929 reginput++; 930 break; 931 case EXACTLY: { 932 register int len; 933 register char *opnd; 934 935 opnd = OPERAND(scan); 936 /* Inline the first character, for speed. */ 937 if (*opnd != *reginput) 938 return(0); 939 len = strlen(opnd); 940 if (len > 1 && strncmp(opnd, reginput, len) != 0) 941 return(0); 942 reginput += len; 943 } 944 break; 945 case ANYOF: 946 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 947 return(0); 948 reginput++; 949 break; 950 case ANYBUT: 951 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 952 return(0); 953 reginput++; 954 break; 955 case NOTHING: 956 break; 957 case BACK: 958 break; 959 case OPEN+1: 960 case OPEN+2: 961 case OPEN+3: 962 case OPEN+4: 963 case OPEN+5: 964 case OPEN+6: 965 case OPEN+7: 966 case OPEN+8: 967 case OPEN+9: { 968 register int no; 969 register char *save; 970 971 no = OP(scan) - OPEN; 972 save = reginput; 973 974 if (regmatch(next)) { 975 /* 976 * Don't set startp if some later 977 * invocation of the same parentheses 978 * already has. 979 */ 980 if (regstartp[no] == NULL) 981 regstartp[no] = save; 982 return(1); 983 } else 984 return(0); 985 } 986 break; 987 case CLOSE+1: 988 case CLOSE+2: 989 case CLOSE+3: 990 case CLOSE+4: 991 case CLOSE+5: 992 case CLOSE+6: 993 case CLOSE+7: 994 case CLOSE+8: 995 case CLOSE+9: { 996 register int no; 997 register char *save; 998 999 no = OP(scan) - CLOSE; 1000 save = reginput; 1001 1002 if (regmatch(next)) { 1003 /* 1004 * Don't set endp if some later 1005 * invocation of the same parentheses 1006 * already has. 1007 */ 1008 if (regendp[no] == NULL) 1009 regendp[no] = save; 1010 return(1); 1011 } else 1012 return(0); 1013 } 1014 break; 1015 case BRANCH: { 1016 register char *save; 1017 1018 if (OP(next) != BRANCH) /* No choice. */ 1019 next = OPERAND(scan); /* Avoid recursion. */ 1020 else { 1021 do { 1022 save = reginput; 1023 if (regmatch(OPERAND(scan))) 1024 return(1); 1025 reginput = save; 1026 scan = regnext(scan); 1027 } while (scan != NULL && OP(scan) == BRANCH); 1028 return(0); 1029 /* NOTREACHED */ 1030 } 1031 } 1032 break; 1033 case STAR: 1034 case PLUS: { 1035 register char nextch; 1036 register int no; 1037 register char *save; 1038 register int min; 1039 1040 /* 1041 * Lookahead to avoid useless match attempts 1042 * when we know what character comes next. 1043 */ 1044 nextch = '\0'; 1045 if (OP(next) == EXACTLY) 1046 nextch = *OPERAND(next); 1047 min = (OP(scan) == STAR) ? 0 : 1; 1048 save = reginput; 1049 no = regrepeat(OPERAND(scan)); 1050 while (no >= min) { 1051 /* If it could work, try it. */ 1052 if (nextch == '\0' || *reginput == nextch) 1053 if (regmatch(next)) 1054 return(1); 1055 /* Couldn't or didn't -- back up. */ 1056 no--; 1057 reginput = save + no; 1058 } 1059 return(0); 1060 } 1061 break; 1062 case END: 1063 return(1); /* Success! */ 1064 break; 1065 default: 1066 regerror("memory corruption"); 1067 return(0); 1068 break; 1069 } 1070 1071 scan = next; 1072 } 1073 1074 /* 1075 * We get here only if there's trouble -- normally "case END" is 1076 * the terminating point. 1077 */ 1078 regerror("corrupted pointers"); 1079 return(0); 1080 } 1081 1082 /* 1083 - regrepeat - repeatedly match something simple, report how many 1084 */ 1085 static int 1086 regrepeat(p) 1087 char *p; 1088 { 1089 register int count = 0; 1090 register char *scan; 1091 register char *opnd; 1092 1093 scan = reginput; 1094 opnd = OPERAND(p); 1095 switch (OP(p)) { 1096 case ANY: 1097 count = strlen(scan); 1098 scan += count; 1099 break; 1100 case EXACTLY: 1101 while (*opnd == *scan) { 1102 count++; 1103 scan++; 1104 } 1105 break; 1106 case ANYOF: 1107 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1108 count++; 1109 scan++; 1110 } 1111 break; 1112 case ANYBUT: 1113 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1114 count++; 1115 scan++; 1116 } 1117 break; 1118 default: /* Oh dear. Called inappropriately. */ 1119 regerror("internal foulup"); 1120 count = 0; /* Best compromise. */ 1121 break; 1122 } 1123 reginput = scan; 1124 1125 return(count); 1126 } 1127 1128 /* 1129 - regnext - dig the "next" pointer out of a node 1130 */ 1131 static char * 1132 regnext(p) 1133 register char *p; 1134 { 1135 register int offset; 1136 1137 if (p == ®dummy) 1138 return(NULL); 1139 1140 offset = NEXT(p); 1141 if (offset == 0) 1142 return(NULL); 1143 1144 if (OP(p) == BACK) 1145 return(p-offset); 1146 else 1147 return(p+offset); 1148 } 1149 1150 #ifdef DEBUG 1151 1152 STATIC char *regprop(); 1153 1154 /* 1155 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1156 */ 1157 void 1158 regdump(r) 1159 regexp *r; 1160 { 1161 register char *s; 1162 register char op = EXACTLY; /* Arbitrary non-END op. */ 1163 register char *next; 1164 extern char *strchr(); 1165 1166 1167 s = r->program + 1; 1168 while (op != END) { /* While that wasn't END last time... */ 1169 op = OP(s); 1170 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1171 next = regnext(s); 1172 if (next == NULL) /* Next ptr. */ 1173 printf("(0)"); 1174 else 1175 printf("(%d)", (s-r->program)+(next-s)); 1176 s += 3; 1177 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1178 /* Literal string, where present. */ 1179 while (*s != '\0') { 1180 putchar(*s); 1181 s++; 1182 } 1183 s++; 1184 } 1185 putchar('\n'); 1186 } 1187 1188 /* Header fields of interest. */ 1189 if (r->regstart != '\0') 1190 printf("start `%c' ", r->regstart); 1191 if (r->reganch) 1192 printf("anchored "); 1193 if (r->regmust != NULL) 1194 printf("must have \"%s\"", r->regmust); 1195 printf("\n"); 1196 } 1197 1198 /* 1199 - regprop - printable representation of opcode 1200 */ 1201 static char * 1202 regprop(op) 1203 char *op; 1204 { 1205 register char *p; 1206 static char buf[50]; 1207 1208 (void) strcpy(buf, ":"); 1209 1210 switch (OP(op)) { 1211 case BOL: 1212 p = "BOL"; 1213 break; 1214 case EOL: 1215 p = "EOL"; 1216 break; 1217 case ANY: 1218 p = "ANY"; 1219 break; 1220 case ANYOF: 1221 p = "ANYOF"; 1222 break; 1223 case ANYBUT: 1224 p = "ANYBUT"; 1225 break; 1226 case BRANCH: 1227 p = "BRANCH"; 1228 break; 1229 case EXACTLY: 1230 p = "EXACTLY"; 1231 break; 1232 case NOTHING: 1233 p = "NOTHING"; 1234 break; 1235 case BACK: 1236 p = "BACK"; 1237 break; 1238 case END: 1239 p = "END"; 1240 break; 1241 case OPEN+1: 1242 case OPEN+2: 1243 case OPEN+3: 1244 case OPEN+4: 1245 case OPEN+5: 1246 case OPEN+6: 1247 case OPEN+7: 1248 case OPEN+8: 1249 case OPEN+9: 1250 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1251 p = NULL; 1252 break; 1253 case CLOSE+1: 1254 case CLOSE+2: 1255 case CLOSE+3: 1256 case CLOSE+4: 1257 case CLOSE+5: 1258 case CLOSE+6: 1259 case CLOSE+7: 1260 case CLOSE+8: 1261 case CLOSE+9: 1262 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1263 p = NULL; 1264 break; 1265 case STAR: 1266 p = "STAR"; 1267 break; 1268 case PLUS: 1269 p = "PLUS"; 1270 break; 1271 case WORDA: 1272 p = "WORDA"; 1273 break; 1274 case WORDZ: 1275 p = "WORDZ"; 1276 break; 1277 default: 1278 regerror("corrupted opcode"); 1279 break; 1280 } 1281 if (p != NULL) 1282 (void) strcat(buf, p); 1283 return(buf); 1284 } 1285 #endif 1286 1287 /* 1288 * The following is provided for those people who do not have strcspn() in 1289 * their C libraries. They should get off their butts and do something 1290 * about it; at least one public-domain implementation of those (highly 1291 * useful) string routines has been published on Usenet. 1292 */ 1293 #ifdef STRCSPN 1294 /* 1295 * strcspn - find length of initial segment of s1 consisting entirely 1296 * of characters not from s2 1297 */ 1298 1299 static int 1300 strcspn(s1, s2) 1301 char *s1; 1302 char *s2; 1303 { 1304 register char *scan1; 1305 register char *scan2; 1306 register int count; 1307 1308 count = 0; 1309 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1310 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1311 if (*scan1 == *scan2++) 1312 return(count); 1313 count++; 1314 } 1315 return(count); 1316 } 1317 #endif 1318