1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22 *** to assist in implementing egrep. 23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25 *** as in BSD grep and ex. 26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30 * 31 * Beware that some of this code is subtly aware of the way operator 32 * precedence is structured in regular expressions. Serious changes in 33 * regular-expression syntax might require a total rethink. 34 */ 35 #include <regexp.h> 36 #include <stdio.h> 37 #include <ctype.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include "regmagic.h" 41 42 /* 43 * The "internal use only" fields in regexp.h are present to pass info from 44 * compile to execute that permits the execute phase to run lots faster on 45 * simple cases. They are: 46 * 47 * regstart char that must begin a match; '\0' if none obvious 48 * reganch is the match anchored (at beginning-of-line only)? 49 * regmust string (pointer into program) that match must include, or NULL 50 * regmlen length of regmust string 51 * 52 * Regstart and reganch permit very fast decisions on suitable starting points 53 * for a match, cutting down the work a lot. Regmust permits fast rejection 54 * of lines that cannot possibly match. The regmust tests are costly enough 55 * that regcomp() supplies a regmust only if the r.e. contains something 56 * potentially expensive (at present, the only such thing detected is * or + 57 * at the start of the r.e., which can involve a lot of backup). Regmlen is 58 * supplied because the test in regexec() needs it and regcomp() is computing 59 * it anyway. 60 */ 61 62 /* 63 * Structure for regexp "program". This is essentially a linear encoding 64 * of a nondeterministic finite-state machine (aka syntax charts or 65 * "railroad normal form" in parsing technology). Each node is an opcode 66 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 67 * all nodes except BRANCH implement concatenation; a "next" pointer with 68 * a BRANCH on both ends of it is connecting two alternatives. (Here we 69 * have one of the subtle syntax dependencies: an individual BRANCH (as 70 * opposed to a collection of them) is never concatenated with anything 71 * because of operator precedence.) The operand of some types of node is 72 * a literal string; for others, it is a node leading into a sub-FSM. In 73 * particular, the operand of a BRANCH node is the first node of the branch. 74 * (NB this is *not* a tree structure: the tail of the branch connects 75 * to the thing following the set of BRANCHes.) The opcodes are: 76 */ 77 78 /* definition number opnd? meaning */ 79 #define END 0 /* no End of program. */ 80 #define BOL 1 /* no Match "" at beginning of line. */ 81 #define EOL 2 /* no Match "" at end of line. */ 82 #define ANY 3 /* no Match any one character. */ 83 #define ANYOF 4 /* str Match any character in this string. */ 84 #define ANYBUT 5 /* str Match any character not in this string. */ 85 #define BRANCH 6 /* node Match this alternative, or the next... */ 86 #define BACK 7 /* no Match "", "next" ptr points backward. */ 87 #define EXACTLY 8 /* str Match this string. */ 88 #define NOTHING 9 /* no Match empty string. */ 89 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 90 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 91 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 92 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 93 #define OPEN 20 /* no Mark this point in input as start of #n. */ 94 /* OPEN+1 is number 1, etc. */ 95 #define CLOSE 30 /* no Analogous to OPEN. */ 96 97 /* 98 * Opcode notes: 99 * 100 * BRANCH The set of branches constituting a single choice are hooked 101 * together with their "next" pointers, since precedence prevents 102 * anything being concatenated to any individual branch. The 103 * "next" pointer of the last BRANCH in a choice points to the 104 * thing following the whole choice. This is also where the 105 * final "next" pointer of each individual branch points; each 106 * branch starts with the operand node of a BRANCH node. 107 * 108 * BACK Normal "next" pointers all implicitly point forward; BACK 109 * exists to make loop structures possible. 110 * 111 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 112 * BRANCH structures using BACK. Simple cases (one character 113 * per match) are implemented with STAR and PLUS for speed 114 * and to minimize recursive plunges. 115 * 116 * OPEN,CLOSE ...are numbered at compile time. 117 */ 118 119 /* 120 * A node is one char of opcode followed by two chars of "next" pointer. 121 * "Next" pointers are stored as two 8-bit pieces, high order first. The 122 * value is a positive offset from the opcode of the node containing it. 123 * An operand, if any, simply follows the node. (Note that much of the 124 * code generation knows about this implicit relationship.) 125 * 126 * Using two bytes for the "next" pointer is vast overkill for most things, 127 * but allows patterns to get big without disasters. 128 */ 129 #define OP(p) (*(p)) 130 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 131 #define OPERAND(p) ((p) + 3) 132 133 /* 134 * See regmagic.h for one further detail of program structure. 135 */ 136 137 138 /* 139 * Utility definitions. 140 */ 141 #ifndef CHARBITS 142 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 143 #else 144 #define UCHARAT(p) ((int)*(p)&CHARBITS) 145 #endif 146 147 #define FAIL(m) { regerror(m); return(NULL); } 148 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 149 150 /* 151 * Flags to be passed up and down. 152 */ 153 #define HASWIDTH 01 /* Known never to match null string. */ 154 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 155 #define SPSTART 04 /* Starts with * or +. */ 156 #define WORST 0 /* Worst case. */ 157 158 /* 159 * Global work variables for regcomp(). 160 */ 161 static char *regparse; /* Input-scan pointer. */ 162 static int regnpar; /* () count. */ 163 static char regdummy; 164 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 165 static long regsize; /* Code size. */ 166 167 /* 168 * Forward declarations for regcomp()'s friends. 169 */ 170 #ifndef STATIC 171 #define STATIC static 172 #endif 173 STATIC char *reg(); 174 STATIC char *regbranch(); 175 STATIC char *regpiece(); 176 STATIC char *regatom(); 177 STATIC char *regnode(); 178 STATIC char *regnext(); 179 STATIC void regc(); 180 STATIC void reginsert(); 181 STATIC void regtail(); 182 STATIC void regoptail(); 183 #ifdef STRCSPN 184 STATIC int strcspn(); 185 #endif 186 187 /* 188 - regcomp - compile a regular expression into internal code 189 * 190 * We can't allocate space until we know how big the compiled form will be, 191 * but we can't compile it (and thus know how big it is) until we've got a 192 * place to put the code. So we cheat: we compile it twice, once with code 193 * generation turned off and size counting turned on, and once "for real". 194 * This also means that we don't allocate space until we are sure that the 195 * thing really will compile successfully, and we never have to move the 196 * code and thus invalidate pointers into it. (Note that it has to be in 197 * one piece because free() must be able to free it all.) 198 * 199 * Beware that the optimization-preparation code in here knows about some 200 * of the structure of the compiled regexp. 201 */ 202 regexp * 203 regcomp(exp) 204 const char *exp; 205 { 206 register regexp *r; 207 register char *scan; 208 register char *longest; 209 register int len; 210 int flags; 211 212 if (exp == NULL) 213 FAIL("NULL argument"); 214 215 /* First pass: determine size, legality. */ 216 #ifdef notdef 217 if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ 218 #endif 219 regparse = (char *)exp; 220 regnpar = 1; 221 regsize = 0L; 222 regcode = ®dummy; 223 regc(MAGIC); 224 if (reg(0, &flags) == NULL) 225 return(NULL); 226 227 /* Small enough for pointer-storage convention? */ 228 if (regsize >= 32767L) /* Probably could be 65535L. */ 229 FAIL("regexp too big"); 230 231 /* Allocate space. */ 232 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 233 if (r == NULL) 234 FAIL("out of space"); 235 236 /* Second pass: emit code. */ 237 regparse = (char *)exp; 238 regnpar = 1; 239 regcode = r->program; 240 regc(MAGIC); 241 if (reg(0, &flags) == NULL) 242 return(NULL); 243 244 /* Dig out information for optimizations. */ 245 r->regstart = '\0'; /* Worst-case defaults. */ 246 r->reganch = 0; 247 r->regmust = NULL; 248 r->regmlen = 0; 249 scan = r->program+1; /* First BRANCH. */ 250 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 251 scan = OPERAND(scan); 252 253 /* Starting-point info. */ 254 if (OP(scan) == EXACTLY) 255 r->regstart = *OPERAND(scan); 256 else if (OP(scan) == BOL) 257 r->reganch++; 258 259 /* 260 * If there's something expensive in the r.e., find the 261 * longest literal string that must appear and make it the 262 * regmust. Resolve ties in favor of later strings, since 263 * the regstart check works with the beginning of the r.e. 264 * and avoiding duplication strengthens checking. Not a 265 * strong reason, but sufficient in the absence of others. 266 */ 267 if (flags&SPSTART) { 268 longest = NULL; 269 len = 0; 270 for (; scan != NULL; scan = regnext(scan)) 271 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 272 longest = OPERAND(scan); 273 len = strlen(OPERAND(scan)); 274 } 275 r->regmust = longest; 276 r->regmlen = len; 277 } 278 } 279 280 return(r); 281 } 282 283 /* 284 - reg - regular expression, i.e. main body or parenthesized thing 285 * 286 * Caller must absorb opening parenthesis. 287 * 288 * Combining parenthesis handling with the base level of regular expression 289 * is a trifle forced, but the need to tie the tails of the branches to what 290 * follows makes it hard to avoid. 291 */ 292 static char * 293 reg(paren, flagp) 294 int paren; /* Parenthesized? */ 295 int *flagp; 296 { 297 register char *ret; 298 register char *br; 299 register char *ender; 300 register int parno; 301 int flags; 302 303 *flagp = HASWIDTH; /* Tentatively. */ 304 305 /* Make an OPEN node, if parenthesized. */ 306 if (paren) { 307 if (regnpar >= NSUBEXP) 308 FAIL("too many ()"); 309 parno = regnpar; 310 regnpar++; 311 ret = regnode(OPEN+parno); 312 } else 313 ret = NULL; 314 315 /* Pick up the branches, linking them together. */ 316 br = regbranch(&flags); 317 if (br == NULL) 318 return(NULL); 319 if (ret != NULL) 320 regtail(ret, br); /* OPEN -> first. */ 321 else 322 ret = br; 323 if (!(flags&HASWIDTH)) 324 *flagp &= ~HASWIDTH; 325 *flagp |= flags&SPSTART; 326 while (*regparse == '|' || *regparse == '\n') { 327 regparse++; 328 br = regbranch(&flags); 329 if (br == NULL) 330 return(NULL); 331 regtail(ret, br); /* BRANCH -> BRANCH. */ 332 if (!(flags&HASWIDTH)) 333 *flagp &= ~HASWIDTH; 334 *flagp |= flags&SPSTART; 335 } 336 337 /* Make a closing node, and hook it on the end. */ 338 ender = regnode((paren) ? CLOSE+parno : END); 339 regtail(ret, ender); 340 341 /* Hook the tails of the branches to the closing node. */ 342 for (br = ret; br != NULL; br = regnext(br)) 343 regoptail(br, ender); 344 345 /* Check for proper termination. */ 346 if (paren && *regparse++ != ')') { 347 FAIL("unmatched ()"); 348 } else if (!paren && *regparse != '\0') { 349 if (*regparse == ')') { 350 FAIL("unmatched ()"); 351 } else 352 FAIL("junk on end"); /* "Can't happen". */ 353 /* NOTREACHED */ 354 } 355 356 return(ret); 357 } 358 359 /* 360 - regbranch - one alternative of an | operator 361 * 362 * Implements the concatenation operator. 363 */ 364 static char * 365 regbranch(flagp) 366 int *flagp; 367 { 368 register char *ret; 369 register char *chain; 370 register char *latest; 371 int flags; 372 373 *flagp = WORST; /* Tentatively. */ 374 375 ret = regnode(BRANCH); 376 chain = NULL; 377 while (*regparse != '\0' && *regparse != ')' && 378 *regparse != '\n' && *regparse != '|') { 379 latest = regpiece(&flags); 380 if (latest == NULL) 381 return(NULL); 382 *flagp |= flags&HASWIDTH; 383 if (chain == NULL) /* First piece. */ 384 *flagp |= flags&SPSTART; 385 else 386 regtail(chain, latest); 387 chain = latest; 388 } 389 if (chain == NULL) /* Loop ran zero times. */ 390 (void) regnode(NOTHING); 391 392 return(ret); 393 } 394 395 /* 396 - regpiece - something followed by possible [*+?] 397 * 398 * Note that the branching code sequences used for ? and the general cases 399 * of * and + are somewhat optimized: they use the same NOTHING node as 400 * both the endmarker for their branch list and the body of the last branch. 401 * It might seem that this node could be dispensed with entirely, but the 402 * endmarker role is not redundant. 403 */ 404 static char * 405 regpiece(flagp) 406 int *flagp; 407 { 408 register char *ret; 409 register char op; 410 register char *next; 411 int flags; 412 413 ret = regatom(&flags); 414 if (ret == NULL) 415 return(NULL); 416 417 op = *regparse; 418 if (!ISMULT(op)) { 419 *flagp = flags; 420 return(ret); 421 } 422 423 if (!(flags&HASWIDTH) && op != '?') 424 FAIL("*+ operand could be empty"); 425 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 426 427 if (op == '*' && (flags&SIMPLE)) 428 reginsert(STAR, ret); 429 else if (op == '*') { 430 /* Emit x* as (x&|), where & means "self". */ 431 reginsert(BRANCH, ret); /* Either x */ 432 regoptail(ret, regnode(BACK)); /* and loop */ 433 regoptail(ret, ret); /* back */ 434 regtail(ret, regnode(BRANCH)); /* or */ 435 regtail(ret, regnode(NOTHING)); /* null. */ 436 } else if (op == '+' && (flags&SIMPLE)) 437 reginsert(PLUS, ret); 438 else if (op == '+') { 439 /* Emit x+ as x(&|), where & means "self". */ 440 next = regnode(BRANCH); /* Either */ 441 regtail(ret, next); 442 regtail(regnode(BACK), ret); /* loop back */ 443 regtail(next, regnode(BRANCH)); /* or */ 444 regtail(ret, regnode(NOTHING)); /* null. */ 445 } else if (op == '?') { 446 /* Emit x? as (x|) */ 447 reginsert(BRANCH, ret); /* Either x */ 448 regtail(ret, regnode(BRANCH)); /* or */ 449 next = regnode(NOTHING); /* null. */ 450 regtail(ret, next); 451 regoptail(ret, next); 452 } 453 regparse++; 454 if (ISMULT(*regparse)) 455 FAIL("nested *?+"); 456 457 return(ret); 458 } 459 460 /* 461 - regatom - the lowest level 462 * 463 * Optimization: gobbles an entire sequence of ordinary characters so that 464 * it can turn them into a single node, which is smaller to store and 465 * faster to run. Backslashed characters are exceptions, each becoming a 466 * separate node; the code is simpler that way and it's not worth fixing. 467 */ 468 static char * 469 regatom(flagp) 470 int *flagp; 471 { 472 register char *ret; 473 int flags; 474 475 *flagp = WORST; /* Tentatively. */ 476 477 switch (*regparse++) { 478 /* FIXME: these chars only have meaning at beg/end of pat? */ 479 case '^': 480 ret = regnode(BOL); 481 break; 482 case '$': 483 ret = regnode(EOL); 484 break; 485 case '.': 486 ret = regnode(ANY); 487 *flagp |= HASWIDTH|SIMPLE; 488 break; 489 case '[': { 490 register int class; 491 register int classend; 492 493 if (*regparse == '^') { /* Complement of range. */ 494 ret = regnode(ANYBUT); 495 regparse++; 496 } else 497 ret = regnode(ANYOF); 498 if (*regparse == ']' || *regparse == '-') 499 regc(*regparse++); 500 while (*regparse != '\0' && *regparse != ']') { 501 if (*regparse == '-') { 502 regparse++; 503 if (*regparse == ']' || *regparse == '\0') 504 regc('-'); 505 else { 506 class = UCHARAT(regparse-2)+1; 507 classend = UCHARAT(regparse); 508 if (class > classend+1) 509 FAIL("invalid [] range"); 510 for (; class <= classend; class++) 511 regc(class); 512 regparse++; 513 } 514 } else 515 regc(*regparse++); 516 } 517 regc('\0'); 518 if (*regparse != ']') 519 FAIL("unmatched []"); 520 regparse++; 521 *flagp |= HASWIDTH|SIMPLE; 522 } 523 break; 524 case '(': 525 ret = reg(1, &flags); 526 if (ret == NULL) 527 return(NULL); 528 *flagp |= flags&(HASWIDTH|SPSTART); 529 break; 530 case '\0': 531 case '|': 532 case '\n': 533 case ')': 534 FAIL("internal urp"); /* Supposed to be caught earlier. */ 535 break; 536 case '?': 537 case '+': 538 case '*': 539 FAIL("?+* follows nothing"); 540 break; 541 case '\\': 542 switch (*regparse++) { 543 case '\0': 544 FAIL("trailing \\"); 545 break; 546 case '<': 547 ret = regnode(WORDA); 548 break; 549 case '>': 550 ret = regnode(WORDZ); 551 break; 552 /* FIXME: Someday handle \1, \2, ... */ 553 default: 554 /* Handle general quoted chars in exact-match routine */ 555 goto de_fault; 556 } 557 break; 558 de_fault: 559 default: 560 /* 561 * Encode a string of characters to be matched exactly. 562 * 563 * This is a bit tricky due to quoted chars and due to 564 * '*', '+', and '?' taking the SINGLE char previous 565 * as their operand. 566 * 567 * On entry, the char at regparse[-1] is going to go 568 * into the string, no matter what it is. (It could be 569 * following a \ if we are entered from the '\' case.) 570 * 571 * Basic idea is to pick up a good char in ch and 572 * examine the next char. If it's *+? then we twiddle. 573 * If it's \ then we frozzle. If it's other magic char 574 * we push ch and terminate the string. If none of the 575 * above, we push ch on the string and go around again. 576 * 577 * regprev is used to remember where "the current char" 578 * starts in the string, if due to a *+? we need to back 579 * up and put the current char in a separate, 1-char, string. 580 * When regprev is NULL, ch is the only char in the 581 * string; this is used in *+? handling, and in setting 582 * flags |= SIMPLE at the end. 583 */ 584 { 585 char *regprev; 586 register char ch; 587 588 regparse--; /* Look at cur char */ 589 ret = regnode(EXACTLY); 590 for ( regprev = 0 ; ; ) { 591 ch = *regparse++; /* Get current char */ 592 switch (*regparse) { /* look at next one */ 593 594 default: 595 regc(ch); /* Add cur to string */ 596 break; 597 598 case '.': case '[': case '(': 599 case ')': case '|': case '\n': 600 case '$': case '^': 601 case '\0': 602 /* FIXME, $ and ^ should not always be magic */ 603 magic: 604 regc(ch); /* dump cur char */ 605 goto done; /* and we are done */ 606 607 case '?': case '+': case '*': 608 if (!regprev) /* If just ch in str, */ 609 goto magic; /* use it */ 610 /* End mult-char string one early */ 611 regparse = regprev; /* Back up parse */ 612 goto done; 613 614 case '\\': 615 regc(ch); /* Cur char OK */ 616 switch (regparse[1]){ /* Look after \ */ 617 case '\0': 618 case '<': 619 case '>': 620 /* FIXME: Someday handle \1, \2, ... */ 621 goto done; /* Not quoted */ 622 default: 623 /* Backup point is \, scan * point is after it. */ 624 regprev = regparse; 625 regparse++; 626 continue; /* NOT break; */ 627 } 628 } 629 regprev = regparse; /* Set backup point */ 630 } 631 done: 632 regc('\0'); 633 *flagp |= HASWIDTH; 634 if (!regprev) /* One char? */ 635 *flagp |= SIMPLE; 636 } 637 break; 638 } 639 640 return(ret); 641 } 642 643 /* 644 - regnode - emit a node 645 */ 646 static char * /* Location. */ 647 regnode(op) 648 char op; 649 { 650 register char *ret; 651 register char *ptr; 652 653 ret = regcode; 654 if (ret == ®dummy) { 655 regsize += 3; 656 return(ret); 657 } 658 659 ptr = ret; 660 *ptr++ = op; 661 *ptr++ = '\0'; /* Null "next" pointer. */ 662 *ptr++ = '\0'; 663 regcode = ptr; 664 665 return(ret); 666 } 667 668 /* 669 - regc - emit (if appropriate) a byte of code 670 */ 671 static void 672 regc(b) 673 char b; 674 { 675 if (regcode != ®dummy) 676 *regcode++ = b; 677 else 678 regsize++; 679 } 680 681 /* 682 - reginsert - insert an operator in front of already-emitted operand 683 * 684 * Means relocating the operand. 685 */ 686 static void 687 reginsert(op, opnd) 688 char op; 689 char *opnd; 690 { 691 register char *src; 692 register char *dst; 693 register char *place; 694 695 if (regcode == ®dummy) { 696 regsize += 3; 697 return; 698 } 699 700 src = regcode; 701 regcode += 3; 702 dst = regcode; 703 while (src > opnd) 704 *--dst = *--src; 705 706 place = opnd; /* Op node, where operand used to be. */ 707 *place++ = op; 708 *place++ = '\0'; 709 *place++ = '\0'; 710 } 711 712 /* 713 - regtail - set the next-pointer at the end of a node chain 714 */ 715 static void 716 regtail(p, val) 717 char *p; 718 char *val; 719 { 720 register char *scan; 721 register char *temp; 722 register int offset; 723 724 if (p == ®dummy) 725 return; 726 727 /* Find last node. */ 728 scan = p; 729 for (;;) { 730 temp = regnext(scan); 731 if (temp == NULL) 732 break; 733 scan = temp; 734 } 735 736 if (OP(scan) == BACK) 737 offset = scan - val; 738 else 739 offset = val - scan; 740 *(scan+1) = (offset>>8)&0377; 741 *(scan+2) = offset&0377; 742 } 743 744 /* 745 - regoptail - regtail on operand of first argument; nop if operandless 746 */ 747 static void 748 regoptail(p, val) 749 char *p; 750 char *val; 751 { 752 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 753 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 754 return; 755 regtail(OPERAND(p), val); 756 } 757 758 /* 759 * regexec and friends 760 */ 761 762 /* 763 * Global work variables for regexec(). 764 */ 765 static char *reginput; /* String-input pointer. */ 766 static char *regbol; /* Beginning of input, for ^ check. */ 767 static char **regstartp; /* Pointer to startp array. */ 768 static char **regendp; /* Ditto for endp. */ 769 770 /* 771 * Forwards. 772 */ 773 STATIC int regtry(); 774 STATIC int regmatch(); 775 STATIC int regrepeat(); 776 777 #ifdef DEBUG 778 int regnarrate = 0; 779 void regdump(); 780 STATIC char *regprop(); 781 #endif 782 783 /* 784 - regexec - match a regexp against a string 785 */ 786 int 787 regexec(prog, string) 788 register const regexp *prog; 789 register const char *string; 790 { 791 register char *s; 792 extern char *strchr(); 793 794 /* Be paranoid... */ 795 if (prog == NULL || string == NULL) { 796 regerror("NULL parameter"); 797 return(0); 798 } 799 800 /* Check validity of program. */ 801 if (UCHARAT(prog->program) != MAGIC) { 802 regerror("corrupted program"); 803 return(0); 804 } 805 806 /* If there is a "must appear" string, look for it. */ 807 if (prog->regmust != NULL) { 808 s = (char *)string; 809 while ((s = strchr(s, prog->regmust[0])) != NULL) { 810 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 811 break; /* Found it. */ 812 s++; 813 } 814 if (s == NULL) /* Not present. */ 815 return(0); 816 } 817 818 /* Mark beginning of line for ^ . */ 819 regbol = (char *)string; 820 821 /* Simplest case: anchored match need be tried only once. */ 822 if (prog->reganch) 823 return(regtry(prog, string)); 824 825 /* Messy cases: unanchored match. */ 826 s = (char *)string; 827 if (prog->regstart != '\0') 828 /* We know what char it must start with. */ 829 while ((s = strchr(s, prog->regstart)) != NULL) { 830 if (regtry(prog, s)) 831 return(1); 832 s++; 833 } 834 else 835 /* We don't -- general case. */ 836 do { 837 if (regtry(prog, s)) 838 return(1); 839 } while (*s++ != '\0'); 840 841 /* Failure. */ 842 return(0); 843 } 844 845 /* 846 - regtry - try match at specific point 847 */ 848 static int /* 0 failure, 1 success */ 849 regtry(prog, string) 850 regexp *prog; 851 char *string; 852 { 853 register int i; 854 register char **sp; 855 register char **ep; 856 857 reginput = string; 858 regstartp = prog->startp; 859 regendp = prog->endp; 860 861 sp = prog->startp; 862 ep = prog->endp; 863 for (i = NSUBEXP; i > 0; i--) { 864 *sp++ = NULL; 865 *ep++ = NULL; 866 } 867 if (regmatch(prog->program + 1)) { 868 prog->startp[0] = string; 869 prog->endp[0] = reginput; 870 return(1); 871 } else 872 return(0); 873 } 874 875 /* 876 - regmatch - main matching routine 877 * 878 * Conceptually the strategy is simple: check to see whether the current 879 * node matches, call self recursively to see whether the rest matches, 880 * and then act accordingly. In practice we make some effort to avoid 881 * recursion, in particular by going through "ordinary" nodes (that don't 882 * need to know whether the rest of the match failed) by a loop instead of 883 * by recursion. 884 */ 885 static int /* 0 failure, 1 success */ 886 regmatch(prog) 887 char *prog; 888 { 889 register char *scan; /* Current node. */ 890 char *next; /* Next node. */ 891 extern char *strchr(); 892 893 scan = prog; 894 #ifdef DEBUG 895 if (scan != NULL && regnarrate) 896 fprintf(stderr, "%s(\n", regprop(scan)); 897 #endif 898 while (scan != NULL) { 899 #ifdef DEBUG 900 if (regnarrate) 901 fprintf(stderr, "%s...\n", regprop(scan)); 902 #endif 903 next = regnext(scan); 904 905 switch (OP(scan)) { 906 case BOL: 907 if (reginput != regbol) 908 return(0); 909 break; 910 case EOL: 911 if (*reginput != '\0') 912 return(0); 913 break; 914 case WORDA: 915 /* Must be looking at a letter, digit, or _ */ 916 if ((!isalnum(*reginput)) && *reginput != '_') 917 return(0); 918 /* Prev must be BOL or nonword */ 919 if (reginput > regbol && 920 (isalnum(reginput[-1]) || reginput[-1] == '_')) 921 return(0); 922 break; 923 case WORDZ: 924 /* Must be looking at non letter, digit, or _ */ 925 if (isalnum(*reginput) || *reginput == '_') 926 return(0); 927 /* We don't care what the previous char was */ 928 break; 929 case ANY: 930 if (*reginput == '\0') 931 return(0); 932 reginput++; 933 break; 934 case EXACTLY: { 935 register int len; 936 register char *opnd; 937 938 opnd = OPERAND(scan); 939 /* Inline the first character, for speed. */ 940 if (*opnd != *reginput) 941 return(0); 942 len = strlen(opnd); 943 if (len > 1 && strncmp(opnd, reginput, len) != 0) 944 return(0); 945 reginput += len; 946 } 947 break; 948 case ANYOF: 949 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 950 return(0); 951 reginput++; 952 break; 953 case ANYBUT: 954 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 955 return(0); 956 reginput++; 957 break; 958 case NOTHING: 959 break; 960 case BACK: 961 break; 962 case OPEN+1: 963 case OPEN+2: 964 case OPEN+3: 965 case OPEN+4: 966 case OPEN+5: 967 case OPEN+6: 968 case OPEN+7: 969 case OPEN+8: 970 case OPEN+9: { 971 register int no; 972 register char *save; 973 974 no = OP(scan) - OPEN; 975 save = reginput; 976 977 if (regmatch(next)) { 978 /* 979 * Don't set startp if some later 980 * invocation of the same parentheses 981 * already has. 982 */ 983 if (regstartp[no] == NULL) 984 regstartp[no] = save; 985 return(1); 986 } else 987 return(0); 988 } 989 break; 990 case CLOSE+1: 991 case CLOSE+2: 992 case CLOSE+3: 993 case CLOSE+4: 994 case CLOSE+5: 995 case CLOSE+6: 996 case CLOSE+7: 997 case CLOSE+8: 998 case CLOSE+9: { 999 register int no; 1000 register char *save; 1001 1002 no = OP(scan) - CLOSE; 1003 save = reginput; 1004 1005 if (regmatch(next)) { 1006 /* 1007 * Don't set endp if some later 1008 * invocation of the same parentheses 1009 * already has. 1010 */ 1011 if (regendp[no] == NULL) 1012 regendp[no] = save; 1013 return(1); 1014 } else 1015 return(0); 1016 } 1017 break; 1018 case BRANCH: { 1019 register char *save; 1020 1021 if (OP(next) != BRANCH) /* No choice. */ 1022 next = OPERAND(scan); /* Avoid recursion. */ 1023 else { 1024 do { 1025 save = reginput; 1026 if (regmatch(OPERAND(scan))) 1027 return(1); 1028 reginput = save; 1029 scan = regnext(scan); 1030 } while (scan != NULL && OP(scan) == BRANCH); 1031 return(0); 1032 /* NOTREACHED */ 1033 } 1034 } 1035 break; 1036 case STAR: 1037 case PLUS: { 1038 register char nextch; 1039 register int no; 1040 register char *save; 1041 register int min; 1042 1043 /* 1044 * Lookahead to avoid useless match attempts 1045 * when we know what character comes next. 1046 */ 1047 nextch = '\0'; 1048 if (OP(next) == EXACTLY) 1049 nextch = *OPERAND(next); 1050 min = (OP(scan) == STAR) ? 0 : 1; 1051 save = reginput; 1052 no = regrepeat(OPERAND(scan)); 1053 while (no >= min) { 1054 /* If it could work, try it. */ 1055 if (nextch == '\0' || *reginput == nextch) 1056 if (regmatch(next)) 1057 return(1); 1058 /* Couldn't or didn't -- back up. */ 1059 no--; 1060 reginput = save + no; 1061 } 1062 return(0); 1063 } 1064 break; 1065 case END: 1066 return(1); /* Success! */ 1067 break; 1068 default: 1069 regerror("memory corruption"); 1070 return(0); 1071 break; 1072 } 1073 1074 scan = next; 1075 } 1076 1077 /* 1078 * We get here only if there's trouble -- normally "case END" is 1079 * the terminating point. 1080 */ 1081 regerror("corrupted pointers"); 1082 return(0); 1083 } 1084 1085 /* 1086 - regrepeat - repeatedly match something simple, report how many 1087 */ 1088 static int 1089 regrepeat(p) 1090 char *p; 1091 { 1092 register int count = 0; 1093 register char *scan; 1094 register char *opnd; 1095 1096 scan = reginput; 1097 opnd = OPERAND(p); 1098 switch (OP(p)) { 1099 case ANY: 1100 count = strlen(scan); 1101 scan += count; 1102 break; 1103 case EXACTLY: 1104 while (*opnd == *scan) { 1105 count++; 1106 scan++; 1107 } 1108 break; 1109 case ANYOF: 1110 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1111 count++; 1112 scan++; 1113 } 1114 break; 1115 case ANYBUT: 1116 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1117 count++; 1118 scan++; 1119 } 1120 break; 1121 default: /* Oh dear. Called inappropriately. */ 1122 regerror("internal foulup"); 1123 count = 0; /* Best compromise. */ 1124 break; 1125 } 1126 reginput = scan; 1127 1128 return(count); 1129 } 1130 1131 /* 1132 - regnext - dig the "next" pointer out of a node 1133 */ 1134 static char * 1135 regnext(p) 1136 register char *p; 1137 { 1138 register int offset; 1139 1140 if (p == ®dummy) 1141 return(NULL); 1142 1143 offset = NEXT(p); 1144 if (offset == 0) 1145 return(NULL); 1146 1147 if (OP(p) == BACK) 1148 return(p-offset); 1149 else 1150 return(p+offset); 1151 } 1152 1153 #ifdef DEBUG 1154 1155 STATIC char *regprop(); 1156 1157 /* 1158 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1159 */ 1160 void 1161 regdump(r) 1162 regexp *r; 1163 { 1164 register char *s; 1165 register char op = EXACTLY; /* Arbitrary non-END op. */ 1166 register char *next; 1167 extern char *strchr(); 1168 1169 1170 s = r->program + 1; 1171 while (op != END) { /* While that wasn't END last time... */ 1172 op = OP(s); 1173 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1174 next = regnext(s); 1175 if (next == NULL) /* Next ptr. */ 1176 printf("(0)"); 1177 else 1178 printf("(%d)", (s-r->program)+(next-s)); 1179 s += 3; 1180 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1181 /* Literal string, where present. */ 1182 while (*s != '\0') { 1183 putchar(*s); 1184 s++; 1185 } 1186 s++; 1187 } 1188 putchar('\n'); 1189 } 1190 1191 /* Header fields of interest. */ 1192 if (r->regstart != '\0') 1193 printf("start `%c' ", r->regstart); 1194 if (r->reganch) 1195 printf("anchored "); 1196 if (r->regmust != NULL) 1197 printf("must have \"%s\"", r->regmust); 1198 printf("\n"); 1199 } 1200 1201 /* 1202 - regprop - printable representation of opcode 1203 */ 1204 static char * 1205 regprop(op) 1206 char *op; 1207 { 1208 register char *p; 1209 static char buf[50]; 1210 1211 (void) strcpy(buf, ":"); 1212 1213 switch (OP(op)) { 1214 case BOL: 1215 p = "BOL"; 1216 break; 1217 case EOL: 1218 p = "EOL"; 1219 break; 1220 case ANY: 1221 p = "ANY"; 1222 break; 1223 case ANYOF: 1224 p = "ANYOF"; 1225 break; 1226 case ANYBUT: 1227 p = "ANYBUT"; 1228 break; 1229 case BRANCH: 1230 p = "BRANCH"; 1231 break; 1232 case EXACTLY: 1233 p = "EXACTLY"; 1234 break; 1235 case NOTHING: 1236 p = "NOTHING"; 1237 break; 1238 case BACK: 1239 p = "BACK"; 1240 break; 1241 case END: 1242 p = "END"; 1243 break; 1244 case OPEN+1: 1245 case OPEN+2: 1246 case OPEN+3: 1247 case OPEN+4: 1248 case OPEN+5: 1249 case OPEN+6: 1250 case OPEN+7: 1251 case OPEN+8: 1252 case OPEN+9: 1253 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1254 p = NULL; 1255 break; 1256 case CLOSE+1: 1257 case CLOSE+2: 1258 case CLOSE+3: 1259 case CLOSE+4: 1260 case CLOSE+5: 1261 case CLOSE+6: 1262 case CLOSE+7: 1263 case CLOSE+8: 1264 case CLOSE+9: 1265 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1266 p = NULL; 1267 break; 1268 case STAR: 1269 p = "STAR"; 1270 break; 1271 case PLUS: 1272 p = "PLUS"; 1273 break; 1274 case WORDA: 1275 p = "WORDA"; 1276 break; 1277 case WORDZ: 1278 p = "WORDZ"; 1279 break; 1280 default: 1281 regerror("corrupted opcode"); 1282 break; 1283 } 1284 if (p != NULL) 1285 (void) strcat(buf, p); 1286 return(buf); 1287 } 1288 #endif 1289 1290 /* 1291 * The following is provided for those people who do not have strcspn() in 1292 * their C libraries. They should get off their butts and do something 1293 * about it; at least one public-domain implementation of those (highly 1294 * useful) string routines has been published on Usenet. 1295 */ 1296 #ifdef STRCSPN 1297 /* 1298 * strcspn - find length of initial segment of s1 consisting entirely 1299 * of characters not from s2 1300 */ 1301 1302 static int 1303 strcspn(s1, s2) 1304 char *s1; 1305 char *s2; 1306 { 1307 register char *scan1; 1308 register char *scan2; 1309 register int count; 1310 1311 count = 0; 1312 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1313 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1314 if (*scan1 == *scan2++) 1315 return(count); 1316 count++; 1317 } 1318 return(count); 1319 } 1320 #endif 1321