1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22 *** to assist in implementing egrep. 23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25 *** as in BSD grep and ex. 26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30 * 31 * Beware that some of this code is subtly aware of the way operator 32 * precedence is structured in regular expressions. Serious changes in 33 * regular-expression syntax might require a total rethink. 34 */ 35 #include <regexp.h> 36 #include <stdio.h> 37 #include <ctype.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include "regmagic.h" 41 42 /* 43 * The "internal use only" fields in regexp.h are present to pass info from 44 * compile to execute that permits the execute phase to run lots faster on 45 * simple cases. They are: 46 * 47 * regstart char that must begin a match; '\0' if none obvious 48 * reganch is the match anchored (at beginning-of-line only)? 49 * regmust string (pointer into program) that match must include, or NULL 50 * regmlen length of regmust string 51 * 52 * Regstart and reganch permit very fast decisions on suitable starting points 53 * for a match, cutting down the work a lot. Regmust permits fast rejection 54 * of lines that cannot possibly match. The regmust tests are costly enough 55 * that regcomp() supplies a regmust only if the r.e. contains something 56 * potentially expensive (at present, the only such thing detected is * or + 57 * at the start of the r.e., which can involve a lot of backup). Regmlen is 58 * supplied because the test in regexec() needs it and regcomp() is computing 59 * it anyway. 60 */ 61 62 /* 63 * Structure for regexp "program". This is essentially a linear encoding 64 * of a nondeterministic finite-state machine (aka syntax charts or 65 * "railroad normal form" in parsing technology). Each node is an opcode 66 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 67 * all nodes except BRANCH implement concatenation; a "next" pointer with 68 * a BRANCH on both ends of it is connecting two alternatives. (Here we 69 * have one of the subtle syntax dependencies: an individual BRANCH (as 70 * opposed to a collection of them) is never concatenated with anything 71 * because of operator precedence.) The operand of some types of node is 72 * a literal string; for others, it is a node leading into a sub-FSM. In 73 * particular, the operand of a BRANCH node is the first node of the branch. 74 * (NB this is *not* a tree structure: the tail of the branch connects 75 * to the thing following the set of BRANCHes.) The opcodes are: 76 */ 77 78 /* definition number opnd? meaning */ 79 #define END 0 /* no End of program. */ 80 #define BOL 1 /* no Match "" at beginning of line. */ 81 #define EOL 2 /* no Match "" at end of line. */ 82 #define ANY 3 /* no Match any one character. */ 83 #define ANYOF 4 /* str Match any character in this string. */ 84 #define ANYBUT 5 /* str Match any character not in this string. */ 85 #define BRANCH 6 /* node Match this alternative, or the next... */ 86 #define BACK 7 /* no Match "", "next" ptr points backward. */ 87 #define EXACTLY 8 /* str Match this string. */ 88 #define NOTHING 9 /* no Match empty string. */ 89 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 90 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 91 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 92 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 93 #define OPEN 20 /* no Mark this point in input as start of #n. */ 94 /* OPEN+1 is number 1, etc. */ 95 #define CLOSE 30 /* no Analogous to OPEN. */ 96 97 /* 98 * Opcode notes: 99 * 100 * BRANCH The set of branches constituting a single choice are hooked 101 * together with their "next" pointers, since precedence prevents 102 * anything being concatenated to any individual branch. The 103 * "next" pointer of the last BRANCH in a choice points to the 104 * thing following the whole choice. This is also where the 105 * final "next" pointer of each individual branch points; each 106 * branch starts with the operand node of a BRANCH node. 107 * 108 * BACK Normal "next" pointers all implicitly point forward; BACK 109 * exists to make loop structures possible. 110 * 111 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 112 * BRANCH structures using BACK. Simple cases (one character 113 * per match) are implemented with STAR and PLUS for speed 114 * and to minimize recursive plunges. 115 * 116 * OPEN,CLOSE ...are numbered at compile time. 117 */ 118 119 /* 120 * A node is one char of opcode followed by two chars of "next" pointer. 121 * "Next" pointers are stored as two 8-bit pieces, high order first. The 122 * value is a positive offset from the opcode of the node containing it. 123 * An operand, if any, simply follows the node. (Note that much of the 124 * code generation knows about this implicit relationship.) 125 * 126 * Using two bytes for the "next" pointer is vast overkill for most things, 127 * but allows patterns to get big without disasters. 128 */ 129 #define OP(p) (*(p)) 130 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 131 #define OPERAND(p) ((p) + 3) 132 133 /* 134 * See regmagic.h for one further detail of program structure. 135 */ 136 137 138 /* 139 * Utility definitions. 140 */ 141 #ifndef CHARBITS 142 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 143 #else 144 #define UCHARAT(p) ((int)*(p)&CHARBITS) 145 #endif 146 147 #define FAIL(m) { regerror(m); return(NULL); } 148 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 149 150 /* 151 * Flags to be passed up and down. 152 */ 153 #define HASWIDTH 01 /* Known never to match null string. */ 154 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 155 #define SPSTART 04 /* Starts with * or +. */ 156 #define WORST 0 /* Worst case. */ 157 158 /* 159 * Global work variables for regcomp(). 160 */ 161 static char *regparse; /* Input-scan pointer. */ 162 static int regnpar; /* () count. */ 163 static char regdummy; 164 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 165 static long regsize; /* Code size. */ 166 167 /* 168 * Forward declarations for regcomp()'s friends. 169 */ 170 #ifndef STATIC 171 #define STATIC static 172 #endif 173 STATIC char *reg(); 174 STATIC char *regbranch(); 175 STATIC char *regpiece(); 176 STATIC char *regatom(); 177 STATIC char *regnode(); 178 STATIC char *regnext(); 179 STATIC void regc(); 180 STATIC void reginsert(); 181 STATIC void regtail(); 182 STATIC void regoptail(); 183 #ifdef STRCSPN 184 STATIC int strcspn(); 185 #endif 186 187 /* 188 - regcomp - compile a regular expression into internal code 189 * 190 * We can't allocate space until we know how big the compiled form will be, 191 * but we can't compile it (and thus know how big it is) until we've got a 192 * place to put the code. So we cheat: we compile it twice, once with code 193 * generation turned off and size counting turned on, and once "for real". 194 * This also means that we don't allocate space until we are sure that the 195 * thing really will compile successfully, and we never have to move the 196 * code and thus invalidate pointers into it. (Note that it has to be in 197 * one piece because free() must be able to free it all.) 198 * 199 * Beware that the optimization-preparation code in here knows about some 200 * of the structure of the compiled regexp. 201 */ 202 regexp * 203 regcomp(exp) 204 const char *exp; 205 { 206 register regexp *r; 207 register char *scan; 208 register char *longest; 209 register int len; 210 int flags; 211 212 if (exp == NULL) 213 FAIL("NULL argument"); 214 215 /* First pass: determine size, legality. */ 216 if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ 217 regparse = (char *)exp; 218 regnpar = 1; 219 regsize = 0L; 220 regcode = ®dummy; 221 regc(MAGIC); 222 if (reg(0, &flags) == NULL) 223 return(NULL); 224 225 /* Small enough for pointer-storage convention? */ 226 if (regsize >= 32767L) /* Probably could be 65535L. */ 227 FAIL("regexp too big"); 228 229 /* Allocate space. */ 230 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 231 if (r == NULL) 232 FAIL("out of space"); 233 234 /* Second pass: emit code. */ 235 regparse = (char *)exp; 236 regnpar = 1; 237 regcode = r->program; 238 regc(MAGIC); 239 if (reg(0, &flags) == NULL) 240 return(NULL); 241 242 /* Dig out information for optimizations. */ 243 r->regstart = '\0'; /* Worst-case defaults. */ 244 r->reganch = 0; 245 r->regmust = NULL; 246 r->regmlen = 0; 247 scan = r->program+1; /* First BRANCH. */ 248 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 249 scan = OPERAND(scan); 250 251 /* Starting-point info. */ 252 if (OP(scan) == EXACTLY) 253 r->regstart = *OPERAND(scan); 254 else if (OP(scan) == BOL) 255 r->reganch++; 256 257 /* 258 * If there's something expensive in the r.e., find the 259 * longest literal string that must appear and make it the 260 * regmust. Resolve ties in favor of later strings, since 261 * the regstart check works with the beginning of the r.e. 262 * and avoiding duplication strengthens checking. Not a 263 * strong reason, but sufficient in the absence of others. 264 */ 265 if (flags&SPSTART) { 266 longest = NULL; 267 len = 0; 268 for (; scan != NULL; scan = regnext(scan)) 269 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 270 longest = OPERAND(scan); 271 len = strlen(OPERAND(scan)); 272 } 273 r->regmust = longest; 274 r->regmlen = len; 275 } 276 } 277 278 return(r); 279 } 280 281 /* 282 - reg - regular expression, i.e. main body or parenthesized thing 283 * 284 * Caller must absorb opening parenthesis. 285 * 286 * Combining parenthesis handling with the base level of regular expression 287 * is a trifle forced, but the need to tie the tails of the branches to what 288 * follows makes it hard to avoid. 289 */ 290 static char * 291 reg(paren, flagp) 292 int paren; /* Parenthesized? */ 293 int *flagp; 294 { 295 register char *ret; 296 register char *br; 297 register char *ender; 298 register int parno; 299 int flags; 300 301 *flagp = HASWIDTH; /* Tentatively. */ 302 303 /* Make an OPEN node, if parenthesized. */ 304 if (paren) { 305 if (regnpar >= NSUBEXP) 306 FAIL("too many ()"); 307 parno = regnpar; 308 regnpar++; 309 ret = regnode(OPEN+parno); 310 } else 311 ret = NULL; 312 313 /* Pick up the branches, linking them together. */ 314 br = regbranch(&flags); 315 if (br == NULL) 316 return(NULL); 317 if (ret != NULL) 318 regtail(ret, br); /* OPEN -> first. */ 319 else 320 ret = br; 321 if (!(flags&HASWIDTH)) 322 *flagp &= ~HASWIDTH; 323 *flagp |= flags&SPSTART; 324 while (*regparse == '|' || *regparse == '\n') { 325 regparse++; 326 br = regbranch(&flags); 327 if (br == NULL) 328 return(NULL); 329 regtail(ret, br); /* BRANCH -> BRANCH. */ 330 if (!(flags&HASWIDTH)) 331 *flagp &= ~HASWIDTH; 332 *flagp |= flags&SPSTART; 333 } 334 335 /* Make a closing node, and hook it on the end. */ 336 ender = regnode((paren) ? CLOSE+parno : END); 337 regtail(ret, ender); 338 339 /* Hook the tails of the branches to the closing node. */ 340 for (br = ret; br != NULL; br = regnext(br)) 341 regoptail(br, ender); 342 343 /* Check for proper termination. */ 344 if (paren && *regparse++ != ')') { 345 FAIL("unmatched ()"); 346 } else if (!paren && *regparse != '\0') { 347 if (*regparse == ')') { 348 FAIL("unmatched ()"); 349 } else 350 FAIL("junk on end"); /* "Can't happen". */ 351 /* NOTREACHED */ 352 } 353 354 return(ret); 355 } 356 357 /* 358 - regbranch - one alternative of an | operator 359 * 360 * Implements the concatenation operator. 361 */ 362 static char * 363 regbranch(flagp) 364 int *flagp; 365 { 366 register char *ret; 367 register char *chain; 368 register char *latest; 369 int flags; 370 371 *flagp = WORST; /* Tentatively. */ 372 373 ret = regnode(BRANCH); 374 chain = NULL; 375 while (*regparse != '\0' && *regparse != ')' && 376 *regparse != '\n' && *regparse != '|') { 377 latest = regpiece(&flags); 378 if (latest == NULL) 379 return(NULL); 380 *flagp |= flags&HASWIDTH; 381 if (chain == NULL) /* First piece. */ 382 *flagp |= flags&SPSTART; 383 else 384 regtail(chain, latest); 385 chain = latest; 386 } 387 if (chain == NULL) /* Loop ran zero times. */ 388 (void) regnode(NOTHING); 389 390 return(ret); 391 } 392 393 /* 394 - regpiece - something followed by possible [*+?] 395 * 396 * Note that the branching code sequences used for ? and the general cases 397 * of * and + are somewhat optimized: they use the same NOTHING node as 398 * both the endmarker for their branch list and the body of the last branch. 399 * It might seem that this node could be dispensed with entirely, but the 400 * endmarker role is not redundant. 401 */ 402 static char * 403 regpiece(flagp) 404 int *flagp; 405 { 406 register char *ret; 407 register char op; 408 register char *next; 409 int flags; 410 411 ret = regatom(&flags); 412 if (ret == NULL) 413 return(NULL); 414 415 op = *regparse; 416 if (!ISMULT(op)) { 417 *flagp = flags; 418 return(ret); 419 } 420 421 if (!(flags&HASWIDTH) && op != '?') 422 FAIL("*+ operand could be empty"); 423 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 424 425 if (op == '*' && (flags&SIMPLE)) 426 reginsert(STAR, ret); 427 else if (op == '*') { 428 /* Emit x* as (x&|), where & means "self". */ 429 reginsert(BRANCH, ret); /* Either x */ 430 regoptail(ret, regnode(BACK)); /* and loop */ 431 regoptail(ret, ret); /* back */ 432 regtail(ret, regnode(BRANCH)); /* or */ 433 regtail(ret, regnode(NOTHING)); /* null. */ 434 } else if (op == '+' && (flags&SIMPLE)) 435 reginsert(PLUS, ret); 436 else if (op == '+') { 437 /* Emit x+ as x(&|), where & means "self". */ 438 next = regnode(BRANCH); /* Either */ 439 regtail(ret, next); 440 regtail(regnode(BACK), ret); /* loop back */ 441 regtail(next, regnode(BRANCH)); /* or */ 442 regtail(ret, regnode(NOTHING)); /* null. */ 443 } else if (op == '?') { 444 /* Emit x? as (x|) */ 445 reginsert(BRANCH, ret); /* Either x */ 446 regtail(ret, regnode(BRANCH)); /* or */ 447 next = regnode(NOTHING); /* null. */ 448 regtail(ret, next); 449 regoptail(ret, next); 450 } 451 regparse++; 452 if (ISMULT(*regparse)) 453 FAIL("nested *?+"); 454 455 return(ret); 456 } 457 458 /* 459 - regatom - the lowest level 460 * 461 * Optimization: gobbles an entire sequence of ordinary characters so that 462 * it can turn them into a single node, which is smaller to store and 463 * faster to run. Backslashed characters are exceptions, each becoming a 464 * separate node; the code is simpler that way and it's not worth fixing. 465 */ 466 static char * 467 regatom(flagp) 468 int *flagp; 469 { 470 register char *ret; 471 int flags; 472 473 *flagp = WORST; /* Tentatively. */ 474 475 switch (*regparse++) { 476 /* FIXME: these chars only have meaning at beg/end of pat? */ 477 case '^': 478 ret = regnode(BOL); 479 break; 480 case '$': 481 ret = regnode(EOL); 482 break; 483 case '.': 484 ret = regnode(ANY); 485 *flagp |= HASWIDTH|SIMPLE; 486 break; 487 case '[': { 488 register int class; 489 register int classend; 490 491 if (*regparse == '^') { /* Complement of range. */ 492 ret = regnode(ANYBUT); 493 regparse++; 494 } else 495 ret = regnode(ANYOF); 496 if (*regparse == ']' || *regparse == '-') 497 regc(*regparse++); 498 while (*regparse != '\0' && *regparse != ']') { 499 if (*regparse == '-') { 500 regparse++; 501 if (*regparse == ']' || *regparse == '\0') 502 regc('-'); 503 else { 504 class = UCHARAT(regparse-2)+1; 505 classend = UCHARAT(regparse); 506 if (class > classend+1) 507 FAIL("invalid [] range"); 508 for (; class <= classend; class++) 509 regc(class); 510 regparse++; 511 } 512 } else 513 regc(*regparse++); 514 } 515 regc('\0'); 516 if (*regparse != ']') 517 FAIL("unmatched []"); 518 regparse++; 519 *flagp |= HASWIDTH|SIMPLE; 520 } 521 break; 522 case '(': 523 ret = reg(1, &flags); 524 if (ret == NULL) 525 return(NULL); 526 *flagp |= flags&(HASWIDTH|SPSTART); 527 break; 528 case '\0': 529 case '|': 530 case '\n': 531 case ')': 532 FAIL("internal urp"); /* Supposed to be caught earlier. */ 533 break; 534 case '?': 535 case '+': 536 case '*': 537 FAIL("?+* follows nothing"); 538 break; 539 case '\\': 540 switch (*regparse++) { 541 case '\0': 542 FAIL("trailing \\"); 543 break; 544 case '<': 545 ret = regnode(WORDA); 546 break; 547 case '>': 548 ret = regnode(WORDZ); 549 break; 550 /* FIXME: Someday handle \1, \2, ... */ 551 default: 552 /* Handle general quoted chars in exact-match routine */ 553 goto de_fault; 554 } 555 break; 556 de_fault: 557 default: 558 /* 559 * Encode a string of characters to be matched exactly. 560 * 561 * This is a bit tricky due to quoted chars and due to 562 * '*', '+', and '?' taking the SINGLE char previous 563 * as their operand. 564 * 565 * On entry, the char at regparse[-1] is going to go 566 * into the string, no matter what it is. (It could be 567 * following a \ if we are entered from the '\' case.) 568 * 569 * Basic idea is to pick up a good char in ch and 570 * examine the next char. If it's *+? then we twiddle. 571 * If it's \ then we frozzle. If it's other magic char 572 * we push ch and terminate the string. If none of the 573 * above, we push ch on the string and go around again. 574 * 575 * regprev is used to remember where "the current char" 576 * starts in the string, if due to a *+? we need to back 577 * up and put the current char in a separate, 1-char, string. 578 * When regprev is NULL, ch is the only char in the 579 * string; this is used in *+? handling, and in setting 580 * flags |= SIMPLE at the end. 581 */ 582 { 583 char *regprev; 584 register char ch; 585 586 regparse--; /* Look at cur char */ 587 ret = regnode(EXACTLY); 588 for ( regprev = 0 ; ; ) { 589 ch = *regparse++; /* Get current char */ 590 switch (*regparse) { /* look at next one */ 591 592 default: 593 regc(ch); /* Add cur to string */ 594 break; 595 596 case '.': case '[': case '(': 597 case ')': case '|': case '\n': 598 case '$': case '^': 599 case '\0': 600 /* FIXME, $ and ^ should not always be magic */ 601 magic: 602 regc(ch); /* dump cur char */ 603 goto done; /* and we are done */ 604 605 case '?': case '+': case '*': 606 if (!regprev) /* If just ch in str, */ 607 goto magic; /* use it */ 608 /* End mult-char string one early */ 609 regparse = regprev; /* Back up parse */ 610 goto done; 611 612 case '\\': 613 regc(ch); /* Cur char OK */ 614 switch (regparse[1]){ /* Look after \ */ 615 case '\0': 616 case '<': 617 case '>': 618 /* FIXME: Someday handle \1, \2, ... */ 619 goto done; /* Not quoted */ 620 default: 621 /* Backup point is \, scan * point is after it. */ 622 regprev = regparse; 623 regparse++; 624 continue; /* NOT break; */ 625 } 626 } 627 regprev = regparse; /* Set backup point */ 628 } 629 done: 630 regc('\0'); 631 *flagp |= HASWIDTH; 632 if (!regprev) /* One char? */ 633 *flagp |= SIMPLE; 634 } 635 break; 636 } 637 638 return(ret); 639 } 640 641 /* 642 - regnode - emit a node 643 */ 644 static char * /* Location. */ 645 regnode(op) 646 char op; 647 { 648 register char *ret; 649 register char *ptr; 650 651 ret = regcode; 652 if (ret == ®dummy) { 653 regsize += 3; 654 return(ret); 655 } 656 657 ptr = ret; 658 *ptr++ = op; 659 *ptr++ = '\0'; /* Null "next" pointer. */ 660 *ptr++ = '\0'; 661 regcode = ptr; 662 663 return(ret); 664 } 665 666 /* 667 - regc - emit (if appropriate) a byte of code 668 */ 669 static void 670 regc(b) 671 char b; 672 { 673 if (regcode != ®dummy) 674 *regcode++ = b; 675 else 676 regsize++; 677 } 678 679 /* 680 - reginsert - insert an operator in front of already-emitted operand 681 * 682 * Means relocating the operand. 683 */ 684 static void 685 reginsert(op, opnd) 686 char op; 687 char *opnd; 688 { 689 register char *src; 690 register char *dst; 691 register char *place; 692 693 if (regcode == ®dummy) { 694 regsize += 3; 695 return; 696 } 697 698 src = regcode; 699 regcode += 3; 700 dst = regcode; 701 while (src > opnd) 702 *--dst = *--src; 703 704 place = opnd; /* Op node, where operand used to be. */ 705 *place++ = op; 706 *place++ = '\0'; 707 *place++ = '\0'; 708 } 709 710 /* 711 - regtail - set the next-pointer at the end of a node chain 712 */ 713 static void 714 regtail(p, val) 715 char *p; 716 char *val; 717 { 718 register char *scan; 719 register char *temp; 720 register int offset; 721 722 if (p == ®dummy) 723 return; 724 725 /* Find last node. */ 726 scan = p; 727 for (;;) { 728 temp = regnext(scan); 729 if (temp == NULL) 730 break; 731 scan = temp; 732 } 733 734 if (OP(scan) == BACK) 735 offset = scan - val; 736 else 737 offset = val - scan; 738 *(scan+1) = (offset>>8)&0377; 739 *(scan+2) = offset&0377; 740 } 741 742 /* 743 - regoptail - regtail on operand of first argument; nop if operandless 744 */ 745 static void 746 regoptail(p, val) 747 char *p; 748 char *val; 749 { 750 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 751 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 752 return; 753 regtail(OPERAND(p), val); 754 } 755 756 /* 757 * regexec and friends 758 */ 759 760 /* 761 * Global work variables for regexec(). 762 */ 763 static char *reginput; /* String-input pointer. */ 764 static char *regbol; /* Beginning of input, for ^ check. */ 765 static char **regstartp; /* Pointer to startp array. */ 766 static char **regendp; /* Ditto for endp. */ 767 768 /* 769 * Forwards. 770 */ 771 STATIC int regtry(); 772 STATIC int regmatch(); 773 STATIC int regrepeat(); 774 775 #ifdef DEBUG 776 int regnarrate = 0; 777 void regdump(); 778 STATIC char *regprop(); 779 #endif 780 781 /* 782 - regexec - match a regexp against a string 783 */ 784 int 785 regexec(prog, string) 786 register const regexp *prog; 787 register const char *string; 788 { 789 register char *s; 790 extern char *strchr(); 791 792 /* Be paranoid... */ 793 if (prog == NULL || string == NULL) { 794 regerror("NULL parameter"); 795 return(0); 796 } 797 798 /* Check validity of program. */ 799 if (UCHARAT(prog->program) != MAGIC) { 800 regerror("corrupted program"); 801 return(0); 802 } 803 804 /* If there is a "must appear" string, look for it. */ 805 if (prog->regmust != NULL) { 806 s = (char *)string; 807 while ((s = strchr(s, prog->regmust[0])) != NULL) { 808 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 809 break; /* Found it. */ 810 s++; 811 } 812 if (s == NULL) /* Not present. */ 813 return(0); 814 } 815 816 /* Mark beginning of line for ^ . */ 817 regbol = (char *)string; 818 819 /* Simplest case: anchored match need be tried only once. */ 820 if (prog->reganch) 821 return(regtry(prog, string)); 822 823 /* Messy cases: unanchored match. */ 824 s = (char *)string; 825 if (prog->regstart != '\0') 826 /* We know what char it must start with. */ 827 while ((s = strchr(s, prog->regstart)) != NULL) { 828 if (regtry(prog, s)) 829 return(1); 830 s++; 831 } 832 else 833 /* We don't -- general case. */ 834 do { 835 if (regtry(prog, s)) 836 return(1); 837 } while (*s++ != '\0'); 838 839 /* Failure. */ 840 return(0); 841 } 842 843 /* 844 - regtry - try match at specific point 845 */ 846 static int /* 0 failure, 1 success */ 847 regtry(prog, string) 848 regexp *prog; 849 char *string; 850 { 851 register int i; 852 register char **sp; 853 register char **ep; 854 855 reginput = string; 856 regstartp = prog->startp; 857 regendp = prog->endp; 858 859 sp = prog->startp; 860 ep = prog->endp; 861 for (i = NSUBEXP; i > 0; i--) { 862 *sp++ = NULL; 863 *ep++ = NULL; 864 } 865 if (regmatch(prog->program + 1)) { 866 prog->startp[0] = string; 867 prog->endp[0] = reginput; 868 return(1); 869 } else 870 return(0); 871 } 872 873 /* 874 - regmatch - main matching routine 875 * 876 * Conceptually the strategy is simple: check to see whether the current 877 * node matches, call self recursively to see whether the rest matches, 878 * and then act accordingly. In practice we make some effort to avoid 879 * recursion, in particular by going through "ordinary" nodes (that don't 880 * need to know whether the rest of the match failed) by a loop instead of 881 * by recursion. 882 */ 883 static int /* 0 failure, 1 success */ 884 regmatch(prog) 885 char *prog; 886 { 887 register char *scan; /* Current node. */ 888 char *next; /* Next node. */ 889 extern char *strchr(); 890 891 scan = prog; 892 #ifdef DEBUG 893 if (scan != NULL && regnarrate) 894 fprintf(stderr, "%s(\n", regprop(scan)); 895 #endif 896 while (scan != NULL) { 897 #ifdef DEBUG 898 if (regnarrate) 899 fprintf(stderr, "%s...\n", regprop(scan)); 900 #endif 901 next = regnext(scan); 902 903 switch (OP(scan)) { 904 case BOL: 905 if (reginput != regbol) 906 return(0); 907 break; 908 case EOL: 909 if (*reginput != '\0') 910 return(0); 911 break; 912 case WORDA: 913 /* Must be looking at a letter, digit, or _ */ 914 if ((!isalnum(*reginput)) && *reginput != '_') 915 return(0); 916 /* Prev must be BOL or nonword */ 917 if (reginput > regbol && 918 (isalnum(reginput[-1]) || reginput[-1] == '_')) 919 return(0); 920 break; 921 case WORDZ: 922 /* Must be looking at non letter, digit, or _ */ 923 if (isalnum(*reginput) || *reginput == '_') 924 return(0); 925 /* We don't care what the previous char was */ 926 break; 927 case ANY: 928 if (*reginput == '\0') 929 return(0); 930 reginput++; 931 break; 932 case EXACTLY: { 933 register int len; 934 register char *opnd; 935 936 opnd = OPERAND(scan); 937 /* Inline the first character, for speed. */ 938 if (*opnd != *reginput) 939 return(0); 940 len = strlen(opnd); 941 if (len > 1 && strncmp(opnd, reginput, len) != 0) 942 return(0); 943 reginput += len; 944 } 945 break; 946 case ANYOF: 947 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 948 return(0); 949 reginput++; 950 break; 951 case ANYBUT: 952 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 953 return(0); 954 reginput++; 955 break; 956 case NOTHING: 957 break; 958 case BACK: 959 break; 960 case OPEN+1: 961 case OPEN+2: 962 case OPEN+3: 963 case OPEN+4: 964 case OPEN+5: 965 case OPEN+6: 966 case OPEN+7: 967 case OPEN+8: 968 case OPEN+9: { 969 register int no; 970 register char *save; 971 972 no = OP(scan) - OPEN; 973 save = reginput; 974 975 if (regmatch(next)) { 976 /* 977 * Don't set startp if some later 978 * invocation of the same parentheses 979 * already has. 980 */ 981 if (regstartp[no] == NULL) 982 regstartp[no] = save; 983 return(1); 984 } else 985 return(0); 986 } 987 break; 988 case CLOSE+1: 989 case CLOSE+2: 990 case CLOSE+3: 991 case CLOSE+4: 992 case CLOSE+5: 993 case CLOSE+6: 994 case CLOSE+7: 995 case CLOSE+8: 996 case CLOSE+9: { 997 register int no; 998 register char *save; 999 1000 no = OP(scan) - CLOSE; 1001 save = reginput; 1002 1003 if (regmatch(next)) { 1004 /* 1005 * Don't set endp if some later 1006 * invocation of the same parentheses 1007 * already has. 1008 */ 1009 if (regendp[no] == NULL) 1010 regendp[no] = save; 1011 return(1); 1012 } else 1013 return(0); 1014 } 1015 break; 1016 case BRANCH: { 1017 register char *save; 1018 1019 if (OP(next) != BRANCH) /* No choice. */ 1020 next = OPERAND(scan); /* Avoid recursion. */ 1021 else { 1022 do { 1023 save = reginput; 1024 if (regmatch(OPERAND(scan))) 1025 return(1); 1026 reginput = save; 1027 scan = regnext(scan); 1028 } while (scan != NULL && OP(scan) == BRANCH); 1029 return(0); 1030 /* NOTREACHED */ 1031 } 1032 } 1033 break; 1034 case STAR: 1035 case PLUS: { 1036 register char nextch; 1037 register int no; 1038 register char *save; 1039 register int min; 1040 1041 /* 1042 * Lookahead to avoid useless match attempts 1043 * when we know what character comes next. 1044 */ 1045 nextch = '\0'; 1046 if (OP(next) == EXACTLY) 1047 nextch = *OPERAND(next); 1048 min = (OP(scan) == STAR) ? 0 : 1; 1049 save = reginput; 1050 no = regrepeat(OPERAND(scan)); 1051 while (no >= min) { 1052 /* If it could work, try it. */ 1053 if (nextch == '\0' || *reginput == nextch) 1054 if (regmatch(next)) 1055 return(1); 1056 /* Couldn't or didn't -- back up. */ 1057 no--; 1058 reginput = save + no; 1059 } 1060 return(0); 1061 } 1062 break; 1063 case END: 1064 return(1); /* Success! */ 1065 break; 1066 default: 1067 regerror("memory corruption"); 1068 return(0); 1069 break; 1070 } 1071 1072 scan = next; 1073 } 1074 1075 /* 1076 * We get here only if there's trouble -- normally "case END" is 1077 * the terminating point. 1078 */ 1079 regerror("corrupted pointers"); 1080 return(0); 1081 } 1082 1083 /* 1084 - regrepeat - repeatedly match something simple, report how many 1085 */ 1086 static int 1087 regrepeat(p) 1088 char *p; 1089 { 1090 register int count = 0; 1091 register char *scan; 1092 register char *opnd; 1093 1094 scan = reginput; 1095 opnd = OPERAND(p); 1096 switch (OP(p)) { 1097 case ANY: 1098 count = strlen(scan); 1099 scan += count; 1100 break; 1101 case EXACTLY: 1102 while (*opnd == *scan) { 1103 count++; 1104 scan++; 1105 } 1106 break; 1107 case ANYOF: 1108 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1109 count++; 1110 scan++; 1111 } 1112 break; 1113 case ANYBUT: 1114 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1115 count++; 1116 scan++; 1117 } 1118 break; 1119 default: /* Oh dear. Called inappropriately. */ 1120 regerror("internal foulup"); 1121 count = 0; /* Best compromise. */ 1122 break; 1123 } 1124 reginput = scan; 1125 1126 return(count); 1127 } 1128 1129 /* 1130 - regnext - dig the "next" pointer out of a node 1131 */ 1132 static char * 1133 regnext(p) 1134 register char *p; 1135 { 1136 register int offset; 1137 1138 if (p == ®dummy) 1139 return(NULL); 1140 1141 offset = NEXT(p); 1142 if (offset == 0) 1143 return(NULL); 1144 1145 if (OP(p) == BACK) 1146 return(p-offset); 1147 else 1148 return(p+offset); 1149 } 1150 1151 #ifdef DEBUG 1152 1153 STATIC char *regprop(); 1154 1155 /* 1156 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1157 */ 1158 void 1159 regdump(r) 1160 regexp *r; 1161 { 1162 register char *s; 1163 register char op = EXACTLY; /* Arbitrary non-END op. */ 1164 register char *next; 1165 extern char *strchr(); 1166 1167 1168 s = r->program + 1; 1169 while (op != END) { /* While that wasn't END last time... */ 1170 op = OP(s); 1171 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1172 next = regnext(s); 1173 if (next == NULL) /* Next ptr. */ 1174 printf("(0)"); 1175 else 1176 printf("(%d)", (s-r->program)+(next-s)); 1177 s += 3; 1178 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1179 /* Literal string, where present. */ 1180 while (*s != '\0') { 1181 putchar(*s); 1182 s++; 1183 } 1184 s++; 1185 } 1186 putchar('\n'); 1187 } 1188 1189 /* Header fields of interest. */ 1190 if (r->regstart != '\0') 1191 printf("start `%c' ", r->regstart); 1192 if (r->reganch) 1193 printf("anchored "); 1194 if (r->regmust != NULL) 1195 printf("must have \"%s\"", r->regmust); 1196 printf("\n"); 1197 } 1198 1199 /* 1200 - regprop - printable representation of opcode 1201 */ 1202 static char * 1203 regprop(op) 1204 char *op; 1205 { 1206 register char *p; 1207 static char buf[50]; 1208 1209 (void) strcpy(buf, ":"); 1210 1211 switch (OP(op)) { 1212 case BOL: 1213 p = "BOL"; 1214 break; 1215 case EOL: 1216 p = "EOL"; 1217 break; 1218 case ANY: 1219 p = "ANY"; 1220 break; 1221 case ANYOF: 1222 p = "ANYOF"; 1223 break; 1224 case ANYBUT: 1225 p = "ANYBUT"; 1226 break; 1227 case BRANCH: 1228 p = "BRANCH"; 1229 break; 1230 case EXACTLY: 1231 p = "EXACTLY"; 1232 break; 1233 case NOTHING: 1234 p = "NOTHING"; 1235 break; 1236 case BACK: 1237 p = "BACK"; 1238 break; 1239 case END: 1240 p = "END"; 1241 break; 1242 case OPEN+1: 1243 case OPEN+2: 1244 case OPEN+3: 1245 case OPEN+4: 1246 case OPEN+5: 1247 case OPEN+6: 1248 case OPEN+7: 1249 case OPEN+8: 1250 case OPEN+9: 1251 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1252 p = NULL; 1253 break; 1254 case CLOSE+1: 1255 case CLOSE+2: 1256 case CLOSE+3: 1257 case CLOSE+4: 1258 case CLOSE+5: 1259 case CLOSE+6: 1260 case CLOSE+7: 1261 case CLOSE+8: 1262 case CLOSE+9: 1263 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1264 p = NULL; 1265 break; 1266 case STAR: 1267 p = "STAR"; 1268 break; 1269 case PLUS: 1270 p = "PLUS"; 1271 break; 1272 case WORDA: 1273 p = "WORDA"; 1274 break; 1275 case WORDZ: 1276 p = "WORDZ"; 1277 break; 1278 default: 1279 regerror("corrupted opcode"); 1280 break; 1281 } 1282 if (p != NULL) 1283 (void) strcat(buf, p); 1284 return(buf); 1285 } 1286 #endif 1287 1288 /* 1289 * The following is provided for those people who do not have strcspn() in 1290 * their C libraries. They should get off their butts and do something 1291 * about it; at least one public-domain implementation of those (highly 1292 * useful) string routines has been published on Usenet. 1293 */ 1294 #ifdef STRCSPN 1295 /* 1296 * strcspn - find length of initial segment of s1 consisting entirely 1297 * of characters not from s2 1298 */ 1299 1300 static int 1301 strcspn(s1, s2) 1302 char *s1; 1303 char *s2; 1304 { 1305 register char *scan1; 1306 register char *scan2; 1307 register int count; 1308 1309 count = 0; 1310 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1311 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1312 if (*scan1 == *scan2++) 1313 return(count); 1314 count++; 1315 } 1316 return(count); 1317 } 1318 #endif 1319