1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22 *** to assist in implementing egrep. 23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25 *** as in BSD grep and ex. 26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30 * 31 * Beware that some of this code is subtly aware of the way operator 32 * precedence is structured in regular expressions. Serious changes in 33 * regular-expression syntax might require a total rethink. 34 */ 35 36 #include <sys/cdefs.h> 37 #if defined(LIBC_SCCS) && !defined(lint) 38 __RCSID("$NetBSD: regexp.c,v 1.15 2002/11/11 23:43:03 thorpej Exp $"); 39 #endif /* LIBC_SCCS and not lint */ 40 41 #include <ctype.h> 42 #include <regexp.h> 43 #include <stdio.h> 44 #include <stdlib.h> 45 #include <string.h> 46 #include "regmagic.h" 47 48 /* 49 * The "internal use only" fields in regexp.h are present to pass info from 50 * compile to execute that permits the execute phase to run lots faster on 51 * simple cases. They are: 52 * 53 * regstart char that must begin a match; '\0' if none obvious 54 * reganch is the match anchored (at beginning-of-line only)? 55 * regmust string (pointer into program) that match must include, or NULL 56 * regmlen length of regmust string 57 * 58 * Regstart and reganch permit very fast decisions on suitable starting points 59 * for a match, cutting down the work a lot. Regmust permits fast rejection 60 * of lines that cannot possibly match. The regmust tests are costly enough 61 * that regcomp() supplies a regmust only if the r.e. contains something 62 * potentially expensive (at present, the only such thing detected is * or + 63 * at the start of the r.e., which can involve a lot of backup). Regmlen is 64 * supplied because the test in regexec() needs it and regcomp() is computing 65 * it anyway. 66 */ 67 68 /* 69 * Structure for regexp "program". This is essentially a linear encoding 70 * of a nondeterministic finite-state machine (aka syntax charts or 71 * "railroad normal form" in parsing technology). Each node is an opcode 72 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 73 * all nodes except BRANCH implement concatenation; a "next" pointer with 74 * a BRANCH on both ends of it is connecting two alternatives. (Here we 75 * have one of the subtle syntax dependencies: an individual BRANCH (as 76 * opposed to a collection of them) is never concatenated with anything 77 * because of operator precedence.) The operand of some types of node is 78 * a literal string; for others, it is a node leading into a sub-FSM. In 79 * particular, the operand of a BRANCH node is the first node of the branch. 80 * (NB this is *not* a tree structure: the tail of the branch connects 81 * to the thing following the set of BRANCHes.) The opcodes are: 82 */ 83 84 /* definition number opnd? meaning */ 85 #define END 0 /* no End of program. */ 86 #define BOL 1 /* no Match "" at beginning of line. */ 87 #define EOL 2 /* no Match "" at end of line. */ 88 #define ANY 3 /* no Match any one character. */ 89 #define ANYOF 4 /* str Match any character in this string. */ 90 #define ANYBUT 5 /* str Match any character not in this string. */ 91 #define BRANCH 6 /* node Match this alternative, or the next... */ 92 #define BACK 7 /* no Match "", "next" ptr points backward. */ 93 #define EXACTLY 8 /* str Match this string. */ 94 #define NOTHING 9 /* no Match empty string. */ 95 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 96 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 97 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 98 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 99 #define OPEN 20 /* no Mark this point in input as start of #n. */ 100 /* OPEN+1 is number 1, etc. */ 101 #define CLOSE 30 /* no Analogous to OPEN. */ 102 103 /* 104 * Opcode notes: 105 * 106 * BRANCH The set of branches constituting a single choice are hooked 107 * together with their "next" pointers, since precedence prevents 108 * anything being concatenated to any individual branch. The 109 * "next" pointer of the last BRANCH in a choice points to the 110 * thing following the whole choice. This is also where the 111 * final "next" pointer of each individual branch points; each 112 * branch starts with the operand node of a BRANCH node. 113 * 114 * BACK Normal "next" pointers all implicitly point forward; BACK 115 * exists to make loop structures possible. 116 * 117 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 118 * BRANCH structures using BACK. Simple cases (one character 119 * per match) are implemented with STAR and PLUS for speed 120 * and to minimize recursive plunges. 121 * 122 * OPEN,CLOSE ...are numbered at compile time. 123 */ 124 125 /* 126 * A node is one char of opcode followed by two chars of "next" pointer. 127 * "Next" pointers are stored as two 8-bit pieces, high order first. The 128 * value is a positive offset from the opcode of the node containing it. 129 * An operand, if any, simply follows the node. (Note that much of the 130 * code generation knows about this implicit relationship.) 131 * 132 * Using two bytes for the "next" pointer is vast overkill for most things, 133 * but allows patterns to get big without disasters. 134 */ 135 #define OP(p) (*(p)) 136 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 137 #define OPERAND(p) ((p) + 3) 138 139 /* 140 * See regmagic.h for one further detail of program structure. 141 */ 142 143 144 /* 145 * Utility definitions. 146 */ 147 #ifndef CHARBITS 148 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 149 #else 150 #define UCHARAT(p) ((int)*(p)&CHARBITS) 151 #endif 152 153 #define FAIL(m) { regerror(m); return(NULL); } 154 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 155 156 /* 157 * Flags to be passed up and down. 158 */ 159 #define HASWIDTH 01 /* Known never to match null string. */ 160 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 161 #define SPSTART 04 /* Starts with * or +. */ 162 #define WORST 0 /* Worst case. */ 163 164 /* 165 * Global work variables for regcomp(). 166 */ 167 static char *regparse; /* Input-scan pointer. */ 168 static int regnpar; /* () count. */ 169 static char regdummy; 170 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 171 static long regsize; /* Code size. */ 172 173 /* 174 * Forward declarations for regcomp()'s friends. 175 */ 176 #ifndef STATIC 177 #define STATIC static 178 #endif 179 STATIC char *reg __P((int, int *)); 180 STATIC char *regbranch __P((int *)); 181 STATIC char *regpiece __P((int *)); 182 STATIC char *regatom __P((int *)); 183 STATIC char *regnode __P((int)); 184 STATIC char *regnext __P((char *)); 185 STATIC void regc __P((int)); 186 STATIC void reginsert __P((int, char *)); 187 STATIC void regtail __P((char *, char *)); 188 STATIC void regoptail __P((char *, char *)); 189 #ifdef STRCSPN 190 STATIC int strcspn __P((char *, char *)); 191 #endif 192 193 /* 194 - regcomp - compile a regular expression into internal code 195 * 196 * We can't allocate space until we know how big the compiled form will be, 197 * but we can't compile it (and thus know how big it is) until we've got a 198 * place to put the code. So we cheat: we compile it twice, once with code 199 * generation turned off and size counting turned on, and once "for real". 200 * This also means that we don't allocate space until we are sure that the 201 * thing really will compile successfully, and we never have to move the 202 * code and thus invalidate pointers into it. (Note that it has to be in 203 * one piece because free() must be able to free it all.) 204 * 205 * Beware that the optimization-preparation code in here knows about some 206 * of the structure of the compiled regexp. 207 */ 208 regexp * 209 __compat_regcomp(expn) 210 const char *expn; 211 { 212 regexp *r; 213 char *scan; 214 char *longest; 215 int len; 216 int flags; 217 218 if (expn == NULL) 219 FAIL("NULL argument"); 220 221 /* First pass: determine size, legality. */ 222 #ifdef notdef 223 if (expn[0] == '.' && expn[1] == '*') expn += 2; /* aid grep */ 224 #endif 225 /* LINTED const castaway */ 226 regparse = (char *)expn; 227 regnpar = 1; 228 regsize = 0L; 229 regcode = ®dummy; 230 regc(MAGIC); 231 if (reg(0, &flags) == NULL) 232 return(NULL); 233 234 /* Small enough for pointer-storage convention? */ 235 if (regsize >= 32767L) /* Probably could be 65535L. */ 236 FAIL("regexp too big"); 237 238 /* Allocate space. */ 239 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 240 if (r == NULL) 241 FAIL("out of space"); 242 243 /* Second pass: emit code. */ 244 /* LINTED const castaway */ 245 regparse = (char *)expn; 246 regnpar = 1; 247 regcode = r->program; 248 regc(MAGIC); 249 if (reg(0, &flags) == NULL) 250 return(NULL); 251 252 /* Dig out information for optimizations. */ 253 r->regstart = '\0'; /* Worst-case defaults. */ 254 r->reganch = 0; 255 r->regmust = NULL; 256 r->regmlen = 0; 257 scan = r->program+1; /* First BRANCH. */ 258 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 259 scan = OPERAND(scan); 260 261 /* Starting-point info. */ 262 if (OP(scan) == EXACTLY) 263 r->regstart = *OPERAND(scan); 264 else if (OP(scan) == BOL) 265 r->reganch++; 266 267 /* 268 * If there's something expensive in the r.e., find the 269 * longest literal string that must appear and make it the 270 * regmust. Resolve ties in favor of later strings, since 271 * the regstart check works with the beginning of the r.e. 272 * and avoiding duplication strengthens checking. Not a 273 * strong reason, but sufficient in the absence of others. 274 */ 275 if (flags&SPSTART) { 276 longest = NULL; 277 len = 0; 278 for (; scan != NULL; scan = regnext(scan)) 279 if (OP(scan) == EXACTLY && (int) strlen(OPERAND(scan)) >= len) { 280 longest = OPERAND(scan); 281 len = strlen(OPERAND(scan)); 282 } 283 r->regmust = longest; 284 r->regmlen = len; 285 } 286 } 287 288 return(r); 289 } 290 291 /* 292 - reg - regular expression, i.e. main body or parenthesized thing 293 * 294 * Caller must absorb opening parenthesis. 295 * 296 * Combining parenthesis handling with the base level of regular expression 297 * is a trifle forced, but the need to tie the tails of the branches to what 298 * follows makes it hard to avoid. 299 */ 300 static char * 301 reg(paren, flagp) 302 int paren; /* Parenthesized? */ 303 int *flagp; 304 { 305 char *ret; 306 char *br; 307 char *ender; 308 int parno = 0; 309 int flags; 310 311 *flagp = HASWIDTH; /* Tentatively. */ 312 313 /* Make an OPEN node, if parenthesized. */ 314 if (paren) { 315 if (regnpar >= NSUBEXP) 316 FAIL("too many ()"); 317 parno = regnpar; 318 regnpar++; 319 ret = regnode(OPEN+parno); 320 } else 321 ret = NULL; 322 323 /* Pick up the branches, linking them together. */ 324 br = regbranch(&flags); 325 if (br == NULL) 326 return(NULL); 327 if (ret != NULL) 328 regtail(ret, br); /* OPEN -> first. */ 329 else 330 ret = br; 331 if (!(flags&HASWIDTH)) 332 *flagp &= ~HASWIDTH; 333 *flagp |= flags&SPSTART; 334 while (*regparse == '|' || *regparse == '\n') { 335 regparse++; 336 br = regbranch(&flags); 337 if (br == NULL) 338 return(NULL); 339 regtail(ret, br); /* BRANCH -> BRANCH. */ 340 if (!(flags&HASWIDTH)) 341 *flagp &= ~HASWIDTH; 342 *flagp |= flags&SPSTART; 343 } 344 345 /* Make a closing node, and hook it on the end. */ 346 ender = regnode((paren) ? CLOSE+parno : END); 347 regtail(ret, ender); 348 349 /* Hook the tails of the branches to the closing node. */ 350 for (br = ret; br != NULL; br = regnext(br)) 351 regoptail(br, ender); 352 353 /* Check for proper termination. */ 354 if (paren && *regparse++ != ')') { 355 FAIL("unmatched ()"); 356 } else if (!paren && *regparse != '\0') { 357 if (*regparse == ')') { 358 FAIL("unmatched ()"); 359 } else 360 FAIL("junk on end"); /* "Can't happen". */ 361 /* NOTREACHED */ 362 } 363 364 return(ret); 365 } 366 367 /* 368 - regbranch - one alternative of an | operator 369 * 370 * Implements the concatenation operator. 371 */ 372 static char * 373 regbranch(flagp) 374 int *flagp; 375 { 376 char *ret; 377 char *chain; 378 char *latest; 379 int flags; 380 381 *flagp = WORST; /* Tentatively. */ 382 383 ret = regnode(BRANCH); 384 chain = NULL; 385 while (*regparse != '\0' && *regparse != ')' && 386 *regparse != '\n' && *regparse != '|') { 387 latest = regpiece(&flags); 388 if (latest == NULL) 389 return(NULL); 390 *flagp |= flags&HASWIDTH; 391 if (chain == NULL) /* First piece. */ 392 *flagp |= flags&SPSTART; 393 else 394 regtail(chain, latest); 395 chain = latest; 396 } 397 if (chain == NULL) /* Loop ran zero times. */ 398 (void) regnode(NOTHING); 399 400 return(ret); 401 } 402 403 /* 404 - regpiece - something followed by possible [*+?] 405 * 406 * Note that the branching code sequences used for ? and the general cases 407 * of * and + are somewhat optimized: they use the same NOTHING node as 408 * both the endmarker for their branch list and the body of the last branch. 409 * It might seem that this node could be dispensed with entirely, but the 410 * endmarker role is not redundant. 411 */ 412 static char * 413 regpiece(flagp) 414 int *flagp; 415 { 416 char *ret; 417 char op; 418 char *next; 419 int flags; 420 421 ret = regatom(&flags); 422 if (ret == NULL) 423 return(NULL); 424 425 op = *regparse; 426 if (!ISMULT(op)) { 427 *flagp = flags; 428 return(ret); 429 } 430 431 if (!(flags&HASWIDTH) && op != '?') 432 FAIL("*+ operand could be empty"); 433 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 434 435 if (op == '*' && (flags&SIMPLE)) 436 reginsert(STAR, ret); 437 else if (op == '*') { 438 /* Emit x* as (x&|), where & means "self". */ 439 reginsert(BRANCH, ret); /* Either x */ 440 regoptail(ret, regnode(BACK)); /* and loop */ 441 regoptail(ret, ret); /* back */ 442 regtail(ret, regnode(BRANCH)); /* or */ 443 regtail(ret, regnode(NOTHING)); /* null. */ 444 } else if (op == '+' && (flags&SIMPLE)) 445 reginsert(PLUS, ret); 446 else if (op == '+') { 447 /* Emit x+ as x(&|), where & means "self". */ 448 next = regnode(BRANCH); /* Either */ 449 regtail(ret, next); 450 regtail(regnode(BACK), ret); /* loop back */ 451 regtail(next, regnode(BRANCH)); /* or */ 452 regtail(ret, regnode(NOTHING)); /* null. */ 453 } else if (op == '?') { 454 /* Emit x? as (x|) */ 455 reginsert(BRANCH, ret); /* Either x */ 456 regtail(ret, regnode(BRANCH)); /* or */ 457 next = regnode(NOTHING); /* null. */ 458 regtail(ret, next); 459 regoptail(ret, next); 460 } 461 regparse++; 462 if (ISMULT(*regparse)) 463 FAIL("nested *?+"); 464 465 return(ret); 466 } 467 468 /* 469 - regatom - the lowest level 470 * 471 * Optimization: gobbles an entire sequence of ordinary characters so that 472 * it can turn them into a single node, which is smaller to store and 473 * faster to run. Backslashed characters are exceptions, each becoming a 474 * separate node; the code is simpler that way and it's not worth fixing. 475 */ 476 static char * 477 regatom(flagp) 478 int *flagp; 479 { 480 char *ret; 481 int flags; 482 483 *flagp = WORST; /* Tentatively. */ 484 485 switch (*regparse++) { 486 /* FIXME: these chars only have meaning at beg/end of pat? */ 487 case '^': 488 ret = regnode(BOL); 489 break; 490 case '$': 491 ret = regnode(EOL); 492 break; 493 case '.': 494 ret = regnode(ANY); 495 *flagp |= HASWIDTH|SIMPLE; 496 break; 497 case '[': { 498 int class; 499 int classend; 500 501 if (*regparse == '^') { /* Complement of range. */ 502 ret = regnode(ANYBUT); 503 regparse++; 504 } else 505 ret = regnode(ANYOF); 506 if (*regparse == ']' || *regparse == '-') 507 regc(*regparse++); 508 while (*regparse != '\0' && *regparse != ']') { 509 if (*regparse == '-') { 510 regparse++; 511 if (*regparse == ']' || *regparse == '\0') 512 regc('-'); 513 else { 514 class = UCHARAT(regparse-2)+1; 515 classend = UCHARAT(regparse); 516 if (class > classend+1) 517 FAIL("invalid [] range"); 518 for (; class <= classend; class++) 519 regc(class); 520 regparse++; 521 } 522 } else 523 regc(*regparse++); 524 } 525 regc('\0'); 526 if (*regparse != ']') 527 FAIL("unmatched []"); 528 regparse++; 529 *flagp |= HASWIDTH|SIMPLE; 530 } 531 break; 532 case '(': 533 ret = reg(1, &flags); 534 if (ret == NULL) 535 return(NULL); 536 *flagp |= flags&(HASWIDTH|SPSTART); 537 break; 538 case '\0': 539 case '|': 540 case '\n': 541 case ')': 542 FAIL("internal urp"); /* Supposed to be caught earlier. */ 543 case '?': 544 case '+': 545 case '*': 546 FAIL("?+* follows nothing"); 547 case '\\': 548 switch (*regparse++) { 549 case '\0': 550 FAIL("trailing \\"); 551 case '<': 552 ret = regnode(WORDA); 553 break; 554 case '>': 555 ret = regnode(WORDZ); 556 break; 557 /* FIXME: Someday handle \1, \2, ... */ 558 default: 559 /* Handle general quoted chars in exact-match routine */ 560 goto de_fault; 561 } 562 break; 563 de_fault: 564 /*FALLTHROUGH*/ 565 default: 566 /* 567 * Encode a string of characters to be matched exactly. 568 * 569 * This is a bit tricky due to quoted chars and due to 570 * '*', '+', and '?' taking the SINGLE char previous 571 * as their operand. 572 * 573 * On entry, the char at regparse[-1] is going to go 574 * into the string, no matter what it is. (It could be 575 * following a \ if we are entered from the '\' case.) 576 * 577 * Basic idea is to pick up a good char in ch and 578 * examine the next char. If it's *+? then we twiddle. 579 * If it's \ then we frozzle. If it's other magic char 580 * we push ch and terminate the string. If none of the 581 * above, we push ch on the string and go around again. 582 * 583 * regprev is used to remember where "the current char" 584 * starts in the string, if due to a *+? we need to back 585 * up and put the current char in a separate, 1-char, string. 586 * When regprev is NULL, ch is the only char in the 587 * string; this is used in *+? handling, and in setting 588 * flags |= SIMPLE at the end. 589 */ 590 { 591 char *regprev; 592 char ch; 593 594 regparse--; /* Look at cur char */ 595 ret = regnode(EXACTLY); 596 for ( regprev = 0 ; ; ) { 597 ch = *regparse++; /* Get current char */ 598 switch (*regparse) { /* look at next one */ 599 600 default: 601 regc(ch); /* Add cur to string */ 602 break; 603 604 case '.': case '[': case '(': 605 case ')': case '|': case '\n': 606 case '$': case '^': 607 case '\0': 608 /* FIXME, $ and ^ should not always be magic */ 609 magic: 610 regc(ch); /* dump cur char */ 611 goto done; /* and we are done */ 612 613 case '?': case '+': case '*': 614 if (!regprev) /* If just ch in str, */ 615 goto magic; /* use it */ 616 /* End mult-char string one early */ 617 regparse = regprev; /* Back up parse */ 618 goto done; 619 620 case '\\': 621 regc(ch); /* Cur char OK */ 622 switch (regparse[1]){ /* Look after \ */ 623 case '\0': 624 case '<': 625 case '>': 626 /* FIXME: Someday handle \1, \2, ... */ 627 goto done; /* Not quoted */ 628 default: 629 /* Backup point is \, scan * point is after it. */ 630 regprev = regparse; 631 regparse++; 632 continue; /* NOT break; */ 633 } 634 } 635 regprev = regparse; /* Set backup point */ 636 } 637 done: 638 regc('\0'); 639 *flagp |= HASWIDTH; 640 if (!regprev) /* One char? */ 641 *flagp |= SIMPLE; 642 } 643 break; 644 } 645 646 return(ret); 647 } 648 649 /* 650 - regnode - emit a node 651 */ 652 static char * /* Location. */ 653 regnode(op) 654 int op; 655 { 656 char *ret; 657 char *ptr; 658 659 ret = regcode; 660 if (ret == ®dummy) { 661 regsize += 3; 662 return(ret); 663 } 664 665 ptr = ret; 666 *ptr++ = op; 667 *ptr++ = '\0'; /* Null "next" pointer. */ 668 *ptr++ = '\0'; 669 regcode = ptr; 670 671 return(ret); 672 } 673 674 /* 675 - regc - emit (if appropriate) a byte of code 676 */ 677 static void 678 regc(b) 679 int b; 680 { 681 if (regcode != ®dummy) 682 *regcode++ = b; 683 else 684 regsize++; 685 } 686 687 /* 688 - reginsert - insert an operator in front of already-emitted operand 689 * 690 * Means relocating the operand. 691 */ 692 static void 693 reginsert(op, opnd) 694 int op; 695 char *opnd; 696 { 697 char *src; 698 char *dst; 699 char *place; 700 701 if (regcode == ®dummy) { 702 regsize += 3; 703 return; 704 } 705 706 src = regcode; 707 regcode += 3; 708 dst = regcode; 709 while (src > opnd) 710 *--dst = *--src; 711 712 place = opnd; /* Op node, where operand used to be. */ 713 *place++ = op; 714 *place++ = '\0'; 715 *place++ = '\0'; 716 } 717 718 /* 719 - regtail - set the next-pointer at the end of a node chain 720 */ 721 static void 722 regtail(p, val) 723 char *p; 724 char *val; 725 { 726 char *scan; 727 char *temp; 728 int offset; 729 730 if (p == ®dummy) 731 return; 732 733 /* Find last node. */ 734 scan = p; 735 for (;;) { 736 temp = regnext(scan); 737 if (temp == NULL) 738 break; 739 scan = temp; 740 } 741 742 if (OP(scan) == BACK) 743 offset = scan - val; 744 else 745 offset = val - scan; 746 *(scan+1) = ((unsigned int)offset>>8)&0377; 747 *(scan+2) = offset&0377; 748 } 749 750 /* 751 - regoptail - regtail on operand of first argument; nop if operandless 752 */ 753 static void 754 regoptail(p, val) 755 char *p; 756 char *val; 757 { 758 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 759 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 760 return; 761 regtail(OPERAND(p), val); 762 } 763 764 /* 765 * regexec and friends 766 */ 767 768 /* 769 * Global work variables for regexec(). 770 */ 771 static char *reginput; /* String-input pointer. */ 772 static char *regbol; /* Beginning of input, for ^ check. */ 773 static char **regstartp; /* Pointer to startp array. */ 774 static char **regendp; /* Ditto for endp. */ 775 776 /* 777 * Forwards. 778 */ 779 STATIC int regtry __P((const regexp *, const char *)); 780 STATIC int regmatch __P((char *)); 781 STATIC int regrepeat __P((char *)); 782 783 #ifdef DEBUG 784 int regnarrate = 0; 785 void regdump __P((regexp *)); 786 STATIC char *regprop __P((char *)); 787 #endif 788 789 /* 790 - regexec - match a regexp against a string 791 */ 792 int 793 __compat_regexec(prog, string) 794 const regexp *prog; 795 const char *string; 796 { 797 char *s; 798 799 /* Be paranoid... */ 800 if (prog == NULL || string == NULL) { 801 regerror("NULL parameter"); 802 return(0); 803 } 804 805 /* Check validity of program. */ 806 if (UCHARAT(prog->program) != MAGIC) { 807 regerror("corrupted program"); 808 return(0); 809 } 810 811 /* If there is a "must appear" string, look for it. */ 812 if (prog->regmust != NULL) { 813 /* LINTED const castaway */ 814 s = (char *)string; 815 while ((s = strchr(s, prog->regmust[0])) != NULL) { 816 if (strncmp(s, prog->regmust, 817 (size_t)prog->regmlen) == 0) 818 break; /* Found it. */ 819 s++; 820 } 821 if (s == NULL) /* Not present. */ 822 return(0); 823 } 824 825 /* Mark beginning of line for ^ . */ 826 /* LINTED const castaway */ 827 regbol = (char *)string; 828 829 /* Simplest case: anchored match need be tried only once. */ 830 if (prog->reganch) 831 return(regtry(prog, string)); 832 833 /* Messy cases: unanchored match. */ 834 /* LINTED const castaway */ 835 s = (char *)string; 836 if (prog->regstart != '\0') 837 /* We know what char it must start with. */ 838 while ((s = strchr(s, prog->regstart)) != NULL) { 839 if (regtry(prog, s)) 840 return(1); 841 s++; 842 } 843 else 844 /* We don't -- general case. */ 845 do { 846 if (regtry(prog, s)) 847 return(1); 848 } while (*s++ != '\0'); 849 850 /* Failure. */ 851 return(0); 852 } 853 854 /* 855 - regtry - try match at specific point 856 */ 857 static int /* 0 failure, 1 success */ 858 regtry(prog, string) 859 const regexp *prog; 860 const char *string; 861 { 862 int i; 863 char **sp; 864 char **ep; 865 866 /* LINTED const castaway */ 867 reginput = (char *)string; /* XXX */ 868 regstartp = (char **)prog->startp; /* XXX */ 869 regendp = (char **)prog->endp; /* XXX */ 870 871 sp = (char **)prog->startp; /* XXX */ 872 ep = (char **)prog->endp; /* XXX */ 873 for (i = NSUBEXP; i > 0; i--) { 874 *sp++ = NULL; 875 *ep++ = NULL; 876 } 877 if (regmatch((char *)prog->program + 1)) { /* XXX */ 878 /* LINTED const castaway */ 879 ((regexp *)prog)->startp[0] = (char *)string; /* XXX */ 880 /* LINTED const castaway */ 881 ((regexp *)prog)->endp[0] = reginput; /* XXX */ 882 return(1); 883 } else 884 return(0); 885 } 886 887 /* 888 - regmatch - main matching routine 889 * 890 * Conceptually the strategy is simple: check to see whether the current 891 * node matches, call self recursively to see whether the rest matches, 892 * and then act accordingly. In practice we make some effort to avoid 893 * recursion, in particular by going through "ordinary" nodes (that don't 894 * need to know whether the rest of the match failed) by a loop instead of 895 * by recursion. 896 */ 897 static int /* 0 failure, 1 success */ 898 regmatch(prog) 899 char *prog; 900 { 901 char *scan; /* Current node. */ 902 char *next; /* Next node. */ 903 904 scan = prog; 905 #ifdef DEBUG 906 if (scan != NULL && regnarrate) 907 fprintf(stderr, "%s(\n", regprop(scan)); 908 #endif 909 while (scan != NULL) { 910 #ifdef DEBUG 911 if (regnarrate) 912 fprintf(stderr, "%s...\n", regprop(scan)); 913 #endif 914 next = regnext(scan); 915 916 switch (OP(scan)) { 917 case BOL: 918 if (reginput != regbol) 919 return(0); 920 break; 921 case EOL: 922 if (*reginput != '\0') 923 return(0); 924 break; 925 case WORDA: 926 /* Must be looking at a letter, digit, or _ */ 927 if ((!isalnum(UCHARAT(reginput))) && *reginput != '_') 928 return(0); 929 /* Prev must be BOL or nonword */ 930 if (reginput > regbol && 931 (isalnum(UCHARAT(reginput - 1)) 932 || reginput[-1] == '_')) 933 return(0); 934 break; 935 case WORDZ: 936 /* Must be looking at non letter, digit, or _ */ 937 if (isalnum(UCHARAT(reginput)) || *reginput == '_') 938 return(0); 939 /* We don't care what the previous char was */ 940 break; 941 case ANY: 942 if (*reginput == '\0') 943 return(0); 944 reginput++; 945 break; 946 case EXACTLY: { 947 int len; 948 char *opnd; 949 950 opnd = OPERAND(scan); 951 /* Inline the first character, for speed. */ 952 if (*opnd != *reginput) 953 return(0); 954 len = strlen(opnd); 955 if (len > 1 && strncmp(opnd, reginput, 956 (size_t)len) != 0) 957 return(0); 958 reginput += len; 959 } 960 break; 961 case ANYOF: 962 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 963 return(0); 964 reginput++; 965 break; 966 case ANYBUT: 967 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 968 return(0); 969 reginput++; 970 break; 971 case NOTHING: 972 break; 973 case BACK: 974 break; 975 case OPEN+1: 976 case OPEN+2: 977 case OPEN+3: 978 case OPEN+4: 979 case OPEN+5: 980 case OPEN+6: 981 case OPEN+7: 982 case OPEN+8: 983 case OPEN+9: { 984 int no; 985 char *save; 986 987 no = OP(scan) - OPEN; 988 save = reginput; 989 990 if (regmatch(next)) { 991 /* 992 * Don't set startp if some later 993 * invocation of the same parentheses 994 * already has. 995 */ 996 if (regstartp[no] == NULL) 997 regstartp[no] = save; 998 return(1); 999 } else 1000 return(0); 1001 } 1002 case CLOSE+1: 1003 case CLOSE+2: 1004 case CLOSE+3: 1005 case CLOSE+4: 1006 case CLOSE+5: 1007 case CLOSE+6: 1008 case CLOSE+7: 1009 case CLOSE+8: 1010 case CLOSE+9: { 1011 int no; 1012 char *save; 1013 1014 no = OP(scan) - CLOSE; 1015 save = reginput; 1016 1017 if (regmatch(next)) { 1018 /* 1019 * Don't set endp if some later 1020 * invocation of the same parentheses 1021 * already has. 1022 */ 1023 if (regendp[no] == NULL) 1024 regendp[no] = save; 1025 return(1); 1026 } else 1027 return(0); 1028 } 1029 case BRANCH: { 1030 char *save; 1031 1032 if (OP(next) != BRANCH) /* No choice. */ 1033 next = OPERAND(scan); /* Avoid recursion. */ 1034 else { 1035 do { 1036 save = reginput; 1037 if (regmatch(OPERAND(scan))) 1038 return(1); 1039 reginput = save; 1040 scan = regnext(scan); 1041 } while (scan != NULL && OP(scan) == BRANCH); 1042 return(0); 1043 /* NOTREACHED */ 1044 } 1045 } 1046 break; 1047 case STAR: 1048 case PLUS: { 1049 char nextch; 1050 int no; 1051 char *save; 1052 int min; 1053 1054 /* 1055 * Lookahead to avoid useless match attempts 1056 * when we know what character comes next. 1057 */ 1058 nextch = '\0'; 1059 if (OP(next) == EXACTLY) 1060 nextch = *OPERAND(next); 1061 min = (OP(scan) == STAR) ? 0 : 1; 1062 save = reginput; 1063 no = regrepeat(OPERAND(scan)); 1064 while (no >= min) { 1065 /* If it could work, try it. */ 1066 if (nextch == '\0' || *reginput == nextch) 1067 if (regmatch(next)) 1068 return(1); 1069 /* Couldn't or didn't -- back up. */ 1070 no--; 1071 reginput = save + no; 1072 } 1073 return(0); 1074 } 1075 case END: 1076 return(1); /* Success! */ 1077 default: 1078 regerror("memory corruption"); 1079 return(0); 1080 } 1081 1082 scan = next; 1083 } 1084 1085 /* 1086 * We get here only if there's trouble -- normally "case END" is 1087 * the terminating point. 1088 */ 1089 regerror("corrupted pointers"); 1090 return(0); 1091 } 1092 1093 /* 1094 - regrepeat - repeatedly match something simple, report how many 1095 */ 1096 static int 1097 regrepeat(p) 1098 char *p; 1099 { 1100 int count = 0; 1101 char *scan; 1102 char *opnd; 1103 1104 scan = reginput; 1105 opnd = OPERAND(p); 1106 switch (OP(p)) { 1107 case ANY: 1108 count = strlen(scan); 1109 scan += count; 1110 break; 1111 case EXACTLY: 1112 while (*opnd == *scan) { 1113 count++; 1114 scan++; 1115 } 1116 break; 1117 case ANYOF: 1118 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1119 count++; 1120 scan++; 1121 } 1122 break; 1123 case ANYBUT: 1124 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1125 count++; 1126 scan++; 1127 } 1128 break; 1129 default: /* Oh dear. Called inappropriately. */ 1130 regerror("internal foulup"); 1131 count = 0; /* Best compromise. */ 1132 break; 1133 } 1134 reginput = scan; 1135 1136 return(count); 1137 } 1138 1139 /* 1140 - regnext - dig the "next" pointer out of a node 1141 */ 1142 static char * 1143 regnext(p) 1144 char *p; 1145 { 1146 int offset; 1147 1148 if (p == ®dummy) 1149 return(NULL); 1150 1151 offset = NEXT(p); 1152 if (offset == 0) 1153 return(NULL); 1154 1155 if (OP(p) == BACK) 1156 return(p-offset); 1157 else 1158 return(p+offset); 1159 } 1160 1161 #ifdef DEBUG 1162 1163 /* 1164 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1165 */ 1166 void 1167 regdump(r) 1168 regexp *r; 1169 { 1170 char *s; 1171 char op = EXACTLY; /* Arbitrary non-END op. */ 1172 char *next; 1173 extern char *strchr(); 1174 1175 1176 s = r->program + 1; 1177 while (op != END) { /* While that wasn't END last time... */ 1178 op = OP(s); 1179 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1180 next = regnext(s); 1181 if (next == NULL) /* Next ptr. */ 1182 printf("(0)"); 1183 else 1184 printf("(%d)", (s-r->program)+(next-s)); 1185 s += 3; 1186 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1187 /* Literal string, where present. */ 1188 while (*s != '\0') { 1189 putchar(*s); 1190 s++; 1191 } 1192 s++; 1193 } 1194 putchar('\n'); 1195 } 1196 1197 /* Header fields of interest. */ 1198 if (r->regstart != '\0') 1199 printf("start `%c' ", r->regstart); 1200 if (r->reganch) 1201 printf("anchored "); 1202 if (r->regmust != NULL) 1203 printf("must have \"%s\"", r->regmust); 1204 printf("\n"); 1205 } 1206 1207 /* 1208 - regprop - printable representation of opcode 1209 */ 1210 static char * 1211 regprop(op) 1212 char *op; 1213 { 1214 char *p; 1215 static char buf[50]; 1216 1217 (void)strncpy(buf, ":", sizeof(buf) - 1); 1218 1219 switch (OP(op)) { 1220 case BOL: 1221 p = "BOL"; 1222 break; 1223 case EOL: 1224 p = "EOL"; 1225 break; 1226 case ANY: 1227 p = "ANY"; 1228 break; 1229 case ANYOF: 1230 p = "ANYOF"; 1231 break; 1232 case ANYBUT: 1233 p = "ANYBUT"; 1234 break; 1235 case BRANCH: 1236 p = "BRANCH"; 1237 break; 1238 case EXACTLY: 1239 p = "EXACTLY"; 1240 break; 1241 case NOTHING: 1242 p = "NOTHING"; 1243 break; 1244 case BACK: 1245 p = "BACK"; 1246 break; 1247 case END: 1248 p = "END"; 1249 break; 1250 case OPEN+1: 1251 case OPEN+2: 1252 case OPEN+3: 1253 case OPEN+4: 1254 case OPEN+5: 1255 case OPEN+6: 1256 case OPEN+7: 1257 case OPEN+8: 1258 case OPEN+9: 1259 (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf), 1260 "OPEN%d", OP(op)-OPEN); 1261 p = NULL; 1262 break; 1263 case CLOSE+1: 1264 case CLOSE+2: 1265 case CLOSE+3: 1266 case CLOSE+4: 1267 case CLOSE+5: 1268 case CLOSE+6: 1269 case CLOSE+7: 1270 case CLOSE+8: 1271 case CLOSE+9: 1272 (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf), 1273 "CLOSE%d", OP(op)-CLOSE); 1274 p = NULL; 1275 break; 1276 case STAR: 1277 p = "STAR"; 1278 break; 1279 case PLUS: 1280 p = "PLUS"; 1281 break; 1282 case WORDA: 1283 p = "WORDA"; 1284 break; 1285 case WORDZ: 1286 p = "WORDZ"; 1287 break; 1288 default: 1289 regerror("corrupted opcode"); 1290 break; 1291 } 1292 if (p != NULL) 1293 (void)strncat(buf, p, sizeof(buf) - strlen(buf) - 1); 1294 return(buf); 1295 } 1296 #endif 1297 1298 /* 1299 * The following is provided for those people who do not have strcspn() in 1300 * their C libraries. They should get off their butts and do something 1301 * about it; at least one public-domain implementation of those (highly 1302 * useful) string routines has been published on Usenet. 1303 */ 1304 #ifdef STRCSPN 1305 /* 1306 * strcspn - find length of initial segment of s1 consisting entirely 1307 * of characters not from s2 1308 */ 1309 1310 static int 1311 strcspn(s1, s2) 1312 char *s1; 1313 char *s2; 1314 { 1315 char *scan1; 1316 char *scan2; 1317 int count; 1318 1319 count = 0; 1320 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1321 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1322 if (*scan1 == *scan2++) 1323 return(count); 1324 count++; 1325 } 1326 return(count); 1327 } 1328 #endif 1329