1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 /* lasciate ogne speranza, voi ch'intrate. */ 26 27 #define DEBUG 28 29 #include <ctype.h> 30 #include <limits.h> 31 #include <stdio.h> 32 #include <string.h> 33 #include <stdlib.h> 34 #include "awk.h" 35 #include "ytab.h" 36 37 #define MAXLIN 22 38 39 #define type(v) (v)->nobj /* badly overloaded here */ 40 #define info(v) (v)->ntype /* badly overloaded here */ 41 #define left(v) (v)->narg[0] 42 #define right(v) (v)->narg[1] 43 #define parent(v) (v)->nnext 44 45 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 46 #define ELEAF case EMPTYRE: /* empty string in regexp */ 47 #define UNARY case STAR: case PLUS: case QUEST: 48 49 /* encoding in tree Nodes: 50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 51 left is index, right contains value or pointer to value 52 unary (STAR, PLUS, QUEST): left is child, right is null 53 binary (CAT, OR): left and right are children 54 parent contains pointer to parent 55 */ 56 57 58 int *setvec; 59 int *tmpset; 60 int maxsetvec = 0; 61 62 int rtok; /* next token in current re */ 63 int rlxval; 64 static const uschar *rlxstr; 65 static const uschar *prestr; /* current position in current re */ 66 static const uschar *lastre; /* origin of last re */ 67 static const uschar *lastatom; /* origin of last Atom */ 68 static const uschar *starttok; 69 static const uschar *basestr; /* starts with original, replaced during 70 repetition processing */ 71 static const uschar *firstbasestr; 72 73 static int setcnt; 74 static int poscnt; 75 76 const char *patbeg; 77 int patlen; 78 79 #define NFA 128 /* cache this many dynamic fa's */ 80 fa *fatab[NFA]; 81 int nfatab = 0; /* entries in fatab */ 82 83 static int * 84 intalloc(size_t n, const char *f) 85 { 86 void *p = calloc(n, sizeof(int)); 87 if (p == NULL) 88 overflo(f); 89 return p; 90 } 91 92 static void 93 resizesetvec(const char *f) 94 { 95 if (maxsetvec == 0) 96 maxsetvec = MAXLIN; 97 else 98 maxsetvec *= 4; 99 setvec = realloc(setvec, maxsetvec * sizeof(*setvec)); 100 tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset)); 101 if (setvec == NULL || tmpset == NULL) 102 overflo(f); 103 } 104 105 static void 106 resize_state(fa *f, int state) 107 { 108 void *p; 109 int i, new_count; 110 111 if (++state < f->state_count) 112 return; 113 114 new_count = state + 10; /* needs to be tuned */ 115 116 p = realloc(f->gototab, new_count * sizeof(f->gototab[0])); 117 if (p == NULL) 118 goto out; 119 f->gototab = p; 120 121 p = realloc(f->out, new_count * sizeof(f->out[0])); 122 if (p == NULL) 123 goto out; 124 f->out = p; 125 126 p = realloc(f->posns, new_count * sizeof(f->posns[0])); 127 if (p == NULL) 128 goto out; 129 f->posns = p; 130 131 for (i = f->state_count; i < new_count; ++i) { 132 f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab)); 133 if (f->gototab[i] == NULL) 134 goto out; 135 f->out[i] = 0; 136 f->posns[i] = NULL; 137 } 138 f->state_count = new_count; 139 return; 140 out: 141 overflo(__func__); 142 } 143 144 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 145 { 146 int i, use, nuse; 147 fa *pfa; 148 static int now = 1; 149 150 if (setvec == NULL) { /* first time through any RE */ 151 resizesetvec(__func__); 152 } 153 154 if (compile_time != RUNNING) /* a constant for sure */ 155 return mkdfa(s, anchor); 156 for (i = 0; i < nfatab; i++) /* is it there already? */ 157 if (fatab[i]->anchor == anchor 158 && strcmp((const char *) fatab[i]->restr, s) == 0) { 159 fatab[i]->use = now++; 160 return fatab[i]; 161 } 162 pfa = mkdfa(s, anchor); 163 if (nfatab < NFA) { /* room for another */ 164 fatab[nfatab] = pfa; 165 fatab[nfatab]->use = now++; 166 nfatab++; 167 return pfa; 168 } 169 use = fatab[0]->use; /* replace least-recently used */ 170 nuse = 0; 171 for (i = 1; i < nfatab; i++) 172 if (fatab[i]->use < use) { 173 use = fatab[i]->use; 174 nuse = i; 175 } 176 freefa(fatab[nuse]); 177 fatab[nuse] = pfa; 178 pfa->use = now++; 179 return pfa; 180 } 181 182 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 183 /* anchor = true for anchored matches, else false */ 184 { 185 Node *p, *p1; 186 fa *f; 187 188 firstbasestr = (const uschar *) s; 189 basestr = firstbasestr; 190 p = reparse(s); 191 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 192 /* put ALL STAR in front of reg. exp. */ 193 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 194 /* put FINAL after reg. exp. */ 195 196 poscnt = 0; 197 penter(p1); /* enter parent pointers and leaf indices */ 198 if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 199 overflo(__func__); 200 f->accept = poscnt-1; /* penter has computed number of positions in re */ 201 cfoll(f, p1); /* set up follow sets */ 202 freetr(p1); 203 resize_state(f, 1); 204 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 205 f->posns[1] = intalloc(1, __func__); 206 *f->posns[1] = 0; 207 f->initstat = makeinit(f, anchor); 208 f->anchor = anchor; 209 f->restr = (uschar *) tostring(s); 210 if (firstbasestr != basestr) { 211 if (basestr) 212 xfree(basestr); 213 } 214 return f; 215 } 216 217 int makeinit(fa *f, bool anchor) 218 { 219 int i, k; 220 221 f->curstat = 2; 222 f->out[2] = 0; 223 k = *(f->re[0].lfollow); 224 xfree(f->posns[2]); 225 f->posns[2] = intalloc(k + 1, __func__); 226 for (i = 0; i <= k; i++) { 227 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 228 } 229 if ((f->posns[2])[1] == f->accept) 230 f->out[2] = 1; 231 for (i = 0; i < NCHARS; i++) 232 f->gototab[2][i] = 0; 233 f->curstat = cgoto(f, 2, HAT); 234 if (anchor) { 235 *f->posns[2] = k-1; /* leave out position 0 */ 236 for (i = 0; i < k; i++) { 237 (f->posns[0])[i] = (f->posns[2])[i]; 238 } 239 240 f->out[0] = f->out[2]; 241 if (f->curstat != 2) 242 --(*f->posns[f->curstat]); 243 } 244 return f->curstat; 245 } 246 247 void penter(Node *p) /* set up parent pointers and leaf indices */ 248 { 249 switch (type(p)) { 250 ELEAF 251 LEAF 252 info(p) = poscnt; 253 poscnt++; 254 break; 255 UNARY 256 penter(left(p)); 257 parent(left(p)) = p; 258 break; 259 case CAT: 260 case OR: 261 penter(left(p)); 262 penter(right(p)); 263 parent(left(p)) = p; 264 parent(right(p)) = p; 265 break; 266 case ZERO: 267 break; 268 default: /* can't happen */ 269 FATAL("can't happen: unknown type %d in penter", type(p)); 270 break; 271 } 272 } 273 274 void freetr(Node *p) /* free parse tree */ 275 { 276 switch (type(p)) { 277 ELEAF 278 LEAF 279 xfree(p); 280 break; 281 UNARY 282 case ZERO: 283 freetr(left(p)); 284 xfree(p); 285 break; 286 case CAT: 287 case OR: 288 freetr(left(p)); 289 freetr(right(p)); 290 xfree(p); 291 break; 292 default: /* can't happen */ 293 FATAL("can't happen: unknown type %d in freetr", type(p)); 294 break; 295 } 296 } 297 298 /* in the parsing of regular expressions, metacharacters like . have */ 299 /* to be seen literally; \056 is not a metacharacter. */ 300 301 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ 302 { /* only pick up one 8-bit byte (2 chars) */ 303 const uschar *p; 304 int n = 0; 305 int i; 306 307 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { 308 if (isdigit(*p)) 309 n = 16 * n + *p - '0'; 310 else if (*p >= 'a' && *p <= 'f') 311 n = 16 * n + *p - 'a' + 10; 312 else if (*p >= 'A' && *p <= 'F') 313 n = 16 * n + *p - 'A' + 10; 314 } 315 *pp = p; 316 return n; 317 } 318 319 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 320 321 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 322 /* and increment *pp */ 323 { 324 const uschar *p = *pp; 325 int c; 326 327 if ((c = *p++) == 't') 328 c = '\t'; 329 else if (c == 'n') 330 c = '\n'; 331 else if (c == 'f') 332 c = '\f'; 333 else if (c == 'r') 334 c = '\r'; 335 else if (c == 'b') 336 c = '\b'; 337 else if (c == 'v') 338 c = '\v'; 339 else if (c == 'a') 340 c = '\a'; 341 else if (c == '\\') 342 c = '\\'; 343 else if (c == 'x') { /* hexadecimal goo follows */ 344 c = hexstr(&p); /* this adds a null if number is invalid */ 345 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 346 int n = c - '0'; 347 if (isoctdigit(*p)) { 348 n = 8 * n + *p++ - '0'; 349 if (isoctdigit(*p)) 350 n = 8 * n + *p++ - '0'; 351 } 352 c = n; 353 } /* else */ 354 /* c = c; */ 355 *pp = p; 356 return c; 357 } 358 359 char *cclenter(const char *argp) /* add a character class */ 360 { 361 int i, c, c2; 362 const uschar *op, *p = (const uschar *) argp; 363 uschar *bp; 364 static uschar *buf = NULL; 365 static int bufsz = 100; 366 367 op = p; 368 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 369 FATAL("out of space for character class [%.10s...] 1", p); 370 bp = buf; 371 for (i = 0; (c = *p++) != 0; ) { 372 if (c == '\\') { 373 c = quoted(&p); 374 } else if (c == '-' && i > 0 && bp[-1] != 0) { 375 if (*p != 0) { 376 c = bp[-1]; 377 c2 = *p++; 378 if (c2 == '\\') 379 c2 = quoted(&p); 380 if (c > c2) { /* empty; ignore */ 381 bp--; 382 i--; 383 continue; 384 } 385 while (c < c2) { 386 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 387 FATAL("out of space for character class [%.10s...] 2", p); 388 *bp++ = ++c; 389 i++; 390 } 391 continue; 392 } 393 } 394 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 395 FATAL("out of space for character class [%.10s...] 3", p); 396 *bp++ = c; 397 i++; 398 } 399 *bp = 0; 400 DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); 401 xfree(op); 402 return (char *) tostring((char *) buf); 403 } 404 405 void overflo(const char *s) 406 { 407 FATAL("regular expression too big: out of space in %.30s...", s); 408 } 409 410 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 411 { 412 int i; 413 int *p; 414 415 switch (type(v)) { 416 ELEAF 417 LEAF 418 f->re[info(v)].ltype = type(v); 419 f->re[info(v)].lval.np = right(v); 420 while (f->accept >= maxsetvec) { /* guessing here! */ 421 resizesetvec(__func__); 422 } 423 for (i = 0; i <= f->accept; i++) 424 setvec[i] = 0; 425 setcnt = 0; 426 follow(v); /* computes setvec and setcnt */ 427 p = intalloc(setcnt + 1, __func__); 428 f->re[info(v)].lfollow = p; 429 *p = setcnt; 430 for (i = f->accept; i >= 0; i--) 431 if (setvec[i] == 1) 432 *++p = i; 433 break; 434 UNARY 435 cfoll(f,left(v)); 436 break; 437 case CAT: 438 case OR: 439 cfoll(f,left(v)); 440 cfoll(f,right(v)); 441 break; 442 case ZERO: 443 break; 444 default: /* can't happen */ 445 FATAL("can't happen: unknown type %d in cfoll", type(v)); 446 } 447 } 448 449 int first(Node *p) /* collects initially active leaves of p into setvec */ 450 /* returns 0 if p matches empty string */ 451 { 452 int b, lp; 453 454 switch (type(p)) { 455 ELEAF 456 LEAF 457 lp = info(p); /* look for high-water mark of subscripts */ 458 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 459 resizesetvec(__func__); 460 } 461 if (type(p) == EMPTYRE) { 462 setvec[lp] = 0; 463 return(0); 464 } 465 if (setvec[lp] != 1) { 466 setvec[lp] = 1; 467 setcnt++; 468 } 469 if (type(p) == CCL && (*(char *) right(p)) == '\0') 470 return(0); /* empty CCL */ 471 return(1); 472 case PLUS: 473 if (first(left(p)) == 0) 474 return(0); 475 return(1); 476 case STAR: 477 case QUEST: 478 first(left(p)); 479 return(0); 480 case CAT: 481 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 482 return(1); 483 case OR: 484 b = first(right(p)); 485 if (first(left(p)) == 0 || b == 0) return(0); 486 return(1); 487 case ZERO: 488 return 0; 489 } 490 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 491 return(-1); 492 } 493 494 void follow(Node *v) /* collects leaves that can follow v into setvec */ 495 { 496 Node *p; 497 498 if (type(v) == FINAL) 499 return; 500 p = parent(v); 501 switch (type(p)) { 502 case STAR: 503 case PLUS: 504 first(v); 505 follow(p); 506 return; 507 508 case OR: 509 case QUEST: 510 follow(p); 511 return; 512 513 case CAT: 514 if (v == left(p)) { /* v is left child of p */ 515 if (first(right(p)) == 0) { 516 follow(p); 517 return; 518 } 519 } else /* v is right child */ 520 follow(p); 521 return; 522 } 523 } 524 525 int member(int c, const char *sarg) /* is c in s? */ 526 { 527 const uschar *s = (const uschar *) sarg; 528 529 while (*s) 530 if (c == *s++) 531 return(1); 532 return(0); 533 } 534 535 int match(fa *f, const char *p0) /* shortest match ? */ 536 { 537 int s, ns; 538 const uschar *p = (const uschar *) p0; 539 540 s = f->initstat; 541 assert (s < f->state_count); 542 543 if (f->out[s]) 544 return(1); 545 do { 546 /* assert(*p < NCHARS); */ 547 if ((ns = f->gototab[s][*p]) != 0) 548 s = ns; 549 else 550 s = cgoto(f, s, *p); 551 if (f->out[s]) 552 return(1); 553 } while (*p++ != 0); 554 return(0); 555 } 556 557 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 558 { 559 int s, ns; 560 const uschar *p = (const uschar *) p0; 561 const uschar *q; 562 563 s = f->initstat; 564 assert(s < f->state_count); 565 566 patbeg = (const char *)p; 567 patlen = -1; 568 do { 569 q = p; 570 do { 571 if (f->out[s]) /* final state */ 572 patlen = q-p; 573 /* assert(*q < NCHARS); */ 574 if ((ns = f->gototab[s][*q]) != 0) 575 s = ns; 576 else 577 s = cgoto(f, s, *q); 578 579 assert(s < f->state_count); 580 581 if (s == 1) { /* no transition */ 582 if (patlen >= 0) { 583 patbeg = (const char *) p; 584 return(1); 585 } 586 else 587 goto nextin; /* no match */ 588 } 589 } while (*q++ != 0); 590 if (f->out[s]) 591 patlen = q-p-1; /* don't count $ */ 592 if (patlen >= 0) { 593 patbeg = (const char *) p; 594 return(1); 595 } 596 nextin: 597 s = 2; 598 } while (*p++); 599 return (0); 600 } 601 602 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 603 { 604 int s, ns; 605 const uschar *p = (const uschar *) p0; 606 const uschar *q; 607 608 s = f->initstat; 609 assert(s < f->state_count); 610 611 patbeg = (const char *)p; 612 patlen = -1; 613 while (*p) { 614 q = p; 615 do { 616 if (f->out[s]) /* final state */ 617 patlen = q-p; 618 /* assert(*q < NCHARS); */ 619 if ((ns = f->gototab[s][*q]) != 0) 620 s = ns; 621 else 622 s = cgoto(f, s, *q); 623 if (s == 1) { /* no transition */ 624 if (patlen > 0) { 625 patbeg = (const char *) p; 626 return(1); 627 } else 628 goto nnextin; /* no nonempty match */ 629 } 630 } while (*q++ != 0); 631 if (f->out[s]) 632 patlen = q-p-1; /* don't count $ */ 633 if (patlen > 0 ) { 634 patbeg = (const char *) p; 635 return(1); 636 } 637 nnextin: 638 s = 2; 639 p++; 640 } 641 return (0); 642 } 643 644 645 /* 646 * NAME 647 * fnematch 648 * 649 * DESCRIPTION 650 * A stream-fed version of nematch which transfers characters to a 651 * null-terminated buffer. All characters up to and including the last 652 * character of the matching text or EOF are placed in the buffer. If 653 * a match is found, patbeg and patlen are set appropriately. 654 * 655 * RETURN VALUES 656 * false No match found. 657 * true Match found. 658 */ 659 660 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 661 { 662 char *buf = *pbuf; 663 int bufsize = *pbufsize; 664 int c, i, j, k, ns, s; 665 666 s = pfa->initstat; 667 patlen = 0; 668 669 /* 670 * All indices relative to buf. 671 * i <= j <= k <= bufsize 672 * 673 * i: origin of active substring 674 * j: current character 675 * k: destination of next getc() 676 */ 677 i = -1, k = 0; 678 do { 679 j = i++; 680 do { 681 if (++j == k) { 682 if (k == bufsize) 683 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) 684 FATAL("stream '%.30s...' too long", buf); 685 buf[k++] = (c = getc(f)) != EOF ? c : 0; 686 } 687 c = buf[j]; 688 /* assert(c < NCHARS); */ 689 690 if ((ns = pfa->gototab[s][c]) != 0) 691 s = ns; 692 else 693 s = cgoto(pfa, s, c); 694 695 if (pfa->out[s]) { /* final state */ 696 patlen = j - i + 1; 697 if (c == 0) /* don't count $ */ 698 patlen--; 699 } 700 } while (buf[j] && s != 1); 701 s = 2; 702 } while (buf[i] && !patlen); 703 704 /* adjbuf() may have relocated a resized buffer. Inform the world. */ 705 *pbuf = buf; 706 *pbufsize = bufsize; 707 708 if (patlen) { 709 patbeg = (char *) buf + i; 710 /* 711 * Under no circumstances is the last character fed to 712 * the automaton part of the match. It is EOF's nullbyte, 713 * or it sent the automaton into a state with no further 714 * transitions available (s==1), or both. Room for a 715 * terminating nullbyte is guaranteed. 716 * 717 * ungetc any chars after the end of matching text 718 * (except for EOF's nullbyte, if present) and null 719 * terminate the buffer. 720 */ 721 do 722 if (buf[--k] && ungetc(buf[k], f) == EOF) 723 FATAL("unable to ungetc '%c'", buf[k]); 724 while (k > i + patlen); 725 buf[k] = '\0'; 726 return true; 727 } 728 else 729 return false; 730 } 731 732 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 733 { /* uses relex() to scan regular expression */ 734 Node *np; 735 736 DPRINTF("reparse <%s>\n", p); 737 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 738 rtok = relex(); 739 /* GNU compatibility: an empty regexp matches anything */ 740 if (rtok == '\0') { 741 /* FATAL("empty regular expression"); previous */ 742 return(op2(EMPTYRE, NIL, NIL)); 743 } 744 np = regexp(); 745 if (rtok != '\0') 746 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 747 return(np); 748 } 749 750 Node *regexp(void) /* top-level parse of reg expr */ 751 { 752 return (alt(concat(primary()))); 753 } 754 755 Node *primary(void) 756 { 757 Node *np; 758 int savelastatom; 759 760 switch (rtok) { 761 case CHAR: 762 lastatom = starttok; 763 np = op2(CHAR, NIL, itonp(rlxval)); 764 rtok = relex(); 765 return (unary(np)); 766 case ALL: 767 rtok = relex(); 768 return (unary(op2(ALL, NIL, NIL))); 769 case EMPTYRE: 770 rtok = relex(); 771 return (unary(op2(EMPTYRE, NIL, NIL))); 772 case DOT: 773 lastatom = starttok; 774 rtok = relex(); 775 return (unary(op2(DOT, NIL, NIL))); 776 case CCL: 777 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 778 lastatom = starttok; 779 rtok = relex(); 780 return (unary(np)); 781 case NCCL: 782 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 783 lastatom = starttok; 784 rtok = relex(); 785 return (unary(np)); 786 case '^': 787 rtok = relex(); 788 return (unary(op2(CHAR, NIL, itonp(HAT)))); 789 case '$': 790 rtok = relex(); 791 return (unary(op2(CHAR, NIL, NIL))); 792 case '(': 793 lastatom = starttok; 794 savelastatom = starttok - basestr; /* Retain over recursion */ 795 rtok = relex(); 796 if (rtok == ')') { /* special pleading for () */ 797 rtok = relex(); 798 return unary(op2(CCL, NIL, (Node *) tostring(""))); 799 } 800 np = regexp(); 801 if (rtok == ')') { 802 lastatom = basestr + savelastatom; /* Restore */ 803 rtok = relex(); 804 return (unary(np)); 805 } 806 else 807 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 808 default: 809 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 810 } 811 return 0; /*NOTREACHED*/ 812 } 813 814 Node *concat(Node *np) 815 { 816 switch (rtok) { 817 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 818 return (concat(op2(CAT, np, primary()))); 819 case EMPTYRE: 820 rtok = relex(); 821 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 822 primary()))); 823 } 824 return (np); 825 } 826 827 Node *alt(Node *np) 828 { 829 if (rtok == OR) { 830 rtok = relex(); 831 return (alt(op2(OR, np, concat(primary())))); 832 } 833 return (np); 834 } 835 836 Node *unary(Node *np) 837 { 838 switch (rtok) { 839 case STAR: 840 rtok = relex(); 841 return (unary(op2(STAR, np, NIL))); 842 case PLUS: 843 rtok = relex(); 844 return (unary(op2(PLUS, np, NIL))); 845 case QUEST: 846 rtok = relex(); 847 return (unary(op2(QUEST, np, NIL))); 848 case ZERO: 849 rtok = relex(); 850 return (unary(op2(ZERO, np, NIL))); 851 default: 852 return (np); 853 } 854 } 855 856 /* 857 * Character class definitions conformant to the POSIX locale as 858 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 859 * and operating character sets are both ASCII (ISO646) or supersets 860 * thereof. 861 * 862 * Note that to avoid overflowing the temporary buffer used in 863 * relex(), the expanded character class (prior to range expansion) 864 * must be less than twice the size of their full name. 865 */ 866 867 /* Because isblank doesn't show up in any of the header files on any 868 * system i use, it's defined here. if some other locale has a richer 869 * definition of "blank", define HAS_ISBLANK and provide your own 870 * version. 871 * the parentheses here are an attempt to find a path through the maze 872 * of macro definition and/or function and/or version provided. thanks 873 * to nelson beebe for the suggestion; let's see if it works everywhere. 874 */ 875 876 /* #define HAS_ISBLANK */ 877 #ifndef HAS_ISBLANK 878 879 int (xisblank)(int c) 880 { 881 return c==' ' || c=='\t'; 882 } 883 884 #endif 885 886 static const struct charclass { 887 const char *cc_name; 888 int cc_namelen; 889 int (*cc_func)(int); 890 } charclasses[] = { 891 { "alnum", 5, isalnum }, 892 { "alpha", 5, isalpha }, 893 #ifndef HAS_ISBLANK 894 { "blank", 5, xisblank }, 895 #else 896 { "blank", 5, isblank }, 897 #endif 898 { "cntrl", 5, iscntrl }, 899 { "digit", 5, isdigit }, 900 { "graph", 5, isgraph }, 901 { "lower", 5, islower }, 902 { "print", 5, isprint }, 903 { "punct", 5, ispunct }, 904 { "space", 5, isspace }, 905 { "upper", 5, isupper }, 906 { "xdigit", 6, isxdigit }, 907 { NULL, 0, NULL }, 908 }; 909 910 #define REPEAT_SIMPLE 0 911 #define REPEAT_PLUS_APPENDED 1 912 #define REPEAT_WITH_Q 2 913 #define REPEAT_ZERO 3 914 915 static int 916 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 917 int atomlen, int firstnum, int secondnum, int special_case) 918 { 919 int i, j; 920 uschar *buf = 0; 921 int ret = 1; 922 int init_q = (firstnum == 0); /* first added char will be ? */ 923 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 924 int prefix_length = reptok - basestr; /* prefix includes first rep */ 925 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 926 int size = prefix_length + suffix_length; 927 928 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 929 size += atomlen*(firstnum-1); 930 } 931 932 /* Adjust size of buffer for special cases */ 933 if (special_case == REPEAT_PLUS_APPENDED) { 934 size++; /* for the final + */ 935 } else if (special_case == REPEAT_WITH_Q) { 936 size += init_q + (atomlen+1)* n_q_reps; 937 } else if (special_case == REPEAT_ZERO) { 938 size += 2; /* just a null ERE: () */ 939 } 940 if ((buf = malloc(size + 1)) == NULL) 941 FATAL("out of space in reg expr %.10s..", lastre); 942 memcpy(buf, basestr, prefix_length); /* copy prefix */ 943 j = prefix_length; 944 if (special_case == REPEAT_ZERO) { 945 j -= atomlen; 946 buf[j++] = '('; 947 buf[j++] = ')'; 948 } 949 for (i = 1; i < firstnum; i++) { /* copy x reps */ 950 memcpy(&buf[j], atom, atomlen); 951 j += atomlen; 952 } 953 if (special_case == REPEAT_PLUS_APPENDED) { 954 buf[j++] = '+'; 955 } else if (special_case == REPEAT_WITH_Q) { 956 if (init_q) 957 buf[j++] = '?'; 958 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 959 memcpy(&buf[j], atom, atomlen); 960 j += atomlen; 961 buf[j++] = '?'; 962 } 963 } 964 memcpy(&buf[j], reptok+reptoklen, suffix_length); 965 if (special_case == REPEAT_ZERO) { 966 buf[j+suffix_length] = '\0'; 967 } else { 968 buf[size] = '\0'; 969 } 970 /* free old basestr */ 971 if (firstbasestr != basestr) { 972 if (basestr) 973 xfree(basestr); 974 } 975 basestr = buf; 976 prestr = buf + prefix_length; 977 if (special_case == REPEAT_ZERO) { 978 prestr -= atomlen; 979 ret++; 980 } 981 return ret; 982 } 983 984 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 985 int atomlen, int firstnum, int secondnum) 986 { 987 /* 988 In general, the repetition specifier or "bound" is replaced here 989 by an equivalent ERE string, repeating the immediately previous atom 990 and appending ? and + as needed. Note that the first copy of the 991 atom is left in place, except in the special_case of a zero-repeat 992 (i.e., {0}). 993 */ 994 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 995 if (firstnum < 2) { 996 /* 0 or 1: should be handled before you get here */ 997 FATAL("internal error"); 998 } else { 999 return replace_repeat(reptok, reptoklen, atom, atomlen, 1000 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1001 } 1002 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1003 if (firstnum == 0) { /* {0} or {0,0} */ 1004 /* This case is unusual because the resulting 1005 replacement string might actually be SMALLER than 1006 the original ERE */ 1007 return replace_repeat(reptok, reptoklen, atom, atomlen, 1008 firstnum, secondnum, REPEAT_ZERO); 1009 } else { /* (firstnum >= 1) */ 1010 return replace_repeat(reptok, reptoklen, atom, atomlen, 1011 firstnum, secondnum, REPEAT_SIMPLE); 1012 } 1013 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1014 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1015 return replace_repeat(reptok, reptoklen, atom, atomlen, 1016 firstnum, secondnum, REPEAT_WITH_Q); 1017 } else { /* Error - shouldn't be here (n>m) */ 1018 FATAL("internal error"); 1019 } 1020 return 0; 1021 } 1022 1023 int relex(void) /* lexical analyzer for reparse */ 1024 { 1025 int c, n; 1026 int cflag; 1027 static uschar *buf = NULL; 1028 static int bufsz = 100; 1029 uschar *bp; 1030 const struct charclass *cc; 1031 int i; 1032 int num, m; 1033 bool commafound, digitfound; 1034 const uschar *startreptok; 1035 static int parens = 0; 1036 1037 rescan: 1038 starttok = prestr; 1039 1040 switch (c = *prestr++) { 1041 case '|': return OR; 1042 case '*': return STAR; 1043 case '+': return PLUS; 1044 case '?': return QUEST; 1045 case '.': return DOT; 1046 case '\0': prestr--; return '\0'; 1047 case '^': 1048 case '$': 1049 return c; 1050 case '(': 1051 parens++; 1052 return c; 1053 case ')': 1054 if (parens) { 1055 parens--; 1056 return c; 1057 } 1058 /* unmatched close parenthesis; per POSIX, treat as literal */ 1059 rlxval = c; 1060 return CHAR; 1061 case '\\': 1062 rlxval = quoted(&prestr); 1063 return CHAR; 1064 default: 1065 rlxval = c; 1066 return CHAR; 1067 case '[': 1068 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 1069 FATAL("out of space in reg expr %.10s..", lastre); 1070 bp = buf; 1071 if (*prestr == '^') { 1072 cflag = 1; 1073 prestr++; 1074 } 1075 else 1076 cflag = 0; 1077 n = 2 * strlen((const char *) prestr)+1; 1078 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1079 FATAL("out of space for reg expr %.10s...", lastre); 1080 for (; ; ) { 1081 if ((c = *prestr++) == '\\') { 1082 *bp++ = '\\'; 1083 if ((c = *prestr++) == '\0') 1084 FATAL("nonterminated character class %.20s...", lastre); 1085 *bp++ = c; 1086 /* } else if (c == '\n') { */ 1087 /* FATAL("newline in character class %.20s...", lastre); */ 1088 } else if (c == '[' && *prestr == ':') { 1089 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1090 for (cc = charclasses; cc->cc_name; cc++) 1091 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1092 break; 1093 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1094 prestr[2 + cc->cc_namelen] == ']') { 1095 prestr += cc->cc_namelen + 3; 1096 /* 1097 * BUG: We begin at 1, instead of 0, since we 1098 * would otherwise prematurely terminate the 1099 * string for classes like [[:cntrl:]]. This 1100 * means that we can't match the NUL character, 1101 * not without first adapting the entire 1102 * program to track each string's length. 1103 */ 1104 for (i = 1; i <= UCHAR_MAX; i++) { 1105 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 1106 FATAL("out of space for reg expr %.10s...", lastre); 1107 if (cc->cc_func(i)) { 1108 /* escape backslash */ 1109 if (i == '\\') { 1110 *bp++ = '\\'; 1111 n++; 1112 } 1113 1114 *bp++ = i; 1115 n++; 1116 } 1117 } 1118 } else 1119 *bp++ = c; 1120 } else if (c == '[' && *prestr == '.') { 1121 char collate_char; 1122 prestr++; 1123 collate_char = *prestr++; 1124 if (*prestr == '.' && prestr[1] == ']') { 1125 prestr += 2; 1126 /* Found it: map via locale TBD: for 1127 now, simply return this char. This 1128 is sufficient to pass conformance 1129 test awk.ex 156 1130 */ 1131 if (*prestr == ']') { 1132 prestr++; 1133 rlxval = collate_char; 1134 return CHAR; 1135 } 1136 } 1137 } else if (c == '[' && *prestr == '=') { 1138 char equiv_char; 1139 prestr++; 1140 equiv_char = *prestr++; 1141 if (*prestr == '=' && prestr[1] == ']') { 1142 prestr += 2; 1143 /* Found it: map via locale TBD: for now 1144 simply return this char. This is 1145 sufficient to pass conformance test 1146 awk.ex 156 1147 */ 1148 if (*prestr == ']') { 1149 prestr++; 1150 rlxval = equiv_char; 1151 return CHAR; 1152 } 1153 } 1154 } else if (c == '\0') { 1155 FATAL("nonterminated character class %.20s", lastre); 1156 } else if (bp == buf) { /* 1st char is special */ 1157 *bp++ = c; 1158 } else if (c == ']') { 1159 *bp++ = 0; 1160 rlxstr = (uschar *) tostring((char *) buf); 1161 if (cflag == 0) 1162 return CCL; 1163 else 1164 return NCCL; 1165 } else 1166 *bp++ = c; 1167 } 1168 break; 1169 case '{': 1170 if (isdigit(*(prestr))) { 1171 num = 0; /* Process as a repetition */ 1172 n = -1; m = -1; 1173 commafound = false; 1174 digitfound = false; 1175 startreptok = prestr-1; 1176 /* Remember start of previous atom here ? */ 1177 } else { /* just a { char, not a repetition */ 1178 rlxval = c; 1179 return CHAR; 1180 } 1181 for (; ; ) { 1182 if ((c = *prestr++) == '}') { 1183 if (commafound) { 1184 if (digitfound) { /* {n,m} */ 1185 m = num; 1186 if (m < n) 1187 FATAL("illegal repetition expression: class %.20s", 1188 lastre); 1189 if (n == 0 && m == 1) { 1190 return QUEST; 1191 } 1192 } else { /* {n,} */ 1193 if (n == 0) 1194 return STAR; 1195 else if (n == 1) 1196 return PLUS; 1197 } 1198 } else { 1199 if (digitfound) { /* {n} same as {n,n} */ 1200 n = num; 1201 m = num; 1202 } else { /* {} */ 1203 FATAL("illegal repetition expression: class %.20s", 1204 lastre); 1205 } 1206 } 1207 if (repeat(starttok, prestr-starttok, lastatom, 1208 startreptok - lastatom, n, m) > 0) { 1209 if (n == 0 && m == 0) { 1210 return ZERO; 1211 } 1212 /* must rescan input for next token */ 1213 goto rescan; 1214 } 1215 /* Failed to replace: eat up {...} characters 1216 and treat like just PLUS */ 1217 return PLUS; 1218 } else if (c == '\0') { 1219 FATAL("nonterminated character class %.20s", 1220 lastre); 1221 } else if (isdigit(c)) { 1222 num = 10 * num + c - '0'; 1223 digitfound = true; 1224 } else if (c == ',') { 1225 if (commafound) 1226 FATAL("illegal repetition expression: class %.20s", 1227 lastre); 1228 /* looking for {n,} or {n,m} */ 1229 commafound = true; 1230 n = num; 1231 digitfound = false; /* reset */ 1232 num = 0; 1233 } else { 1234 FATAL("illegal repetition expression: class %.20s", 1235 lastre); 1236 } 1237 } 1238 break; 1239 } 1240 } 1241 1242 int cgoto(fa *f, int s, int c) 1243 { 1244 int *p, *q; 1245 int i, j, k; 1246 1247 assert(c == HAT || c < NCHARS); 1248 while (f->accept >= maxsetvec) { /* guessing here! */ 1249 resizesetvec(__func__); 1250 } 1251 for (i = 0; i <= f->accept; i++) 1252 setvec[i] = 0; 1253 setcnt = 0; 1254 resize_state(f, s); 1255 /* compute positions of gototab[s,c] into setvec */ 1256 p = f->posns[s]; 1257 for (i = 1; i <= *p; i++) { 1258 if ((k = f->re[p[i]].ltype) != FINAL) { 1259 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1260 || (k == DOT && c != 0 && c != HAT) 1261 || (k == ALL && c != 0) 1262 || (k == EMPTYRE && c != 0) 1263 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 1264 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 1265 q = f->re[p[i]].lfollow; 1266 for (j = 1; j <= *q; j++) { 1267 if (q[j] >= maxsetvec) { 1268 resizesetvec(__func__); 1269 } 1270 if (setvec[q[j]] == 0) { 1271 setcnt++; 1272 setvec[q[j]] = 1; 1273 } 1274 } 1275 } 1276 } 1277 } 1278 /* determine if setvec is a previous state */ 1279 tmpset[0] = setcnt; 1280 j = 1; 1281 for (i = f->accept; i >= 0; i--) 1282 if (setvec[i]) { 1283 tmpset[j++] = i; 1284 } 1285 resize_state(f, f->curstat > s ? f->curstat : s); 1286 /* tmpset == previous state? */ 1287 for (i = 1; i <= f->curstat; i++) { 1288 p = f->posns[i]; 1289 if ((k = tmpset[0]) != p[0]) 1290 goto different; 1291 for (j = 1; j <= k; j++) 1292 if (tmpset[j] != p[j]) 1293 goto different; 1294 /* setvec is state i */ 1295 if (c != HAT) 1296 f->gototab[s][c] = i; 1297 return i; 1298 different:; 1299 } 1300 1301 /* add tmpset to current set of states */ 1302 ++(f->curstat); 1303 resize_state(f, f->curstat); 1304 for (i = 0; i < NCHARS; i++) 1305 f->gototab[f->curstat][i] = 0; 1306 xfree(f->posns[f->curstat]); 1307 p = intalloc(setcnt + 1, __func__); 1308 1309 f->posns[f->curstat] = p; 1310 if (c != HAT) 1311 f->gototab[s][c] = f->curstat; 1312 for (i = 0; i <= setcnt; i++) 1313 p[i] = tmpset[i]; 1314 if (setvec[f->accept]) 1315 f->out[f->curstat] = 1; 1316 else 1317 f->out[f->curstat] = 0; 1318 return f->curstat; 1319 } 1320 1321 1322 void freefa(fa *f) /* free a finite automaton */ 1323 { 1324 int i; 1325 1326 if (f == NULL) 1327 return; 1328 for (i = 0; i < f->state_count; i++) 1329 xfree(f->gototab[i]) 1330 for (i = 0; i <= f->curstat; i++) 1331 xfree(f->posns[i]); 1332 for (i = 0; i <= f->accept; i++) { 1333 xfree(f->re[i].lfollow); 1334 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1335 xfree(f->re[i].lval.np); 1336 } 1337 xfree(f->restr); 1338 xfree(f->out); 1339 xfree(f->posns); 1340 xfree(f->gototab); 1341 xfree(f); 1342 } 1343