1 /* $OpenBSD: b.c,v 1.33 2020/07/13 14:03:52 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 /* lasciate ogne speranza, voi ch'intrate. */ 27 28 #define DEBUG 29 30 #include <ctype.h> 31 #include <limits.h> 32 #include <stdio.h> 33 #include <string.h> 34 #include <stdlib.h> 35 #include "awk.h" 36 #include "ytab.h" 37 38 #define MAXLIN 22 39 40 #define type(v) (v)->nobj /* badly overloaded here */ 41 #define info(v) (v)->ntype /* badly overloaded here */ 42 #define left(v) (v)->narg[0] 43 #define right(v) (v)->narg[1] 44 #define parent(v) (v)->nnext 45 46 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 47 #define ELEAF case EMPTYRE: /* empty string in regexp */ 48 #define UNARY case STAR: case PLUS: case QUEST: 49 50 /* encoding in tree Nodes: 51 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 52 left is index, right contains value or pointer to value 53 unary (STAR, PLUS, QUEST): left is child, right is null 54 binary (CAT, OR): left and right are children 55 parent contains pointer to parent 56 */ 57 58 59 int *setvec; 60 int *tmpset; 61 int maxsetvec = 0; 62 63 int rtok; /* next token in current re */ 64 int rlxval; 65 static const uschar *rlxstr; 66 static const uschar *prestr; /* current position in current re */ 67 static const uschar *lastre; /* origin of last re */ 68 static const uschar *lastatom; /* origin of last Atom */ 69 static const uschar *starttok; 70 static const uschar *basestr; /* starts with original, replaced during 71 repetition processing */ 72 static const uschar *firstbasestr; 73 74 static int setcnt; 75 static int poscnt; 76 77 const char *patbeg; 78 int patlen; 79 80 #define NFA 128 /* cache this many dynamic fa's */ 81 fa *fatab[NFA]; 82 int nfatab = 0; /* entries in fatab */ 83 84 static int * 85 intalloc(size_t n, const char *f) 86 { 87 void *p = calloc(n, sizeof(int)); 88 if (p == NULL) 89 overflo(f); 90 return p; 91 } 92 93 static void 94 allocsetvec(const char *f) 95 { 96 maxsetvec = MAXLIN; 97 setvec = reallocarray(setvec, maxsetvec, sizeof(*setvec)); 98 tmpset = reallocarray(tmpset, maxsetvec, sizeof(*tmpset)); 99 if (setvec == NULL || tmpset == NULL) 100 overflo(f); 101 } 102 103 static void 104 resizesetvec(const char *f) 105 { 106 setvec = reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec)); 107 tmpset = reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset)); 108 if (setvec == NULL || tmpset == NULL) 109 overflo(f); 110 maxsetvec *= 4; 111 } 112 113 static void 114 resize_state(fa *f, int state) 115 { 116 void *p; 117 int i, new_count; 118 119 if (++state < f->state_count) 120 return; 121 122 new_count = state + 10; /* needs to be tuned */ 123 124 p = reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); 125 if (p == NULL) 126 goto out; 127 f->gototab = p; 128 129 p = reallocarray(f->out, new_count, sizeof(f->out[0])); 130 if (p == NULL) 131 goto out; 132 f->out = p; 133 134 p = reallocarray(f->posns, new_count, sizeof(f->posns[0])); 135 if (p == NULL) 136 goto out; 137 f->posns = p; 138 139 for (i = f->state_count; i < new_count; ++i) { 140 f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab)); 141 if (f->gototab[i] == NULL) 142 goto out; 143 f->out[i] = 0; 144 f->posns[i] = NULL; 145 } 146 f->state_count = new_count; 147 return; 148 out: 149 overflo(__func__); 150 } 151 152 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 153 { 154 int i, use, nuse; 155 fa *pfa; 156 static int now = 1; 157 158 if (setvec == NULL) { /* first time through any RE */ 159 allocsetvec(__func__); 160 } 161 162 if (compile_time != RUNNING) /* a constant for sure */ 163 return mkdfa(s, anchor); 164 for (i = 0; i < nfatab; i++) /* is it there already? */ 165 if (fatab[i]->anchor == anchor 166 && strcmp((const char *) fatab[i]->restr, s) == 0) { 167 fatab[i]->use = now++; 168 return fatab[i]; 169 } 170 pfa = mkdfa(s, anchor); 171 if (nfatab < NFA) { /* room for another */ 172 fatab[nfatab] = pfa; 173 fatab[nfatab]->use = now++; 174 nfatab++; 175 return pfa; 176 } 177 use = fatab[0]->use; /* replace least-recently used */ 178 nuse = 0; 179 for (i = 1; i < nfatab; i++) 180 if (fatab[i]->use < use) { 181 use = fatab[i]->use; 182 nuse = i; 183 } 184 freefa(fatab[nuse]); 185 fatab[nuse] = pfa; 186 pfa->use = now++; 187 return pfa; 188 } 189 190 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 191 /* anchor = true for anchored matches, else false */ 192 { 193 Node *p, *p1; 194 fa *f; 195 196 firstbasestr = (const uschar *) s; 197 basestr = firstbasestr; 198 p = reparse(s); 199 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 200 /* put ALL STAR in front of reg. exp. */ 201 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 202 /* put FINAL after reg. exp. */ 203 204 poscnt = 0; 205 penter(p1); /* enter parent pointers and leaf indices */ 206 if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 207 overflo(__func__); 208 f->accept = poscnt-1; /* penter has computed number of positions in re */ 209 cfoll(f, p1); /* set up follow sets */ 210 freetr(p1); 211 resize_state(f, 1); 212 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 213 f->posns[1] = intalloc(1, __func__); 214 *f->posns[1] = 0; 215 f->initstat = makeinit(f, anchor); 216 f->anchor = anchor; 217 f->restr = (uschar *) tostring(s); 218 if (firstbasestr != basestr) { 219 if (basestr) 220 xfree(basestr); 221 } 222 return f; 223 } 224 225 int makeinit(fa *f, bool anchor) 226 { 227 int i, k; 228 229 f->curstat = 2; 230 f->out[2] = 0; 231 k = *(f->re[0].lfollow); 232 xfree(f->posns[2]); 233 f->posns[2] = intalloc(k + 1, __func__); 234 for (i = 0; i <= k; i++) { 235 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 236 } 237 if ((f->posns[2])[1] == f->accept) 238 f->out[2] = 1; 239 for (i = 0; i < NCHARS; i++) 240 f->gototab[2][i] = 0; 241 f->curstat = cgoto(f, 2, HAT); 242 if (anchor) { 243 *f->posns[2] = k-1; /* leave out position 0 */ 244 for (i = 0; i < k; i++) { 245 (f->posns[0])[i] = (f->posns[2])[i]; 246 } 247 248 f->out[0] = f->out[2]; 249 if (f->curstat != 2) 250 --(*f->posns[f->curstat]); 251 } 252 return f->curstat; 253 } 254 255 void penter(Node *p) /* set up parent pointers and leaf indices */ 256 { 257 switch (type(p)) { 258 ELEAF 259 LEAF 260 info(p) = poscnt; 261 poscnt++; 262 break; 263 UNARY 264 penter(left(p)); 265 parent(left(p)) = p; 266 break; 267 case CAT: 268 case OR: 269 penter(left(p)); 270 penter(right(p)); 271 parent(left(p)) = p; 272 parent(right(p)) = p; 273 break; 274 case ZERO: 275 break; 276 default: /* can't happen */ 277 FATAL("can't happen: unknown type %d in penter", type(p)); 278 break; 279 } 280 } 281 282 void freetr(Node *p) /* free parse tree */ 283 { 284 switch (type(p)) { 285 ELEAF 286 LEAF 287 xfree(p); 288 break; 289 UNARY 290 case ZERO: 291 freetr(left(p)); 292 xfree(p); 293 break; 294 case CAT: 295 case OR: 296 freetr(left(p)); 297 freetr(right(p)); 298 xfree(p); 299 break; 300 default: /* can't happen */ 301 FATAL("can't happen: unknown type %d in freetr", type(p)); 302 break; 303 } 304 } 305 306 /* in the parsing of regular expressions, metacharacters like . have */ 307 /* to be seen literally; \056 is not a metacharacter. */ 308 309 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ 310 { /* only pick up one 8-bit byte (2 chars) */ 311 const uschar *p; 312 int n = 0; 313 int i; 314 315 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { 316 if (isdigit(*p)) 317 n = 16 * n + *p - '0'; 318 else if (*p >= 'a' && *p <= 'f') 319 n = 16 * n + *p - 'a' + 10; 320 else if (*p >= 'A' && *p <= 'F') 321 n = 16 * n + *p - 'A' + 10; 322 } 323 *pp = p; 324 return n; 325 } 326 327 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 328 329 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 330 /* and increment *pp */ 331 { 332 const uschar *p = *pp; 333 int c; 334 335 if ((c = *p++) == 't') 336 c = '\t'; 337 else if (c == 'n') 338 c = '\n'; 339 else if (c == 'f') 340 c = '\f'; 341 else if (c == 'r') 342 c = '\r'; 343 else if (c == 'b') 344 c = '\b'; 345 else if (c == 'v') 346 c = '\v'; 347 else if (c == 'a') 348 c = '\a'; 349 else if (c == '\\') 350 c = '\\'; 351 else if (c == 'x') { /* hexadecimal goo follows */ 352 c = hexstr(&p); /* this adds a null if number is invalid */ 353 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 354 int n = c - '0'; 355 if (isoctdigit(*p)) { 356 n = 8 * n + *p++ - '0'; 357 if (isoctdigit(*p)) 358 n = 8 * n + *p++ - '0'; 359 } 360 c = n; 361 } /* else */ 362 /* c = c; */ 363 *pp = p; 364 return c; 365 } 366 367 char *cclenter(const char *argp) /* add a character class */ 368 { 369 int i, c, c2; 370 const uschar *op, *p = (const uschar *) argp; 371 uschar *bp; 372 static uschar *buf = NULL; 373 static int bufsz = 100; 374 375 op = p; 376 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 377 FATAL("out of space for character class [%.10s...] 1", p); 378 bp = buf; 379 for (i = 0; (c = *p++) != 0; ) { 380 if (c == '\\') { 381 c = quoted(&p); 382 } else if (c == '-' && i > 0 && bp[-1] != 0) { 383 if (*p != 0) { 384 c = bp[-1]; 385 c2 = *p++; 386 if (c2 == '\\') 387 c2 = quoted(&p); 388 if (c > c2) { /* empty; ignore */ 389 bp--; 390 i--; 391 continue; 392 } 393 while (c < c2) { 394 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 395 FATAL("out of space for character class [%.10s...] 2", p); 396 *bp++ = ++c; 397 i++; 398 } 399 continue; 400 } 401 } 402 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 403 FATAL("out of space for character class [%.10s...] 3", p); 404 *bp++ = c; 405 i++; 406 } 407 *bp = 0; 408 DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); 409 xfree(op); 410 return (char *) tostring((char *) buf); 411 } 412 413 void overflo(const char *s) 414 { 415 FATAL("regular expression too big: out of space in %.30s...", s); 416 } 417 418 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 419 { 420 int i; 421 int *p; 422 423 switch (type(v)) { 424 ELEAF 425 LEAF 426 f->re[info(v)].ltype = type(v); 427 f->re[info(v)].lval.np = right(v); 428 while (f->accept >= maxsetvec) { /* guessing here! */ 429 resizesetvec(__func__); 430 } 431 for (i = 0; i <= f->accept; i++) 432 setvec[i] = 0; 433 setcnt = 0; 434 follow(v); /* computes setvec and setcnt */ 435 p = intalloc(setcnt + 1, __func__); 436 f->re[info(v)].lfollow = p; 437 *p = setcnt; 438 for (i = f->accept; i >= 0; i--) 439 if (setvec[i] == 1) 440 *++p = i; 441 break; 442 UNARY 443 cfoll(f,left(v)); 444 break; 445 case CAT: 446 case OR: 447 cfoll(f,left(v)); 448 cfoll(f,right(v)); 449 break; 450 case ZERO: 451 break; 452 default: /* can't happen */ 453 FATAL("can't happen: unknown type %d in cfoll", type(v)); 454 } 455 } 456 457 int first(Node *p) /* collects initially active leaves of p into setvec */ 458 /* returns 0 if p matches empty string */ 459 { 460 int b, lp; 461 462 switch (type(p)) { 463 ELEAF 464 LEAF 465 lp = info(p); /* look for high-water mark of subscripts */ 466 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 467 resizesetvec(__func__); 468 } 469 if (type(p) == EMPTYRE) { 470 setvec[lp] = 0; 471 return(0); 472 } 473 if (setvec[lp] != 1) { 474 setvec[lp] = 1; 475 setcnt++; 476 } 477 if (type(p) == CCL && (*(char *) right(p)) == '\0') 478 return(0); /* empty CCL */ 479 return(1); 480 case PLUS: 481 if (first(left(p)) == 0) 482 return(0); 483 return(1); 484 case STAR: 485 case QUEST: 486 first(left(p)); 487 return(0); 488 case CAT: 489 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 490 return(1); 491 case OR: 492 b = first(right(p)); 493 if (first(left(p)) == 0 || b == 0) return(0); 494 return(1); 495 case ZERO: 496 return 0; 497 } 498 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 499 return(-1); 500 } 501 502 void follow(Node *v) /* collects leaves that can follow v into setvec */ 503 { 504 Node *p; 505 506 if (type(v) == FINAL) 507 return; 508 p = parent(v); 509 switch (type(p)) { 510 case STAR: 511 case PLUS: 512 first(v); 513 follow(p); 514 return; 515 516 case OR: 517 case QUEST: 518 follow(p); 519 return; 520 521 case CAT: 522 if (v == left(p)) { /* v is left child of p */ 523 if (first(right(p)) == 0) { 524 follow(p); 525 return; 526 } 527 } else /* v is right child */ 528 follow(p); 529 return; 530 } 531 } 532 533 int member(int c, const char *sarg) /* is c in s? */ 534 { 535 const uschar *s = (const uschar *) sarg; 536 537 while (*s) 538 if (c == *s++) 539 return(1); 540 return(0); 541 } 542 543 int match(fa *f, const char *p0) /* shortest match ? */ 544 { 545 int s, ns; 546 const uschar *p = (const uschar *) p0; 547 548 s = f->initstat; 549 assert (s < f->state_count); 550 551 if (f->out[s]) 552 return(1); 553 do { 554 /* assert(*p < NCHARS); */ 555 if ((ns = f->gototab[s][*p]) != 0) 556 s = ns; 557 else 558 s = cgoto(f, s, *p); 559 if (f->out[s]) 560 return(1); 561 } while (*p++ != 0); 562 return(0); 563 } 564 565 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 566 { 567 int s, ns; 568 const uschar *p = (const uschar *) p0; 569 const uschar *q; 570 571 s = f->initstat; 572 assert(s < f->state_count); 573 574 patbeg = (const char *)p; 575 patlen = -1; 576 do { 577 q = p; 578 do { 579 if (f->out[s]) /* final state */ 580 patlen = q-p; 581 /* assert(*q < NCHARS); */ 582 if ((ns = f->gototab[s][*q]) != 0) 583 s = ns; 584 else 585 s = cgoto(f, s, *q); 586 587 assert(s < f->state_count); 588 589 if (s == 1) { /* no transition */ 590 if (patlen >= 0) { 591 patbeg = (const char *) p; 592 return(1); 593 } 594 else 595 goto nextin; /* no match */ 596 } 597 } while (*q++ != 0); 598 if (f->out[s]) 599 patlen = q-p-1; /* don't count $ */ 600 if (patlen >= 0) { 601 patbeg = (const char *) p; 602 return(1); 603 } 604 nextin: 605 s = 2; 606 } while (*p++); 607 return (0); 608 } 609 610 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 611 { 612 int s, ns; 613 const uschar *p = (const uschar *) p0; 614 const uschar *q; 615 616 s = f->initstat; 617 assert(s < f->state_count); 618 619 patbeg = (const char *)p; 620 patlen = -1; 621 while (*p) { 622 q = p; 623 do { 624 if (f->out[s]) /* final state */ 625 patlen = q-p; 626 /* assert(*q < NCHARS); */ 627 if ((ns = f->gototab[s][*q]) != 0) 628 s = ns; 629 else 630 s = cgoto(f, s, *q); 631 if (s == 1) { /* no transition */ 632 if (patlen > 0) { 633 patbeg = (const char *) p; 634 return(1); 635 } else 636 goto nnextin; /* no nonempty match */ 637 } 638 } while (*q++ != 0); 639 if (f->out[s]) 640 patlen = q-p-1; /* don't count $ */ 641 if (patlen > 0 ) { 642 patbeg = (const char *) p; 643 return(1); 644 } 645 nnextin: 646 s = 2; 647 p++; 648 } 649 return (0); 650 } 651 652 653 /* 654 * NAME 655 * fnematch 656 * 657 * DESCRIPTION 658 * A stream-fed version of nematch which transfers characters to a 659 * null-terminated buffer. All characters up to and including the last 660 * character of the matching text or EOF are placed in the buffer. If 661 * a match is found, patbeg and patlen are set appropriately. 662 * 663 * RETURN VALUES 664 * false No match found. 665 * true Match found. 666 */ 667 668 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 669 { 670 char *buf = *pbuf; 671 int bufsize = *pbufsize; 672 int c, i, j, k, ns, s; 673 674 s = pfa->initstat; 675 patlen = 0; 676 677 /* 678 * All indices relative to buf. 679 * i <= j <= k <= bufsize 680 * 681 * i: origin of active substring 682 * j: current character 683 * k: destination of next getc() 684 */ 685 i = -1, k = 0; 686 do { 687 j = i++; 688 do { 689 if (++j == k) { 690 if (k == bufsize) 691 if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) 692 FATAL("stream '%.30s...' too long", buf); 693 buf[k++] = (c = getc(f)) != EOF ? c : 0; 694 } 695 c = (uschar)buf[j]; 696 /* assert(c < NCHARS); */ 697 698 if ((ns = pfa->gototab[s][c]) != 0) 699 s = ns; 700 else 701 s = cgoto(pfa, s, c); 702 703 if (pfa->out[s]) { /* final state */ 704 patlen = j - i + 1; 705 if (c == 0) /* don't count $ */ 706 patlen--; 707 } 708 } while (buf[j] && s != 1); 709 s = 2; 710 } while (buf[i] && !patlen); 711 712 /* adjbuf() may have relocated a resized buffer. Inform the world. */ 713 *pbuf = buf; 714 *pbufsize = bufsize; 715 716 if (patlen) { 717 patbeg = buf + i; 718 /* 719 * Under no circumstances is the last character fed to 720 * the automaton part of the match. It is EOF's nullbyte, 721 * or it sent the automaton into a state with no further 722 * transitions available (s==1), or both. Room for a 723 * terminating nullbyte is guaranteed. 724 * 725 * ungetc any chars after the end of matching text 726 * (except for EOF's nullbyte, if present) and null 727 * terminate the buffer. 728 */ 729 do 730 if (buf[--k] && ungetc(buf[k], f) == EOF) 731 FATAL("unable to ungetc '%c'", buf[k]); 732 while (k > i + patlen); 733 buf[k] = '\0'; 734 return true; 735 } 736 else 737 return false; 738 } 739 740 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 741 { /* uses relex() to scan regular expression */ 742 Node *np; 743 744 DPRINTF("reparse <%s>\n", p); 745 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 746 rtok = relex(); 747 /* GNU compatibility: an empty regexp matches anything */ 748 if (rtok == '\0') { 749 /* FATAL("empty regular expression"); previous */ 750 return(op2(EMPTYRE, NIL, NIL)); 751 } 752 np = regexp(); 753 if (rtok != '\0') 754 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 755 return(np); 756 } 757 758 Node *regexp(void) /* top-level parse of reg expr */ 759 { 760 return (alt(concat(primary()))); 761 } 762 763 Node *primary(void) 764 { 765 Node *np; 766 int savelastatom; 767 768 switch (rtok) { 769 case CHAR: 770 lastatom = starttok; 771 np = op2(CHAR, NIL, itonp(rlxval)); 772 rtok = relex(); 773 return (unary(np)); 774 case ALL: 775 rtok = relex(); 776 return (unary(op2(ALL, NIL, NIL))); 777 case EMPTYRE: 778 rtok = relex(); 779 return (unary(op2(EMPTYRE, NIL, NIL))); 780 case DOT: 781 lastatom = starttok; 782 rtok = relex(); 783 return (unary(op2(DOT, NIL, NIL))); 784 case CCL: 785 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 786 lastatom = starttok; 787 rtok = relex(); 788 return (unary(np)); 789 case NCCL: 790 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 791 lastatom = starttok; 792 rtok = relex(); 793 return (unary(np)); 794 case '^': 795 rtok = relex(); 796 return (unary(op2(CHAR, NIL, itonp(HAT)))); 797 case '$': 798 rtok = relex(); 799 return (unary(op2(CHAR, NIL, NIL))); 800 case '(': 801 lastatom = starttok; 802 savelastatom = starttok - basestr; /* Retain over recursion */ 803 rtok = relex(); 804 if (rtok == ')') { /* special pleading for () */ 805 rtok = relex(); 806 return unary(op2(CCL, NIL, (Node *) tostring(""))); 807 } 808 np = regexp(); 809 if (rtok == ')') { 810 lastatom = basestr + savelastatom; /* Restore */ 811 rtok = relex(); 812 return (unary(np)); 813 } 814 else 815 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 816 default: 817 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 818 } 819 return 0; /*NOTREACHED*/ 820 } 821 822 Node *concat(Node *np) 823 { 824 switch (rtok) { 825 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 826 return (concat(op2(CAT, np, primary()))); 827 case EMPTYRE: 828 rtok = relex(); 829 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 830 primary()))); 831 } 832 return (np); 833 } 834 835 Node *alt(Node *np) 836 { 837 if (rtok == OR) { 838 rtok = relex(); 839 return (alt(op2(OR, np, concat(primary())))); 840 } 841 return (np); 842 } 843 844 Node *unary(Node *np) 845 { 846 switch (rtok) { 847 case STAR: 848 rtok = relex(); 849 return (unary(op2(STAR, np, NIL))); 850 case PLUS: 851 rtok = relex(); 852 return (unary(op2(PLUS, np, NIL))); 853 case QUEST: 854 rtok = relex(); 855 return (unary(op2(QUEST, np, NIL))); 856 case ZERO: 857 rtok = relex(); 858 return (unary(op2(ZERO, np, NIL))); 859 default: 860 return (np); 861 } 862 } 863 864 /* 865 * Character class definitions conformant to the POSIX locale as 866 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 867 * and operating character sets are both ASCII (ISO646) or supersets 868 * thereof. 869 * 870 * Note that to avoid overflowing the temporary buffer used in 871 * relex(), the expanded character class (prior to range expansion) 872 * must be less than twice the size of their full name. 873 */ 874 875 /* Because isblank doesn't show up in any of the header files on any 876 * system i use, it's defined here. if some other locale has a richer 877 * definition of "blank", define HAS_ISBLANK and provide your own 878 * version. 879 * the parentheses here are an attempt to find a path through the maze 880 * of macro definition and/or function and/or version provided. thanks 881 * to nelson beebe for the suggestion; let's see if it works everywhere. 882 */ 883 884 #ifndef HAS_ISBLANK 885 886 int (xisblank)(int c) 887 { 888 return c==' ' || c=='\t'; 889 } 890 891 #endif 892 893 static const struct charclass { 894 const char *cc_name; 895 int cc_namelen; 896 int (*cc_func)(int); 897 } charclasses[] = { 898 { "alnum", 5, isalnum }, 899 { "alpha", 5, isalpha }, 900 #ifndef HAS_ISBLANK 901 { "blank", 5, xisblank }, 902 #else 903 { "blank", 5, isblank }, 904 #endif 905 { "cntrl", 5, iscntrl }, 906 { "digit", 5, isdigit }, 907 { "graph", 5, isgraph }, 908 { "lower", 5, islower }, 909 { "print", 5, isprint }, 910 { "punct", 5, ispunct }, 911 { "space", 5, isspace }, 912 { "upper", 5, isupper }, 913 { "xdigit", 6, isxdigit }, 914 { NULL, 0, NULL }, 915 }; 916 917 #define REPEAT_SIMPLE 0 918 #define REPEAT_PLUS_APPENDED 1 919 #define REPEAT_WITH_Q 2 920 #define REPEAT_ZERO 3 921 922 static int 923 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 924 int atomlen, int firstnum, int secondnum, int special_case) 925 { 926 int i, j; 927 uschar *buf = NULL; 928 int ret = 1; 929 int init_q = (firstnum == 0); /* first added char will be ? */ 930 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 931 int prefix_length = reptok - basestr; /* prefix includes first rep */ 932 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 933 int size = prefix_length + suffix_length; 934 935 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 936 size += atomlen*(firstnum-1); 937 } 938 939 /* Adjust size of buffer for special cases */ 940 if (special_case == REPEAT_PLUS_APPENDED) { 941 size++; /* for the final + */ 942 } else if (special_case == REPEAT_WITH_Q) { 943 size += init_q + (atomlen+1)* n_q_reps; 944 } else if (special_case == REPEAT_ZERO) { 945 size += 2; /* just a null ERE: () */ 946 } 947 if ((buf = malloc(size + 1)) == NULL) 948 FATAL("out of space in reg expr %.10s..", lastre); 949 memcpy(buf, basestr, prefix_length); /* copy prefix */ 950 j = prefix_length; 951 if (special_case == REPEAT_ZERO) { 952 j -= atomlen; 953 buf[j++] = '('; 954 buf[j++] = ')'; 955 } 956 for (i = 1; i < firstnum; i++) { /* copy x reps */ 957 memcpy(&buf[j], atom, atomlen); 958 j += atomlen; 959 } 960 if (special_case == REPEAT_PLUS_APPENDED) { 961 buf[j++] = '+'; 962 } else if (special_case == REPEAT_WITH_Q) { 963 if (init_q) 964 buf[j++] = '?'; 965 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 966 memcpy(&buf[j], atom, atomlen); 967 j += atomlen; 968 buf[j++] = '?'; 969 } 970 } 971 memcpy(&buf[j], reptok+reptoklen, suffix_length); 972 if (special_case == REPEAT_ZERO) { 973 buf[j+suffix_length] = '\0'; 974 } else { 975 buf[size] = '\0'; 976 } 977 /* free old basestr */ 978 if (firstbasestr != basestr) { 979 if (basestr) 980 xfree(basestr); 981 } 982 basestr = buf; 983 prestr = buf + prefix_length; 984 if (special_case == REPEAT_ZERO) { 985 prestr -= atomlen; 986 ret++; 987 } 988 return ret; 989 } 990 991 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 992 int atomlen, int firstnum, int secondnum) 993 { 994 /* 995 In general, the repetition specifier or "bound" is replaced here 996 by an equivalent ERE string, repeating the immediately previous atom 997 and appending ? and + as needed. Note that the first copy of the 998 atom is left in place, except in the special_case of a zero-repeat 999 (i.e., {0}). 1000 */ 1001 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1002 if (firstnum < 2) { 1003 /* 0 or 1: should be handled before you get here */ 1004 FATAL("internal error"); 1005 } else { 1006 return replace_repeat(reptok, reptoklen, atom, atomlen, 1007 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1008 } 1009 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1010 if (firstnum == 0) { /* {0} or {0,0} */ 1011 /* This case is unusual because the resulting 1012 replacement string might actually be SMALLER than 1013 the original ERE */ 1014 return replace_repeat(reptok, reptoklen, atom, atomlen, 1015 firstnum, secondnum, REPEAT_ZERO); 1016 } else { /* (firstnum >= 1) */ 1017 return replace_repeat(reptok, reptoklen, atom, atomlen, 1018 firstnum, secondnum, REPEAT_SIMPLE); 1019 } 1020 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1021 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1022 return replace_repeat(reptok, reptoklen, atom, atomlen, 1023 firstnum, secondnum, REPEAT_WITH_Q); 1024 } else { /* Error - shouldn't be here (n>m) */ 1025 FATAL("internal error"); 1026 } 1027 return 0; 1028 } 1029 1030 int relex(void) /* lexical analyzer for reparse */ 1031 { 1032 int c, n; 1033 int cflag; 1034 static uschar *buf = NULL; 1035 static int bufsz = 100; 1036 uschar *bp; 1037 const struct charclass *cc; 1038 int i; 1039 int num, m; 1040 bool commafound, digitfound; 1041 const uschar *startreptok; 1042 static int parens = 0; 1043 1044 rescan: 1045 starttok = prestr; 1046 1047 switch (c = *prestr++) { 1048 case '|': return OR; 1049 case '*': return STAR; 1050 case '+': return PLUS; 1051 case '?': return QUEST; 1052 case '.': return DOT; 1053 case '\0': prestr--; return '\0'; 1054 case '^': 1055 case '$': 1056 return c; 1057 case '(': 1058 parens++; 1059 return c; 1060 case ')': 1061 if (parens) { 1062 parens--; 1063 return c; 1064 } 1065 /* unmatched close parenthesis; per POSIX, treat as literal */ 1066 rlxval = c; 1067 return CHAR; 1068 case '\\': 1069 rlxval = quoted(&prestr); 1070 return CHAR; 1071 default: 1072 rlxval = c; 1073 return CHAR; 1074 case '[': 1075 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 1076 FATAL("out of space in reg expr %.10s..", lastre); 1077 bp = buf; 1078 if (*prestr == '^') { 1079 cflag = 1; 1080 prestr++; 1081 } 1082 else 1083 cflag = 0; 1084 n = 2 * strlen((const char *) prestr)+1; 1085 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1086 FATAL("out of space for reg expr %.10s...", lastre); 1087 for (; ; ) { 1088 if ((c = *prestr++) == '\\') { 1089 *bp++ = '\\'; 1090 if ((c = *prestr++) == '\0') 1091 FATAL("nonterminated character class %.20s...", lastre); 1092 *bp++ = c; 1093 /* } else if (c == '\n') { */ 1094 /* FATAL("newline in character class %.20s...", lastre); */ 1095 } else if (c == '[' && *prestr == ':') { 1096 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1097 for (cc = charclasses; cc->cc_name; cc++) 1098 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1099 break; 1100 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1101 prestr[2 + cc->cc_namelen] == ']') { 1102 prestr += cc->cc_namelen + 3; 1103 /* 1104 * BUG: We begin at 1, instead of 0, since we 1105 * would otherwise prematurely terminate the 1106 * string for classes like [[:cntrl:]]. This 1107 * means that we can't match the NUL character, 1108 * not without first adapting the entire 1109 * program to track each string's length. 1110 */ 1111 for (i = 1; i <= UCHAR_MAX; i++) { 1112 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 1113 FATAL("out of space for reg expr %.10s...", lastre); 1114 if (cc->cc_func(i)) { 1115 /* escape backslash */ 1116 if (i == '\\') { 1117 *bp++ = '\\'; 1118 n++; 1119 } 1120 1121 *bp++ = i; 1122 n++; 1123 } 1124 } 1125 } else 1126 *bp++ = c; 1127 } else if (c == '[' && *prestr == '.') { 1128 char collate_char; 1129 prestr++; 1130 collate_char = *prestr++; 1131 if (*prestr == '.' && prestr[1] == ']') { 1132 prestr += 2; 1133 /* Found it: map via locale TBD: for 1134 now, simply return this char. This 1135 is sufficient to pass conformance 1136 test awk.ex 156 1137 */ 1138 if (*prestr == ']') { 1139 prestr++; 1140 rlxval = collate_char; 1141 return CHAR; 1142 } 1143 } 1144 } else if (c == '[' && *prestr == '=') { 1145 char equiv_char; 1146 prestr++; 1147 equiv_char = *prestr++; 1148 if (*prestr == '=' && prestr[1] == ']') { 1149 prestr += 2; 1150 /* Found it: map via locale TBD: for now 1151 simply return this char. This is 1152 sufficient to pass conformance test 1153 awk.ex 156 1154 */ 1155 if (*prestr == ']') { 1156 prestr++; 1157 rlxval = equiv_char; 1158 return CHAR; 1159 } 1160 } 1161 } else if (c == '\0') { 1162 FATAL("nonterminated character class %.20s", lastre); 1163 } else if (bp == buf) { /* 1st char is special */ 1164 *bp++ = c; 1165 } else if (c == ']') { 1166 *bp++ = 0; 1167 rlxstr = (uschar *) tostring((char *) buf); 1168 if (cflag == 0) 1169 return CCL; 1170 else 1171 return NCCL; 1172 } else 1173 *bp++ = c; 1174 } 1175 break; 1176 case '{': 1177 if (isdigit(*(prestr))) { 1178 num = 0; /* Process as a repetition */ 1179 n = -1; m = -1; 1180 commafound = false; 1181 digitfound = false; 1182 startreptok = prestr-1; 1183 /* Remember start of previous atom here ? */ 1184 } else { /* just a { char, not a repetition */ 1185 rlxval = c; 1186 return CHAR; 1187 } 1188 for (; ; ) { 1189 if ((c = *prestr++) == '}') { 1190 if (commafound) { 1191 if (digitfound) { /* {n,m} */ 1192 m = num; 1193 if (m < n) 1194 FATAL("illegal repetition expression: class %.20s", 1195 lastre); 1196 if (n == 0 && m == 1) { 1197 return QUEST; 1198 } 1199 } else { /* {n,} */ 1200 if (n == 0) 1201 return STAR; 1202 else if (n == 1) 1203 return PLUS; 1204 } 1205 } else { 1206 if (digitfound) { /* {n} same as {n,n} */ 1207 n = num; 1208 m = num; 1209 } else { /* {} */ 1210 FATAL("illegal repetition expression: class %.20s", 1211 lastre); 1212 } 1213 } 1214 if (repeat(starttok, prestr-starttok, lastatom, 1215 startreptok - lastatom, n, m) > 0) { 1216 if (n == 0 && m == 0) { 1217 return ZERO; 1218 } 1219 /* must rescan input for next token */ 1220 goto rescan; 1221 } 1222 /* Failed to replace: eat up {...} characters 1223 and treat like just PLUS */ 1224 return PLUS; 1225 } else if (c == '\0') { 1226 FATAL("nonterminated character class %.20s", 1227 lastre); 1228 } else if (isdigit(c)) { 1229 num = 10 * num + c - '0'; 1230 digitfound = true; 1231 } else if (c == ',') { 1232 if (commafound) 1233 FATAL("illegal repetition expression: class %.20s", 1234 lastre); 1235 /* looking for {n,} or {n,m} */ 1236 commafound = true; 1237 n = num; 1238 digitfound = false; /* reset */ 1239 num = 0; 1240 } else { 1241 FATAL("illegal repetition expression: class %.20s", 1242 lastre); 1243 } 1244 } 1245 break; 1246 } 1247 } 1248 1249 int cgoto(fa *f, int s, int c) 1250 { 1251 int *p, *q; 1252 int i, j, k; 1253 1254 assert(c == HAT || c < NCHARS); 1255 while (f->accept >= maxsetvec) { /* guessing here! */ 1256 resizesetvec(__func__); 1257 } 1258 for (i = 0; i <= f->accept; i++) 1259 setvec[i] = 0; 1260 setcnt = 0; 1261 resize_state(f, s); 1262 /* compute positions of gototab[s,c] into setvec */ 1263 p = f->posns[s]; 1264 for (i = 1; i <= *p; i++) { 1265 if ((k = f->re[p[i]].ltype) != FINAL) { 1266 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1267 || (k == DOT && c != 0 && c != HAT) 1268 || (k == ALL && c != 0) 1269 || (k == EMPTYRE && c != 0) 1270 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 1271 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 1272 q = f->re[p[i]].lfollow; 1273 for (j = 1; j <= *q; j++) { 1274 if (q[j] >= maxsetvec) { 1275 resizesetvec(__func__); 1276 } 1277 if (setvec[q[j]] == 0) { 1278 setcnt++; 1279 setvec[q[j]] = 1; 1280 } 1281 } 1282 } 1283 } 1284 } 1285 /* determine if setvec is a previous state */ 1286 tmpset[0] = setcnt; 1287 j = 1; 1288 for (i = f->accept; i >= 0; i--) 1289 if (setvec[i]) { 1290 tmpset[j++] = i; 1291 } 1292 resize_state(f, f->curstat > s ? f->curstat : s); 1293 /* tmpset == previous state? */ 1294 for (i = 1; i <= f->curstat; i++) { 1295 p = f->posns[i]; 1296 if ((k = tmpset[0]) != p[0]) 1297 goto different; 1298 for (j = 1; j <= k; j++) 1299 if (tmpset[j] != p[j]) 1300 goto different; 1301 /* setvec is state i */ 1302 if (c != HAT) 1303 f->gototab[s][c] = i; 1304 return i; 1305 different:; 1306 } 1307 1308 /* add tmpset to current set of states */ 1309 ++(f->curstat); 1310 resize_state(f, f->curstat); 1311 for (i = 0; i < NCHARS; i++) 1312 f->gototab[f->curstat][i] = 0; 1313 xfree(f->posns[f->curstat]); 1314 p = intalloc(setcnt + 1, __func__); 1315 1316 f->posns[f->curstat] = p; 1317 if (c != HAT) 1318 f->gototab[s][c] = f->curstat; 1319 for (i = 0; i <= setcnt; i++) 1320 p[i] = tmpset[i]; 1321 if (setvec[f->accept]) 1322 f->out[f->curstat] = 1; 1323 else 1324 f->out[f->curstat] = 0; 1325 return f->curstat; 1326 } 1327 1328 1329 void freefa(fa *f) /* free a finite automaton */ 1330 { 1331 int i; 1332 1333 if (f == NULL) 1334 return; 1335 for (i = 0; i < f->state_count; i++) 1336 xfree(f->gototab[i]) 1337 for (i = 0; i <= f->curstat; i++) 1338 xfree(f->posns[i]); 1339 for (i = 0; i <= f->accept; i++) { 1340 xfree(f->re[i].lfollow); 1341 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1342 xfree(f->re[i].lval.np); 1343 } 1344 xfree(f->restr); 1345 xfree(f->out); 1346 xfree(f->posns); 1347 xfree(f->gototab); 1348 xfree(f); 1349 } 1350