1 /* $OpenBSD: b.c,v 1.35 2020/12/09 20:00:11 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 /* lasciate ogne speranza, voi ch'intrate. */ 27 28 #define DEBUG 29 30 #include <ctype.h> 31 #include <limits.h> 32 #include <stdio.h> 33 #include <string.h> 34 #include <stdlib.h> 35 #include "awk.h" 36 #include "awkgram.tab.h" 37 38 #define MAXLIN 22 39 40 #define type(v) (v)->nobj /* badly overloaded here */ 41 #define info(v) (v)->ntype /* badly overloaded here */ 42 #define left(v) (v)->narg[0] 43 #define right(v) (v)->narg[1] 44 #define parent(v) (v)->nnext 45 46 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 47 #define ELEAF case EMPTYRE: /* empty string in regexp */ 48 #define UNARY case STAR: case PLUS: case QUEST: 49 50 /* encoding in tree Nodes: 51 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 52 left is index, right contains value or pointer to value 53 unary (STAR, PLUS, QUEST): left is child, right is null 54 binary (CAT, OR): left and right are children 55 parent contains pointer to parent 56 */ 57 58 59 int *setvec; 60 int *tmpset; 61 int maxsetvec = 0; 62 63 int rtok; /* next token in current re */ 64 int rlxval; 65 static const uschar *rlxstr; 66 static const uschar *prestr; /* current position in current re */ 67 static const uschar *lastre; /* origin of last re */ 68 static const uschar *lastatom; /* origin of last Atom */ 69 static const uschar *starttok; 70 static const uschar *basestr; /* starts with original, replaced during 71 repetition processing */ 72 static const uschar *firstbasestr; 73 74 static int setcnt; 75 static int poscnt; 76 77 const char *patbeg; 78 int patlen; 79 80 #define NFA 128 /* cache this many dynamic fa's */ 81 fa *fatab[NFA]; 82 int nfatab = 0; /* entries in fatab */ 83 84 static int * 85 intalloc(size_t n, const char *f) 86 { 87 int *p = (int *) calloc(n, sizeof(int)); 88 if (p == NULL) 89 overflo(f); 90 return p; 91 } 92 93 static void 94 allocsetvec(const char *f) 95 { 96 maxsetvec = MAXLIN; 97 setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec)); 98 tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset)); 99 if (setvec == NULL || tmpset == NULL) 100 overflo(f); 101 } 102 103 static void 104 resizesetvec(const char *f) 105 { 106 setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec)); 107 tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset)); 108 if (setvec == NULL || tmpset == NULL) 109 overflo(f); 110 maxsetvec *= 4; 111 } 112 113 static void 114 resize_state(fa *f, int state) 115 { 116 unsigned int **p; 117 uschar *p2; 118 int **p3; 119 int i, new_count; 120 121 if (++state < f->state_count) 122 return; 123 124 new_count = state + 10; /* needs to be tuned */ 125 126 p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); 127 if (p == NULL) 128 goto out; 129 f->gototab = p; 130 131 p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0])); 132 if (p2 == NULL) 133 goto out; 134 f->out = p2; 135 136 p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0])); 137 if (p3 == NULL) 138 goto out; 139 f->posns = p3; 140 141 for (i = f->state_count; i < new_count; ++i) { 142 f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab)); 143 if (f->gototab[i] == NULL) 144 goto out; 145 f->out[i] = 0; 146 f->posns[i] = NULL; 147 } 148 f->state_count = new_count; 149 return; 150 out: 151 overflo(__func__); 152 } 153 154 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 155 { 156 int i, use, nuse; 157 fa *pfa; 158 static int now = 1; 159 160 if (setvec == NULL) { /* first time through any RE */ 161 allocsetvec(__func__); 162 } 163 164 if (compile_time != RUNNING) /* a constant for sure */ 165 return mkdfa(s, anchor); 166 for (i = 0; i < nfatab; i++) /* is it there already? */ 167 if (fatab[i]->anchor == anchor 168 && strcmp((const char *) fatab[i]->restr, s) == 0) { 169 fatab[i]->use = now++; 170 return fatab[i]; 171 } 172 pfa = mkdfa(s, anchor); 173 if (nfatab < NFA) { /* room for another */ 174 fatab[nfatab] = pfa; 175 fatab[nfatab]->use = now++; 176 nfatab++; 177 return pfa; 178 } 179 use = fatab[0]->use; /* replace least-recently used */ 180 nuse = 0; 181 for (i = 1; i < nfatab; i++) 182 if (fatab[i]->use < use) { 183 use = fatab[i]->use; 184 nuse = i; 185 } 186 freefa(fatab[nuse]); 187 fatab[nuse] = pfa; 188 pfa->use = now++; 189 return pfa; 190 } 191 192 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 193 /* anchor = true for anchored matches, else false */ 194 { 195 Node *p, *p1; 196 fa *f; 197 198 firstbasestr = (const uschar *) s; 199 basestr = firstbasestr; 200 p = reparse(s); 201 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 202 /* put ALL STAR in front of reg. exp. */ 203 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 204 /* put FINAL after reg. exp. */ 205 206 poscnt = 0; 207 penter(p1); /* enter parent pointers and leaf indices */ 208 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 209 overflo(__func__); 210 f->accept = poscnt-1; /* penter has computed number of positions in re */ 211 cfoll(f, p1); /* set up follow sets */ 212 freetr(p1); 213 resize_state(f, 1); 214 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 215 f->posns[1] = intalloc(1, __func__); 216 *f->posns[1] = 0; 217 f->initstat = makeinit(f, anchor); 218 f->anchor = anchor; 219 f->restr = (uschar *) tostring(s); 220 if (firstbasestr != basestr) { 221 if (basestr) 222 xfree(basestr); 223 } 224 return f; 225 } 226 227 int makeinit(fa *f, bool anchor) 228 { 229 int i, k; 230 231 f->curstat = 2; 232 f->out[2] = 0; 233 k = *(f->re[0].lfollow); 234 xfree(f->posns[2]); 235 f->posns[2] = intalloc(k + 1, __func__); 236 for (i = 0; i <= k; i++) { 237 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 238 } 239 if ((f->posns[2])[1] == f->accept) 240 f->out[2] = 1; 241 for (i = 0; i < NCHARS; i++) 242 f->gototab[2][i] = 0; 243 f->curstat = cgoto(f, 2, HAT); 244 if (anchor) { 245 *f->posns[2] = k-1; /* leave out position 0 */ 246 for (i = 0; i < k; i++) { 247 (f->posns[0])[i] = (f->posns[2])[i]; 248 } 249 250 f->out[0] = f->out[2]; 251 if (f->curstat != 2) 252 --(*f->posns[f->curstat]); 253 } 254 return f->curstat; 255 } 256 257 void penter(Node *p) /* set up parent pointers and leaf indices */ 258 { 259 switch (type(p)) { 260 ELEAF 261 LEAF 262 info(p) = poscnt; 263 poscnt++; 264 break; 265 UNARY 266 penter(left(p)); 267 parent(left(p)) = p; 268 break; 269 case CAT: 270 case OR: 271 penter(left(p)); 272 penter(right(p)); 273 parent(left(p)) = p; 274 parent(right(p)) = p; 275 break; 276 case ZERO: 277 break; 278 default: /* can't happen */ 279 FATAL("can't happen: unknown type %d in penter", type(p)); 280 break; 281 } 282 } 283 284 void freetr(Node *p) /* free parse tree */ 285 { 286 switch (type(p)) { 287 ELEAF 288 LEAF 289 xfree(p); 290 break; 291 UNARY 292 case ZERO: 293 freetr(left(p)); 294 xfree(p); 295 break; 296 case CAT: 297 case OR: 298 freetr(left(p)); 299 freetr(right(p)); 300 xfree(p); 301 break; 302 default: /* can't happen */ 303 FATAL("can't happen: unknown type %d in freetr", type(p)); 304 break; 305 } 306 } 307 308 /* in the parsing of regular expressions, metacharacters like . have */ 309 /* to be seen literally; \056 is not a metacharacter. */ 310 311 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ 312 { /* only pick up one 8-bit byte (2 chars) */ 313 const uschar *p; 314 int n = 0; 315 int i; 316 317 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { 318 if (isdigit(*p)) 319 n = 16 * n + *p - '0'; 320 else if (*p >= 'a' && *p <= 'f') 321 n = 16 * n + *p - 'a' + 10; 322 else if (*p >= 'A' && *p <= 'F') 323 n = 16 * n + *p - 'A' + 10; 324 } 325 *pp = p; 326 return n; 327 } 328 329 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 330 331 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 332 /* and increment *pp */ 333 { 334 const uschar *p = *pp; 335 int c; 336 337 if ((c = *p++) == 't') 338 c = '\t'; 339 else if (c == 'n') 340 c = '\n'; 341 else if (c == 'f') 342 c = '\f'; 343 else if (c == 'r') 344 c = '\r'; 345 else if (c == 'b') 346 c = '\b'; 347 else if (c == 'v') 348 c = '\v'; 349 else if (c == 'a') 350 c = '\a'; 351 else if (c == '\\') 352 c = '\\'; 353 else if (c == 'x') { /* hexadecimal goo follows */ 354 c = hexstr(&p); /* this adds a null if number is invalid */ 355 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 356 int n = c - '0'; 357 if (isoctdigit(*p)) { 358 n = 8 * n + *p++ - '0'; 359 if (isoctdigit(*p)) 360 n = 8 * n + *p++ - '0'; 361 } 362 c = n; 363 } /* else */ 364 /* c = c; */ 365 *pp = p; 366 return c; 367 } 368 369 char *cclenter(const char *argp) /* add a character class */ 370 { 371 int i, c, c2; 372 const uschar *op, *p = (const uschar *) argp; 373 uschar *bp; 374 static uschar *buf = NULL; 375 static int bufsz = 100; 376 377 op = p; 378 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 379 FATAL("out of space for character class [%.10s...] 1", p); 380 bp = buf; 381 for (i = 0; (c = *p++) != 0; ) { 382 if (c == '\\') { 383 c = quoted(&p); 384 } else if (c == '-' && i > 0 && bp[-1] != 0) { 385 if (*p != 0) { 386 c = bp[-1]; 387 c2 = *p++; 388 if (c2 == '\\') 389 c2 = quoted(&p); 390 if (c > c2) { /* empty; ignore */ 391 bp--; 392 i--; 393 continue; 394 } 395 while (c < c2) { 396 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 397 FATAL("out of space for character class [%.10s...] 2", p); 398 *bp++ = ++c; 399 i++; 400 } 401 continue; 402 } 403 } 404 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 405 FATAL("out of space for character class [%.10s...] 3", p); 406 *bp++ = c; 407 i++; 408 } 409 *bp = 0; 410 DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); 411 xfree(op); 412 return (char *) tostring((char *) buf); 413 } 414 415 void overflo(const char *s) 416 { 417 FATAL("regular expression too big: out of space in %.30s...", s); 418 } 419 420 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 421 { 422 int i; 423 int *p; 424 425 switch (type(v)) { 426 ELEAF 427 LEAF 428 f->re[info(v)].ltype = type(v); 429 f->re[info(v)].lval.np = right(v); 430 while (f->accept >= maxsetvec) { /* guessing here! */ 431 resizesetvec(__func__); 432 } 433 for (i = 0; i <= f->accept; i++) 434 setvec[i] = 0; 435 setcnt = 0; 436 follow(v); /* computes setvec and setcnt */ 437 p = intalloc(setcnt + 1, __func__); 438 f->re[info(v)].lfollow = p; 439 *p = setcnt; 440 for (i = f->accept; i >= 0; i--) 441 if (setvec[i] == 1) 442 *++p = i; 443 break; 444 UNARY 445 cfoll(f,left(v)); 446 break; 447 case CAT: 448 case OR: 449 cfoll(f,left(v)); 450 cfoll(f,right(v)); 451 break; 452 case ZERO: 453 break; 454 default: /* can't happen */ 455 FATAL("can't happen: unknown type %d in cfoll", type(v)); 456 } 457 } 458 459 int first(Node *p) /* collects initially active leaves of p into setvec */ 460 /* returns 0 if p matches empty string */ 461 { 462 int b, lp; 463 464 switch (type(p)) { 465 ELEAF 466 LEAF 467 lp = info(p); /* look for high-water mark of subscripts */ 468 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 469 resizesetvec(__func__); 470 } 471 if (type(p) == EMPTYRE) { 472 setvec[lp] = 0; 473 return(0); 474 } 475 if (setvec[lp] != 1) { 476 setvec[lp] = 1; 477 setcnt++; 478 } 479 if (type(p) == CCL && (*(char *) right(p)) == '\0') 480 return(0); /* empty CCL */ 481 return(1); 482 case PLUS: 483 if (first(left(p)) == 0) 484 return(0); 485 return(1); 486 case STAR: 487 case QUEST: 488 first(left(p)); 489 return(0); 490 case CAT: 491 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 492 return(1); 493 case OR: 494 b = first(right(p)); 495 if (first(left(p)) == 0 || b == 0) return(0); 496 return(1); 497 case ZERO: 498 return 0; 499 } 500 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 501 return(-1); 502 } 503 504 void follow(Node *v) /* collects leaves that can follow v into setvec */ 505 { 506 Node *p; 507 508 if (type(v) == FINAL) 509 return; 510 p = parent(v); 511 switch (type(p)) { 512 case STAR: 513 case PLUS: 514 first(v); 515 follow(p); 516 return; 517 518 case OR: 519 case QUEST: 520 follow(p); 521 return; 522 523 case CAT: 524 if (v == left(p)) { /* v is left child of p */ 525 if (first(right(p)) == 0) { 526 follow(p); 527 return; 528 } 529 } else /* v is right child */ 530 follow(p); 531 return; 532 } 533 } 534 535 int member(int c, const char *sarg) /* is c in s? */ 536 { 537 const uschar *s = (const uschar *) sarg; 538 539 while (*s) 540 if (c == *s++) 541 return(1); 542 return(0); 543 } 544 545 int match(fa *f, const char *p0) /* shortest match ? */ 546 { 547 int s, ns; 548 const uschar *p = (const uschar *) p0; 549 550 s = f->initstat; 551 assert (s < f->state_count); 552 553 if (f->out[s]) 554 return(1); 555 do { 556 /* assert(*p < NCHARS); */ 557 if ((ns = f->gototab[s][*p]) != 0) 558 s = ns; 559 else 560 s = cgoto(f, s, *p); 561 if (f->out[s]) 562 return(1); 563 } while (*p++ != 0); 564 return(0); 565 } 566 567 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 568 { 569 int s, ns; 570 const uschar *p = (const uschar *) p0; 571 const uschar *q; 572 573 s = f->initstat; 574 assert(s < f->state_count); 575 576 patbeg = (const char *)p; 577 patlen = -1; 578 do { 579 q = p; 580 do { 581 if (f->out[s]) /* final state */ 582 patlen = q-p; 583 /* assert(*q < NCHARS); */ 584 if ((ns = f->gototab[s][*q]) != 0) 585 s = ns; 586 else 587 s = cgoto(f, s, *q); 588 589 assert(s < f->state_count); 590 591 if (s == 1) { /* no transition */ 592 if (patlen >= 0) { 593 patbeg = (const char *) p; 594 return(1); 595 } 596 else 597 goto nextin; /* no match */ 598 } 599 } while (*q++ != 0); 600 if (f->out[s]) 601 patlen = q-p-1; /* don't count $ */ 602 if (patlen >= 0) { 603 patbeg = (const char *) p; 604 return(1); 605 } 606 nextin: 607 s = 2; 608 } while (*p++); 609 return (0); 610 } 611 612 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 613 { 614 int s, ns; 615 const uschar *p = (const uschar *) p0; 616 const uschar *q; 617 618 s = f->initstat; 619 assert(s < f->state_count); 620 621 patbeg = (const char *)p; 622 patlen = -1; 623 while (*p) { 624 q = p; 625 do { 626 if (f->out[s]) /* final state */ 627 patlen = q-p; 628 /* assert(*q < NCHARS); */ 629 if ((ns = f->gototab[s][*q]) != 0) 630 s = ns; 631 else 632 s = cgoto(f, s, *q); 633 if (s == 1) { /* no transition */ 634 if (patlen > 0) { 635 patbeg = (const char *) p; 636 return(1); 637 } else 638 goto nnextin; /* no nonempty match */ 639 } 640 } while (*q++ != 0); 641 if (f->out[s]) 642 patlen = q-p-1; /* don't count $ */ 643 if (patlen > 0 ) { 644 patbeg = (const char *) p; 645 return(1); 646 } 647 nnextin: 648 s = 2; 649 p++; 650 } 651 return (0); 652 } 653 654 655 /* 656 * NAME 657 * fnematch 658 * 659 * DESCRIPTION 660 * A stream-fed version of nematch which transfers characters to a 661 * null-terminated buffer. All characters up to and including the last 662 * character of the matching text or EOF are placed in the buffer. If 663 * a match is found, patbeg and patlen are set appropriately. 664 * 665 * RETURN VALUES 666 * false No match found. 667 * true Match found. 668 */ 669 670 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 671 { 672 char *buf = *pbuf; 673 int bufsize = *pbufsize; 674 int c, i, j, k, ns, s; 675 676 s = pfa->initstat; 677 patlen = 0; 678 679 /* 680 * All indices relative to buf. 681 * i <= j <= k <= bufsize 682 * 683 * i: origin of active substring 684 * j: current character 685 * k: destination of next getc() 686 */ 687 i = -1, k = 0; 688 do { 689 j = i++; 690 do { 691 if (++j == k) { 692 if (k == bufsize) 693 if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) 694 FATAL("stream '%.30s...' too long", buf); 695 buf[k++] = (c = getc(f)) != EOF ? c : 0; 696 } 697 c = (uschar)buf[j]; 698 /* assert(c < NCHARS); */ 699 700 if ((ns = pfa->gototab[s][c]) != 0) 701 s = ns; 702 else 703 s = cgoto(pfa, s, c); 704 705 if (pfa->out[s]) { /* final state */ 706 patlen = j - i + 1; 707 if (c == 0) /* don't count $ */ 708 patlen--; 709 } 710 } while (buf[j] && s != 1); 711 s = 2; 712 } while (buf[i] && !patlen); 713 714 /* adjbuf() may have relocated a resized buffer. Inform the world. */ 715 *pbuf = buf; 716 *pbufsize = bufsize; 717 718 if (patlen) { 719 patbeg = buf + i; 720 /* 721 * Under no circumstances is the last character fed to 722 * the automaton part of the match. It is EOF's nullbyte, 723 * or it sent the automaton into a state with no further 724 * transitions available (s==1), or both. Room for a 725 * terminating nullbyte is guaranteed. 726 * 727 * ungetc any chars after the end of matching text 728 * (except for EOF's nullbyte, if present) and null 729 * terminate the buffer. 730 */ 731 do 732 if (buf[--k] && ungetc(buf[k], f) == EOF) 733 FATAL("unable to ungetc '%c'", buf[k]); 734 while (k > i + patlen); 735 buf[k] = '\0'; 736 return true; 737 } 738 else 739 return false; 740 } 741 742 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 743 { /* uses relex() to scan regular expression */ 744 Node *np; 745 746 DPRINTF("reparse <%s>\n", p); 747 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 748 rtok = relex(); 749 /* GNU compatibility: an empty regexp matches anything */ 750 if (rtok == '\0') { 751 /* FATAL("empty regular expression"); previous */ 752 return(op2(EMPTYRE, NIL, NIL)); 753 } 754 np = regexp(); 755 if (rtok != '\0') 756 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 757 return(np); 758 } 759 760 Node *regexp(void) /* top-level parse of reg expr */ 761 { 762 return (alt(concat(primary()))); 763 } 764 765 Node *primary(void) 766 { 767 Node *np; 768 int savelastatom; 769 770 switch (rtok) { 771 case CHAR: 772 lastatom = starttok; 773 np = op2(CHAR, NIL, itonp(rlxval)); 774 rtok = relex(); 775 return (unary(np)); 776 case ALL: 777 rtok = relex(); 778 return (unary(op2(ALL, NIL, NIL))); 779 case EMPTYRE: 780 rtok = relex(); 781 return (unary(op2(EMPTYRE, NIL, NIL))); 782 case DOT: 783 lastatom = starttok; 784 rtok = relex(); 785 return (unary(op2(DOT, NIL, NIL))); 786 case CCL: 787 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 788 lastatom = starttok; 789 rtok = relex(); 790 return (unary(np)); 791 case NCCL: 792 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 793 lastatom = starttok; 794 rtok = relex(); 795 return (unary(np)); 796 case '^': 797 rtok = relex(); 798 return (unary(op2(CHAR, NIL, itonp(HAT)))); 799 case '$': 800 rtok = relex(); 801 return (unary(op2(CHAR, NIL, NIL))); 802 case '(': 803 lastatom = starttok; 804 savelastatom = starttok - basestr; /* Retain over recursion */ 805 rtok = relex(); 806 if (rtok == ')') { /* special pleading for () */ 807 rtok = relex(); 808 return unary(op2(CCL, NIL, (Node *) tostring(""))); 809 } 810 np = regexp(); 811 if (rtok == ')') { 812 lastatom = basestr + savelastatom; /* Restore */ 813 rtok = relex(); 814 return (unary(np)); 815 } 816 else 817 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 818 default: 819 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 820 } 821 return 0; /*NOTREACHED*/ 822 } 823 824 Node *concat(Node *np) 825 { 826 switch (rtok) { 827 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 828 return (concat(op2(CAT, np, primary()))); 829 case EMPTYRE: 830 rtok = relex(); 831 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 832 primary()))); 833 } 834 return (np); 835 } 836 837 Node *alt(Node *np) 838 { 839 if (rtok == OR) { 840 rtok = relex(); 841 return (alt(op2(OR, np, concat(primary())))); 842 } 843 return (np); 844 } 845 846 Node *unary(Node *np) 847 { 848 switch (rtok) { 849 case STAR: 850 rtok = relex(); 851 return (unary(op2(STAR, np, NIL))); 852 case PLUS: 853 rtok = relex(); 854 return (unary(op2(PLUS, np, NIL))); 855 case QUEST: 856 rtok = relex(); 857 return (unary(op2(QUEST, np, NIL))); 858 case ZERO: 859 rtok = relex(); 860 return (unary(op2(ZERO, np, NIL))); 861 default: 862 return (np); 863 } 864 } 865 866 /* 867 * Character class definitions conformant to the POSIX locale as 868 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 869 * and operating character sets are both ASCII (ISO646) or supersets 870 * thereof. 871 * 872 * Note that to avoid overflowing the temporary buffer used in 873 * relex(), the expanded character class (prior to range expansion) 874 * must be less than twice the size of their full name. 875 */ 876 877 /* Because isblank doesn't show up in any of the header files on any 878 * system i use, it's defined here. if some other locale has a richer 879 * definition of "blank", define HAS_ISBLANK and provide your own 880 * version. 881 * the parentheses here are an attempt to find a path through the maze 882 * of macro definition and/or function and/or version provided. thanks 883 * to nelson beebe for the suggestion; let's see if it works everywhere. 884 */ 885 886 #ifndef HAS_ISBLANK 887 888 int (xisblank)(int c) 889 { 890 return c==' ' || c=='\t'; 891 } 892 893 #endif 894 895 static const struct charclass { 896 const char *cc_name; 897 int cc_namelen; 898 int (*cc_func)(int); 899 } charclasses[] = { 900 { "alnum", 5, isalnum }, 901 { "alpha", 5, isalpha }, 902 #ifndef HAS_ISBLANK 903 { "blank", 5, xisblank }, 904 #else 905 { "blank", 5, isblank }, 906 #endif 907 { "cntrl", 5, iscntrl }, 908 { "digit", 5, isdigit }, 909 { "graph", 5, isgraph }, 910 { "lower", 5, islower }, 911 { "print", 5, isprint }, 912 { "punct", 5, ispunct }, 913 { "space", 5, isspace }, 914 { "upper", 5, isupper }, 915 { "xdigit", 6, isxdigit }, 916 { NULL, 0, NULL }, 917 }; 918 919 #define REPEAT_SIMPLE 0 920 #define REPEAT_PLUS_APPENDED 1 921 #define REPEAT_WITH_Q 2 922 #define REPEAT_ZERO 3 923 924 static int 925 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 926 int atomlen, int firstnum, int secondnum, int special_case) 927 { 928 int i, j; 929 uschar *buf = NULL; 930 int ret = 1; 931 int init_q = (firstnum == 0); /* first added char will be ? */ 932 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 933 int prefix_length = reptok - basestr; /* prefix includes first rep */ 934 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 935 int size = prefix_length + suffix_length; 936 937 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 938 size += atomlen*(firstnum-1); 939 } 940 941 /* Adjust size of buffer for special cases */ 942 if (special_case == REPEAT_PLUS_APPENDED) { 943 size++; /* for the final + */ 944 } else if (special_case == REPEAT_WITH_Q) { 945 size += init_q + (atomlen+1)* n_q_reps; 946 } else if (special_case == REPEAT_ZERO) { 947 size += 2; /* just a null ERE: () */ 948 } 949 if ((buf = (uschar *) malloc(size + 1)) == NULL) 950 FATAL("out of space in reg expr %.10s..", lastre); 951 memcpy(buf, basestr, prefix_length); /* copy prefix */ 952 j = prefix_length; 953 if (special_case == REPEAT_ZERO) { 954 j -= atomlen; 955 buf[j++] = '('; 956 buf[j++] = ')'; 957 } 958 for (i = 1; i < firstnum; i++) { /* copy x reps */ 959 memcpy(&buf[j], atom, atomlen); 960 j += atomlen; 961 } 962 if (special_case == REPEAT_PLUS_APPENDED) { 963 buf[j++] = '+'; 964 } else if (special_case == REPEAT_WITH_Q) { 965 if (init_q) 966 buf[j++] = '?'; 967 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 968 memcpy(&buf[j], atom, atomlen); 969 j += atomlen; 970 buf[j++] = '?'; 971 } 972 } 973 memcpy(&buf[j], reptok+reptoklen, suffix_length); 974 if (special_case == REPEAT_ZERO) { 975 buf[j+suffix_length] = '\0'; 976 } else { 977 buf[size] = '\0'; 978 } 979 /* free old basestr */ 980 if (firstbasestr != basestr) { 981 if (basestr) 982 xfree(basestr); 983 } 984 basestr = buf; 985 prestr = buf + prefix_length; 986 if (special_case == REPEAT_ZERO) { 987 prestr -= atomlen; 988 ret++; 989 } 990 return ret; 991 } 992 993 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 994 int atomlen, int firstnum, int secondnum) 995 { 996 /* 997 In general, the repetition specifier or "bound" is replaced here 998 by an equivalent ERE string, repeating the immediately previous atom 999 and appending ? and + as needed. Note that the first copy of the 1000 atom is left in place, except in the special_case of a zero-repeat 1001 (i.e., {0}). 1002 */ 1003 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1004 if (firstnum < 2) { 1005 /* 0 or 1: should be handled before you get here */ 1006 FATAL("internal error"); 1007 } else { 1008 return replace_repeat(reptok, reptoklen, atom, atomlen, 1009 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1010 } 1011 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1012 if (firstnum == 0) { /* {0} or {0,0} */ 1013 /* This case is unusual because the resulting 1014 replacement string might actually be SMALLER than 1015 the original ERE */ 1016 return replace_repeat(reptok, reptoklen, atom, atomlen, 1017 firstnum, secondnum, REPEAT_ZERO); 1018 } else { /* (firstnum >= 1) */ 1019 return replace_repeat(reptok, reptoklen, atom, atomlen, 1020 firstnum, secondnum, REPEAT_SIMPLE); 1021 } 1022 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1023 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1024 return replace_repeat(reptok, reptoklen, atom, atomlen, 1025 firstnum, secondnum, REPEAT_WITH_Q); 1026 } else { /* Error - shouldn't be here (n>m) */ 1027 FATAL("internal error"); 1028 } 1029 return 0; 1030 } 1031 1032 int relex(void) /* lexical analyzer for reparse */ 1033 { 1034 int c, n; 1035 int cflag; 1036 static uschar *buf = NULL; 1037 static int bufsz = 100; 1038 uschar *bp; 1039 const struct charclass *cc; 1040 int i; 1041 int num, m; 1042 bool commafound, digitfound; 1043 const uschar *startreptok; 1044 static int parens = 0; 1045 1046 rescan: 1047 starttok = prestr; 1048 1049 switch (c = *prestr++) { 1050 case '|': return OR; 1051 case '*': return STAR; 1052 case '+': return PLUS; 1053 case '?': return QUEST; 1054 case '.': return DOT; 1055 case '\0': prestr--; return '\0'; 1056 case '^': 1057 case '$': 1058 return c; 1059 case '(': 1060 parens++; 1061 return c; 1062 case ')': 1063 if (parens) { 1064 parens--; 1065 return c; 1066 } 1067 /* unmatched close parenthesis; per POSIX, treat as literal */ 1068 rlxval = c; 1069 return CHAR; 1070 case '\\': 1071 rlxval = quoted(&prestr); 1072 return CHAR; 1073 default: 1074 rlxval = c; 1075 return CHAR; 1076 case '[': 1077 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 1078 FATAL("out of space in reg expr %.10s..", lastre); 1079 bp = buf; 1080 if (*prestr == '^') { 1081 cflag = 1; 1082 prestr++; 1083 } 1084 else 1085 cflag = 0; 1086 n = 2 * strlen((const char *) prestr)+1; 1087 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1088 FATAL("out of space for reg expr %.10s...", lastre); 1089 for (; ; ) { 1090 if ((c = *prestr++) == '\\') { 1091 *bp++ = '\\'; 1092 if ((c = *prestr++) == '\0') 1093 FATAL("nonterminated character class %.20s...", lastre); 1094 *bp++ = c; 1095 /* } else if (c == '\n') { */ 1096 /* FATAL("newline in character class %.20s...", lastre); */ 1097 } else if (c == '[' && *prestr == ':') { 1098 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1099 for (cc = charclasses; cc->cc_name; cc++) 1100 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1101 break; 1102 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1103 prestr[2 + cc->cc_namelen] == ']') { 1104 prestr += cc->cc_namelen + 3; 1105 /* 1106 * BUG: We begin at 1, instead of 0, since we 1107 * would otherwise prematurely terminate the 1108 * string for classes like [[:cntrl:]]. This 1109 * means that we can't match the NUL character, 1110 * not without first adapting the entire 1111 * program to track each string's length. 1112 */ 1113 for (i = 1; i <= UCHAR_MAX; i++) { 1114 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 1115 FATAL("out of space for reg expr %.10s...", lastre); 1116 if (cc->cc_func(i)) { 1117 /* escape backslash */ 1118 if (i == '\\') { 1119 *bp++ = '\\'; 1120 n++; 1121 } 1122 1123 *bp++ = i; 1124 n++; 1125 } 1126 } 1127 } else 1128 *bp++ = c; 1129 } else if (c == '[' && *prestr == '.') { 1130 char collate_char; 1131 prestr++; 1132 collate_char = *prestr++; 1133 if (*prestr == '.' && prestr[1] == ']') { 1134 prestr += 2; 1135 /* Found it: map via locale TBD: for 1136 now, simply return this char. This 1137 is sufficient to pass conformance 1138 test awk.ex 156 1139 */ 1140 if (*prestr == ']') { 1141 prestr++; 1142 rlxval = collate_char; 1143 return CHAR; 1144 } 1145 } 1146 } else if (c == '[' && *prestr == '=') { 1147 char equiv_char; 1148 prestr++; 1149 equiv_char = *prestr++; 1150 if (*prestr == '=' && prestr[1] == ']') { 1151 prestr += 2; 1152 /* Found it: map via locale TBD: for now 1153 simply return this char. This is 1154 sufficient to pass conformance test 1155 awk.ex 156 1156 */ 1157 if (*prestr == ']') { 1158 prestr++; 1159 rlxval = equiv_char; 1160 return CHAR; 1161 } 1162 } 1163 } else if (c == '\0') { 1164 FATAL("nonterminated character class %.20s", lastre); 1165 } else if (bp == buf) { /* 1st char is special */ 1166 *bp++ = c; 1167 } else if (c == ']') { 1168 *bp++ = 0; 1169 rlxstr = (uschar *) tostring((char *) buf); 1170 if (cflag == 0) 1171 return CCL; 1172 else 1173 return NCCL; 1174 } else 1175 *bp++ = c; 1176 } 1177 break; 1178 case '{': 1179 if (isdigit(*(prestr))) { 1180 num = 0; /* Process as a repetition */ 1181 n = -1; m = -1; 1182 commafound = false; 1183 digitfound = false; 1184 startreptok = prestr-1; 1185 /* Remember start of previous atom here ? */ 1186 } else { /* just a { char, not a repetition */ 1187 rlxval = c; 1188 return CHAR; 1189 } 1190 for (; ; ) { 1191 if ((c = *prestr++) == '}') { 1192 if (commafound) { 1193 if (digitfound) { /* {n,m} */ 1194 m = num; 1195 if (m < n) 1196 FATAL("illegal repetition expression: class %.20s", 1197 lastre); 1198 if (n == 0 && m == 1) { 1199 return QUEST; 1200 } 1201 } else { /* {n,} */ 1202 if (n == 0) 1203 return STAR; 1204 else if (n == 1) 1205 return PLUS; 1206 } 1207 } else { 1208 if (digitfound) { /* {n} same as {n,n} */ 1209 n = num; 1210 m = num; 1211 } else { /* {} */ 1212 FATAL("illegal repetition expression: class %.20s", 1213 lastre); 1214 } 1215 } 1216 if (repeat(starttok, prestr-starttok, lastatom, 1217 startreptok - lastatom, n, m) > 0) { 1218 if (n == 0 && m == 0) { 1219 return ZERO; 1220 } 1221 /* must rescan input for next token */ 1222 goto rescan; 1223 } 1224 /* Failed to replace: eat up {...} characters 1225 and treat like just PLUS */ 1226 return PLUS; 1227 } else if (c == '\0') { 1228 FATAL("nonterminated character class %.20s", 1229 lastre); 1230 } else if (isdigit(c)) { 1231 num = 10 * num + c - '0'; 1232 digitfound = true; 1233 } else if (c == ',') { 1234 if (commafound) 1235 FATAL("illegal repetition expression: class %.20s", 1236 lastre); 1237 /* looking for {n,} or {n,m} */ 1238 commafound = true; 1239 n = num; 1240 digitfound = false; /* reset */ 1241 num = 0; 1242 } else { 1243 FATAL("illegal repetition expression: class %.20s", 1244 lastre); 1245 } 1246 } 1247 break; 1248 } 1249 } 1250 1251 int cgoto(fa *f, int s, int c) 1252 { 1253 int *p, *q; 1254 int i, j, k; 1255 1256 assert(c == HAT || c < NCHARS); 1257 while (f->accept >= maxsetvec) { /* guessing here! */ 1258 resizesetvec(__func__); 1259 } 1260 for (i = 0; i <= f->accept; i++) 1261 setvec[i] = 0; 1262 setcnt = 0; 1263 resize_state(f, s); 1264 /* compute positions of gototab[s,c] into setvec */ 1265 p = f->posns[s]; 1266 for (i = 1; i <= *p; i++) { 1267 if ((k = f->re[p[i]].ltype) != FINAL) { 1268 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1269 || (k == DOT && c != 0 && c != HAT) 1270 || (k == ALL && c != 0) 1271 || (k == EMPTYRE && c != 0) 1272 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 1273 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 1274 q = f->re[p[i]].lfollow; 1275 for (j = 1; j <= *q; j++) { 1276 if (q[j] >= maxsetvec) { 1277 resizesetvec(__func__); 1278 } 1279 if (setvec[q[j]] == 0) { 1280 setcnt++; 1281 setvec[q[j]] = 1; 1282 } 1283 } 1284 } 1285 } 1286 } 1287 /* determine if setvec is a previous state */ 1288 tmpset[0] = setcnt; 1289 j = 1; 1290 for (i = f->accept; i >= 0; i--) 1291 if (setvec[i]) { 1292 tmpset[j++] = i; 1293 } 1294 resize_state(f, f->curstat > s ? f->curstat : s); 1295 /* tmpset == previous state? */ 1296 for (i = 1; i <= f->curstat; i++) { 1297 p = f->posns[i]; 1298 if ((k = tmpset[0]) != p[0]) 1299 goto different; 1300 for (j = 1; j <= k; j++) 1301 if (tmpset[j] != p[j]) 1302 goto different; 1303 /* setvec is state i */ 1304 if (c != HAT) 1305 f->gototab[s][c] = i; 1306 return i; 1307 different:; 1308 } 1309 1310 /* add tmpset to current set of states */ 1311 ++(f->curstat); 1312 resize_state(f, f->curstat); 1313 for (i = 0; i < NCHARS; i++) 1314 f->gototab[f->curstat][i] = 0; 1315 xfree(f->posns[f->curstat]); 1316 p = intalloc(setcnt + 1, __func__); 1317 1318 f->posns[f->curstat] = p; 1319 if (c != HAT) 1320 f->gototab[s][c] = f->curstat; 1321 for (i = 0; i <= setcnt; i++) 1322 p[i] = tmpset[i]; 1323 if (setvec[f->accept]) 1324 f->out[f->curstat] = 1; 1325 else 1326 f->out[f->curstat] = 0; 1327 return f->curstat; 1328 } 1329 1330 1331 void freefa(fa *f) /* free a finite automaton */ 1332 { 1333 int i; 1334 1335 if (f == NULL) 1336 return; 1337 for (i = 0; i < f->state_count; i++) 1338 xfree(f->gototab[i]) 1339 for (i = 0; i <= f->curstat; i++) 1340 xfree(f->posns[i]); 1341 for (i = 0; i <= f->accept; i++) { 1342 xfree(f->re[i].lfollow); 1343 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1344 xfree(f->re[i].lval.np); 1345 } 1346 xfree(f->restr); 1347 xfree(f->out); 1348 xfree(f->posns); 1349 xfree(f->gototab); 1350 xfree(f); 1351 } 1352