1 /* $OpenBSD: b.c,v 1.36 2021/03/02 20:41:42 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 /* lasciate ogne speranza, voi ch'intrate. */ 27 28 #define DEBUG 29 30 #include <ctype.h> 31 #include <limits.h> 32 #include <stdio.h> 33 #include <string.h> 34 #include <stdlib.h> 35 #include "awk.h" 36 #include "awkgram.tab.h" 37 38 #define MAXLIN 22 39 40 #define type(v) (v)->nobj /* badly overloaded here */ 41 #define info(v) (v)->ntype /* badly overloaded here */ 42 #define left(v) (v)->narg[0] 43 #define right(v) (v)->narg[1] 44 #define parent(v) (v)->nnext 45 46 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 47 #define ELEAF case EMPTYRE: /* empty string in regexp */ 48 #define UNARY case STAR: case PLUS: case QUEST: 49 50 /* encoding in tree Nodes: 51 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 52 left is index, right contains value or pointer to value 53 unary (STAR, PLUS, QUEST): left is child, right is null 54 binary (CAT, OR): left and right are children 55 parent contains pointer to parent 56 */ 57 58 59 int *setvec; 60 int *tmpset; 61 int maxsetvec = 0; 62 63 int rtok; /* next token in current re */ 64 int rlxval; 65 static const uschar *rlxstr; 66 static const uschar *prestr; /* current position in current re */ 67 static const uschar *lastre; /* origin of last re */ 68 static const uschar *lastatom; /* origin of last Atom */ 69 static const uschar *starttok; 70 static const uschar *basestr; /* starts with original, replaced during 71 repetition processing */ 72 static const uschar *firstbasestr; 73 74 static int setcnt; 75 static int poscnt; 76 77 const char *patbeg; 78 int patlen; 79 80 #define NFA 128 /* cache this many dynamic fa's */ 81 fa *fatab[NFA]; 82 int nfatab = 0; /* entries in fatab */ 83 84 static int * 85 intalloc(size_t n, const char *f) 86 { 87 int *p = (int *) calloc(n, sizeof(int)); 88 if (p == NULL) 89 overflo(f); 90 return p; 91 } 92 93 static void 94 allocsetvec(const char *f) 95 { 96 maxsetvec = MAXLIN; 97 setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec)); 98 tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset)); 99 if (setvec == NULL || tmpset == NULL) 100 overflo(f); 101 } 102 103 static void 104 resizesetvec(const char *f) 105 { 106 setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec)); 107 tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset)); 108 if (setvec == NULL || tmpset == NULL) 109 overflo(f); 110 maxsetvec *= 4; 111 } 112 113 static void 114 resize_state(fa *f, int state) 115 { 116 unsigned int **p; 117 uschar *p2; 118 int **p3; 119 int i, new_count; 120 121 if (++state < f->state_count) 122 return; 123 124 new_count = state + 10; /* needs to be tuned */ 125 126 p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); 127 if (p == NULL) 128 goto out; 129 f->gototab = p; 130 131 p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0])); 132 if (p2 == NULL) 133 goto out; 134 f->out = p2; 135 136 p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0])); 137 if (p3 == NULL) 138 goto out; 139 f->posns = p3; 140 141 for (i = f->state_count; i < new_count; ++i) { 142 f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab)); 143 if (f->gototab[i] == NULL) 144 goto out; 145 f->out[i] = 0; 146 f->posns[i] = NULL; 147 } 148 f->state_count = new_count; 149 return; 150 out: 151 overflo(__func__); 152 } 153 154 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 155 { 156 int i, use, nuse; 157 fa *pfa; 158 static int now = 1; 159 160 if (setvec == NULL) { /* first time through any RE */ 161 allocsetvec(__func__); 162 } 163 164 if (compile_time != RUNNING) /* a constant for sure */ 165 return mkdfa(s, anchor); 166 for (i = 0; i < nfatab; i++) /* is it there already? */ 167 if (fatab[i]->anchor == anchor 168 && strcmp((const char *) fatab[i]->restr, s) == 0) { 169 fatab[i]->use = now++; 170 return fatab[i]; 171 } 172 pfa = mkdfa(s, anchor); 173 if (nfatab < NFA) { /* room for another */ 174 fatab[nfatab] = pfa; 175 fatab[nfatab]->use = now++; 176 nfatab++; 177 return pfa; 178 } 179 use = fatab[0]->use; /* replace least-recently used */ 180 nuse = 0; 181 for (i = 1; i < nfatab; i++) 182 if (fatab[i]->use < use) { 183 use = fatab[i]->use; 184 nuse = i; 185 } 186 freefa(fatab[nuse]); 187 fatab[nuse] = pfa; 188 pfa->use = now++; 189 return pfa; 190 } 191 192 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 193 /* anchor = true for anchored matches, else false */ 194 { 195 Node *p, *p1; 196 fa *f; 197 198 firstbasestr = (const uschar *) s; 199 basestr = firstbasestr; 200 p = reparse(s); 201 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 202 /* put ALL STAR in front of reg. exp. */ 203 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 204 /* put FINAL after reg. exp. */ 205 206 poscnt = 0; 207 penter(p1); /* enter parent pointers and leaf indices */ 208 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 209 overflo(__func__); 210 f->accept = poscnt-1; /* penter has computed number of positions in re */ 211 cfoll(f, p1); /* set up follow sets */ 212 freetr(p1); 213 resize_state(f, 1); 214 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 215 f->posns[1] = intalloc(1, __func__); 216 *f->posns[1] = 0; 217 f->initstat = makeinit(f, anchor); 218 f->anchor = anchor; 219 f->restr = (uschar *) tostring(s); 220 if (firstbasestr != basestr) { 221 if (basestr) 222 xfree(basestr); 223 } 224 return f; 225 } 226 227 int makeinit(fa *f, bool anchor) 228 { 229 int i, k; 230 231 f->curstat = 2; 232 f->out[2] = 0; 233 k = *(f->re[0].lfollow); 234 xfree(f->posns[2]); 235 f->posns[2] = intalloc(k + 1, __func__); 236 for (i = 0; i <= k; i++) { 237 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 238 } 239 if ((f->posns[2])[1] == f->accept) 240 f->out[2] = 1; 241 for (i = 0; i < NCHARS; i++) 242 f->gototab[2][i] = 0; 243 f->curstat = cgoto(f, 2, HAT); 244 if (anchor) { 245 *f->posns[2] = k-1; /* leave out position 0 */ 246 for (i = 0; i < k; i++) { 247 (f->posns[0])[i] = (f->posns[2])[i]; 248 } 249 250 f->out[0] = f->out[2]; 251 if (f->curstat != 2) 252 --(*f->posns[f->curstat]); 253 } 254 return f->curstat; 255 } 256 257 void penter(Node *p) /* set up parent pointers and leaf indices */ 258 { 259 switch (type(p)) { 260 ELEAF 261 LEAF 262 info(p) = poscnt; 263 poscnt++; 264 break; 265 UNARY 266 penter(left(p)); 267 parent(left(p)) = p; 268 break; 269 case CAT: 270 case OR: 271 penter(left(p)); 272 penter(right(p)); 273 parent(left(p)) = p; 274 parent(right(p)) = p; 275 break; 276 case ZERO: 277 break; 278 default: /* can't happen */ 279 FATAL("can't happen: unknown type %d in penter", type(p)); 280 break; 281 } 282 } 283 284 void freetr(Node *p) /* free parse tree */ 285 { 286 switch (type(p)) { 287 ELEAF 288 LEAF 289 xfree(p); 290 break; 291 UNARY 292 case ZERO: 293 freetr(left(p)); 294 xfree(p); 295 break; 296 case CAT: 297 case OR: 298 freetr(left(p)); 299 freetr(right(p)); 300 xfree(p); 301 break; 302 default: /* can't happen */ 303 FATAL("can't happen: unknown type %d in freetr", type(p)); 304 break; 305 } 306 } 307 308 /* in the parsing of regular expressions, metacharacters like . have */ 309 /* to be seen literally; \056 is not a metacharacter. */ 310 311 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ 312 { /* only pick up one 8-bit byte (2 chars) */ 313 const uschar *p; 314 int n = 0; 315 int i; 316 317 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { 318 if (isdigit(*p)) 319 n = 16 * n + *p - '0'; 320 else if (*p >= 'a' && *p <= 'f') 321 n = 16 * n + *p - 'a' + 10; 322 else if (*p >= 'A' && *p <= 'F') 323 n = 16 * n + *p - 'A' + 10; 324 } 325 *pp = p; 326 return n; 327 } 328 329 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 330 331 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 332 /* and increment *pp */ 333 { 334 const uschar *p = *pp; 335 int c; 336 337 if ((c = *p++) == 't') 338 c = '\t'; 339 else if (c == 'n') 340 c = '\n'; 341 else if (c == 'f') 342 c = '\f'; 343 else if (c == 'r') 344 c = '\r'; 345 else if (c == 'b') 346 c = '\b'; 347 else if (c == 'v') 348 c = '\v'; 349 else if (c == 'a') 350 c = '\a'; 351 else if (c == '\\') 352 c = '\\'; 353 else if (c == 'x') { /* hexadecimal goo follows */ 354 c = hexstr(&p); /* this adds a null if number is invalid */ 355 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 356 int n = c - '0'; 357 if (isoctdigit(*p)) { 358 n = 8 * n + *p++ - '0'; 359 if (isoctdigit(*p)) 360 n = 8 * n + *p++ - '0'; 361 } 362 c = n; 363 } /* else */ 364 /* c = c; */ 365 *pp = p; 366 return c; 367 } 368 369 char *cclenter(const char *argp) /* add a character class */ 370 { 371 int i, c, c2; 372 const uschar *op, *p = (const uschar *) argp; 373 uschar *bp; 374 static uschar *buf = NULL; 375 static int bufsz = 100; 376 377 op = p; 378 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 379 FATAL("out of space for character class [%.10s...] 1", p); 380 bp = buf; 381 for (i = 0; (c = *p++) != 0; ) { 382 if (c == '\\') { 383 c = quoted(&p); 384 } else if (c == '-' && i > 0 && bp[-1] != 0) { 385 if (*p != 0) { 386 c = bp[-1]; 387 c2 = *p++; 388 if (c2 == '\\') 389 c2 = quoted(&p); 390 if (c > c2) { /* empty; ignore */ 391 bp--; 392 i--; 393 continue; 394 } 395 while (c < c2) { 396 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 397 FATAL("out of space for character class [%.10s...] 2", p); 398 *bp++ = ++c; 399 i++; 400 } 401 continue; 402 } 403 } 404 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 405 FATAL("out of space for character class [%.10s...] 3", p); 406 *bp++ = c; 407 i++; 408 } 409 *bp = 0; 410 DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); 411 xfree(op); 412 return (char *) tostring((char *) buf); 413 } 414 415 void overflo(const char *s) 416 { 417 FATAL("regular expression too big: out of space in %.30s...", s); 418 } 419 420 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 421 { 422 int i; 423 int *p; 424 425 switch (type(v)) { 426 ELEAF 427 LEAF 428 f->re[info(v)].ltype = type(v); 429 f->re[info(v)].lval.np = right(v); 430 while (f->accept >= maxsetvec) { /* guessing here! */ 431 resizesetvec(__func__); 432 } 433 for (i = 0; i <= f->accept; i++) 434 setvec[i] = 0; 435 setcnt = 0; 436 follow(v); /* computes setvec and setcnt */ 437 p = intalloc(setcnt + 1, __func__); 438 f->re[info(v)].lfollow = p; 439 *p = setcnt; 440 for (i = f->accept; i >= 0; i--) 441 if (setvec[i] == 1) 442 *++p = i; 443 break; 444 UNARY 445 cfoll(f,left(v)); 446 break; 447 case CAT: 448 case OR: 449 cfoll(f,left(v)); 450 cfoll(f,right(v)); 451 break; 452 case ZERO: 453 break; 454 default: /* can't happen */ 455 FATAL("can't happen: unknown type %d in cfoll", type(v)); 456 } 457 } 458 459 int first(Node *p) /* collects initially active leaves of p into setvec */ 460 /* returns 0 if p matches empty string */ 461 { 462 int b, lp; 463 464 switch (type(p)) { 465 ELEAF 466 LEAF 467 lp = info(p); /* look for high-water mark of subscripts */ 468 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 469 resizesetvec(__func__); 470 } 471 if (type(p) == EMPTYRE) { 472 setvec[lp] = 0; 473 return(0); 474 } 475 if (setvec[lp] != 1) { 476 setvec[lp] = 1; 477 setcnt++; 478 } 479 if (type(p) == CCL && (*(char *) right(p)) == '\0') 480 return(0); /* empty CCL */ 481 return(1); 482 case PLUS: 483 if (first(left(p)) == 0) 484 return(0); 485 return(1); 486 case STAR: 487 case QUEST: 488 first(left(p)); 489 return(0); 490 case CAT: 491 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 492 return(1); 493 case OR: 494 b = first(right(p)); 495 if (first(left(p)) == 0 || b == 0) return(0); 496 return(1); 497 case ZERO: 498 return 0; 499 } 500 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 501 return(-1); 502 } 503 504 void follow(Node *v) /* collects leaves that can follow v into setvec */ 505 { 506 Node *p; 507 508 if (type(v) == FINAL) 509 return; 510 p = parent(v); 511 switch (type(p)) { 512 case STAR: 513 case PLUS: 514 first(v); 515 follow(p); 516 return; 517 518 case OR: 519 case QUEST: 520 follow(p); 521 return; 522 523 case CAT: 524 if (v == left(p)) { /* v is left child of p */ 525 if (first(right(p)) == 0) { 526 follow(p); 527 return; 528 } 529 } else /* v is right child */ 530 follow(p); 531 return; 532 } 533 } 534 535 int member(int c, const char *sarg) /* is c in s? */ 536 { 537 const uschar *s = (const uschar *) sarg; 538 539 while (*s) 540 if (c == *s++) 541 return(1); 542 return(0); 543 } 544 545 int match(fa *f, const char *p0) /* shortest match ? */ 546 { 547 int s, ns; 548 const uschar *p = (const uschar *) p0; 549 550 s = f->initstat; 551 assert (s < f->state_count); 552 553 if (f->out[s]) 554 return(1); 555 do { 556 /* assert(*p < NCHARS); */ 557 if ((ns = f->gototab[s][*p]) != 0) 558 s = ns; 559 else 560 s = cgoto(f, s, *p); 561 if (f->out[s]) 562 return(1); 563 } while (*p++ != 0); 564 return(0); 565 } 566 567 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 568 { 569 int s, ns; 570 const uschar *p = (const uschar *) p0; 571 const uschar *q; 572 573 s = f->initstat; 574 assert(s < f->state_count); 575 576 patbeg = (const char *)p; 577 patlen = -1; 578 do { 579 q = p; 580 do { 581 if (f->out[s]) /* final state */ 582 patlen = q-p; 583 /* assert(*q < NCHARS); */ 584 if ((ns = f->gototab[s][*q]) != 0) 585 s = ns; 586 else 587 s = cgoto(f, s, *q); 588 589 assert(s < f->state_count); 590 591 if (s == 1) { /* no transition */ 592 if (patlen >= 0) { 593 patbeg = (const char *) p; 594 return(1); 595 } 596 else 597 goto nextin; /* no match */ 598 } 599 } while (*q++ != 0); 600 if (f->out[s]) 601 patlen = q-p-1; /* don't count $ */ 602 if (patlen >= 0) { 603 patbeg = (const char *) p; 604 return(1); 605 } 606 nextin: 607 s = 2; 608 } while (*p++); 609 return (0); 610 } 611 612 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 613 { 614 int s, ns; 615 const uschar *p = (const uschar *) p0; 616 const uschar *q; 617 618 s = f->initstat; 619 assert(s < f->state_count); 620 621 patbeg = (const char *)p; 622 patlen = -1; 623 while (*p) { 624 q = p; 625 do { 626 if (f->out[s]) /* final state */ 627 patlen = q-p; 628 /* assert(*q < NCHARS); */ 629 if ((ns = f->gototab[s][*q]) != 0) 630 s = ns; 631 else 632 s = cgoto(f, s, *q); 633 if (s == 1) { /* no transition */ 634 if (patlen > 0) { 635 patbeg = (const char *) p; 636 return(1); 637 } else 638 goto nnextin; /* no nonempty match */ 639 } 640 } while (*q++ != 0); 641 if (f->out[s]) 642 patlen = q-p-1; /* don't count $ */ 643 if (patlen > 0 ) { 644 patbeg = (const char *) p; 645 return(1); 646 } 647 nnextin: 648 s = 2; 649 p++; 650 } 651 return (0); 652 } 653 654 655 /* 656 * NAME 657 * fnematch 658 * 659 * DESCRIPTION 660 * A stream-fed version of nematch which transfers characters to a 661 * null-terminated buffer. All characters up to and including the last 662 * character of the matching text or EOF are placed in the buffer. If 663 * a match is found, patbeg and patlen are set appropriately. 664 * 665 * RETURN VALUES 666 * false No match found. 667 * true Match found. 668 */ 669 670 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 671 { 672 char *buf = *pbuf; 673 int bufsize = *pbufsize; 674 int c, i, j, k, ns, s; 675 676 s = pfa->initstat; 677 patlen = 0; 678 679 /* 680 * All indices relative to buf. 681 * i <= j <= k <= bufsize 682 * 683 * i: origin of active substring 684 * j: current character 685 * k: destination of next getc() 686 */ 687 i = -1, k = 0; 688 do { 689 j = i++; 690 do { 691 if (++j == k) { 692 if (k == bufsize) 693 if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) 694 FATAL("stream '%.30s...' too long", buf); 695 buf[k++] = (c = getc(f)) != EOF ? c : 0; 696 } 697 c = (uschar)buf[j]; 698 /* assert(c < NCHARS); */ 699 700 if ((ns = pfa->gototab[s][c]) != 0) 701 s = ns; 702 else 703 s = cgoto(pfa, s, c); 704 705 if (pfa->out[s]) { /* final state */ 706 patlen = j - i + 1; 707 if (c == 0) /* don't count $ */ 708 patlen--; 709 } 710 } while (buf[j] && s != 1); 711 s = 2; 712 } while (buf[i] && !patlen); 713 714 /* adjbuf() may have relocated a resized buffer. Inform the world. */ 715 *pbuf = buf; 716 *pbufsize = bufsize; 717 718 if (patlen) { 719 patbeg = buf + i; 720 /* 721 * Under no circumstances is the last character fed to 722 * the automaton part of the match. It is EOF's nullbyte, 723 * or it sent the automaton into a state with no further 724 * transitions available (s==1), or both. Room for a 725 * terminating nullbyte is guaranteed. 726 * 727 * ungetc any chars after the end of matching text 728 * (except for EOF's nullbyte, if present) and null 729 * terminate the buffer. 730 */ 731 do 732 if (buf[--k] && ungetc(buf[k], f) == EOF) 733 FATAL("unable to ungetc '%c'", buf[k]); 734 while (k > i + patlen); 735 buf[k] = '\0'; 736 return true; 737 } 738 else 739 return false; 740 } 741 742 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 743 { /* uses relex() to scan regular expression */ 744 Node *np; 745 746 DPRINTF("reparse <%s>\n", p); 747 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 748 rtok = relex(); 749 /* GNU compatibility: an empty regexp matches anything */ 750 if (rtok == '\0') { 751 /* FATAL("empty regular expression"); previous */ 752 return(op2(EMPTYRE, NIL, NIL)); 753 } 754 np = regexp(); 755 if (rtok != '\0') 756 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 757 return(np); 758 } 759 760 Node *regexp(void) /* top-level parse of reg expr */ 761 { 762 return (alt(concat(primary()))); 763 } 764 765 Node *primary(void) 766 { 767 Node *np; 768 int savelastatom; 769 770 switch (rtok) { 771 case CHAR: 772 lastatom = starttok; 773 np = op2(CHAR, NIL, itonp(rlxval)); 774 rtok = relex(); 775 return (unary(np)); 776 case ALL: 777 rtok = relex(); 778 return (unary(op2(ALL, NIL, NIL))); 779 case EMPTYRE: 780 rtok = relex(); 781 return (unary(op2(EMPTYRE, NIL, NIL))); 782 case DOT: 783 lastatom = starttok; 784 rtok = relex(); 785 return (unary(op2(DOT, NIL, NIL))); 786 case CCL: 787 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 788 lastatom = starttok; 789 rtok = relex(); 790 return (unary(np)); 791 case NCCL: 792 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 793 lastatom = starttok; 794 rtok = relex(); 795 return (unary(np)); 796 case '^': 797 rtok = relex(); 798 return (unary(op2(CHAR, NIL, itonp(HAT)))); 799 case '$': 800 rtok = relex(); 801 return (unary(op2(CHAR, NIL, NIL))); 802 case '(': 803 lastatom = starttok; 804 savelastatom = starttok - basestr; /* Retain over recursion */ 805 rtok = relex(); 806 if (rtok == ')') { /* special pleading for () */ 807 rtok = relex(); 808 return unary(op2(CCL, NIL, (Node *) tostring(""))); 809 } 810 np = regexp(); 811 if (rtok == ')') { 812 lastatom = basestr + savelastatom; /* Restore */ 813 rtok = relex(); 814 return (unary(np)); 815 } 816 else 817 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 818 default: 819 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 820 } 821 return 0; /*NOTREACHED*/ 822 } 823 824 Node *concat(Node *np) 825 { 826 switch (rtok) { 827 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 828 return (concat(op2(CAT, np, primary()))); 829 case EMPTYRE: 830 rtok = relex(); 831 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 832 primary()))); 833 } 834 return (np); 835 } 836 837 Node *alt(Node *np) 838 { 839 if (rtok == OR) { 840 rtok = relex(); 841 return (alt(op2(OR, np, concat(primary())))); 842 } 843 return (np); 844 } 845 846 Node *unary(Node *np) 847 { 848 switch (rtok) { 849 case STAR: 850 rtok = relex(); 851 return (unary(op2(STAR, np, NIL))); 852 case PLUS: 853 rtok = relex(); 854 return (unary(op2(PLUS, np, NIL))); 855 case QUEST: 856 rtok = relex(); 857 return (unary(op2(QUEST, np, NIL))); 858 case ZERO: 859 rtok = relex(); 860 return (unary(op2(ZERO, np, NIL))); 861 default: 862 return (np); 863 } 864 } 865 866 /* 867 * Character class definitions conformant to the POSIX locale as 868 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 869 * and operating character sets are both ASCII (ISO646) or supersets 870 * thereof. 871 * 872 * Note that to avoid overflowing the temporary buffer used in 873 * relex(), the expanded character class (prior to range expansion) 874 * must be less than twice the size of their full name. 875 */ 876 877 /* Because isblank doesn't show up in any of the header files on any 878 * system i use, it's defined here. if some other locale has a richer 879 * definition of "blank", define HAS_ISBLANK and provide your own 880 * version. 881 * the parentheses here are an attempt to find a path through the maze 882 * of macro definition and/or function and/or version provided. thanks 883 * to nelson beebe for the suggestion; let's see if it works everywhere. 884 */ 885 886 #ifndef HAS_ISBLANK 887 888 int (xisblank)(int c) 889 { 890 return c==' ' || c=='\t'; 891 } 892 893 #endif 894 895 static const struct charclass { 896 const char *cc_name; 897 int cc_namelen; 898 int (*cc_func)(int); 899 } charclasses[] = { 900 { "alnum", 5, isalnum }, 901 { "alpha", 5, isalpha }, 902 #ifndef HAS_ISBLANK 903 { "blank", 5, xisblank }, 904 #else 905 { "blank", 5, isblank }, 906 #endif 907 { "cntrl", 5, iscntrl }, 908 { "digit", 5, isdigit }, 909 { "graph", 5, isgraph }, 910 { "lower", 5, islower }, 911 { "print", 5, isprint }, 912 { "punct", 5, ispunct }, 913 { "space", 5, isspace }, 914 { "upper", 5, isupper }, 915 { "xdigit", 6, isxdigit }, 916 { NULL, 0, NULL }, 917 }; 918 919 #define REPEAT_SIMPLE 0 920 #define REPEAT_PLUS_APPENDED 1 921 #define REPEAT_WITH_Q 2 922 #define REPEAT_ZERO 3 923 924 static int 925 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 926 int atomlen, int firstnum, int secondnum, int special_case) 927 { 928 int i, j; 929 uschar *buf = NULL; 930 int ret = 1; 931 int init_q = (firstnum == 0); /* first added char will be ? */ 932 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 933 int prefix_length = reptok - basestr; /* prefix includes first rep */ 934 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 935 int size = prefix_length + suffix_length; 936 937 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 938 size += atomlen*(firstnum-1); 939 } 940 941 /* Adjust size of buffer for special cases */ 942 if (special_case == REPEAT_PLUS_APPENDED) { 943 size++; /* for the final + */ 944 } else if (special_case == REPEAT_WITH_Q) { 945 size += init_q + (atomlen+1)* (n_q_reps-init_q); 946 } else if (special_case == REPEAT_ZERO) { 947 size += 2; /* just a null ERE: () */ 948 } 949 if ((buf = (uschar *) malloc(size + 1)) == NULL) 950 FATAL("out of space in reg expr %.10s..", lastre); 951 memcpy(buf, basestr, prefix_length); /* copy prefix */ 952 j = prefix_length; 953 if (special_case == REPEAT_ZERO) { 954 j -= atomlen; 955 buf[j++] = '('; 956 buf[j++] = ')'; 957 } 958 for (i = 1; i < firstnum; i++) { /* copy x reps */ 959 memcpy(&buf[j], atom, atomlen); 960 j += atomlen; 961 } 962 if (special_case == REPEAT_PLUS_APPENDED) { 963 buf[j++] = '+'; 964 } else if (special_case == REPEAT_WITH_Q) { 965 if (init_q) 966 buf[j++] = '?'; 967 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 968 memcpy(&buf[j], atom, atomlen); 969 j += atomlen; 970 buf[j++] = '?'; 971 } 972 } 973 memcpy(&buf[j], reptok+reptoklen, suffix_length); 974 j += suffix_length; 975 buf[j] = '\0'; 976 /* free old basestr */ 977 if (firstbasestr != basestr) { 978 if (basestr) 979 xfree(basestr); 980 } 981 basestr = buf; 982 prestr = buf + prefix_length; 983 if (special_case == REPEAT_ZERO) { 984 prestr -= atomlen; 985 ret++; 986 } 987 return ret; 988 } 989 990 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 991 int atomlen, int firstnum, int secondnum) 992 { 993 /* 994 In general, the repetition specifier or "bound" is replaced here 995 by an equivalent ERE string, repeating the immediately previous atom 996 and appending ? and + as needed. Note that the first copy of the 997 atom is left in place, except in the special_case of a zero-repeat 998 (i.e., {0}). 999 */ 1000 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1001 if (firstnum < 2) { 1002 /* 0 or 1: should be handled before you get here */ 1003 FATAL("internal error"); 1004 } else { 1005 return replace_repeat(reptok, reptoklen, atom, atomlen, 1006 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1007 } 1008 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1009 if (firstnum == 0) { /* {0} or {0,0} */ 1010 /* This case is unusual because the resulting 1011 replacement string might actually be SMALLER than 1012 the original ERE */ 1013 return replace_repeat(reptok, reptoklen, atom, atomlen, 1014 firstnum, secondnum, REPEAT_ZERO); 1015 } else { /* (firstnum >= 1) */ 1016 return replace_repeat(reptok, reptoklen, atom, atomlen, 1017 firstnum, secondnum, REPEAT_SIMPLE); 1018 } 1019 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1020 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1021 return replace_repeat(reptok, reptoklen, atom, atomlen, 1022 firstnum, secondnum, REPEAT_WITH_Q); 1023 } else { /* Error - shouldn't be here (n>m) */ 1024 FATAL("internal error"); 1025 } 1026 return 0; 1027 } 1028 1029 int relex(void) /* lexical analyzer for reparse */ 1030 { 1031 int c, n; 1032 int cflag; 1033 static uschar *buf = NULL; 1034 static int bufsz = 100; 1035 uschar *bp; 1036 const struct charclass *cc; 1037 int i; 1038 int num, m; 1039 bool commafound, digitfound; 1040 const uschar *startreptok; 1041 static int parens = 0; 1042 1043 rescan: 1044 starttok = prestr; 1045 1046 switch (c = *prestr++) { 1047 case '|': return OR; 1048 case '*': return STAR; 1049 case '+': return PLUS; 1050 case '?': return QUEST; 1051 case '.': return DOT; 1052 case '\0': prestr--; return '\0'; 1053 case '^': 1054 case '$': 1055 return c; 1056 case '(': 1057 parens++; 1058 return c; 1059 case ')': 1060 if (parens) { 1061 parens--; 1062 return c; 1063 } 1064 /* unmatched close parenthesis; per POSIX, treat as literal */ 1065 rlxval = c; 1066 return CHAR; 1067 case '\\': 1068 rlxval = quoted(&prestr); 1069 return CHAR; 1070 default: 1071 rlxval = c; 1072 return CHAR; 1073 case '[': 1074 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 1075 FATAL("out of space in reg expr %.10s..", lastre); 1076 bp = buf; 1077 if (*prestr == '^') { 1078 cflag = 1; 1079 prestr++; 1080 } 1081 else 1082 cflag = 0; 1083 n = 2 * strlen((const char *) prestr)+1; 1084 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1085 FATAL("out of space for reg expr %.10s...", lastre); 1086 for (; ; ) { 1087 if ((c = *prestr++) == '\\') { 1088 *bp++ = '\\'; 1089 if ((c = *prestr++) == '\0') 1090 FATAL("nonterminated character class %.20s...", lastre); 1091 *bp++ = c; 1092 /* } else if (c == '\n') { */ 1093 /* FATAL("newline in character class %.20s...", lastre); */ 1094 } else if (c == '[' && *prestr == ':') { 1095 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1096 for (cc = charclasses; cc->cc_name; cc++) 1097 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1098 break; 1099 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1100 prestr[2 + cc->cc_namelen] == ']') { 1101 prestr += cc->cc_namelen + 3; 1102 /* 1103 * BUG: We begin at 1, instead of 0, since we 1104 * would otherwise prematurely terminate the 1105 * string for classes like [[:cntrl:]]. This 1106 * means that we can't match the NUL character, 1107 * not without first adapting the entire 1108 * program to track each string's length. 1109 */ 1110 for (i = 1; i <= UCHAR_MAX; i++) { 1111 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 1112 FATAL("out of space for reg expr %.10s...", lastre); 1113 if (cc->cc_func(i)) { 1114 /* escape backslash */ 1115 if (i == '\\') { 1116 *bp++ = '\\'; 1117 n++; 1118 } 1119 1120 *bp++ = i; 1121 n++; 1122 } 1123 } 1124 } else 1125 *bp++ = c; 1126 } else if (c == '[' && *prestr == '.') { 1127 char collate_char; 1128 prestr++; 1129 collate_char = *prestr++; 1130 if (*prestr == '.' && prestr[1] == ']') { 1131 prestr += 2; 1132 /* Found it: map via locale TBD: for 1133 now, simply return this char. This 1134 is sufficient to pass conformance 1135 test awk.ex 156 1136 */ 1137 if (*prestr == ']') { 1138 prestr++; 1139 rlxval = collate_char; 1140 return CHAR; 1141 } 1142 } 1143 } else if (c == '[' && *prestr == '=') { 1144 char equiv_char; 1145 prestr++; 1146 equiv_char = *prestr++; 1147 if (*prestr == '=' && prestr[1] == ']') { 1148 prestr += 2; 1149 /* Found it: map via locale TBD: for now 1150 simply return this char. This is 1151 sufficient to pass conformance test 1152 awk.ex 156 1153 */ 1154 if (*prestr == ']') { 1155 prestr++; 1156 rlxval = equiv_char; 1157 return CHAR; 1158 } 1159 } 1160 } else if (c == '\0') { 1161 FATAL("nonterminated character class %.20s", lastre); 1162 } else if (bp == buf) { /* 1st char is special */ 1163 *bp++ = c; 1164 } else if (c == ']') { 1165 *bp++ = 0; 1166 rlxstr = (uschar *) tostring((char *) buf); 1167 if (cflag == 0) 1168 return CCL; 1169 else 1170 return NCCL; 1171 } else 1172 *bp++ = c; 1173 } 1174 break; 1175 case '{': 1176 if (isdigit(*(prestr))) { 1177 num = 0; /* Process as a repetition */ 1178 n = -1; m = -1; 1179 commafound = false; 1180 digitfound = false; 1181 startreptok = prestr-1; 1182 /* Remember start of previous atom here ? */ 1183 } else { /* just a { char, not a repetition */ 1184 rlxval = c; 1185 return CHAR; 1186 } 1187 for (; ; ) { 1188 if ((c = *prestr++) == '}') { 1189 if (commafound) { 1190 if (digitfound) { /* {n,m} */ 1191 m = num; 1192 if (m < n) 1193 FATAL("illegal repetition expression: class %.20s", 1194 lastre); 1195 if (n == 0 && m == 1) { 1196 return QUEST; 1197 } 1198 } else { /* {n,} */ 1199 if (n == 0) 1200 return STAR; 1201 else if (n == 1) 1202 return PLUS; 1203 } 1204 } else { 1205 if (digitfound) { /* {n} same as {n,n} */ 1206 n = num; 1207 m = num; 1208 } else { /* {} */ 1209 FATAL("illegal repetition expression: class %.20s", 1210 lastre); 1211 } 1212 } 1213 if (repeat(starttok, prestr-starttok, lastatom, 1214 startreptok - lastatom, n, m) > 0) { 1215 if (n == 0 && m == 0) { 1216 return ZERO; 1217 } 1218 /* must rescan input for next token */ 1219 goto rescan; 1220 } 1221 /* Failed to replace: eat up {...} characters 1222 and treat like just PLUS */ 1223 return PLUS; 1224 } else if (c == '\0') { 1225 FATAL("nonterminated character class %.20s", 1226 lastre); 1227 } else if (isdigit(c)) { 1228 num = 10 * num + c - '0'; 1229 digitfound = true; 1230 } else if (c == ',') { 1231 if (commafound) 1232 FATAL("illegal repetition expression: class %.20s", 1233 lastre); 1234 /* looking for {n,} or {n,m} */ 1235 commafound = true; 1236 n = num; 1237 digitfound = false; /* reset */ 1238 num = 0; 1239 } else { 1240 FATAL("illegal repetition expression: class %.20s", 1241 lastre); 1242 } 1243 } 1244 break; 1245 } 1246 } 1247 1248 int cgoto(fa *f, int s, int c) 1249 { 1250 int *p, *q; 1251 int i, j, k; 1252 1253 assert(c == HAT || c < NCHARS); 1254 while (f->accept >= maxsetvec) { /* guessing here! */ 1255 resizesetvec(__func__); 1256 } 1257 for (i = 0; i <= f->accept; i++) 1258 setvec[i] = 0; 1259 setcnt = 0; 1260 resize_state(f, s); 1261 /* compute positions of gototab[s,c] into setvec */ 1262 p = f->posns[s]; 1263 for (i = 1; i <= *p; i++) { 1264 if ((k = f->re[p[i]].ltype) != FINAL) { 1265 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1266 || (k == DOT && c != 0 && c != HAT) 1267 || (k == ALL && c != 0) 1268 || (k == EMPTYRE && c != 0) 1269 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 1270 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 1271 q = f->re[p[i]].lfollow; 1272 for (j = 1; j <= *q; j++) { 1273 if (q[j] >= maxsetvec) { 1274 resizesetvec(__func__); 1275 } 1276 if (setvec[q[j]] == 0) { 1277 setcnt++; 1278 setvec[q[j]] = 1; 1279 } 1280 } 1281 } 1282 } 1283 } 1284 /* determine if setvec is a previous state */ 1285 tmpset[0] = setcnt; 1286 j = 1; 1287 for (i = f->accept; i >= 0; i--) 1288 if (setvec[i]) { 1289 tmpset[j++] = i; 1290 } 1291 resize_state(f, f->curstat > s ? f->curstat : s); 1292 /* tmpset == previous state? */ 1293 for (i = 1; i <= f->curstat; i++) { 1294 p = f->posns[i]; 1295 if ((k = tmpset[0]) != p[0]) 1296 goto different; 1297 for (j = 1; j <= k; j++) 1298 if (tmpset[j] != p[j]) 1299 goto different; 1300 /* setvec is state i */ 1301 if (c != HAT) 1302 f->gototab[s][c] = i; 1303 return i; 1304 different:; 1305 } 1306 1307 /* add tmpset to current set of states */ 1308 ++(f->curstat); 1309 resize_state(f, f->curstat); 1310 for (i = 0; i < NCHARS; i++) 1311 f->gototab[f->curstat][i] = 0; 1312 xfree(f->posns[f->curstat]); 1313 p = intalloc(setcnt + 1, __func__); 1314 1315 f->posns[f->curstat] = p; 1316 if (c != HAT) 1317 f->gototab[s][c] = f->curstat; 1318 for (i = 0; i <= setcnt; i++) 1319 p[i] = tmpset[i]; 1320 if (setvec[f->accept]) 1321 f->out[f->curstat] = 1; 1322 else 1323 f->out[f->curstat] = 0; 1324 return f->curstat; 1325 } 1326 1327 1328 void freefa(fa *f) /* free a finite automaton */ 1329 { 1330 int i; 1331 1332 if (f == NULL) 1333 return; 1334 for (i = 0; i < f->state_count; i++) 1335 xfree(f->gototab[i]) 1336 for (i = 0; i <= f->curstat; i++) 1337 xfree(f->posns[i]); 1338 for (i = 0; i <= f->accept; i++) { 1339 xfree(f->re[i].lfollow); 1340 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1341 xfree(f->re[i].lval.np); 1342 } 1343 xfree(f->restr); 1344 xfree(f->out); 1345 xfree(f->posns); 1346 xfree(f->gototab); 1347 xfree(f); 1348 } 1349