1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 /* lasciate ogne speranza, voi ch'intrate. */ 26 27 #if HAVE_NBTOOL_CONFIG_H 28 #include "nbtool_config.h" 29 #endif 30 31 #define DEBUG 32 33 #include <ctype.h> 34 #include <stdio.h> 35 #include <string.h> 36 #include <stdlib.h> 37 #include <assert.h> 38 #include "awk.h" 39 #include "awkgram.h" 40 41 #define HAT (NCHARS+2) /* matches ^ in regular expr */ 42 /* NCHARS is 2**n */ 43 #define MAXLIN 22 44 45 #define type(v) (v)->nobj /* badly overloaded here */ 46 #define info(v) (v)->ntype /* badly overloaded here */ 47 #define left(v) (v)->narg[0] 48 #define right(v) (v)->narg[1] 49 #define parent(v) (v)->nnext 50 51 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 52 #define ELEAF case EMPTYRE: /* empty string in regexp */ 53 #define UNARY case STAR: case PLUS: case QUEST: 54 55 /* encoding in tree Nodes: 56 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 57 left is index, right contains value or pointer to value 58 unary (STAR, PLUS, QUEST): left is child, right is null 59 binary (CAT, OR): left and right are children 60 parent contains pointer to parent 61 */ 62 63 64 int *setvec; 65 int *tmpset; 66 int maxsetvec = 0; 67 68 int rtok; /* next token in current re */ 69 int rlxval; 70 static const uschar *rlxstr; 71 static const uschar *prestr; /* current position in current re */ 72 static const uschar *lastre; /* origin of last re */ 73 74 static int setcnt; 75 static int poscnt; 76 77 uschar *patbeg; 78 int patlen; 79 80 #define NFA 128 /* cache this many dynamic fa's */ 81 fa *fatab[NFA]; 82 int nfatab = 0; /* entries in fatab */ 83 84 static void 85 resizesetvec(const char *msg) 86 { 87 if (maxsetvec == 0) 88 maxsetvec = MAXLIN; 89 else 90 maxsetvec *= 4; 91 setvec = realloc(setvec, maxsetvec * sizeof(*setvec)); 92 tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset)); 93 if (setvec == 0 || tmpset == 0) 94 overflo(msg); 95 } 96 97 static void 98 resize_state(fa *f, int state) 99 { 100 void *p; 101 int i, new_count; 102 103 if (++state < f->state_count) 104 return; 105 106 new_count = state + 10; /* needs to be tuned */ 107 108 p = realloc(f->gototab, new_count * sizeof(f->gototab[0])); 109 if (p == NULL) 110 goto out; 111 f->gototab = p; 112 113 p = realloc(f->out, new_count * sizeof(f->out[0])); 114 if (p == NULL) 115 goto out; 116 f->out = p; 117 118 p = realloc(f->posns, new_count * sizeof(f->posns[0])); 119 if (p == NULL) 120 goto out; 121 f->posns = p; 122 123 for (i = f->state_count; i < new_count; ++i) { 124 f->gototab[i] = calloc(1, NCHARS * sizeof (**f->gototab)); 125 if (f->gototab[i] == NULL) 126 goto out; 127 f->out[i] = 0; 128 f->posns[i] = NULL; 129 } 130 f->state_count = new_count; 131 return; 132 out: 133 overflo("out of memory in resize_state"); 134 } 135 136 fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ 137 { 138 int i, use, nuse; 139 fa *pfa; 140 static int now = 1; 141 142 if (setvec == 0) /* first time through any RE */ 143 resizesetvec("out of space initializing makedfa"); 144 145 if (compile_time) /* a constant for sure */ 146 return mkdfa(s, anchor); 147 for (i = 0; i < nfatab; i++) /* is it there already? */ 148 if (fatab[i]->anchor == anchor 149 && strcmp((const char *) fatab[i]->restr, s) == 0) { 150 fatab[i]->use = now++; 151 return fatab[i]; 152 } 153 pfa = mkdfa(s, anchor); 154 if (nfatab < NFA) { /* room for another */ 155 fatab[nfatab] = pfa; 156 fatab[nfatab]->use = now++; 157 nfatab++; 158 return pfa; 159 } 160 use = fatab[0]->use; /* replace least-recently used */ 161 nuse = 0; 162 for (i = 1; i < nfatab; i++) 163 if (fatab[i]->use < use) { 164 use = fatab[i]->use; 165 nuse = i; 166 } 167 freefa(fatab[nuse]); 168 fatab[nuse] = pfa; 169 pfa->use = now++; 170 return pfa; 171 } 172 173 fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ 174 /* anchor = 1 for anchored matches, else 0 */ 175 { 176 Node *p, *p1; 177 fa *f; 178 179 p = reparse(s); 180 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 181 /* put ALL STAR in front of reg. exp. */ 182 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 183 /* put FINAL after reg. exp. */ 184 185 poscnt = 0; 186 penter(p1); /* enter parent pointers and leaf indices */ 187 if ((f = calloc(1, sizeof(*f) + poscnt*sizeof(rrow))) == NULL) 188 overflo("out of space for fa"); 189 f->accept = poscnt-1; /* penter has computed number of positions in re */ 190 cfoll(f, p1); /* set up follow sets */ 191 freetr(p1); 192 resize_state(f, 1); 193 if ((f->posns[0] = calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL) 194 overflo("out of space in makedfa"); 195 if ((f->posns[1] = calloc(1, sizeof(int))) == NULL) 196 overflo("out of space in makedfa"); 197 *f->posns[1] = 0; 198 f->initstat = makeinit(f, anchor); 199 f->anchor = anchor; 200 f->restr = (uschar *) tostring(s); 201 return f; 202 } 203 204 int makeinit(fa *f, int anchor) 205 { 206 int i, k; 207 208 resize_state(f, 2); 209 f->curstat = 2; 210 f->out[2] = 0; 211 k = *(f->re[0].lfollow); 212 xfree(f->posns[2]); 213 if ((f->posns[2] = calloc(1, (k+1)*sizeof(int))) == NULL) 214 overflo("out of space in makeinit"); 215 for (i=0; i <= k; i++) { 216 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 217 } 218 if ((f->posns[2])[1] == f->accept) 219 f->out[2] = 1; 220 for (i=0; i < NCHARS; i++) 221 f->gototab[2][i] = 0; 222 f->curstat = cgoto(f, 2, HAT); 223 if (anchor) { 224 *f->posns[2] = k-1; /* leave out position 0 */ 225 for (i=0; i < k; i++) { 226 (f->posns[0])[i] = (f->posns[2])[i]; 227 } 228 229 f->out[0] = f->out[2]; 230 if (f->curstat != 2) { 231 resize_state(f, f->curstat); 232 --(*f->posns[f->curstat]); 233 } 234 } 235 return f->curstat; 236 } 237 238 void penter(Node *p) /* set up parent pointers and leaf indices */ 239 { 240 switch (type(p)) { 241 ELEAF 242 LEAF 243 info(p) = poscnt; 244 poscnt++; 245 break; 246 UNARY 247 penter(left(p)); 248 parent(left(p)) = p; 249 break; 250 case CAT: 251 case OR: 252 penter(left(p)); 253 penter(right(p)); 254 parent(left(p)) = p; 255 parent(right(p)) = p; 256 break; 257 default: /* can't happen */ 258 FATAL("can't happen: unknown type %d in penter", type(p)); 259 break; 260 } 261 } 262 263 void freetr(Node *p) /* free parse tree */ 264 { 265 switch (type(p)) { 266 ELEAF 267 LEAF 268 xfree(p); 269 break; 270 UNARY 271 freetr(left(p)); 272 xfree(p); 273 break; 274 case CAT: 275 case OR: 276 freetr(left(p)); 277 freetr(right(p)); 278 xfree(p); 279 break; 280 default: /* can't happen */ 281 FATAL("can't happen: unknown type %d in freetr", type(p)); 282 break; 283 } 284 } 285 286 /* in the parsing of regular expressions, metacharacters like . have */ 287 /* to be seen literally; \056 is not a metacharacter. */ 288 289 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ 290 { /* only pick up one 8-bit byte (2 chars) */ 291 const uschar *p; 292 int n = 0; 293 int i; 294 295 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { 296 if (isdigit(*p)) 297 n = 16 * n + *p - '0'; 298 else if (*p >= 'a' && *p <= 'f') 299 n = 16 * n + *p - 'a' + 10; 300 else if (*p >= 'A' && *p <= 'F') 301 n = 16 * n + *p - 'A' + 10; 302 } 303 *pp = p; 304 return n; 305 } 306 307 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 308 309 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 310 /* and increment *pp */ 311 { 312 const uschar *p = *pp; 313 int c; 314 315 if ((c = *p++) == 't') 316 c = '\t'; 317 else if (c == 'n') 318 c = '\n'; 319 else if (c == 'f') 320 c = '\f'; 321 else if (c == 'r') 322 c = '\r'; 323 else if (c == 'b') 324 c = '\b'; 325 else if (c == '\\') 326 c = '\\'; 327 else if (c == 'x') { /* hexadecimal goo follows */ 328 c = hexstr(&p); /* this adds a null if number is invalid */ 329 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 330 int n = c - '0'; 331 if (isoctdigit(*p)) { 332 n = 8 * n + *p++ - '0'; 333 if (isoctdigit(*p)) 334 n = 8 * n + *p++ - '0'; 335 } 336 c = n; 337 } /* else */ 338 /* c = c; */ 339 *pp = p; 340 return c; 341 } 342 343 char *cclenter(const char *argp) /* add a character class */ 344 { 345 int i, c, c2; 346 const uschar *p = (const uschar *) argp; 347 const uschar *op; 348 uschar *bp; 349 static uschar *buf = 0; 350 static int bufsz = 100; 351 352 op = p; 353 if (buf == 0 && (buf = malloc(bufsz)) == NULL) 354 FATAL("out of space for character class [%.10s...] 1", p); 355 bp = buf; 356 for (i = 0; (c = *p++) != 0; ) { 357 if (c == '\\') { 358 c = quoted(&p); 359 } else if (c == '-' && i > 0 && bp[-1] != 0) { 360 if (*p != 0) { 361 c = bp[-1]; 362 c2 = *p++; 363 if (c2 == '\\') 364 c2 = quoted(&p); 365 if (c > c2) { /* empty; ignore */ 366 bp--; 367 i--; 368 continue; 369 } 370 while (c < c2) { 371 if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter1")) 372 FATAL("out of space for character class [%.10s...] 2", p); 373 *bp++ = ++c; 374 i++; 375 } 376 continue; 377 } 378 } 379 if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2")) 380 FATAL("out of space for character class [%.10s...] 3", p); 381 *bp++ = c; 382 i++; 383 } 384 *bp = 0; 385 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); 386 free(__UNCONST(op)); 387 return (char *) tostring((char *) buf); 388 } 389 390 void overflo(const char *s) 391 { 392 FATAL("regular expression too big: %.30s...", s); 393 } 394 395 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 396 { 397 int i; 398 int *p; 399 400 switch (type(v)) { 401 ELEAF 402 LEAF 403 f->re[info(v)].ltype = type(v); 404 f->re[info(v)].lval.np = right(v); 405 while (f->accept >= maxsetvec) /* guessing here! */ 406 resizesetvec("out of space in cfoll()"); 407 for (i = 0; i <= f->accept; i++) 408 setvec[i] = 0; 409 setcnt = 0; 410 follow(v); /* computes setvec and setcnt */ 411 if ((p = calloc(1, (setcnt+1)*sizeof(int))) == NULL) 412 overflo("out of space building follow set"); 413 f->re[info(v)].lfollow = p; 414 *p = setcnt; 415 for (i = f->accept; i >= 0; i--) 416 if (setvec[i] == 1) 417 *++p = i; 418 break; 419 UNARY 420 cfoll(f,left(v)); 421 break; 422 case CAT: 423 case OR: 424 cfoll(f,left(v)); 425 cfoll(f,right(v)); 426 break; 427 default: /* can't happen */ 428 FATAL("can't happen: unknown type %d in cfoll", type(v)); 429 } 430 } 431 432 int first(Node *p) /* collects initially active leaves of p into setvec */ 433 /* returns 0 if p matches empty string */ 434 { 435 int b, lp; 436 437 switch (type(p)) { 438 ELEAF 439 LEAF 440 lp = info(p); /* look for high-water mark of subscripts */ 441 while (setcnt >= maxsetvec || lp >= maxsetvec) /* guessing here! */ 442 resizesetvec("out of space in first()"); 443 if (type(p) == EMPTYRE) { 444 setvec[lp] = 0; 445 return(0); 446 } 447 if (setvec[lp] != 1) { 448 setvec[lp] = 1; 449 setcnt++; 450 } 451 if (type(p) == CCL && (*(char *) right(p)) == '\0') 452 return(0); /* empty CCL */ 453 else return(1); 454 case PLUS: 455 if (first(left(p)) == 0) return(0); 456 return(1); 457 case STAR: 458 case QUEST: 459 first(left(p)); 460 return(0); 461 case CAT: 462 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 463 return(1); 464 case OR: 465 b = first(right(p)); 466 if (first(left(p)) == 0 || b == 0) return(0); 467 return(1); 468 } 469 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 470 return(-1); 471 } 472 473 void follow(Node *v) /* collects leaves that can follow v into setvec */ 474 { 475 Node *p; 476 477 if (type(v) == FINAL) 478 return; 479 p = parent(v); 480 switch (type(p)) { 481 case STAR: 482 case PLUS: 483 first(v); 484 follow(p); 485 return; 486 487 case OR: 488 case QUEST: 489 follow(p); 490 return; 491 492 case CAT: 493 if (v == left(p)) { /* v is left child of p */ 494 if (first(right(p)) == 0) { 495 follow(p); 496 return; 497 } 498 } else /* v is right child */ 499 follow(p); 500 return; 501 } 502 } 503 504 int member(int c, const char *sarg) /* is c in s? */ 505 { 506 const uschar *s = (const uschar *) sarg; 507 508 while (*s) 509 if (c == *s++) 510 return(1); 511 return(0); 512 } 513 514 int match(fa *f, const char *p0) /* shortest match ? */ 515 { 516 int s, ns; 517 const uschar *p = (const uschar *) p0; 518 519 s = f->initstat; 520 assert (s < f->state_count); 521 522 if (f->out[s]) 523 return(1); 524 do { 525 /* assert(*p < NCHARS); */ 526 if ((ns = f->gototab[s][*p]) != 0) 527 s = ns; 528 else 529 s = cgoto(f, s, *p); 530 531 assert (s < f->state_count); 532 533 if (f->out[s]) 534 return(1); 535 } while (*p++ != 0); 536 return(0); 537 } 538 539 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 540 { 541 int s, ns; 542 uschar *p = __UNCONST(p0); 543 uschar *q; 544 545 s = f->initstat; 546 assert(s < f->state_count); 547 patbeg = p; 548 patlen = -1; 549 do { 550 q = p; 551 do { 552 if (f->out[s]) /* final state */ 553 patlen = q-p; 554 /* assert(*q < NCHARS); */ 555 if ((ns = f->gototab[s][*q]) != 0) 556 s = ns; 557 else 558 s = cgoto(f, s, *q); 559 560 assert(s < f->state_count); 561 562 if (s == 1) { /* no transition */ 563 if (patlen >= 0) { 564 patbeg = p; 565 return(1); 566 } 567 else 568 goto nextin; /* no match */ 569 } 570 } while (*q++ != 0); 571 if (f->out[s]) 572 patlen = q-p-1; /* don't count $ */ 573 if (patlen >= 0) { 574 patbeg = p; 575 return(1); 576 } 577 nextin: 578 s = 2; 579 } while (*p++ != 0); 580 return (0); 581 } 582 583 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 584 { 585 int s, ns; 586 uschar *p = __UNCONST(p0); 587 uschar *q; 588 589 s = f->initstat; 590 assert(s < f->state_count); 591 592 patlen = -1; 593 while (*p) { 594 q = p; 595 do { 596 if (f->out[s]) /* final state */ 597 patlen = q-p; 598 /* assert(*q < NCHARS); */ 599 if ((ns = f->gototab[s][*q]) != 0) 600 s = ns; 601 else 602 s = cgoto(f, s, *q); 603 604 assert(s < f->state_count); 605 606 if (s == 1) { /* no transition */ 607 if (patlen > 0) { 608 patbeg = p; 609 return(1); 610 } else 611 goto nnextin; /* no nonempty match */ 612 } 613 } while (*q++ != 0); 614 if (f->out[s]) 615 patlen = q-p-1; /* don't count $ */ 616 if (patlen > 0 ) { 617 patbeg = p; 618 return(1); 619 } 620 nnextin: 621 s = 2; 622 p++; 623 } 624 return (0); 625 } 626 627 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 628 { /* uses relex() to scan regular expression */ 629 Node *np; 630 631 dprintf( ("reparse <%s>\n", p) ); 632 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 633 rtok = relex(); 634 /* GNU compatibility: an empty regexp matches anything */ 635 if (rtok == '\0') { 636 /* FATAL("empty regular expression"); previous */ 637 return(op2(EMPTYRE, NIL, NIL)); 638 } 639 np = regexp(); 640 if (rtok != '\0') 641 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 642 return(np); 643 } 644 645 Node *regexp(void) /* top-level parse of reg expr */ 646 { 647 return (alt(concat(primary()))); 648 } 649 650 Node *primary(void) 651 { 652 Node *np; 653 654 switch (rtok) { 655 case CHAR: 656 np = op2(CHAR, NIL, itonp(rlxval)); 657 rtok = relex(); 658 return (unary(np)); 659 case ALL: 660 rtok = relex(); 661 return (unary(op2(ALL, NIL, NIL))); 662 case EMPTYRE: 663 rtok = relex(); 664 return (unary(op2(ALL, NIL, NIL))); 665 case DOT: 666 rtok = relex(); 667 return (unary(op2(DOT, NIL, NIL))); 668 case CCL: 669 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 670 rtok = relex(); 671 return (unary(np)); 672 case NCCL: 673 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 674 rtok = relex(); 675 return (unary(np)); 676 case '^': 677 rtok = relex(); 678 return (unary(op2(CHAR, NIL, itonp(HAT)))); 679 case '$': 680 rtok = relex(); 681 return (unary(op2(CHAR, NIL, NIL))); 682 case '(': 683 rtok = relex(); 684 if (rtok == ')') { /* special pleading for () */ 685 rtok = relex(); 686 return unary(op2(CCL, NIL, (Node *) tostring(""))); 687 } 688 np = regexp(); 689 if (rtok == ')') { 690 rtok = relex(); 691 return (unary(np)); 692 } 693 else 694 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 695 default: 696 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 697 } 698 return 0; /*NOTREACHED*/ 699 } 700 701 Node *concat(Node *np) 702 { 703 switch (rtok) { 704 case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(': 705 return (concat(op2(CAT, np, primary()))); 706 } 707 return (np); 708 } 709 710 Node *alt(Node *np) 711 { 712 if (rtok == OR) { 713 rtok = relex(); 714 return (alt(op2(OR, np, concat(primary())))); 715 } 716 return (np); 717 } 718 719 Node *unary(Node *np) 720 { 721 switch (rtok) { 722 case STAR: 723 rtok = relex(); 724 return (unary(op2(STAR, np, NIL))); 725 case PLUS: 726 rtok = relex(); 727 return (unary(op2(PLUS, np, NIL))); 728 case QUEST: 729 rtok = relex(); 730 return (unary(op2(QUEST, np, NIL))); 731 default: 732 return (np); 733 } 734 } 735 736 /* 737 * Character class definitions conformant to the POSIX locale as 738 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 739 * and operating character sets are both ASCII (ISO646) or supersets 740 * thereof. 741 * 742 * Note that to avoid overflowing the temporary buffer used in 743 * relex(), the expanded character class (prior to range expansion) 744 * must be less than twice the size of their full name. 745 */ 746 747 /* Because isblank doesn't show up in any of the header files on any 748 * system i use, it's defined here. if some other locale has a richer 749 * definition of "blank", define HAS_ISBLANK and provide your own 750 * version. 751 * the parentheses here are an attempt to find a path through the maze 752 * of macro definition and/or function and/or version provided. thanks 753 * to nelson beebe for the suggestion; let's see if it works everywhere. 754 */ 755 756 /* #define HAS_ISBLANK */ 757 758 static const struct charclass { 759 const char *cc_name; 760 int cc_namelen; 761 int (*cc_func)(int); 762 } charclasses[] = { 763 { "alnum", 5, isalnum }, 764 { "alpha", 5, isalpha }, 765 { "blank", 5, isblank }, 766 { "cntrl", 5, iscntrl }, 767 { "digit", 5, isdigit }, 768 { "graph", 5, isgraph }, 769 { "lower", 5, islower }, 770 { "print", 5, isprint }, 771 { "punct", 5, ispunct }, 772 { "space", 5, isspace }, 773 { "upper", 5, isupper }, 774 { "xdigit", 6, isxdigit }, 775 { NULL, 0, NULL }, 776 }; 777 778 779 int relex(void) /* lexical analyzer for reparse */ 780 { 781 int c, n; 782 int cflag; 783 static uschar *buf = 0; 784 static int bufsz = 100; 785 uschar *bp; 786 const struct charclass *cc; 787 int i; 788 789 switch (c = *prestr++) { 790 case '|': return OR; 791 case '*': return STAR; 792 case '+': return PLUS; 793 case '?': return QUEST; 794 case '.': return DOT; 795 case '\0': prestr--; return '\0'; 796 case '^': 797 case '$': 798 case '(': 799 case ')': 800 return c; 801 case '\\': 802 rlxval = quoted(&prestr); 803 return CHAR; 804 default: 805 rlxval = c; 806 return CHAR; 807 case '[': 808 if (buf == 0 && (buf = malloc(bufsz)) == NULL) 809 FATAL("out of space in reg expr %.10s..", lastre); 810 bp = buf; 811 if (*prestr == '^') { 812 cflag = 1; 813 prestr++; 814 } 815 else 816 cflag = 0; 817 n = 2 * strlen((const char *) prestr)+1; 818 if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1")) 819 FATAL("out of space for reg expr %.10s...", lastre); 820 for (; ; ) { 821 if ((c = *prestr++) == '\\') { 822 *bp++ = '\\'; 823 if ((c = *prestr++) == '\0') 824 FATAL("nonterminated character class %.20s...", lastre); 825 *bp++ = c; 826 /* } else if (c == '\n') { */ 827 /* FATAL("newline in character class %.20s...", lastre); */ 828 } else if (c == '[' && *prestr == ':') { 829 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 830 for (cc = charclasses; cc->cc_name; cc++) 831 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 832 break; 833 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 834 prestr[2 + cc->cc_namelen] == ']') { 835 prestr += cc->cc_namelen + 3; 836 for (i = 1; i < NCHARS; i++) { 837 if (!adjbuf(&buf, &bufsz, bp-buf+1, 100, &bp, "relex2")) 838 FATAL("out of space for reg expr %.10s...", lastre); 839 if (cc->cc_func(i)) { 840 *bp++ = i; 841 n++; 842 } 843 } 844 } else 845 *bp++ = c; 846 } else if (c == '\0') { 847 FATAL("nonterminated character class %.20s", lastre); 848 } else if (bp == buf) { /* 1st char is special */ 849 *bp++ = c; 850 } else if (c == ']') { 851 *bp++ = 0; 852 rlxstr = (uschar *) tostring((char *) buf); 853 if (cflag == 0) 854 return CCL; 855 else 856 return NCCL; 857 } else 858 *bp++ = c; 859 } 860 } 861 } 862 863 int cgoto(fa *f, int s, int c) 864 { 865 int i, j, k; 866 int *p, *q; 867 868 assert(c == HAT || c < NCHARS); 869 while (f->accept >= maxsetvec) /* guessing here! */ 870 resizesetvec("out of space in cgoto()"); 871 for (i = 0; i <= f->accept; i++) 872 setvec[i] = 0; 873 setcnt = 0; 874 resize_state(f, s); 875 /* compute positions of gototab[s,c] into setvec */ 876 p = f->posns[s]; 877 for (i = 1; i <= *p; i++) { 878 if ((k = f->re[p[i]].ltype) != FINAL) { 879 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 880 || (k == DOT && c != 0 && c != HAT) 881 || (k == ALL && c != 0) 882 || (k == EMPTYRE && c != 0) 883 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 884 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 885 q = f->re[p[i]].lfollow; 886 for (j = 1; j <= *q; j++) { 887 if (q[j] >= maxsetvec) 888 resizesetvec("cgoto overflow"); 889 if (setvec[q[j]] == 0) { 890 setcnt++; 891 setvec[q[j]] = 1; 892 } 893 } 894 } 895 } 896 } 897 /* determine if setvec is a previous state */ 898 tmpset[0] = setcnt; 899 j = 1; 900 for (i = f->accept; i >= 0; i--) 901 if (setvec[i]) { 902 tmpset[j++] = i; 903 } 904 905 resize_state(f, f->curstat > s ? f->curstat : s); 906 /* tmpset == previous state? */ 907 for (i = 1; i <= f->curstat; i++) { 908 p = f->posns[i]; 909 if ((k = tmpset[0]) != p[0]) 910 goto different; 911 for (j = 1; j <= k; j++) 912 if (tmpset[j] != p[j]) 913 goto different; 914 /* setvec is state i */ 915 if (c != HAT) 916 f->gototab[s][c] = i; 917 return i; 918 different:; 919 } 920 921 /* add tmpset to current set of states */ 922 ++(f->curstat); 923 resize_state(f, f->curstat); 924 for (i = 0; i < NCHARS; i++) 925 f->gototab[f->curstat][i] = 0; 926 xfree(f->posns[f->curstat]); 927 if ((p = calloc(1, (setcnt+1)*sizeof(int))) == NULL) 928 overflo("out of space in cgoto"); 929 930 f->posns[f->curstat] = p; 931 if (c != HAT) 932 f->gototab[s][c] = f->curstat; 933 for (i = 0; i <= setcnt; i++) 934 p[i] = tmpset[i]; 935 if (setvec[f->accept]) 936 f->out[f->curstat] = 1; 937 else 938 f->out[f->curstat] = 0; 939 return f->curstat; 940 } 941 942 943 void freefa(fa *f) /* free a finite automaton */ 944 { 945 int i; 946 947 if (f == NULL) 948 return; 949 for (i = 0; i < f->state_count; i++) { 950 xfree(f->gototab[i]) 951 xfree(f->posns[i]); 952 } 953 for (i = 0; i <= f->accept; i++) { 954 xfree(f->re[i].lfollow); 955 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 956 xfree((f->re[i].lval.np)); 957 } 958 xfree(f->restr); 959 xfree(f->out); 960 xfree(f->posns); 961 xfree(f->gototab); 962 xfree(f); 963 } 964