1 /* $OpenBSD: b.c,v 1.53 2024/06/03 00:55:05 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 /* lasciate ogne speranza, voi ch'intrate. */ 27 28 #define DEBUG 29 30 #include <ctype.h> 31 #include <limits.h> 32 #include <stdio.h> 33 #include <string.h> 34 #include <stdlib.h> 35 #include "awk.h" 36 #include "awkgram.tab.h" 37 38 #define MAXLIN 22 39 40 #define type(v) (v)->nobj /* badly overloaded here */ 41 #define info(v) (v)->ntype /* badly overloaded here */ 42 #define left(v) (v)->narg[0] 43 #define right(v) (v)->narg[1] 44 #define parent(v) (v)->nnext 45 46 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 47 #define ELEAF case EMPTYRE: /* empty string in regexp */ 48 #define UNARY case STAR: case PLUS: case QUEST: 49 50 /* encoding in tree Nodes: 51 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 52 left is index, right contains value or pointer to value 53 unary (STAR, PLUS, QUEST): left is child, right is null 54 binary (CAT, OR): left and right are children 55 parent contains pointer to parent 56 */ 57 58 59 int *setvec; 60 int *tmpset; 61 int maxsetvec = 0; 62 63 int rtok; /* next token in current re */ 64 int rlxval; 65 static const uschar *rlxstr; 66 static const uschar *prestr; /* current position in current re */ 67 static const uschar *lastre; /* origin of last re */ 68 static const uschar *lastatom; /* origin of last Atom */ 69 static const uschar *starttok; 70 static const uschar *basestr; /* starts with original, replaced during 71 repetition processing */ 72 static const uschar *firstbasestr; 73 74 static int setcnt; 75 static int poscnt; 76 77 const char *patbeg; 78 int patlen; 79 80 #define NFA 128 /* cache this many dynamic fa's */ 81 fa *fatab[NFA]; 82 int nfatab = 0; /* entries in fatab */ 83 84 /* utf-8 mechanism: 85 86 For most of Awk, utf-8 strings just "work", since they look like 87 null-terminated sequences of 8-bit bytes. 88 89 Functions like length(), index(), and substr() have to operate 90 in units of utf-8 characters. The u8_* functions in run.c 91 handle this. 92 93 Regular expressions are more complicated, since the basic 94 mechanism of the goto table used 8-bit byte indices into the 95 gototab entries to compute the next state. Unicode is a lot 96 bigger, so the gototab entries are now structs with a character 97 and a next state. These are sorted by code point and binary 98 searched. 99 100 Throughout the RE mechanism in b.c, utf-8 characters are 101 converted to their utf-32 value. This mostly shows up in 102 cclenter, which expands character class ranges like a-z and now 103 alpha-omega. The size of a gototab array is still about 256. 104 This should be dynamic, but for now things work ok for a single 105 code page of Unicode, which is the most likely case. 106 107 The code changes are localized in run.c and b.c. I have added a 108 handful of functions to somewhat better hide the implementation, 109 but a lot more could be done. 110 111 */ 112 113 static int entry_cmp(const void *l, const void *r); 114 static int get_gototab(fa*, int, int); 115 static int set_gototab(fa*, int, int, int); 116 static void clear_gototab(fa*, int); 117 118 static int * 119 intalloc(size_t n, const char *f) 120 { 121 int *p = (int *) calloc(n, sizeof(int)); 122 if (p == NULL) 123 overflo(f); 124 return p; 125 } 126 127 static void 128 allocsetvec(const char *f) 129 { 130 maxsetvec = MAXLIN; 131 setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec)); 132 tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset)); 133 if (setvec == NULL || tmpset == NULL) 134 overflo(f); 135 } 136 137 static void 138 resizesetvec(const char *f) 139 { 140 setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec)); 141 tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset)); 142 if (setvec == NULL || tmpset == NULL) 143 overflo(f); 144 maxsetvec *= 4; 145 } 146 147 static void 148 resize_state(fa *f, int state) 149 { 150 gtt *p; 151 uschar *p2; 152 int **p3; 153 int i, new_count; 154 155 if (++state < f->state_count) 156 return; 157 158 new_count = state + 10; /* needs to be tuned */ 159 160 p = (gtt *) reallocarray(f->gototab, new_count, sizeof(gtt)); 161 if (p == NULL) 162 goto out; 163 f->gototab = p; 164 165 p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0])); 166 if (p2 == NULL) 167 goto out; 168 f->out = p2; 169 170 p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0])); 171 if (p3 == NULL) 172 goto out; 173 f->posns = p3; 174 175 for (i = f->state_count; i < new_count; ++i) { 176 f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte)); 177 if (f->gototab[i].entries == NULL) 178 goto out; 179 f->gototab[i].allocated = NCHARS; 180 f->gototab[i].inuse = 0; 181 f->out[i] = 0; 182 f->posns[i] = NULL; 183 } 184 f->state_count = new_count; 185 return; 186 out: 187 overflo(__func__); 188 } 189 190 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 191 { 192 int i, use, nuse; 193 fa *pfa; 194 static int now = 1; 195 196 if (setvec == NULL) { /* first time through any RE */ 197 allocsetvec(__func__); 198 } 199 200 if (compile_time != RUNNING) /* a constant for sure */ 201 return mkdfa(s, anchor); 202 for (i = 0; i < nfatab; i++) /* is it there already? */ 203 if (fatab[i]->anchor == anchor 204 && strcmp((const char *) fatab[i]->restr, s) == 0) { 205 fatab[i]->use = now++; 206 return fatab[i]; 207 } 208 pfa = mkdfa(s, anchor); 209 if (nfatab < NFA) { /* room for another */ 210 fatab[nfatab] = pfa; 211 fatab[nfatab]->use = now++; 212 nfatab++; 213 return pfa; 214 } 215 use = fatab[0]->use; /* replace least-recently used */ 216 nuse = 0; 217 for (i = 1; i < nfatab; i++) 218 if (fatab[i]->use < use) { 219 use = fatab[i]->use; 220 nuse = i; 221 } 222 freefa(fatab[nuse]); 223 fatab[nuse] = pfa; 224 pfa->use = now++; 225 return pfa; 226 } 227 228 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 229 /* anchor = true for anchored matches, else false */ 230 { 231 Node *p, *p1; 232 fa *f; 233 234 firstbasestr = (const uschar *) s; 235 basestr = firstbasestr; 236 p = reparse(s); 237 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 238 /* put ALL STAR in front of reg. exp. */ 239 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 240 /* put FINAL after reg. exp. */ 241 242 poscnt = 0; 243 penter(p1); /* enter parent pointers and leaf indices */ 244 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 245 overflo(__func__); 246 f->accept = poscnt-1; /* penter has computed number of positions in re */ 247 cfoll(f, p1); /* set up follow sets */ 248 freetr(p1); 249 resize_state(f, 1); 250 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 251 f->posns[1] = intalloc(1, __func__); 252 *f->posns[1] = 0; 253 f->initstat = makeinit(f, anchor); 254 f->anchor = anchor; 255 f->restr = (uschar *) tostring(s); 256 if (firstbasestr != basestr) { 257 if (basestr) 258 xfree(basestr); 259 } 260 return f; 261 } 262 263 int makeinit(fa *f, bool anchor) 264 { 265 int i, k; 266 267 f->curstat = 2; 268 f->out[2] = 0; 269 k = *(f->re[0].lfollow); 270 xfree(f->posns[2]); 271 f->posns[2] = intalloc(k + 1, __func__); 272 for (i = 0; i <= k; i++) { 273 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 274 } 275 if ((f->posns[2])[1] == f->accept) 276 f->out[2] = 1; 277 clear_gototab(f, 2); 278 f->curstat = cgoto(f, 2, HAT); 279 if (anchor) { 280 *f->posns[2] = k-1; /* leave out position 0 */ 281 for (i = 0; i < k; i++) { 282 (f->posns[0])[i] = (f->posns[2])[i]; 283 } 284 285 f->out[0] = f->out[2]; 286 if (f->curstat != 2) 287 --(*f->posns[f->curstat]); 288 } 289 return f->curstat; 290 } 291 292 void penter(Node *p) /* set up parent pointers and leaf indices */ 293 { 294 switch (type(p)) { 295 ELEAF 296 LEAF 297 info(p) = poscnt; 298 poscnt++; 299 break; 300 UNARY 301 penter(left(p)); 302 parent(left(p)) = p; 303 break; 304 case CAT: 305 case OR: 306 penter(left(p)); 307 penter(right(p)); 308 parent(left(p)) = p; 309 parent(right(p)) = p; 310 break; 311 case ZERO: 312 break; 313 default: /* can't happen */ 314 FATAL("can't happen: unknown type %d in penter", type(p)); 315 break; 316 } 317 } 318 319 void freetr(Node *p) /* free parse tree */ 320 { 321 switch (type(p)) { 322 ELEAF 323 LEAF 324 xfree(p); 325 break; 326 UNARY 327 case ZERO: 328 freetr(left(p)); 329 xfree(p); 330 break; 331 case CAT: 332 case OR: 333 freetr(left(p)); 334 freetr(right(p)); 335 xfree(p); 336 break; 337 default: /* can't happen */ 338 FATAL("can't happen: unknown type %d in freetr", type(p)); 339 break; 340 } 341 } 342 343 /* in the parsing of regular expressions, metacharacters like . have */ 344 /* to be seen literally; \056 is not a metacharacter. */ 345 346 static int 347 hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */ 348 { /* only pick up one 8-bit byte (2 chars) */ 349 const uschar *p; 350 int n = 0; 351 int i; 352 353 for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) { 354 if (isdigit(*p)) 355 n = 16 * n + *p - '0'; 356 else if (*p >= 'a' && *p <= 'f') 357 n = 16 * n + *p - 'a' + 10; 358 else if (*p >= 'A' && *p <= 'F') 359 n = 16 * n + *p - 'A' + 10; 360 } 361 *pp = p; 362 return n; 363 } 364 365 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 366 367 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 368 /* and increment *pp */ 369 { 370 const uschar *p = *pp; 371 int c; 372 373 /* BUG: should advance by utf-8 char even if makes no sense */ 374 375 if ((c = *p++) == 't') { 376 c = '\t'; 377 } else if (c == 'n') { 378 c = '\n'; 379 } else if (c == 'f') { 380 c = '\f'; 381 } else if (c == 'r') { 382 c = '\r'; 383 } else if (c == 'b') { 384 c = '\b'; 385 } else if (c == 'v') { 386 c = '\v'; 387 } else if (c == 'a') { 388 c = '\a'; 389 } else if (c == '\\') { 390 c = '\\'; 391 } else if (c == 'x') { /* 2 hex digits follow */ 392 c = hexstr(&p, 2); /* this adds a null if number is invalid */ 393 } else if (c == 'u') { /* unicode char number up to 8 hex digits */ 394 c = hexstr(&p, 8); 395 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 396 int n = c - '0'; 397 if (isoctdigit(*p)) { 398 n = 8 * n + *p++ - '0'; 399 if (isoctdigit(*p)) 400 n = 8 * n + *p++ - '0'; 401 } 402 c = n; 403 } /* else */ 404 /* c = c; */ 405 *pp = p; 406 return c; 407 } 408 409 int *cclenter(const char *argp) /* add a character class */ 410 { 411 int i, c, c2; 412 int n; 413 const uschar *p = (const uschar *) argp; 414 int *bp, *retp; 415 static int *buf = NULL; 416 static int bufsz = 100; 417 418 if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL) 419 FATAL("out of space for character class [%.10s...] 1", p); 420 bp = buf; 421 for (i = 0; *p != 0; ) { 422 n = u8_rune(&c, (const char *) p); 423 p += n; 424 if (c == '\\') { 425 c = quoted(&p); 426 } else if (c == '-' && i > 0 && bp[-1] != 0) { 427 if (*p != 0) { 428 c = bp[-1]; 429 /* c2 = *p++; */ 430 n = u8_rune(&c2, (const char *) p); 431 p += n; 432 if (c2 == '\\') 433 c2 = quoted(&p); /* BUG: sets p, has to be u8 size */ 434 if (c > c2) { /* empty; ignore */ 435 bp--; 436 i--; 437 continue; 438 } 439 while (c < c2) { 440 if (i >= bufsz) { 441 buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int)); 442 if (buf == NULL) 443 FATAL("out of space for character class [%.10s...] 2", p); 444 bufsz *= 2; 445 bp = buf + i; 446 } 447 *bp++ = ++c; 448 i++; 449 } 450 continue; 451 } 452 } 453 if (i >= bufsz) { 454 buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int)); 455 if (buf == NULL) 456 FATAL("out of space for character class [%.10s...] 2", p); 457 bufsz *= 2; 458 bp = buf + i; 459 } 460 *bp++ = c; 461 i++; 462 } 463 *bp = 0; 464 /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */ 465 /* xfree(op); BUG: what are we freeing here? */ 466 retp = (int *) calloc(bp-buf+1, sizeof(int)); 467 for (i = 0; i < bp-buf+1; i++) 468 retp[i] = buf[i]; 469 return retp; 470 } 471 472 void overflo(const char *s) 473 { 474 FATAL("regular expression too big: out of space in %.30s...", s); 475 } 476 477 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 478 { 479 int i; 480 int *p; 481 482 switch (type(v)) { 483 ELEAF 484 LEAF 485 f->re[info(v)].ltype = type(v); 486 f->re[info(v)].lval.np = right(v); 487 while (f->accept >= maxsetvec) { /* guessing here! */ 488 resizesetvec(__func__); 489 } 490 for (i = 0; i <= f->accept; i++) 491 setvec[i] = 0; 492 setcnt = 0; 493 follow(v); /* computes setvec and setcnt */ 494 p = intalloc(setcnt + 1, __func__); 495 f->re[info(v)].lfollow = p; 496 *p = setcnt; 497 for (i = f->accept; i >= 0; i--) 498 if (setvec[i] == 1) 499 *++p = i; 500 break; 501 UNARY 502 cfoll(f,left(v)); 503 break; 504 case CAT: 505 case OR: 506 cfoll(f,left(v)); 507 cfoll(f,right(v)); 508 break; 509 case ZERO: 510 break; 511 default: /* can't happen */ 512 FATAL("can't happen: unknown type %d in cfoll", type(v)); 513 } 514 } 515 516 int first(Node *p) /* collects initially active leaves of p into setvec */ 517 /* returns 0 if p matches empty string */ 518 { 519 int b, lp; 520 521 switch (type(p)) { 522 ELEAF 523 LEAF 524 lp = info(p); /* look for high-water mark of subscripts */ 525 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 526 resizesetvec(__func__); 527 } 528 if (type(p) == EMPTYRE) { 529 setvec[lp] = 0; 530 return(0); 531 } 532 if (setvec[lp] != 1) { 533 setvec[lp] = 1; 534 setcnt++; 535 } 536 if (type(p) == CCL && (*(int *) right(p)) == 0) 537 return(0); /* empty CCL */ 538 return(1); 539 case PLUS: 540 if (first(left(p)) == 0) 541 return(0); 542 return(1); 543 case STAR: 544 case QUEST: 545 first(left(p)); 546 return(0); 547 case CAT: 548 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 549 return(1); 550 case OR: 551 b = first(right(p)); 552 if (first(left(p)) == 0 || b == 0) return(0); 553 return(1); 554 case ZERO: 555 return 0; 556 } 557 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 558 return(-1); 559 } 560 561 void follow(Node *v) /* collects leaves that can follow v into setvec */ 562 { 563 Node *p; 564 565 if (type(v) == FINAL) 566 return; 567 p = parent(v); 568 switch (type(p)) { 569 case STAR: 570 case PLUS: 571 first(v); 572 follow(p); 573 return; 574 575 case OR: 576 case QUEST: 577 follow(p); 578 return; 579 580 case CAT: 581 if (v == left(p)) { /* v is left child of p */ 582 if (first(right(p)) == 0) { 583 follow(p); 584 return; 585 } 586 } else /* v is right child */ 587 follow(p); 588 return; 589 } 590 } 591 592 int member(int c, int *sarg) /* is c in s? */ 593 { 594 int *s = (int *) sarg; 595 596 while (*s) 597 if (c == *s++) 598 return(1); 599 return(0); 600 } 601 602 static void resize_gototab(fa *f, int state) 603 { 604 size_t new_size = f->gototab[state].allocated * 2; 605 gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); 606 if (p == NULL) 607 overflo(__func__); 608 609 // need to initialized the new memory to zero 610 size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size 611 memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out 612 613 f->gototab[state].allocated = new_size; // update gototab info 614 f->gototab[state].entries = p; 615 } 616 617 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */ 618 { 619 gtte key; 620 gtte *item; 621 622 key.ch = ch; 623 key.state = 0; /* irrelevant */ 624 item = (gtte *) bsearch(& key, f->gototab[state].entries, 625 f->gototab[state].inuse, sizeof(gtte), 626 entry_cmp); 627 628 if (item == NULL) 629 return 0; 630 else 631 return item->state; 632 } 633 634 static int entry_cmp(const void *l, const void *r) 635 { 636 const gtte *left, *right; 637 638 left = (const gtte *) l; 639 right = (const gtte *) r; 640 641 return left->ch - right->ch; 642 } 643 644 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */ 645 { 646 if (f->gototab[state].inuse == 0) { 647 f->gototab[state].entries[0].ch = ch; 648 f->gototab[state].entries[0].state = val; 649 f->gototab[state].inuse++; 650 return val; 651 } else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) { 652 // not seen yet, insert and return 653 gtt *tab = & f->gototab[state]; 654 if (tab->inuse + 1 >= tab->allocated) 655 resize_gototab(f, state); 656 657 f->gototab[state].entries[f->gototab[state].inuse].ch = ch; 658 f->gototab[state].entries[f->gototab[state].inuse].state = val; 659 f->gototab[state].inuse++; 660 return val; 661 } else { 662 // maybe we have it, maybe we don't 663 gtte key; 664 gtte *item; 665 666 key.ch = ch; 667 key.state = 0; /* irrelevant */ 668 item = (gtte *) bsearch(& key, f->gototab[state].entries, 669 f->gototab[state].inuse, sizeof(gtte), 670 entry_cmp); 671 672 if (item != NULL) { 673 // we have it, update state and return 674 item->state = val; 675 return item->state; 676 } 677 // otherwise, fall through to insert and reallocate. 678 } 679 680 gtt *tab = & f->gototab[state]; 681 if (tab->inuse + 1 >= tab->allocated) 682 resize_gototab(f, state); 683 f->gototab[state].entries[tab->inuse].ch = ch; 684 f->gototab[state].entries[tab->inuse].state = val; 685 ++tab->inuse; 686 687 qsort(f->gototab[state].entries, 688 f->gototab[state].inuse, sizeof(gtte), entry_cmp); 689 690 return val; /* not used anywhere at the moment */ 691 } 692 693 static void clear_gototab(fa *f, int state) 694 { 695 memset(f->gototab[state].entries, 0, 696 f->gototab[state].allocated * sizeof(gtte)); 697 f->gototab[state].inuse = 0; 698 } 699 700 int match(fa *f, const char *p0) /* shortest match ? */ 701 { 702 int s, ns; 703 int n; 704 int rune; 705 const uschar *p = (const uschar *) p0; 706 707 /* return pmatch(f, p0); does it matter whether longest or shortest? */ 708 709 s = f->initstat; 710 assert (s < f->state_count); 711 712 if (f->out[s]) 713 return(1); 714 do { 715 /* assert(*p < NCHARS); */ 716 n = u8_rune(&rune, (const char *) p); 717 if ((ns = get_gototab(f, s, rune)) != 0) 718 s = ns; 719 else 720 s = cgoto(f, s, rune); 721 if (f->out[s]) 722 return(1); 723 if (*p == 0) 724 break; 725 p += n; 726 } while (1); /* was *p++ != 0 */ 727 return(0); 728 } 729 730 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 731 { 732 int s, ns; 733 int n; 734 int rune; 735 const uschar *p = (const uschar *) p0; 736 const uschar *q; 737 738 s = f->initstat; 739 assert(s < f->state_count); 740 741 patbeg = (const char *)p; 742 patlen = -1; 743 do { 744 q = p; 745 do { 746 if (f->out[s]) /* final state */ 747 patlen = q-p; 748 /* assert(*q < NCHARS); */ 749 n = u8_rune(&rune, (const char *) q); 750 if ((ns = get_gototab(f, s, rune)) != 0) 751 s = ns; 752 else 753 s = cgoto(f, s, rune); 754 755 assert(s < f->state_count); 756 757 if (s == 1) { /* no transition */ 758 if (patlen >= 0) { 759 patbeg = (const char *) p; 760 return(1); 761 } 762 else 763 goto nextin; /* no match */ 764 } 765 if (*q == 0) 766 break; 767 q += n; 768 } while (1); 769 q++; /* was *q++ */ 770 if (f->out[s]) 771 patlen = q-p-1; /* don't count $ */ 772 if (patlen >= 0) { 773 patbeg = (const char *) p; 774 return(1); 775 } 776 nextin: 777 s = 2; 778 if (*p == 0) 779 break; 780 n = u8_rune(&rune, (const char *) p); 781 p += n; 782 } while (1); /* was *p++ */ 783 return (0); 784 } 785 786 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 787 { 788 int s, ns; 789 int n; 790 int rune; 791 const uschar *p = (const uschar *) p0; 792 const uschar *q; 793 794 s = f->initstat; 795 assert(s < f->state_count); 796 797 patbeg = (const char *)p; 798 patlen = -1; 799 while (*p) { 800 q = p; 801 do { 802 if (f->out[s]) /* final state */ 803 patlen = q-p; 804 /* assert(*q < NCHARS); */ 805 n = u8_rune(&rune, (const char *) q); 806 if ((ns = get_gototab(f, s, rune)) != 0) 807 s = ns; 808 else 809 s = cgoto(f, s, rune); 810 if (s == 1) { /* no transition */ 811 if (patlen > 0) { 812 patbeg = (const char *) p; 813 return(1); 814 } else 815 goto nnextin; /* no nonempty match */ 816 } 817 if (*q == 0) 818 break; 819 q += n; 820 } while (1); 821 q++; 822 if (f->out[s]) 823 patlen = q-p-1; /* don't count $ */ 824 if (patlen > 0 ) { 825 patbeg = (const char *) p; 826 return(1); 827 } 828 nnextin: 829 s = 2; 830 p++; 831 } 832 return (0); 833 } 834 835 836 /* 837 * NAME 838 * fnematch 839 * 840 * DESCRIPTION 841 * A stream-fed version of nematch which transfers characters to a 842 * null-terminated buffer. All characters up to and including the last 843 * character of the matching text or EOF are placed in the buffer. If 844 * a match is found, patbeg and patlen are set appropriately. 845 * 846 * RETURN VALUES 847 * false No match found. 848 * true Match found. 849 */ 850 851 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 852 { 853 char *i, *j, *k, *buf = *pbuf; 854 int bufsize = *pbufsize; 855 int c, n, ns, s; 856 857 s = pfa->initstat; 858 patlen = 0; 859 860 /* 861 * buf <= i <= j <= k <= buf+bufsize 862 * 863 * i: origin of active substring 864 * j: current character 865 * k: destination of the next getc 866 */ 867 868 i = j = k = buf; 869 870 do { 871 /* 872 * Call u8_rune with at least awk_mb_cur_max ahead in 873 * the buffer until EOF interferes. 874 */ 875 if (k - j < (int)awk_mb_cur_max) { 876 if (k + awk_mb_cur_max > buf + bufsize) { 877 char *obuf = buf; 878 adjbuf(&buf, &bufsize, 879 bufsize + awk_mb_cur_max, 880 quantum, 0, "fnematch"); 881 882 /* buf resized, maybe moved. update pointers */ 883 *pbufsize = bufsize; 884 if (obuf != buf) { 885 i = buf + (i - obuf); 886 j = buf + (j - obuf); 887 k = buf + (k - obuf); 888 *pbuf = buf; 889 if (patlen) 890 patbeg = buf + (patbeg - obuf); 891 } 892 } 893 for (n = awk_mb_cur_max ; n > 0; n--) { 894 *k++ = (c = getc(f)) != EOF ? c : 0; 895 if (c == EOF) { 896 if (ferror(f)) 897 FATAL("fnematch: getc error"); 898 break; 899 } 900 } 901 } 902 903 j += u8_rune(&c, j); 904 905 if ((ns = get_gototab(pfa, s, c)) != 0) 906 s = ns; 907 else 908 s = cgoto(pfa, s, c); 909 910 if (pfa->out[s]) { /* final state */ 911 patbeg = i; 912 patlen = j - i; 913 if (c == 0) /* don't count $ */ 914 patlen--; 915 } 916 917 if (c && s != 1) 918 continue; /* origin i still viable, next j */ 919 if (patlen) 920 break; /* best match found */ 921 922 /* no match at origin i, next i and start over */ 923 i += u8_rune(&c, i); 924 if (c == 0) 925 break; /* no match */ 926 j = i; 927 s = 2; 928 } while (1); 929 930 if (patlen) { 931 /* 932 * Under no circumstances is the last character fed to 933 * the automaton part of the match. It is EOF's nullbyte, 934 * or it sent the automaton into a state with no further 935 * transitions available (s==1), or both. Room for a 936 * terminating nullbyte is guaranteed. 937 * 938 * ungetc any chars after the end of matching text 939 * (except for EOF's nullbyte, if present) and null 940 * terminate the buffer. 941 */ 942 do 943 if (*--k && ungetc(*k, f) == EOF) 944 FATAL("unable to ungetc '%c'", *k); 945 while (k > patbeg + patlen); 946 *k = '\0'; 947 return true; 948 } 949 else 950 return false; 951 } 952 953 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 954 { /* uses relex() to scan regular expression */ 955 Node *np; 956 957 DPRINTF("reparse <%s>\n", p); 958 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 959 rtok = relex(); 960 /* GNU compatibility: an empty regexp matches anything */ 961 if (rtok == '\0') { 962 /* FATAL("empty regular expression"); previous */ 963 return(op2(EMPTYRE, NIL, NIL)); 964 } 965 np = regexp(); 966 if (rtok != '\0') 967 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 968 return(np); 969 } 970 971 Node *regexp(void) /* top-level parse of reg expr */ 972 { 973 return (alt(concat(primary()))); 974 } 975 976 Node *primary(void) 977 { 978 Node *np; 979 int savelastatom; 980 981 switch (rtok) { 982 case CHAR: 983 lastatom = starttok; 984 np = op2(CHAR, NIL, itonp(rlxval)); 985 rtok = relex(); 986 return (unary(np)); 987 case ALL: 988 rtok = relex(); 989 return (unary(op2(ALL, NIL, NIL))); 990 case EMPTYRE: 991 rtok = relex(); 992 return (unary(op2(EMPTYRE, NIL, NIL))); 993 case DOT: 994 lastatom = starttok; 995 rtok = relex(); 996 return (unary(op2(DOT, NIL, NIL))); 997 case CCL: 998 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 999 lastatom = starttok; 1000 rtok = relex(); 1001 return (unary(np)); 1002 case NCCL: 1003 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 1004 lastatom = starttok; 1005 rtok = relex(); 1006 return (unary(np)); 1007 case '^': 1008 rtok = relex(); 1009 return (unary(op2(CHAR, NIL, itonp(HAT)))); 1010 case '$': 1011 rtok = relex(); 1012 return (unary(op2(CHAR, NIL, NIL))); 1013 case '(': 1014 lastatom = starttok; 1015 savelastatom = starttok - basestr; /* Retain over recursion */ 1016 rtok = relex(); 1017 if (rtok == ')') { /* special pleading for () */ 1018 rtok = relex(); 1019 return unary(op2(CCL, NIL, (Node *) cclenter(""))); 1020 } 1021 np = regexp(); 1022 if (rtok == ')') { 1023 lastatom = basestr + savelastatom; /* Restore */ 1024 rtok = relex(); 1025 return (unary(np)); 1026 } 1027 else 1028 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 1029 default: 1030 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 1031 } 1032 return 0; /*NOTREACHED*/ 1033 } 1034 1035 Node *concat(Node *np) 1036 { 1037 switch (rtok) { 1038 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 1039 return (concat(op2(CAT, np, primary()))); 1040 case EMPTYRE: 1041 rtok = relex(); 1042 return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")), 1043 primary()))); 1044 } 1045 return (np); 1046 } 1047 1048 Node *alt(Node *np) 1049 { 1050 if (rtok == OR) { 1051 rtok = relex(); 1052 return (alt(op2(OR, np, concat(primary())))); 1053 } 1054 return (np); 1055 } 1056 1057 Node *unary(Node *np) 1058 { 1059 switch (rtok) { 1060 case STAR: 1061 rtok = relex(); 1062 return (unary(op2(STAR, np, NIL))); 1063 case PLUS: 1064 rtok = relex(); 1065 return (unary(op2(PLUS, np, NIL))); 1066 case QUEST: 1067 rtok = relex(); 1068 return (unary(op2(QUEST, np, NIL))); 1069 case ZERO: 1070 rtok = relex(); 1071 return (unary(op2(ZERO, np, NIL))); 1072 default: 1073 return (np); 1074 } 1075 } 1076 1077 /* 1078 * Character class definitions conformant to the POSIX locale as 1079 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 1080 * and operating character sets are both ASCII (ISO646) or supersets 1081 * thereof. 1082 * 1083 * Note that to avoid overflowing the temporary buffer used in 1084 * relex(), the expanded character class (prior to range expansion) 1085 * must be less than twice the size of their full name. 1086 */ 1087 1088 /* Because isblank doesn't show up in any of the header files on any 1089 * system i use, it's defined here. if some other locale has a richer 1090 * definition of "blank", define HAS_ISBLANK and provide your own 1091 * version. 1092 * the parentheses here are an attempt to find a path through the maze 1093 * of macro definition and/or function and/or version provided. thanks 1094 * to nelson beebe for the suggestion; let's see if it works everywhere. 1095 */ 1096 1097 #ifndef HAS_ISBLANK 1098 1099 int (xisblank)(int c) 1100 { 1101 return c==' ' || c=='\t'; 1102 } 1103 1104 #endif 1105 1106 static const struct charclass { 1107 const char *cc_name; 1108 int cc_namelen; 1109 int (*cc_func)(int); 1110 } charclasses[] = { 1111 { "alnum", 5, isalnum }, 1112 { "alpha", 5, isalpha }, 1113 #ifndef HAS_ISBLANK 1114 { "blank", 5, xisblank }, 1115 #else 1116 { "blank", 5, isblank }, 1117 #endif 1118 { "cntrl", 5, iscntrl }, 1119 { "digit", 5, isdigit }, 1120 { "graph", 5, isgraph }, 1121 { "lower", 5, islower }, 1122 { "print", 5, isprint }, 1123 { "punct", 5, ispunct }, 1124 { "space", 5, isspace }, 1125 { "upper", 5, isupper }, 1126 { "xdigit", 6, isxdigit }, 1127 { NULL, 0, NULL }, 1128 }; 1129 1130 #define REPEAT_SIMPLE 0 1131 #define REPEAT_PLUS_APPENDED 1 1132 #define REPEAT_WITH_Q 2 1133 #define REPEAT_ZERO 3 1134 1135 static int 1136 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1137 int atomlen, int firstnum, int secondnum, int special_case) 1138 { 1139 int i, j; 1140 uschar *buf = NULL; 1141 int ret = 1; 1142 int init_q = (firstnum == 0); /* first added char will be ? */ 1143 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 1144 int prefix_length = reptok - basestr; /* prefix includes first rep */ 1145 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 1146 int size = prefix_length + suffix_length; 1147 1148 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 1149 size += atomlen*(firstnum-1); 1150 } 1151 1152 /* Adjust size of buffer for special cases */ 1153 if (special_case == REPEAT_PLUS_APPENDED) { 1154 size++; /* for the final + */ 1155 } else if (special_case == REPEAT_WITH_Q) { 1156 size += init_q + (atomlen+1)* (n_q_reps-init_q); 1157 } else if (special_case == REPEAT_ZERO) { 1158 size += 2; /* just a null ERE: () */ 1159 } 1160 if ((buf = (uschar *) malloc(size + 1)) == NULL) 1161 FATAL("out of space in reg expr %.10s..", lastre); 1162 memcpy(buf, basestr, prefix_length); /* copy prefix */ 1163 j = prefix_length; 1164 if (special_case == REPEAT_ZERO) { 1165 j -= atomlen; 1166 buf[j++] = '('; 1167 buf[j++] = ')'; 1168 } 1169 for (i = 1; i < firstnum; i++) { /* copy x reps */ 1170 memcpy(&buf[j], atom, atomlen); 1171 j += atomlen; 1172 } 1173 if (special_case == REPEAT_PLUS_APPENDED) { 1174 buf[j++] = '+'; 1175 } else if (special_case == REPEAT_WITH_Q) { 1176 if (init_q) 1177 buf[j++] = '?'; 1178 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 1179 memcpy(&buf[j], atom, atomlen); 1180 j += atomlen; 1181 buf[j++] = '?'; 1182 } 1183 } 1184 memcpy(&buf[j], reptok+reptoklen, suffix_length); 1185 j += suffix_length; 1186 buf[j] = '\0'; 1187 /* free old basestr */ 1188 if (firstbasestr != basestr) { 1189 if (basestr) 1190 xfree(basestr); 1191 } 1192 basestr = buf; 1193 prestr = buf + prefix_length; 1194 if (special_case == REPEAT_ZERO) { 1195 prestr -= atomlen; 1196 ret++; 1197 } 1198 return ret; 1199 } 1200 1201 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1202 int atomlen, int firstnum, int secondnum) 1203 { 1204 /* 1205 In general, the repetition specifier or "bound" is replaced here 1206 by an equivalent ERE string, repeating the immediately previous atom 1207 and appending ? and + as needed. Note that the first copy of the 1208 atom is left in place, except in the special_case of a zero-repeat 1209 (i.e., {0}). 1210 */ 1211 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1212 if (firstnum < 2) { 1213 /* 0 or 1: should be handled before you get here */ 1214 FATAL("internal error"); 1215 } else { 1216 return replace_repeat(reptok, reptoklen, atom, atomlen, 1217 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1218 } 1219 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1220 if (firstnum == 0) { /* {0} or {0,0} */ 1221 /* This case is unusual because the resulting 1222 replacement string might actually be SMALLER than 1223 the original ERE */ 1224 return replace_repeat(reptok, reptoklen, atom, atomlen, 1225 firstnum, secondnum, REPEAT_ZERO); 1226 } else { /* (firstnum >= 1) */ 1227 return replace_repeat(reptok, reptoklen, atom, atomlen, 1228 firstnum, secondnum, REPEAT_SIMPLE); 1229 } 1230 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1231 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1232 return replace_repeat(reptok, reptoklen, atom, atomlen, 1233 firstnum, secondnum, REPEAT_WITH_Q); 1234 } else { /* Error - shouldn't be here (n>m) */ 1235 FATAL("internal error"); 1236 } 1237 return 0; 1238 } 1239 1240 int relex(void) /* lexical analyzer for reparse */ 1241 { 1242 int c, n; 1243 int cflag; 1244 static uschar *buf = NULL; 1245 static int bufsz = 100; 1246 uschar *bp; 1247 const struct charclass *cc; 1248 int i; 1249 int num, m; 1250 bool commafound, digitfound; 1251 const uschar *startreptok; 1252 static int parens = 0; 1253 1254 rescan: 1255 starttok = prestr; 1256 1257 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) { 1258 prestr += n; 1259 starttok = prestr; 1260 return CHAR; 1261 } 1262 1263 switch (c = *prestr++) { 1264 case '|': return OR; 1265 case '*': return STAR; 1266 case '+': return PLUS; 1267 case '?': return QUEST; 1268 case '.': return DOT; 1269 case '\0': prestr--; return '\0'; 1270 case '^': 1271 case '$': 1272 return c; 1273 case '(': 1274 parens++; 1275 return c; 1276 case ')': 1277 if (parens) { 1278 parens--; 1279 return c; 1280 } 1281 /* unmatched close parenthesis; per POSIX, treat as literal */ 1282 rlxval = c; 1283 return CHAR; 1284 case '\\': 1285 rlxval = quoted(&prestr); 1286 return CHAR; 1287 default: 1288 rlxval = c; 1289 return CHAR; 1290 case '[': 1291 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 1292 FATAL("out of space in reg expr %.10s..", lastre); 1293 bp = buf; 1294 if (*prestr == '^') { 1295 cflag = 1; 1296 prestr++; 1297 } 1298 else 1299 cflag = 0; 1300 n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */ 1301 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1302 FATAL("out of space for reg expr %.10s...", lastre); 1303 for (; ; ) { 1304 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) { 1305 for (i = 0; i < n; i++) 1306 *bp++ = *prestr++; 1307 continue; 1308 } 1309 if ((c = *prestr++) == '\\') { 1310 *bp++ = '\\'; 1311 if ((c = *prestr++) == '\0') 1312 FATAL("nonterminated character class %.20s...", lastre); 1313 *bp++ = c; 1314 /* } else if (c == '\n') { */ 1315 /* FATAL("newline in character class %.20s...", lastre); */ 1316 } else if (c == '[' && *prestr == ':') { 1317 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1318 for (cc = charclasses; cc->cc_name; cc++) 1319 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1320 break; 1321 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1322 prestr[2 + cc->cc_namelen] == ']') { 1323 prestr += cc->cc_namelen + 3; 1324 /* 1325 * BUG: We begin at 1, instead of 0, since we 1326 * would otherwise prematurely terminate the 1327 * string for classes like [[:cntrl:]]. This 1328 * means that we can't match the NUL character, 1329 * not without first adapting the entire 1330 * program to track each string's length. 1331 */ 1332 for (i = 1; i <= UCHAR_MAX; i++) { 1333 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2")) 1334 FATAL("out of space for reg expr %.10s...", lastre); 1335 if (cc->cc_func(i)) { 1336 /* escape backslash */ 1337 if (i == '\\') { 1338 *bp++ = '\\'; 1339 n++; 1340 } 1341 1342 *bp++ = i; 1343 n++; 1344 } 1345 } 1346 } else 1347 *bp++ = c; 1348 } else if (c == '[' && *prestr == '.') { 1349 char collate_char; 1350 prestr++; 1351 collate_char = *prestr++; 1352 if (*prestr == '.' && prestr[1] == ']') { 1353 prestr += 2; 1354 /* Found it: map via locale TBD: for 1355 now, simply return this char. This 1356 is sufficient to pass conformance 1357 test awk.ex 156 1358 */ 1359 if (*prestr == ']') { 1360 prestr++; 1361 rlxval = collate_char; 1362 return CHAR; 1363 } 1364 } 1365 } else if (c == '[' && *prestr == '=') { 1366 char equiv_char; 1367 prestr++; 1368 equiv_char = *prestr++; 1369 if (*prestr == '=' && prestr[1] == ']') { 1370 prestr += 2; 1371 /* Found it: map via locale TBD: for now 1372 simply return this char. This is 1373 sufficient to pass conformance test 1374 awk.ex 156 1375 */ 1376 if (*prestr == ']') { 1377 prestr++; 1378 rlxval = equiv_char; 1379 return CHAR; 1380 } 1381 } 1382 } else if (c == '\0') { 1383 FATAL("nonterminated character class %.20s", lastre); 1384 } else if (bp == buf) { /* 1st char is special */ 1385 *bp++ = c; 1386 } else if (c == ']') { 1387 *bp++ = 0; 1388 rlxstr = (uschar *) tostring((char *) buf); 1389 if (cflag == 0) 1390 return CCL; 1391 else 1392 return NCCL; 1393 } else 1394 *bp++ = c; 1395 } 1396 break; 1397 case '{': 1398 if (isdigit(*(prestr))) { 1399 num = 0; /* Process as a repetition */ 1400 n = -1; m = -1; 1401 commafound = false; 1402 digitfound = false; 1403 startreptok = prestr-1; 1404 /* Remember start of previous atom here ? */ 1405 } else { /* just a { char, not a repetition */ 1406 rlxval = c; 1407 return CHAR; 1408 } 1409 for (; ; ) { 1410 if ((c = *prestr++) == '}') { 1411 if (commafound) { 1412 if (digitfound) { /* {n,m} */ 1413 m = num; 1414 if (m < n) 1415 FATAL("illegal repetition expression: class %.20s", 1416 lastre); 1417 if (n == 0 && m == 1) { 1418 return QUEST; 1419 } 1420 } else { /* {n,} */ 1421 if (n == 0) 1422 return STAR; 1423 else if (n == 1) 1424 return PLUS; 1425 } 1426 } else { 1427 if (digitfound) { /* {n} same as {n,n} */ 1428 n = num; 1429 m = num; 1430 } else { /* {} */ 1431 FATAL("illegal repetition expression: class %.20s", 1432 lastre); 1433 } 1434 } 1435 if (repeat(starttok, prestr-starttok, lastatom, 1436 startreptok - lastatom, n, m) > 0) { 1437 if (n == 0 && m == 0) { 1438 return ZERO; 1439 } 1440 /* must rescan input for next token */ 1441 goto rescan; 1442 } 1443 /* Failed to replace: eat up {...} characters 1444 and treat like just PLUS */ 1445 return PLUS; 1446 } else if (c == '\0') { 1447 FATAL("nonterminated character class %.20s", 1448 lastre); 1449 } else if (isdigit(c)) { 1450 num = 10 * num + c - '0'; 1451 digitfound = true; 1452 } else if (c == ',') { 1453 if (commafound) 1454 FATAL("illegal repetition expression: class %.20s", 1455 lastre); 1456 /* looking for {n,} or {n,m} */ 1457 commafound = true; 1458 n = num; 1459 digitfound = false; /* reset */ 1460 num = 0; 1461 } else { 1462 FATAL("illegal repetition expression: class %.20s", 1463 lastre); 1464 } 1465 } 1466 break; 1467 } 1468 } 1469 1470 int cgoto(fa *f, int s, int c) 1471 { 1472 int *p, *q; 1473 int i, j, k; 1474 1475 /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */ 1476 while (f->accept >= maxsetvec) { /* guessing here! */ 1477 resizesetvec(__func__); 1478 } 1479 for (i = 0; i <= f->accept; i++) 1480 setvec[i] = 0; 1481 setcnt = 0; 1482 resize_state(f, s); 1483 /* compute positions of gototab[s,c] into setvec */ 1484 p = f->posns[s]; 1485 for (i = 1; i <= *p; i++) { 1486 if ((k = f->re[p[i]].ltype) != FINAL) { 1487 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1488 || (k == DOT && c != 0 && c != HAT) 1489 || (k == ALL && c != 0) 1490 || (k == EMPTYRE && c != 0) 1491 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp)) 1492 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) { 1493 q = f->re[p[i]].lfollow; 1494 for (j = 1; j <= *q; j++) { 1495 if (q[j] >= maxsetvec) { 1496 resizesetvec(__func__); 1497 } 1498 if (setvec[q[j]] == 0) { 1499 setcnt++; 1500 setvec[q[j]] = 1; 1501 } 1502 } 1503 } 1504 } 1505 } 1506 /* determine if setvec is a previous state */ 1507 tmpset[0] = setcnt; 1508 j = 1; 1509 for (i = f->accept; i >= 0; i--) 1510 if (setvec[i]) { 1511 tmpset[j++] = i; 1512 } 1513 resize_state(f, f->curstat > s ? f->curstat : s); 1514 /* tmpset == previous state? */ 1515 for (i = 1; i <= f->curstat; i++) { 1516 p = f->posns[i]; 1517 if ((k = tmpset[0]) != p[0]) 1518 goto different; 1519 for (j = 1; j <= k; j++) 1520 if (tmpset[j] != p[j]) 1521 goto different; 1522 /* setvec is state i */ 1523 if (c != HAT) 1524 set_gototab(f, s, c, i); 1525 return i; 1526 different:; 1527 } 1528 1529 /* add tmpset to current set of states */ 1530 ++(f->curstat); 1531 resize_state(f, f->curstat); 1532 clear_gototab(f, f->curstat); 1533 xfree(f->posns[f->curstat]); 1534 p = intalloc(setcnt + 1, __func__); 1535 1536 f->posns[f->curstat] = p; 1537 if (c != HAT) 1538 set_gototab(f, s, c, f->curstat); 1539 for (i = 0; i <= setcnt; i++) 1540 p[i] = tmpset[i]; 1541 if (setvec[f->accept]) 1542 f->out[f->curstat] = 1; 1543 else 1544 f->out[f->curstat] = 0; 1545 return f->curstat; 1546 } 1547 1548 1549 void freefa(fa *f) /* free a finite automaton */ 1550 { 1551 int i; 1552 1553 if (f == NULL) 1554 return; 1555 for (i = 0; i < f->state_count; i++) 1556 xfree(f->gototab[i].entries); 1557 xfree(f->gototab); 1558 for (i = 0; i <= f->curstat; i++) 1559 xfree(f->posns[i]); 1560 for (i = 0; i <= f->accept; i++) { 1561 xfree(f->re[i].lfollow); 1562 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1563 xfree(f->re[i].lval.np); 1564 } 1565 xfree(f->restr); 1566 xfree(f->out); 1567 xfree(f->posns); 1568 xfree(f->gototab); 1569 xfree(f); 1570 } 1571