1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 /* lasciate ogne speranza, voi ch'intrate. */
26
27 #define DEBUG
28
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "awkgram.tab.h"
36
37 #define MAXLIN 22
38
39 #define type(v) (v)->nobj /* badly overloaded here */
40 #define info(v) (v)->ntype /* badly overloaded here */
41 #define left(v) (v)->narg[0]
42 #define right(v) (v)->narg[1]
43 #define parent(v) (v)->nnext
44
45 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF case EMPTYRE: /* empty string in regexp */
47 #define UNARY case STAR: case PLUS: case QUEST:
48
49 /* encoding in tree Nodes:
50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 left is index, right contains value or pointer to value
52 unary (STAR, PLUS, QUEST): left is child, right is null
53 binary (CAT, OR): left and right are children
54 parent contains pointer to parent
55 */
56
57
58 int *setvec;
59 int *tmpset;
60 int maxsetvec = 0;
61
62 int rtok; /* next token in current re */
63 int rlxval;
64 static const uschar *rlxstr;
65 static const uschar *prestr; /* current position in current re */
66 static const uschar *lastre; /* origin of last re */
67 static const uschar *lastatom; /* origin of last Atom */
68 static const uschar *starttok;
69 static const uschar *basestr; /* starts with original, replaced during
70 repetition processing */
71 static const uschar *firstbasestr;
72
73 static int setcnt;
74 static int poscnt;
75
76 const char *patbeg;
77 int patlen;
78
79 #define NFA 128 /* cache this many dynamic fa's */
80 fa *fatab[NFA];
81 int nfatab = 0; /* entries in fatab */
82
83 extern int u8_nextlen(const char *s);
84
85
86 /* utf-8 mechanism:
87
88 For most of Awk, utf-8 strings just "work", since they look like
89 null-terminated sequences of 8-bit bytes.
90
91 Functions like length(), index(), and substr() have to operate
92 in units of utf-8 characters. The u8_* functions in run.c
93 handle this.
94
95 Regular expressions are more complicated, since the basic
96 mechanism of the goto table used 8-bit byte indices into the
97 gototab entries to compute the next state. Unicode is a lot
98 bigger, so the gototab entries are now structs with a character
99 and a next state. These are sorted by code point and binary
100 searched.
101
102 Throughout the RE mechanism in b.c, utf-8 characters are
103 converted to their utf-32 value. This mostly shows up in
104 cclenter, which expands character class ranges like a-z and now
105 alpha-omega. The size of a gototab array is still about 256.
106 This should be dynamic, but for now things work ok for a single
107 code page of Unicode, which is the most likely case.
108
109 The code changes are localized in run.c and b.c. I have added a
110 handful of functions to somewhat better hide the implementation,
111 but a lot more could be done.
112
113 */
114
115 static int entry_cmp(const void *l, const void *r);
116 static int get_gototab(fa*, int, int);
117 static int set_gototab(fa*, int, int, int);
118 static void clear_gototab(fa*, int);
119 extern int u8_rune(int *, const char *);
120
121 static int *
intalloc(size_t n,const char * f)122 intalloc(size_t n, const char *f)
123 {
124 int *p = (int *) calloc(n, sizeof(int));
125 if (p == NULL)
126 overflo(f);
127 return p;
128 }
129
130 static void
resizesetvec(const char * f)131 resizesetvec(const char *f)
132 {
133 if (maxsetvec == 0)
134 maxsetvec = MAXLIN;
135 else
136 maxsetvec *= 4;
137 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139 if (setvec == NULL || tmpset == NULL)
140 overflo(f);
141 }
142
143 static void
resize_state(fa * f,int state)144 resize_state(fa *f, int state)
145 {
146 gtt *p;
147 uschar *p2;
148 int **p3;
149 int i, new_count;
150
151 if (++state < f->state_count)
152 return;
153
154 new_count = state + 10; /* needs to be tuned */
155
156 p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157 if (p == NULL)
158 goto out;
159 f->gototab = p;
160
161 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162 if (p2 == NULL)
163 goto out;
164 f->out = p2;
165
166 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167 if (p3 == NULL)
168 goto out;
169 f->posns = p3;
170
171 for (i = f->state_count; i < new_count; ++i) {
172 f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173 if (f->gototab[i].entries == NULL)
174 goto out;
175 f->gototab[i].allocated = NCHARS;
176 f->gototab[i].inuse = 0;
177 f->out[i] = 0;
178 f->posns[i] = NULL;
179 }
180 f->state_count = new_count;
181 return;
182 out:
183 overflo(__func__);
184 }
185
makedfa(const char * s,bool anchor)186 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
187 {
188 int i, use, nuse;
189 fa *pfa;
190 static int now = 1;
191
192 if (setvec == NULL) { /* first time through any RE */
193 resizesetvec(__func__);
194 }
195
196 if (compile_time != RUNNING) /* a constant for sure */
197 return mkdfa(s, anchor);
198 for (i = 0; i < nfatab; i++) /* is it there already? */
199 if (fatab[i]->anchor == anchor
200 && strcmp((const char *) fatab[i]->restr, s) == 0) {
201 fatab[i]->use = now++;
202 return fatab[i];
203 }
204 pfa = mkdfa(s, anchor);
205 if (nfatab < NFA) { /* room for another */
206 fatab[nfatab] = pfa;
207 fatab[nfatab]->use = now++;
208 nfatab++;
209 return pfa;
210 }
211 use = fatab[0]->use; /* replace least-recently used */
212 nuse = 0;
213 for (i = 1; i < nfatab; i++)
214 if (fatab[i]->use < use) {
215 use = fatab[i]->use;
216 nuse = i;
217 }
218 freefa(fatab[nuse]);
219 fatab[nuse] = pfa;
220 pfa->use = now++;
221 return pfa;
222 }
223
mkdfa(const char * s,bool anchor)224 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
225 /* anchor = true for anchored matches, else false */
226 {
227 Node *p, *p1;
228 fa *f;
229
230 firstbasestr = (const uschar *) s;
231 basestr = firstbasestr;
232 p = reparse(s);
233 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
234 /* put ALL STAR in front of reg. exp. */
235 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
236 /* put FINAL after reg. exp. */
237
238 poscnt = 0;
239 penter(p1); /* enter parent pointers and leaf indices */
240 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241 overflo(__func__);
242 f->accept = poscnt-1; /* penter has computed number of positions in re */
243 cfoll(f, p1); /* set up follow sets */
244 freetr(p1);
245 resize_state(f, 1);
246 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247 f->posns[1] = intalloc(1, __func__);
248 *f->posns[1] = 0;
249 f->initstat = makeinit(f, anchor);
250 f->anchor = anchor;
251 f->restr = (uschar *) tostring(s);
252 if (firstbasestr != basestr) {
253 if (basestr)
254 xfree(basestr);
255 }
256 return f;
257 }
258
makeinit(fa * f,bool anchor)259 int makeinit(fa *f, bool anchor)
260 {
261 int i, k;
262
263 f->curstat = 2;
264 f->out[2] = 0;
265 k = *(f->re[0].lfollow);
266 xfree(f->posns[2]);
267 f->posns[2] = intalloc(k + 1, __func__);
268 for (i = 0; i <= k; i++) {
269 (f->posns[2])[i] = (f->re[0].lfollow)[i];
270 }
271 if ((f->posns[2])[1] == f->accept)
272 f->out[2] = 1;
273 clear_gototab(f, 2);
274 f->curstat = cgoto(f, 2, HAT);
275 if (anchor) {
276 *f->posns[2] = k-1; /* leave out position 0 */
277 for (i = 0; i < k; i++) {
278 (f->posns[0])[i] = (f->posns[2])[i];
279 }
280
281 f->out[0] = f->out[2];
282 if (f->curstat != 2)
283 --(*f->posns[f->curstat]);
284 }
285 return f->curstat;
286 }
287
penter(Node * p)288 void penter(Node *p) /* set up parent pointers and leaf indices */
289 {
290 switch (type(p)) {
291 ELEAF
292 LEAF
293 info(p) = poscnt;
294 poscnt++;
295 break;
296 UNARY
297 penter(left(p));
298 parent(left(p)) = p;
299 break;
300 case CAT:
301 case OR:
302 penter(left(p));
303 penter(right(p));
304 parent(left(p)) = p;
305 parent(right(p)) = p;
306 break;
307 case ZERO:
308 break;
309 default: /* can't happen */
310 FATAL("can't happen: unknown type %d in penter", type(p));
311 break;
312 }
313 }
314
freetr(Node * p)315 void freetr(Node *p) /* free parse tree */
316 {
317 switch (type(p)) {
318 ELEAF
319 LEAF
320 xfree(p);
321 break;
322 UNARY
323 case ZERO:
324 freetr(left(p));
325 xfree(p);
326 break;
327 case CAT:
328 case OR:
329 freetr(left(p));
330 freetr(right(p));
331 xfree(p);
332 break;
333 default: /* can't happen */
334 FATAL("can't happen: unknown type %d in freetr", type(p));
335 break;
336 }
337 }
338
339 /* in the parsing of regular expressions, metacharacters like . have */
340 /* to be seen literally; \056 is not a metacharacter. */
341
hexstr(const uschar ** pp,int max)342 int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
343 { /* only pick up one 8-bit byte (2 chars) */
344 const uschar *p;
345 int n = 0;
346 int i;
347
348 for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349 if (isdigit((int) *p))
350 n = 16 * n + *p - '0';
351 else if (*p >= 'a' && *p <= 'f')
352 n = 16 * n + *p - 'a' + 10;
353 else if (*p >= 'A' && *p <= 'F')
354 n = 16 * n + *p - 'A' + 10;
355 }
356 *pp = p;
357 return n;
358 }
359
360
361
362 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
363
quoted(const uschar ** pp)364 int quoted(const uschar **pp) /* pick up next thing after a \\ */
365 /* and increment *pp */
366 {
367 const uschar *p = *pp;
368 int c;
369
370 /* BUG: should advance by utf-8 char even if makes no sense */
371
372 if ((c = *p++) == 't') {
373 c = '\t';
374 } else if (c == 'n') {
375 c = '\n';
376 } else if (c == 'f') {
377 c = '\f';
378 } else if (c == 'r') {
379 c = '\r';
380 } else if (c == 'b') {
381 c = '\b';
382 } else if (c == 'v') {
383 c = '\v';
384 } else if (c == 'a') {
385 c = '\a';
386 } else if (c == '\\') {
387 c = '\\';
388 } else if (c == 'x') { /* 2 hex digits follow */
389 c = hexstr(&p, 2); /* this adds a null if number is invalid */
390 } else if (c == 'u') { /* unicode char number up to 8 hex digits */
391 c = hexstr(&p, 8);
392 } else if (isoctdigit(c)) { /* \d \dd \ddd */
393 int n = c - '0';
394 if (isoctdigit(*p)) {
395 n = 8 * n + *p++ - '0';
396 if (isoctdigit(*p))
397 n = 8 * n + *p++ - '0';
398 }
399 c = n;
400 } /* else */
401 /* c = c; */
402 *pp = p;
403 return c;
404 }
405
cclenter(const char * argp)406 int *cclenter(const char *argp) /* add a character class */
407 {
408 int i, c, c2;
409 int n;
410 const uschar *p = (const uschar *) argp;
411 int *bp, *retp;
412 static int *buf = NULL;
413 static int bufsz = 100;
414
415 if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
416 FATAL("out of space for character class [%.10s...] 1", p);
417 bp = buf;
418 for (i = 0; *p != 0; ) {
419 n = u8_rune(&c, (const char *) p);
420 p += n;
421 if (c == '\\') {
422 c = quoted(&p);
423 } else if (c == '-' && i > 0 && bp[-1] != 0) {
424 if (*p != 0) {
425 c = bp[-1];
426 /* c2 = *p++; */
427 n = u8_rune(&c2, (const char *) p);
428 p += n;
429 if (c2 == '\\')
430 c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
431 if (c > c2) { /* empty; ignore */
432 bp--;
433 i--;
434 continue;
435 }
436 while (c < c2) {
437 if (i >= bufsz) {
438 bufsz *= 2;
439 buf = (int *) realloc(buf, bufsz * sizeof(int));
440 if (buf == NULL)
441 FATAL("out of space for character class [%.10s...] 2", p);
442 bp = buf + i;
443 }
444 *bp++ = ++c;
445 i++;
446 }
447 continue;
448 }
449 }
450 if (i >= bufsz) {
451 bufsz *= 2;
452 buf = (int *) realloc(buf, bufsz * sizeof(int));
453 if (buf == NULL)
454 FATAL("out of space for character class [%.10s...] 2", p);
455 bp = buf + i;
456 }
457 *bp++ = c;
458 i++;
459 }
460 *bp = 0;
461 /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
462 /* xfree(op); BUG: what are we freeing here? */
463 retp = (int *) calloc(bp-buf+1, sizeof(int));
464 for (i = 0; i < bp-buf+1; i++)
465 retp[i] = buf[i];
466 return retp;
467 }
468
overflo(const char * s)469 void overflo(const char *s)
470 {
471 FATAL("regular expression too big: out of space in %.30s...", s);
472 }
473
cfoll(fa * f,Node * v)474 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
475 {
476 int i;
477 int *p;
478
479 switch (type(v)) {
480 ELEAF
481 LEAF
482 f->re[info(v)].ltype = type(v);
483 f->re[info(v)].lval.np = right(v);
484 while (f->accept >= maxsetvec) { /* guessing here! */
485 resizesetvec(__func__);
486 }
487 for (i = 0; i <= f->accept; i++)
488 setvec[i] = 0;
489 setcnt = 0;
490 follow(v); /* computes setvec and setcnt */
491 p = intalloc(setcnt + 1, __func__);
492 f->re[info(v)].lfollow = p;
493 *p = setcnt;
494 for (i = f->accept; i >= 0; i--)
495 if (setvec[i] == 1)
496 *++p = i;
497 break;
498 UNARY
499 cfoll(f,left(v));
500 break;
501 case CAT:
502 case OR:
503 cfoll(f,left(v));
504 cfoll(f,right(v));
505 break;
506 case ZERO:
507 break;
508 default: /* can't happen */
509 FATAL("can't happen: unknown type %d in cfoll", type(v));
510 }
511 }
512
first(Node * p)513 int first(Node *p) /* collects initially active leaves of p into setvec */
514 /* returns 0 if p matches empty string */
515 {
516 int b, lp;
517
518 switch (type(p)) {
519 ELEAF
520 LEAF
521 lp = info(p); /* look for high-water mark of subscripts */
522 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
523 resizesetvec(__func__);
524 }
525 if (type(p) == EMPTYRE) {
526 setvec[lp] = 0;
527 return(0);
528 }
529 if (setvec[lp] != 1) {
530 setvec[lp] = 1;
531 setcnt++;
532 }
533 if (type(p) == CCL && (*(int *) right(p)) == 0)
534 return(0); /* empty CCL */
535 return(1);
536 case PLUS:
537 if (first(left(p)) == 0)
538 return(0);
539 return(1);
540 case STAR:
541 case QUEST:
542 first(left(p));
543 return(0);
544 case CAT:
545 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
546 return(1);
547 case OR:
548 b = first(right(p));
549 if (first(left(p)) == 0 || b == 0) return(0);
550 return(1);
551 case ZERO:
552 return 0;
553 }
554 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
555 return(-1);
556 }
557
follow(Node * v)558 void follow(Node *v) /* collects leaves that can follow v into setvec */
559 {
560 Node *p;
561
562 if (type(v) == FINAL)
563 return;
564 p = parent(v);
565 switch (type(p)) {
566 case STAR:
567 case PLUS:
568 first(v);
569 follow(p);
570 return;
571
572 case OR:
573 case QUEST:
574 follow(p);
575 return;
576
577 case CAT:
578 if (v == left(p)) { /* v is left child of p */
579 if (first(right(p)) == 0) {
580 follow(p);
581 return;
582 }
583 } else /* v is right child */
584 follow(p);
585 return;
586 }
587 }
588
member(int c,int * sarg)589 int member(int c, int *sarg) /* is c in s? */
590 {
591 int *s = (int *) sarg;
592
593 while (*s)
594 if (c == *s++)
595 return(1);
596 return(0);
597 }
598
resize_gototab(fa * f,int state)599 static void resize_gototab(fa *f, int state)
600 {
601 size_t new_size = f->gototab[state].allocated * 2;
602 gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
603 if (p == NULL)
604 overflo(__func__);
605
606 // need to initialized the new memory to zero
607 size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
608 memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
609
610 f->gototab[state].allocated = new_size; // update gototab info
611 f->gototab[state].entries = p;
612 }
613
get_gototab(fa * f,int state,int ch)614 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
615 {
616 gtte key;
617 gtte *item;
618
619 key.ch = ch;
620 key.state = 0; /* irrelevant */
621 item = (gtte *) bsearch(& key, f->gototab[state].entries,
622 f->gototab[state].inuse, sizeof(gtte),
623 entry_cmp);
624
625 if (item == NULL)
626 return 0;
627 else
628 return item->state;
629 }
630
entry_cmp(const void * l,const void * r)631 static int entry_cmp(const void *l, const void *r)
632 {
633 const gtte *left, *right;
634
635 left = (const gtte *) l;
636 right = (const gtte *) r;
637
638 return left->ch - right->ch;
639 }
640
set_gototab(fa * f,int state,int ch,int val)641 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
642 {
643 if (f->gototab[state].inuse == 0) {
644 f->gototab[state].entries[0].ch = ch;
645 f->gototab[state].entries[0].state = val;
646 f->gototab[state].inuse++;
647 return val;
648 } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
649 // not seen yet, insert and return
650 gtt *tab = & f->gototab[state];
651 if (tab->inuse + 1 >= tab->allocated)
652 resize_gototab(f, state);
653
654 f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch;
655 f->gototab[state].entries[f->gototab[state].inuse-1].state = val;
656 f->gototab[state].inuse++;
657 return val;
658 } else {
659 // maybe we have it, maybe we don't
660 gtte key;
661 gtte *item;
662
663 key.ch = ch;
664 key.state = 0; /* irrelevant */
665 item = (gtte *) bsearch(& key, f->gototab[state].entries,
666 f->gototab[state].inuse, sizeof(gtte),
667 entry_cmp);
668
669 if (item != NULL) {
670 // we have it, update state and return
671 item->state = val;
672 return item->state;
673 }
674 // otherwise, fall through to insert and reallocate.
675 }
676
677 gtt *tab = & f->gototab[state];
678 if (tab->inuse + 1 >= tab->allocated)
679 resize_gototab(f, state);
680 ++tab->inuse;
681 f->gototab[state].entries[tab->inuse].ch = ch;
682 f->gototab[state].entries[tab->inuse].state = val;
683
684 qsort(f->gototab[state].entries,
685 f->gototab[state].inuse, sizeof(gtte), entry_cmp);
686
687 return val; /* not used anywhere at the moment */
688 }
689
clear_gototab(fa * f,int state)690 static void clear_gototab(fa *f, int state)
691 {
692 memset(f->gototab[state].entries, 0,
693 f->gototab[state].allocated * sizeof(gtte));
694 f->gototab[state].inuse = 0;
695 }
696
match(fa * f,const char * p0)697 int match(fa *f, const char *p0) /* shortest match ? */
698 {
699 int s, ns;
700 int n;
701 int rune;
702 const uschar *p = (const uschar *) p0;
703
704 /* return pmatch(f, p0); does it matter whether longest or shortest? */
705
706 s = f->initstat;
707 assert (s < f->state_count);
708
709 if (f->out[s])
710 return(1);
711 do {
712 /* assert(*p < NCHARS); */
713 n = u8_rune(&rune, (const char *) p);
714 if ((ns = get_gototab(f, s, rune)) != 0)
715 s = ns;
716 else
717 s = cgoto(f, s, rune);
718 if (f->out[s])
719 return(1);
720 if (*p == 0)
721 break;
722 p += n;
723 } while (1); /* was *p++ != 0 */
724 return(0);
725 }
726
pmatch(fa * f,const char * p0)727 int pmatch(fa *f, const char *p0) /* longest match, for sub */
728 {
729 int s, ns;
730 int n;
731 int rune;
732 const uschar *p = (const uschar *) p0;
733 const uschar *q;
734
735 s = f->initstat;
736 assert(s < f->state_count);
737
738 patbeg = (const char *)p;
739 patlen = -1;
740 do {
741 q = p;
742 do {
743 if (f->out[s]) /* final state */
744 patlen = q-p;
745 /* assert(*q < NCHARS); */
746 n = u8_rune(&rune, (const char *) q);
747 if ((ns = get_gototab(f, s, rune)) != 0)
748 s = ns;
749 else
750 s = cgoto(f, s, rune);
751
752 assert(s < f->state_count);
753
754 if (s == 1) { /* no transition */
755 if (patlen >= 0) {
756 patbeg = (const char *) p;
757 return(1);
758 }
759 else
760 goto nextin; /* no match */
761 }
762 if (*q == 0)
763 break;
764 q += n;
765 } while (1);
766 q++; /* was *q++ */
767 if (f->out[s])
768 patlen = q-p-1; /* don't count $ */
769 if (patlen >= 0) {
770 patbeg = (const char *) p;
771 return(1);
772 }
773 nextin:
774 s = 2;
775 if (*p == 0)
776 break;
777 n = u8_rune(&rune, (const char *) p);
778 p += n;
779 } while (1); /* was *p++ */
780 return (0);
781 }
782
nematch(fa * f,const char * p0)783 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
784 {
785 int s, ns;
786 int n;
787 int rune;
788 const uschar *p = (const uschar *) p0;
789 const uschar *q;
790
791 s = f->initstat;
792 assert(s < f->state_count);
793
794 patbeg = (const char *)p;
795 patlen = -1;
796 while (*p) {
797 q = p;
798 do {
799 if (f->out[s]) /* final state */
800 patlen = q-p;
801 /* assert(*q < NCHARS); */
802 n = u8_rune(&rune, (const char *) q);
803 if ((ns = get_gototab(f, s, rune)) != 0)
804 s = ns;
805 else
806 s = cgoto(f, s, rune);
807 if (s == 1) { /* no transition */
808 if (patlen > 0) {
809 patbeg = (const char *) p;
810 return(1);
811 } else
812 goto nnextin; /* no nonempty match */
813 }
814 if (*q == 0)
815 break;
816 q += n;
817 } while (1);
818 q++;
819 if (f->out[s])
820 patlen = q-p-1; /* don't count $ */
821 if (patlen > 0 ) {
822 patbeg = (const char *) p;
823 return(1);
824 }
825 nnextin:
826 s = 2;
827 p++;
828 }
829 return (0);
830 }
831
832
833 #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
834
835 /*
836 * NAME
837 * fnematch
838 *
839 * DESCRIPTION
840 * A stream-fed version of nematch which transfers characters to a
841 * null-terminated buffer. All characters up to and including the last
842 * character of the matching text or EOF are placed in the buffer. If
843 * a match is found, patbeg and patlen are set appropriately.
844 *
845 * RETURN VALUES
846 * false No match found.
847 * true Match found.
848 */
849
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)850 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
851 {
852 char *i, *j, *k, *buf = *pbuf;
853 int bufsize = *pbufsize;
854 int c, n, ns, s;
855
856 s = pfa->initstat;
857 patlen = 0;
858
859 /*
860 * buf <= i <= j <= k <= buf+bufsize
861 *
862 * i: origin of active substring
863 * j: current character
864 * k: destination of the next getc
865 */
866
867 i = j = k = buf;
868
869 do {
870 /*
871 * Call u8_rune with at least MAX_UTF_BYTES ahead in
872 * the buffer until EOF interferes.
873 */
874 if (k - j < MAX_UTF_BYTES) {
875 if (k + MAX_UTF_BYTES > buf + bufsize) {
876 adjbuf((char **) &buf, &bufsize,
877 bufsize + MAX_UTF_BYTES,
878 quantum, 0, "fnematch");
879 }
880 for (n = MAX_UTF_BYTES ; n > 0; n--) {
881 *k++ = (c = getc(f)) != EOF ? c : 0;
882 if (c == EOF) {
883 if (ferror(f))
884 FATAL("fnematch: getc error");
885 break;
886 }
887 }
888 }
889
890 j += u8_rune(&c, j);
891
892 if ((ns = get_gototab(pfa, s, c)) != 0)
893 s = ns;
894 else
895 s = cgoto(pfa, s, c);
896
897 if (pfa->out[s]) { /* final state */
898 patbeg = i;
899 patlen = j - i;
900 if (c == 0) /* don't count $ */
901 patlen--;
902 }
903
904 if (c && s != 1)
905 continue; /* origin i still viable, next j */
906 if (patlen)
907 break; /* best match found */
908
909 /* no match at origin i, next i and start over */
910 i += u8_rune(&c, i);
911 if (c == 0)
912 break; /* no match */
913 j = i;
914 s = 2;
915 } while (1);
916
917 /* adjbuf() may have relocated a resized buffer. Inform the world. */
918 *pbuf = buf;
919 *pbufsize = bufsize;
920
921 if (patlen) {
922 /*
923 * Under no circumstances is the last character fed to
924 * the automaton part of the match. It is EOF's nullbyte,
925 * or it sent the automaton into a state with no further
926 * transitions available (s==1), or both. Room for a
927 * terminating nullbyte is guaranteed.
928 *
929 * ungetc any chars after the end of matching text
930 * (except for EOF's nullbyte, if present) and null
931 * terminate the buffer.
932 */
933 do
934 if (*--k && ungetc(*k, f) == EOF)
935 FATAL("unable to ungetc '%c'", *k);
936 while (k > patbeg + patlen);
937 *k = '\0';
938 return true;
939 }
940 else
941 return false;
942 }
943
reparse(const char * p)944 Node *reparse(const char *p) /* parses regular expression pointed to by p */
945 { /* uses relex() to scan regular expression */
946 Node *np;
947
948 DPRINTF("reparse <%s>\n", p);
949 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
950 rtok = relex();
951 /* GNU compatibility: an empty regexp matches anything */
952 if (rtok == '\0') {
953 /* FATAL("empty regular expression"); previous */
954 return(op2(EMPTYRE, NIL, NIL));
955 }
956 np = regexp();
957 if (rtok != '\0')
958 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
959 return(np);
960 }
961
regexp(void)962 Node *regexp(void) /* top-level parse of reg expr */
963 {
964 return (alt(concat(primary())));
965 }
966
primary(void)967 Node *primary(void)
968 {
969 Node *np;
970 int savelastatom;
971
972 switch (rtok) {
973 case CHAR:
974 lastatom = starttok;
975 np = op2(CHAR, NIL, itonp(rlxval));
976 rtok = relex();
977 return (unary(np));
978 case ALL:
979 rtok = relex();
980 return (unary(op2(ALL, NIL, NIL)));
981 case EMPTYRE:
982 rtok = relex();
983 return (unary(op2(EMPTYRE, NIL, NIL)));
984 case DOT:
985 lastatom = starttok;
986 rtok = relex();
987 return (unary(op2(DOT, NIL, NIL)));
988 case CCL:
989 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
990 lastatom = starttok;
991 rtok = relex();
992 return (unary(np));
993 case NCCL:
994 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
995 lastatom = starttok;
996 rtok = relex();
997 return (unary(np));
998 case '^':
999 rtok = relex();
1000 return (unary(op2(CHAR, NIL, itonp(HAT))));
1001 case '$':
1002 rtok = relex();
1003 return (unary(op2(CHAR, NIL, NIL)));
1004 case '(':
1005 lastatom = starttok;
1006 savelastatom = starttok - basestr; /* Retain over recursion */
1007 rtok = relex();
1008 if (rtok == ')') { /* special pleading for () */
1009 rtok = relex();
1010 return unary(op2(CCL, NIL, (Node *) cclenter("")));
1011 }
1012 np = regexp();
1013 if (rtok == ')') {
1014 lastatom = basestr + savelastatom; /* Restore */
1015 rtok = relex();
1016 return (unary(np));
1017 }
1018 else
1019 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1020 default:
1021 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1022 }
1023 return 0; /*NOTREACHED*/
1024 }
1025
concat(Node * np)1026 Node *concat(Node *np)
1027 {
1028 switch (rtok) {
1029 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1030 return (concat(op2(CAT, np, primary())));
1031 case EMPTYRE:
1032 rtok = relex();
1033 return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1034 primary())));
1035 }
1036 return (np);
1037 }
1038
alt(Node * np)1039 Node *alt(Node *np)
1040 {
1041 if (rtok == OR) {
1042 rtok = relex();
1043 return (alt(op2(OR, np, concat(primary()))));
1044 }
1045 return (np);
1046 }
1047
unary(Node * np)1048 Node *unary(Node *np)
1049 {
1050 switch (rtok) {
1051 case STAR:
1052 rtok = relex();
1053 return (unary(op2(STAR, np, NIL)));
1054 case PLUS:
1055 rtok = relex();
1056 return (unary(op2(PLUS, np, NIL)));
1057 case QUEST:
1058 rtok = relex();
1059 return (unary(op2(QUEST, np, NIL)));
1060 case ZERO:
1061 rtok = relex();
1062 return (unary(op2(ZERO, np, NIL)));
1063 default:
1064 return (np);
1065 }
1066 }
1067
1068 /*
1069 * Character class definitions conformant to the POSIX locale as
1070 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1071 * and operating character sets are both ASCII (ISO646) or supersets
1072 * thereof.
1073 *
1074 * Note that to avoid overflowing the temporary buffer used in
1075 * relex(), the expanded character class (prior to range expansion)
1076 * must be less than twice the size of their full name.
1077 */
1078
1079 /* Because isblank doesn't show up in any of the header files on any
1080 * system i use, it's defined here. if some other locale has a richer
1081 * definition of "blank", define HAS_ISBLANK and provide your own
1082 * version.
1083 * the parentheses here are an attempt to find a path through the maze
1084 * of macro definition and/or function and/or version provided. thanks
1085 * to nelson beebe for the suggestion; let's see if it works everywhere.
1086 */
1087
1088 /* #define HAS_ISBLANK */
1089 #ifndef HAS_ISBLANK
1090
1091 int (xisblank)(int c)
1092 {
1093 return c==' ' || c=='\t';
1094 }
1095
1096 #endif
1097
1098 static const struct charclass {
1099 const char *cc_name;
1100 int cc_namelen;
1101 int (*cc_func)(int);
1102 } charclasses[] = {
1103 { "alnum", 5, isalnum },
1104 { "alpha", 5, isalpha },
1105 #ifndef HAS_ISBLANK
1106 { "blank", 5, xisblank },
1107 #else
1108 { "blank", 5, isblank },
1109 #endif
1110 { "cntrl", 5, iscntrl },
1111 { "digit", 5, isdigit },
1112 { "graph", 5, isgraph },
1113 { "lower", 5, islower },
1114 { "print", 5, isprint },
1115 { "punct", 5, ispunct },
1116 { "space", 5, isspace },
1117 { "upper", 5, isupper },
1118 { "xdigit", 6, isxdigit },
1119 { NULL, 0, NULL },
1120 };
1121
1122 #define REPEAT_SIMPLE 0
1123 #define REPEAT_PLUS_APPENDED 1
1124 #define REPEAT_WITH_Q 2
1125 #define REPEAT_ZERO 3
1126
1127 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)1128 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1129 int atomlen, int firstnum, int secondnum, int special_case)
1130 {
1131 int i, j;
1132 uschar *buf = 0;
1133 int ret = 1;
1134 int init_q = (firstnum == 0); /* first added char will be ? */
1135 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1136 int prefix_length = reptok - basestr; /* prefix includes first rep */
1137 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1138 int size = prefix_length + suffix_length;
1139
1140 if (firstnum > 1) { /* add room for reps 2 through firstnum */
1141 size += atomlen*(firstnum-1);
1142 }
1143
1144 /* Adjust size of buffer for special cases */
1145 if (special_case == REPEAT_PLUS_APPENDED) {
1146 size++; /* for the final + */
1147 } else if (special_case == REPEAT_WITH_Q) {
1148 size += init_q + (atomlen+1)* (n_q_reps-init_q);
1149 } else if (special_case == REPEAT_ZERO) {
1150 size += 2; /* just a null ERE: () */
1151 }
1152 if ((buf = (uschar *) malloc(size + 1)) == NULL)
1153 FATAL("out of space in reg expr %.10s..", lastre);
1154 memcpy(buf, basestr, prefix_length); /* copy prefix */
1155 j = prefix_length;
1156 if (special_case == REPEAT_ZERO) {
1157 j -= atomlen;
1158 buf[j++] = '(';
1159 buf[j++] = ')';
1160 }
1161 for (i = 1; i < firstnum; i++) { /* copy x reps */
1162 memcpy(&buf[j], atom, atomlen);
1163 j += atomlen;
1164 }
1165 if (special_case == REPEAT_PLUS_APPENDED) {
1166 buf[j++] = '+';
1167 } else if (special_case == REPEAT_WITH_Q) {
1168 if (init_q)
1169 buf[j++] = '?';
1170 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1171 memcpy(&buf[j], atom, atomlen);
1172 j += atomlen;
1173 buf[j++] = '?';
1174 }
1175 }
1176 memcpy(&buf[j], reptok+reptoklen, suffix_length);
1177 j += suffix_length;
1178 buf[j] = '\0';
1179 /* free old basestr */
1180 if (firstbasestr != basestr) {
1181 if (basestr)
1182 xfree(basestr);
1183 }
1184 basestr = buf;
1185 prestr = buf + prefix_length;
1186 if (special_case == REPEAT_ZERO) {
1187 prestr -= atomlen;
1188 ret++;
1189 }
1190 return ret;
1191 }
1192
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)1193 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1194 int atomlen, int firstnum, int secondnum)
1195 {
1196 /*
1197 In general, the repetition specifier or "bound" is replaced here
1198 by an equivalent ERE string, repeating the immediately previous atom
1199 and appending ? and + as needed. Note that the first copy of the
1200 atom is left in place, except in the special_case of a zero-repeat
1201 (i.e., {0}).
1202 */
1203 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1204 if (firstnum < 2) {
1205 /* 0 or 1: should be handled before you get here */
1206 FATAL("internal error");
1207 } else {
1208 return replace_repeat(reptok, reptoklen, atom, atomlen,
1209 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1210 }
1211 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1212 if (firstnum == 0) { /* {0} or {0,0} */
1213 /* This case is unusual because the resulting
1214 replacement string might actually be SMALLER than
1215 the original ERE */
1216 return replace_repeat(reptok, reptoklen, atom, atomlen,
1217 firstnum, secondnum, REPEAT_ZERO);
1218 } else { /* (firstnum >= 1) */
1219 return replace_repeat(reptok, reptoklen, atom, atomlen,
1220 firstnum, secondnum, REPEAT_SIMPLE);
1221 }
1222 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1223 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1224 return replace_repeat(reptok, reptoklen, atom, atomlen,
1225 firstnum, secondnum, REPEAT_WITH_Q);
1226 } else { /* Error - shouldn't be here (n>m) */
1227 FATAL("internal error");
1228 }
1229 return 0;
1230 }
1231
relex(void)1232 int relex(void) /* lexical analyzer for reparse */
1233 {
1234 int c, n;
1235 int cflag;
1236 static uschar *buf = NULL;
1237 static int bufsz = 100;
1238 uschar *bp;
1239 const struct charclass *cc;
1240 int i;
1241 int num, m;
1242 bool commafound, digitfound;
1243 const uschar *startreptok;
1244 static int parens = 0;
1245
1246 rescan:
1247 starttok = prestr;
1248
1249 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1250 prestr += n;
1251 starttok = prestr;
1252 return CHAR;
1253 }
1254
1255 switch (c = *prestr++) {
1256 case '|': return OR;
1257 case '*': return STAR;
1258 case '+': return PLUS;
1259 case '?': return QUEST;
1260 case '.': return DOT;
1261 case '\0': prestr--; return '\0';
1262 case '^':
1263 case '$':
1264 return c;
1265 case '(':
1266 parens++;
1267 return c;
1268 case ')':
1269 if (parens) {
1270 parens--;
1271 return c;
1272 }
1273 /* unmatched close parenthesis; per POSIX, treat as literal */
1274 rlxval = c;
1275 return CHAR;
1276 case '\\':
1277 rlxval = quoted(&prestr);
1278 return CHAR;
1279 default:
1280 rlxval = c;
1281 return CHAR;
1282 case '[':
1283 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1284 FATAL("out of space in reg expr %.10s..", lastre);
1285 bp = buf;
1286 if (*prestr == '^') {
1287 cflag = 1;
1288 prestr++;
1289 }
1290 else
1291 cflag = 0;
1292 n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1293 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1294 FATAL("out of space for reg expr %.10s...", lastre);
1295 for (; ; ) {
1296 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1297 for (i = 0; i < n; i++)
1298 *bp++ = *prestr++;
1299 continue;
1300 }
1301 if ((c = *prestr++) == '\\') {
1302 *bp++ = '\\';
1303 if ((c = *prestr++) == '\0')
1304 FATAL("nonterminated character class %.20s...", lastre);
1305 *bp++ = c;
1306 /* } else if (c == '\n') { */
1307 /* FATAL("newline in character class %.20s...", lastre); */
1308 } else if (c == '[' && *prestr == ':') {
1309 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1310 for (cc = charclasses; cc->cc_name; cc++)
1311 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1312 break;
1313 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1314 prestr[2 + cc->cc_namelen] == ']') {
1315 prestr += cc->cc_namelen + 3;
1316 /*
1317 * BUG: We begin at 1, instead of 0, since we
1318 * would otherwise prematurely terminate the
1319 * string for classes like [[:cntrl:]]. This
1320 * means that we can't match the NUL character,
1321 * not without first adapting the entire
1322 * program to track each string's length.
1323 */
1324 for (i = 1; i <= UCHAR_MAX; i++) {
1325 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1326 FATAL("out of space for reg expr %.10s...", lastre);
1327 if (cc->cc_func(i)) {
1328 /* escape backslash */
1329 if (i == '\\') {
1330 *bp++ = '\\';
1331 n++;
1332 }
1333
1334 *bp++ = i;
1335 n++;
1336 }
1337 }
1338 } else
1339 *bp++ = c;
1340 } else if (c == '[' && *prestr == '.') {
1341 char collate_char;
1342 prestr++;
1343 collate_char = *prestr++;
1344 if (*prestr == '.' && prestr[1] == ']') {
1345 prestr += 2;
1346 /* Found it: map via locale TBD: for
1347 now, simply return this char. This
1348 is sufficient to pass conformance
1349 test awk.ex 156
1350 */
1351 if (*prestr == ']') {
1352 prestr++;
1353 rlxval = collate_char;
1354 return CHAR;
1355 }
1356 }
1357 } else if (c == '[' && *prestr == '=') {
1358 char equiv_char;
1359 prestr++;
1360 equiv_char = *prestr++;
1361 if (*prestr == '=' && prestr[1] == ']') {
1362 prestr += 2;
1363 /* Found it: map via locale TBD: for now
1364 simply return this char. This is
1365 sufficient to pass conformance test
1366 awk.ex 156
1367 */
1368 if (*prestr == ']') {
1369 prestr++;
1370 rlxval = equiv_char;
1371 return CHAR;
1372 }
1373 }
1374 } else if (c == '\0') {
1375 FATAL("nonterminated character class %.20s", lastre);
1376 } else if (bp == buf) { /* 1st char is special */
1377 *bp++ = c;
1378 } else if (c == ']') {
1379 *bp++ = 0;
1380 rlxstr = (uschar *) tostring((char *) buf);
1381 if (cflag == 0)
1382 return CCL;
1383 else
1384 return NCCL;
1385 } else
1386 *bp++ = c;
1387 }
1388 break;
1389 case '{':
1390 if (isdigit((int) *(prestr))) {
1391 num = 0; /* Process as a repetition */
1392 n = -1; m = -1;
1393 commafound = false;
1394 digitfound = false;
1395 startreptok = prestr-1;
1396 /* Remember start of previous atom here ? */
1397 } else { /* just a { char, not a repetition */
1398 rlxval = c;
1399 return CHAR;
1400 }
1401 for (; ; ) {
1402 if ((c = *prestr++) == '}') {
1403 if (commafound) {
1404 if (digitfound) { /* {n,m} */
1405 m = num;
1406 if (m < n)
1407 FATAL("illegal repetition expression: class %.20s",
1408 lastre);
1409 if (n == 0 && m == 1) {
1410 return QUEST;
1411 }
1412 } else { /* {n,} */
1413 if (n == 0)
1414 return STAR;
1415 else if (n == 1)
1416 return PLUS;
1417 }
1418 } else {
1419 if (digitfound) { /* {n} same as {n,n} */
1420 n = num;
1421 m = num;
1422 } else { /* {} */
1423 FATAL("illegal repetition expression: class %.20s",
1424 lastre);
1425 }
1426 }
1427 if (repeat(starttok, prestr-starttok, lastatom,
1428 startreptok - lastatom, n, m) > 0) {
1429 if (n == 0 && m == 0) {
1430 return ZERO;
1431 }
1432 /* must rescan input for next token */
1433 goto rescan;
1434 }
1435 /* Failed to replace: eat up {...} characters
1436 and treat like just PLUS */
1437 return PLUS;
1438 } else if (c == '\0') {
1439 FATAL("nonterminated character class %.20s",
1440 lastre);
1441 } else if (isdigit(c)) {
1442 num = 10 * num + c - '0';
1443 digitfound = true;
1444 } else if (c == ',') {
1445 if (commafound)
1446 FATAL("illegal repetition expression: class %.20s",
1447 lastre);
1448 /* looking for {n,} or {n,m} */
1449 commafound = true;
1450 n = num;
1451 digitfound = false; /* reset */
1452 num = 0;
1453 } else {
1454 FATAL("illegal repetition expression: class %.20s",
1455 lastre);
1456 }
1457 }
1458 break;
1459 }
1460 }
1461
cgoto(fa * f,int s,int c)1462 int cgoto(fa *f, int s, int c)
1463 {
1464 int *p, *q;
1465 int i, j, k;
1466
1467 /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1468 while (f->accept >= maxsetvec) { /* guessing here! */
1469 resizesetvec(__func__);
1470 }
1471 for (i = 0; i <= f->accept; i++)
1472 setvec[i] = 0;
1473 setcnt = 0;
1474 resize_state(f, s);
1475 /* compute positions of gototab[s,c] into setvec */
1476 p = f->posns[s];
1477 for (i = 1; i <= *p; i++) {
1478 if ((k = f->re[p[i]].ltype) != FINAL) {
1479 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1480 || (k == DOT && c != 0 && c != HAT)
1481 || (k == ALL && c != 0)
1482 || (k == EMPTYRE && c != 0)
1483 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1484 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1485 q = f->re[p[i]].lfollow;
1486 for (j = 1; j <= *q; j++) {
1487 if (q[j] >= maxsetvec) {
1488 resizesetvec(__func__);
1489 }
1490 if (setvec[q[j]] == 0) {
1491 setcnt++;
1492 setvec[q[j]] = 1;
1493 }
1494 }
1495 }
1496 }
1497 }
1498 /* determine if setvec is a previous state */
1499 tmpset[0] = setcnt;
1500 j = 1;
1501 for (i = f->accept; i >= 0; i--)
1502 if (setvec[i]) {
1503 tmpset[j++] = i;
1504 }
1505 resize_state(f, f->curstat > s ? f->curstat : s);
1506 /* tmpset == previous state? */
1507 for (i = 1; i <= f->curstat; i++) {
1508 p = f->posns[i];
1509 if ((k = tmpset[0]) != p[0])
1510 goto different;
1511 for (j = 1; j <= k; j++)
1512 if (tmpset[j] != p[j])
1513 goto different;
1514 /* setvec is state i */
1515 if (c != HAT)
1516 set_gototab(f, s, c, i);
1517 return i;
1518 different:;
1519 }
1520
1521 /* add tmpset to current set of states */
1522 ++(f->curstat);
1523 resize_state(f, f->curstat);
1524 clear_gototab(f, f->curstat);
1525 xfree(f->posns[f->curstat]);
1526 p = intalloc(setcnt + 1, __func__);
1527
1528 f->posns[f->curstat] = p;
1529 if (c != HAT)
1530 set_gototab(f, s, c, f->curstat);
1531 for (i = 0; i <= setcnt; i++)
1532 p[i] = tmpset[i];
1533 if (setvec[f->accept])
1534 f->out[f->curstat] = 1;
1535 else
1536 f->out[f->curstat] = 0;
1537 return f->curstat;
1538 }
1539
1540
freefa(fa * f)1541 void freefa(fa *f) /* free a finite automaton */
1542 {
1543 int i;
1544
1545 if (f == NULL)
1546 return;
1547 for (i = 0; i < f->state_count; i++)
1548 xfree(f->gototab[i].entries);
1549 xfree(f->gototab);
1550 for (i = 0; i <= f->curstat; i++)
1551 xfree(f->posns[i]);
1552 for (i = 0; i <= f->accept; i++) {
1553 xfree(f->re[i].lfollow);
1554 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1555 xfree(f->re[i].lval.np);
1556 }
1557 xfree(f->restr);
1558 xfree(f->out);
1559 xfree(f->posns);
1560 xfree(f->gototab);
1561 xfree(f);
1562 }
1563