1 /* $OpenBSD: b.c,v 1.54 2024/08/03 21:12:16 millert Exp $ */
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25
26 /* lasciate ogne speranza, voi ch'intrate. */
27
28 #define DEBUG
29
30 #include <ctype.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <string.h>
34 #include <stdlib.h>
35 #include "awk.h"
36 #include "awkgram.tab.h"
37
38 #define MAXLIN 22
39
40 #define type(v) (v)->nobj /* badly overloaded here */
41 #define info(v) (v)->ntype /* badly overloaded here */
42 #define left(v) (v)->narg[0]
43 #define right(v) (v)->narg[1]
44 #define parent(v) (v)->nnext
45
46 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF case EMPTYRE: /* empty string in regexp */
48 #define UNARY case STAR: case PLUS: case QUEST:
49
50 /* encoding in tree Nodes:
51 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 left is index, right contains value or pointer to value
53 unary (STAR, PLUS, QUEST): left is child, right is null
54 binary (CAT, OR): left and right are children
55 parent contains pointer to parent
56 */
57
58
59 int *setvec;
60 int *tmpset;
61 int maxsetvec = 0;
62
63 int rtok; /* next token in current re */
64 int rlxval;
65 static const uschar *rlxstr;
66 static const uschar *prestr; /* current position in current re */
67 static const uschar *lastre; /* origin of last re */
68 static const uschar *lastatom; /* origin of last Atom */
69 static const uschar *starttok;
70 static const uschar *basestr; /* starts with original, replaced during
71 repetition processing */
72 static const uschar *firstbasestr;
73
74 static int setcnt;
75 static int poscnt;
76
77 const char *patbeg;
78 int patlen;
79
80 #define NFA 128 /* cache this many dynamic fa's */
81 fa *fatab[NFA];
82 int nfatab = 0; /* entries in fatab */
83
84 /* utf-8 mechanism:
85
86 For most of Awk, utf-8 strings just "work", since they look like
87 null-terminated sequences of 8-bit bytes.
88
89 Functions like length(), index(), and substr() have to operate
90 in units of utf-8 characters. The u8_* functions in run.c
91 handle this.
92
93 Regular expressions are more complicated, since the basic
94 mechanism of the goto table used 8-bit byte indices into the
95 gototab entries to compute the next state. Unicode is a lot
96 bigger, so the gototab entries are now structs with a character
97 and a next state. These are sorted by code point and binary
98 searched.
99
100 Throughout the RE mechanism in b.c, utf-8 characters are
101 converted to their utf-32 value. This mostly shows up in
102 cclenter, which expands character class ranges like a-z and now
103 alpha-omega. The size of a gototab array is still about 256.
104 This should be dynamic, but for now things work ok for a single
105 code page of Unicode, which is the most likely case.
106
107 The code changes are localized in run.c and b.c. I have added a
108 handful of functions to somewhat better hide the implementation,
109 but a lot more could be done.
110
111 */
112
113 static int entry_cmp(const void *l, const void *r);
114 static int get_gototab(fa*, int, int);
115 static int set_gototab(fa*, int, int, int);
116 static void clear_gototab(fa*, int);
117
118 static int *
intalloc(size_t n,const char * f)119 intalloc(size_t n, const char *f)
120 {
121 int *p = (int *) calloc(n, sizeof(int));
122 if (p == NULL)
123 overflo(f);
124 return p;
125 }
126
127 static void
allocsetvec(const char * f)128 allocsetvec(const char *f)
129 {
130 maxsetvec = MAXLIN;
131 setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
132 tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
133 if (setvec == NULL || tmpset == NULL)
134 overflo(f);
135 }
136
137 static void
resizesetvec(const char * f)138 resizesetvec(const char *f)
139 {
140 setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
141 tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
142 if (setvec == NULL || tmpset == NULL)
143 overflo(f);
144 maxsetvec *= 4;
145 }
146
147 static void
resize_state(fa * f,int state)148 resize_state(fa *f, int state)
149 {
150 gtt *p;
151 uschar *p2;
152 int **p3;
153 int i, new_count;
154
155 if (++state < f->state_count)
156 return;
157
158 new_count = state + 10; /* needs to be tuned */
159
160 p = (gtt *) reallocarray(f->gototab, new_count, sizeof(gtt));
161 if (p == NULL)
162 goto out;
163 f->gototab = p;
164
165 p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
166 if (p2 == NULL)
167 goto out;
168 f->out = p2;
169
170 p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
171 if (p3 == NULL)
172 goto out;
173 f->posns = p3;
174
175 for (i = f->state_count; i < new_count; ++i) {
176 f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
177 if (f->gototab[i].entries == NULL)
178 goto out;
179 f->gototab[i].allocated = NCHARS;
180 f->gototab[i].inuse = 0;
181 f->out[i] = 0;
182 f->posns[i] = NULL;
183 }
184 f->state_count = new_count;
185 return;
186 out:
187 overflo(__func__);
188 }
189
makedfa(const char * s,bool anchor)190 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
191 {
192 int i, use, nuse;
193 fa *pfa;
194 static int now = 1;
195
196 if (setvec == NULL) { /* first time through any RE */
197 allocsetvec(__func__);
198 }
199
200 if (compile_time != RUNNING) /* a constant for sure */
201 return mkdfa(s, anchor);
202 for (i = 0; i < nfatab; i++) /* is it there already? */
203 if (fatab[i]->anchor == anchor
204 && strcmp((const char *) fatab[i]->restr, s) == 0) {
205 fatab[i]->use = now++;
206 return fatab[i];
207 }
208 pfa = mkdfa(s, anchor);
209 if (nfatab < NFA) { /* room for another */
210 fatab[nfatab] = pfa;
211 fatab[nfatab]->use = now++;
212 nfatab++;
213 return pfa;
214 }
215 use = fatab[0]->use; /* replace least-recently used */
216 nuse = 0;
217 for (i = 1; i < nfatab; i++)
218 if (fatab[i]->use < use) {
219 use = fatab[i]->use;
220 nuse = i;
221 }
222 freefa(fatab[nuse]);
223 fatab[nuse] = pfa;
224 pfa->use = now++;
225 return pfa;
226 }
227
mkdfa(const char * s,bool anchor)228 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
229 /* anchor = true for anchored matches, else false */
230 {
231 Node *p, *p1;
232 fa *f;
233
234 firstbasestr = (const uschar *) s;
235 basestr = firstbasestr;
236 p = reparse(s);
237 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
238 /* put ALL STAR in front of reg. exp. */
239 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
240 /* put FINAL after reg. exp. */
241
242 poscnt = 0;
243 penter(p1); /* enter parent pointers and leaf indices */
244 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
245 overflo(__func__);
246 f->accept = poscnt-1; /* penter has computed number of positions in re */
247 cfoll(f, p1); /* set up follow sets */
248 freetr(p1);
249 resize_state(f, 1);
250 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
251 f->posns[1] = intalloc(1, __func__);
252 *f->posns[1] = 0;
253 f->initstat = makeinit(f, anchor);
254 f->anchor = anchor;
255 f->restr = (uschar *) tostring(s);
256 if (firstbasestr != basestr) {
257 if (basestr)
258 xfree(basestr);
259 }
260 return f;
261 }
262
makeinit(fa * f,bool anchor)263 int makeinit(fa *f, bool anchor)
264 {
265 int i, k;
266
267 f->curstat = 2;
268 f->out[2] = 0;
269 k = *(f->re[0].lfollow);
270 xfree(f->posns[2]);
271 f->posns[2] = intalloc(k + 1, __func__);
272 for (i = 0; i <= k; i++) {
273 (f->posns[2])[i] = (f->re[0].lfollow)[i];
274 }
275 if ((f->posns[2])[1] == f->accept)
276 f->out[2] = 1;
277 clear_gototab(f, 2);
278 f->curstat = cgoto(f, 2, HAT);
279 if (anchor) {
280 *f->posns[2] = k-1; /* leave out position 0 */
281 for (i = 0; i < k; i++) {
282 (f->posns[0])[i] = (f->posns[2])[i];
283 }
284
285 f->out[0] = f->out[2];
286 if (f->curstat != 2)
287 --(*f->posns[f->curstat]);
288 }
289 return f->curstat;
290 }
291
penter(Node * p)292 void penter(Node *p) /* set up parent pointers and leaf indices */
293 {
294 switch (type(p)) {
295 ELEAF
296 LEAF
297 info(p) = poscnt;
298 poscnt++;
299 break;
300 UNARY
301 penter(left(p));
302 parent(left(p)) = p;
303 break;
304 case CAT:
305 case OR:
306 penter(left(p));
307 penter(right(p));
308 parent(left(p)) = p;
309 parent(right(p)) = p;
310 break;
311 case ZERO:
312 break;
313 default: /* can't happen */
314 FATAL("can't happen: unknown type %d in penter", type(p));
315 break;
316 }
317 }
318
freetr(Node * p)319 void freetr(Node *p) /* free parse tree */
320 {
321 switch (type(p)) {
322 ELEAF
323 LEAF
324 xfree(p);
325 break;
326 UNARY
327 case ZERO:
328 freetr(left(p));
329 xfree(p);
330 break;
331 case CAT:
332 case OR:
333 freetr(left(p));
334 freetr(right(p));
335 xfree(p);
336 break;
337 default: /* can't happen */
338 FATAL("can't happen: unknown type %d in freetr", type(p));
339 break;
340 }
341 }
342
343 /* in the parsing of regular expressions, metacharacters like . have */
344 /* to be seen literally; \056 is not a metacharacter. */
345
346 static int
hexstr(const uschar ** pp,int max)347 hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
348 { /* only pick up one 8-bit byte (2 chars) */
349 const uschar *p;
350 int n = 0;
351 int i;
352
353 for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
354 if (isdigit(*p))
355 n = 16 * n + *p - '0';
356 else if (*p >= 'a' && *p <= 'f')
357 n = 16 * n + *p - 'a' + 10;
358 else if (*p >= 'A' && *p <= 'F')
359 n = 16 * n + *p - 'A' + 10;
360 }
361 *pp = p;
362 return n;
363 }
364
365 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
366
quoted(const uschar ** pp)367 int quoted(const uschar **pp) /* pick up next thing after a \\ */
368 /* and increment *pp */
369 {
370 const uschar *p = *pp;
371 int c;
372
373 /* BUG: should advance by utf-8 char even if makes no sense */
374
375 switch ((c = *p++)) {
376 case 't':
377 c = '\t';
378 break;
379 case 'n':
380 c = '\n';
381 break;
382 case 'f':
383 c = '\f';
384 break;
385 case 'r':
386 c = '\r';
387 break;
388 case 'b':
389 c = '\b';
390 break;
391 case 'v':
392 c = '\v';
393 break;
394 case 'a':
395 c = '\a';
396 break;
397 case '\\':
398 c = '\\';
399 break;
400 case 'x': /* 2 hex digits follow */
401 c = hexstr(&p, 2); /* this adds a null if number is invalid */
402 break;
403 case 'u': /* unicode char number up to 8 hex digits */
404 c = hexstr(&p, 8);
405 break;
406 default:
407 if (isoctdigit(c)) { /* \d \dd \ddd */
408 int n = c - '0';
409 if (isoctdigit(*p)) {
410 n = 8 * n + *p++ - '0';
411 if (isoctdigit(*p))
412 n = 8 * n + *p++ - '0';
413 }
414 c = n;
415 }
416 }
417
418 *pp = p;
419 return c;
420 }
421
cclenter(const char * argp)422 int *cclenter(const char *argp) /* add a character class */
423 {
424 int i, c, c2;
425 int n;
426 const uschar *p = (const uschar *) argp;
427 int *bp, *retp;
428 static int *buf = NULL;
429 static int bufsz = 100;
430
431 if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
432 FATAL("out of space for character class [%.10s...] 1", p);
433 bp = buf;
434 for (i = 0; *p != 0; ) {
435 n = u8_rune(&c, (const char *) p);
436 p += n;
437 if (c == '\\') {
438 c = quoted(&p);
439 } else if (c == '-' && i > 0 && bp[-1] != 0) {
440 if (*p != 0) {
441 c = bp[-1];
442 /* c2 = *p++; */
443 n = u8_rune(&c2, (const char *) p);
444 p += n;
445 if (c2 == '\\')
446 c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
447 if (c > c2) { /* empty; ignore */
448 bp--;
449 i--;
450 continue;
451 }
452 while (c < c2) {
453 if (i >= bufsz) {
454 buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
455 if (buf == NULL)
456 FATAL("out of space for character class [%.10s...] 2", p);
457 bufsz *= 2;
458 bp = buf + i;
459 }
460 *bp++ = ++c;
461 i++;
462 }
463 continue;
464 }
465 }
466 if (i >= bufsz) {
467 buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
468 if (buf == NULL)
469 FATAL("out of space for character class [%.10s...] 2", p);
470 bufsz *= 2;
471 bp = buf + i;
472 }
473 *bp++ = c;
474 i++;
475 }
476 *bp = 0;
477 /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
478 /* xfree(op); BUG: what are we freeing here? */
479 retp = (int *) calloc(bp-buf+1, sizeof(int));
480 for (i = 0; i < bp-buf+1; i++)
481 retp[i] = buf[i];
482 return retp;
483 }
484
overflo(const char * s)485 void overflo(const char *s)
486 {
487 FATAL("regular expression too big: out of space in %.30s...", s);
488 }
489
cfoll(fa * f,Node * v)490 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
491 {
492 int i;
493 int *p;
494
495 switch (type(v)) {
496 ELEAF
497 LEAF
498 f->re[info(v)].ltype = type(v);
499 f->re[info(v)].lval.np = right(v);
500 while (f->accept >= maxsetvec) { /* guessing here! */
501 resizesetvec(__func__);
502 }
503 for (i = 0; i <= f->accept; i++)
504 setvec[i] = 0;
505 setcnt = 0;
506 follow(v); /* computes setvec and setcnt */
507 p = intalloc(setcnt + 1, __func__);
508 f->re[info(v)].lfollow = p;
509 *p = setcnt;
510 for (i = f->accept; i >= 0; i--)
511 if (setvec[i] == 1)
512 *++p = i;
513 break;
514 UNARY
515 cfoll(f,left(v));
516 break;
517 case CAT:
518 case OR:
519 cfoll(f,left(v));
520 cfoll(f,right(v));
521 break;
522 case ZERO:
523 break;
524 default: /* can't happen */
525 FATAL("can't happen: unknown type %d in cfoll", type(v));
526 }
527 }
528
first(Node * p)529 int first(Node *p) /* collects initially active leaves of p into setvec */
530 /* returns 0 if p matches empty string */
531 {
532 int b, lp;
533
534 switch (type(p)) {
535 ELEAF
536 LEAF
537 lp = info(p); /* look for high-water mark of subscripts */
538 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
539 resizesetvec(__func__);
540 }
541 if (type(p) == EMPTYRE) {
542 setvec[lp] = 0;
543 return(0);
544 }
545 if (setvec[lp] != 1) {
546 setvec[lp] = 1;
547 setcnt++;
548 }
549 if (type(p) == CCL && (*(int *) right(p)) == 0)
550 return(0); /* empty CCL */
551 return(1);
552 case PLUS:
553 if (first(left(p)) == 0)
554 return(0);
555 return(1);
556 case STAR:
557 case QUEST:
558 first(left(p));
559 return(0);
560 case CAT:
561 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
562 return(1);
563 case OR:
564 b = first(right(p));
565 if (first(left(p)) == 0 || b == 0) return(0);
566 return(1);
567 case ZERO:
568 return 0;
569 }
570 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
571 return(-1);
572 }
573
follow(Node * v)574 void follow(Node *v) /* collects leaves that can follow v into setvec */
575 {
576 Node *p;
577
578 if (type(v) == FINAL)
579 return;
580 p = parent(v);
581 switch (type(p)) {
582 case STAR:
583 case PLUS:
584 first(v);
585 follow(p);
586 return;
587
588 case OR:
589 case QUEST:
590 follow(p);
591 return;
592
593 case CAT:
594 if (v == left(p)) { /* v is left child of p */
595 if (first(right(p)) == 0) {
596 follow(p);
597 return;
598 }
599 } else /* v is right child */
600 follow(p);
601 return;
602 }
603 }
604
member(int c,int * sarg)605 int member(int c, int *sarg) /* is c in s? */
606 {
607 int *s = (int *) sarg;
608
609 while (*s)
610 if (c == *s++)
611 return(1);
612 return(0);
613 }
614
resize_gototab(fa * f,int state)615 static void resize_gototab(fa *f, int state)
616 {
617 size_t new_size = f->gototab[state].allocated * 2;
618 gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
619 if (p == NULL)
620 overflo(__func__);
621
622 // need to initialized the new memory to zero
623 size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
624 memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
625
626 f->gototab[state].allocated = new_size; // update gototab info
627 f->gototab[state].entries = p;
628 }
629
get_gototab(fa * f,int state,int ch)630 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
631 {
632 gtte key;
633 gtte *item;
634
635 key.ch = ch;
636 key.state = 0; /* irrelevant */
637 item = (gtte *) bsearch(& key, f->gototab[state].entries,
638 f->gototab[state].inuse, sizeof(gtte),
639 entry_cmp);
640
641 if (item == NULL)
642 return 0;
643 else
644 return item->state;
645 }
646
entry_cmp(const void * l,const void * r)647 static int entry_cmp(const void *l, const void *r)
648 {
649 const gtte *left, *right;
650
651 left = (const gtte *) l;
652 right = (const gtte *) r;
653
654 return left->ch - right->ch;
655 }
656
set_gototab(fa * f,int state,int ch,int val)657 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
658 {
659 if (f->gototab[state].inuse == 0) {
660 f->gototab[state].entries[0].ch = ch;
661 f->gototab[state].entries[0].state = val;
662 f->gototab[state].inuse++;
663 return val;
664 } else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
665 // not seen yet, insert and return
666 gtt *tab = & f->gototab[state];
667 if (tab->inuse + 1 >= tab->allocated)
668 resize_gototab(f, state);
669
670 f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
671 f->gototab[state].entries[f->gototab[state].inuse].state = val;
672 f->gototab[state].inuse++;
673 return val;
674 } else {
675 // maybe we have it, maybe we don't
676 gtte key;
677 gtte *item;
678
679 key.ch = ch;
680 key.state = 0; /* irrelevant */
681 item = (gtte *) bsearch(& key, f->gototab[state].entries,
682 f->gototab[state].inuse, sizeof(gtte),
683 entry_cmp);
684
685 if (item != NULL) {
686 // we have it, update state and return
687 item->state = val;
688 return item->state;
689 }
690 // otherwise, fall through to insert and reallocate.
691 }
692
693 gtt *tab = & f->gototab[state];
694 if (tab->inuse + 1 >= tab->allocated)
695 resize_gototab(f, state);
696 f->gototab[state].entries[tab->inuse].ch = ch;
697 f->gototab[state].entries[tab->inuse].state = val;
698 ++tab->inuse;
699
700 qsort(f->gototab[state].entries,
701 f->gototab[state].inuse, sizeof(gtte), entry_cmp);
702
703 return val; /* not used anywhere at the moment */
704 }
705
clear_gototab(fa * f,int state)706 static void clear_gototab(fa *f, int state)
707 {
708 memset(f->gototab[state].entries, 0,
709 f->gototab[state].allocated * sizeof(gtte));
710 f->gototab[state].inuse = 0;
711 }
712
match(fa * f,const char * p0)713 int match(fa *f, const char *p0) /* shortest match ? */
714 {
715 int s, ns;
716 int n;
717 int rune;
718 const uschar *p = (const uschar *) p0;
719
720 /* return pmatch(f, p0); does it matter whether longest or shortest? */
721
722 s = f->initstat;
723 assert (s < f->state_count);
724
725 if (f->out[s])
726 return(1);
727 do {
728 /* assert(*p < NCHARS); */
729 n = u8_rune(&rune, (const char *) p);
730 if ((ns = get_gototab(f, s, rune)) != 0)
731 s = ns;
732 else
733 s = cgoto(f, s, rune);
734 if (f->out[s])
735 return(1);
736 if (*p == 0)
737 break;
738 p += n;
739 } while (1); /* was *p++ != 0 */
740 return(0);
741 }
742
pmatch(fa * f,const char * p0)743 int pmatch(fa *f, const char *p0) /* longest match, for sub */
744 {
745 int s, ns;
746 int n;
747 int rune;
748 const uschar *p = (const uschar *) p0;
749 const uschar *q;
750
751 s = f->initstat;
752 assert(s < f->state_count);
753
754 patbeg = (const char *)p;
755 patlen = -1;
756 do {
757 q = p;
758 do {
759 if (f->out[s]) /* final state */
760 patlen = q-p;
761 /* assert(*q < NCHARS); */
762 n = u8_rune(&rune, (const char *) q);
763 if ((ns = get_gototab(f, s, rune)) != 0)
764 s = ns;
765 else
766 s = cgoto(f, s, rune);
767
768 assert(s < f->state_count);
769
770 if (s == 1) { /* no transition */
771 if (patlen >= 0) {
772 patbeg = (const char *) p;
773 return(1);
774 }
775 else
776 goto nextin; /* no match */
777 }
778 if (*q == 0)
779 break;
780 q += n;
781 } while (1);
782 q++; /* was *q++ */
783 if (f->out[s])
784 patlen = q-p-1; /* don't count $ */
785 if (patlen >= 0) {
786 patbeg = (const char *) p;
787 return(1);
788 }
789 nextin:
790 s = 2;
791 if (*p == 0)
792 break;
793 n = u8_rune(&rune, (const char *) p);
794 p += n;
795 } while (1); /* was *p++ */
796 return (0);
797 }
798
nematch(fa * f,const char * p0)799 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
800 {
801 int s, ns;
802 int n;
803 int rune;
804 const uschar *p = (const uschar *) p0;
805 const uschar *q;
806
807 s = f->initstat;
808 assert(s < f->state_count);
809
810 patbeg = (const char *)p;
811 patlen = -1;
812 while (*p) {
813 q = p;
814 do {
815 if (f->out[s]) /* final state */
816 patlen = q-p;
817 /* assert(*q < NCHARS); */
818 n = u8_rune(&rune, (const char *) q);
819 if ((ns = get_gototab(f, s, rune)) != 0)
820 s = ns;
821 else
822 s = cgoto(f, s, rune);
823 if (s == 1) { /* no transition */
824 if (patlen > 0) {
825 patbeg = (const char *) p;
826 return(1);
827 } else
828 goto nnextin; /* no nonempty match */
829 }
830 if (*q == 0)
831 break;
832 q += n;
833 } while (1);
834 q++;
835 if (f->out[s])
836 patlen = q-p-1; /* don't count $ */
837 if (patlen > 0 ) {
838 patbeg = (const char *) p;
839 return(1);
840 }
841 nnextin:
842 s = 2;
843 p++;
844 }
845 return (0);
846 }
847
848
849 /*
850 * NAME
851 * fnematch
852 *
853 * DESCRIPTION
854 * A stream-fed version of nematch which transfers characters to a
855 * null-terminated buffer. All characters up to and including the last
856 * character of the matching text or EOF are placed in the buffer. If
857 * a match is found, patbeg and patlen are set appropriately.
858 *
859 * RETURN VALUES
860 * false No match found.
861 * true Match found.
862 */
863
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)864 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
865 {
866 char *i, *j, *k, *buf = *pbuf;
867 int bufsize = *pbufsize;
868 int c, n, ns, s;
869
870 s = pfa->initstat;
871 patlen = 0;
872
873 /*
874 * buf <= i <= j <= k <= buf+bufsize
875 *
876 * i: origin of active substring
877 * j: current character
878 * k: destination of the next getc
879 */
880
881 i = j = k = buf;
882
883 do {
884 /*
885 * Call u8_rune with at least awk_mb_cur_max ahead in
886 * the buffer until EOF interferes.
887 */
888 if (k - j < (int)awk_mb_cur_max) {
889 if (k + awk_mb_cur_max > buf + bufsize) {
890 char *obuf = buf;
891 adjbuf(&buf, &bufsize,
892 bufsize + awk_mb_cur_max,
893 quantum, 0, "fnematch");
894
895 /* buf resized, maybe moved. update pointers */
896 *pbufsize = bufsize;
897 if (obuf != buf) {
898 i = buf + (i - obuf);
899 j = buf + (j - obuf);
900 k = buf + (k - obuf);
901 *pbuf = buf;
902 if (patlen)
903 patbeg = buf + (patbeg - obuf);
904 }
905 }
906 for (n = awk_mb_cur_max ; n > 0; n--) {
907 *k++ = (c = getc(f)) != EOF ? c : 0;
908 if (c == EOF) {
909 if (ferror(f))
910 FATAL("fnematch: getc error");
911 break;
912 }
913 }
914 }
915
916 j += u8_rune(&c, j);
917
918 if ((ns = get_gototab(pfa, s, c)) != 0)
919 s = ns;
920 else
921 s = cgoto(pfa, s, c);
922
923 if (pfa->out[s]) { /* final state */
924 patbeg = i;
925 patlen = j - i;
926 if (c == 0) /* don't count $ */
927 patlen--;
928 }
929
930 if (c && s != 1)
931 continue; /* origin i still viable, next j */
932 if (patlen)
933 break; /* best match found */
934
935 /* no match at origin i, next i and start over */
936 i += u8_rune(&c, i);
937 if (c == 0)
938 break; /* no match */
939 j = i;
940 s = 2;
941 } while (1);
942
943 if (patlen) {
944 /*
945 * Under no circumstances is the last character fed to
946 * the automaton part of the match. It is EOF's nullbyte,
947 * or it sent the automaton into a state with no further
948 * transitions available (s==1), or both. Room for a
949 * terminating nullbyte is guaranteed.
950 *
951 * ungetc any chars after the end of matching text
952 * (except for EOF's nullbyte, if present) and null
953 * terminate the buffer.
954 */
955 do
956 if (*--k && ungetc(*k, f) == EOF)
957 FATAL("unable to ungetc '%c'", *k);
958 while (k > patbeg + patlen);
959 *k = '\0';
960 return true;
961 }
962 else
963 return false;
964 }
965
reparse(const char * p)966 Node *reparse(const char *p) /* parses regular expression pointed to by p */
967 { /* uses relex() to scan regular expression */
968 Node *np;
969
970 DPRINTF("reparse <%s>\n", p);
971 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
972 rtok = relex();
973 /* GNU compatibility: an empty regexp matches anything */
974 if (rtok == '\0') {
975 /* FATAL("empty regular expression"); previous */
976 return(op2(EMPTYRE, NIL, NIL));
977 }
978 np = regexp();
979 if (rtok != '\0')
980 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
981 return(np);
982 }
983
regexp(void)984 Node *regexp(void) /* top-level parse of reg expr */
985 {
986 return (alt(concat(primary())));
987 }
988
primary(void)989 Node *primary(void)
990 {
991 Node *np;
992 int savelastatom;
993
994 switch (rtok) {
995 case CHAR:
996 lastatom = starttok;
997 np = op2(CHAR, NIL, itonp(rlxval));
998 rtok = relex();
999 return (unary(np));
1000 case ALL:
1001 rtok = relex();
1002 return (unary(op2(ALL, NIL, NIL)));
1003 case EMPTYRE:
1004 rtok = relex();
1005 return (unary(op2(EMPTYRE, NIL, NIL)));
1006 case DOT:
1007 lastatom = starttok;
1008 rtok = relex();
1009 return (unary(op2(DOT, NIL, NIL)));
1010 case CCL:
1011 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1012 lastatom = starttok;
1013 rtok = relex();
1014 return (unary(np));
1015 case NCCL:
1016 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1017 lastatom = starttok;
1018 rtok = relex();
1019 return (unary(np));
1020 case '^':
1021 rtok = relex();
1022 return (unary(op2(CHAR, NIL, itonp(HAT))));
1023 case '$':
1024 rtok = relex();
1025 return (unary(op2(CHAR, NIL, NIL)));
1026 case '(':
1027 lastatom = starttok;
1028 savelastatom = starttok - basestr; /* Retain over recursion */
1029 rtok = relex();
1030 if (rtok == ')') { /* special pleading for () */
1031 rtok = relex();
1032 return unary(op2(CCL, NIL, (Node *) cclenter("")));
1033 }
1034 np = regexp();
1035 if (rtok == ')') {
1036 lastatom = basestr + savelastatom; /* Restore */
1037 rtok = relex();
1038 return (unary(np));
1039 }
1040 else
1041 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1042 default:
1043 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1044 }
1045 return 0; /*NOTREACHED*/
1046 }
1047
concat(Node * np)1048 Node *concat(Node *np)
1049 {
1050 switch (rtok) {
1051 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1052 return (concat(op2(CAT, np, primary())));
1053 case EMPTYRE:
1054 rtok = relex();
1055 return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1056 primary())));
1057 }
1058 return (np);
1059 }
1060
alt(Node * np)1061 Node *alt(Node *np)
1062 {
1063 if (rtok == OR) {
1064 rtok = relex();
1065 return (alt(op2(OR, np, concat(primary()))));
1066 }
1067 return (np);
1068 }
1069
unary(Node * np)1070 Node *unary(Node *np)
1071 {
1072 switch (rtok) {
1073 case STAR:
1074 rtok = relex();
1075 return (unary(op2(STAR, np, NIL)));
1076 case PLUS:
1077 rtok = relex();
1078 return (unary(op2(PLUS, np, NIL)));
1079 case QUEST:
1080 rtok = relex();
1081 return (unary(op2(QUEST, np, NIL)));
1082 case ZERO:
1083 rtok = relex();
1084 return (unary(op2(ZERO, np, NIL)));
1085 default:
1086 return (np);
1087 }
1088 }
1089
1090 /*
1091 * Character class definitions conformant to the POSIX locale as
1092 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1093 * and operating character sets are both ASCII (ISO646) or supersets
1094 * thereof.
1095 *
1096 * Note that to avoid overflowing the temporary buffer used in
1097 * relex(), the expanded character class (prior to range expansion)
1098 * must be less than twice the size of their full name.
1099 */
1100
1101 /* Because isblank doesn't show up in any of the header files on any
1102 * system i use, it's defined here. if some other locale has a richer
1103 * definition of "blank", define HAS_ISBLANK and provide your own
1104 * version.
1105 * the parentheses here are an attempt to find a path through the maze
1106 * of macro definition and/or function and/or version provided. thanks
1107 * to nelson beebe for the suggestion; let's see if it works everywhere.
1108 */
1109
1110 #ifndef HAS_ISBLANK
1111
1112 int (xisblank)(int c)
1113 {
1114 return c==' ' || c=='\t';
1115 }
1116
1117 #endif
1118
1119 static const struct charclass {
1120 const char *cc_name;
1121 int cc_namelen;
1122 int (*cc_func)(int);
1123 } charclasses[] = {
1124 { "alnum", 5, isalnum },
1125 { "alpha", 5, isalpha },
1126 #ifndef HAS_ISBLANK
1127 { "blank", 5, xisblank },
1128 #else
1129 { "blank", 5, isblank },
1130 #endif
1131 { "cntrl", 5, iscntrl },
1132 { "digit", 5, isdigit },
1133 { "graph", 5, isgraph },
1134 { "lower", 5, islower },
1135 { "print", 5, isprint },
1136 { "punct", 5, ispunct },
1137 { "space", 5, isspace },
1138 { "upper", 5, isupper },
1139 { "xdigit", 6, isxdigit },
1140 { NULL, 0, NULL },
1141 };
1142
1143 #define REPEAT_SIMPLE 0
1144 #define REPEAT_PLUS_APPENDED 1
1145 #define REPEAT_WITH_Q 2
1146 #define REPEAT_ZERO 3
1147
1148 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)1149 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1150 int atomlen, int firstnum, int secondnum, int special_case)
1151 {
1152 int i, j;
1153 uschar *buf = NULL;
1154 int ret = 1;
1155 int init_q = (firstnum == 0); /* first added char will be ? */
1156 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1157 int prefix_length = reptok - basestr; /* prefix includes first rep */
1158 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1159 int size = prefix_length + suffix_length;
1160
1161 if (firstnum > 1) { /* add room for reps 2 through firstnum */
1162 size += atomlen*(firstnum-1);
1163 }
1164
1165 /* Adjust size of buffer for special cases */
1166 if (special_case == REPEAT_PLUS_APPENDED) {
1167 size++; /* for the final + */
1168 } else if (special_case == REPEAT_WITH_Q) {
1169 size += init_q + (atomlen+1)* (n_q_reps-init_q);
1170 } else if (special_case == REPEAT_ZERO) {
1171 size += 2; /* just a null ERE: () */
1172 }
1173 if ((buf = (uschar *) malloc(size + 1)) == NULL)
1174 FATAL("out of space in reg expr %.10s..", lastre);
1175 memcpy(buf, basestr, prefix_length); /* copy prefix */
1176 j = prefix_length;
1177 if (special_case == REPEAT_ZERO) {
1178 j -= atomlen;
1179 buf[j++] = '(';
1180 buf[j++] = ')';
1181 }
1182 for (i = 1; i < firstnum; i++) { /* copy x reps */
1183 memcpy(&buf[j], atom, atomlen);
1184 j += atomlen;
1185 }
1186 if (special_case == REPEAT_PLUS_APPENDED) {
1187 buf[j++] = '+';
1188 } else if (special_case == REPEAT_WITH_Q) {
1189 if (init_q)
1190 buf[j++] = '?';
1191 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1192 memcpy(&buf[j], atom, atomlen);
1193 j += atomlen;
1194 buf[j++] = '?';
1195 }
1196 }
1197 memcpy(&buf[j], reptok+reptoklen, suffix_length);
1198 j += suffix_length;
1199 buf[j] = '\0';
1200 /* free old basestr */
1201 if (firstbasestr != basestr) {
1202 if (basestr)
1203 xfree(basestr);
1204 }
1205 basestr = buf;
1206 prestr = buf + prefix_length;
1207 if (special_case == REPEAT_ZERO) {
1208 prestr -= atomlen;
1209 ret++;
1210 }
1211 return ret;
1212 }
1213
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)1214 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1215 int atomlen, int firstnum, int secondnum)
1216 {
1217 /*
1218 In general, the repetition specifier or "bound" is replaced here
1219 by an equivalent ERE string, repeating the immediately previous atom
1220 and appending ? and + as needed. Note that the first copy of the
1221 atom is left in place, except in the special_case of a zero-repeat
1222 (i.e., {0}).
1223 */
1224 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1225 if (firstnum < 2) {
1226 /* 0 or 1: should be handled before you get here */
1227 FATAL("internal error");
1228 } else {
1229 return replace_repeat(reptok, reptoklen, atom, atomlen,
1230 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1231 }
1232 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1233 if (firstnum == 0) { /* {0} or {0,0} */
1234 /* This case is unusual because the resulting
1235 replacement string might actually be SMALLER than
1236 the original ERE */
1237 return replace_repeat(reptok, reptoklen, atom, atomlen,
1238 firstnum, secondnum, REPEAT_ZERO);
1239 } else { /* (firstnum >= 1) */
1240 return replace_repeat(reptok, reptoklen, atom, atomlen,
1241 firstnum, secondnum, REPEAT_SIMPLE);
1242 }
1243 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1244 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1245 return replace_repeat(reptok, reptoklen, atom, atomlen,
1246 firstnum, secondnum, REPEAT_WITH_Q);
1247 } else { /* Error - shouldn't be here (n>m) */
1248 FATAL("internal error");
1249 }
1250 return 0;
1251 }
1252
relex(void)1253 int relex(void) /* lexical analyzer for reparse */
1254 {
1255 int c, n;
1256 int cflag;
1257 static uschar *buf = NULL;
1258 static int bufsz = 100;
1259 uschar *bp;
1260 const struct charclass *cc;
1261 int i;
1262 int num, m;
1263 bool commafound, digitfound;
1264 const uschar *startreptok;
1265 static int parens = 0;
1266
1267 rescan:
1268 starttok = prestr;
1269
1270 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1271 prestr += n;
1272 starttok = prestr;
1273 return CHAR;
1274 }
1275
1276 switch (c = *prestr++) {
1277 case '|': return OR;
1278 case '*': return STAR;
1279 case '+': return PLUS;
1280 case '?': return QUEST;
1281 case '.': return DOT;
1282 case '\0': prestr--; return '\0';
1283 case '^':
1284 case '$':
1285 return c;
1286 case '(':
1287 parens++;
1288 return c;
1289 case ')':
1290 if (parens) {
1291 parens--;
1292 return c;
1293 }
1294 /* unmatched close parenthesis; per POSIX, treat as literal */
1295 rlxval = c;
1296 return CHAR;
1297 case '\\':
1298 rlxval = quoted(&prestr);
1299 return CHAR;
1300 default:
1301 rlxval = c;
1302 return CHAR;
1303 case '[':
1304 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1305 FATAL("out of space in reg expr %.10s..", lastre);
1306 bp = buf;
1307 if (*prestr == '^') {
1308 cflag = 1;
1309 prestr++;
1310 }
1311 else
1312 cflag = 0;
1313 n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1314 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1315 FATAL("out of space for reg expr %.10s...", lastre);
1316 for (; ; ) {
1317 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1318 for (i = 0; i < n; i++)
1319 *bp++ = *prestr++;
1320 continue;
1321 }
1322 if ((c = *prestr++) == '\\') {
1323 *bp++ = '\\';
1324 if ((c = *prestr++) == '\0')
1325 FATAL("nonterminated character class %.20s...", lastre);
1326 *bp++ = c;
1327 /* } else if (c == '\n') { */
1328 /* FATAL("newline in character class %.20s...", lastre); */
1329 } else if (c == '[' && *prestr == ':') {
1330 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1331 for (cc = charclasses; cc->cc_name; cc++)
1332 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1333 break;
1334 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1335 prestr[2 + cc->cc_namelen] == ']') {
1336 prestr += cc->cc_namelen + 3;
1337 /*
1338 * BUG: We begin at 1, instead of 0, since we
1339 * would otherwise prematurely terminate the
1340 * string for classes like [[:cntrl:]]. This
1341 * means that we can't match the NUL character,
1342 * not without first adapting the entire
1343 * program to track each string's length.
1344 */
1345 for (i = 1; i <= UCHAR_MAX; i++) {
1346 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1347 FATAL("out of space for reg expr %.10s...", lastre);
1348 if (cc->cc_func(i)) {
1349 /* escape backslash */
1350 if (i == '\\') {
1351 *bp++ = '\\';
1352 n++;
1353 }
1354
1355 *bp++ = i;
1356 n++;
1357 }
1358 }
1359 } else
1360 *bp++ = c;
1361 } else if (c == '[' && *prestr == '.') {
1362 char collate_char;
1363 prestr++;
1364 collate_char = *prestr++;
1365 if (*prestr == '.' && prestr[1] == ']') {
1366 prestr += 2;
1367 /* Found it: map via locale TBD: for
1368 now, simply return this char. This
1369 is sufficient to pass conformance
1370 test awk.ex 156
1371 */
1372 if (*prestr == ']') {
1373 prestr++;
1374 rlxval = collate_char;
1375 return CHAR;
1376 }
1377 }
1378 } else if (c == '[' && *prestr == '=') {
1379 char equiv_char;
1380 prestr++;
1381 equiv_char = *prestr++;
1382 if (*prestr == '=' && prestr[1] == ']') {
1383 prestr += 2;
1384 /* Found it: map via locale TBD: for now
1385 simply return this char. This is
1386 sufficient to pass conformance test
1387 awk.ex 156
1388 */
1389 if (*prestr == ']') {
1390 prestr++;
1391 rlxval = equiv_char;
1392 return CHAR;
1393 }
1394 }
1395 } else if (c == '\0') {
1396 FATAL("nonterminated character class %.20s", lastre);
1397 } else if (bp == buf) { /* 1st char is special */
1398 *bp++ = c;
1399 } else if (c == ']') {
1400 *bp++ = 0;
1401 rlxstr = (uschar *) tostring((char *) buf);
1402 if (cflag == 0)
1403 return CCL;
1404 else
1405 return NCCL;
1406 } else
1407 *bp++ = c;
1408 }
1409 break;
1410 case '{':
1411 if (isdigit(*(prestr))) {
1412 num = 0; /* Process as a repetition */
1413 n = -1; m = -1;
1414 commafound = false;
1415 digitfound = false;
1416 startreptok = prestr-1;
1417 /* Remember start of previous atom here ? */
1418 } else { /* just a { char, not a repetition */
1419 rlxval = c;
1420 return CHAR;
1421 }
1422 for (; ; ) {
1423 if ((c = *prestr++) == '}') {
1424 if (commafound) {
1425 if (digitfound) { /* {n,m} */
1426 m = num;
1427 if (m < n)
1428 FATAL("illegal repetition expression: class %.20s",
1429 lastre);
1430 if (n == 0 && m == 1) {
1431 return QUEST;
1432 }
1433 } else { /* {n,} */
1434 if (n == 0)
1435 return STAR;
1436 else if (n == 1)
1437 return PLUS;
1438 }
1439 } else {
1440 if (digitfound) { /* {n} same as {n,n} */
1441 n = num;
1442 m = num;
1443 } else { /* {} */
1444 FATAL("illegal repetition expression: class %.20s",
1445 lastre);
1446 }
1447 }
1448 if (repeat(starttok, prestr-starttok, lastatom,
1449 startreptok - lastatom, n, m) > 0) {
1450 if (n == 0 && m == 0) {
1451 return ZERO;
1452 }
1453 /* must rescan input for next token */
1454 goto rescan;
1455 }
1456 /* Failed to replace: eat up {...} characters
1457 and treat like just PLUS */
1458 return PLUS;
1459 } else if (c == '\0') {
1460 FATAL("nonterminated character class %.20s",
1461 lastre);
1462 } else if (isdigit(c)) {
1463 num = 10 * num + c - '0';
1464 digitfound = true;
1465 } else if (c == ',') {
1466 if (commafound)
1467 FATAL("illegal repetition expression: class %.20s",
1468 lastre);
1469 /* looking for {n,} or {n,m} */
1470 commafound = true;
1471 n = num;
1472 digitfound = false; /* reset */
1473 num = 0;
1474 } else {
1475 FATAL("illegal repetition expression: class %.20s",
1476 lastre);
1477 }
1478 }
1479 break;
1480 }
1481 }
1482
cgoto(fa * f,int s,int c)1483 int cgoto(fa *f, int s, int c)
1484 {
1485 int *p, *q;
1486 int i, j, k;
1487
1488 /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1489 while (f->accept >= maxsetvec) { /* guessing here! */
1490 resizesetvec(__func__);
1491 }
1492 for (i = 0; i <= f->accept; i++)
1493 setvec[i] = 0;
1494 setcnt = 0;
1495 resize_state(f, s);
1496 /* compute positions of gototab[s,c] into setvec */
1497 p = f->posns[s];
1498 for (i = 1; i <= *p; i++) {
1499 if ((k = f->re[p[i]].ltype) != FINAL) {
1500 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1501 || (k == DOT && c != 0 && c != HAT)
1502 || (k == ALL && c != 0)
1503 || (k == EMPTYRE && c != 0)
1504 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1505 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1506 q = f->re[p[i]].lfollow;
1507 for (j = 1; j <= *q; j++) {
1508 if (q[j] >= maxsetvec) {
1509 resizesetvec(__func__);
1510 }
1511 if (setvec[q[j]] == 0) {
1512 setcnt++;
1513 setvec[q[j]] = 1;
1514 }
1515 }
1516 }
1517 }
1518 }
1519 /* determine if setvec is a previous state */
1520 tmpset[0] = setcnt;
1521 j = 1;
1522 for (i = f->accept; i >= 0; i--)
1523 if (setvec[i]) {
1524 tmpset[j++] = i;
1525 }
1526 resize_state(f, f->curstat > s ? f->curstat : s);
1527 /* tmpset == previous state? */
1528 for (i = 1; i <= f->curstat; i++) {
1529 p = f->posns[i];
1530 if ((k = tmpset[0]) != p[0])
1531 goto different;
1532 for (j = 1; j <= k; j++)
1533 if (tmpset[j] != p[j])
1534 goto different;
1535 /* setvec is state i */
1536 if (c != HAT)
1537 set_gototab(f, s, c, i);
1538 return i;
1539 different:;
1540 }
1541
1542 /* add tmpset to current set of states */
1543 ++(f->curstat);
1544 resize_state(f, f->curstat);
1545 clear_gototab(f, f->curstat);
1546 xfree(f->posns[f->curstat]);
1547 p = intalloc(setcnt + 1, __func__);
1548
1549 f->posns[f->curstat] = p;
1550 if (c != HAT)
1551 set_gototab(f, s, c, f->curstat);
1552 for (i = 0; i <= setcnt; i++)
1553 p[i] = tmpset[i];
1554 if (setvec[f->accept])
1555 f->out[f->curstat] = 1;
1556 else
1557 f->out[f->curstat] = 0;
1558 return f->curstat;
1559 }
1560
1561
freefa(fa * f)1562 void freefa(fa *f) /* free a finite automaton */
1563 {
1564 int i;
1565
1566 if (f == NULL)
1567 return;
1568 for (i = 0; i < f->state_count; i++)
1569 xfree(f->gototab[i].entries);
1570 xfree(f->gototab);
1571 for (i = 0; i <= f->curstat; i++)
1572 xfree(f->posns[i]);
1573 for (i = 0; i <= f->accept; i++) {
1574 xfree(f->re[i].lfollow);
1575 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1576 xfree(f->re[i].lval.np);
1577 }
1578 xfree(f->restr);
1579 xfree(f->out);
1580 xfree(f->posns);
1581 xfree(f->gototab);
1582 xfree(f);
1583 }
1584