xref: /openbsd/usr.bin/awk/b.c (revision 4fb9ab68)
1 /*	$OpenBSD: b.c,v 1.53 2024/06/03 00:55:05 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 /* lasciate ogne speranza, voi ch'intrate. */
27 
28 #define	DEBUG
29 
30 #include <ctype.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <string.h>
34 #include <stdlib.h>
35 #include "awk.h"
36 #include "awkgram.tab.h"
37 
38 #define MAXLIN 22
39 
40 #define type(v)		(v)->nobj	/* badly overloaded here */
41 #define info(v)		(v)->ntype	/* badly overloaded here */
42 #define left(v)		(v)->narg[0]
43 #define right(v)	(v)->narg[1]
44 #define parent(v)	(v)->nnext
45 
46 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
48 #define UNARY	case STAR: case PLUS: case QUEST:
49 
50 /* encoding in tree Nodes:
51 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 		left is index, right contains value or pointer to value
53 	unary (STAR, PLUS, QUEST): left is child, right is null
54 	binary (CAT, OR): left and right are children
55 	parent contains pointer to parent
56 */
57 
58 
59 int	*setvec;
60 int	*tmpset;
61 int	maxsetvec = 0;
62 
63 int	rtok;		/* next token in current re */
64 int	rlxval;
65 static const uschar	*rlxstr;
66 static const uschar	*prestr;	/* current position in current re */
67 static const uschar	*lastre;	/* origin of last re */
68 static const uschar	*lastatom;	/* origin of last Atom */
69 static const uschar	*starttok;
70 static const uschar 	*basestr;	/* starts with original, replaced during
71 				   repetition processing */
72 static const uschar 	*firstbasestr;
73 
74 static	int setcnt;
75 static	int poscnt;
76 
77 const char	*patbeg;
78 int	patlen;
79 
80 #define	NFA	128	/* cache this many dynamic fa's */
81 fa	*fatab[NFA];
82 int	nfatab	= 0;	/* entries in fatab */
83 
84 /* utf-8 mechanism:
85 
86    For most of Awk, utf-8 strings just "work", since they look like
87    null-terminated sequences of 8-bit bytes.
88 
89    Functions like length(), index(), and substr() have to operate
90    in units of utf-8 characters.  The u8_* functions in run.c
91    handle this.
92 
93    Regular expressions are more complicated, since the basic
94    mechanism of the goto table used 8-bit byte indices into the
95    gototab entries to compute the next state.  Unicode is a lot
96    bigger, so the gototab entries are now structs with a character
97    and a next state. These are sorted by code point and binary
98    searched.
99 
100    Throughout the RE mechanism in b.c, utf-8 characters are
101    converted to their utf-32 value.  This mostly shows up in
102    cclenter, which expands character class ranges like a-z and now
103    alpha-omega.  The size of a gototab array is still about 256.
104    This should be dynamic, but for now things work ok for a single
105    code page of Unicode, which is the most likely case.
106 
107    The code changes are localized in run.c and b.c.  I have added a
108    handful of functions to somewhat better hide the implementation,
109    but a lot more could be done.
110 
111  */
112 
113 static int entry_cmp(const void *l, const void *r);
114 static int get_gototab(fa*, int, int);
115 static int set_gototab(fa*, int, int, int);
116 static void clear_gototab(fa*, int);
117 
118 static int *
119 intalloc(size_t n, const char *f)
120 {
121 	int *p = (int *) calloc(n, sizeof(int));
122 	if (p == NULL)
123 		overflo(f);
124 	return p;
125 }
126 
127 static void
128 allocsetvec(const char *f)
129 {
130 	maxsetvec = MAXLIN;
131 	setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
132 	tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
133 	if (setvec == NULL || tmpset == NULL)
134 		overflo(f);
135 }
136 
137 static void
138 resizesetvec(const char *f)
139 {
140 	setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
141 	tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
142 	if (setvec == NULL || tmpset == NULL)
143 		overflo(f);
144 	maxsetvec *= 4;
145 }
146 
147 static void
148 resize_state(fa *f, int state)
149 {
150 	gtt *p;
151 	uschar *p2;
152 	int **p3;
153 	int i, new_count;
154 
155 	if (++state < f->state_count)
156 		return;
157 
158 	new_count = state + 10; /* needs to be tuned */
159 
160 	p = (gtt *) reallocarray(f->gototab, new_count, sizeof(gtt));
161 	if (p == NULL)
162 		goto out;
163 	f->gototab = p;
164 
165 	p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
166 	if (p2 == NULL)
167 		goto out;
168 	f->out = p2;
169 
170 	p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
171 	if (p3 == NULL)
172 		goto out;
173 	f->posns = p3;
174 
175 	for (i = f->state_count; i < new_count; ++i) {
176 		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
177 		if (f->gototab[i].entries == NULL)
178 			goto out;
179 		f->gototab[i].allocated = NCHARS;
180 		f->gototab[i].inuse = 0;
181 		f->out[i] = 0;
182 		f->posns[i] = NULL;
183 	}
184 	f->state_count = new_count;
185 	return;
186 out:
187 	overflo(__func__);
188 }
189 
190 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
191 {
192 	int i, use, nuse;
193 	fa *pfa;
194 	static int now = 1;
195 
196 	if (setvec == NULL) {	/* first time through any RE */
197 		allocsetvec(__func__);
198 	}
199 
200 	if (compile_time != RUNNING)	/* a constant for sure */
201 		return mkdfa(s, anchor);
202 	for (i = 0; i < nfatab; i++)	/* is it there already? */
203 		if (fatab[i]->anchor == anchor
204 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
205 			fatab[i]->use = now++;
206 			return fatab[i];
207 		}
208 	pfa = mkdfa(s, anchor);
209 	if (nfatab < NFA) {	/* room for another */
210 		fatab[nfatab] = pfa;
211 		fatab[nfatab]->use = now++;
212 		nfatab++;
213 		return pfa;
214 	}
215 	use = fatab[0]->use;	/* replace least-recently used */
216 	nuse = 0;
217 	for (i = 1; i < nfatab; i++)
218 		if (fatab[i]->use < use) {
219 			use = fatab[i]->use;
220 			nuse = i;
221 		}
222 	freefa(fatab[nuse]);
223 	fatab[nuse] = pfa;
224 	pfa->use = now++;
225 	return pfa;
226 }
227 
228 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
229 				/* anchor = true for anchored matches, else false */
230 {
231 	Node *p, *p1;
232 	fa *f;
233 
234 	firstbasestr = (const uschar *) s;
235 	basestr = firstbasestr;
236 	p = reparse(s);
237 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
238 		/* put ALL STAR in front of reg.  exp. */
239 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
240 		/* put FINAL after reg.  exp. */
241 
242 	poscnt = 0;
243 	penter(p1);	/* enter parent pointers and leaf indices */
244 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
245 		overflo(__func__);
246 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
247 	cfoll(f, p1);	/* set up follow sets */
248 	freetr(p1);
249 	resize_state(f, 1);
250 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
251 	f->posns[1] = intalloc(1, __func__);
252 	*f->posns[1] = 0;
253 	f->initstat = makeinit(f, anchor);
254 	f->anchor = anchor;
255 	f->restr = (uschar *) tostring(s);
256 	if (firstbasestr != basestr) {
257 		if (basestr)
258 			xfree(basestr);
259 	}
260 	return f;
261 }
262 
263 int makeinit(fa *f, bool anchor)
264 {
265 	int i, k;
266 
267 	f->curstat = 2;
268 	f->out[2] = 0;
269 	k = *(f->re[0].lfollow);
270 	xfree(f->posns[2]);
271 	f->posns[2] = intalloc(k + 1,  __func__);
272 	for (i = 0; i <= k; i++) {
273 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
274 	}
275 	if ((f->posns[2])[1] == f->accept)
276 		f->out[2] = 1;
277 	clear_gototab(f, 2);
278 	f->curstat = cgoto(f, 2, HAT);
279 	if (anchor) {
280 		*f->posns[2] = k-1;	/* leave out position 0 */
281 		for (i = 0; i < k; i++) {
282 			(f->posns[0])[i] = (f->posns[2])[i];
283 		}
284 
285 		f->out[0] = f->out[2];
286 		if (f->curstat != 2)
287 			--(*f->posns[f->curstat]);
288 	}
289 	return f->curstat;
290 }
291 
292 void penter(Node *p)	/* set up parent pointers and leaf indices */
293 {
294 	switch (type(p)) {
295 	ELEAF
296 	LEAF
297 		info(p) = poscnt;
298 		poscnt++;
299 		break;
300 	UNARY
301 		penter(left(p));
302 		parent(left(p)) = p;
303 		break;
304 	case CAT:
305 	case OR:
306 		penter(left(p));
307 		penter(right(p));
308 		parent(left(p)) = p;
309 		parent(right(p)) = p;
310 		break;
311 	case ZERO:
312 		break;
313 	default:	/* can't happen */
314 		FATAL("can't happen: unknown type %d in penter", type(p));
315 		break;
316 	}
317 }
318 
319 void freetr(Node *p)	/* free parse tree */
320 {
321 	switch (type(p)) {
322 	ELEAF
323 	LEAF
324 		xfree(p);
325 		break;
326 	UNARY
327 	case ZERO:
328 		freetr(left(p));
329 		xfree(p);
330 		break;
331 	case CAT:
332 	case OR:
333 		freetr(left(p));
334 		freetr(right(p));
335 		xfree(p);
336 		break;
337 	default:	/* can't happen */
338 		FATAL("can't happen: unknown type %d in freetr", type(p));
339 		break;
340 	}
341 }
342 
343 /* in the parsing of regular expressions, metacharacters like . have */
344 /* to be seen literally;  \056 is not a metacharacter. */
345 
346 static int
347 hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
348 {			/* only pick up one 8-bit byte (2 chars) */
349 	const uschar *p;
350 	int n = 0;
351 	int i;
352 
353 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
354 		if (isdigit(*p))
355 			n = 16 * n + *p - '0';
356 		else if (*p >= 'a' && *p <= 'f')
357 			n = 16 * n + *p - 'a' + 10;
358 		else if (*p >= 'A' && *p <= 'F')
359 			n = 16 * n + *p - 'A' + 10;
360 	}
361 	*pp = p;
362 	return n;
363 }
364 
365 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
366 
367 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
368 			/* and increment *pp */
369 {
370 	const uschar *p = *pp;
371 	int c;
372 
373 /* BUG: should advance by utf-8 char even if makes no sense */
374 
375 	if ((c = *p++) == 't') {
376 		c = '\t';
377 	} else if (c == 'n') {
378 		c = '\n';
379 	} else if (c == 'f') {
380 		c = '\f';
381 	} else if (c == 'r') {
382 		c = '\r';
383 	} else if (c == 'b') {
384 		c = '\b';
385 	} else if (c == 'v') {
386 		c = '\v';
387 	} else if (c == 'a') {
388 		c = '\a';
389 	} else if (c == '\\') {
390 		c = '\\';
391 	} else if (c == 'x') {	/* 2 hex digits follow */
392 		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
393 	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
394 		c = hexstr(&p, 8);
395 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
396 		int n = c - '0';
397 		if (isoctdigit(*p)) {
398 			n = 8 * n + *p++ - '0';
399 			if (isoctdigit(*p))
400 				n = 8 * n + *p++ - '0';
401 		}
402 		c = n;
403 	} /* else */
404 		/* c = c; */
405 	*pp = p;
406 	return c;
407 }
408 
409 int *cclenter(const char *argp)	/* add a character class */
410 {
411 	int i, c, c2;
412 	int n;
413 	const uschar *p = (const uschar *) argp;
414 	int *bp, *retp;
415 	static int *buf = NULL;
416 	static int bufsz = 100;
417 
418 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
419 		FATAL("out of space for character class [%.10s...] 1", p);
420 	bp = buf;
421 	for (i = 0; *p != 0; ) {
422 		n = u8_rune(&c, (const char *) p);
423 		p += n;
424 		if (c == '\\') {
425 			c = quoted(&p);
426 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
427 			if (*p != 0) {
428 				c = bp[-1];
429 				/* c2 = *p++; */
430 				n = u8_rune(&c2, (const char *) p);
431 				p += n;
432 				if (c2 == '\\')
433 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
434 				if (c > c2) {	/* empty; ignore */
435 					bp--;
436 					i--;
437 					continue;
438 				}
439 				while (c < c2) {
440 					if (i >= bufsz) {
441 						buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
442 						if (buf == NULL)
443 							FATAL("out of space for character class [%.10s...] 2", p);
444 						bufsz *= 2;
445 						bp = buf + i;
446 					}
447 					*bp++ = ++c;
448 					i++;
449 				}
450 				continue;
451 			}
452 		}
453 		if (i >= bufsz) {
454 			buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
455 			if (buf == NULL)
456 				FATAL("out of space for character class [%.10s...] 2", p);
457 			bufsz *= 2;
458 			bp = buf + i;
459 		}
460 		*bp++ = c;
461 		i++;
462 	}
463 	*bp = 0;
464 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
465 	/* xfree(op);  BUG: what are we freeing here? */
466 	retp = (int *) calloc(bp-buf+1, sizeof(int));
467 	for (i = 0; i < bp-buf+1; i++)
468 		retp[i] = buf[i];
469 	return retp;
470 }
471 
472 void overflo(const char *s)
473 {
474 	FATAL("regular expression too big: out of space in %.30s...", s);
475 }
476 
477 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
478 {
479 	int i;
480 	int *p;
481 
482 	switch (type(v)) {
483 	ELEAF
484 	LEAF
485 		f->re[info(v)].ltype = type(v);
486 		f->re[info(v)].lval.np = right(v);
487 		while (f->accept >= maxsetvec) {	/* guessing here! */
488 			resizesetvec(__func__);
489 		}
490 		for (i = 0; i <= f->accept; i++)
491 			setvec[i] = 0;
492 		setcnt = 0;
493 		follow(v);	/* computes setvec and setcnt */
494 		p = intalloc(setcnt + 1, __func__);
495 		f->re[info(v)].lfollow = p;
496 		*p = setcnt;
497 		for (i = f->accept; i >= 0; i--)
498 			if (setvec[i] == 1)
499 				*++p = i;
500 		break;
501 	UNARY
502 		cfoll(f,left(v));
503 		break;
504 	case CAT:
505 	case OR:
506 		cfoll(f,left(v));
507 		cfoll(f,right(v));
508 		break;
509 	case ZERO:
510 		break;
511 	default:	/* can't happen */
512 		FATAL("can't happen: unknown type %d in cfoll", type(v));
513 	}
514 }
515 
516 int first(Node *p)	/* collects initially active leaves of p into setvec */
517 			/* returns 0 if p matches empty string */
518 {
519 	int b, lp;
520 
521 	switch (type(p)) {
522 	ELEAF
523 	LEAF
524 		lp = info(p);	/* look for high-water mark of subscripts */
525 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
526 			resizesetvec(__func__);
527 		}
528 		if (type(p) == EMPTYRE) {
529 			setvec[lp] = 0;
530 			return(0);
531 		}
532 		if (setvec[lp] != 1) {
533 			setvec[lp] = 1;
534 			setcnt++;
535 		}
536 		if (type(p) == CCL && (*(int *) right(p)) == 0)
537 			return(0);		/* empty CCL */
538 		return(1);
539 	case PLUS:
540 		if (first(left(p)) == 0)
541 			return(0);
542 		return(1);
543 	case STAR:
544 	case QUEST:
545 		first(left(p));
546 		return(0);
547 	case CAT:
548 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
549 		return(1);
550 	case OR:
551 		b = first(right(p));
552 		if (first(left(p)) == 0 || b == 0) return(0);
553 		return(1);
554 	case ZERO:
555 		return 0;
556 	}
557 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
558 	return(-1);
559 }
560 
561 void follow(Node *v)	/* collects leaves that can follow v into setvec */
562 {
563 	Node *p;
564 
565 	if (type(v) == FINAL)
566 		return;
567 	p = parent(v);
568 	switch (type(p)) {
569 	case STAR:
570 	case PLUS:
571 		first(v);
572 		follow(p);
573 		return;
574 
575 	case OR:
576 	case QUEST:
577 		follow(p);
578 		return;
579 
580 	case CAT:
581 		if (v == left(p)) {	/* v is left child of p */
582 			if (first(right(p)) == 0) {
583 				follow(p);
584 				return;
585 			}
586 		} else		/* v is right child */
587 			follow(p);
588 		return;
589 	}
590 }
591 
592 int member(int c, int *sarg)	/* is c in s? */
593 {
594 	int *s = (int *) sarg;
595 
596 	while (*s)
597 		if (c == *s++)
598 			return(1);
599 	return(0);
600 }
601 
602 static void resize_gototab(fa *f, int state)
603 {
604 	size_t new_size = f->gototab[state].allocated * 2;
605 	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
606 	if (p == NULL)
607 		overflo(__func__);
608 
609 	// need to initialized the new memory to zero
610 	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
611 	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
612 
613 	f->gototab[state].allocated = new_size;			// update gototab info
614 	f->gototab[state].entries = p;
615 }
616 
617 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
618 {
619 	gtte key;
620 	gtte *item;
621 
622 	key.ch = ch;
623 	key.state = 0;	/* irrelevant */
624 	item = (gtte *) bsearch(& key, f->gototab[state].entries,
625 			f->gototab[state].inuse, sizeof(gtte),
626 			entry_cmp);
627 
628 	if (item == NULL)
629 		return 0;
630 	else
631 		return item->state;
632 }
633 
634 static int entry_cmp(const void *l, const void *r)
635 {
636 	const gtte *left, *right;
637 
638 	left = (const gtte *) l;
639 	right = (const gtte *) r;
640 
641 	return left->ch - right->ch;
642 }
643 
644 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
645 {
646 	if (f->gototab[state].inuse == 0) {
647 		f->gototab[state].entries[0].ch = ch;
648 		f->gototab[state].entries[0].state = val;
649 		f->gototab[state].inuse++;
650 		return val;
651 	} else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
652 		// not seen yet, insert and return
653 		gtt *tab = & f->gototab[state];
654 		if (tab->inuse + 1 >= tab->allocated)
655 			resize_gototab(f, state);
656 
657 		f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
658 		f->gototab[state].entries[f->gototab[state].inuse].state = val;
659 		f->gototab[state].inuse++;
660 		return val;
661 	} else {
662 		// maybe we have it, maybe we don't
663 		gtte key;
664 		gtte *item;
665 
666 		key.ch = ch;
667 		key.state = 0;	/* irrelevant */
668 		item = (gtte *) bsearch(& key, f->gototab[state].entries,
669 				f->gototab[state].inuse, sizeof(gtte),
670 				entry_cmp);
671 
672 		if (item != NULL) {
673 			// we have it, update state and return
674 			item->state = val;
675 			return item->state;
676 		}
677 		// otherwise, fall through to insert and reallocate.
678 	}
679 
680 	gtt *tab = & f->gototab[state];
681 	if (tab->inuse + 1 >= tab->allocated)
682 		resize_gototab(f, state);
683 	f->gototab[state].entries[tab->inuse].ch = ch;
684 	f->gototab[state].entries[tab->inuse].state = val;
685 	++tab->inuse;
686 
687 	qsort(f->gototab[state].entries,
688 		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
689 
690 	return val; /* not used anywhere at the moment */
691 }
692 
693 static void clear_gototab(fa *f, int state)
694 {
695 	memset(f->gototab[state].entries, 0,
696 		f->gototab[state].allocated * sizeof(gtte));
697 	f->gototab[state].inuse = 0;
698 }
699 
700 int match(fa *f, const char *p0)	/* shortest match ? */
701 {
702 	int s, ns;
703 	int n;
704 	int rune;
705 	const uschar *p = (const uschar *) p0;
706 
707 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
708 
709 	s = f->initstat;
710 	assert (s < f->state_count);
711 
712 	if (f->out[s])
713 		return(1);
714 	do {
715 		/* assert(*p < NCHARS); */
716 		n = u8_rune(&rune, (const char *) p);
717 		if ((ns = get_gototab(f, s, rune)) != 0)
718 			s = ns;
719 		else
720 			s = cgoto(f, s, rune);
721 		if (f->out[s])
722 			return(1);
723 		if (*p == 0)
724 			break;
725 		p += n;
726 	} while (1);  /* was *p++ != 0 */
727 	return(0);
728 }
729 
730 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
731 {
732 	int s, ns;
733 	int n;
734 	int rune;
735 	const uschar *p = (const uschar *) p0;
736 	const uschar *q;
737 
738 	s = f->initstat;
739 	assert(s < f->state_count);
740 
741 	patbeg = (const char *)p;
742 	patlen = -1;
743 	do {
744 		q = p;
745 		do {
746 			if (f->out[s])		/* final state */
747 				patlen = q-p;
748 			/* assert(*q < NCHARS); */
749 			n = u8_rune(&rune, (const char *) q);
750 			if ((ns = get_gototab(f, s, rune)) != 0)
751 				s = ns;
752 			else
753 				s = cgoto(f, s, rune);
754 
755 			assert(s < f->state_count);
756 
757 			if (s == 1) {	/* no transition */
758 				if (patlen >= 0) {
759 					patbeg = (const char *) p;
760 					return(1);
761 				}
762 				else
763 					goto nextin;	/* no match */
764 			}
765 			if (*q == 0)
766 				break;
767 			q += n;
768 		} while (1);
769 		q++;  /* was *q++ */
770 		if (f->out[s])
771 			patlen = q-p-1;	/* don't count $ */
772 		if (patlen >= 0) {
773 			patbeg = (const char *) p;
774 			return(1);
775 		}
776 	nextin:
777 		s = 2;
778 		if (*p == 0)
779 			break;
780 		n = u8_rune(&rune, (const char *) p);
781 		p += n;
782 	} while (1); /* was *p++ */
783 	return (0);
784 }
785 
786 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
787 {
788 	int s, ns;
789 	int n;
790 	int rune;
791 	const uschar *p = (const uschar *) p0;
792 	const uschar *q;
793 
794 	s = f->initstat;
795 	assert(s < f->state_count);
796 
797 	patbeg = (const char *)p;
798 	patlen = -1;
799 	while (*p) {
800 		q = p;
801 		do {
802 			if (f->out[s])		/* final state */
803 				patlen = q-p;
804 			/* assert(*q < NCHARS); */
805 			n = u8_rune(&rune, (const char *) q);
806 			if ((ns = get_gototab(f, s, rune)) != 0)
807 				s = ns;
808 			else
809 				s = cgoto(f, s, rune);
810 			if (s == 1) {	/* no transition */
811 				if (patlen > 0) {
812 					patbeg = (const char *) p;
813 					return(1);
814 				} else
815 					goto nnextin;	/* no nonempty match */
816 			}
817 			if (*q == 0)
818 				break;
819 			q += n;
820 		} while (1);
821 		q++;
822 		if (f->out[s])
823 			patlen = q-p-1;	/* don't count $ */
824 		if (patlen > 0 ) {
825 			patbeg = (const char *) p;
826 			return(1);
827 		}
828 	nnextin:
829 		s = 2;
830 		p++;
831 	}
832 	return (0);
833 }
834 
835 
836 /*
837  * NAME
838  *     fnematch
839  *
840  * DESCRIPTION
841  *     A stream-fed version of nematch which transfers characters to a
842  *     null-terminated buffer. All characters up to and including the last
843  *     character of the matching text or EOF are placed in the buffer. If
844  *     a match is found, patbeg and patlen are set appropriately.
845  *
846  * RETURN VALUES
847  *     false    No match found.
848  *     true     Match found.
849  */
850 
851 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
852 {
853 	char *i, *j, *k, *buf = *pbuf;
854 	int bufsize = *pbufsize;
855 	int c, n, ns, s;
856 
857 	s = pfa->initstat;
858 	patlen = 0;
859 
860 	/*
861 	 * buf <= i <= j <= k <= buf+bufsize
862 	 *
863 	 * i: origin of active substring
864 	 * j: current character
865 	 * k: destination of the next getc
866 	 */
867 
868 	i = j = k = buf;
869 
870 	do {
871 		/*
872 		 * Call u8_rune with at least awk_mb_cur_max ahead in
873 		 * the buffer until EOF interferes.
874 		 */
875 		if (k - j < (int)awk_mb_cur_max) {
876 			if (k + awk_mb_cur_max > buf + bufsize) {
877 				char *obuf = buf;
878 				adjbuf(&buf, &bufsize,
879 				    bufsize + awk_mb_cur_max,
880 				    quantum, 0, "fnematch");
881 
882 				/* buf resized, maybe moved. update pointers */
883 				*pbufsize = bufsize;
884 				if (obuf != buf) {
885 					i = buf + (i - obuf);
886 					j = buf + (j - obuf);
887 					k = buf + (k - obuf);
888 					*pbuf = buf;
889 					if (patlen)
890 						patbeg = buf + (patbeg - obuf);
891 				}
892 			}
893 			for (n = awk_mb_cur_max ; n > 0; n--) {
894 				*k++ = (c = getc(f)) != EOF ? c : 0;
895 				if (c == EOF) {
896 					if (ferror(f))
897 						FATAL("fnematch: getc error");
898 					break;
899 				}
900 			}
901 		}
902 
903 		j += u8_rune(&c, j);
904 
905 		if ((ns = get_gototab(pfa, s, c)) != 0)
906 			s = ns;
907 		else
908 			s = cgoto(pfa, s, c);
909 
910 		if (pfa->out[s]) {	/* final state */
911 			patbeg = i;
912 			patlen = j - i;
913 			if (c == 0)	/* don't count $ */
914 				patlen--;
915 		}
916 
917 		if (c && s != 1)
918 			continue;  /* origin i still viable, next j */
919 		if (patlen)
920 			break;     /* best match found */
921 
922 		/* no match at origin i, next i and start over */
923 		i += u8_rune(&c, i);
924 		if (c == 0)
925 			break;    /* no match */
926 		j = i;
927 		s = 2;
928 	} while (1);
929 
930 	if (patlen) {
931 		/*
932 		 * Under no circumstances is the last character fed to
933 		 * the automaton part of the match. It is EOF's nullbyte,
934 		 * or it sent the automaton into a state with no further
935 		 * transitions available (s==1), or both. Room for a
936 		 * terminating nullbyte is guaranteed.
937 		 *
938 		 * ungetc any chars after the end of matching text
939 		 * (except for EOF's nullbyte, if present) and null
940 		 * terminate the buffer.
941 		 */
942 		do
943 			if (*--k && ungetc(*k, f) == EOF)
944 				FATAL("unable to ungetc '%c'", *k);
945 		while (k > patbeg + patlen);
946 		*k = '\0';
947 		return true;
948 	}
949 	else
950 		return false;
951 }
952 
953 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
954 {			/* uses relex() to scan regular expression */
955 	Node *np;
956 
957 	DPRINTF("reparse <%s>\n", p);
958 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
959 	rtok = relex();
960 	/* GNU compatibility: an empty regexp matches anything */
961 	if (rtok == '\0') {
962 		/* FATAL("empty regular expression"); previous */
963 		return(op2(EMPTYRE, NIL, NIL));
964 	}
965 	np = regexp();
966 	if (rtok != '\0')
967 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
968 	return(np);
969 }
970 
971 Node *regexp(void)	/* top-level parse of reg expr */
972 {
973 	return (alt(concat(primary())));
974 }
975 
976 Node *primary(void)
977 {
978 	Node *np;
979 	int savelastatom;
980 
981 	switch (rtok) {
982 	case CHAR:
983 		lastatom = starttok;
984 		np = op2(CHAR, NIL, itonp(rlxval));
985 		rtok = relex();
986 		return (unary(np));
987 	case ALL:
988 		rtok = relex();
989 		return (unary(op2(ALL, NIL, NIL)));
990 	case EMPTYRE:
991 		rtok = relex();
992 		return (unary(op2(EMPTYRE, NIL, NIL)));
993 	case DOT:
994 		lastatom = starttok;
995 		rtok = relex();
996 		return (unary(op2(DOT, NIL, NIL)));
997 	case CCL:
998 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
999 		lastatom = starttok;
1000 		rtok = relex();
1001 		return (unary(np));
1002 	case NCCL:
1003 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1004 		lastatom = starttok;
1005 		rtok = relex();
1006 		return (unary(np));
1007 	case '^':
1008 		rtok = relex();
1009 		return (unary(op2(CHAR, NIL, itonp(HAT))));
1010 	case '$':
1011 		rtok = relex();
1012 		return (unary(op2(CHAR, NIL, NIL)));
1013 	case '(':
1014 		lastatom = starttok;
1015 		savelastatom = starttok - basestr; /* Retain over recursion */
1016 		rtok = relex();
1017 		if (rtok == ')') {	/* special pleading for () */
1018 			rtok = relex();
1019 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
1020 		}
1021 		np = regexp();
1022 		if (rtok == ')') {
1023 			lastatom = basestr + savelastatom; /* Restore */
1024 			rtok = relex();
1025 			return (unary(np));
1026 		}
1027 		else
1028 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1029 	default:
1030 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1031 	}
1032 	return 0;	/*NOTREACHED*/
1033 }
1034 
1035 Node *concat(Node *np)
1036 {
1037 	switch (rtok) {
1038 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1039 		return (concat(op2(CAT, np, primary())));
1040 	case EMPTYRE:
1041 		rtok = relex();
1042 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1043 				primary())));
1044 	}
1045 	return (np);
1046 }
1047 
1048 Node *alt(Node *np)
1049 {
1050 	if (rtok == OR) {
1051 		rtok = relex();
1052 		return (alt(op2(OR, np, concat(primary()))));
1053 	}
1054 	return (np);
1055 }
1056 
1057 Node *unary(Node *np)
1058 {
1059 	switch (rtok) {
1060 	case STAR:
1061 		rtok = relex();
1062 		return (unary(op2(STAR, np, NIL)));
1063 	case PLUS:
1064 		rtok = relex();
1065 		return (unary(op2(PLUS, np, NIL)));
1066 	case QUEST:
1067 		rtok = relex();
1068 		return (unary(op2(QUEST, np, NIL)));
1069 	case ZERO:
1070 		rtok = relex();
1071 		return (unary(op2(ZERO, np, NIL)));
1072 	default:
1073 		return (np);
1074 	}
1075 }
1076 
1077 /*
1078  * Character class definitions conformant to the POSIX locale as
1079  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1080  * and operating character sets are both ASCII (ISO646) or supersets
1081  * thereof.
1082  *
1083  * Note that to avoid overflowing the temporary buffer used in
1084  * relex(), the expanded character class (prior to range expansion)
1085  * must be less than twice the size of their full name.
1086  */
1087 
1088 /* Because isblank doesn't show up in any of the header files on any
1089  * system i use, it's defined here.  if some other locale has a richer
1090  * definition of "blank", define HAS_ISBLANK and provide your own
1091  * version.
1092  * the parentheses here are an attempt to find a path through the maze
1093  * of macro definition and/or function and/or version provided.  thanks
1094  * to nelson beebe for the suggestion; let's see if it works everywhere.
1095  */
1096 
1097 #ifndef HAS_ISBLANK
1098 
1099 int (xisblank)(int c)
1100 {
1101 	return c==' ' || c=='\t';
1102 }
1103 
1104 #endif
1105 
1106 static const struct charclass {
1107 	const char *cc_name;
1108 	int cc_namelen;
1109 	int (*cc_func)(int);
1110 } charclasses[] = {
1111 	{ "alnum",	5,	isalnum },
1112 	{ "alpha",	5,	isalpha },
1113 #ifndef HAS_ISBLANK
1114 	{ "blank",	5,	xisblank },
1115 #else
1116 	{ "blank",	5,	isblank },
1117 #endif
1118 	{ "cntrl",	5,	iscntrl },
1119 	{ "digit",	5,	isdigit },
1120 	{ "graph",	5,	isgraph },
1121 	{ "lower",	5,	islower },
1122 	{ "print",	5,	isprint },
1123 	{ "punct",	5,	ispunct },
1124 	{ "space",	5,	isspace },
1125 	{ "upper",	5,	isupper },
1126 	{ "xdigit",	6,	isxdigit },
1127 	{ NULL,		0,	NULL },
1128 };
1129 
1130 #define REPEAT_SIMPLE		0
1131 #define REPEAT_PLUS_APPENDED	1
1132 #define REPEAT_WITH_Q		2
1133 #define REPEAT_ZERO		3
1134 
1135 static int
1136 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1137 	       int atomlen, int firstnum, int secondnum, int special_case)
1138 {
1139 	int i, j;
1140 	uschar *buf = NULL;
1141 	int ret = 1;
1142 	int init_q = (firstnum == 0);		/* first added char will be ? */
1143 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1144 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1145 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1146 	int size = prefix_length +  suffix_length;
1147 
1148 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1149 		size += atomlen*(firstnum-1);
1150 	}
1151 
1152 	/* Adjust size of buffer for special cases */
1153 	if (special_case == REPEAT_PLUS_APPENDED) {
1154 		size++;		/* for the final + */
1155 	} else if (special_case == REPEAT_WITH_Q) {
1156 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1157 	} else if (special_case == REPEAT_ZERO) {
1158 		size += 2;	/* just a null ERE: () */
1159 	}
1160 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1161 		FATAL("out of space in reg expr %.10s..", lastre);
1162 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1163 	j = prefix_length;
1164 	if (special_case == REPEAT_ZERO) {
1165 		j -= atomlen;
1166 		buf[j++] = '(';
1167 		buf[j++] = ')';
1168 	}
1169 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1170 		memcpy(&buf[j], atom, atomlen);
1171 		j += atomlen;
1172 	}
1173 	if (special_case == REPEAT_PLUS_APPENDED) {
1174 		buf[j++] = '+';
1175 	} else if (special_case == REPEAT_WITH_Q) {
1176 		if (init_q)
1177 			buf[j++] = '?';
1178 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1179 			memcpy(&buf[j], atom, atomlen);
1180 			j += atomlen;
1181 			buf[j++] = '?';
1182 		}
1183 	}
1184 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1185 	j += suffix_length;
1186 	buf[j] = '\0';
1187 	/* free old basestr */
1188 	if (firstbasestr != basestr) {
1189 		if (basestr)
1190 			xfree(basestr);
1191 	}
1192 	basestr = buf;
1193 	prestr  = buf + prefix_length;
1194 	if (special_case == REPEAT_ZERO) {
1195 		prestr  -= atomlen;
1196 		ret++;
1197 	}
1198 	return ret;
1199 }
1200 
1201 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1202 		  int atomlen, int firstnum, int secondnum)
1203 {
1204 	/*
1205 	   In general, the repetition specifier or "bound" is replaced here
1206 	   by an equivalent ERE string, repeating the immediately previous atom
1207 	   and appending ? and + as needed. Note that the first copy of the
1208 	   atom is left in place, except in the special_case of a zero-repeat
1209 	   (i.e., {0}).
1210 	 */
1211 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1212 		if (firstnum < 2) {
1213 			/* 0 or 1: should be handled before you get here */
1214 			FATAL("internal error");
1215 		} else {
1216 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1217 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1218 		}
1219 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1220 		if (firstnum == 0) {	/* {0} or {0,0} */
1221 			/* This case is unusual because the resulting
1222 			   replacement string might actually be SMALLER than
1223 			   the original ERE */
1224 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1225 					firstnum, secondnum, REPEAT_ZERO);
1226 		} else {		/* (firstnum >= 1) */
1227 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1228 					firstnum, secondnum, REPEAT_SIMPLE);
1229 		}
1230 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1231 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1232 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1233 					firstnum, secondnum, REPEAT_WITH_Q);
1234 	} else {	/* Error - shouldn't be here (n>m) */
1235 		FATAL("internal error");
1236 	}
1237 	return 0;
1238 }
1239 
1240 int relex(void)		/* lexical analyzer for reparse */
1241 {
1242 	int c, n;
1243 	int cflag;
1244 	static uschar *buf = NULL;
1245 	static int bufsz = 100;
1246 	uschar *bp;
1247 	const struct charclass *cc;
1248 	int i;
1249 	int num, m;
1250 	bool commafound, digitfound;
1251 	const uschar *startreptok;
1252 	static int parens = 0;
1253 
1254 rescan:
1255 	starttok = prestr;
1256 
1257 	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1258 		prestr += n;
1259 		starttok = prestr;
1260 		return CHAR;
1261 	}
1262 
1263 	switch (c = *prestr++) {
1264 	case '|': return OR;
1265 	case '*': return STAR;
1266 	case '+': return PLUS;
1267 	case '?': return QUEST;
1268 	case '.': return DOT;
1269 	case '\0': prestr--; return '\0';
1270 	case '^':
1271 	case '$':
1272 		return c;
1273 	case '(':
1274 		parens++;
1275 		return c;
1276 	case ')':
1277 		if (parens) {
1278 			parens--;
1279 			return c;
1280 		}
1281 		/* unmatched close parenthesis; per POSIX, treat as literal */
1282 		rlxval = c;
1283 		return CHAR;
1284 	case '\\':
1285 		rlxval = quoted(&prestr);
1286 		return CHAR;
1287 	default:
1288 		rlxval = c;
1289 		return CHAR;
1290 	case '[':
1291 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1292 			FATAL("out of space in reg expr %.10s..", lastre);
1293 		bp = buf;
1294 		if (*prestr == '^') {
1295 			cflag = 1;
1296 			prestr++;
1297 		}
1298 		else
1299 			cflag = 0;
1300 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1301 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1302 			FATAL("out of space for reg expr %.10s...", lastre);
1303 		for (; ; ) {
1304 			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1305 				for (i = 0; i < n; i++)
1306 					*bp++ = *prestr++;
1307 				continue;
1308 			}
1309 			if ((c = *prestr++) == '\\') {
1310 				*bp++ = '\\';
1311 				if ((c = *prestr++) == '\0')
1312 					FATAL("nonterminated character class %.20s...", lastre);
1313 				*bp++ = c;
1314 			/* } else if (c == '\n') { */
1315 			/* 	FATAL("newline in character class %.20s...", lastre); */
1316 			} else if (c == '[' && *prestr == ':') {
1317 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1318 				for (cc = charclasses; cc->cc_name; cc++)
1319 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1320 						break;
1321 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1322 				    prestr[2 + cc->cc_namelen] == ']') {
1323 					prestr += cc->cc_namelen + 3;
1324 					/*
1325 					 * BUG: We begin at 1, instead of 0, since we
1326 					 * would otherwise prematurely terminate the
1327 					 * string for classes like [[:cntrl:]]. This
1328 					 * means that we can't match the NUL character,
1329 					 * not without first adapting the entire
1330 					 * program to track each string's length.
1331 					 */
1332 					for (i = 1; i <= UCHAR_MAX; i++) {
1333 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1334 						    FATAL("out of space for reg expr %.10s...", lastre);
1335 						if (cc->cc_func(i)) {
1336 							/* escape backslash */
1337 							if (i == '\\') {
1338 								*bp++ = '\\';
1339 								n++;
1340 							}
1341 
1342 							*bp++ = i;
1343 							n++;
1344 						}
1345 					}
1346 				} else
1347 					*bp++ = c;
1348 			} else if (c == '[' && *prestr == '.') {
1349 				char collate_char;
1350 				prestr++;
1351 				collate_char = *prestr++;
1352 				if (*prestr == '.' && prestr[1] == ']') {
1353 					prestr += 2;
1354 					/* Found it: map via locale TBD: for
1355 					   now, simply return this char.  This
1356 					   is sufficient to pass conformance
1357 					   test awk.ex 156
1358 					 */
1359 					if (*prestr == ']') {
1360 						prestr++;
1361 						rlxval = collate_char;
1362 						return CHAR;
1363 					}
1364 				}
1365 			} else if (c == '[' && *prestr == '=') {
1366 				char equiv_char;
1367 				prestr++;
1368 				equiv_char = *prestr++;
1369 				if (*prestr == '=' && prestr[1] == ']') {
1370 					prestr += 2;
1371 					/* Found it: map via locale TBD: for now
1372 					   simply return this char. This is
1373 					   sufficient to pass conformance test
1374 					   awk.ex 156
1375 					 */
1376 					if (*prestr == ']') {
1377 						prestr++;
1378 						rlxval = equiv_char;
1379 						return CHAR;
1380 					}
1381 				}
1382 			} else if (c == '\0') {
1383 				FATAL("nonterminated character class %.20s", lastre);
1384 			} else if (bp == buf) {	/* 1st char is special */
1385 				*bp++ = c;
1386 			} else if (c == ']') {
1387 				*bp++ = 0;
1388 				rlxstr = (uschar *) tostring((char *) buf);
1389 				if (cflag == 0)
1390 					return CCL;
1391 				else
1392 					return NCCL;
1393 			} else
1394 				*bp++ = c;
1395 		}
1396 		break;
1397 	case '{':
1398 		if (isdigit(*(prestr))) {
1399 			num = 0;	/* Process as a repetition */
1400 			n = -1; m = -1;
1401 			commafound = false;
1402 			digitfound = false;
1403 			startreptok = prestr-1;
1404 			/* Remember start of previous atom here ? */
1405 		} else {        	/* just a { char, not a repetition */
1406 			rlxval = c;
1407 			return CHAR;
1408                 }
1409 		for (; ; ) {
1410 			if ((c = *prestr++) == '}') {
1411 				if (commafound) {
1412 					if (digitfound) { /* {n,m} */
1413 						m = num;
1414 						if (m < n)
1415 							FATAL("illegal repetition expression: class %.20s",
1416 								lastre);
1417 						if (n == 0 && m == 1) {
1418 							return QUEST;
1419 						}
1420 					} else {	/* {n,} */
1421 						if (n == 0)
1422 							return STAR;
1423 						else if (n == 1)
1424 							return PLUS;
1425 					}
1426 				} else {
1427 					if (digitfound) { /* {n} same as {n,n} */
1428 						n = num;
1429 						m = num;
1430 					} else {	/* {} */
1431 						FATAL("illegal repetition expression: class %.20s",
1432 							lastre);
1433 					}
1434 				}
1435 				if (repeat(starttok, prestr-starttok, lastatom,
1436 					   startreptok - lastatom, n, m) > 0) {
1437 					if (n == 0 && m == 0) {
1438 						return ZERO;
1439 					}
1440 					/* must rescan input for next token */
1441 					goto rescan;
1442 				}
1443 				/* Failed to replace: eat up {...} characters
1444 				   and treat like just PLUS */
1445 				return PLUS;
1446 			} else if (c == '\0') {
1447 				FATAL("nonterminated character class %.20s",
1448 					lastre);
1449 			} else if (isdigit(c)) {
1450 				num = 10 * num + c - '0';
1451 				digitfound = true;
1452 			} else if (c == ',') {
1453 				if (commafound)
1454 					FATAL("illegal repetition expression: class %.20s",
1455 						lastre);
1456 				/* looking for {n,} or {n,m} */
1457 				commafound = true;
1458 				n = num;
1459 				digitfound = false; /* reset */
1460 				num = 0;
1461 			} else {
1462 				FATAL("illegal repetition expression: class %.20s",
1463 					lastre);
1464 			}
1465 		}
1466 		break;
1467 	}
1468 }
1469 
1470 int cgoto(fa *f, int s, int c)
1471 {
1472 	int *p, *q;
1473 	int i, j, k;
1474 
1475 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1476 	while (f->accept >= maxsetvec) {	/* guessing here! */
1477 		resizesetvec(__func__);
1478 	}
1479 	for (i = 0; i <= f->accept; i++)
1480 		setvec[i] = 0;
1481 	setcnt = 0;
1482 	resize_state(f, s);
1483 	/* compute positions of gototab[s,c] into setvec */
1484 	p = f->posns[s];
1485 	for (i = 1; i <= *p; i++) {
1486 		if ((k = f->re[p[i]].ltype) != FINAL) {
1487 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1488 			 || (k == DOT && c != 0 && c != HAT)
1489 			 || (k == ALL && c != 0)
1490 			 || (k == EMPTYRE && c != 0)
1491 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1492 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1493 				q = f->re[p[i]].lfollow;
1494 				for (j = 1; j <= *q; j++) {
1495 					if (q[j] >= maxsetvec) {
1496 						resizesetvec(__func__);
1497 					}
1498 					if (setvec[q[j]] == 0) {
1499 						setcnt++;
1500 						setvec[q[j]] = 1;
1501 					}
1502 				}
1503 			}
1504 		}
1505 	}
1506 	/* determine if setvec is a previous state */
1507 	tmpset[0] = setcnt;
1508 	j = 1;
1509 	for (i = f->accept; i >= 0; i--)
1510 		if (setvec[i]) {
1511 			tmpset[j++] = i;
1512 		}
1513 	resize_state(f, f->curstat > s ? f->curstat : s);
1514 	/* tmpset == previous state? */
1515 	for (i = 1; i <= f->curstat; i++) {
1516 		p = f->posns[i];
1517 		if ((k = tmpset[0]) != p[0])
1518 			goto different;
1519 		for (j = 1; j <= k; j++)
1520 			if (tmpset[j] != p[j])
1521 				goto different;
1522 		/* setvec is state i */
1523 		if (c != HAT)
1524 			set_gototab(f, s, c, i);
1525 		return i;
1526 	  different:;
1527 	}
1528 
1529 	/* add tmpset to current set of states */
1530 	++(f->curstat);
1531 	resize_state(f, f->curstat);
1532 	clear_gototab(f, f->curstat);
1533 	xfree(f->posns[f->curstat]);
1534 	p = intalloc(setcnt + 1, __func__);
1535 
1536 	f->posns[f->curstat] = p;
1537 	if (c != HAT)
1538 		set_gototab(f, s, c, f->curstat);
1539 	for (i = 0; i <= setcnt; i++)
1540 		p[i] = tmpset[i];
1541 	if (setvec[f->accept])
1542 		f->out[f->curstat] = 1;
1543 	else
1544 		f->out[f->curstat] = 0;
1545 	return f->curstat;
1546 }
1547 
1548 
1549 void freefa(fa *f)	/* free a finite automaton */
1550 {
1551 	int i;
1552 
1553 	if (f == NULL)
1554 		return;
1555 	for (i = 0; i < f->state_count; i++)
1556 		xfree(f->gototab[i].entries);
1557 	xfree(f->gototab);
1558 	for (i = 0; i <= f->curstat; i++)
1559 		xfree(f->posns[i]);
1560 	for (i = 0; i <= f->accept; i++) {
1561 		xfree(f->re[i].lfollow);
1562 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1563 			xfree(f->re[i].lval.np);
1564 	}
1565 	xfree(f->restr);
1566 	xfree(f->out);
1567 	xfree(f->posns);
1568 	xfree(f->gototab);
1569 	xfree(f);
1570 }
1571