xref: /openbsd/usr.bin/awk/b.c (revision fb60ec6a)
1 /*	$OpenBSD: b.c,v 1.54 2024/08/03 21:12:16 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 /* lasciate ogne speranza, voi ch'intrate. */
27 
28 #define	DEBUG
29 
30 #include <ctype.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <string.h>
34 #include <stdlib.h>
35 #include "awk.h"
36 #include "awkgram.tab.h"
37 
38 #define MAXLIN 22
39 
40 #define type(v)		(v)->nobj	/* badly overloaded here */
41 #define info(v)		(v)->ntype	/* badly overloaded here */
42 #define left(v)		(v)->narg[0]
43 #define right(v)	(v)->narg[1]
44 #define parent(v)	(v)->nnext
45 
46 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
48 #define UNARY	case STAR: case PLUS: case QUEST:
49 
50 /* encoding in tree Nodes:
51 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 		left is index, right contains value or pointer to value
53 	unary (STAR, PLUS, QUEST): left is child, right is null
54 	binary (CAT, OR): left and right are children
55 	parent contains pointer to parent
56 */
57 
58 
59 int	*setvec;
60 int	*tmpset;
61 int	maxsetvec = 0;
62 
63 int	rtok;		/* next token in current re */
64 int	rlxval;
65 static const uschar	*rlxstr;
66 static const uschar	*prestr;	/* current position in current re */
67 static const uschar	*lastre;	/* origin of last re */
68 static const uschar	*lastatom;	/* origin of last Atom */
69 static const uschar	*starttok;
70 static const uschar 	*basestr;	/* starts with original, replaced during
71 				   repetition processing */
72 static const uschar 	*firstbasestr;
73 
74 static	int setcnt;
75 static	int poscnt;
76 
77 const char	*patbeg;
78 int	patlen;
79 
80 #define	NFA	128	/* cache this many dynamic fa's */
81 fa	*fatab[NFA];
82 int	nfatab	= 0;	/* entries in fatab */
83 
84 /* utf-8 mechanism:
85 
86    For most of Awk, utf-8 strings just "work", since they look like
87    null-terminated sequences of 8-bit bytes.
88 
89    Functions like length(), index(), and substr() have to operate
90    in units of utf-8 characters.  The u8_* functions in run.c
91    handle this.
92 
93    Regular expressions are more complicated, since the basic
94    mechanism of the goto table used 8-bit byte indices into the
95    gototab entries to compute the next state.  Unicode is a lot
96    bigger, so the gototab entries are now structs with a character
97    and a next state. These are sorted by code point and binary
98    searched.
99 
100    Throughout the RE mechanism in b.c, utf-8 characters are
101    converted to their utf-32 value.  This mostly shows up in
102    cclenter, which expands character class ranges like a-z and now
103    alpha-omega.  The size of a gototab array is still about 256.
104    This should be dynamic, but for now things work ok for a single
105    code page of Unicode, which is the most likely case.
106 
107    The code changes are localized in run.c and b.c.  I have added a
108    handful of functions to somewhat better hide the implementation,
109    but a lot more could be done.
110 
111  */
112 
113 static int entry_cmp(const void *l, const void *r);
114 static int get_gototab(fa*, int, int);
115 static int set_gototab(fa*, int, int, int);
116 static void clear_gototab(fa*, int);
117 
118 static int *
intalloc(size_t n,const char * f)119 intalloc(size_t n, const char *f)
120 {
121 	int *p = (int *) calloc(n, sizeof(int));
122 	if (p == NULL)
123 		overflo(f);
124 	return p;
125 }
126 
127 static void
allocsetvec(const char * f)128 allocsetvec(const char *f)
129 {
130 	maxsetvec = MAXLIN;
131 	setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
132 	tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
133 	if (setvec == NULL || tmpset == NULL)
134 		overflo(f);
135 }
136 
137 static void
resizesetvec(const char * f)138 resizesetvec(const char *f)
139 {
140 	setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
141 	tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
142 	if (setvec == NULL || tmpset == NULL)
143 		overflo(f);
144 	maxsetvec *= 4;
145 }
146 
147 static void
resize_state(fa * f,int state)148 resize_state(fa *f, int state)
149 {
150 	gtt *p;
151 	uschar *p2;
152 	int **p3;
153 	int i, new_count;
154 
155 	if (++state < f->state_count)
156 		return;
157 
158 	new_count = state + 10; /* needs to be tuned */
159 
160 	p = (gtt *) reallocarray(f->gototab, new_count, sizeof(gtt));
161 	if (p == NULL)
162 		goto out;
163 	f->gototab = p;
164 
165 	p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
166 	if (p2 == NULL)
167 		goto out;
168 	f->out = p2;
169 
170 	p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
171 	if (p3 == NULL)
172 		goto out;
173 	f->posns = p3;
174 
175 	for (i = f->state_count; i < new_count; ++i) {
176 		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
177 		if (f->gototab[i].entries == NULL)
178 			goto out;
179 		f->gototab[i].allocated = NCHARS;
180 		f->gototab[i].inuse = 0;
181 		f->out[i] = 0;
182 		f->posns[i] = NULL;
183 	}
184 	f->state_count = new_count;
185 	return;
186 out:
187 	overflo(__func__);
188 }
189 
makedfa(const char * s,bool anchor)190 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
191 {
192 	int i, use, nuse;
193 	fa *pfa;
194 	static int now = 1;
195 
196 	if (setvec == NULL) {	/* first time through any RE */
197 		allocsetvec(__func__);
198 	}
199 
200 	if (compile_time != RUNNING)	/* a constant for sure */
201 		return mkdfa(s, anchor);
202 	for (i = 0; i < nfatab; i++)	/* is it there already? */
203 		if (fatab[i]->anchor == anchor
204 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
205 			fatab[i]->use = now++;
206 			return fatab[i];
207 		}
208 	pfa = mkdfa(s, anchor);
209 	if (nfatab < NFA) {	/* room for another */
210 		fatab[nfatab] = pfa;
211 		fatab[nfatab]->use = now++;
212 		nfatab++;
213 		return pfa;
214 	}
215 	use = fatab[0]->use;	/* replace least-recently used */
216 	nuse = 0;
217 	for (i = 1; i < nfatab; i++)
218 		if (fatab[i]->use < use) {
219 			use = fatab[i]->use;
220 			nuse = i;
221 		}
222 	freefa(fatab[nuse]);
223 	fatab[nuse] = pfa;
224 	pfa->use = now++;
225 	return pfa;
226 }
227 
mkdfa(const char * s,bool anchor)228 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
229 				/* anchor = true for anchored matches, else false */
230 {
231 	Node *p, *p1;
232 	fa *f;
233 
234 	firstbasestr = (const uschar *) s;
235 	basestr = firstbasestr;
236 	p = reparse(s);
237 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
238 		/* put ALL STAR in front of reg.  exp. */
239 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
240 		/* put FINAL after reg.  exp. */
241 
242 	poscnt = 0;
243 	penter(p1);	/* enter parent pointers and leaf indices */
244 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
245 		overflo(__func__);
246 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
247 	cfoll(f, p1);	/* set up follow sets */
248 	freetr(p1);
249 	resize_state(f, 1);
250 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
251 	f->posns[1] = intalloc(1, __func__);
252 	*f->posns[1] = 0;
253 	f->initstat = makeinit(f, anchor);
254 	f->anchor = anchor;
255 	f->restr = (uschar *) tostring(s);
256 	if (firstbasestr != basestr) {
257 		if (basestr)
258 			xfree(basestr);
259 	}
260 	return f;
261 }
262 
makeinit(fa * f,bool anchor)263 int makeinit(fa *f, bool anchor)
264 {
265 	int i, k;
266 
267 	f->curstat = 2;
268 	f->out[2] = 0;
269 	k = *(f->re[0].lfollow);
270 	xfree(f->posns[2]);
271 	f->posns[2] = intalloc(k + 1,  __func__);
272 	for (i = 0; i <= k; i++) {
273 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
274 	}
275 	if ((f->posns[2])[1] == f->accept)
276 		f->out[2] = 1;
277 	clear_gototab(f, 2);
278 	f->curstat = cgoto(f, 2, HAT);
279 	if (anchor) {
280 		*f->posns[2] = k-1;	/* leave out position 0 */
281 		for (i = 0; i < k; i++) {
282 			(f->posns[0])[i] = (f->posns[2])[i];
283 		}
284 
285 		f->out[0] = f->out[2];
286 		if (f->curstat != 2)
287 			--(*f->posns[f->curstat]);
288 	}
289 	return f->curstat;
290 }
291 
penter(Node * p)292 void penter(Node *p)	/* set up parent pointers and leaf indices */
293 {
294 	switch (type(p)) {
295 	ELEAF
296 	LEAF
297 		info(p) = poscnt;
298 		poscnt++;
299 		break;
300 	UNARY
301 		penter(left(p));
302 		parent(left(p)) = p;
303 		break;
304 	case CAT:
305 	case OR:
306 		penter(left(p));
307 		penter(right(p));
308 		parent(left(p)) = p;
309 		parent(right(p)) = p;
310 		break;
311 	case ZERO:
312 		break;
313 	default:	/* can't happen */
314 		FATAL("can't happen: unknown type %d in penter", type(p));
315 		break;
316 	}
317 }
318 
freetr(Node * p)319 void freetr(Node *p)	/* free parse tree */
320 {
321 	switch (type(p)) {
322 	ELEAF
323 	LEAF
324 		xfree(p);
325 		break;
326 	UNARY
327 	case ZERO:
328 		freetr(left(p));
329 		xfree(p);
330 		break;
331 	case CAT:
332 	case OR:
333 		freetr(left(p));
334 		freetr(right(p));
335 		xfree(p);
336 		break;
337 	default:	/* can't happen */
338 		FATAL("can't happen: unknown type %d in freetr", type(p));
339 		break;
340 	}
341 }
342 
343 /* in the parsing of regular expressions, metacharacters like . have */
344 /* to be seen literally;  \056 is not a metacharacter. */
345 
346 static int
hexstr(const uschar ** pp,int max)347 hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
348 {			/* only pick up one 8-bit byte (2 chars) */
349 	const uschar *p;
350 	int n = 0;
351 	int i;
352 
353 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
354 		if (isdigit(*p))
355 			n = 16 * n + *p - '0';
356 		else if (*p >= 'a' && *p <= 'f')
357 			n = 16 * n + *p - 'a' + 10;
358 		else if (*p >= 'A' && *p <= 'F')
359 			n = 16 * n + *p - 'A' + 10;
360 	}
361 	*pp = p;
362 	return n;
363 }
364 
365 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
366 
quoted(const uschar ** pp)367 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
368 			/* and increment *pp */
369 {
370 	const uschar *p = *pp;
371 	int c;
372 
373 /* BUG: should advance by utf-8 char even if makes no sense */
374 
375 	switch ((c = *p++)) {
376 	case 't':
377 		c = '\t';
378 		break;
379 	case 'n':
380 		c = '\n';
381 		break;
382 	case 'f':
383 		c = '\f';
384 		break;
385 	case 'r':
386 		c = '\r';
387 		break;
388 	case 'b':
389 		c = '\b';
390 		break;
391 	case 'v':
392 		c = '\v';
393 		break;
394 	case 'a':
395 		c = '\a';
396 		break;
397 	case '\\':
398 		c = '\\';
399 		break;
400 	case 'x': /* 2 hex digits follow */
401 		c = hexstr(&p, 2); /* this adds a null if number is invalid */
402 		break;
403 	case 'u': /* unicode char number up to 8 hex digits */
404 		c = hexstr(&p, 8);
405 		break;
406 	default:
407 		if (isoctdigit(c)) { /* \d \dd \ddd */
408 			int n = c - '0';
409 			if (isoctdigit(*p)) {
410 				n = 8 * n + *p++ - '0';
411 				if (isoctdigit(*p))
412 					n = 8 * n + *p++ - '0';
413 			}
414 			c = n;
415 		}
416 	}
417 
418 	*pp = p;
419 	return c;
420 }
421 
cclenter(const char * argp)422 int *cclenter(const char *argp)	/* add a character class */
423 {
424 	int i, c, c2;
425 	int n;
426 	const uschar *p = (const uschar *) argp;
427 	int *bp, *retp;
428 	static int *buf = NULL;
429 	static int bufsz = 100;
430 
431 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
432 		FATAL("out of space for character class [%.10s...] 1", p);
433 	bp = buf;
434 	for (i = 0; *p != 0; ) {
435 		n = u8_rune(&c, (const char *) p);
436 		p += n;
437 		if (c == '\\') {
438 			c = quoted(&p);
439 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
440 			if (*p != 0) {
441 				c = bp[-1];
442 				/* c2 = *p++; */
443 				n = u8_rune(&c2, (const char *) p);
444 				p += n;
445 				if (c2 == '\\')
446 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
447 				if (c > c2) {	/* empty; ignore */
448 					bp--;
449 					i--;
450 					continue;
451 				}
452 				while (c < c2) {
453 					if (i >= bufsz) {
454 						buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
455 						if (buf == NULL)
456 							FATAL("out of space for character class [%.10s...] 2", p);
457 						bufsz *= 2;
458 						bp = buf + i;
459 					}
460 					*bp++ = ++c;
461 					i++;
462 				}
463 				continue;
464 			}
465 		}
466 		if (i >= bufsz) {
467 			buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
468 			if (buf == NULL)
469 				FATAL("out of space for character class [%.10s...] 2", p);
470 			bufsz *= 2;
471 			bp = buf + i;
472 		}
473 		*bp++ = c;
474 		i++;
475 	}
476 	*bp = 0;
477 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
478 	/* xfree(op);  BUG: what are we freeing here? */
479 	retp = (int *) calloc(bp-buf+1, sizeof(int));
480 	for (i = 0; i < bp-buf+1; i++)
481 		retp[i] = buf[i];
482 	return retp;
483 }
484 
overflo(const char * s)485 void overflo(const char *s)
486 {
487 	FATAL("regular expression too big: out of space in %.30s...", s);
488 }
489 
cfoll(fa * f,Node * v)490 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
491 {
492 	int i;
493 	int *p;
494 
495 	switch (type(v)) {
496 	ELEAF
497 	LEAF
498 		f->re[info(v)].ltype = type(v);
499 		f->re[info(v)].lval.np = right(v);
500 		while (f->accept >= maxsetvec) {	/* guessing here! */
501 			resizesetvec(__func__);
502 		}
503 		for (i = 0; i <= f->accept; i++)
504 			setvec[i] = 0;
505 		setcnt = 0;
506 		follow(v);	/* computes setvec and setcnt */
507 		p = intalloc(setcnt + 1, __func__);
508 		f->re[info(v)].lfollow = p;
509 		*p = setcnt;
510 		for (i = f->accept; i >= 0; i--)
511 			if (setvec[i] == 1)
512 				*++p = i;
513 		break;
514 	UNARY
515 		cfoll(f,left(v));
516 		break;
517 	case CAT:
518 	case OR:
519 		cfoll(f,left(v));
520 		cfoll(f,right(v));
521 		break;
522 	case ZERO:
523 		break;
524 	default:	/* can't happen */
525 		FATAL("can't happen: unknown type %d in cfoll", type(v));
526 	}
527 }
528 
first(Node * p)529 int first(Node *p)	/* collects initially active leaves of p into setvec */
530 			/* returns 0 if p matches empty string */
531 {
532 	int b, lp;
533 
534 	switch (type(p)) {
535 	ELEAF
536 	LEAF
537 		lp = info(p);	/* look for high-water mark of subscripts */
538 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
539 			resizesetvec(__func__);
540 		}
541 		if (type(p) == EMPTYRE) {
542 			setvec[lp] = 0;
543 			return(0);
544 		}
545 		if (setvec[lp] != 1) {
546 			setvec[lp] = 1;
547 			setcnt++;
548 		}
549 		if (type(p) == CCL && (*(int *) right(p)) == 0)
550 			return(0);		/* empty CCL */
551 		return(1);
552 	case PLUS:
553 		if (first(left(p)) == 0)
554 			return(0);
555 		return(1);
556 	case STAR:
557 	case QUEST:
558 		first(left(p));
559 		return(0);
560 	case CAT:
561 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
562 		return(1);
563 	case OR:
564 		b = first(right(p));
565 		if (first(left(p)) == 0 || b == 0) return(0);
566 		return(1);
567 	case ZERO:
568 		return 0;
569 	}
570 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
571 	return(-1);
572 }
573 
follow(Node * v)574 void follow(Node *v)	/* collects leaves that can follow v into setvec */
575 {
576 	Node *p;
577 
578 	if (type(v) == FINAL)
579 		return;
580 	p = parent(v);
581 	switch (type(p)) {
582 	case STAR:
583 	case PLUS:
584 		first(v);
585 		follow(p);
586 		return;
587 
588 	case OR:
589 	case QUEST:
590 		follow(p);
591 		return;
592 
593 	case CAT:
594 		if (v == left(p)) {	/* v is left child of p */
595 			if (first(right(p)) == 0) {
596 				follow(p);
597 				return;
598 			}
599 		} else		/* v is right child */
600 			follow(p);
601 		return;
602 	}
603 }
604 
member(int c,int * sarg)605 int member(int c, int *sarg)	/* is c in s? */
606 {
607 	int *s = (int *) sarg;
608 
609 	while (*s)
610 		if (c == *s++)
611 			return(1);
612 	return(0);
613 }
614 
resize_gototab(fa * f,int state)615 static void resize_gototab(fa *f, int state)
616 {
617 	size_t new_size = f->gototab[state].allocated * 2;
618 	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
619 	if (p == NULL)
620 		overflo(__func__);
621 
622 	// need to initialized the new memory to zero
623 	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
624 	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
625 
626 	f->gototab[state].allocated = new_size;			// update gototab info
627 	f->gototab[state].entries = p;
628 }
629 
get_gototab(fa * f,int state,int ch)630 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
631 {
632 	gtte key;
633 	gtte *item;
634 
635 	key.ch = ch;
636 	key.state = 0;	/* irrelevant */
637 	item = (gtte *) bsearch(& key, f->gototab[state].entries,
638 			f->gototab[state].inuse, sizeof(gtte),
639 			entry_cmp);
640 
641 	if (item == NULL)
642 		return 0;
643 	else
644 		return item->state;
645 }
646 
entry_cmp(const void * l,const void * r)647 static int entry_cmp(const void *l, const void *r)
648 {
649 	const gtte *left, *right;
650 
651 	left = (const gtte *) l;
652 	right = (const gtte *) r;
653 
654 	return left->ch - right->ch;
655 }
656 
set_gototab(fa * f,int state,int ch,int val)657 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
658 {
659 	if (f->gototab[state].inuse == 0) {
660 		f->gototab[state].entries[0].ch = ch;
661 		f->gototab[state].entries[0].state = val;
662 		f->gototab[state].inuse++;
663 		return val;
664 	} else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
665 		// not seen yet, insert and return
666 		gtt *tab = & f->gototab[state];
667 		if (tab->inuse + 1 >= tab->allocated)
668 			resize_gototab(f, state);
669 
670 		f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
671 		f->gototab[state].entries[f->gototab[state].inuse].state = val;
672 		f->gototab[state].inuse++;
673 		return val;
674 	} else {
675 		// maybe we have it, maybe we don't
676 		gtte key;
677 		gtte *item;
678 
679 		key.ch = ch;
680 		key.state = 0;	/* irrelevant */
681 		item = (gtte *) bsearch(& key, f->gototab[state].entries,
682 				f->gototab[state].inuse, sizeof(gtte),
683 				entry_cmp);
684 
685 		if (item != NULL) {
686 			// we have it, update state and return
687 			item->state = val;
688 			return item->state;
689 		}
690 		// otherwise, fall through to insert and reallocate.
691 	}
692 
693 	gtt *tab = & f->gototab[state];
694 	if (tab->inuse + 1 >= tab->allocated)
695 		resize_gototab(f, state);
696 	f->gototab[state].entries[tab->inuse].ch = ch;
697 	f->gototab[state].entries[tab->inuse].state = val;
698 	++tab->inuse;
699 
700 	qsort(f->gototab[state].entries,
701 		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
702 
703 	return val; /* not used anywhere at the moment */
704 }
705 
clear_gototab(fa * f,int state)706 static void clear_gototab(fa *f, int state)
707 {
708 	memset(f->gototab[state].entries, 0,
709 		f->gototab[state].allocated * sizeof(gtte));
710 	f->gototab[state].inuse = 0;
711 }
712 
match(fa * f,const char * p0)713 int match(fa *f, const char *p0)	/* shortest match ? */
714 {
715 	int s, ns;
716 	int n;
717 	int rune;
718 	const uschar *p = (const uschar *) p0;
719 
720 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
721 
722 	s = f->initstat;
723 	assert (s < f->state_count);
724 
725 	if (f->out[s])
726 		return(1);
727 	do {
728 		/* assert(*p < NCHARS); */
729 		n = u8_rune(&rune, (const char *) p);
730 		if ((ns = get_gototab(f, s, rune)) != 0)
731 			s = ns;
732 		else
733 			s = cgoto(f, s, rune);
734 		if (f->out[s])
735 			return(1);
736 		if (*p == 0)
737 			break;
738 		p += n;
739 	} while (1);  /* was *p++ != 0 */
740 	return(0);
741 }
742 
pmatch(fa * f,const char * p0)743 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
744 {
745 	int s, ns;
746 	int n;
747 	int rune;
748 	const uschar *p = (const uschar *) p0;
749 	const uschar *q;
750 
751 	s = f->initstat;
752 	assert(s < f->state_count);
753 
754 	patbeg = (const char *)p;
755 	patlen = -1;
756 	do {
757 		q = p;
758 		do {
759 			if (f->out[s])		/* final state */
760 				patlen = q-p;
761 			/* assert(*q < NCHARS); */
762 			n = u8_rune(&rune, (const char *) q);
763 			if ((ns = get_gototab(f, s, rune)) != 0)
764 				s = ns;
765 			else
766 				s = cgoto(f, s, rune);
767 
768 			assert(s < f->state_count);
769 
770 			if (s == 1) {	/* no transition */
771 				if (patlen >= 0) {
772 					patbeg = (const char *) p;
773 					return(1);
774 				}
775 				else
776 					goto nextin;	/* no match */
777 			}
778 			if (*q == 0)
779 				break;
780 			q += n;
781 		} while (1);
782 		q++;  /* was *q++ */
783 		if (f->out[s])
784 			patlen = q-p-1;	/* don't count $ */
785 		if (patlen >= 0) {
786 			patbeg = (const char *) p;
787 			return(1);
788 		}
789 	nextin:
790 		s = 2;
791 		if (*p == 0)
792 			break;
793 		n = u8_rune(&rune, (const char *) p);
794 		p += n;
795 	} while (1); /* was *p++ */
796 	return (0);
797 }
798 
nematch(fa * f,const char * p0)799 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
800 {
801 	int s, ns;
802 	int n;
803 	int rune;
804 	const uschar *p = (const uschar *) p0;
805 	const uschar *q;
806 
807 	s = f->initstat;
808 	assert(s < f->state_count);
809 
810 	patbeg = (const char *)p;
811 	patlen = -1;
812 	while (*p) {
813 		q = p;
814 		do {
815 			if (f->out[s])		/* final state */
816 				patlen = q-p;
817 			/* assert(*q < NCHARS); */
818 			n = u8_rune(&rune, (const char *) q);
819 			if ((ns = get_gototab(f, s, rune)) != 0)
820 				s = ns;
821 			else
822 				s = cgoto(f, s, rune);
823 			if (s == 1) {	/* no transition */
824 				if (patlen > 0) {
825 					patbeg = (const char *) p;
826 					return(1);
827 				} else
828 					goto nnextin;	/* no nonempty match */
829 			}
830 			if (*q == 0)
831 				break;
832 			q += n;
833 		} while (1);
834 		q++;
835 		if (f->out[s])
836 			patlen = q-p-1;	/* don't count $ */
837 		if (patlen > 0 ) {
838 			patbeg = (const char *) p;
839 			return(1);
840 		}
841 	nnextin:
842 		s = 2;
843 		p++;
844 	}
845 	return (0);
846 }
847 
848 
849 /*
850  * NAME
851  *     fnematch
852  *
853  * DESCRIPTION
854  *     A stream-fed version of nematch which transfers characters to a
855  *     null-terminated buffer. All characters up to and including the last
856  *     character of the matching text or EOF are placed in the buffer. If
857  *     a match is found, patbeg and patlen are set appropriately.
858  *
859  * RETURN VALUES
860  *     false    No match found.
861  *     true     Match found.
862  */
863 
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)864 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
865 {
866 	char *i, *j, *k, *buf = *pbuf;
867 	int bufsize = *pbufsize;
868 	int c, n, ns, s;
869 
870 	s = pfa->initstat;
871 	patlen = 0;
872 
873 	/*
874 	 * buf <= i <= j <= k <= buf+bufsize
875 	 *
876 	 * i: origin of active substring
877 	 * j: current character
878 	 * k: destination of the next getc
879 	 */
880 
881 	i = j = k = buf;
882 
883 	do {
884 		/*
885 		 * Call u8_rune with at least awk_mb_cur_max ahead in
886 		 * the buffer until EOF interferes.
887 		 */
888 		if (k - j < (int)awk_mb_cur_max) {
889 			if (k + awk_mb_cur_max > buf + bufsize) {
890 				char *obuf = buf;
891 				adjbuf(&buf, &bufsize,
892 				    bufsize + awk_mb_cur_max,
893 				    quantum, 0, "fnematch");
894 
895 				/* buf resized, maybe moved. update pointers */
896 				*pbufsize = bufsize;
897 				if (obuf != buf) {
898 					i = buf + (i - obuf);
899 					j = buf + (j - obuf);
900 					k = buf + (k - obuf);
901 					*pbuf = buf;
902 					if (patlen)
903 						patbeg = buf + (patbeg - obuf);
904 				}
905 			}
906 			for (n = awk_mb_cur_max ; n > 0; n--) {
907 				*k++ = (c = getc(f)) != EOF ? c : 0;
908 				if (c == EOF) {
909 					if (ferror(f))
910 						FATAL("fnematch: getc error");
911 					break;
912 				}
913 			}
914 		}
915 
916 		j += u8_rune(&c, j);
917 
918 		if ((ns = get_gototab(pfa, s, c)) != 0)
919 			s = ns;
920 		else
921 			s = cgoto(pfa, s, c);
922 
923 		if (pfa->out[s]) {	/* final state */
924 			patbeg = i;
925 			patlen = j - i;
926 			if (c == 0)	/* don't count $ */
927 				patlen--;
928 		}
929 
930 		if (c && s != 1)
931 			continue;  /* origin i still viable, next j */
932 		if (patlen)
933 			break;     /* best match found */
934 
935 		/* no match at origin i, next i and start over */
936 		i += u8_rune(&c, i);
937 		if (c == 0)
938 			break;    /* no match */
939 		j = i;
940 		s = 2;
941 	} while (1);
942 
943 	if (patlen) {
944 		/*
945 		 * Under no circumstances is the last character fed to
946 		 * the automaton part of the match. It is EOF's nullbyte,
947 		 * or it sent the automaton into a state with no further
948 		 * transitions available (s==1), or both. Room for a
949 		 * terminating nullbyte is guaranteed.
950 		 *
951 		 * ungetc any chars after the end of matching text
952 		 * (except for EOF's nullbyte, if present) and null
953 		 * terminate the buffer.
954 		 */
955 		do
956 			if (*--k && ungetc(*k, f) == EOF)
957 				FATAL("unable to ungetc '%c'", *k);
958 		while (k > patbeg + patlen);
959 		*k = '\0';
960 		return true;
961 	}
962 	else
963 		return false;
964 }
965 
reparse(const char * p)966 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
967 {			/* uses relex() to scan regular expression */
968 	Node *np;
969 
970 	DPRINTF("reparse <%s>\n", p);
971 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
972 	rtok = relex();
973 	/* GNU compatibility: an empty regexp matches anything */
974 	if (rtok == '\0') {
975 		/* FATAL("empty regular expression"); previous */
976 		return(op2(EMPTYRE, NIL, NIL));
977 	}
978 	np = regexp();
979 	if (rtok != '\0')
980 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
981 	return(np);
982 }
983 
regexp(void)984 Node *regexp(void)	/* top-level parse of reg expr */
985 {
986 	return (alt(concat(primary())));
987 }
988 
primary(void)989 Node *primary(void)
990 {
991 	Node *np;
992 	int savelastatom;
993 
994 	switch (rtok) {
995 	case CHAR:
996 		lastatom = starttok;
997 		np = op2(CHAR, NIL, itonp(rlxval));
998 		rtok = relex();
999 		return (unary(np));
1000 	case ALL:
1001 		rtok = relex();
1002 		return (unary(op2(ALL, NIL, NIL)));
1003 	case EMPTYRE:
1004 		rtok = relex();
1005 		return (unary(op2(EMPTYRE, NIL, NIL)));
1006 	case DOT:
1007 		lastatom = starttok;
1008 		rtok = relex();
1009 		return (unary(op2(DOT, NIL, NIL)));
1010 	case CCL:
1011 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1012 		lastatom = starttok;
1013 		rtok = relex();
1014 		return (unary(np));
1015 	case NCCL:
1016 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1017 		lastatom = starttok;
1018 		rtok = relex();
1019 		return (unary(np));
1020 	case '^':
1021 		rtok = relex();
1022 		return (unary(op2(CHAR, NIL, itonp(HAT))));
1023 	case '$':
1024 		rtok = relex();
1025 		return (unary(op2(CHAR, NIL, NIL)));
1026 	case '(':
1027 		lastatom = starttok;
1028 		savelastatom = starttok - basestr; /* Retain over recursion */
1029 		rtok = relex();
1030 		if (rtok == ')') {	/* special pleading for () */
1031 			rtok = relex();
1032 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
1033 		}
1034 		np = regexp();
1035 		if (rtok == ')') {
1036 			lastatom = basestr + savelastatom; /* Restore */
1037 			rtok = relex();
1038 			return (unary(np));
1039 		}
1040 		else
1041 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1042 	default:
1043 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1044 	}
1045 	return 0;	/*NOTREACHED*/
1046 }
1047 
concat(Node * np)1048 Node *concat(Node *np)
1049 {
1050 	switch (rtok) {
1051 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1052 		return (concat(op2(CAT, np, primary())));
1053 	case EMPTYRE:
1054 		rtok = relex();
1055 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1056 				primary())));
1057 	}
1058 	return (np);
1059 }
1060 
alt(Node * np)1061 Node *alt(Node *np)
1062 {
1063 	if (rtok == OR) {
1064 		rtok = relex();
1065 		return (alt(op2(OR, np, concat(primary()))));
1066 	}
1067 	return (np);
1068 }
1069 
unary(Node * np)1070 Node *unary(Node *np)
1071 {
1072 	switch (rtok) {
1073 	case STAR:
1074 		rtok = relex();
1075 		return (unary(op2(STAR, np, NIL)));
1076 	case PLUS:
1077 		rtok = relex();
1078 		return (unary(op2(PLUS, np, NIL)));
1079 	case QUEST:
1080 		rtok = relex();
1081 		return (unary(op2(QUEST, np, NIL)));
1082 	case ZERO:
1083 		rtok = relex();
1084 		return (unary(op2(ZERO, np, NIL)));
1085 	default:
1086 		return (np);
1087 	}
1088 }
1089 
1090 /*
1091  * Character class definitions conformant to the POSIX locale as
1092  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1093  * and operating character sets are both ASCII (ISO646) or supersets
1094  * thereof.
1095  *
1096  * Note that to avoid overflowing the temporary buffer used in
1097  * relex(), the expanded character class (prior to range expansion)
1098  * must be less than twice the size of their full name.
1099  */
1100 
1101 /* Because isblank doesn't show up in any of the header files on any
1102  * system i use, it's defined here.  if some other locale has a richer
1103  * definition of "blank", define HAS_ISBLANK and provide your own
1104  * version.
1105  * the parentheses here are an attempt to find a path through the maze
1106  * of macro definition and/or function and/or version provided.  thanks
1107  * to nelson beebe for the suggestion; let's see if it works everywhere.
1108  */
1109 
1110 #ifndef HAS_ISBLANK
1111 
1112 int (xisblank)(int c)
1113 {
1114 	return c==' ' || c=='\t';
1115 }
1116 
1117 #endif
1118 
1119 static const struct charclass {
1120 	const char *cc_name;
1121 	int cc_namelen;
1122 	int (*cc_func)(int);
1123 } charclasses[] = {
1124 	{ "alnum",	5,	isalnum },
1125 	{ "alpha",	5,	isalpha },
1126 #ifndef HAS_ISBLANK
1127 	{ "blank",	5,	xisblank },
1128 #else
1129 	{ "blank",	5,	isblank },
1130 #endif
1131 	{ "cntrl",	5,	iscntrl },
1132 	{ "digit",	5,	isdigit },
1133 	{ "graph",	5,	isgraph },
1134 	{ "lower",	5,	islower },
1135 	{ "print",	5,	isprint },
1136 	{ "punct",	5,	ispunct },
1137 	{ "space",	5,	isspace },
1138 	{ "upper",	5,	isupper },
1139 	{ "xdigit",	6,	isxdigit },
1140 	{ NULL,		0,	NULL },
1141 };
1142 
1143 #define REPEAT_SIMPLE		0
1144 #define REPEAT_PLUS_APPENDED	1
1145 #define REPEAT_WITH_Q		2
1146 #define REPEAT_ZERO		3
1147 
1148 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)1149 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1150 	       int atomlen, int firstnum, int secondnum, int special_case)
1151 {
1152 	int i, j;
1153 	uschar *buf = NULL;
1154 	int ret = 1;
1155 	int init_q = (firstnum == 0);		/* first added char will be ? */
1156 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1157 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1158 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1159 	int size = prefix_length +  suffix_length;
1160 
1161 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1162 		size += atomlen*(firstnum-1);
1163 	}
1164 
1165 	/* Adjust size of buffer for special cases */
1166 	if (special_case == REPEAT_PLUS_APPENDED) {
1167 		size++;		/* for the final + */
1168 	} else if (special_case == REPEAT_WITH_Q) {
1169 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1170 	} else if (special_case == REPEAT_ZERO) {
1171 		size += 2;	/* just a null ERE: () */
1172 	}
1173 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1174 		FATAL("out of space in reg expr %.10s..", lastre);
1175 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1176 	j = prefix_length;
1177 	if (special_case == REPEAT_ZERO) {
1178 		j -= atomlen;
1179 		buf[j++] = '(';
1180 		buf[j++] = ')';
1181 	}
1182 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1183 		memcpy(&buf[j], atom, atomlen);
1184 		j += atomlen;
1185 	}
1186 	if (special_case == REPEAT_PLUS_APPENDED) {
1187 		buf[j++] = '+';
1188 	} else if (special_case == REPEAT_WITH_Q) {
1189 		if (init_q)
1190 			buf[j++] = '?';
1191 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1192 			memcpy(&buf[j], atom, atomlen);
1193 			j += atomlen;
1194 			buf[j++] = '?';
1195 		}
1196 	}
1197 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1198 	j += suffix_length;
1199 	buf[j] = '\0';
1200 	/* free old basestr */
1201 	if (firstbasestr != basestr) {
1202 		if (basestr)
1203 			xfree(basestr);
1204 	}
1205 	basestr = buf;
1206 	prestr  = buf + prefix_length;
1207 	if (special_case == REPEAT_ZERO) {
1208 		prestr  -= atomlen;
1209 		ret++;
1210 	}
1211 	return ret;
1212 }
1213 
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)1214 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1215 		  int atomlen, int firstnum, int secondnum)
1216 {
1217 	/*
1218 	   In general, the repetition specifier or "bound" is replaced here
1219 	   by an equivalent ERE string, repeating the immediately previous atom
1220 	   and appending ? and + as needed. Note that the first copy of the
1221 	   atom is left in place, except in the special_case of a zero-repeat
1222 	   (i.e., {0}).
1223 	 */
1224 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1225 		if (firstnum < 2) {
1226 			/* 0 or 1: should be handled before you get here */
1227 			FATAL("internal error");
1228 		} else {
1229 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1230 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1231 		}
1232 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1233 		if (firstnum == 0) {	/* {0} or {0,0} */
1234 			/* This case is unusual because the resulting
1235 			   replacement string might actually be SMALLER than
1236 			   the original ERE */
1237 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1238 					firstnum, secondnum, REPEAT_ZERO);
1239 		} else {		/* (firstnum >= 1) */
1240 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1241 					firstnum, secondnum, REPEAT_SIMPLE);
1242 		}
1243 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1244 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1245 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1246 					firstnum, secondnum, REPEAT_WITH_Q);
1247 	} else {	/* Error - shouldn't be here (n>m) */
1248 		FATAL("internal error");
1249 	}
1250 	return 0;
1251 }
1252 
relex(void)1253 int relex(void)		/* lexical analyzer for reparse */
1254 {
1255 	int c, n;
1256 	int cflag;
1257 	static uschar *buf = NULL;
1258 	static int bufsz = 100;
1259 	uschar *bp;
1260 	const struct charclass *cc;
1261 	int i;
1262 	int num, m;
1263 	bool commafound, digitfound;
1264 	const uschar *startreptok;
1265 	static int parens = 0;
1266 
1267 rescan:
1268 	starttok = prestr;
1269 
1270 	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1271 		prestr += n;
1272 		starttok = prestr;
1273 		return CHAR;
1274 	}
1275 
1276 	switch (c = *prestr++) {
1277 	case '|': return OR;
1278 	case '*': return STAR;
1279 	case '+': return PLUS;
1280 	case '?': return QUEST;
1281 	case '.': return DOT;
1282 	case '\0': prestr--; return '\0';
1283 	case '^':
1284 	case '$':
1285 		return c;
1286 	case '(':
1287 		parens++;
1288 		return c;
1289 	case ')':
1290 		if (parens) {
1291 			parens--;
1292 			return c;
1293 		}
1294 		/* unmatched close parenthesis; per POSIX, treat as literal */
1295 		rlxval = c;
1296 		return CHAR;
1297 	case '\\':
1298 		rlxval = quoted(&prestr);
1299 		return CHAR;
1300 	default:
1301 		rlxval = c;
1302 		return CHAR;
1303 	case '[':
1304 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1305 			FATAL("out of space in reg expr %.10s..", lastre);
1306 		bp = buf;
1307 		if (*prestr == '^') {
1308 			cflag = 1;
1309 			prestr++;
1310 		}
1311 		else
1312 			cflag = 0;
1313 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1314 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1315 			FATAL("out of space for reg expr %.10s...", lastre);
1316 		for (; ; ) {
1317 			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1318 				for (i = 0; i < n; i++)
1319 					*bp++ = *prestr++;
1320 				continue;
1321 			}
1322 			if ((c = *prestr++) == '\\') {
1323 				*bp++ = '\\';
1324 				if ((c = *prestr++) == '\0')
1325 					FATAL("nonterminated character class %.20s...", lastre);
1326 				*bp++ = c;
1327 			/* } else if (c == '\n') { */
1328 			/* 	FATAL("newline in character class %.20s...", lastre); */
1329 			} else if (c == '[' && *prestr == ':') {
1330 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1331 				for (cc = charclasses; cc->cc_name; cc++)
1332 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1333 						break;
1334 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1335 				    prestr[2 + cc->cc_namelen] == ']') {
1336 					prestr += cc->cc_namelen + 3;
1337 					/*
1338 					 * BUG: We begin at 1, instead of 0, since we
1339 					 * would otherwise prematurely terminate the
1340 					 * string for classes like [[:cntrl:]]. This
1341 					 * means that we can't match the NUL character,
1342 					 * not without first adapting the entire
1343 					 * program to track each string's length.
1344 					 */
1345 					for (i = 1; i <= UCHAR_MAX; i++) {
1346 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1347 						    FATAL("out of space for reg expr %.10s...", lastre);
1348 						if (cc->cc_func(i)) {
1349 							/* escape backslash */
1350 							if (i == '\\') {
1351 								*bp++ = '\\';
1352 								n++;
1353 							}
1354 
1355 							*bp++ = i;
1356 							n++;
1357 						}
1358 					}
1359 				} else
1360 					*bp++ = c;
1361 			} else if (c == '[' && *prestr == '.') {
1362 				char collate_char;
1363 				prestr++;
1364 				collate_char = *prestr++;
1365 				if (*prestr == '.' && prestr[1] == ']') {
1366 					prestr += 2;
1367 					/* Found it: map via locale TBD: for
1368 					   now, simply return this char.  This
1369 					   is sufficient to pass conformance
1370 					   test awk.ex 156
1371 					 */
1372 					if (*prestr == ']') {
1373 						prestr++;
1374 						rlxval = collate_char;
1375 						return CHAR;
1376 					}
1377 				}
1378 			} else if (c == '[' && *prestr == '=') {
1379 				char equiv_char;
1380 				prestr++;
1381 				equiv_char = *prestr++;
1382 				if (*prestr == '=' && prestr[1] == ']') {
1383 					prestr += 2;
1384 					/* Found it: map via locale TBD: for now
1385 					   simply return this char. This is
1386 					   sufficient to pass conformance test
1387 					   awk.ex 156
1388 					 */
1389 					if (*prestr == ']') {
1390 						prestr++;
1391 						rlxval = equiv_char;
1392 						return CHAR;
1393 					}
1394 				}
1395 			} else if (c == '\0') {
1396 				FATAL("nonterminated character class %.20s", lastre);
1397 			} else if (bp == buf) {	/* 1st char is special */
1398 				*bp++ = c;
1399 			} else if (c == ']') {
1400 				*bp++ = 0;
1401 				rlxstr = (uschar *) tostring((char *) buf);
1402 				if (cflag == 0)
1403 					return CCL;
1404 				else
1405 					return NCCL;
1406 			} else
1407 				*bp++ = c;
1408 		}
1409 		break;
1410 	case '{':
1411 		if (isdigit(*(prestr))) {
1412 			num = 0;	/* Process as a repetition */
1413 			n = -1; m = -1;
1414 			commafound = false;
1415 			digitfound = false;
1416 			startreptok = prestr-1;
1417 			/* Remember start of previous atom here ? */
1418 		} else {        	/* just a { char, not a repetition */
1419 			rlxval = c;
1420 			return CHAR;
1421                 }
1422 		for (; ; ) {
1423 			if ((c = *prestr++) == '}') {
1424 				if (commafound) {
1425 					if (digitfound) { /* {n,m} */
1426 						m = num;
1427 						if (m < n)
1428 							FATAL("illegal repetition expression: class %.20s",
1429 								lastre);
1430 						if (n == 0 && m == 1) {
1431 							return QUEST;
1432 						}
1433 					} else {	/* {n,} */
1434 						if (n == 0)
1435 							return STAR;
1436 						else if (n == 1)
1437 							return PLUS;
1438 					}
1439 				} else {
1440 					if (digitfound) { /* {n} same as {n,n} */
1441 						n = num;
1442 						m = num;
1443 					} else {	/* {} */
1444 						FATAL("illegal repetition expression: class %.20s",
1445 							lastre);
1446 					}
1447 				}
1448 				if (repeat(starttok, prestr-starttok, lastatom,
1449 					   startreptok - lastatom, n, m) > 0) {
1450 					if (n == 0 && m == 0) {
1451 						return ZERO;
1452 					}
1453 					/* must rescan input for next token */
1454 					goto rescan;
1455 				}
1456 				/* Failed to replace: eat up {...} characters
1457 				   and treat like just PLUS */
1458 				return PLUS;
1459 			} else if (c == '\0') {
1460 				FATAL("nonterminated character class %.20s",
1461 					lastre);
1462 			} else if (isdigit(c)) {
1463 				num = 10 * num + c - '0';
1464 				digitfound = true;
1465 			} else if (c == ',') {
1466 				if (commafound)
1467 					FATAL("illegal repetition expression: class %.20s",
1468 						lastre);
1469 				/* looking for {n,} or {n,m} */
1470 				commafound = true;
1471 				n = num;
1472 				digitfound = false; /* reset */
1473 				num = 0;
1474 			} else {
1475 				FATAL("illegal repetition expression: class %.20s",
1476 					lastre);
1477 			}
1478 		}
1479 		break;
1480 	}
1481 }
1482 
cgoto(fa * f,int s,int c)1483 int cgoto(fa *f, int s, int c)
1484 {
1485 	int *p, *q;
1486 	int i, j, k;
1487 
1488 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1489 	while (f->accept >= maxsetvec) {	/* guessing here! */
1490 		resizesetvec(__func__);
1491 	}
1492 	for (i = 0; i <= f->accept; i++)
1493 		setvec[i] = 0;
1494 	setcnt = 0;
1495 	resize_state(f, s);
1496 	/* compute positions of gototab[s,c] into setvec */
1497 	p = f->posns[s];
1498 	for (i = 1; i <= *p; i++) {
1499 		if ((k = f->re[p[i]].ltype) != FINAL) {
1500 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1501 			 || (k == DOT && c != 0 && c != HAT)
1502 			 || (k == ALL && c != 0)
1503 			 || (k == EMPTYRE && c != 0)
1504 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1505 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1506 				q = f->re[p[i]].lfollow;
1507 				for (j = 1; j <= *q; j++) {
1508 					if (q[j] >= maxsetvec) {
1509 						resizesetvec(__func__);
1510 					}
1511 					if (setvec[q[j]] == 0) {
1512 						setcnt++;
1513 						setvec[q[j]] = 1;
1514 					}
1515 				}
1516 			}
1517 		}
1518 	}
1519 	/* determine if setvec is a previous state */
1520 	tmpset[0] = setcnt;
1521 	j = 1;
1522 	for (i = f->accept; i >= 0; i--)
1523 		if (setvec[i]) {
1524 			tmpset[j++] = i;
1525 		}
1526 	resize_state(f, f->curstat > s ? f->curstat : s);
1527 	/* tmpset == previous state? */
1528 	for (i = 1; i <= f->curstat; i++) {
1529 		p = f->posns[i];
1530 		if ((k = tmpset[0]) != p[0])
1531 			goto different;
1532 		for (j = 1; j <= k; j++)
1533 			if (tmpset[j] != p[j])
1534 				goto different;
1535 		/* setvec is state i */
1536 		if (c != HAT)
1537 			set_gototab(f, s, c, i);
1538 		return i;
1539 	  different:;
1540 	}
1541 
1542 	/* add tmpset to current set of states */
1543 	++(f->curstat);
1544 	resize_state(f, f->curstat);
1545 	clear_gototab(f, f->curstat);
1546 	xfree(f->posns[f->curstat]);
1547 	p = intalloc(setcnt + 1, __func__);
1548 
1549 	f->posns[f->curstat] = p;
1550 	if (c != HAT)
1551 		set_gototab(f, s, c, f->curstat);
1552 	for (i = 0; i <= setcnt; i++)
1553 		p[i] = tmpset[i];
1554 	if (setvec[f->accept])
1555 		f->out[f->curstat] = 1;
1556 	else
1557 		f->out[f->curstat] = 0;
1558 	return f->curstat;
1559 }
1560 
1561 
freefa(fa * f)1562 void freefa(fa *f)	/* free a finite automaton */
1563 {
1564 	int i;
1565 
1566 	if (f == NULL)
1567 		return;
1568 	for (i = 0; i < f->state_count; i++)
1569 		xfree(f->gototab[i].entries);
1570 	xfree(f->gototab);
1571 	for (i = 0; i <= f->curstat; i++)
1572 		xfree(f->posns[i]);
1573 	for (i = 0; i <= f->accept; i++) {
1574 		xfree(f->re[i].lfollow);
1575 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1576 			xfree(f->re[i].lval.np);
1577 	}
1578 	xfree(f->restr);
1579 	xfree(f->out);
1580 	xfree(f->posns);
1581 	xfree(f->gototab);
1582 	xfree(f);
1583 }
1584