xref: /openbsd/usr.bin/awk/b.c (revision 4cfece93)
1 /*	$OpenBSD: b.c,v 1.33 2020/07/13 14:03:52 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 /* lasciate ogne speranza, voi ch'intrate. */
27 
28 #define	DEBUG
29 
30 #include <ctype.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <string.h>
34 #include <stdlib.h>
35 #include "awk.h"
36 #include "ytab.h"
37 
38 #define MAXLIN 22
39 
40 #define type(v)		(v)->nobj	/* badly overloaded here */
41 #define info(v)		(v)->ntype	/* badly overloaded here */
42 #define left(v)		(v)->narg[0]
43 #define right(v)	(v)->narg[1]
44 #define parent(v)	(v)->nnext
45 
46 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
48 #define UNARY	case STAR: case PLUS: case QUEST:
49 
50 /* encoding in tree Nodes:
51 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 		left is index, right contains value or pointer to value
53 	unary (STAR, PLUS, QUEST): left is child, right is null
54 	binary (CAT, OR): left and right are children
55 	parent contains pointer to parent
56 */
57 
58 
59 int	*setvec;
60 int	*tmpset;
61 int	maxsetvec = 0;
62 
63 int	rtok;		/* next token in current re */
64 int	rlxval;
65 static const uschar	*rlxstr;
66 static const uschar	*prestr;	/* current position in current re */
67 static const uschar	*lastre;	/* origin of last re */
68 static const uschar	*lastatom;	/* origin of last Atom */
69 static const uschar	*starttok;
70 static const uschar 	*basestr;	/* starts with original, replaced during
71 				   repetition processing */
72 static const uschar 	*firstbasestr;
73 
74 static	int setcnt;
75 static	int poscnt;
76 
77 const char	*patbeg;
78 int	patlen;
79 
80 #define	NFA	128	/* cache this many dynamic fa's */
81 fa	*fatab[NFA];
82 int	nfatab	= 0;	/* entries in fatab */
83 
84 static int *
85 intalloc(size_t n, const char *f)
86 {
87 	void *p = calloc(n, sizeof(int));
88 	if (p == NULL)
89 		overflo(f);
90 	return p;
91 }
92 
93 static void
94 allocsetvec(const char *f)
95 {
96 	maxsetvec = MAXLIN;
97 	setvec = reallocarray(setvec, maxsetvec, sizeof(*setvec));
98 	tmpset = reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
99 	if (setvec == NULL || tmpset == NULL)
100 		overflo(f);
101 }
102 
103 static void
104 resizesetvec(const char *f)
105 {
106 	setvec = reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
107 	tmpset = reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
108 	if (setvec == NULL || tmpset == NULL)
109 		overflo(f);
110 	maxsetvec *= 4;
111 }
112 
113 static void
114 resize_state(fa *f, int state)
115 {
116 	void *p;
117 	int i, new_count;
118 
119 	if (++state < f->state_count)
120 		return;
121 
122 	new_count = state + 10; /* needs to be tuned */
123 
124 	p = reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
125 	if (p == NULL)
126 		goto out;
127 	f->gototab = p;
128 
129 	p = reallocarray(f->out, new_count, sizeof(f->out[0]));
130 	if (p == NULL)
131 		goto out;
132 	f->out = p;
133 
134 	p = reallocarray(f->posns, new_count, sizeof(f->posns[0]));
135 	if (p == NULL)
136 		goto out;
137 	f->posns = p;
138 
139 	for (i = f->state_count; i < new_count; ++i) {
140 		f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab));
141 		if (f->gototab[i] == NULL)
142 			goto out;
143 		f->out[i]  = 0;
144 		f->posns[i] = NULL;
145 	}
146 	f->state_count = new_count;
147 	return;
148 out:
149 	overflo(__func__);
150 }
151 
152 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
153 {
154 	int i, use, nuse;
155 	fa *pfa;
156 	static int now = 1;
157 
158 	if (setvec == NULL) {	/* first time through any RE */
159 		allocsetvec(__func__);
160 	}
161 
162 	if (compile_time != RUNNING)	/* a constant for sure */
163 		return mkdfa(s, anchor);
164 	for (i = 0; i < nfatab; i++)	/* is it there already? */
165 		if (fatab[i]->anchor == anchor
166 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
167 			fatab[i]->use = now++;
168 			return fatab[i];
169 		}
170 	pfa = mkdfa(s, anchor);
171 	if (nfatab < NFA) {	/* room for another */
172 		fatab[nfatab] = pfa;
173 		fatab[nfatab]->use = now++;
174 		nfatab++;
175 		return pfa;
176 	}
177 	use = fatab[0]->use;	/* replace least-recently used */
178 	nuse = 0;
179 	for (i = 1; i < nfatab; i++)
180 		if (fatab[i]->use < use) {
181 			use = fatab[i]->use;
182 			nuse = i;
183 		}
184 	freefa(fatab[nuse]);
185 	fatab[nuse] = pfa;
186 	pfa->use = now++;
187 	return pfa;
188 }
189 
190 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
191 				/* anchor = true for anchored matches, else false */
192 {
193 	Node *p, *p1;
194 	fa *f;
195 
196 	firstbasestr = (const uschar *) s;
197 	basestr = firstbasestr;
198 	p = reparse(s);
199 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
200 		/* put ALL STAR in front of reg.  exp. */
201 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
202 		/* put FINAL after reg.  exp. */
203 
204 	poscnt = 0;
205 	penter(p1);	/* enter parent pointers and leaf indices */
206 	if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
207 		overflo(__func__);
208 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
209 	cfoll(f, p1);	/* set up follow sets */
210 	freetr(p1);
211 	resize_state(f, 1);
212 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
213 	f->posns[1] = intalloc(1, __func__);
214 	*f->posns[1] = 0;
215 	f->initstat = makeinit(f, anchor);
216 	f->anchor = anchor;
217 	f->restr = (uschar *) tostring(s);
218 	if (firstbasestr != basestr) {
219 		if (basestr)
220 			xfree(basestr);
221 	}
222 	return f;
223 }
224 
225 int makeinit(fa *f, bool anchor)
226 {
227 	int i, k;
228 
229 	f->curstat = 2;
230 	f->out[2] = 0;
231 	k = *(f->re[0].lfollow);
232 	xfree(f->posns[2]);
233 	f->posns[2] = intalloc(k + 1,  __func__);
234 	for (i = 0; i <= k; i++) {
235 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
236 	}
237 	if ((f->posns[2])[1] == f->accept)
238 		f->out[2] = 1;
239 	for (i = 0; i < NCHARS; i++)
240 		f->gototab[2][i] = 0;
241 	f->curstat = cgoto(f, 2, HAT);
242 	if (anchor) {
243 		*f->posns[2] = k-1;	/* leave out position 0 */
244 		for (i = 0; i < k; i++) {
245 			(f->posns[0])[i] = (f->posns[2])[i];
246 		}
247 
248 		f->out[0] = f->out[2];
249 		if (f->curstat != 2)
250 			--(*f->posns[f->curstat]);
251 	}
252 	return f->curstat;
253 }
254 
255 void penter(Node *p)	/* set up parent pointers and leaf indices */
256 {
257 	switch (type(p)) {
258 	ELEAF
259 	LEAF
260 		info(p) = poscnt;
261 		poscnt++;
262 		break;
263 	UNARY
264 		penter(left(p));
265 		parent(left(p)) = p;
266 		break;
267 	case CAT:
268 	case OR:
269 		penter(left(p));
270 		penter(right(p));
271 		parent(left(p)) = p;
272 		parent(right(p)) = p;
273 		break;
274 	case ZERO:
275 		break;
276 	default:	/* can't happen */
277 		FATAL("can't happen: unknown type %d in penter", type(p));
278 		break;
279 	}
280 }
281 
282 void freetr(Node *p)	/* free parse tree */
283 {
284 	switch (type(p)) {
285 	ELEAF
286 	LEAF
287 		xfree(p);
288 		break;
289 	UNARY
290 	case ZERO:
291 		freetr(left(p));
292 		xfree(p);
293 		break;
294 	case CAT:
295 	case OR:
296 		freetr(left(p));
297 		freetr(right(p));
298 		xfree(p);
299 		break;
300 	default:	/* can't happen */
301 		FATAL("can't happen: unknown type %d in freetr", type(p));
302 		break;
303 	}
304 }
305 
306 /* in the parsing of regular expressions, metacharacters like . have */
307 /* to be seen literally;  \056 is not a metacharacter. */
308 
309 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
310 {			/* only pick up one 8-bit byte (2 chars) */
311 	const uschar *p;
312 	int n = 0;
313 	int i;
314 
315 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
316 		if (isdigit(*p))
317 			n = 16 * n + *p - '0';
318 		else if (*p >= 'a' && *p <= 'f')
319 			n = 16 * n + *p - 'a' + 10;
320 		else if (*p >= 'A' && *p <= 'F')
321 			n = 16 * n + *p - 'A' + 10;
322 	}
323 	*pp = p;
324 	return n;
325 }
326 
327 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
328 
329 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
330 			/* and increment *pp */
331 {
332 	const uschar *p = *pp;
333 	int c;
334 
335 	if ((c = *p++) == 't')
336 		c = '\t';
337 	else if (c == 'n')
338 		c = '\n';
339 	else if (c == 'f')
340 		c = '\f';
341 	else if (c == 'r')
342 		c = '\r';
343 	else if (c == 'b')
344 		c = '\b';
345 	else if (c == 'v')
346 		c = '\v';
347 	else if (c == 'a')
348 		c = '\a';
349 	else if (c == '\\')
350 		c = '\\';
351 	else if (c == 'x') {	/* hexadecimal goo follows */
352 		c = hexstr(&p);	/* this adds a null if number is invalid */
353 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
354 		int n = c - '0';
355 		if (isoctdigit(*p)) {
356 			n = 8 * n + *p++ - '0';
357 			if (isoctdigit(*p))
358 				n = 8 * n + *p++ - '0';
359 		}
360 		c = n;
361 	} /* else */
362 		/* c = c; */
363 	*pp = p;
364 	return c;
365 }
366 
367 char *cclenter(const char *argp)	/* add a character class */
368 {
369 	int i, c, c2;
370 	const uschar *op, *p = (const uschar *) argp;
371 	uschar *bp;
372 	static uschar *buf = NULL;
373 	static int bufsz = 100;
374 
375 	op = p;
376 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
377 		FATAL("out of space for character class [%.10s...] 1", p);
378 	bp = buf;
379 	for (i = 0; (c = *p++) != 0; ) {
380 		if (c == '\\') {
381 			c = quoted(&p);
382 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
383 			if (*p != 0) {
384 				c = bp[-1];
385 				c2 = *p++;
386 				if (c2 == '\\')
387 					c2 = quoted(&p);
388 				if (c > c2) {	/* empty; ignore */
389 					bp--;
390 					i--;
391 					continue;
392 				}
393 				while (c < c2) {
394 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
395 						FATAL("out of space for character class [%.10s...] 2", p);
396 					*bp++ = ++c;
397 					i++;
398 				}
399 				continue;
400 			}
401 		}
402 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
403 			FATAL("out of space for character class [%.10s...] 3", p);
404 		*bp++ = c;
405 		i++;
406 	}
407 	*bp = 0;
408 	DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
409 	xfree(op);
410 	return (char *) tostring((char *) buf);
411 }
412 
413 void overflo(const char *s)
414 {
415 	FATAL("regular expression too big: out of space in %.30s...", s);
416 }
417 
418 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
419 {
420 	int i;
421 	int *p;
422 
423 	switch (type(v)) {
424 	ELEAF
425 	LEAF
426 		f->re[info(v)].ltype = type(v);
427 		f->re[info(v)].lval.np = right(v);
428 		while (f->accept >= maxsetvec) {	/* guessing here! */
429 			resizesetvec(__func__);
430 		}
431 		for (i = 0; i <= f->accept; i++)
432 			setvec[i] = 0;
433 		setcnt = 0;
434 		follow(v);	/* computes setvec and setcnt */
435 		p = intalloc(setcnt + 1, __func__);
436 		f->re[info(v)].lfollow = p;
437 		*p = setcnt;
438 		for (i = f->accept; i >= 0; i--)
439 			if (setvec[i] == 1)
440 				*++p = i;
441 		break;
442 	UNARY
443 		cfoll(f,left(v));
444 		break;
445 	case CAT:
446 	case OR:
447 		cfoll(f,left(v));
448 		cfoll(f,right(v));
449 		break;
450 	case ZERO:
451 		break;
452 	default:	/* can't happen */
453 		FATAL("can't happen: unknown type %d in cfoll", type(v));
454 	}
455 }
456 
457 int first(Node *p)	/* collects initially active leaves of p into setvec */
458 			/* returns 0 if p matches empty string */
459 {
460 	int b, lp;
461 
462 	switch (type(p)) {
463 	ELEAF
464 	LEAF
465 		lp = info(p);	/* look for high-water mark of subscripts */
466 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
467 			resizesetvec(__func__);
468 		}
469 		if (type(p) == EMPTYRE) {
470 			setvec[lp] = 0;
471 			return(0);
472 		}
473 		if (setvec[lp] != 1) {
474 			setvec[lp] = 1;
475 			setcnt++;
476 		}
477 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
478 			return(0);		/* empty CCL */
479 		return(1);
480 	case PLUS:
481 		if (first(left(p)) == 0)
482 			return(0);
483 		return(1);
484 	case STAR:
485 	case QUEST:
486 		first(left(p));
487 		return(0);
488 	case CAT:
489 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
490 		return(1);
491 	case OR:
492 		b = first(right(p));
493 		if (first(left(p)) == 0 || b == 0) return(0);
494 		return(1);
495 	case ZERO:
496 		return 0;
497 	}
498 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
499 	return(-1);
500 }
501 
502 void follow(Node *v)	/* collects leaves that can follow v into setvec */
503 {
504 	Node *p;
505 
506 	if (type(v) == FINAL)
507 		return;
508 	p = parent(v);
509 	switch (type(p)) {
510 	case STAR:
511 	case PLUS:
512 		first(v);
513 		follow(p);
514 		return;
515 
516 	case OR:
517 	case QUEST:
518 		follow(p);
519 		return;
520 
521 	case CAT:
522 		if (v == left(p)) {	/* v is left child of p */
523 			if (first(right(p)) == 0) {
524 				follow(p);
525 				return;
526 			}
527 		} else		/* v is right child */
528 			follow(p);
529 		return;
530 	}
531 }
532 
533 int member(int c, const char *sarg)	/* is c in s? */
534 {
535 	const uschar *s = (const uschar *) sarg;
536 
537 	while (*s)
538 		if (c == *s++)
539 			return(1);
540 	return(0);
541 }
542 
543 int match(fa *f, const char *p0)	/* shortest match ? */
544 {
545 	int s, ns;
546 	const uschar *p = (const uschar *) p0;
547 
548 	s = f->initstat;
549 	assert (s < f->state_count);
550 
551 	if (f->out[s])
552 		return(1);
553 	do {
554 		/* assert(*p < NCHARS); */
555 		if ((ns = f->gototab[s][*p]) != 0)
556 			s = ns;
557 		else
558 			s = cgoto(f, s, *p);
559 		if (f->out[s])
560 			return(1);
561 	} while (*p++ != 0);
562 	return(0);
563 }
564 
565 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
566 {
567 	int s, ns;
568 	const uschar *p = (const uschar *) p0;
569 	const uschar *q;
570 
571 	s = f->initstat;
572 	assert(s < f->state_count);
573 
574 	patbeg = (const char *)p;
575 	patlen = -1;
576 	do {
577 		q = p;
578 		do {
579 			if (f->out[s])		/* final state */
580 				patlen = q-p;
581 			/* assert(*q < NCHARS); */
582 			if ((ns = f->gototab[s][*q]) != 0)
583 				s = ns;
584 			else
585 				s = cgoto(f, s, *q);
586 
587 			assert(s < f->state_count);
588 
589 			if (s == 1) {	/* no transition */
590 				if (patlen >= 0) {
591 					patbeg = (const char *) p;
592 					return(1);
593 				}
594 				else
595 					goto nextin;	/* no match */
596 			}
597 		} while (*q++ != 0);
598 		if (f->out[s])
599 			patlen = q-p-1;	/* don't count $ */
600 		if (patlen >= 0) {
601 			patbeg = (const char *) p;
602 			return(1);
603 		}
604 	nextin:
605 		s = 2;
606 	} while (*p++);
607 	return (0);
608 }
609 
610 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
611 {
612 	int s, ns;
613 	const uschar *p = (const uschar *) p0;
614 	const uschar *q;
615 
616 	s = f->initstat;
617 	assert(s < f->state_count);
618 
619 	patbeg = (const char *)p;
620 	patlen = -1;
621 	while (*p) {
622 		q = p;
623 		do {
624 			if (f->out[s])		/* final state */
625 				patlen = q-p;
626 			/* assert(*q < NCHARS); */
627 			if ((ns = f->gototab[s][*q]) != 0)
628 				s = ns;
629 			else
630 				s = cgoto(f, s, *q);
631 			if (s == 1) {	/* no transition */
632 				if (patlen > 0) {
633 					patbeg = (const char *) p;
634 					return(1);
635 				} else
636 					goto nnextin;	/* no nonempty match */
637 			}
638 		} while (*q++ != 0);
639 		if (f->out[s])
640 			patlen = q-p-1;	/* don't count $ */
641 		if (patlen > 0 ) {
642 			patbeg = (const char *) p;
643 			return(1);
644 		}
645 	nnextin:
646 		s = 2;
647 		p++;
648 	}
649 	return (0);
650 }
651 
652 
653 /*
654  * NAME
655  *     fnematch
656  *
657  * DESCRIPTION
658  *     A stream-fed version of nematch which transfers characters to a
659  *     null-terminated buffer. All characters up to and including the last
660  *     character of the matching text or EOF are placed in the buffer. If
661  *     a match is found, patbeg and patlen are set appropriately.
662  *
663  * RETURN VALUES
664  *     false    No match found.
665  *     true     Match found.
666  */
667 
668 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
669 {
670 	char *buf = *pbuf;
671 	int bufsize = *pbufsize;
672 	int c, i, j, k, ns, s;
673 
674 	s = pfa->initstat;
675 	patlen = 0;
676 
677 	/*
678 	 * All indices relative to buf.
679 	 * i <= j <= k <= bufsize
680 	 *
681 	 * i: origin of active substring
682 	 * j: current character
683 	 * k: destination of next getc()
684 	 */
685 	i = -1, k = 0;
686         do {
687 		j = i++;
688 		do {
689 			if (++j == k) {
690 				if (k == bufsize)
691 					if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
692 						FATAL("stream '%.30s...' too long", buf);
693 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
694 			}
695 			c = (uschar)buf[j];
696 			/* assert(c < NCHARS); */
697 
698 			if ((ns = pfa->gototab[s][c]) != 0)
699 				s = ns;
700 			else
701 				s = cgoto(pfa, s, c);
702 
703 			if (pfa->out[s]) {	/* final state */
704 				patlen = j - i + 1;
705 				if (c == 0)	/* don't count $ */
706 					patlen--;
707 			}
708 		} while (buf[j] && s != 1);
709 		s = 2;
710 	} while (buf[i] && !patlen);
711 
712 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
713 	*pbuf = buf;
714 	*pbufsize = bufsize;
715 
716 	if (patlen) {
717 		patbeg = buf + i;
718 		/*
719 		 * Under no circumstances is the last character fed to
720 		 * the automaton part of the match. It is EOF's nullbyte,
721 		 * or it sent the automaton into a state with no further
722 		 * transitions available (s==1), or both. Room for a
723 		 * terminating nullbyte is guaranteed.
724 		 *
725 		 * ungetc any chars after the end of matching text
726 		 * (except for EOF's nullbyte, if present) and null
727 		 * terminate the buffer.
728 		 */
729 		do
730 			if (buf[--k] && ungetc(buf[k], f) == EOF)
731 				FATAL("unable to ungetc '%c'", buf[k]);
732 		while (k > i + patlen);
733 		buf[k] = '\0';
734 		return true;
735 	}
736 	else
737 		return false;
738 }
739 
740 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
741 {			/* uses relex() to scan regular expression */
742 	Node *np;
743 
744 	DPRINTF("reparse <%s>\n", p);
745 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
746 	rtok = relex();
747 	/* GNU compatibility: an empty regexp matches anything */
748 	if (rtok == '\0') {
749 		/* FATAL("empty regular expression"); previous */
750 		return(op2(EMPTYRE, NIL, NIL));
751 	}
752 	np = regexp();
753 	if (rtok != '\0')
754 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
755 	return(np);
756 }
757 
758 Node *regexp(void)	/* top-level parse of reg expr */
759 {
760 	return (alt(concat(primary())));
761 }
762 
763 Node *primary(void)
764 {
765 	Node *np;
766 	int savelastatom;
767 
768 	switch (rtok) {
769 	case CHAR:
770 		lastatom = starttok;
771 		np = op2(CHAR, NIL, itonp(rlxval));
772 		rtok = relex();
773 		return (unary(np));
774 	case ALL:
775 		rtok = relex();
776 		return (unary(op2(ALL, NIL, NIL)));
777 	case EMPTYRE:
778 		rtok = relex();
779 		return (unary(op2(EMPTYRE, NIL, NIL)));
780 	case DOT:
781 		lastatom = starttok;
782 		rtok = relex();
783 		return (unary(op2(DOT, NIL, NIL)));
784 	case CCL:
785 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
786 		lastatom = starttok;
787 		rtok = relex();
788 		return (unary(np));
789 	case NCCL:
790 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
791 		lastatom = starttok;
792 		rtok = relex();
793 		return (unary(np));
794 	case '^':
795 		rtok = relex();
796 		return (unary(op2(CHAR, NIL, itonp(HAT))));
797 	case '$':
798 		rtok = relex();
799 		return (unary(op2(CHAR, NIL, NIL)));
800 	case '(':
801 		lastatom = starttok;
802 		savelastatom = starttok - basestr; /* Retain over recursion */
803 		rtok = relex();
804 		if (rtok == ')') {	/* special pleading for () */
805 			rtok = relex();
806 			return unary(op2(CCL, NIL, (Node *) tostring("")));
807 		}
808 		np = regexp();
809 		if (rtok == ')') {
810 			lastatom = basestr + savelastatom; /* Restore */
811 			rtok = relex();
812 			return (unary(np));
813 		}
814 		else
815 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
816 	default:
817 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
818 	}
819 	return 0;	/*NOTREACHED*/
820 }
821 
822 Node *concat(Node *np)
823 {
824 	switch (rtok) {
825 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
826 		return (concat(op2(CAT, np, primary())));
827 	case EMPTYRE:
828 		rtok = relex();
829 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
830 				primary())));
831 	}
832 	return (np);
833 }
834 
835 Node *alt(Node *np)
836 {
837 	if (rtok == OR) {
838 		rtok = relex();
839 		return (alt(op2(OR, np, concat(primary()))));
840 	}
841 	return (np);
842 }
843 
844 Node *unary(Node *np)
845 {
846 	switch (rtok) {
847 	case STAR:
848 		rtok = relex();
849 		return (unary(op2(STAR, np, NIL)));
850 	case PLUS:
851 		rtok = relex();
852 		return (unary(op2(PLUS, np, NIL)));
853 	case QUEST:
854 		rtok = relex();
855 		return (unary(op2(QUEST, np, NIL)));
856 	case ZERO:
857 		rtok = relex();
858 		return (unary(op2(ZERO, np, NIL)));
859 	default:
860 		return (np);
861 	}
862 }
863 
864 /*
865  * Character class definitions conformant to the POSIX locale as
866  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
867  * and operating character sets are both ASCII (ISO646) or supersets
868  * thereof.
869  *
870  * Note that to avoid overflowing the temporary buffer used in
871  * relex(), the expanded character class (prior to range expansion)
872  * must be less than twice the size of their full name.
873  */
874 
875 /* Because isblank doesn't show up in any of the header files on any
876  * system i use, it's defined here.  if some other locale has a richer
877  * definition of "blank", define HAS_ISBLANK and provide your own
878  * version.
879  * the parentheses here are an attempt to find a path through the maze
880  * of macro definition and/or function and/or version provided.  thanks
881  * to nelson beebe for the suggestion; let's see if it works everywhere.
882  */
883 
884 #ifndef HAS_ISBLANK
885 
886 int (xisblank)(int c)
887 {
888 	return c==' ' || c=='\t';
889 }
890 
891 #endif
892 
893 static const struct charclass {
894 	const char *cc_name;
895 	int cc_namelen;
896 	int (*cc_func)(int);
897 } charclasses[] = {
898 	{ "alnum",	5,	isalnum },
899 	{ "alpha",	5,	isalpha },
900 #ifndef HAS_ISBLANK
901 	{ "blank",	5,	xisblank },
902 #else
903 	{ "blank",	5,	isblank },
904 #endif
905 	{ "cntrl",	5,	iscntrl },
906 	{ "digit",	5,	isdigit },
907 	{ "graph",	5,	isgraph },
908 	{ "lower",	5,	islower },
909 	{ "print",	5,	isprint },
910 	{ "punct",	5,	ispunct },
911 	{ "space",	5,	isspace },
912 	{ "upper",	5,	isupper },
913 	{ "xdigit",	6,	isxdigit },
914 	{ NULL,		0,	NULL },
915 };
916 
917 #define REPEAT_SIMPLE		0
918 #define REPEAT_PLUS_APPENDED	1
919 #define REPEAT_WITH_Q		2
920 #define REPEAT_ZERO		3
921 
922 static int
923 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
924 	       int atomlen, int firstnum, int secondnum, int special_case)
925 {
926 	int i, j;
927 	uschar *buf = NULL;
928 	int ret = 1;
929 	int init_q = (firstnum == 0);		/* first added char will be ? */
930 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
931 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
932 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
933 	int size = prefix_length +  suffix_length;
934 
935 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
936 		size += atomlen*(firstnum-1);
937 	}
938 
939 	/* Adjust size of buffer for special cases */
940 	if (special_case == REPEAT_PLUS_APPENDED) {
941 		size++;		/* for the final + */
942 	} else if (special_case == REPEAT_WITH_Q) {
943 		size += init_q + (atomlen+1)* n_q_reps;
944 	} else if (special_case == REPEAT_ZERO) {
945 		size += 2;	/* just a null ERE: () */
946 	}
947 	if ((buf = malloc(size + 1)) == NULL)
948 		FATAL("out of space in reg expr %.10s..", lastre);
949 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
950 	j = prefix_length;
951 	if (special_case == REPEAT_ZERO) {
952 		j -= atomlen;
953 		buf[j++] = '(';
954 		buf[j++] = ')';
955 	}
956 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
957 		memcpy(&buf[j], atom, atomlen);
958 		j += atomlen;
959 	}
960 	if (special_case == REPEAT_PLUS_APPENDED) {
961 		buf[j++] = '+';
962 	} else if (special_case == REPEAT_WITH_Q) {
963 		if (init_q)
964 			buf[j++] = '?';
965 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
966 			memcpy(&buf[j], atom, atomlen);
967 			j += atomlen;
968 			buf[j++] = '?';
969 		}
970 	}
971 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
972 	if (special_case == REPEAT_ZERO) {
973 		buf[j+suffix_length] = '\0';
974 	} else {
975 		buf[size] = '\0';
976 	}
977 	/* free old basestr */
978 	if (firstbasestr != basestr) {
979 		if (basestr)
980 			xfree(basestr);
981 	}
982 	basestr = buf;
983 	prestr  = buf + prefix_length;
984 	if (special_case == REPEAT_ZERO) {
985 		prestr  -= atomlen;
986 		ret++;
987 	}
988 	return ret;
989 }
990 
991 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
992 		  int atomlen, int firstnum, int secondnum)
993 {
994 	/*
995 	   In general, the repetition specifier or "bound" is replaced here
996 	   by an equivalent ERE string, repeating the immediately previous atom
997 	   and appending ? and + as needed. Note that the first copy of the
998 	   atom is left in place, except in the special_case of a zero-repeat
999 	   (i.e., {0}).
1000 	 */
1001 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1002 		if (firstnum < 2) {
1003 			/* 0 or 1: should be handled before you get here */
1004 			FATAL("internal error");
1005 		} else {
1006 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1007 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1008 		}
1009 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1010 		if (firstnum == 0) {	/* {0} or {0,0} */
1011 			/* This case is unusual because the resulting
1012 			   replacement string might actually be SMALLER than
1013 			   the original ERE */
1014 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1015 					firstnum, secondnum, REPEAT_ZERO);
1016 		} else {		/* (firstnum >= 1) */
1017 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1018 					firstnum, secondnum, REPEAT_SIMPLE);
1019 		}
1020 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1021 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1022 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1023 					firstnum, secondnum, REPEAT_WITH_Q);
1024 	} else {	/* Error - shouldn't be here (n>m) */
1025 		FATAL("internal error");
1026 	}
1027 	return 0;
1028 }
1029 
1030 int relex(void)		/* lexical analyzer for reparse */
1031 {
1032 	int c, n;
1033 	int cflag;
1034 	static uschar *buf = NULL;
1035 	static int bufsz = 100;
1036 	uschar *bp;
1037 	const struct charclass *cc;
1038 	int i;
1039 	int num, m;
1040 	bool commafound, digitfound;
1041 	const uschar *startreptok;
1042 	static int parens = 0;
1043 
1044 rescan:
1045 	starttok = prestr;
1046 
1047 	switch (c = *prestr++) {
1048 	case '|': return OR;
1049 	case '*': return STAR;
1050 	case '+': return PLUS;
1051 	case '?': return QUEST;
1052 	case '.': return DOT;
1053 	case '\0': prestr--; return '\0';
1054 	case '^':
1055 	case '$':
1056 		return c;
1057 	case '(':
1058 		parens++;
1059 		return c;
1060 	case ')':
1061 		if (parens) {
1062 			parens--;
1063 			return c;
1064 		}
1065 		/* unmatched close parenthesis; per POSIX, treat as literal */
1066 		rlxval = c;
1067 		return CHAR;
1068 	case '\\':
1069 		rlxval = quoted(&prestr);
1070 		return CHAR;
1071 	default:
1072 		rlxval = c;
1073 		return CHAR;
1074 	case '[':
1075 		if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1076 			FATAL("out of space in reg expr %.10s..", lastre);
1077 		bp = buf;
1078 		if (*prestr == '^') {
1079 			cflag = 1;
1080 			prestr++;
1081 		}
1082 		else
1083 			cflag = 0;
1084 		n = 2 * strlen((const char *) prestr)+1;
1085 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1086 			FATAL("out of space for reg expr %.10s...", lastre);
1087 		for (; ; ) {
1088 			if ((c = *prestr++) == '\\') {
1089 				*bp++ = '\\';
1090 				if ((c = *prestr++) == '\0')
1091 					FATAL("nonterminated character class %.20s...", lastre);
1092 				*bp++ = c;
1093 			/* } else if (c == '\n') { */
1094 			/* 	FATAL("newline in character class %.20s...", lastre); */
1095 			} else if (c == '[' && *prestr == ':') {
1096 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1097 				for (cc = charclasses; cc->cc_name; cc++)
1098 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1099 						break;
1100 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1101 				    prestr[2 + cc->cc_namelen] == ']') {
1102 					prestr += cc->cc_namelen + 3;
1103 					/*
1104 					 * BUG: We begin at 1, instead of 0, since we
1105 					 * would otherwise prematurely terminate the
1106 					 * string for classes like [[:cntrl:]]. This
1107 					 * means that we can't match the NUL character,
1108 					 * not without first adapting the entire
1109 					 * program to track each string's length.
1110 					 */
1111 					for (i = 1; i <= UCHAR_MAX; i++) {
1112 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1113 						    FATAL("out of space for reg expr %.10s...", lastre);
1114 						if (cc->cc_func(i)) {
1115 							/* escape backslash */
1116 							if (i == '\\') {
1117 								*bp++ = '\\';
1118 								n++;
1119 							}
1120 
1121 							*bp++ = i;
1122 							n++;
1123 						}
1124 					}
1125 				} else
1126 					*bp++ = c;
1127 			} else if (c == '[' && *prestr == '.') {
1128 				char collate_char;
1129 				prestr++;
1130 				collate_char = *prestr++;
1131 				if (*prestr == '.' && prestr[1] == ']') {
1132 					prestr += 2;
1133 					/* Found it: map via locale TBD: for
1134 					   now, simply return this char.  This
1135 					   is sufficient to pass conformance
1136 					   test awk.ex 156
1137 					 */
1138 					if (*prestr == ']') {
1139 						prestr++;
1140 						rlxval = collate_char;
1141 						return CHAR;
1142 					}
1143 				}
1144 			} else if (c == '[' && *prestr == '=') {
1145 				char equiv_char;
1146 				prestr++;
1147 				equiv_char = *prestr++;
1148 				if (*prestr == '=' && prestr[1] == ']') {
1149 					prestr += 2;
1150 					/* Found it: map via locale TBD: for now
1151 					   simply return this char. This is
1152 					   sufficient to pass conformance test
1153 					   awk.ex 156
1154 					 */
1155 					if (*prestr == ']') {
1156 						prestr++;
1157 						rlxval = equiv_char;
1158 						return CHAR;
1159 					}
1160 				}
1161 			} else if (c == '\0') {
1162 				FATAL("nonterminated character class %.20s", lastre);
1163 			} else if (bp == buf) {	/* 1st char is special */
1164 				*bp++ = c;
1165 			} else if (c == ']') {
1166 				*bp++ = 0;
1167 				rlxstr = (uschar *) tostring((char *) buf);
1168 				if (cflag == 0)
1169 					return CCL;
1170 				else
1171 					return NCCL;
1172 			} else
1173 				*bp++ = c;
1174 		}
1175 		break;
1176 	case '{':
1177 		if (isdigit(*(prestr))) {
1178 			num = 0;	/* Process as a repetition */
1179 			n = -1; m = -1;
1180 			commafound = false;
1181 			digitfound = false;
1182 			startreptok = prestr-1;
1183 			/* Remember start of previous atom here ? */
1184 		} else {        	/* just a { char, not a repetition */
1185 			rlxval = c;
1186 			return CHAR;
1187                 }
1188 		for (; ; ) {
1189 			if ((c = *prestr++) == '}') {
1190 				if (commafound) {
1191 					if (digitfound) { /* {n,m} */
1192 						m = num;
1193 						if (m < n)
1194 							FATAL("illegal repetition expression: class %.20s",
1195 								lastre);
1196 						if (n == 0 && m == 1) {
1197 							return QUEST;
1198 						}
1199 					} else {	/* {n,} */
1200 						if (n == 0)
1201 							return STAR;
1202 						else if (n == 1)
1203 							return PLUS;
1204 					}
1205 				} else {
1206 					if (digitfound) { /* {n} same as {n,n} */
1207 						n = num;
1208 						m = num;
1209 					} else {	/* {} */
1210 						FATAL("illegal repetition expression: class %.20s",
1211 							lastre);
1212 					}
1213 				}
1214 				if (repeat(starttok, prestr-starttok, lastatom,
1215 					   startreptok - lastatom, n, m) > 0) {
1216 					if (n == 0 && m == 0) {
1217 						return ZERO;
1218 					}
1219 					/* must rescan input for next token */
1220 					goto rescan;
1221 				}
1222 				/* Failed to replace: eat up {...} characters
1223 				   and treat like just PLUS */
1224 				return PLUS;
1225 			} else if (c == '\0') {
1226 				FATAL("nonterminated character class %.20s",
1227 					lastre);
1228 			} else if (isdigit(c)) {
1229 				num = 10 * num + c - '0';
1230 				digitfound = true;
1231 			} else if (c == ',') {
1232 				if (commafound)
1233 					FATAL("illegal repetition expression: class %.20s",
1234 						lastre);
1235 				/* looking for {n,} or {n,m} */
1236 				commafound = true;
1237 				n = num;
1238 				digitfound = false; /* reset */
1239 				num = 0;
1240 			} else {
1241 				FATAL("illegal repetition expression: class %.20s",
1242 					lastre);
1243 			}
1244 		}
1245 		break;
1246 	}
1247 }
1248 
1249 int cgoto(fa *f, int s, int c)
1250 {
1251 	int *p, *q;
1252 	int i, j, k;
1253 
1254 	assert(c == HAT || c < NCHARS);
1255 	while (f->accept >= maxsetvec) {	/* guessing here! */
1256 		resizesetvec(__func__);
1257 	}
1258 	for (i = 0; i <= f->accept; i++)
1259 		setvec[i] = 0;
1260 	setcnt = 0;
1261 	resize_state(f, s);
1262 	/* compute positions of gototab[s,c] into setvec */
1263 	p = f->posns[s];
1264 	for (i = 1; i <= *p; i++) {
1265 		if ((k = f->re[p[i]].ltype) != FINAL) {
1266 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1267 			 || (k == DOT && c != 0 && c != HAT)
1268 			 || (k == ALL && c != 0)
1269 			 || (k == EMPTYRE && c != 0)
1270 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1271 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1272 				q = f->re[p[i]].lfollow;
1273 				for (j = 1; j <= *q; j++) {
1274 					if (q[j] >= maxsetvec) {
1275 						resizesetvec(__func__);
1276 					}
1277 					if (setvec[q[j]] == 0) {
1278 						setcnt++;
1279 						setvec[q[j]] = 1;
1280 					}
1281 				}
1282 			}
1283 		}
1284 	}
1285 	/* determine if setvec is a previous state */
1286 	tmpset[0] = setcnt;
1287 	j = 1;
1288 	for (i = f->accept; i >= 0; i--)
1289 		if (setvec[i]) {
1290 			tmpset[j++] = i;
1291 		}
1292 	resize_state(f, f->curstat > s ? f->curstat : s);
1293 	/* tmpset == previous state? */
1294 	for (i = 1; i <= f->curstat; i++) {
1295 		p = f->posns[i];
1296 		if ((k = tmpset[0]) != p[0])
1297 			goto different;
1298 		for (j = 1; j <= k; j++)
1299 			if (tmpset[j] != p[j])
1300 				goto different;
1301 		/* setvec is state i */
1302 		if (c != HAT)
1303 			f->gototab[s][c] = i;
1304 		return i;
1305 	  different:;
1306 	}
1307 
1308 	/* add tmpset to current set of states */
1309 	++(f->curstat);
1310 	resize_state(f, f->curstat);
1311 	for (i = 0; i < NCHARS; i++)
1312 		f->gototab[f->curstat][i] = 0;
1313 	xfree(f->posns[f->curstat]);
1314 	p = intalloc(setcnt + 1, __func__);
1315 
1316 	f->posns[f->curstat] = p;
1317 	if (c != HAT)
1318 		f->gototab[s][c] = f->curstat;
1319 	for (i = 0; i <= setcnt; i++)
1320 		p[i] = tmpset[i];
1321 	if (setvec[f->accept])
1322 		f->out[f->curstat] = 1;
1323 	else
1324 		f->out[f->curstat] = 0;
1325 	return f->curstat;
1326 }
1327 
1328 
1329 void freefa(fa *f)	/* free a finite automaton */
1330 {
1331 	int i;
1332 
1333 	if (f == NULL)
1334 		return;
1335 	for (i = 0; i < f->state_count; i++)
1336 		xfree(f->gototab[i])
1337 	for (i = 0; i <= f->curstat; i++)
1338 		xfree(f->posns[i]);
1339 	for (i = 0; i <= f->accept; i++) {
1340 		xfree(f->re[i].lfollow);
1341 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1342 			xfree(f->re[i].lval.np);
1343 	}
1344 	xfree(f->restr);
1345 	xfree(f->out);
1346 	xfree(f->posns);
1347 	xfree(f->gototab);
1348 	xfree(f);
1349 }
1350