xref: /dragonfly/contrib/awk/b.c (revision 7ff0fc30)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #define	DEBUG
28 
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "ytab.h"
36 
37 #define MAXLIN 22
38 
39 #define type(v)		(v)->nobj	/* badly overloaded here */
40 #define info(v)		(v)->ntype	/* badly overloaded here */
41 #define left(v)		(v)->narg[0]
42 #define right(v)	(v)->narg[1]
43 #define parent(v)	(v)->nnext
44 
45 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
47 #define UNARY	case STAR: case PLUS: case QUEST:
48 
49 /* encoding in tree Nodes:
50 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 		left is index, right contains value or pointer to value
52 	unary (STAR, PLUS, QUEST): left is child, right is null
53 	binary (CAT, OR): left and right are children
54 	parent contains pointer to parent
55 */
56 
57 
58 int	*setvec;
59 int	*tmpset;
60 int	maxsetvec = 0;
61 
62 int	rtok;		/* next token in current re */
63 int	rlxval;
64 static const uschar	*rlxstr;
65 static const uschar	*prestr;	/* current position in current re */
66 static const uschar	*lastre;	/* origin of last re */
67 static const uschar	*lastatom;	/* origin of last Atom */
68 static const uschar	*starttok;
69 static const uschar 	*basestr;	/* starts with original, replaced during
70 				   repetition processing */
71 static const uschar 	*firstbasestr;
72 
73 static	int setcnt;
74 static	int poscnt;
75 
76 const char	*patbeg;
77 int	patlen;
78 
79 #define	NFA	128	/* cache this many dynamic fa's */
80 fa	*fatab[NFA];
81 int	nfatab	= 0;	/* entries in fatab */
82 
83 static int *
84 intalloc(size_t n, const char *f)
85 {
86 	void *p = calloc(n, sizeof(int));
87 	if (p == NULL)
88 		overflo(f);
89 	return p;
90 }
91 
92 static void
93 resizesetvec(const char *f)
94 {
95 	if (maxsetvec == 0)
96 		maxsetvec = MAXLIN;
97 	else
98 		maxsetvec *= 4;
99 	setvec = realloc(setvec, maxsetvec * sizeof(*setvec));
100 	tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset));
101 	if (setvec == NULL || tmpset == NULL)
102 		overflo(f);
103 }
104 
105 static void
106 resize_state(fa *f, int state)
107 {
108 	void *p;
109 	int i, new_count;
110 
111 	if (++state < f->state_count)
112 		return;
113 
114 	new_count = state + 10; /* needs to be tuned */
115 
116 	p = realloc(f->gototab, new_count * sizeof(f->gototab[0]));
117 	if (p == NULL)
118 		goto out;
119 	f->gototab = p;
120 
121 	p = realloc(f->out, new_count * sizeof(f->out[0]));
122 	if (p == NULL)
123 		goto out;
124 	f->out = p;
125 
126 	p = realloc(f->posns, new_count * sizeof(f->posns[0]));
127 	if (p == NULL)
128 		goto out;
129 	f->posns = p;
130 
131 	for (i = f->state_count; i < new_count; ++i) {
132 		f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab));
133 		if (f->gototab[i] == NULL)
134 			goto out;
135 		f->out[i]  = 0;
136 		f->posns[i] = NULL;
137 	}
138 	f->state_count = new_count;
139 	return;
140 out:
141 	overflo(__func__);
142 }
143 
144 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
145 {
146 	int i, use, nuse;
147 	fa *pfa;
148 	static int now = 1;
149 
150 	if (setvec == NULL) {	/* first time through any RE */
151 		resizesetvec(__func__);
152 	}
153 
154 	if (compile_time != RUNNING)	/* a constant for sure */
155 		return mkdfa(s, anchor);
156 	for (i = 0; i < nfatab; i++)	/* is it there already? */
157 		if (fatab[i]->anchor == anchor
158 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
159 			fatab[i]->use = now++;
160 			return fatab[i];
161 		}
162 	pfa = mkdfa(s, anchor);
163 	if (nfatab < NFA) {	/* room for another */
164 		fatab[nfatab] = pfa;
165 		fatab[nfatab]->use = now++;
166 		nfatab++;
167 		return pfa;
168 	}
169 	use = fatab[0]->use;	/* replace least-recently used */
170 	nuse = 0;
171 	for (i = 1; i < nfatab; i++)
172 		if (fatab[i]->use < use) {
173 			use = fatab[i]->use;
174 			nuse = i;
175 		}
176 	freefa(fatab[nuse]);
177 	fatab[nuse] = pfa;
178 	pfa->use = now++;
179 	return pfa;
180 }
181 
182 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
183 				/* anchor = true for anchored matches, else false */
184 {
185 	Node *p, *p1;
186 	fa *f;
187 
188 	firstbasestr = (const uschar *) s;
189 	basestr = firstbasestr;
190 	p = reparse(s);
191 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
192 		/* put ALL STAR in front of reg.  exp. */
193 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
194 		/* put FINAL after reg.  exp. */
195 
196 	poscnt = 0;
197 	penter(p1);	/* enter parent pointers and leaf indices */
198 	if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
199 		overflo(__func__);
200 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
201 	cfoll(f, p1);	/* set up follow sets */
202 	freetr(p1);
203 	resize_state(f, 1);
204 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
205 	f->posns[1] = intalloc(1, __func__);
206 	*f->posns[1] = 0;
207 	f->initstat = makeinit(f, anchor);
208 	f->anchor = anchor;
209 	f->restr = (uschar *) tostring(s);
210 	if (firstbasestr != basestr) {
211 		if (basestr)
212 			xfree(basestr);
213 	}
214 	return f;
215 }
216 
217 int makeinit(fa *f, bool anchor)
218 {
219 	int i, k;
220 
221 	f->curstat = 2;
222 	f->out[2] = 0;
223 	k = *(f->re[0].lfollow);
224 	xfree(f->posns[2]);
225 	f->posns[2] = intalloc(k + 1,  __func__);
226 	for (i = 0; i <= k; i++) {
227 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
228 	}
229 	if ((f->posns[2])[1] == f->accept)
230 		f->out[2] = 1;
231 	for (i = 0; i < NCHARS; i++)
232 		f->gototab[2][i] = 0;
233 	f->curstat = cgoto(f, 2, HAT);
234 	if (anchor) {
235 		*f->posns[2] = k-1;	/* leave out position 0 */
236 		for (i = 0; i < k; i++) {
237 			(f->posns[0])[i] = (f->posns[2])[i];
238 		}
239 
240 		f->out[0] = f->out[2];
241 		if (f->curstat != 2)
242 			--(*f->posns[f->curstat]);
243 	}
244 	return f->curstat;
245 }
246 
247 void penter(Node *p)	/* set up parent pointers and leaf indices */
248 {
249 	switch (type(p)) {
250 	ELEAF
251 	LEAF
252 		info(p) = poscnt;
253 		poscnt++;
254 		break;
255 	UNARY
256 		penter(left(p));
257 		parent(left(p)) = p;
258 		break;
259 	case CAT:
260 	case OR:
261 		penter(left(p));
262 		penter(right(p));
263 		parent(left(p)) = p;
264 		parent(right(p)) = p;
265 		break;
266 	case ZERO:
267 		break;
268 	default:	/* can't happen */
269 		FATAL("can't happen: unknown type %d in penter", type(p));
270 		break;
271 	}
272 }
273 
274 void freetr(Node *p)	/* free parse tree */
275 {
276 	switch (type(p)) {
277 	ELEAF
278 	LEAF
279 		xfree(p);
280 		break;
281 	UNARY
282 	case ZERO:
283 		freetr(left(p));
284 		xfree(p);
285 		break;
286 	case CAT:
287 	case OR:
288 		freetr(left(p));
289 		freetr(right(p));
290 		xfree(p);
291 		break;
292 	default:	/* can't happen */
293 		FATAL("can't happen: unknown type %d in freetr", type(p));
294 		break;
295 	}
296 }
297 
298 /* in the parsing of regular expressions, metacharacters like . have */
299 /* to be seen literally;  \056 is not a metacharacter. */
300 
301 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
302 {			/* only pick up one 8-bit byte (2 chars) */
303 	const uschar *p;
304 	int n = 0;
305 	int i;
306 
307 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
308 		if (isdigit(*p))
309 			n = 16 * n + *p - '0';
310 		else if (*p >= 'a' && *p <= 'f')
311 			n = 16 * n + *p - 'a' + 10;
312 		else if (*p >= 'A' && *p <= 'F')
313 			n = 16 * n + *p - 'A' + 10;
314 	}
315 	*pp = p;
316 	return n;
317 }
318 
319 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
320 
321 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
322 			/* and increment *pp */
323 {
324 	const uschar *p = *pp;
325 	int c;
326 
327 	if ((c = *p++) == 't')
328 		c = '\t';
329 	else if (c == 'n')
330 		c = '\n';
331 	else if (c == 'f')
332 		c = '\f';
333 	else if (c == 'r')
334 		c = '\r';
335 	else if (c == 'b')
336 		c = '\b';
337 	else if (c == 'v')
338 		c = '\v';
339 	else if (c == 'a')
340 		c = '\a';
341 	else if (c == '\\')
342 		c = '\\';
343 	else if (c == 'x') {	/* hexadecimal goo follows */
344 		c = hexstr(&p);	/* this adds a null if number is invalid */
345 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
346 		int n = c - '0';
347 		if (isoctdigit(*p)) {
348 			n = 8 * n + *p++ - '0';
349 			if (isoctdigit(*p))
350 				n = 8 * n + *p++ - '0';
351 		}
352 		c = n;
353 	} /* else */
354 		/* c = c; */
355 	*pp = p;
356 	return c;
357 }
358 
359 char *cclenter(const char *argp)	/* add a character class */
360 {
361 	int i, c, c2;
362 	const uschar *op, *p = (const uschar *) argp;
363 	uschar *bp;
364 	static uschar *buf = NULL;
365 	static int bufsz = 100;
366 
367 	op = p;
368 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
369 		FATAL("out of space for character class [%.10s...] 1", p);
370 	bp = buf;
371 	for (i = 0; (c = *p++) != 0; ) {
372 		if (c == '\\') {
373 			c = quoted(&p);
374 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
375 			if (*p != 0) {
376 				c = bp[-1];
377 				c2 = *p++;
378 				if (c2 == '\\')
379 					c2 = quoted(&p);
380 				if (c > c2) {	/* empty; ignore */
381 					bp--;
382 					i--;
383 					continue;
384 				}
385 				while (c < c2) {
386 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
387 						FATAL("out of space for character class [%.10s...] 2", p);
388 					*bp++ = ++c;
389 					i++;
390 				}
391 				continue;
392 			}
393 		}
394 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
395 			FATAL("out of space for character class [%.10s...] 3", p);
396 		*bp++ = c;
397 		i++;
398 	}
399 	*bp = 0;
400 	DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
401 	xfree(op);
402 	return (char *) tostring((char *) buf);
403 }
404 
405 void overflo(const char *s)
406 {
407 	FATAL("regular expression too big: out of space in %.30s...", s);
408 }
409 
410 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
411 {
412 	int i;
413 	int *p;
414 
415 	switch (type(v)) {
416 	ELEAF
417 	LEAF
418 		f->re[info(v)].ltype = type(v);
419 		f->re[info(v)].lval.np = right(v);
420 		while (f->accept >= maxsetvec) {	/* guessing here! */
421 			resizesetvec(__func__);
422 		}
423 		for (i = 0; i <= f->accept; i++)
424 			setvec[i] = 0;
425 		setcnt = 0;
426 		follow(v);	/* computes setvec and setcnt */
427 		p = intalloc(setcnt + 1, __func__);
428 		f->re[info(v)].lfollow = p;
429 		*p = setcnt;
430 		for (i = f->accept; i >= 0; i--)
431 			if (setvec[i] == 1)
432 				*++p = i;
433 		break;
434 	UNARY
435 		cfoll(f,left(v));
436 		break;
437 	case CAT:
438 	case OR:
439 		cfoll(f,left(v));
440 		cfoll(f,right(v));
441 		break;
442 	case ZERO:
443 		break;
444 	default:	/* can't happen */
445 		FATAL("can't happen: unknown type %d in cfoll", type(v));
446 	}
447 }
448 
449 int first(Node *p)	/* collects initially active leaves of p into setvec */
450 			/* returns 0 if p matches empty string */
451 {
452 	int b, lp;
453 
454 	switch (type(p)) {
455 	ELEAF
456 	LEAF
457 		lp = info(p);	/* look for high-water mark of subscripts */
458 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
459 			resizesetvec(__func__);
460 		}
461 		if (type(p) == EMPTYRE) {
462 			setvec[lp] = 0;
463 			return(0);
464 		}
465 		if (setvec[lp] != 1) {
466 			setvec[lp] = 1;
467 			setcnt++;
468 		}
469 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
470 			return(0);		/* empty CCL */
471 		return(1);
472 	case PLUS:
473 		if (first(left(p)) == 0)
474 			return(0);
475 		return(1);
476 	case STAR:
477 	case QUEST:
478 		first(left(p));
479 		return(0);
480 	case CAT:
481 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
482 		return(1);
483 	case OR:
484 		b = first(right(p));
485 		if (first(left(p)) == 0 || b == 0) return(0);
486 		return(1);
487 	case ZERO:
488 		return 0;
489 	}
490 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
491 	return(-1);
492 }
493 
494 void follow(Node *v)	/* collects leaves that can follow v into setvec */
495 {
496 	Node *p;
497 
498 	if (type(v) == FINAL)
499 		return;
500 	p = parent(v);
501 	switch (type(p)) {
502 	case STAR:
503 	case PLUS:
504 		first(v);
505 		follow(p);
506 		return;
507 
508 	case OR:
509 	case QUEST:
510 		follow(p);
511 		return;
512 
513 	case CAT:
514 		if (v == left(p)) {	/* v is left child of p */
515 			if (first(right(p)) == 0) {
516 				follow(p);
517 				return;
518 			}
519 		} else		/* v is right child */
520 			follow(p);
521 		return;
522 	}
523 }
524 
525 int member(int c, const char *sarg)	/* is c in s? */
526 {
527 	const uschar *s = (const uschar *) sarg;
528 
529 	while (*s)
530 		if (c == *s++)
531 			return(1);
532 	return(0);
533 }
534 
535 int match(fa *f, const char *p0)	/* shortest match ? */
536 {
537 	int s, ns;
538 	const uschar *p = (const uschar *) p0;
539 
540 	s = f->initstat;
541 	assert (s < f->state_count);
542 
543 	if (f->out[s])
544 		return(1);
545 	do {
546 		/* assert(*p < NCHARS); */
547 		if ((ns = f->gototab[s][*p]) != 0)
548 			s = ns;
549 		else
550 			s = cgoto(f, s, *p);
551 		if (f->out[s])
552 			return(1);
553 	} while (*p++ != 0);
554 	return(0);
555 }
556 
557 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
558 {
559 	int s, ns;
560 	const uschar *p = (const uschar *) p0;
561 	const uschar *q;
562 
563 	s = f->initstat;
564 	assert(s < f->state_count);
565 
566 	patbeg = (const char *)p;
567 	patlen = -1;
568 	do {
569 		q = p;
570 		do {
571 			if (f->out[s])		/* final state */
572 				patlen = q-p;
573 			/* assert(*q < NCHARS); */
574 			if ((ns = f->gototab[s][*q]) != 0)
575 				s = ns;
576 			else
577 				s = cgoto(f, s, *q);
578 
579 			assert(s < f->state_count);
580 
581 			if (s == 1) {	/* no transition */
582 				if (patlen >= 0) {
583 					patbeg = (const char *) p;
584 					return(1);
585 				}
586 				else
587 					goto nextin;	/* no match */
588 			}
589 		} while (*q++ != 0);
590 		if (f->out[s])
591 			patlen = q-p-1;	/* don't count $ */
592 		if (patlen >= 0) {
593 			patbeg = (const char *) p;
594 			return(1);
595 		}
596 	nextin:
597 		s = 2;
598 	} while (*p++);
599 	return (0);
600 }
601 
602 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
603 {
604 	int s, ns;
605 	const uschar *p = (const uschar *) p0;
606 	const uschar *q;
607 
608 	s = f->initstat;
609 	assert(s < f->state_count);
610 
611 	patbeg = (const char *)p;
612 	patlen = -1;
613 	while (*p) {
614 		q = p;
615 		do {
616 			if (f->out[s])		/* final state */
617 				patlen = q-p;
618 			/* assert(*q < NCHARS); */
619 			if ((ns = f->gototab[s][*q]) != 0)
620 				s = ns;
621 			else
622 				s = cgoto(f, s, *q);
623 			if (s == 1) {	/* no transition */
624 				if (patlen > 0) {
625 					patbeg = (const char *) p;
626 					return(1);
627 				} else
628 					goto nnextin;	/* no nonempty match */
629 			}
630 		} while (*q++ != 0);
631 		if (f->out[s])
632 			patlen = q-p-1;	/* don't count $ */
633 		if (patlen > 0 ) {
634 			patbeg = (const char *) p;
635 			return(1);
636 		}
637 	nnextin:
638 		s = 2;
639 		p++;
640 	}
641 	return (0);
642 }
643 
644 
645 /*
646  * NAME
647  *     fnematch
648  *
649  * DESCRIPTION
650  *     A stream-fed version of nematch which transfers characters to a
651  *     null-terminated buffer. All characters up to and including the last
652  *     character of the matching text or EOF are placed in the buffer. If
653  *     a match is found, patbeg and patlen are set appropriately.
654  *
655  * RETURN VALUES
656  *     false    No match found.
657  *     true     Match found.
658  */
659 
660 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
661 {
662 	char *buf = *pbuf;
663 	int bufsize = *pbufsize;
664 	int c, i, j, k, ns, s;
665 
666 	s = pfa->initstat;
667 	patlen = 0;
668 
669 	/*
670 	 * All indices relative to buf.
671 	 * i <= j <= k <= bufsize
672 	 *
673 	 * i: origin of active substring
674 	 * j: current character
675 	 * k: destination of next getc()
676 	 */
677 	i = -1, k = 0;
678         do {
679 		j = i++;
680 		do {
681 			if (++j == k) {
682 				if (k == bufsize)
683 					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
684 						FATAL("stream '%.30s...' too long", buf);
685 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
686 			}
687 			c = buf[j];
688 			/* assert(c < NCHARS); */
689 
690 			if ((ns = pfa->gototab[s][c]) != 0)
691 				s = ns;
692 			else
693 				s = cgoto(pfa, s, c);
694 
695 			if (pfa->out[s]) {	/* final state */
696 				patlen = j - i + 1;
697 				if (c == 0)	/* don't count $ */
698 					patlen--;
699 			}
700 		} while (buf[j] && s != 1);
701 		s = 2;
702 	} while (buf[i] && !patlen);
703 
704 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
705 	*pbuf = buf;
706 	*pbufsize = bufsize;
707 
708 	if (patlen) {
709 		patbeg = (char *) buf + i;
710 		/*
711 		 * Under no circumstances is the last character fed to
712 		 * the automaton part of the match. It is EOF's nullbyte,
713 		 * or it sent the automaton into a state with no further
714 		 * transitions available (s==1), or both. Room for a
715 		 * terminating nullbyte is guaranteed.
716 		 *
717 		 * ungetc any chars after the end of matching text
718 		 * (except for EOF's nullbyte, if present) and null
719 		 * terminate the buffer.
720 		 */
721 		do
722 			if (buf[--k] && ungetc(buf[k], f) == EOF)
723 				FATAL("unable to ungetc '%c'", buf[k]);
724 		while (k > i + patlen);
725 		buf[k] = '\0';
726 		return true;
727 	}
728 	else
729 		return false;
730 }
731 
732 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
733 {			/* uses relex() to scan regular expression */
734 	Node *np;
735 
736 	DPRINTF("reparse <%s>\n", p);
737 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
738 	rtok = relex();
739 	/* GNU compatibility: an empty regexp matches anything */
740 	if (rtok == '\0') {
741 		/* FATAL("empty regular expression"); previous */
742 		return(op2(EMPTYRE, NIL, NIL));
743 	}
744 	np = regexp();
745 	if (rtok != '\0')
746 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
747 	return(np);
748 }
749 
750 Node *regexp(void)	/* top-level parse of reg expr */
751 {
752 	return (alt(concat(primary())));
753 }
754 
755 Node *primary(void)
756 {
757 	Node *np;
758 	int savelastatom;
759 
760 	switch (rtok) {
761 	case CHAR:
762 		lastatom = starttok;
763 		np = op2(CHAR, NIL, itonp(rlxval));
764 		rtok = relex();
765 		return (unary(np));
766 	case ALL:
767 		rtok = relex();
768 		return (unary(op2(ALL, NIL, NIL)));
769 	case EMPTYRE:
770 		rtok = relex();
771 		return (unary(op2(EMPTYRE, NIL, NIL)));
772 	case DOT:
773 		lastatom = starttok;
774 		rtok = relex();
775 		return (unary(op2(DOT, NIL, NIL)));
776 	case CCL:
777 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
778 		lastatom = starttok;
779 		rtok = relex();
780 		return (unary(np));
781 	case NCCL:
782 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
783 		lastatom = starttok;
784 		rtok = relex();
785 		return (unary(np));
786 	case '^':
787 		rtok = relex();
788 		return (unary(op2(CHAR, NIL, itonp(HAT))));
789 	case '$':
790 		rtok = relex();
791 		return (unary(op2(CHAR, NIL, NIL)));
792 	case '(':
793 		lastatom = starttok;
794 		savelastatom = starttok - basestr; /* Retain over recursion */
795 		rtok = relex();
796 		if (rtok == ')') {	/* special pleading for () */
797 			rtok = relex();
798 			return unary(op2(CCL, NIL, (Node *) tostring("")));
799 		}
800 		np = regexp();
801 		if (rtok == ')') {
802 			lastatom = basestr + savelastatom; /* Restore */
803 			rtok = relex();
804 			return (unary(np));
805 		}
806 		else
807 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
808 	default:
809 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
810 	}
811 	return 0;	/*NOTREACHED*/
812 }
813 
814 Node *concat(Node *np)
815 {
816 	switch (rtok) {
817 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
818 		return (concat(op2(CAT, np, primary())));
819 	case EMPTYRE:
820 		rtok = relex();
821 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
822 				primary())));
823 	}
824 	return (np);
825 }
826 
827 Node *alt(Node *np)
828 {
829 	if (rtok == OR) {
830 		rtok = relex();
831 		return (alt(op2(OR, np, concat(primary()))));
832 	}
833 	return (np);
834 }
835 
836 Node *unary(Node *np)
837 {
838 	switch (rtok) {
839 	case STAR:
840 		rtok = relex();
841 		return (unary(op2(STAR, np, NIL)));
842 	case PLUS:
843 		rtok = relex();
844 		return (unary(op2(PLUS, np, NIL)));
845 	case QUEST:
846 		rtok = relex();
847 		return (unary(op2(QUEST, np, NIL)));
848 	case ZERO:
849 		rtok = relex();
850 		return (unary(op2(ZERO, np, NIL)));
851 	default:
852 		return (np);
853 	}
854 }
855 
856 /*
857  * Character class definitions conformant to the POSIX locale as
858  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
859  * and operating character sets are both ASCII (ISO646) or supersets
860  * thereof.
861  *
862  * Note that to avoid overflowing the temporary buffer used in
863  * relex(), the expanded character class (prior to range expansion)
864  * must be less than twice the size of their full name.
865  */
866 
867 /* Because isblank doesn't show up in any of the header files on any
868  * system i use, it's defined here.  if some other locale has a richer
869  * definition of "blank", define HAS_ISBLANK and provide your own
870  * version.
871  * the parentheses here are an attempt to find a path through the maze
872  * of macro definition and/or function and/or version provided.  thanks
873  * to nelson beebe for the suggestion; let's see if it works everywhere.
874  */
875 
876 /* #define HAS_ISBLANK */
877 #ifndef HAS_ISBLANK
878 
879 int (xisblank)(int c)
880 {
881 	return c==' ' || c=='\t';
882 }
883 
884 #endif
885 
886 static const struct charclass {
887 	const char *cc_name;
888 	int cc_namelen;
889 	int (*cc_func)(int);
890 } charclasses[] = {
891 	{ "alnum",	5,	isalnum },
892 	{ "alpha",	5,	isalpha },
893 #ifndef HAS_ISBLANK
894 	{ "blank",	5,	xisblank },
895 #else
896 	{ "blank",	5,	isblank },
897 #endif
898 	{ "cntrl",	5,	iscntrl },
899 	{ "digit",	5,	isdigit },
900 	{ "graph",	5,	isgraph },
901 	{ "lower",	5,	islower },
902 	{ "print",	5,	isprint },
903 	{ "punct",	5,	ispunct },
904 	{ "space",	5,	isspace },
905 	{ "upper",	5,	isupper },
906 	{ "xdigit",	6,	isxdigit },
907 	{ NULL,		0,	NULL },
908 };
909 
910 #define REPEAT_SIMPLE		0
911 #define REPEAT_PLUS_APPENDED	1
912 #define REPEAT_WITH_Q		2
913 #define REPEAT_ZERO		3
914 
915 static int
916 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
917 	       int atomlen, int firstnum, int secondnum, int special_case)
918 {
919 	int i, j;
920 	uschar *buf = 0;
921 	int ret = 1;
922 	int init_q = (firstnum == 0);		/* first added char will be ? */
923 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
924 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
925 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
926 	int size = prefix_length +  suffix_length;
927 
928 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
929 		size += atomlen*(firstnum-1);
930 	}
931 
932 	/* Adjust size of buffer for special cases */
933 	if (special_case == REPEAT_PLUS_APPENDED) {
934 		size++;		/* for the final + */
935 	} else if (special_case == REPEAT_WITH_Q) {
936 		size += init_q + (atomlen+1)* n_q_reps;
937 	} else if (special_case == REPEAT_ZERO) {
938 		size += 2;	/* just a null ERE: () */
939 	}
940 	if ((buf = malloc(size + 1)) == NULL)
941 		FATAL("out of space in reg expr %.10s..", lastre);
942 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
943 	j = prefix_length;
944 	if (special_case == REPEAT_ZERO) {
945 		j -= atomlen;
946 		buf[j++] = '(';
947 		buf[j++] = ')';
948 	}
949 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
950 		memcpy(&buf[j], atom, atomlen);
951 		j += atomlen;
952 	}
953 	if (special_case == REPEAT_PLUS_APPENDED) {
954 		buf[j++] = '+';
955 	} else if (special_case == REPEAT_WITH_Q) {
956 		if (init_q)
957 			buf[j++] = '?';
958 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
959 			memcpy(&buf[j], atom, atomlen);
960 			j += atomlen;
961 			buf[j++] = '?';
962 		}
963 	}
964 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
965 	if (special_case == REPEAT_ZERO) {
966 		buf[j+suffix_length] = '\0';
967 	} else {
968 		buf[size] = '\0';
969 	}
970 	/* free old basestr */
971 	if (firstbasestr != basestr) {
972 		if (basestr)
973 			xfree(basestr);
974 	}
975 	basestr = buf;
976 	prestr  = buf + prefix_length;
977 	if (special_case == REPEAT_ZERO) {
978 		prestr  -= atomlen;
979 		ret++;
980 	}
981 	return ret;
982 }
983 
984 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
985 		  int atomlen, int firstnum, int secondnum)
986 {
987 	/*
988 	   In general, the repetition specifier or "bound" is replaced here
989 	   by an equivalent ERE string, repeating the immediately previous atom
990 	   and appending ? and + as needed. Note that the first copy of the
991 	   atom is left in place, except in the special_case of a zero-repeat
992 	   (i.e., {0}).
993 	 */
994 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
995 		if (firstnum < 2) {
996 			/* 0 or 1: should be handled before you get here */
997 			FATAL("internal error");
998 		} else {
999 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1000 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1001 		}
1002 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1003 		if (firstnum == 0) {	/* {0} or {0,0} */
1004 			/* This case is unusual because the resulting
1005 			   replacement string might actually be SMALLER than
1006 			   the original ERE */
1007 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1008 					firstnum, secondnum, REPEAT_ZERO);
1009 		} else {		/* (firstnum >= 1) */
1010 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1011 					firstnum, secondnum, REPEAT_SIMPLE);
1012 		}
1013 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1014 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1015 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1016 					firstnum, secondnum, REPEAT_WITH_Q);
1017 	} else {	/* Error - shouldn't be here (n>m) */
1018 		FATAL("internal error");
1019 	}
1020 	return 0;
1021 }
1022 
1023 int relex(void)		/* lexical analyzer for reparse */
1024 {
1025 	int c, n;
1026 	int cflag;
1027 	static uschar *buf = NULL;
1028 	static int bufsz = 100;
1029 	uschar *bp;
1030 	const struct charclass *cc;
1031 	int i;
1032 	int num, m;
1033 	bool commafound, digitfound;
1034 	const uschar *startreptok;
1035 	static int parens = 0;
1036 
1037 rescan:
1038 	starttok = prestr;
1039 
1040 	switch (c = *prestr++) {
1041 	case '|': return OR;
1042 	case '*': return STAR;
1043 	case '+': return PLUS;
1044 	case '?': return QUEST;
1045 	case '.': return DOT;
1046 	case '\0': prestr--; return '\0';
1047 	case '^':
1048 	case '$':
1049 		return c;
1050 	case '(':
1051 		parens++;
1052 		return c;
1053 	case ')':
1054 		if (parens) {
1055 			parens--;
1056 			return c;
1057 		}
1058 		/* unmatched close parenthesis; per POSIX, treat as literal */
1059 		rlxval = c;
1060 		return CHAR;
1061 	case '\\':
1062 		rlxval = quoted(&prestr);
1063 		return CHAR;
1064 	default:
1065 		rlxval = c;
1066 		return CHAR;
1067 	case '[':
1068 		if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1069 			FATAL("out of space in reg expr %.10s..", lastre);
1070 		bp = buf;
1071 		if (*prestr == '^') {
1072 			cflag = 1;
1073 			prestr++;
1074 		}
1075 		else
1076 			cflag = 0;
1077 		n = 2 * strlen((const char *) prestr)+1;
1078 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1079 			FATAL("out of space for reg expr %.10s...", lastre);
1080 		for (; ; ) {
1081 			if ((c = *prestr++) == '\\') {
1082 				*bp++ = '\\';
1083 				if ((c = *prestr++) == '\0')
1084 					FATAL("nonterminated character class %.20s...", lastre);
1085 				*bp++ = c;
1086 			/* } else if (c == '\n') { */
1087 			/* 	FATAL("newline in character class %.20s...", lastre); */
1088 			} else if (c == '[' && *prestr == ':') {
1089 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1090 				for (cc = charclasses; cc->cc_name; cc++)
1091 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1092 						break;
1093 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1094 				    prestr[2 + cc->cc_namelen] == ']') {
1095 					prestr += cc->cc_namelen + 3;
1096 					/*
1097 					 * BUG: We begin at 1, instead of 0, since we
1098 					 * would otherwise prematurely terminate the
1099 					 * string for classes like [[:cntrl:]]. This
1100 					 * means that we can't match the NUL character,
1101 					 * not without first adapting the entire
1102 					 * program to track each string's length.
1103 					 */
1104 					for (i = 1; i <= UCHAR_MAX; i++) {
1105 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1106 						    FATAL("out of space for reg expr %.10s...", lastre);
1107 						if (cc->cc_func(i)) {
1108 							/* escape backslash */
1109 							if (i == '\\') {
1110 								*bp++ = '\\';
1111 								n++;
1112 							}
1113 
1114 							*bp++ = i;
1115 							n++;
1116 						}
1117 					}
1118 				} else
1119 					*bp++ = c;
1120 			} else if (c == '[' && *prestr == '.') {
1121 				char collate_char;
1122 				prestr++;
1123 				collate_char = *prestr++;
1124 				if (*prestr == '.' && prestr[1] == ']') {
1125 					prestr += 2;
1126 					/* Found it: map via locale TBD: for
1127 					   now, simply return this char.  This
1128 					   is sufficient to pass conformance
1129 					   test awk.ex 156
1130 					 */
1131 					if (*prestr == ']') {
1132 						prestr++;
1133 						rlxval = collate_char;
1134 						return CHAR;
1135 					}
1136 				}
1137 			} else if (c == '[' && *prestr == '=') {
1138 				char equiv_char;
1139 				prestr++;
1140 				equiv_char = *prestr++;
1141 				if (*prestr == '=' && prestr[1] == ']') {
1142 					prestr += 2;
1143 					/* Found it: map via locale TBD: for now
1144 					   simply return this char. This is
1145 					   sufficient to pass conformance test
1146 					   awk.ex 156
1147 					 */
1148 					if (*prestr == ']') {
1149 						prestr++;
1150 						rlxval = equiv_char;
1151 						return CHAR;
1152 					}
1153 				}
1154 			} else if (c == '\0') {
1155 				FATAL("nonterminated character class %.20s", lastre);
1156 			} else if (bp == buf) {	/* 1st char is special */
1157 				*bp++ = c;
1158 			} else if (c == ']') {
1159 				*bp++ = 0;
1160 				rlxstr = (uschar *) tostring((char *) buf);
1161 				if (cflag == 0)
1162 					return CCL;
1163 				else
1164 					return NCCL;
1165 			} else
1166 				*bp++ = c;
1167 		}
1168 		break;
1169 	case '{':
1170 		if (isdigit(*(prestr))) {
1171 			num = 0;	/* Process as a repetition */
1172 			n = -1; m = -1;
1173 			commafound = false;
1174 			digitfound = false;
1175 			startreptok = prestr-1;
1176 			/* Remember start of previous atom here ? */
1177 		} else {        	/* just a { char, not a repetition */
1178 			rlxval = c;
1179 			return CHAR;
1180                 }
1181 		for (; ; ) {
1182 			if ((c = *prestr++) == '}') {
1183 				if (commafound) {
1184 					if (digitfound) { /* {n,m} */
1185 						m = num;
1186 						if (m < n)
1187 							FATAL("illegal repetition expression: class %.20s",
1188 								lastre);
1189 						if (n == 0 && m == 1) {
1190 							return QUEST;
1191 						}
1192 					} else {	/* {n,} */
1193 						if (n == 0)
1194 							return STAR;
1195 						else if (n == 1)
1196 							return PLUS;
1197 					}
1198 				} else {
1199 					if (digitfound) { /* {n} same as {n,n} */
1200 						n = num;
1201 						m = num;
1202 					} else {	/* {} */
1203 						FATAL("illegal repetition expression: class %.20s",
1204 							lastre);
1205 					}
1206 				}
1207 				if (repeat(starttok, prestr-starttok, lastatom,
1208 					   startreptok - lastatom, n, m) > 0) {
1209 					if (n == 0 && m == 0) {
1210 						return ZERO;
1211 					}
1212 					/* must rescan input for next token */
1213 					goto rescan;
1214 				}
1215 				/* Failed to replace: eat up {...} characters
1216 				   and treat like just PLUS */
1217 				return PLUS;
1218 			} else if (c == '\0') {
1219 				FATAL("nonterminated character class %.20s",
1220 					lastre);
1221 			} else if (isdigit(c)) {
1222 				num = 10 * num + c - '0';
1223 				digitfound = true;
1224 			} else if (c == ',') {
1225 				if (commafound)
1226 					FATAL("illegal repetition expression: class %.20s",
1227 						lastre);
1228 				/* looking for {n,} or {n,m} */
1229 				commafound = true;
1230 				n = num;
1231 				digitfound = false; /* reset */
1232 				num = 0;
1233 			} else {
1234 				FATAL("illegal repetition expression: class %.20s",
1235 					lastre);
1236 			}
1237 		}
1238 		break;
1239 	}
1240 }
1241 
1242 int cgoto(fa *f, int s, int c)
1243 {
1244 	int *p, *q;
1245 	int i, j, k;
1246 
1247 	assert(c == HAT || c < NCHARS);
1248 	while (f->accept >= maxsetvec) {	/* guessing here! */
1249 		resizesetvec(__func__);
1250 	}
1251 	for (i = 0; i <= f->accept; i++)
1252 		setvec[i] = 0;
1253 	setcnt = 0;
1254 	resize_state(f, s);
1255 	/* compute positions of gototab[s,c] into setvec */
1256 	p = f->posns[s];
1257 	for (i = 1; i <= *p; i++) {
1258 		if ((k = f->re[p[i]].ltype) != FINAL) {
1259 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1260 			 || (k == DOT && c != 0 && c != HAT)
1261 			 || (k == ALL && c != 0)
1262 			 || (k == EMPTYRE && c != 0)
1263 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1264 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1265 				q = f->re[p[i]].lfollow;
1266 				for (j = 1; j <= *q; j++) {
1267 					if (q[j] >= maxsetvec) {
1268 						resizesetvec(__func__);
1269 					}
1270 					if (setvec[q[j]] == 0) {
1271 						setcnt++;
1272 						setvec[q[j]] = 1;
1273 					}
1274 				}
1275 			}
1276 		}
1277 	}
1278 	/* determine if setvec is a previous state */
1279 	tmpset[0] = setcnt;
1280 	j = 1;
1281 	for (i = f->accept; i >= 0; i--)
1282 		if (setvec[i]) {
1283 			tmpset[j++] = i;
1284 		}
1285 	resize_state(f, f->curstat > s ? f->curstat : s);
1286 	/* tmpset == previous state? */
1287 	for (i = 1; i <= f->curstat; i++) {
1288 		p = f->posns[i];
1289 		if ((k = tmpset[0]) != p[0])
1290 			goto different;
1291 		for (j = 1; j <= k; j++)
1292 			if (tmpset[j] != p[j])
1293 				goto different;
1294 		/* setvec is state i */
1295 		if (c != HAT)
1296 			f->gototab[s][c] = i;
1297 		return i;
1298 	  different:;
1299 	}
1300 
1301 	/* add tmpset to current set of states */
1302 	++(f->curstat);
1303 	resize_state(f, f->curstat);
1304 	for (i = 0; i < NCHARS; i++)
1305 		f->gototab[f->curstat][i] = 0;
1306 	xfree(f->posns[f->curstat]);
1307 	p = intalloc(setcnt + 1, __func__);
1308 
1309 	f->posns[f->curstat] = p;
1310 	if (c != HAT)
1311 		f->gototab[s][c] = f->curstat;
1312 	for (i = 0; i <= setcnt; i++)
1313 		p[i] = tmpset[i];
1314 	if (setvec[f->accept])
1315 		f->out[f->curstat] = 1;
1316 	else
1317 		f->out[f->curstat] = 0;
1318 	return f->curstat;
1319 }
1320 
1321 
1322 void freefa(fa *f)	/* free a finite automaton */
1323 {
1324 	int i;
1325 
1326 	if (f == NULL)
1327 		return;
1328 	for (i = 0; i < f->state_count; i++)
1329 		xfree(f->gototab[i])
1330 	for (i = 0; i <= f->curstat; i++)
1331 		xfree(f->posns[i]);
1332 	for (i = 0; i <= f->accept; i++) {
1333 		xfree(f->re[i].lfollow);
1334 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1335 			xfree(f->re[i].lval.np);
1336 	}
1337 	xfree(f->restr);
1338 	xfree(f->out);
1339 	xfree(f->posns);
1340 	xfree(f->gototab);
1341 	xfree(f);
1342 }
1343