xref: /freebsd/contrib/one-true-awk/b.c (revision f39dd6a9)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #define	DEBUG
31 
32 #include <ctype.h>
33 #include <limits.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include "awk.h"
38 #include "awkgram.tab.h"
39 
40 #define MAXLIN 22
41 
42 #define type(v)		(v)->nobj	/* badly overloaded here */
43 #define info(v)		(v)->ntype	/* badly overloaded here */
44 #define left(v)		(v)->narg[0]
45 #define right(v)	(v)->narg[1]
46 #define parent(v)	(v)->nnext
47 
48 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
49 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
50 #define UNARY	case STAR: case PLUS: case QUEST:
51 
52 /* encoding in tree Nodes:
53 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
54 		left is index, right contains value or pointer to value
55 	unary (STAR, PLUS, QUEST): left is child, right is null
56 	binary (CAT, OR): left and right are children
57 	parent contains pointer to parent
58 */
59 
60 
61 int	*setvec;
62 int	*tmpset;
63 int	maxsetvec = 0;
64 
65 int	rtok;		/* next token in current re */
66 int	rlxval;
67 static const uschar	*rlxstr;
68 static const uschar	*prestr;	/* current position in current re */
69 static const uschar	*lastre;	/* origin of last re */
70 static const uschar	*lastatom;	/* origin of last Atom */
71 static const uschar	*starttok;
72 static const uschar 	*basestr;	/* starts with original, replaced during
73 				   repetition processing */
74 static const uschar 	*firstbasestr;
75 
76 static	int setcnt;
77 static	int poscnt;
78 
79 const char	*patbeg;
80 int	patlen;
81 
82 #define	NFA	128	/* cache this many dynamic fa's */
83 fa	*fatab[NFA];
84 int	nfatab	= 0;	/* entries in fatab */
85 
86 static int *
87 intalloc(size_t n, const char *f)
88 {
89 	int *p = (int *) calloc(n, sizeof(int));
90 	if (p == NULL)
91 		overflo(f);
92 	return p;
93 }
94 
95 static void
96 resizesetvec(const char *f)
97 {
98 	if (maxsetvec == 0)
99 		maxsetvec = MAXLIN;
100 	else
101 		maxsetvec *= 4;
102 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
103 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
104 	if (setvec == NULL || tmpset == NULL)
105 		overflo(f);
106 }
107 
108 static void
109 resize_state(fa *f, int state)
110 {
111 	unsigned int **p;
112 	uschar *p2;
113 	int **p3;
114 	int i, new_count;
115 
116 	if (++state < f->state_count)
117 		return;
118 
119 	new_count = state + 10; /* needs to be tuned */
120 
121 	p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
122 	if (p == NULL)
123 		goto out;
124 	f->gototab = p;
125 
126 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
127 	if (p2 == NULL)
128 		goto out;
129 	f->out = p2;
130 
131 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
132 	if (p3 == NULL)
133 		goto out;
134 	f->posns = p3;
135 
136 	for (i = f->state_count; i < new_count; ++i) {
137 		f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
138 		if (f->gototab[i] == NULL)
139 			goto out;
140 		f->out[i]  = 0;
141 		f->posns[i] = NULL;
142 	}
143 	f->state_count = new_count;
144 	return;
145 out:
146 	overflo(__func__);
147 }
148 
149 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
150 {
151 	int i, use, nuse;
152 	fa *pfa;
153 	static int now = 1;
154 
155 	if (setvec == NULL) {	/* first time through any RE */
156 		resizesetvec(__func__);
157 	}
158 
159 	if (compile_time != RUNNING)	/* a constant for sure */
160 		return mkdfa(s, anchor);
161 	for (i = 0; i < nfatab; i++)	/* is it there already? */
162 		if (fatab[i]->anchor == anchor
163 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
164 			fatab[i]->use = now++;
165 			return fatab[i];
166 		}
167 	pfa = mkdfa(s, anchor);
168 	if (nfatab < NFA) {	/* room for another */
169 		fatab[nfatab] = pfa;
170 		fatab[nfatab]->use = now++;
171 		nfatab++;
172 		return pfa;
173 	}
174 	use = fatab[0]->use;	/* replace least-recently used */
175 	nuse = 0;
176 	for (i = 1; i < nfatab; i++)
177 		if (fatab[i]->use < use) {
178 			use = fatab[i]->use;
179 			nuse = i;
180 		}
181 	freefa(fatab[nuse]);
182 	fatab[nuse] = pfa;
183 	pfa->use = now++;
184 	return pfa;
185 }
186 
187 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
188 				/* anchor = true for anchored matches, else false */
189 {
190 	Node *p, *p1;
191 	fa *f;
192 
193 	firstbasestr = (const uschar *) s;
194 	basestr = firstbasestr;
195 	p = reparse(s);
196 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
197 		/* put ALL STAR in front of reg.  exp. */
198 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
199 		/* put FINAL after reg.  exp. */
200 
201 	poscnt = 0;
202 	penter(p1);	/* enter parent pointers and leaf indices */
203 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
204 		overflo(__func__);
205 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
206 	cfoll(f, p1);	/* set up follow sets */
207 	freetr(p1);
208 	resize_state(f, 1);
209 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
210 	f->posns[1] = intalloc(1, __func__);
211 	*f->posns[1] = 0;
212 	f->initstat = makeinit(f, anchor);
213 	f->anchor = anchor;
214 	f->restr = (uschar *) tostring(s);
215 	if (firstbasestr != basestr) {
216 		if (basestr)
217 			xfree(basestr);
218 	}
219 	return f;
220 }
221 
222 int makeinit(fa *f, bool anchor)
223 {
224 	int i, k;
225 
226 	f->curstat = 2;
227 	f->out[2] = 0;
228 	k = *(f->re[0].lfollow);
229 	xfree(f->posns[2]);
230 	f->posns[2] = intalloc(k + 1,  __func__);
231 	for (i = 0; i <= k; i++) {
232 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
233 	}
234 	if ((f->posns[2])[1] == f->accept)
235 		f->out[2] = 1;
236 	for (i = 0; i < NCHARS; i++)
237 		f->gototab[2][i] = 0;
238 	f->curstat = cgoto(f, 2, HAT);
239 	if (anchor) {
240 		*f->posns[2] = k-1;	/* leave out position 0 */
241 		for (i = 0; i < k; i++) {
242 			(f->posns[0])[i] = (f->posns[2])[i];
243 		}
244 
245 		f->out[0] = f->out[2];
246 		if (f->curstat != 2)
247 			--(*f->posns[f->curstat]);
248 	}
249 	return f->curstat;
250 }
251 
252 void penter(Node *p)	/* set up parent pointers and leaf indices */
253 {
254 	switch (type(p)) {
255 	ELEAF
256 	LEAF
257 		info(p) = poscnt;
258 		poscnt++;
259 		break;
260 	UNARY
261 		penter(left(p));
262 		parent(left(p)) = p;
263 		break;
264 	case CAT:
265 	case OR:
266 		penter(left(p));
267 		penter(right(p));
268 		parent(left(p)) = p;
269 		parent(right(p)) = p;
270 		break;
271 	case ZERO:
272 		break;
273 	default:	/* can't happen */
274 		FATAL("can't happen: unknown type %d in penter", type(p));
275 		break;
276 	}
277 }
278 
279 void freetr(Node *p)	/* free parse tree */
280 {
281 	switch (type(p)) {
282 	ELEAF
283 	LEAF
284 		xfree(p);
285 		break;
286 	UNARY
287 	case ZERO:
288 		freetr(left(p));
289 		xfree(p);
290 		break;
291 	case CAT:
292 	case OR:
293 		freetr(left(p));
294 		freetr(right(p));
295 		xfree(p);
296 		break;
297 	default:	/* can't happen */
298 		FATAL("can't happen: unknown type %d in freetr", type(p));
299 		break;
300 	}
301 }
302 
303 /* in the parsing of regular expressions, metacharacters like . have */
304 /* to be seen literally;  \056 is not a metacharacter. */
305 
306 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
307 {			/* only pick up one 8-bit byte (2 chars) */
308 	const uschar *p;
309 	int n = 0;
310 	int i;
311 
312 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
313 		if (isdigit(*p))
314 			n = 16 * n + *p - '0';
315 		else if (*p >= 'a' && *p <= 'f')
316 			n = 16 * n + *p - 'a' + 10;
317 		else if (*p >= 'A' && *p <= 'F')
318 			n = 16 * n + *p - 'A' + 10;
319 	}
320 	*pp = p;
321 	return n;
322 }
323 
324 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
325 
326 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
327 			/* and increment *pp */
328 {
329 	const uschar *p = *pp;
330 	int c;
331 
332 	if ((c = *p++) == 't')
333 		c = '\t';
334 	else if (c == 'n')
335 		c = '\n';
336 	else if (c == 'f')
337 		c = '\f';
338 	else if (c == 'r')
339 		c = '\r';
340 	else if (c == 'b')
341 		c = '\b';
342 	else if (c == 'v')
343 		c = '\v';
344 	else if (c == 'a')
345 		c = '\a';
346 	else if (c == '\\')
347 		c = '\\';
348 	else if (c == 'x') {	/* hexadecimal goo follows */
349 		c = hexstr(&p);	/* this adds a null if number is invalid */
350 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
351 		int n = c - '0';
352 		if (isoctdigit(*p)) {
353 			n = 8 * n + *p++ - '0';
354 			if (isoctdigit(*p))
355 				n = 8 * n + *p++ - '0';
356 		}
357 		c = n;
358 	} /* else */
359 		/* c = c; */
360 	*pp = p;
361 	return c;
362 }
363 
364 static int collate_range_cmp(int a, int b)
365 {
366 	static char s[2][2];
367 
368 	if ((uschar)a == (uschar)b)
369 		return 0;
370 	s[0][0] = a;
371 	s[1][0] = b;
372 	return (strcoll(s[0], s[1]));
373 }
374 
375 char *cclenter(const char *argp)	/* add a character class */
376 {
377 	int i, c, c2, j;
378 	const uschar *op, *p = (const uschar *) argp;
379 	uschar *bp;
380 	static uschar *buf = NULL;
381 	static int bufsz = 100;
382 
383 	op = p;
384 	if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
385 		FATAL("out of space for character class [%.10s...] 1", p);
386 	bp = buf;
387 	for (i = 0; (c = *p++) != 0; ) {
388 		if (c == '\\') {
389 			c = quoted(&p);
390 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
391 			if (*p != 0) {
392 				c = bp[-1];
393 				c2 = *p++;
394 				if (c2 == '\\')
395 					c2 = quoted(&p);
396 				if (collate_range_cmp(c, c2) > 0) {
397 					bp--;
398 					i--;
399 					continue;
400 				}
401 				for (j = 0; j < NCHARS; j++) {
402 					if ((collate_range_cmp(c, j) > 0) ||
403 					    collate_range_cmp(j, c2) > 0)
404 						continue;
405 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
406 						FATAL("out of space for character class [%.10s...] 2", p);
407 					*bp++ = j;
408 					i++;
409 				}
410 				continue;
411 			}
412 		}
413 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
414 			FATAL("out of space for character class [%.10s...] 3", p);
415 		*bp++ = c;
416 		i++;
417 	}
418 	*bp = 0;
419 	DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
420 	xfree(op);
421 	return (char *) tostring((char *) buf);
422 }
423 
424 void overflo(const char *s)
425 {
426 	FATAL("regular expression too big: out of space in %.30s...", s);
427 }
428 
429 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
430 {
431 	int i;
432 	int *p;
433 
434 	switch (type(v)) {
435 	ELEAF
436 	LEAF
437 		f->re[info(v)].ltype = type(v);
438 		f->re[info(v)].lval.np = right(v);
439 		while (f->accept >= maxsetvec) {	/* guessing here! */
440 			resizesetvec(__func__);
441 		}
442 		for (i = 0; i <= f->accept; i++)
443 			setvec[i] = 0;
444 		setcnt = 0;
445 		follow(v);	/* computes setvec and setcnt */
446 		p = intalloc(setcnt + 1, __func__);
447 		f->re[info(v)].lfollow = p;
448 		*p = setcnt;
449 		for (i = f->accept; i >= 0; i--)
450 			if (setvec[i] == 1)
451 				*++p = i;
452 		break;
453 	UNARY
454 		cfoll(f,left(v));
455 		break;
456 	case CAT:
457 	case OR:
458 		cfoll(f,left(v));
459 		cfoll(f,right(v));
460 		break;
461 	case ZERO:
462 		break;
463 	default:	/* can't happen */
464 		FATAL("can't happen: unknown type %d in cfoll", type(v));
465 	}
466 }
467 
468 int first(Node *p)	/* collects initially active leaves of p into setvec */
469 			/* returns 0 if p matches empty string */
470 {
471 	int b, lp;
472 
473 	switch (type(p)) {
474 	ELEAF
475 	LEAF
476 		lp = info(p);	/* look for high-water mark of subscripts */
477 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
478 			resizesetvec(__func__);
479 		}
480 		if (type(p) == EMPTYRE) {
481 			setvec[lp] = 0;
482 			return(0);
483 		}
484 		if (setvec[lp] != 1) {
485 			setvec[lp] = 1;
486 			setcnt++;
487 		}
488 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
489 			return(0);		/* empty CCL */
490 		return(1);
491 	case PLUS:
492 		if (first(left(p)) == 0)
493 			return(0);
494 		return(1);
495 	case STAR:
496 	case QUEST:
497 		first(left(p));
498 		return(0);
499 	case CAT:
500 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
501 		return(1);
502 	case OR:
503 		b = first(right(p));
504 		if (first(left(p)) == 0 || b == 0) return(0);
505 		return(1);
506 	case ZERO:
507 		return 0;
508 	}
509 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
510 	return(-1);
511 }
512 
513 void follow(Node *v)	/* collects leaves that can follow v into setvec */
514 {
515 	Node *p;
516 
517 	if (type(v) == FINAL)
518 		return;
519 	p = parent(v);
520 	switch (type(p)) {
521 	case STAR:
522 	case PLUS:
523 		first(v);
524 		follow(p);
525 		return;
526 
527 	case OR:
528 	case QUEST:
529 		follow(p);
530 		return;
531 
532 	case CAT:
533 		if (v == left(p)) {	/* v is left child of p */
534 			if (first(right(p)) == 0) {
535 				follow(p);
536 				return;
537 			}
538 		} else		/* v is right child */
539 			follow(p);
540 		return;
541 	}
542 }
543 
544 int member(int c, const char *sarg)	/* is c in s? */
545 {
546 	const uschar *s = (const uschar *) sarg;
547 
548 	while (*s)
549 		if (c == *s++)
550 			return(1);
551 	return(0);
552 }
553 
554 int match(fa *f, const char *p0)	/* shortest match ? */
555 {
556 	int s, ns;
557 	const uschar *p = (const uschar *) p0;
558 
559 	s = f->initstat;
560 	assert (s < f->state_count);
561 
562 	if (f->out[s])
563 		return(1);
564 	do {
565 		/* assert(*p < NCHARS); */
566 		if ((ns = f->gototab[s][*p]) != 0)
567 			s = ns;
568 		else
569 			s = cgoto(f, s, *p);
570 		if (f->out[s])
571 			return(1);
572 	} while (*p++ != 0);
573 	return(0);
574 }
575 
576 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
577 {
578 	int s, ns;
579 	const uschar *p = (const uschar *) p0;
580 	const uschar *q;
581 
582 	s = f->initstat;
583 	assert(s < f->state_count);
584 
585 	patbeg = (const char *)p;
586 	patlen = -1;
587 	do {
588 		q = p;
589 		do {
590 			if (f->out[s])		/* final state */
591 				patlen = q-p;
592 			/* assert(*q < NCHARS); */
593 			if ((ns = f->gototab[s][*q]) != 0)
594 				s = ns;
595 			else
596 				s = cgoto(f, s, *q);
597 
598 			assert(s < f->state_count);
599 
600 			if (s == 1) {	/* no transition */
601 				if (patlen >= 0) {
602 					patbeg = (const char *) p;
603 					return(1);
604 				}
605 				else
606 					goto nextin;	/* no match */
607 			}
608 		} while (*q++ != 0);
609 		if (f->out[s])
610 			patlen = q-p-1;	/* don't count $ */
611 		if (patlen >= 0) {
612 			patbeg = (const char *) p;
613 			return(1);
614 		}
615 	nextin:
616 		s = 2;
617 #if 0 /* XXX */
618 		if (f->reset) {
619 			for (i = 2; i <= f->curstat; i++)
620 n				xfree(f->posns[i]);
621 			k = *f->posns[0];
622 			if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
623 				overflo("out of space in pmatch");
624 			for (i = 0; i <= k; i++)
625 				(f->posns[2])[i] = (f->posns[0])[i];
626 			f->initstat = f->curstat = 2;
627 			f->out[2] = f->out[0];
628 			for (i = 0; i < NCHARS; i++)
629 				f->gototab[2][i] = 0;
630 		}
631 #endif
632 	} while (*p++ != 0);
633 	return (0);
634 }
635 
636 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
637 {
638 	int s, ns;
639 	const uschar *p = (const uschar *) p0;
640 	const uschar *q;
641 
642 	s = f->initstat;
643 	assert(s < f->state_count);
644 
645 	patbeg = (const char *)p;
646 	patlen = -1;
647 	while (*p) {
648 		q = p;
649 		do {
650 			if (f->out[s])		/* final state */
651 				patlen = q-p;
652 			/* assert(*q < NCHARS); */
653 			if ((ns = f->gototab[s][*q]) != 0)
654 				s = ns;
655 			else
656 				s = cgoto(f, s, *q);
657 			if (s == 1) {	/* no transition */
658 				if (patlen > 0) {
659 					patbeg = (const char *) p;
660 					return(1);
661 				} else
662 					goto nnextin;	/* no nonempty match */
663 			}
664 		} while (*q++ != 0);
665 		if (f->out[s])
666 			patlen = q-p-1;	/* don't count $ */
667 		if (patlen > 0 ) {
668 			patbeg = (const char *) p;
669 			return(1);
670 		}
671 	nnextin:
672 		s = 2;
673 #if 0 /* XXX */
674 		if (f->reset) {
675 			for (i = 2; i <= f->curstat; i++)
676 				xfree(f->posns[i]);
677 			k = *f->posns[0];
678 			if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
679 				overflo("out of state space");
680 			for (i = 0; i <= k; i++)
681 				(f->posns[2])[i] = (f->posns[0])[i];
682 			f->initstat = f->curstat = 2;
683 			f->out[2] = f->out[0];
684 			for (i = 0; i < NCHARS; i++)
685 				f->gototab[2][i] = 0;
686 		}
687 #endif
688 		p++;
689 	}
690 	return (0);
691 }
692 
693 
694 /*
695  * NAME
696  *     fnematch
697  *
698  * DESCRIPTION
699  *     A stream-fed version of nematch which transfers characters to a
700  *     null-terminated buffer. All characters up to and including the last
701  *     character of the matching text or EOF are placed in the buffer. If
702  *     a match is found, patbeg and patlen are set appropriately.
703  *
704  * RETURN VALUES
705  *     false    No match found.
706  *     true     Match found.
707  */
708 
709 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
710 {
711 	char *buf = *pbuf;
712 	int bufsize = *pbufsize;
713 	int c, i, j, k, ns, s;
714 
715 	s = pfa->initstat;
716 	patlen = 0;
717 
718 	/*
719 	 * All indices relative to buf.
720 	 * i <= j <= k <= bufsize
721 	 *
722 	 * i: origin of active substring
723 	 * j: current character
724 	 * k: destination of next getc()
725 	 */
726 	i = -1, k = 0;
727         do {
728 		j = i++;
729 		do {
730 			if (++j == k) {
731 				if (k == bufsize)
732 					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
733 						FATAL("stream '%.30s...' too long", buf);
734 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
735 			}
736 			c = (uschar)buf[j];
737 			/* assert(c < NCHARS); */
738 
739 			if ((ns = pfa->gototab[s][c]) != 0)
740 				s = ns;
741 			else
742 				s = cgoto(pfa, s, c);
743 
744 			if (pfa->out[s]) {	/* final state */
745 				patlen = j - i + 1;
746 				if (c == 0)	/* don't count $ */
747 					patlen--;
748 			}
749 		} while (buf[j] && s != 1);
750 		s = 2;
751 	} while (buf[i] && !patlen);
752 
753 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
754 	*pbuf = buf;
755 	*pbufsize = bufsize;
756 
757 	if (patlen) {
758 		patbeg = (char *) buf + i;
759 		/*
760 		 * Under no circumstances is the last character fed to
761 		 * the automaton part of the match. It is EOF's nullbyte,
762 		 * or it sent the automaton into a state with no further
763 		 * transitions available (s==1), or both. Room for a
764 		 * terminating nullbyte is guaranteed.
765 		 *
766 		 * ungetc any chars after the end of matching text
767 		 * (except for EOF's nullbyte, if present) and null
768 		 * terminate the buffer.
769 		 */
770 		do
771 			if (buf[--k] && ungetc(buf[k], f) == EOF)
772 				FATAL("unable to ungetc '%c'", buf[k]);
773 		while (k > i + patlen);
774 		buf[k] = '\0';
775 		return true;
776 	}
777 	else
778 		return false;
779 }
780 
781 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
782 {			/* uses relex() to scan regular expression */
783 	Node *np;
784 
785 	DPRINTF("reparse <%s>\n", p);
786 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
787 	rtok = relex();
788 	/* GNU compatibility: an empty regexp matches anything */
789 	if (rtok == '\0') {
790 		/* FATAL("empty regular expression"); previous */
791 		return(op2(EMPTYRE, NIL, NIL));
792 	}
793 	np = regexp();
794 	if (rtok != '\0')
795 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
796 	return(np);
797 }
798 
799 Node *regexp(void)	/* top-level parse of reg expr */
800 {
801 	return (alt(concat(primary())));
802 }
803 
804 Node *primary(void)
805 {
806 	Node *np;
807 	int savelastatom;
808 
809 	switch (rtok) {
810 	case CHAR:
811 		lastatom = starttok;
812 		np = op2(CHAR, NIL, itonp(rlxval));
813 		rtok = relex();
814 		return (unary(np));
815 	case ALL:
816 		rtok = relex();
817 		return (unary(op2(ALL, NIL, NIL)));
818 	case EMPTYRE:
819 		rtok = relex();
820 		return (unary(op2(EMPTYRE, NIL, NIL)));
821 	case DOT:
822 		lastatom = starttok;
823 		rtok = relex();
824 		return (unary(op2(DOT, NIL, NIL)));
825 	case CCL:
826 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
827 		lastatom = starttok;
828 		rtok = relex();
829 		return (unary(np));
830 	case NCCL:
831 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
832 		lastatom = starttok;
833 		rtok = relex();
834 		return (unary(np));
835 	case '^':
836 		rtok = relex();
837 		return (unary(op2(CHAR, NIL, itonp(HAT))));
838 	case '$':
839 		rtok = relex();
840 		return (unary(op2(CHAR, NIL, NIL)));
841 	case '(':
842 		lastatom = starttok;
843 		savelastatom = starttok - basestr; /* Retain over recursion */
844 		rtok = relex();
845 		if (rtok == ')') {	/* special pleading for () */
846 			rtok = relex();
847 			return unary(op2(CCL, NIL, (Node *) tostring("")));
848 		}
849 		np = regexp();
850 		if (rtok == ')') {
851 			lastatom = basestr + savelastatom; /* Restore */
852 			rtok = relex();
853 			return (unary(np));
854 		}
855 		else
856 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
857 	default:
858 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
859 	}
860 	return 0;	/*NOTREACHED*/
861 }
862 
863 Node *concat(Node *np)
864 {
865 	switch (rtok) {
866 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
867 		return (concat(op2(CAT, np, primary())));
868 	case EMPTYRE:
869 		rtok = relex();
870 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
871 				primary())));
872 	}
873 	return (np);
874 }
875 
876 Node *alt(Node *np)
877 {
878 	if (rtok == OR) {
879 		rtok = relex();
880 		return (alt(op2(OR, np, concat(primary()))));
881 	}
882 	return (np);
883 }
884 
885 Node *unary(Node *np)
886 {
887 	switch (rtok) {
888 	case STAR:
889 		rtok = relex();
890 		return (unary(op2(STAR, np, NIL)));
891 	case PLUS:
892 		rtok = relex();
893 		return (unary(op2(PLUS, np, NIL)));
894 	case QUEST:
895 		rtok = relex();
896 		return (unary(op2(QUEST, np, NIL)));
897 	case ZERO:
898 		rtok = relex();
899 		return (unary(op2(ZERO, np, NIL)));
900 	default:
901 		return (np);
902 	}
903 }
904 
905 /*
906  * Character class definitions conformant to the POSIX locale as
907  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
908  * and operating character sets are both ASCII (ISO646) or supersets
909  * thereof.
910  *
911  * Note that to avoid overflowing the temporary buffer used in
912  * relex(), the expanded character class (prior to range expansion)
913  * must be less than twice the size of their full name.
914  */
915 
916 /* Because isblank doesn't show up in any of the header files on any
917  * system i use, it's defined here.  if some other locale has a richer
918  * definition of "blank", define HAS_ISBLANK and provide your own
919  * version.
920  * the parentheses here are an attempt to find a path through the maze
921  * of macro definition and/or function and/or version provided.  thanks
922  * to nelson beebe for the suggestion; let's see if it works everywhere.
923  */
924 
925 /* #define HAS_ISBLANK */
926 #ifndef HAS_ISBLANK
927 
928 int (xisblank)(int c)
929 {
930 	return c==' ' || c=='\t';
931 }
932 
933 #endif
934 
935 static const struct charclass {
936 	const char *cc_name;
937 	int cc_namelen;
938 	int (*cc_func)(int);
939 } charclasses[] = {
940 	{ "alnum",	5,	isalnum },
941 	{ "alpha",	5,	isalpha },
942 #ifndef HAS_ISBLANK
943 	{ "blank",	5,	xisblank },
944 #else
945 	{ "blank",	5,	isblank },
946 #endif
947 	{ "cntrl",	5,	iscntrl },
948 	{ "digit",	5,	isdigit },
949 	{ "graph",	5,	isgraph },
950 	{ "lower",	5,	islower },
951 	{ "print",	5,	isprint },
952 	{ "punct",	5,	ispunct },
953 	{ "space",	5,	isspace },
954 	{ "upper",	5,	isupper },
955 	{ "xdigit",	6,	isxdigit },
956 	{ NULL,		0,	NULL },
957 };
958 
959 #define REPEAT_SIMPLE		0
960 #define REPEAT_PLUS_APPENDED	1
961 #define REPEAT_WITH_Q		2
962 #define REPEAT_ZERO		3
963 
964 static int
965 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
966 	       int atomlen, int firstnum, int secondnum, int special_case)
967 {
968 	int i, j;
969 	uschar *buf = 0;
970 	int ret = 1;
971 	int init_q = (firstnum == 0);		/* first added char will be ? */
972 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
973 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
974 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
975 	int size = prefix_length +  suffix_length;
976 
977 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
978 		size += atomlen*(firstnum-1);
979 	}
980 
981 	/* Adjust size of buffer for special cases */
982 	if (special_case == REPEAT_PLUS_APPENDED) {
983 		size++;		/* for the final + */
984 	} else if (special_case == REPEAT_WITH_Q) {
985 		size += init_q + (atomlen+1)* n_q_reps;
986 	} else if (special_case == REPEAT_ZERO) {
987 		size += 2;	/* just a null ERE: () */
988 	}
989 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
990 		FATAL("out of space in reg expr %.10s..", lastre);
991 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
992 	j = prefix_length;
993 	if (special_case == REPEAT_ZERO) {
994 		j -= atomlen;
995 		buf[j++] = '(';
996 		buf[j++] = ')';
997 	}
998 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
999 		memcpy(&buf[j], atom, atomlen);
1000 		j += atomlen;
1001 	}
1002 	if (special_case == REPEAT_PLUS_APPENDED) {
1003 		buf[j++] = '+';
1004 	} else if (special_case == REPEAT_WITH_Q) {
1005 		if (init_q)
1006 			buf[j++] = '?';
1007 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1008 			memcpy(&buf[j], atom, atomlen);
1009 			j += atomlen;
1010 			buf[j++] = '?';
1011 		}
1012 	}
1013 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1014 	if (special_case == REPEAT_ZERO) {
1015 		buf[j+suffix_length] = '\0';
1016 	} else {
1017 		buf[size] = '\0';
1018 	}
1019 	/* free old basestr */
1020 	if (firstbasestr != basestr) {
1021 		if (basestr)
1022 			xfree(basestr);
1023 	}
1024 	basestr = buf;
1025 	prestr  = buf + prefix_length;
1026 	if (special_case == REPEAT_ZERO) {
1027 		prestr  -= atomlen;
1028 		ret++;
1029 	}
1030 	return ret;
1031 }
1032 
1033 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1034 		  int atomlen, int firstnum, int secondnum)
1035 {
1036 	/*
1037 	   In general, the repetition specifier or "bound" is replaced here
1038 	   by an equivalent ERE string, repeating the immediately previous atom
1039 	   and appending ? and + as needed. Note that the first copy of the
1040 	   atom is left in place, except in the special_case of a zero-repeat
1041 	   (i.e., {0}).
1042 	 */
1043 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1044 		if (firstnum < 2) {
1045 			/* 0 or 1: should be handled before you get here */
1046 			FATAL("internal error");
1047 		} else {
1048 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1049 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1050 		}
1051 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1052 		if (firstnum == 0) {	/* {0} or {0,0} */
1053 			/* This case is unusual because the resulting
1054 			   replacement string might actually be SMALLER than
1055 			   the original ERE */
1056 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1057 					firstnum, secondnum, REPEAT_ZERO);
1058 		} else {		/* (firstnum >= 1) */
1059 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1060 					firstnum, secondnum, REPEAT_SIMPLE);
1061 		}
1062 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1063 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1064 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1065 					firstnum, secondnum, REPEAT_WITH_Q);
1066 	} else {	/* Error - shouldn't be here (n>m) */
1067 		FATAL("internal error");
1068 	}
1069 	return 0;
1070 }
1071 
1072 int relex(void)		/* lexical analyzer for reparse */
1073 {
1074 	int c, n;
1075 	int cflag;
1076 	static uschar *buf = NULL;
1077 	static int bufsz = 100;
1078 	uschar *bp;
1079 	const struct charclass *cc;
1080 	int i;
1081 	int num, m;
1082 	bool commafound, digitfound;
1083 	const uschar *startreptok;
1084 	static int parens = 0;
1085 
1086 rescan:
1087 	starttok = prestr;
1088 
1089 	switch (c = *prestr++) {
1090 	case '|': return OR;
1091 	case '*': return STAR;
1092 	case '+': return PLUS;
1093 	case '?': return QUEST;
1094 	case '.': return DOT;
1095 	case '\0': prestr--; return '\0';
1096 	case '^':
1097 	case '$':
1098 		return c;
1099 	case '(':
1100 		parens++;
1101 		return c;
1102 	case ')':
1103 		if (parens) {
1104 			parens--;
1105 			return c;
1106 		}
1107 		/* unmatched close parenthesis; per POSIX, treat as literal */
1108 		rlxval = c;
1109 		return CHAR;
1110 	case '\\':
1111 		rlxval = quoted(&prestr);
1112 		return CHAR;
1113 	default:
1114 		rlxval = c;
1115 		return CHAR;
1116 	case '[':
1117 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1118 			FATAL("out of space in reg expr %.10s..", lastre);
1119 		bp = buf;
1120 		if (*prestr == '^') {
1121 			cflag = 1;
1122 			prestr++;
1123 		}
1124 		else
1125 			cflag = 0;
1126 		n = 2 * strlen((const char *) prestr)+1;
1127 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1128 			FATAL("out of space for reg expr %.10s...", lastre);
1129 		for (; ; ) {
1130 			if ((c = *prestr++) == '\\') {
1131 				*bp++ = '\\';
1132 				if ((c = *prestr++) == '\0')
1133 					FATAL("nonterminated character class %.20s...", lastre);
1134 				*bp++ = c;
1135 			/* } else if (c == '\n') { */
1136 			/* 	FATAL("newline in character class %.20s...", lastre); */
1137 			} else if (c == '[' && *prestr == ':') {
1138 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1139 				for (cc = charclasses; cc->cc_name; cc++)
1140 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1141 						break;
1142 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1143 				    prestr[2 + cc->cc_namelen] == ']') {
1144 					prestr += cc->cc_namelen + 3;
1145 					/*
1146 					 * BUG: We begin at 1, instead of 0, since we
1147 					 * would otherwise prematurely terminate the
1148 					 * string for classes like [[:cntrl:]]. This
1149 					 * means that we can't match the NUL character,
1150 					 * not without first adapting the entire
1151 					 * program to track each string's length.
1152 					 */
1153 					for (i = 1; i <= UCHAR_MAX; i++) {
1154 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1155 						    FATAL("out of space for reg expr %.10s...", lastre);
1156 						if (cc->cc_func(i)) {
1157 							/* escape backslash */
1158 							if (i == '\\') {
1159 								*bp++ = '\\';
1160 								n++;
1161 							}
1162 
1163 							*bp++ = i;
1164 							n++;
1165 						}
1166 					}
1167 				} else
1168 					*bp++ = c;
1169 			} else if (c == '[' && *prestr == '.') {
1170 				char collate_char;
1171 				prestr++;
1172 				collate_char = *prestr++;
1173 				if (*prestr == '.' && prestr[1] == ']') {
1174 					prestr += 2;
1175 					/* Found it: map via locale TBD: for
1176 					   now, simply return this char.  This
1177 					   is sufficient to pass conformance
1178 					   test awk.ex 156
1179 					 */
1180 					if (*prestr == ']') {
1181 						prestr++;
1182 						rlxval = collate_char;
1183 						return CHAR;
1184 					}
1185 				}
1186 			} else if (c == '[' && *prestr == '=') {
1187 				char equiv_char;
1188 				prestr++;
1189 				equiv_char = *prestr++;
1190 				if (*prestr == '=' && prestr[1] == ']') {
1191 					prestr += 2;
1192 					/* Found it: map via locale TBD: for now
1193 					   simply return this char. This is
1194 					   sufficient to pass conformance test
1195 					   awk.ex 156
1196 					 */
1197 					if (*prestr == ']') {
1198 						prestr++;
1199 						rlxval = equiv_char;
1200 						return CHAR;
1201 					}
1202 				}
1203 			} else if (c == '\0') {
1204 				FATAL("nonterminated character class %.20s", lastre);
1205 			} else if (bp == buf) {	/* 1st char is special */
1206 				*bp++ = c;
1207 			} else if (c == ']') {
1208 				*bp++ = 0;
1209 				rlxstr = (uschar *) tostring((char *) buf);
1210 				if (cflag == 0)
1211 					return CCL;
1212 				else
1213 					return NCCL;
1214 			} else
1215 				*bp++ = c;
1216 		}
1217 		break;
1218 	case '{':
1219 		if (isdigit(*(prestr))) {
1220 			num = 0;	/* Process as a repetition */
1221 			n = -1; m = -1;
1222 			commafound = false;
1223 			digitfound = false;
1224 			startreptok = prestr-1;
1225 			/* Remember start of previous atom here ? */
1226 		} else {        	/* just a { char, not a repetition */
1227 			rlxval = c;
1228 			return CHAR;
1229                 }
1230 		for (; ; ) {
1231 			if ((c = *prestr++) == '}') {
1232 				if (commafound) {
1233 					if (digitfound) { /* {n,m} */
1234 						m = num;
1235 						if (m < n)
1236 							FATAL("illegal repetition expression: class %.20s",
1237 								lastre);
1238 						if (n == 0 && m == 1) {
1239 							return QUEST;
1240 						}
1241 					} else {	/* {n,} */
1242 						if (n == 0)
1243 							return STAR;
1244 						else if (n == 1)
1245 							return PLUS;
1246 					}
1247 				} else {
1248 					if (digitfound) { /* {n} same as {n,n} */
1249 						n = num;
1250 						m = num;
1251 					} else {	/* {} */
1252 						FATAL("illegal repetition expression: class %.20s",
1253 							lastre);
1254 					}
1255 				}
1256 				if (repeat(starttok, prestr-starttok, lastatom,
1257 					   startreptok - lastatom, n, m) > 0) {
1258 					if (n == 0 && m == 0) {
1259 						return ZERO;
1260 					}
1261 					/* must rescan input for next token */
1262 					goto rescan;
1263 				}
1264 				/* Failed to replace: eat up {...} characters
1265 				   and treat like just PLUS */
1266 				return PLUS;
1267 			} else if (c == '\0') {
1268 				FATAL("nonterminated character class %.20s",
1269 					lastre);
1270 			} else if (isdigit(c)) {
1271 				num = 10 * num + c - '0';
1272 				digitfound = true;
1273 			} else if (c == ',') {
1274 				if (commafound)
1275 					FATAL("illegal repetition expression: class %.20s",
1276 						lastre);
1277 				/* looking for {n,} or {n,m} */
1278 				commafound = true;
1279 				n = num;
1280 				digitfound = false; /* reset */
1281 				num = 0;
1282 			} else {
1283 				FATAL("illegal repetition expression: class %.20s",
1284 					lastre);
1285 			}
1286 		}
1287 		break;
1288 	}
1289 }
1290 
1291 int cgoto(fa *f, int s, int c)
1292 {
1293 	int *p, *q;
1294 	int i, j, k;
1295 
1296 	assert(c == HAT || c < NCHARS);
1297 	while (f->accept >= maxsetvec) {	/* guessing here! */
1298 		resizesetvec(__func__);
1299 	}
1300 	for (i = 0; i <= f->accept; i++)
1301 		setvec[i] = 0;
1302 	setcnt = 0;
1303 	resize_state(f, s);
1304 	/* compute positions of gototab[s,c] into setvec */
1305 	p = f->posns[s];
1306 	for (i = 1; i <= *p; i++) {
1307 		if ((k = f->re[p[i]].ltype) != FINAL) {
1308 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1309 			 || (k == DOT && c != 0 && c != HAT)
1310 			 || (k == ALL && c != 0)
1311 			 || (k == EMPTYRE && c != 0)
1312 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1313 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1314 				q = f->re[p[i]].lfollow;
1315 				for (j = 1; j <= *q; j++) {
1316 					if (q[j] >= maxsetvec) {
1317 						resizesetvec(__func__);
1318 					}
1319 					if (setvec[q[j]] == 0) {
1320 						setcnt++;
1321 						setvec[q[j]] = 1;
1322 					}
1323 				}
1324 			}
1325 		}
1326 	}
1327 	/* determine if setvec is a previous state */
1328 	tmpset[0] = setcnt;
1329 	j = 1;
1330 	for (i = f->accept; i >= 0; i--)
1331 		if (setvec[i]) {
1332 			tmpset[j++] = i;
1333 		}
1334 	resize_state(f, f->curstat > s ? f->curstat : s);
1335 	/* tmpset == previous state? */
1336 	for (i = 1; i <= f->curstat; i++) {
1337 		p = f->posns[i];
1338 		if ((k = tmpset[0]) != p[0])
1339 			goto different;
1340 		for (j = 1; j <= k; j++)
1341 			if (tmpset[j] != p[j])
1342 				goto different;
1343 		/* setvec is state i */
1344 		if (c != HAT)
1345 			f->gototab[s][c] = i;
1346 		return i;
1347 	  different:;
1348 	}
1349 
1350 	/* add tmpset to current set of states */
1351 	++(f->curstat);
1352 	resize_state(f, f->curstat);
1353 	for (i = 0; i < NCHARS; i++)
1354 		f->gototab[f->curstat][i] = 0;
1355 	xfree(f->posns[f->curstat]);
1356 	p = intalloc(setcnt + 1, __func__);
1357 
1358 	f->posns[f->curstat] = p;
1359 	if (c != HAT)
1360 		f->gototab[s][c] = f->curstat;
1361 	for (i = 0; i <= setcnt; i++)
1362 		p[i] = tmpset[i];
1363 	if (setvec[f->accept])
1364 		f->out[f->curstat] = 1;
1365 	else
1366 		f->out[f->curstat] = 0;
1367 	return f->curstat;
1368 }
1369 
1370 
1371 void freefa(fa *f)	/* free a finite automaton */
1372 {
1373 	int i;
1374 
1375 	if (f == NULL)
1376 		return;
1377 	for (i = 0; i < f->state_count; i++)
1378 		xfree(f->gototab[i])
1379 	for (i = 0; i <= f->curstat; i++)
1380 		xfree(f->posns[i]);
1381 	for (i = 0; i <= f->accept; i++) {
1382 		xfree(f->re[i].lfollow);
1383 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1384 			xfree(f->re[i].lval.np);
1385 	}
1386 	xfree(f->restr);
1387 	xfree(f->out);
1388 	xfree(f->posns);
1389 	xfree(f->gototab);
1390 	xfree(f);
1391 }
1392