xref: /freebsd/contrib/one-true-awk/b.c (revision f68a53db)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #define	DEBUG
31 
32 #include <ctype.h>
33 #include <limits.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include "awk.h"
38 #include "awkgram.tab.h"
39 
40 #define MAXLIN 22
41 
42 #define type(v)		(v)->nobj	/* badly overloaded here */
43 #define info(v)		(v)->ntype	/* badly overloaded here */
44 #define left(v)		(v)->narg[0]
45 #define right(v)	(v)->narg[1]
46 #define parent(v)	(v)->nnext
47 
48 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
49 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
50 #define UNARY	case STAR: case PLUS: case QUEST:
51 
52 /* encoding in tree Nodes:
53 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
54 		left is index, right contains value or pointer to value
55 	unary (STAR, PLUS, QUEST): left is child, right is null
56 	binary (CAT, OR): left and right are children
57 	parent contains pointer to parent
58 */
59 
60 
61 int	*setvec;
62 int	*tmpset;
63 int	maxsetvec = 0;
64 
65 int	rtok;		/* next token in current re */
66 int	rlxval;
67 static const uschar	*rlxstr;
68 static const uschar	*prestr;	/* current position in current re */
69 static const uschar	*lastre;	/* origin of last re */
70 static const uschar	*lastatom;	/* origin of last Atom */
71 static const uschar	*starttok;
72 static const uschar 	*basestr;	/* starts with original, replaced during
73 				   repetition processing */
74 static const uschar 	*firstbasestr;
75 
76 static	int setcnt;
77 static	int poscnt;
78 
79 const char	*patbeg;
80 int	patlen;
81 
82 #define	NFA	128	/* cache this many dynamic fa's */
83 fa	*fatab[NFA];
84 int	nfatab	= 0;	/* entries in fatab */
85 
86 static int *
87 intalloc(size_t n, const char *f)
88 {
89 	int *p = (int *) calloc(n, sizeof(int));
90 	if (p == NULL)
91 		overflo(f);
92 	return p;
93 }
94 
95 static void
96 resizesetvec(const char *f)
97 {
98 	if (maxsetvec == 0)
99 		maxsetvec = MAXLIN;
100 	else
101 		maxsetvec *= 4;
102 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
103 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
104 	if (setvec == NULL || tmpset == NULL)
105 		overflo(f);
106 }
107 
108 static void
109 resize_state(fa *f, int state)
110 {
111 	unsigned int **p;
112 	uschar *p2;
113 	int **p3;
114 	int i, new_count;
115 
116 	if (++state < f->state_count)
117 		return;
118 
119 	new_count = state + 10; /* needs to be tuned */
120 
121 	p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
122 	if (p == NULL)
123 		goto out;
124 	f->gototab = p;
125 
126 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
127 	if (p2 == NULL)
128 		goto out;
129 	f->out = p2;
130 
131 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
132 	if (p3 == NULL)
133 		goto out;
134 	f->posns = p3;
135 
136 	for (i = f->state_count; i < new_count; ++i) {
137 		f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
138 		if (f->gototab[i] == NULL)
139 			goto out;
140 		f->out[i]  = 0;
141 		f->posns[i] = NULL;
142 	}
143 	f->state_count = new_count;
144 	return;
145 out:
146 	overflo(__func__);
147 }
148 
149 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
150 {
151 	int i, use, nuse;
152 	fa *pfa;
153 	static int now = 1;
154 
155 	if (setvec == NULL) {	/* first time through any RE */
156 		resizesetvec(__func__);
157 	}
158 
159 	if (compile_time != RUNNING)	/* a constant for sure */
160 		return mkdfa(s, anchor);
161 	for (i = 0; i < nfatab; i++)	/* is it there already? */
162 		if (fatab[i]->anchor == anchor
163 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
164 			fatab[i]->use = now++;
165 			return fatab[i];
166 		}
167 	pfa = mkdfa(s, anchor);
168 	if (nfatab < NFA) {	/* room for another */
169 		fatab[nfatab] = pfa;
170 		fatab[nfatab]->use = now++;
171 		nfatab++;
172 		return pfa;
173 	}
174 	use = fatab[0]->use;	/* replace least-recently used */
175 	nuse = 0;
176 	for (i = 1; i < nfatab; i++)
177 		if (fatab[i]->use < use) {
178 			use = fatab[i]->use;
179 			nuse = i;
180 		}
181 	freefa(fatab[nuse]);
182 	fatab[nuse] = pfa;
183 	pfa->use = now++;
184 	return pfa;
185 }
186 
187 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
188 				/* anchor = true for anchored matches, else false */
189 {
190 	Node *p, *p1;
191 	fa *f;
192 
193 	firstbasestr = (const uschar *) s;
194 	basestr = firstbasestr;
195 	p = reparse(s);
196 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
197 		/* put ALL STAR in front of reg.  exp. */
198 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
199 		/* put FINAL after reg.  exp. */
200 
201 	poscnt = 0;
202 	penter(p1);	/* enter parent pointers and leaf indices */
203 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
204 		overflo(__func__);
205 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
206 	cfoll(f, p1);	/* set up follow sets */
207 	freetr(p1);
208 	resize_state(f, 1);
209 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
210 	f->posns[1] = intalloc(1, __func__);
211 	*f->posns[1] = 0;
212 	f->initstat = makeinit(f, anchor);
213 	f->anchor = anchor;
214 	f->restr = (uschar *) tostring(s);
215 	if (firstbasestr != basestr) {
216 		if (basestr)
217 			xfree(basestr);
218 	}
219 	return f;
220 }
221 
222 int makeinit(fa *f, bool anchor)
223 {
224 	int i, k;
225 
226 	f->curstat = 2;
227 	f->out[2] = 0;
228 	k = *(f->re[0].lfollow);
229 	xfree(f->posns[2]);
230 	f->posns[2] = intalloc(k + 1,  __func__);
231 	for (i = 0; i <= k; i++) {
232 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
233 	}
234 	if ((f->posns[2])[1] == f->accept)
235 		f->out[2] = 1;
236 	for (i = 0; i < NCHARS; i++)
237 		f->gototab[2][i] = 0;
238 	f->curstat = cgoto(f, 2, HAT);
239 	if (anchor) {
240 		*f->posns[2] = k-1;	/* leave out position 0 */
241 		for (i = 0; i < k; i++) {
242 			(f->posns[0])[i] = (f->posns[2])[i];
243 		}
244 
245 		f->out[0] = f->out[2];
246 		if (f->curstat != 2)
247 			--(*f->posns[f->curstat]);
248 	}
249 	return f->curstat;
250 }
251 
252 void penter(Node *p)	/* set up parent pointers and leaf indices */
253 {
254 	switch (type(p)) {
255 	ELEAF
256 	LEAF
257 		info(p) = poscnt;
258 		poscnt++;
259 		break;
260 	UNARY
261 		penter(left(p));
262 		parent(left(p)) = p;
263 		break;
264 	case CAT:
265 	case OR:
266 		penter(left(p));
267 		penter(right(p));
268 		parent(left(p)) = p;
269 		parent(right(p)) = p;
270 		break;
271 	case ZERO:
272 		break;
273 	default:	/* can't happen */
274 		FATAL("can't happen: unknown type %d in penter", type(p));
275 		break;
276 	}
277 }
278 
279 void freetr(Node *p)	/* free parse tree */
280 {
281 	switch (type(p)) {
282 	ELEAF
283 	LEAF
284 		xfree(p);
285 		break;
286 	UNARY
287 	case ZERO:
288 		freetr(left(p));
289 		xfree(p);
290 		break;
291 	case CAT:
292 	case OR:
293 		freetr(left(p));
294 		freetr(right(p));
295 		xfree(p);
296 		break;
297 	default:	/* can't happen */
298 		FATAL("can't happen: unknown type %d in freetr", type(p));
299 		break;
300 	}
301 }
302 
303 /* in the parsing of regular expressions, metacharacters like . have */
304 /* to be seen literally;  \056 is not a metacharacter. */
305 
306 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
307 {			/* only pick up one 8-bit byte (2 chars) */
308 	const uschar *p;
309 	int n = 0;
310 	int i;
311 
312 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
313 		if (isdigit(*p))
314 			n = 16 * n + *p - '0';
315 		else if (*p >= 'a' && *p <= 'f')
316 			n = 16 * n + *p - 'a' + 10;
317 		else if (*p >= 'A' && *p <= 'F')
318 			n = 16 * n + *p - 'A' + 10;
319 	}
320 	*pp = p;
321 	return n;
322 }
323 
324 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
325 
326 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
327 			/* and increment *pp */
328 {
329 	const uschar *p = *pp;
330 	int c;
331 
332 	if ((c = *p++) == 't')
333 		c = '\t';
334 	else if (c == 'n')
335 		c = '\n';
336 	else if (c == 'f')
337 		c = '\f';
338 	else if (c == 'r')
339 		c = '\r';
340 	else if (c == 'b')
341 		c = '\b';
342 	else if (c == 'v')
343 		c = '\v';
344 	else if (c == 'a')
345 		c = '\a';
346 	else if (c == '\\')
347 		c = '\\';
348 	else if (c == 'x') {	/* hexadecimal goo follows */
349 		c = hexstr(&p);	/* this adds a null if number is invalid */
350 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
351 		int n = c - '0';
352 		if (isoctdigit(*p)) {
353 			n = 8 * n + *p++ - '0';
354 			if (isoctdigit(*p))
355 				n = 8 * n + *p++ - '0';
356 		}
357 		c = n;
358 	} /* else */
359 		/* c = c; */
360 	*pp = p;
361 	return c;
362 }
363 
364 static int collate_range_cmp(int a, int b)
365 {
366 	static char s[2][2];
367 
368 	if ((uschar)a == (uschar)b)
369 		return 0;
370 	s[0][0] = a;
371 	s[1][0] = b;
372 	return (strcoll(s[0], s[1]));
373 }
374 
375 char *cclenter(const char *argp)	/* add a character class */
376 {
377 	int i, c, c2, j;
378 	const uschar *op, *p = (const uschar *) argp;
379 	uschar *bp;
380 	static uschar *buf = NULL;
381 	static int bufsz = 100;
382 
383 	op = p;
384 	if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
385 		FATAL("out of space for character class [%.10s...] 1", p);
386 	bp = buf;
387 	for (i = 0; (c = *p++) != 0; ) {
388 		if (c == '\\') {
389 			c = quoted(&p);
390 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
391 			if (*p != 0) {
392 				c = bp[-1];
393 				c2 = *p++;
394 				if (c2 == '\\')
395 					c2 = quoted(&p);
396 				if (collate_range_cmp(c, c2) > 0) {
397 					bp--;
398 					i--;
399 					continue;
400 				}
401 				for (j = 0; j < NCHARS; j++) {
402 					if ((collate_range_cmp(c, j) > 0) ||
403 					    collate_range_cmp(j, c2) > 0)
404 						continue;
405 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
406 						FATAL("out of space for character class [%.10s...] 2", p);
407 					*bp++ = j;
408 					i++;
409 				}
410 				continue;
411 			}
412 		}
413 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
414 			FATAL("out of space for character class [%.10s...] 3", p);
415 		*bp++ = c;
416 		i++;
417 	}
418 	*bp = 0;
419 	DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
420 	xfree(op);
421 	return (char *) tostring((char *) buf);
422 }
423 
424 void overflo(const char *s)
425 {
426 	FATAL("regular expression too big: out of space in %.30s...", s);
427 }
428 
429 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
430 {
431 	int i;
432 	int *p;
433 
434 	switch (type(v)) {
435 	ELEAF
436 	LEAF
437 		f->re[info(v)].ltype = type(v);
438 		f->re[info(v)].lval.np = right(v);
439 		while (f->accept >= maxsetvec) {	/* guessing here! */
440 			resizesetvec(__func__);
441 		}
442 		for (i = 0; i <= f->accept; i++)
443 			setvec[i] = 0;
444 		setcnt = 0;
445 		follow(v);	/* computes setvec and setcnt */
446 		p = intalloc(setcnt + 1, __func__);
447 		f->re[info(v)].lfollow = p;
448 		*p = setcnt;
449 		for (i = f->accept; i >= 0; i--)
450 			if (setvec[i] == 1)
451 				*++p = i;
452 		break;
453 	UNARY
454 		cfoll(f,left(v));
455 		break;
456 	case CAT:
457 	case OR:
458 		cfoll(f,left(v));
459 		cfoll(f,right(v));
460 		break;
461 	case ZERO:
462 		break;
463 	default:	/* can't happen */
464 		FATAL("can't happen: unknown type %d in cfoll", type(v));
465 	}
466 }
467 
468 int first(Node *p)	/* collects initially active leaves of p into setvec */
469 			/* returns 0 if p matches empty string */
470 {
471 	int b, lp;
472 
473 	switch (type(p)) {
474 	ELEAF
475 	LEAF
476 		lp = info(p);	/* look for high-water mark of subscripts */
477 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
478 			resizesetvec(__func__);
479 		}
480 		if (type(p) == EMPTYRE) {
481 			setvec[lp] = 0;
482 			return(0);
483 		}
484 		if (setvec[lp] != 1) {
485 			setvec[lp] = 1;
486 			setcnt++;
487 		}
488 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
489 			return(0);		/* empty CCL */
490 		return(1);
491 	case PLUS:
492 		if (first(left(p)) == 0)
493 			return(0);
494 		return(1);
495 	case STAR:
496 	case QUEST:
497 		first(left(p));
498 		return(0);
499 	case CAT:
500 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
501 		return(1);
502 	case OR:
503 		b = first(right(p));
504 		if (first(left(p)) == 0 || b == 0) return(0);
505 		return(1);
506 	case ZERO:
507 		return 0;
508 	}
509 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
510 	return(-1);
511 }
512 
513 void follow(Node *v)	/* collects leaves that can follow v into setvec */
514 {
515 	Node *p;
516 
517 	if (type(v) == FINAL)
518 		return;
519 	p = parent(v);
520 	switch (type(p)) {
521 	case STAR:
522 	case PLUS:
523 		first(v);
524 		follow(p);
525 		return;
526 
527 	case OR:
528 	case QUEST:
529 		follow(p);
530 		return;
531 
532 	case CAT:
533 		if (v == left(p)) {	/* v is left child of p */
534 			if (first(right(p)) == 0) {
535 				follow(p);
536 				return;
537 			}
538 		} else		/* v is right child */
539 			follow(p);
540 		return;
541 	}
542 }
543 
544 int member(int c, const char *sarg)	/* is c in s? */
545 {
546 	const uschar *s = (const uschar *) sarg;
547 
548 	while (*s)
549 		if (c == *s++)
550 			return(1);
551 	return(0);
552 }
553 
554 int match(fa *f, const char *p0)	/* shortest match ? */
555 {
556 	int s, ns;
557 	const uschar *p = (const uschar *) p0;
558 
559 	s = f->initstat;
560 	assert (s < f->state_count);
561 
562 	if (f->out[s])
563 		return(1);
564 	do {
565 		/* assert(*p < NCHARS); */
566 		if ((ns = f->gototab[s][*p]) != 0)
567 			s = ns;
568 		else
569 			s = cgoto(f, s, *p);
570 		if (f->out[s])
571 			return(1);
572 	} while (*p++ != 0);
573 	return(0);
574 }
575 
576 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
577 {
578 	int s, ns;
579 	const uschar *p = (const uschar *) p0;
580 	const uschar *q;
581 
582 	s = f->initstat;
583 	assert(s < f->state_count);
584 
585 	patbeg = (const char *)p;
586 	patlen = -1;
587 	do {
588 		q = p;
589 		do {
590 			if (f->out[s])		/* final state */
591 				patlen = q-p;
592 			/* assert(*q < NCHARS); */
593 			if ((ns = f->gototab[s][*q]) != 0)
594 				s = ns;
595 			else
596 				s = cgoto(f, s, *q);
597 
598 			assert(s < f->state_count);
599 
600 			if (s == 1) {	/* no transition */
601 				if (patlen >= 0) {
602 					patbeg = (const char *) p;
603 					return(1);
604 				}
605 				else
606 					goto nextin;	/* no match */
607 			}
608 		} while (*q++ != 0);
609 		if (f->out[s])
610 			patlen = q-p-1;	/* don't count $ */
611 		if (patlen >= 0) {
612 			patbeg = (const char *) p;
613 			return(1);
614 		}
615 	nextin:
616 		s = 2;
617 	} while (*p++);
618 	return (0);
619 }
620 
621 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
622 {
623 	int s, ns;
624 	const uschar *p = (const uschar *) p0;
625 	const uschar *q;
626 
627 	s = f->initstat;
628 	assert(s < f->state_count);
629 
630 	patbeg = (const char *)p;
631 	patlen = -1;
632 	while (*p) {
633 		q = p;
634 		do {
635 			if (f->out[s])		/* final state */
636 				patlen = q-p;
637 			/* assert(*q < NCHARS); */
638 			if ((ns = f->gototab[s][*q]) != 0)
639 				s = ns;
640 			else
641 				s = cgoto(f, s, *q);
642 			if (s == 1) {	/* no transition */
643 				if (patlen > 0) {
644 					patbeg = (const char *) p;
645 					return(1);
646 				} else
647 					goto nnextin;	/* no nonempty match */
648 			}
649 		} while (*q++ != 0);
650 		if (f->out[s])
651 			patlen = q-p-1;	/* don't count $ */
652 		if (patlen > 0 ) {
653 			patbeg = (const char *) p;
654 			return(1);
655 		}
656 	nnextin:
657 		s = 2;
658 		p++;
659 	}
660 	return (0);
661 }
662 
663 
664 /*
665  * NAME
666  *     fnematch
667  *
668  * DESCRIPTION
669  *     A stream-fed version of nematch which transfers characters to a
670  *     null-terminated buffer. All characters up to and including the last
671  *     character of the matching text or EOF are placed in the buffer. If
672  *     a match is found, patbeg and patlen are set appropriately.
673  *
674  * RETURN VALUES
675  *     false    No match found.
676  *     true     Match found.
677  */
678 
679 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
680 {
681 	char *buf = *pbuf;
682 	int bufsize = *pbufsize;
683 	int c, i, j, k, ns, s;
684 
685 	s = pfa->initstat;
686 	patlen = 0;
687 
688 	/*
689 	 * All indices relative to buf.
690 	 * i <= j <= k <= bufsize
691 	 *
692 	 * i: origin of active substring
693 	 * j: current character
694 	 * k: destination of next getc()
695 	 */
696 	i = -1, k = 0;
697         do {
698 		j = i++;
699 		do {
700 			if (++j == k) {
701 				if (k == bufsize)
702 					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
703 						FATAL("stream '%.30s...' too long", buf);
704 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
705 			}
706 			c = (uschar)buf[j];
707 			/* assert(c < NCHARS); */
708 
709 			if ((ns = pfa->gototab[s][c]) != 0)
710 				s = ns;
711 			else
712 				s = cgoto(pfa, s, c);
713 
714 			if (pfa->out[s]) {	/* final state */
715 				patlen = j - i + 1;
716 				if (c == 0)	/* don't count $ */
717 					patlen--;
718 			}
719 		} while (buf[j] && s != 1);
720 		s = 2;
721 	} while (buf[i] && !patlen);
722 
723 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
724 	*pbuf = buf;
725 	*pbufsize = bufsize;
726 
727 	if (patlen) {
728 		patbeg = (char *) buf + i;
729 		/*
730 		 * Under no circumstances is the last character fed to
731 		 * the automaton part of the match. It is EOF's nullbyte,
732 		 * or it sent the automaton into a state with no further
733 		 * transitions available (s==1), or both. Room for a
734 		 * terminating nullbyte is guaranteed.
735 		 *
736 		 * ungetc any chars after the end of matching text
737 		 * (except for EOF's nullbyte, if present) and null
738 		 * terminate the buffer.
739 		 */
740 		do
741 			if (buf[--k] && ungetc(buf[k], f) == EOF)
742 				FATAL("unable to ungetc '%c'", buf[k]);
743 		while (k > i + patlen);
744 		buf[k] = '\0';
745 		return true;
746 	}
747 	else
748 		return false;
749 }
750 
751 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
752 {			/* uses relex() to scan regular expression */
753 	Node *np;
754 
755 	DPRINTF("reparse <%s>\n", p);
756 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
757 	rtok = relex();
758 	/* GNU compatibility: an empty regexp matches anything */
759 	if (rtok == '\0') {
760 		/* FATAL("empty regular expression"); previous */
761 		return(op2(EMPTYRE, NIL, NIL));
762 	}
763 	np = regexp();
764 	if (rtok != '\0')
765 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
766 	return(np);
767 }
768 
769 Node *regexp(void)	/* top-level parse of reg expr */
770 {
771 	return (alt(concat(primary())));
772 }
773 
774 Node *primary(void)
775 {
776 	Node *np;
777 	int savelastatom;
778 
779 	switch (rtok) {
780 	case CHAR:
781 		lastatom = starttok;
782 		np = op2(CHAR, NIL, itonp(rlxval));
783 		rtok = relex();
784 		return (unary(np));
785 	case ALL:
786 		rtok = relex();
787 		return (unary(op2(ALL, NIL, NIL)));
788 	case EMPTYRE:
789 		rtok = relex();
790 		return (unary(op2(EMPTYRE, NIL, NIL)));
791 	case DOT:
792 		lastatom = starttok;
793 		rtok = relex();
794 		return (unary(op2(DOT, NIL, NIL)));
795 	case CCL:
796 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
797 		lastatom = starttok;
798 		rtok = relex();
799 		return (unary(np));
800 	case NCCL:
801 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
802 		lastatom = starttok;
803 		rtok = relex();
804 		return (unary(np));
805 	case '^':
806 		rtok = relex();
807 		return (unary(op2(CHAR, NIL, itonp(HAT))));
808 	case '$':
809 		rtok = relex();
810 		return (unary(op2(CHAR, NIL, NIL)));
811 	case '(':
812 		lastatom = starttok;
813 		savelastatom = starttok - basestr; /* Retain over recursion */
814 		rtok = relex();
815 		if (rtok == ')') {	/* special pleading for () */
816 			rtok = relex();
817 			return unary(op2(CCL, NIL, (Node *) tostring("")));
818 		}
819 		np = regexp();
820 		if (rtok == ')') {
821 			lastatom = basestr + savelastatom; /* Restore */
822 			rtok = relex();
823 			return (unary(np));
824 		}
825 		else
826 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
827 	default:
828 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
829 	}
830 	return 0;	/*NOTREACHED*/
831 }
832 
833 Node *concat(Node *np)
834 {
835 	switch (rtok) {
836 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
837 		return (concat(op2(CAT, np, primary())));
838 	case EMPTYRE:
839 		rtok = relex();
840 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
841 				primary())));
842 	}
843 	return (np);
844 }
845 
846 Node *alt(Node *np)
847 {
848 	if (rtok == OR) {
849 		rtok = relex();
850 		return (alt(op2(OR, np, concat(primary()))));
851 	}
852 	return (np);
853 }
854 
855 Node *unary(Node *np)
856 {
857 	switch (rtok) {
858 	case STAR:
859 		rtok = relex();
860 		return (unary(op2(STAR, np, NIL)));
861 	case PLUS:
862 		rtok = relex();
863 		return (unary(op2(PLUS, np, NIL)));
864 	case QUEST:
865 		rtok = relex();
866 		return (unary(op2(QUEST, np, NIL)));
867 	case ZERO:
868 		rtok = relex();
869 		return (unary(op2(ZERO, np, NIL)));
870 	default:
871 		return (np);
872 	}
873 }
874 
875 /*
876  * Character class definitions conformant to the POSIX locale as
877  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
878  * and operating character sets are both ASCII (ISO646) or supersets
879  * thereof.
880  *
881  * Note that to avoid overflowing the temporary buffer used in
882  * relex(), the expanded character class (prior to range expansion)
883  * must be less than twice the size of their full name.
884  */
885 
886 /* Because isblank doesn't show up in any of the header files on any
887  * system i use, it's defined here.  if some other locale has a richer
888  * definition of "blank", define HAS_ISBLANK and provide your own
889  * version.
890  * the parentheses here are an attempt to find a path through the maze
891  * of macro definition and/or function and/or version provided.  thanks
892  * to nelson beebe for the suggestion; let's see if it works everywhere.
893  */
894 
895 /* #define HAS_ISBLANK */
896 #ifndef HAS_ISBLANK
897 
898 int (xisblank)(int c)
899 {
900 	return c==' ' || c=='\t';
901 }
902 
903 #endif
904 
905 static const struct charclass {
906 	const char *cc_name;
907 	int cc_namelen;
908 	int (*cc_func)(int);
909 } charclasses[] = {
910 	{ "alnum",	5,	isalnum },
911 	{ "alpha",	5,	isalpha },
912 #ifndef HAS_ISBLANK
913 	{ "blank",	5,	xisblank },
914 #else
915 	{ "blank",	5,	isblank },
916 #endif
917 	{ "cntrl",	5,	iscntrl },
918 	{ "digit",	5,	isdigit },
919 	{ "graph",	5,	isgraph },
920 	{ "lower",	5,	islower },
921 	{ "print",	5,	isprint },
922 	{ "punct",	5,	ispunct },
923 	{ "space",	5,	isspace },
924 	{ "upper",	5,	isupper },
925 	{ "xdigit",	6,	isxdigit },
926 	{ NULL,		0,	NULL },
927 };
928 
929 #define REPEAT_SIMPLE		0
930 #define REPEAT_PLUS_APPENDED	1
931 #define REPEAT_WITH_Q		2
932 #define REPEAT_ZERO		3
933 
934 static int
935 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
936 	       int atomlen, int firstnum, int secondnum, int special_case)
937 {
938 	int i, j;
939 	uschar *buf = 0;
940 	int ret = 1;
941 	int init_q = (firstnum == 0);		/* first added char will be ? */
942 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
943 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
944 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
945 	int size = prefix_length +  suffix_length;
946 
947 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
948 		size += atomlen*(firstnum-1);
949 	}
950 
951 	/* Adjust size of buffer for special cases */
952 	if (special_case == REPEAT_PLUS_APPENDED) {
953 		size++;		/* for the final + */
954 	} else if (special_case == REPEAT_WITH_Q) {
955 		size += init_q + (atomlen+1)* n_q_reps;
956 	} else if (special_case == REPEAT_ZERO) {
957 		size += 2;	/* just a null ERE: () */
958 	}
959 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
960 		FATAL("out of space in reg expr %.10s..", lastre);
961 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
962 	j = prefix_length;
963 	if (special_case == REPEAT_ZERO) {
964 		j -= atomlen;
965 		buf[j++] = '(';
966 		buf[j++] = ')';
967 	}
968 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
969 		memcpy(&buf[j], atom, atomlen);
970 		j += atomlen;
971 	}
972 	if (special_case == REPEAT_PLUS_APPENDED) {
973 		buf[j++] = '+';
974 	} else if (special_case == REPEAT_WITH_Q) {
975 		if (init_q)
976 			buf[j++] = '?';
977 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
978 			memcpy(&buf[j], atom, atomlen);
979 			j += atomlen;
980 			buf[j++] = '?';
981 		}
982 	}
983 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
984 	if (special_case == REPEAT_ZERO) {
985 		buf[j+suffix_length] = '\0';
986 	} else {
987 		buf[size] = '\0';
988 	}
989 	/* free old basestr */
990 	if (firstbasestr != basestr) {
991 		if (basestr)
992 			xfree(basestr);
993 	}
994 	basestr = buf;
995 	prestr  = buf + prefix_length;
996 	if (special_case == REPEAT_ZERO) {
997 		prestr  -= atomlen;
998 		ret++;
999 	}
1000 	return ret;
1001 }
1002 
1003 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1004 		  int atomlen, int firstnum, int secondnum)
1005 {
1006 	/*
1007 	   In general, the repetition specifier or "bound" is replaced here
1008 	   by an equivalent ERE string, repeating the immediately previous atom
1009 	   and appending ? and + as needed. Note that the first copy of the
1010 	   atom is left in place, except in the special_case of a zero-repeat
1011 	   (i.e., {0}).
1012 	 */
1013 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1014 		if (firstnum < 2) {
1015 			/* 0 or 1: should be handled before you get here */
1016 			FATAL("internal error");
1017 		} else {
1018 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1019 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1020 		}
1021 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1022 		if (firstnum == 0) {	/* {0} or {0,0} */
1023 			/* This case is unusual because the resulting
1024 			   replacement string might actually be SMALLER than
1025 			   the original ERE */
1026 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1027 					firstnum, secondnum, REPEAT_ZERO);
1028 		} else {		/* (firstnum >= 1) */
1029 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1030 					firstnum, secondnum, REPEAT_SIMPLE);
1031 		}
1032 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1033 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1034 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1035 					firstnum, secondnum, REPEAT_WITH_Q);
1036 	} else {	/* Error - shouldn't be here (n>m) */
1037 		FATAL("internal error");
1038 	}
1039 	return 0;
1040 }
1041 
1042 int relex(void)		/* lexical analyzer for reparse */
1043 {
1044 	int c, n;
1045 	int cflag;
1046 	static uschar *buf = NULL;
1047 	static int bufsz = 100;
1048 	uschar *bp;
1049 	const struct charclass *cc;
1050 	int i;
1051 	int num, m;
1052 	bool commafound, digitfound;
1053 	const uschar *startreptok;
1054 	static int parens = 0;
1055 
1056 rescan:
1057 	starttok = prestr;
1058 
1059 	switch (c = *prestr++) {
1060 	case '|': return OR;
1061 	case '*': return STAR;
1062 	case '+': return PLUS;
1063 	case '?': return QUEST;
1064 	case '.': return DOT;
1065 	case '\0': prestr--; return '\0';
1066 	case '^':
1067 	case '$':
1068 		return c;
1069 	case '(':
1070 		parens++;
1071 		return c;
1072 	case ')':
1073 		if (parens) {
1074 			parens--;
1075 			return c;
1076 		}
1077 		/* unmatched close parenthesis; per POSIX, treat as literal */
1078 		rlxval = c;
1079 		return CHAR;
1080 	case '\\':
1081 		rlxval = quoted(&prestr);
1082 		return CHAR;
1083 	default:
1084 		rlxval = c;
1085 		return CHAR;
1086 	case '[':
1087 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1088 			FATAL("out of space in reg expr %.10s..", lastre);
1089 		bp = buf;
1090 		if (*prestr == '^') {
1091 			cflag = 1;
1092 			prestr++;
1093 		}
1094 		else
1095 			cflag = 0;
1096 		n = 2 * strlen((const char *) prestr)+1;
1097 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1098 			FATAL("out of space for reg expr %.10s...", lastre);
1099 		for (; ; ) {
1100 			if ((c = *prestr++) == '\\') {
1101 				*bp++ = '\\';
1102 				if ((c = *prestr++) == '\0')
1103 					FATAL("nonterminated character class %.20s...", lastre);
1104 				*bp++ = c;
1105 			/* } else if (c == '\n') { */
1106 			/* 	FATAL("newline in character class %.20s...", lastre); */
1107 			} else if (c == '[' && *prestr == ':') {
1108 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1109 				for (cc = charclasses; cc->cc_name; cc++)
1110 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1111 						break;
1112 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1113 				    prestr[2 + cc->cc_namelen] == ']') {
1114 					prestr += cc->cc_namelen + 3;
1115 					/*
1116 					 * BUG: We begin at 1, instead of 0, since we
1117 					 * would otherwise prematurely terminate the
1118 					 * string for classes like [[:cntrl:]]. This
1119 					 * means that we can't match the NUL character,
1120 					 * not without first adapting the entire
1121 					 * program to track each string's length.
1122 					 */
1123 					for (i = 1; i <= UCHAR_MAX; i++) {
1124 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1125 						    FATAL("out of space for reg expr %.10s...", lastre);
1126 						if (cc->cc_func(i)) {
1127 							/* escape backslash */
1128 							if (i == '\\') {
1129 								*bp++ = '\\';
1130 								n++;
1131 							}
1132 
1133 							*bp++ = i;
1134 							n++;
1135 						}
1136 					}
1137 				} else
1138 					*bp++ = c;
1139 			} else if (c == '[' && *prestr == '.') {
1140 				char collate_char;
1141 				prestr++;
1142 				collate_char = *prestr++;
1143 				if (*prestr == '.' && prestr[1] == ']') {
1144 					prestr += 2;
1145 					/* Found it: map via locale TBD: for
1146 					   now, simply return this char.  This
1147 					   is sufficient to pass conformance
1148 					   test awk.ex 156
1149 					 */
1150 					if (*prestr == ']') {
1151 						prestr++;
1152 						rlxval = collate_char;
1153 						return CHAR;
1154 					}
1155 				}
1156 			} else if (c == '[' && *prestr == '=') {
1157 				char equiv_char;
1158 				prestr++;
1159 				equiv_char = *prestr++;
1160 				if (*prestr == '=' && prestr[1] == ']') {
1161 					prestr += 2;
1162 					/* Found it: map via locale TBD: for now
1163 					   simply return this char. This is
1164 					   sufficient to pass conformance test
1165 					   awk.ex 156
1166 					 */
1167 					if (*prestr == ']') {
1168 						prestr++;
1169 						rlxval = equiv_char;
1170 						return CHAR;
1171 					}
1172 				}
1173 			} else if (c == '\0') {
1174 				FATAL("nonterminated character class %.20s", lastre);
1175 			} else if (bp == buf) {	/* 1st char is special */
1176 				*bp++ = c;
1177 			} else if (c == ']') {
1178 				*bp++ = 0;
1179 				rlxstr = (uschar *) tostring((char *) buf);
1180 				if (cflag == 0)
1181 					return CCL;
1182 				else
1183 					return NCCL;
1184 			} else
1185 				*bp++ = c;
1186 		}
1187 		break;
1188 	case '{':
1189 		if (isdigit(*(prestr))) {
1190 			num = 0;	/* Process as a repetition */
1191 			n = -1; m = -1;
1192 			commafound = false;
1193 			digitfound = false;
1194 			startreptok = prestr-1;
1195 			/* Remember start of previous atom here ? */
1196 		} else {        	/* just a { char, not a repetition */
1197 			rlxval = c;
1198 			return CHAR;
1199                 }
1200 		for (; ; ) {
1201 			if ((c = *prestr++) == '}') {
1202 				if (commafound) {
1203 					if (digitfound) { /* {n,m} */
1204 						m = num;
1205 						if (m < n)
1206 							FATAL("illegal repetition expression: class %.20s",
1207 								lastre);
1208 						if (n == 0 && m == 1) {
1209 							return QUEST;
1210 						}
1211 					} else {	/* {n,} */
1212 						if (n == 0)
1213 							return STAR;
1214 						else if (n == 1)
1215 							return PLUS;
1216 					}
1217 				} else {
1218 					if (digitfound) { /* {n} same as {n,n} */
1219 						n = num;
1220 						m = num;
1221 					} else {	/* {} */
1222 						FATAL("illegal repetition expression: class %.20s",
1223 							lastre);
1224 					}
1225 				}
1226 				if (repeat(starttok, prestr-starttok, lastatom,
1227 					   startreptok - lastatom, n, m) > 0) {
1228 					if (n == 0 && m == 0) {
1229 						return ZERO;
1230 					}
1231 					/* must rescan input for next token */
1232 					goto rescan;
1233 				}
1234 				/* Failed to replace: eat up {...} characters
1235 				   and treat like just PLUS */
1236 				return PLUS;
1237 			} else if (c == '\0') {
1238 				FATAL("nonterminated character class %.20s",
1239 					lastre);
1240 			} else if (isdigit(c)) {
1241 				num = 10 * num + c - '0';
1242 				digitfound = true;
1243 			} else if (c == ',') {
1244 				if (commafound)
1245 					FATAL("illegal repetition expression: class %.20s",
1246 						lastre);
1247 				/* looking for {n,} or {n,m} */
1248 				commafound = true;
1249 				n = num;
1250 				digitfound = false; /* reset */
1251 				num = 0;
1252 			} else {
1253 				FATAL("illegal repetition expression: class %.20s",
1254 					lastre);
1255 			}
1256 		}
1257 		break;
1258 	}
1259 }
1260 
1261 int cgoto(fa *f, int s, int c)
1262 {
1263 	int *p, *q;
1264 	int i, j, k;
1265 
1266 	assert(c == HAT || c < NCHARS);
1267 	while (f->accept >= maxsetvec) {	/* guessing here! */
1268 		resizesetvec(__func__);
1269 	}
1270 	for (i = 0; i <= f->accept; i++)
1271 		setvec[i] = 0;
1272 	setcnt = 0;
1273 	resize_state(f, s);
1274 	/* compute positions of gototab[s,c] into setvec */
1275 	p = f->posns[s];
1276 	for (i = 1; i <= *p; i++) {
1277 		if ((k = f->re[p[i]].ltype) != FINAL) {
1278 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1279 			 || (k == DOT && c != 0 && c != HAT)
1280 			 || (k == ALL && c != 0)
1281 			 || (k == EMPTYRE && c != 0)
1282 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1283 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1284 				q = f->re[p[i]].lfollow;
1285 				for (j = 1; j <= *q; j++) {
1286 					if (q[j] >= maxsetvec) {
1287 						resizesetvec(__func__);
1288 					}
1289 					if (setvec[q[j]] == 0) {
1290 						setcnt++;
1291 						setvec[q[j]] = 1;
1292 					}
1293 				}
1294 			}
1295 		}
1296 	}
1297 	/* determine if setvec is a previous state */
1298 	tmpset[0] = setcnt;
1299 	j = 1;
1300 	for (i = f->accept; i >= 0; i--)
1301 		if (setvec[i]) {
1302 			tmpset[j++] = i;
1303 		}
1304 	resize_state(f, f->curstat > s ? f->curstat : s);
1305 	/* tmpset == previous state? */
1306 	for (i = 1; i <= f->curstat; i++) {
1307 		p = f->posns[i];
1308 		if ((k = tmpset[0]) != p[0])
1309 			goto different;
1310 		for (j = 1; j <= k; j++)
1311 			if (tmpset[j] != p[j])
1312 				goto different;
1313 		/* setvec is state i */
1314 		if (c != HAT)
1315 			f->gototab[s][c] = i;
1316 		return i;
1317 	  different:;
1318 	}
1319 
1320 	/* add tmpset to current set of states */
1321 	++(f->curstat);
1322 	resize_state(f, f->curstat);
1323 	for (i = 0; i < NCHARS; i++)
1324 		f->gototab[f->curstat][i] = 0;
1325 	xfree(f->posns[f->curstat]);
1326 	p = intalloc(setcnt + 1, __func__);
1327 
1328 	f->posns[f->curstat] = p;
1329 	if (c != HAT)
1330 		f->gototab[s][c] = f->curstat;
1331 	for (i = 0; i <= setcnt; i++)
1332 		p[i] = tmpset[i];
1333 	if (setvec[f->accept])
1334 		f->out[f->curstat] = 1;
1335 	else
1336 		f->out[f->curstat] = 0;
1337 	return f->curstat;
1338 }
1339 
1340 
1341 void freefa(fa *f)	/* free a finite automaton */
1342 {
1343 	int i;
1344 
1345 	if (f == NULL)
1346 		return;
1347 	for (i = 0; i < f->state_count; i++)
1348 		xfree(f->gototab[i])
1349 	for (i = 0; i <= f->curstat; i++)
1350 		xfree(f->posns[i]);
1351 	for (i = 0; i <= f->accept; i++) {
1352 		xfree(f->re[i].lfollow);
1353 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1354 			xfree(f->re[i].lval.np);
1355 	}
1356 	xfree(f->restr);
1357 	xfree(f->out);
1358 	xfree(f->posns);
1359 	xfree(f->gototab);
1360 	xfree(f);
1361 }
1362