xref: /openbsd/usr.bin/awk/b.c (revision 097a140d)
1 /*	$OpenBSD: b.c,v 1.36 2021/03/02 20:41:42 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 /* lasciate ogne speranza, voi ch'intrate. */
27 
28 #define	DEBUG
29 
30 #include <ctype.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <string.h>
34 #include <stdlib.h>
35 #include "awk.h"
36 #include "awkgram.tab.h"
37 
38 #define MAXLIN 22
39 
40 #define type(v)		(v)->nobj	/* badly overloaded here */
41 #define info(v)		(v)->ntype	/* badly overloaded here */
42 #define left(v)		(v)->narg[0]
43 #define right(v)	(v)->narg[1]
44 #define parent(v)	(v)->nnext
45 
46 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
48 #define UNARY	case STAR: case PLUS: case QUEST:
49 
50 /* encoding in tree Nodes:
51 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 		left is index, right contains value or pointer to value
53 	unary (STAR, PLUS, QUEST): left is child, right is null
54 	binary (CAT, OR): left and right are children
55 	parent contains pointer to parent
56 */
57 
58 
59 int	*setvec;
60 int	*tmpset;
61 int	maxsetvec = 0;
62 
63 int	rtok;		/* next token in current re */
64 int	rlxval;
65 static const uschar	*rlxstr;
66 static const uschar	*prestr;	/* current position in current re */
67 static const uschar	*lastre;	/* origin of last re */
68 static const uschar	*lastatom;	/* origin of last Atom */
69 static const uschar	*starttok;
70 static const uschar 	*basestr;	/* starts with original, replaced during
71 				   repetition processing */
72 static const uschar 	*firstbasestr;
73 
74 static	int setcnt;
75 static	int poscnt;
76 
77 const char	*patbeg;
78 int	patlen;
79 
80 #define	NFA	128	/* cache this many dynamic fa's */
81 fa	*fatab[NFA];
82 int	nfatab	= 0;	/* entries in fatab */
83 
84 static int *
85 intalloc(size_t n, const char *f)
86 {
87 	int *p = (int *) calloc(n, sizeof(int));
88 	if (p == NULL)
89 		overflo(f);
90 	return p;
91 }
92 
93 static void
94 allocsetvec(const char *f)
95 {
96 	maxsetvec = MAXLIN;
97 	setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
98 	tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
99 	if (setvec == NULL || tmpset == NULL)
100 		overflo(f);
101 }
102 
103 static void
104 resizesetvec(const char *f)
105 {
106 	setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
107 	tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
108 	if (setvec == NULL || tmpset == NULL)
109 		overflo(f);
110 	maxsetvec *= 4;
111 }
112 
113 static void
114 resize_state(fa *f, int state)
115 {
116 	unsigned int **p;
117 	uschar *p2;
118 	int **p3;
119 	int i, new_count;
120 
121 	if (++state < f->state_count)
122 		return;
123 
124 	new_count = state + 10; /* needs to be tuned */
125 
126 	p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
127 	if (p == NULL)
128 		goto out;
129 	f->gototab = p;
130 
131 	p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
132 	if (p2 == NULL)
133 		goto out;
134 	f->out = p2;
135 
136 	p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
137 	if (p3 == NULL)
138 		goto out;
139 	f->posns = p3;
140 
141 	for (i = f->state_count; i < new_count; ++i) {
142 		f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
143 		if (f->gototab[i] == NULL)
144 			goto out;
145 		f->out[i]  = 0;
146 		f->posns[i] = NULL;
147 	}
148 	f->state_count = new_count;
149 	return;
150 out:
151 	overflo(__func__);
152 }
153 
154 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
155 {
156 	int i, use, nuse;
157 	fa *pfa;
158 	static int now = 1;
159 
160 	if (setvec == NULL) {	/* first time through any RE */
161 		allocsetvec(__func__);
162 	}
163 
164 	if (compile_time != RUNNING)	/* a constant for sure */
165 		return mkdfa(s, anchor);
166 	for (i = 0; i < nfatab; i++)	/* is it there already? */
167 		if (fatab[i]->anchor == anchor
168 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
169 			fatab[i]->use = now++;
170 			return fatab[i];
171 		}
172 	pfa = mkdfa(s, anchor);
173 	if (nfatab < NFA) {	/* room for another */
174 		fatab[nfatab] = pfa;
175 		fatab[nfatab]->use = now++;
176 		nfatab++;
177 		return pfa;
178 	}
179 	use = fatab[0]->use;	/* replace least-recently used */
180 	nuse = 0;
181 	for (i = 1; i < nfatab; i++)
182 		if (fatab[i]->use < use) {
183 			use = fatab[i]->use;
184 			nuse = i;
185 		}
186 	freefa(fatab[nuse]);
187 	fatab[nuse] = pfa;
188 	pfa->use = now++;
189 	return pfa;
190 }
191 
192 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
193 				/* anchor = true for anchored matches, else false */
194 {
195 	Node *p, *p1;
196 	fa *f;
197 
198 	firstbasestr = (const uschar *) s;
199 	basestr = firstbasestr;
200 	p = reparse(s);
201 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
202 		/* put ALL STAR in front of reg.  exp. */
203 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
204 		/* put FINAL after reg.  exp. */
205 
206 	poscnt = 0;
207 	penter(p1);	/* enter parent pointers and leaf indices */
208 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
209 		overflo(__func__);
210 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
211 	cfoll(f, p1);	/* set up follow sets */
212 	freetr(p1);
213 	resize_state(f, 1);
214 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
215 	f->posns[1] = intalloc(1, __func__);
216 	*f->posns[1] = 0;
217 	f->initstat = makeinit(f, anchor);
218 	f->anchor = anchor;
219 	f->restr = (uschar *) tostring(s);
220 	if (firstbasestr != basestr) {
221 		if (basestr)
222 			xfree(basestr);
223 	}
224 	return f;
225 }
226 
227 int makeinit(fa *f, bool anchor)
228 {
229 	int i, k;
230 
231 	f->curstat = 2;
232 	f->out[2] = 0;
233 	k = *(f->re[0].lfollow);
234 	xfree(f->posns[2]);
235 	f->posns[2] = intalloc(k + 1,  __func__);
236 	for (i = 0; i <= k; i++) {
237 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
238 	}
239 	if ((f->posns[2])[1] == f->accept)
240 		f->out[2] = 1;
241 	for (i = 0; i < NCHARS; i++)
242 		f->gototab[2][i] = 0;
243 	f->curstat = cgoto(f, 2, HAT);
244 	if (anchor) {
245 		*f->posns[2] = k-1;	/* leave out position 0 */
246 		for (i = 0; i < k; i++) {
247 			(f->posns[0])[i] = (f->posns[2])[i];
248 		}
249 
250 		f->out[0] = f->out[2];
251 		if (f->curstat != 2)
252 			--(*f->posns[f->curstat]);
253 	}
254 	return f->curstat;
255 }
256 
257 void penter(Node *p)	/* set up parent pointers and leaf indices */
258 {
259 	switch (type(p)) {
260 	ELEAF
261 	LEAF
262 		info(p) = poscnt;
263 		poscnt++;
264 		break;
265 	UNARY
266 		penter(left(p));
267 		parent(left(p)) = p;
268 		break;
269 	case CAT:
270 	case OR:
271 		penter(left(p));
272 		penter(right(p));
273 		parent(left(p)) = p;
274 		parent(right(p)) = p;
275 		break;
276 	case ZERO:
277 		break;
278 	default:	/* can't happen */
279 		FATAL("can't happen: unknown type %d in penter", type(p));
280 		break;
281 	}
282 }
283 
284 void freetr(Node *p)	/* free parse tree */
285 {
286 	switch (type(p)) {
287 	ELEAF
288 	LEAF
289 		xfree(p);
290 		break;
291 	UNARY
292 	case ZERO:
293 		freetr(left(p));
294 		xfree(p);
295 		break;
296 	case CAT:
297 	case OR:
298 		freetr(left(p));
299 		freetr(right(p));
300 		xfree(p);
301 		break;
302 	default:	/* can't happen */
303 		FATAL("can't happen: unknown type %d in freetr", type(p));
304 		break;
305 	}
306 }
307 
308 /* in the parsing of regular expressions, metacharacters like . have */
309 /* to be seen literally;  \056 is not a metacharacter. */
310 
311 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
312 {			/* only pick up one 8-bit byte (2 chars) */
313 	const uschar *p;
314 	int n = 0;
315 	int i;
316 
317 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
318 		if (isdigit(*p))
319 			n = 16 * n + *p - '0';
320 		else if (*p >= 'a' && *p <= 'f')
321 			n = 16 * n + *p - 'a' + 10;
322 		else if (*p >= 'A' && *p <= 'F')
323 			n = 16 * n + *p - 'A' + 10;
324 	}
325 	*pp = p;
326 	return n;
327 }
328 
329 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
330 
331 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
332 			/* and increment *pp */
333 {
334 	const uschar *p = *pp;
335 	int c;
336 
337 	if ((c = *p++) == 't')
338 		c = '\t';
339 	else if (c == 'n')
340 		c = '\n';
341 	else if (c == 'f')
342 		c = '\f';
343 	else if (c == 'r')
344 		c = '\r';
345 	else if (c == 'b')
346 		c = '\b';
347 	else if (c == 'v')
348 		c = '\v';
349 	else if (c == 'a')
350 		c = '\a';
351 	else if (c == '\\')
352 		c = '\\';
353 	else if (c == 'x') {	/* hexadecimal goo follows */
354 		c = hexstr(&p);	/* this adds a null if number is invalid */
355 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
356 		int n = c - '0';
357 		if (isoctdigit(*p)) {
358 			n = 8 * n + *p++ - '0';
359 			if (isoctdigit(*p))
360 				n = 8 * n + *p++ - '0';
361 		}
362 		c = n;
363 	} /* else */
364 		/* c = c; */
365 	*pp = p;
366 	return c;
367 }
368 
369 char *cclenter(const char *argp)	/* add a character class */
370 {
371 	int i, c, c2;
372 	const uschar *op, *p = (const uschar *) argp;
373 	uschar *bp;
374 	static uschar *buf = NULL;
375 	static int bufsz = 100;
376 
377 	op = p;
378 	if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
379 		FATAL("out of space for character class [%.10s...] 1", p);
380 	bp = buf;
381 	for (i = 0; (c = *p++) != 0; ) {
382 		if (c == '\\') {
383 			c = quoted(&p);
384 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
385 			if (*p != 0) {
386 				c = bp[-1];
387 				c2 = *p++;
388 				if (c2 == '\\')
389 					c2 = quoted(&p);
390 				if (c > c2) {	/* empty; ignore */
391 					bp--;
392 					i--;
393 					continue;
394 				}
395 				while (c < c2) {
396 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
397 						FATAL("out of space for character class [%.10s...] 2", p);
398 					*bp++ = ++c;
399 					i++;
400 				}
401 				continue;
402 			}
403 		}
404 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
405 			FATAL("out of space for character class [%.10s...] 3", p);
406 		*bp++ = c;
407 		i++;
408 	}
409 	*bp = 0;
410 	DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
411 	xfree(op);
412 	return (char *) tostring((char *) buf);
413 }
414 
415 void overflo(const char *s)
416 {
417 	FATAL("regular expression too big: out of space in %.30s...", s);
418 }
419 
420 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
421 {
422 	int i;
423 	int *p;
424 
425 	switch (type(v)) {
426 	ELEAF
427 	LEAF
428 		f->re[info(v)].ltype = type(v);
429 		f->re[info(v)].lval.np = right(v);
430 		while (f->accept >= maxsetvec) {	/* guessing here! */
431 			resizesetvec(__func__);
432 		}
433 		for (i = 0; i <= f->accept; i++)
434 			setvec[i] = 0;
435 		setcnt = 0;
436 		follow(v);	/* computes setvec and setcnt */
437 		p = intalloc(setcnt + 1, __func__);
438 		f->re[info(v)].lfollow = p;
439 		*p = setcnt;
440 		for (i = f->accept; i >= 0; i--)
441 			if (setvec[i] == 1)
442 				*++p = i;
443 		break;
444 	UNARY
445 		cfoll(f,left(v));
446 		break;
447 	case CAT:
448 	case OR:
449 		cfoll(f,left(v));
450 		cfoll(f,right(v));
451 		break;
452 	case ZERO:
453 		break;
454 	default:	/* can't happen */
455 		FATAL("can't happen: unknown type %d in cfoll", type(v));
456 	}
457 }
458 
459 int first(Node *p)	/* collects initially active leaves of p into setvec */
460 			/* returns 0 if p matches empty string */
461 {
462 	int b, lp;
463 
464 	switch (type(p)) {
465 	ELEAF
466 	LEAF
467 		lp = info(p);	/* look for high-water mark of subscripts */
468 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
469 			resizesetvec(__func__);
470 		}
471 		if (type(p) == EMPTYRE) {
472 			setvec[lp] = 0;
473 			return(0);
474 		}
475 		if (setvec[lp] != 1) {
476 			setvec[lp] = 1;
477 			setcnt++;
478 		}
479 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
480 			return(0);		/* empty CCL */
481 		return(1);
482 	case PLUS:
483 		if (first(left(p)) == 0)
484 			return(0);
485 		return(1);
486 	case STAR:
487 	case QUEST:
488 		first(left(p));
489 		return(0);
490 	case CAT:
491 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
492 		return(1);
493 	case OR:
494 		b = first(right(p));
495 		if (first(left(p)) == 0 || b == 0) return(0);
496 		return(1);
497 	case ZERO:
498 		return 0;
499 	}
500 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
501 	return(-1);
502 }
503 
504 void follow(Node *v)	/* collects leaves that can follow v into setvec */
505 {
506 	Node *p;
507 
508 	if (type(v) == FINAL)
509 		return;
510 	p = parent(v);
511 	switch (type(p)) {
512 	case STAR:
513 	case PLUS:
514 		first(v);
515 		follow(p);
516 		return;
517 
518 	case OR:
519 	case QUEST:
520 		follow(p);
521 		return;
522 
523 	case CAT:
524 		if (v == left(p)) {	/* v is left child of p */
525 			if (first(right(p)) == 0) {
526 				follow(p);
527 				return;
528 			}
529 		} else		/* v is right child */
530 			follow(p);
531 		return;
532 	}
533 }
534 
535 int member(int c, const char *sarg)	/* is c in s? */
536 {
537 	const uschar *s = (const uschar *) sarg;
538 
539 	while (*s)
540 		if (c == *s++)
541 			return(1);
542 	return(0);
543 }
544 
545 int match(fa *f, const char *p0)	/* shortest match ? */
546 {
547 	int s, ns;
548 	const uschar *p = (const uschar *) p0;
549 
550 	s = f->initstat;
551 	assert (s < f->state_count);
552 
553 	if (f->out[s])
554 		return(1);
555 	do {
556 		/* assert(*p < NCHARS); */
557 		if ((ns = f->gototab[s][*p]) != 0)
558 			s = ns;
559 		else
560 			s = cgoto(f, s, *p);
561 		if (f->out[s])
562 			return(1);
563 	} while (*p++ != 0);
564 	return(0);
565 }
566 
567 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
568 {
569 	int s, ns;
570 	const uschar *p = (const uschar *) p0;
571 	const uschar *q;
572 
573 	s = f->initstat;
574 	assert(s < f->state_count);
575 
576 	patbeg = (const char *)p;
577 	patlen = -1;
578 	do {
579 		q = p;
580 		do {
581 			if (f->out[s])		/* final state */
582 				patlen = q-p;
583 			/* assert(*q < NCHARS); */
584 			if ((ns = f->gototab[s][*q]) != 0)
585 				s = ns;
586 			else
587 				s = cgoto(f, s, *q);
588 
589 			assert(s < f->state_count);
590 
591 			if (s == 1) {	/* no transition */
592 				if (patlen >= 0) {
593 					patbeg = (const char *) p;
594 					return(1);
595 				}
596 				else
597 					goto nextin;	/* no match */
598 			}
599 		} while (*q++ != 0);
600 		if (f->out[s])
601 			patlen = q-p-1;	/* don't count $ */
602 		if (patlen >= 0) {
603 			patbeg = (const char *) p;
604 			return(1);
605 		}
606 	nextin:
607 		s = 2;
608 	} while (*p++);
609 	return (0);
610 }
611 
612 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
613 {
614 	int s, ns;
615 	const uschar *p = (const uschar *) p0;
616 	const uschar *q;
617 
618 	s = f->initstat;
619 	assert(s < f->state_count);
620 
621 	patbeg = (const char *)p;
622 	patlen = -1;
623 	while (*p) {
624 		q = p;
625 		do {
626 			if (f->out[s])		/* final state */
627 				patlen = q-p;
628 			/* assert(*q < NCHARS); */
629 			if ((ns = f->gototab[s][*q]) != 0)
630 				s = ns;
631 			else
632 				s = cgoto(f, s, *q);
633 			if (s == 1) {	/* no transition */
634 				if (patlen > 0) {
635 					patbeg = (const char *) p;
636 					return(1);
637 				} else
638 					goto nnextin;	/* no nonempty match */
639 			}
640 		} while (*q++ != 0);
641 		if (f->out[s])
642 			patlen = q-p-1;	/* don't count $ */
643 		if (patlen > 0 ) {
644 			patbeg = (const char *) p;
645 			return(1);
646 		}
647 	nnextin:
648 		s = 2;
649 		p++;
650 	}
651 	return (0);
652 }
653 
654 
655 /*
656  * NAME
657  *     fnematch
658  *
659  * DESCRIPTION
660  *     A stream-fed version of nematch which transfers characters to a
661  *     null-terminated buffer. All characters up to and including the last
662  *     character of the matching text or EOF are placed in the buffer. If
663  *     a match is found, patbeg and patlen are set appropriately.
664  *
665  * RETURN VALUES
666  *     false    No match found.
667  *     true     Match found.
668  */
669 
670 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
671 {
672 	char *buf = *pbuf;
673 	int bufsize = *pbufsize;
674 	int c, i, j, k, ns, s;
675 
676 	s = pfa->initstat;
677 	patlen = 0;
678 
679 	/*
680 	 * All indices relative to buf.
681 	 * i <= j <= k <= bufsize
682 	 *
683 	 * i: origin of active substring
684 	 * j: current character
685 	 * k: destination of next getc()
686 	 */
687 	i = -1, k = 0;
688         do {
689 		j = i++;
690 		do {
691 			if (++j == k) {
692 				if (k == bufsize)
693 					if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
694 						FATAL("stream '%.30s...' too long", buf);
695 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
696 			}
697 			c = (uschar)buf[j];
698 			/* assert(c < NCHARS); */
699 
700 			if ((ns = pfa->gototab[s][c]) != 0)
701 				s = ns;
702 			else
703 				s = cgoto(pfa, s, c);
704 
705 			if (pfa->out[s]) {	/* final state */
706 				patlen = j - i + 1;
707 				if (c == 0)	/* don't count $ */
708 					patlen--;
709 			}
710 		} while (buf[j] && s != 1);
711 		s = 2;
712 	} while (buf[i] && !patlen);
713 
714 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
715 	*pbuf = buf;
716 	*pbufsize = bufsize;
717 
718 	if (patlen) {
719 		patbeg = buf + i;
720 		/*
721 		 * Under no circumstances is the last character fed to
722 		 * the automaton part of the match. It is EOF's nullbyte,
723 		 * or it sent the automaton into a state with no further
724 		 * transitions available (s==1), or both. Room for a
725 		 * terminating nullbyte is guaranteed.
726 		 *
727 		 * ungetc any chars after the end of matching text
728 		 * (except for EOF's nullbyte, if present) and null
729 		 * terminate the buffer.
730 		 */
731 		do
732 			if (buf[--k] && ungetc(buf[k], f) == EOF)
733 				FATAL("unable to ungetc '%c'", buf[k]);
734 		while (k > i + patlen);
735 		buf[k] = '\0';
736 		return true;
737 	}
738 	else
739 		return false;
740 }
741 
742 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
743 {			/* uses relex() to scan regular expression */
744 	Node *np;
745 
746 	DPRINTF("reparse <%s>\n", p);
747 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
748 	rtok = relex();
749 	/* GNU compatibility: an empty regexp matches anything */
750 	if (rtok == '\0') {
751 		/* FATAL("empty regular expression"); previous */
752 		return(op2(EMPTYRE, NIL, NIL));
753 	}
754 	np = regexp();
755 	if (rtok != '\0')
756 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
757 	return(np);
758 }
759 
760 Node *regexp(void)	/* top-level parse of reg expr */
761 {
762 	return (alt(concat(primary())));
763 }
764 
765 Node *primary(void)
766 {
767 	Node *np;
768 	int savelastatom;
769 
770 	switch (rtok) {
771 	case CHAR:
772 		lastatom = starttok;
773 		np = op2(CHAR, NIL, itonp(rlxval));
774 		rtok = relex();
775 		return (unary(np));
776 	case ALL:
777 		rtok = relex();
778 		return (unary(op2(ALL, NIL, NIL)));
779 	case EMPTYRE:
780 		rtok = relex();
781 		return (unary(op2(EMPTYRE, NIL, NIL)));
782 	case DOT:
783 		lastatom = starttok;
784 		rtok = relex();
785 		return (unary(op2(DOT, NIL, NIL)));
786 	case CCL:
787 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
788 		lastatom = starttok;
789 		rtok = relex();
790 		return (unary(np));
791 	case NCCL:
792 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
793 		lastatom = starttok;
794 		rtok = relex();
795 		return (unary(np));
796 	case '^':
797 		rtok = relex();
798 		return (unary(op2(CHAR, NIL, itonp(HAT))));
799 	case '$':
800 		rtok = relex();
801 		return (unary(op2(CHAR, NIL, NIL)));
802 	case '(':
803 		lastatom = starttok;
804 		savelastatom = starttok - basestr; /* Retain over recursion */
805 		rtok = relex();
806 		if (rtok == ')') {	/* special pleading for () */
807 			rtok = relex();
808 			return unary(op2(CCL, NIL, (Node *) tostring("")));
809 		}
810 		np = regexp();
811 		if (rtok == ')') {
812 			lastatom = basestr + savelastatom; /* Restore */
813 			rtok = relex();
814 			return (unary(np));
815 		}
816 		else
817 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
818 	default:
819 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
820 	}
821 	return 0;	/*NOTREACHED*/
822 }
823 
824 Node *concat(Node *np)
825 {
826 	switch (rtok) {
827 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
828 		return (concat(op2(CAT, np, primary())));
829 	case EMPTYRE:
830 		rtok = relex();
831 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
832 				primary())));
833 	}
834 	return (np);
835 }
836 
837 Node *alt(Node *np)
838 {
839 	if (rtok == OR) {
840 		rtok = relex();
841 		return (alt(op2(OR, np, concat(primary()))));
842 	}
843 	return (np);
844 }
845 
846 Node *unary(Node *np)
847 {
848 	switch (rtok) {
849 	case STAR:
850 		rtok = relex();
851 		return (unary(op2(STAR, np, NIL)));
852 	case PLUS:
853 		rtok = relex();
854 		return (unary(op2(PLUS, np, NIL)));
855 	case QUEST:
856 		rtok = relex();
857 		return (unary(op2(QUEST, np, NIL)));
858 	case ZERO:
859 		rtok = relex();
860 		return (unary(op2(ZERO, np, NIL)));
861 	default:
862 		return (np);
863 	}
864 }
865 
866 /*
867  * Character class definitions conformant to the POSIX locale as
868  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
869  * and operating character sets are both ASCII (ISO646) or supersets
870  * thereof.
871  *
872  * Note that to avoid overflowing the temporary buffer used in
873  * relex(), the expanded character class (prior to range expansion)
874  * must be less than twice the size of their full name.
875  */
876 
877 /* Because isblank doesn't show up in any of the header files on any
878  * system i use, it's defined here.  if some other locale has a richer
879  * definition of "blank", define HAS_ISBLANK and provide your own
880  * version.
881  * the parentheses here are an attempt to find a path through the maze
882  * of macro definition and/or function and/or version provided.  thanks
883  * to nelson beebe for the suggestion; let's see if it works everywhere.
884  */
885 
886 #ifndef HAS_ISBLANK
887 
888 int (xisblank)(int c)
889 {
890 	return c==' ' || c=='\t';
891 }
892 
893 #endif
894 
895 static const struct charclass {
896 	const char *cc_name;
897 	int cc_namelen;
898 	int (*cc_func)(int);
899 } charclasses[] = {
900 	{ "alnum",	5,	isalnum },
901 	{ "alpha",	5,	isalpha },
902 #ifndef HAS_ISBLANK
903 	{ "blank",	5,	xisblank },
904 #else
905 	{ "blank",	5,	isblank },
906 #endif
907 	{ "cntrl",	5,	iscntrl },
908 	{ "digit",	5,	isdigit },
909 	{ "graph",	5,	isgraph },
910 	{ "lower",	5,	islower },
911 	{ "print",	5,	isprint },
912 	{ "punct",	5,	ispunct },
913 	{ "space",	5,	isspace },
914 	{ "upper",	5,	isupper },
915 	{ "xdigit",	6,	isxdigit },
916 	{ NULL,		0,	NULL },
917 };
918 
919 #define REPEAT_SIMPLE		0
920 #define REPEAT_PLUS_APPENDED	1
921 #define REPEAT_WITH_Q		2
922 #define REPEAT_ZERO		3
923 
924 static int
925 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
926 	       int atomlen, int firstnum, int secondnum, int special_case)
927 {
928 	int i, j;
929 	uschar *buf = NULL;
930 	int ret = 1;
931 	int init_q = (firstnum == 0);		/* first added char will be ? */
932 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
933 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
934 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
935 	int size = prefix_length +  suffix_length;
936 
937 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
938 		size += atomlen*(firstnum-1);
939 	}
940 
941 	/* Adjust size of buffer for special cases */
942 	if (special_case == REPEAT_PLUS_APPENDED) {
943 		size++;		/* for the final + */
944 	} else if (special_case == REPEAT_WITH_Q) {
945 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
946 	} else if (special_case == REPEAT_ZERO) {
947 		size += 2;	/* just a null ERE: () */
948 	}
949 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
950 		FATAL("out of space in reg expr %.10s..", lastre);
951 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
952 	j = prefix_length;
953 	if (special_case == REPEAT_ZERO) {
954 		j -= atomlen;
955 		buf[j++] = '(';
956 		buf[j++] = ')';
957 	}
958 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
959 		memcpy(&buf[j], atom, atomlen);
960 		j += atomlen;
961 	}
962 	if (special_case == REPEAT_PLUS_APPENDED) {
963 		buf[j++] = '+';
964 	} else if (special_case == REPEAT_WITH_Q) {
965 		if (init_q)
966 			buf[j++] = '?';
967 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
968 			memcpy(&buf[j], atom, atomlen);
969 			j += atomlen;
970 			buf[j++] = '?';
971 		}
972 	}
973 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
974 	j += suffix_length;
975 	buf[j] = '\0';
976 	/* free old basestr */
977 	if (firstbasestr != basestr) {
978 		if (basestr)
979 			xfree(basestr);
980 	}
981 	basestr = buf;
982 	prestr  = buf + prefix_length;
983 	if (special_case == REPEAT_ZERO) {
984 		prestr  -= atomlen;
985 		ret++;
986 	}
987 	return ret;
988 }
989 
990 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
991 		  int atomlen, int firstnum, int secondnum)
992 {
993 	/*
994 	   In general, the repetition specifier or "bound" is replaced here
995 	   by an equivalent ERE string, repeating the immediately previous atom
996 	   and appending ? and + as needed. Note that the first copy of the
997 	   atom is left in place, except in the special_case of a zero-repeat
998 	   (i.e., {0}).
999 	 */
1000 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1001 		if (firstnum < 2) {
1002 			/* 0 or 1: should be handled before you get here */
1003 			FATAL("internal error");
1004 		} else {
1005 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1006 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1007 		}
1008 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1009 		if (firstnum == 0) {	/* {0} or {0,0} */
1010 			/* This case is unusual because the resulting
1011 			   replacement string might actually be SMALLER than
1012 			   the original ERE */
1013 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1014 					firstnum, secondnum, REPEAT_ZERO);
1015 		} else {		/* (firstnum >= 1) */
1016 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1017 					firstnum, secondnum, REPEAT_SIMPLE);
1018 		}
1019 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1020 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1021 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1022 					firstnum, secondnum, REPEAT_WITH_Q);
1023 	} else {	/* Error - shouldn't be here (n>m) */
1024 		FATAL("internal error");
1025 	}
1026 	return 0;
1027 }
1028 
1029 int relex(void)		/* lexical analyzer for reparse */
1030 {
1031 	int c, n;
1032 	int cflag;
1033 	static uschar *buf = NULL;
1034 	static int bufsz = 100;
1035 	uschar *bp;
1036 	const struct charclass *cc;
1037 	int i;
1038 	int num, m;
1039 	bool commafound, digitfound;
1040 	const uschar *startreptok;
1041 	static int parens = 0;
1042 
1043 rescan:
1044 	starttok = prestr;
1045 
1046 	switch (c = *prestr++) {
1047 	case '|': return OR;
1048 	case '*': return STAR;
1049 	case '+': return PLUS;
1050 	case '?': return QUEST;
1051 	case '.': return DOT;
1052 	case '\0': prestr--; return '\0';
1053 	case '^':
1054 	case '$':
1055 		return c;
1056 	case '(':
1057 		parens++;
1058 		return c;
1059 	case ')':
1060 		if (parens) {
1061 			parens--;
1062 			return c;
1063 		}
1064 		/* unmatched close parenthesis; per POSIX, treat as literal */
1065 		rlxval = c;
1066 		return CHAR;
1067 	case '\\':
1068 		rlxval = quoted(&prestr);
1069 		return CHAR;
1070 	default:
1071 		rlxval = c;
1072 		return CHAR;
1073 	case '[':
1074 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1075 			FATAL("out of space in reg expr %.10s..", lastre);
1076 		bp = buf;
1077 		if (*prestr == '^') {
1078 			cflag = 1;
1079 			prestr++;
1080 		}
1081 		else
1082 			cflag = 0;
1083 		n = 2 * strlen((const char *) prestr)+1;
1084 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1085 			FATAL("out of space for reg expr %.10s...", lastre);
1086 		for (; ; ) {
1087 			if ((c = *prestr++) == '\\') {
1088 				*bp++ = '\\';
1089 				if ((c = *prestr++) == '\0')
1090 					FATAL("nonterminated character class %.20s...", lastre);
1091 				*bp++ = c;
1092 			/* } else if (c == '\n') { */
1093 			/* 	FATAL("newline in character class %.20s...", lastre); */
1094 			} else if (c == '[' && *prestr == ':') {
1095 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1096 				for (cc = charclasses; cc->cc_name; cc++)
1097 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1098 						break;
1099 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1100 				    prestr[2 + cc->cc_namelen] == ']') {
1101 					prestr += cc->cc_namelen + 3;
1102 					/*
1103 					 * BUG: We begin at 1, instead of 0, since we
1104 					 * would otherwise prematurely terminate the
1105 					 * string for classes like [[:cntrl:]]. This
1106 					 * means that we can't match the NUL character,
1107 					 * not without first adapting the entire
1108 					 * program to track each string's length.
1109 					 */
1110 					for (i = 1; i <= UCHAR_MAX; i++) {
1111 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1112 						    FATAL("out of space for reg expr %.10s...", lastre);
1113 						if (cc->cc_func(i)) {
1114 							/* escape backslash */
1115 							if (i == '\\') {
1116 								*bp++ = '\\';
1117 								n++;
1118 							}
1119 
1120 							*bp++ = i;
1121 							n++;
1122 						}
1123 					}
1124 				} else
1125 					*bp++ = c;
1126 			} else if (c == '[' && *prestr == '.') {
1127 				char collate_char;
1128 				prestr++;
1129 				collate_char = *prestr++;
1130 				if (*prestr == '.' && prestr[1] == ']') {
1131 					prestr += 2;
1132 					/* Found it: map via locale TBD: for
1133 					   now, simply return this char.  This
1134 					   is sufficient to pass conformance
1135 					   test awk.ex 156
1136 					 */
1137 					if (*prestr == ']') {
1138 						prestr++;
1139 						rlxval = collate_char;
1140 						return CHAR;
1141 					}
1142 				}
1143 			} else if (c == '[' && *prestr == '=') {
1144 				char equiv_char;
1145 				prestr++;
1146 				equiv_char = *prestr++;
1147 				if (*prestr == '=' && prestr[1] == ']') {
1148 					prestr += 2;
1149 					/* Found it: map via locale TBD: for now
1150 					   simply return this char. This is
1151 					   sufficient to pass conformance test
1152 					   awk.ex 156
1153 					 */
1154 					if (*prestr == ']') {
1155 						prestr++;
1156 						rlxval = equiv_char;
1157 						return CHAR;
1158 					}
1159 				}
1160 			} else if (c == '\0') {
1161 				FATAL("nonterminated character class %.20s", lastre);
1162 			} else if (bp == buf) {	/* 1st char is special */
1163 				*bp++ = c;
1164 			} else if (c == ']') {
1165 				*bp++ = 0;
1166 				rlxstr = (uschar *) tostring((char *) buf);
1167 				if (cflag == 0)
1168 					return CCL;
1169 				else
1170 					return NCCL;
1171 			} else
1172 				*bp++ = c;
1173 		}
1174 		break;
1175 	case '{':
1176 		if (isdigit(*(prestr))) {
1177 			num = 0;	/* Process as a repetition */
1178 			n = -1; m = -1;
1179 			commafound = false;
1180 			digitfound = false;
1181 			startreptok = prestr-1;
1182 			/* Remember start of previous atom here ? */
1183 		} else {        	/* just a { char, not a repetition */
1184 			rlxval = c;
1185 			return CHAR;
1186                 }
1187 		for (; ; ) {
1188 			if ((c = *prestr++) == '}') {
1189 				if (commafound) {
1190 					if (digitfound) { /* {n,m} */
1191 						m = num;
1192 						if (m < n)
1193 							FATAL("illegal repetition expression: class %.20s",
1194 								lastre);
1195 						if (n == 0 && m == 1) {
1196 							return QUEST;
1197 						}
1198 					} else {	/* {n,} */
1199 						if (n == 0)
1200 							return STAR;
1201 						else if (n == 1)
1202 							return PLUS;
1203 					}
1204 				} else {
1205 					if (digitfound) { /* {n} same as {n,n} */
1206 						n = num;
1207 						m = num;
1208 					} else {	/* {} */
1209 						FATAL("illegal repetition expression: class %.20s",
1210 							lastre);
1211 					}
1212 				}
1213 				if (repeat(starttok, prestr-starttok, lastatom,
1214 					   startreptok - lastatom, n, m) > 0) {
1215 					if (n == 0 && m == 0) {
1216 						return ZERO;
1217 					}
1218 					/* must rescan input for next token */
1219 					goto rescan;
1220 				}
1221 				/* Failed to replace: eat up {...} characters
1222 				   and treat like just PLUS */
1223 				return PLUS;
1224 			} else if (c == '\0') {
1225 				FATAL("nonterminated character class %.20s",
1226 					lastre);
1227 			} else if (isdigit(c)) {
1228 				num = 10 * num + c - '0';
1229 				digitfound = true;
1230 			} else if (c == ',') {
1231 				if (commafound)
1232 					FATAL("illegal repetition expression: class %.20s",
1233 						lastre);
1234 				/* looking for {n,} or {n,m} */
1235 				commafound = true;
1236 				n = num;
1237 				digitfound = false; /* reset */
1238 				num = 0;
1239 			} else {
1240 				FATAL("illegal repetition expression: class %.20s",
1241 					lastre);
1242 			}
1243 		}
1244 		break;
1245 	}
1246 }
1247 
1248 int cgoto(fa *f, int s, int c)
1249 {
1250 	int *p, *q;
1251 	int i, j, k;
1252 
1253 	assert(c == HAT || c < NCHARS);
1254 	while (f->accept >= maxsetvec) {	/* guessing here! */
1255 		resizesetvec(__func__);
1256 	}
1257 	for (i = 0; i <= f->accept; i++)
1258 		setvec[i] = 0;
1259 	setcnt = 0;
1260 	resize_state(f, s);
1261 	/* compute positions of gototab[s,c] into setvec */
1262 	p = f->posns[s];
1263 	for (i = 1; i <= *p; i++) {
1264 		if ((k = f->re[p[i]].ltype) != FINAL) {
1265 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1266 			 || (k == DOT && c != 0 && c != HAT)
1267 			 || (k == ALL && c != 0)
1268 			 || (k == EMPTYRE && c != 0)
1269 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1270 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1271 				q = f->re[p[i]].lfollow;
1272 				for (j = 1; j <= *q; j++) {
1273 					if (q[j] >= maxsetvec) {
1274 						resizesetvec(__func__);
1275 					}
1276 					if (setvec[q[j]] == 0) {
1277 						setcnt++;
1278 						setvec[q[j]] = 1;
1279 					}
1280 				}
1281 			}
1282 		}
1283 	}
1284 	/* determine if setvec is a previous state */
1285 	tmpset[0] = setcnt;
1286 	j = 1;
1287 	for (i = f->accept; i >= 0; i--)
1288 		if (setvec[i]) {
1289 			tmpset[j++] = i;
1290 		}
1291 	resize_state(f, f->curstat > s ? f->curstat : s);
1292 	/* tmpset == previous state? */
1293 	for (i = 1; i <= f->curstat; i++) {
1294 		p = f->posns[i];
1295 		if ((k = tmpset[0]) != p[0])
1296 			goto different;
1297 		for (j = 1; j <= k; j++)
1298 			if (tmpset[j] != p[j])
1299 				goto different;
1300 		/* setvec is state i */
1301 		if (c != HAT)
1302 			f->gototab[s][c] = i;
1303 		return i;
1304 	  different:;
1305 	}
1306 
1307 	/* add tmpset to current set of states */
1308 	++(f->curstat);
1309 	resize_state(f, f->curstat);
1310 	for (i = 0; i < NCHARS; i++)
1311 		f->gototab[f->curstat][i] = 0;
1312 	xfree(f->posns[f->curstat]);
1313 	p = intalloc(setcnt + 1, __func__);
1314 
1315 	f->posns[f->curstat] = p;
1316 	if (c != HAT)
1317 		f->gototab[s][c] = f->curstat;
1318 	for (i = 0; i <= setcnt; i++)
1319 		p[i] = tmpset[i];
1320 	if (setvec[f->accept])
1321 		f->out[f->curstat] = 1;
1322 	else
1323 		f->out[f->curstat] = 0;
1324 	return f->curstat;
1325 }
1326 
1327 
1328 void freefa(fa *f)	/* free a finite automaton */
1329 {
1330 	int i;
1331 
1332 	if (f == NULL)
1333 		return;
1334 	for (i = 0; i < f->state_count; i++)
1335 		xfree(f->gototab[i])
1336 	for (i = 0; i <= f->curstat; i++)
1337 		xfree(f->posns[i]);
1338 	for (i = 0; i <= f->accept; i++) {
1339 		xfree(f->re[i].lfollow);
1340 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1341 			xfree(f->re[i].lval.np);
1342 	}
1343 	xfree(f->restr);
1344 	xfree(f->out);
1345 	xfree(f->posns);
1346 	xfree(f->gototab);
1347 	xfree(f);
1348 }
1349