xref: /openbsd/usr.bin/awk/b.c (revision 55cc5ba3)
1 /*	$OpenBSD: b.c,v 1.35 2020/12/09 20:00:11 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 /* lasciate ogne speranza, voi ch'intrate. */
27 
28 #define	DEBUG
29 
30 #include <ctype.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <string.h>
34 #include <stdlib.h>
35 #include "awk.h"
36 #include "awkgram.tab.h"
37 
38 #define MAXLIN 22
39 
40 #define type(v)		(v)->nobj	/* badly overloaded here */
41 #define info(v)		(v)->ntype	/* badly overloaded here */
42 #define left(v)		(v)->narg[0]
43 #define right(v)	(v)->narg[1]
44 #define parent(v)	(v)->nnext
45 
46 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
48 #define UNARY	case STAR: case PLUS: case QUEST:
49 
50 /* encoding in tree Nodes:
51 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 		left is index, right contains value or pointer to value
53 	unary (STAR, PLUS, QUEST): left is child, right is null
54 	binary (CAT, OR): left and right are children
55 	parent contains pointer to parent
56 */
57 
58 
59 int	*setvec;
60 int	*tmpset;
61 int	maxsetvec = 0;
62 
63 int	rtok;		/* next token in current re */
64 int	rlxval;
65 static const uschar	*rlxstr;
66 static const uschar	*prestr;	/* current position in current re */
67 static const uschar	*lastre;	/* origin of last re */
68 static const uschar	*lastatom;	/* origin of last Atom */
69 static const uschar	*starttok;
70 static const uschar 	*basestr;	/* starts with original, replaced during
71 				   repetition processing */
72 static const uschar 	*firstbasestr;
73 
74 static	int setcnt;
75 static	int poscnt;
76 
77 const char	*patbeg;
78 int	patlen;
79 
80 #define	NFA	128	/* cache this many dynamic fa's */
81 fa	*fatab[NFA];
82 int	nfatab	= 0;	/* entries in fatab */
83 
84 static int *
85 intalloc(size_t n, const char *f)
86 {
87 	int *p = (int *) calloc(n, sizeof(int));
88 	if (p == NULL)
89 		overflo(f);
90 	return p;
91 }
92 
93 static void
94 allocsetvec(const char *f)
95 {
96 	maxsetvec = MAXLIN;
97 	setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
98 	tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
99 	if (setvec == NULL || tmpset == NULL)
100 		overflo(f);
101 }
102 
103 static void
104 resizesetvec(const char *f)
105 {
106 	setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
107 	tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
108 	if (setvec == NULL || tmpset == NULL)
109 		overflo(f);
110 	maxsetvec *= 4;
111 }
112 
113 static void
114 resize_state(fa *f, int state)
115 {
116 	unsigned int **p;
117 	uschar *p2;
118 	int **p3;
119 	int i, new_count;
120 
121 	if (++state < f->state_count)
122 		return;
123 
124 	new_count = state + 10; /* needs to be tuned */
125 
126 	p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
127 	if (p == NULL)
128 		goto out;
129 	f->gototab = p;
130 
131 	p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
132 	if (p2 == NULL)
133 		goto out;
134 	f->out = p2;
135 
136 	p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
137 	if (p3 == NULL)
138 		goto out;
139 	f->posns = p3;
140 
141 	for (i = f->state_count; i < new_count; ++i) {
142 		f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
143 		if (f->gototab[i] == NULL)
144 			goto out;
145 		f->out[i]  = 0;
146 		f->posns[i] = NULL;
147 	}
148 	f->state_count = new_count;
149 	return;
150 out:
151 	overflo(__func__);
152 }
153 
154 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
155 {
156 	int i, use, nuse;
157 	fa *pfa;
158 	static int now = 1;
159 
160 	if (setvec == NULL) {	/* first time through any RE */
161 		allocsetvec(__func__);
162 	}
163 
164 	if (compile_time != RUNNING)	/* a constant for sure */
165 		return mkdfa(s, anchor);
166 	for (i = 0; i < nfatab; i++)	/* is it there already? */
167 		if (fatab[i]->anchor == anchor
168 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
169 			fatab[i]->use = now++;
170 			return fatab[i];
171 		}
172 	pfa = mkdfa(s, anchor);
173 	if (nfatab < NFA) {	/* room for another */
174 		fatab[nfatab] = pfa;
175 		fatab[nfatab]->use = now++;
176 		nfatab++;
177 		return pfa;
178 	}
179 	use = fatab[0]->use;	/* replace least-recently used */
180 	nuse = 0;
181 	for (i = 1; i < nfatab; i++)
182 		if (fatab[i]->use < use) {
183 			use = fatab[i]->use;
184 			nuse = i;
185 		}
186 	freefa(fatab[nuse]);
187 	fatab[nuse] = pfa;
188 	pfa->use = now++;
189 	return pfa;
190 }
191 
192 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
193 				/* anchor = true for anchored matches, else false */
194 {
195 	Node *p, *p1;
196 	fa *f;
197 
198 	firstbasestr = (const uschar *) s;
199 	basestr = firstbasestr;
200 	p = reparse(s);
201 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
202 		/* put ALL STAR in front of reg.  exp. */
203 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
204 		/* put FINAL after reg.  exp. */
205 
206 	poscnt = 0;
207 	penter(p1);	/* enter parent pointers and leaf indices */
208 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
209 		overflo(__func__);
210 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
211 	cfoll(f, p1);	/* set up follow sets */
212 	freetr(p1);
213 	resize_state(f, 1);
214 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
215 	f->posns[1] = intalloc(1, __func__);
216 	*f->posns[1] = 0;
217 	f->initstat = makeinit(f, anchor);
218 	f->anchor = anchor;
219 	f->restr = (uschar *) tostring(s);
220 	if (firstbasestr != basestr) {
221 		if (basestr)
222 			xfree(basestr);
223 	}
224 	return f;
225 }
226 
227 int makeinit(fa *f, bool anchor)
228 {
229 	int i, k;
230 
231 	f->curstat = 2;
232 	f->out[2] = 0;
233 	k = *(f->re[0].lfollow);
234 	xfree(f->posns[2]);
235 	f->posns[2] = intalloc(k + 1,  __func__);
236 	for (i = 0; i <= k; i++) {
237 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
238 	}
239 	if ((f->posns[2])[1] == f->accept)
240 		f->out[2] = 1;
241 	for (i = 0; i < NCHARS; i++)
242 		f->gototab[2][i] = 0;
243 	f->curstat = cgoto(f, 2, HAT);
244 	if (anchor) {
245 		*f->posns[2] = k-1;	/* leave out position 0 */
246 		for (i = 0; i < k; i++) {
247 			(f->posns[0])[i] = (f->posns[2])[i];
248 		}
249 
250 		f->out[0] = f->out[2];
251 		if (f->curstat != 2)
252 			--(*f->posns[f->curstat]);
253 	}
254 	return f->curstat;
255 }
256 
257 void penter(Node *p)	/* set up parent pointers and leaf indices */
258 {
259 	switch (type(p)) {
260 	ELEAF
261 	LEAF
262 		info(p) = poscnt;
263 		poscnt++;
264 		break;
265 	UNARY
266 		penter(left(p));
267 		parent(left(p)) = p;
268 		break;
269 	case CAT:
270 	case OR:
271 		penter(left(p));
272 		penter(right(p));
273 		parent(left(p)) = p;
274 		parent(right(p)) = p;
275 		break;
276 	case ZERO:
277 		break;
278 	default:	/* can't happen */
279 		FATAL("can't happen: unknown type %d in penter", type(p));
280 		break;
281 	}
282 }
283 
284 void freetr(Node *p)	/* free parse tree */
285 {
286 	switch (type(p)) {
287 	ELEAF
288 	LEAF
289 		xfree(p);
290 		break;
291 	UNARY
292 	case ZERO:
293 		freetr(left(p));
294 		xfree(p);
295 		break;
296 	case CAT:
297 	case OR:
298 		freetr(left(p));
299 		freetr(right(p));
300 		xfree(p);
301 		break;
302 	default:	/* can't happen */
303 		FATAL("can't happen: unknown type %d in freetr", type(p));
304 		break;
305 	}
306 }
307 
308 /* in the parsing of regular expressions, metacharacters like . have */
309 /* to be seen literally;  \056 is not a metacharacter. */
310 
311 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
312 {			/* only pick up one 8-bit byte (2 chars) */
313 	const uschar *p;
314 	int n = 0;
315 	int i;
316 
317 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
318 		if (isdigit(*p))
319 			n = 16 * n + *p - '0';
320 		else if (*p >= 'a' && *p <= 'f')
321 			n = 16 * n + *p - 'a' + 10;
322 		else if (*p >= 'A' && *p <= 'F')
323 			n = 16 * n + *p - 'A' + 10;
324 	}
325 	*pp = p;
326 	return n;
327 }
328 
329 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
330 
331 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
332 			/* and increment *pp */
333 {
334 	const uschar *p = *pp;
335 	int c;
336 
337 	if ((c = *p++) == 't')
338 		c = '\t';
339 	else if (c == 'n')
340 		c = '\n';
341 	else if (c == 'f')
342 		c = '\f';
343 	else if (c == 'r')
344 		c = '\r';
345 	else if (c == 'b')
346 		c = '\b';
347 	else if (c == 'v')
348 		c = '\v';
349 	else if (c == 'a')
350 		c = '\a';
351 	else if (c == '\\')
352 		c = '\\';
353 	else if (c == 'x') {	/* hexadecimal goo follows */
354 		c = hexstr(&p);	/* this adds a null if number is invalid */
355 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
356 		int n = c - '0';
357 		if (isoctdigit(*p)) {
358 			n = 8 * n + *p++ - '0';
359 			if (isoctdigit(*p))
360 				n = 8 * n + *p++ - '0';
361 		}
362 		c = n;
363 	} /* else */
364 		/* c = c; */
365 	*pp = p;
366 	return c;
367 }
368 
369 char *cclenter(const char *argp)	/* add a character class */
370 {
371 	int i, c, c2;
372 	const uschar *op, *p = (const uschar *) argp;
373 	uschar *bp;
374 	static uschar *buf = NULL;
375 	static int bufsz = 100;
376 
377 	op = p;
378 	if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
379 		FATAL("out of space for character class [%.10s...] 1", p);
380 	bp = buf;
381 	for (i = 0; (c = *p++) != 0; ) {
382 		if (c == '\\') {
383 			c = quoted(&p);
384 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
385 			if (*p != 0) {
386 				c = bp[-1];
387 				c2 = *p++;
388 				if (c2 == '\\')
389 					c2 = quoted(&p);
390 				if (c > c2) {	/* empty; ignore */
391 					bp--;
392 					i--;
393 					continue;
394 				}
395 				while (c < c2) {
396 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
397 						FATAL("out of space for character class [%.10s...] 2", p);
398 					*bp++ = ++c;
399 					i++;
400 				}
401 				continue;
402 			}
403 		}
404 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
405 			FATAL("out of space for character class [%.10s...] 3", p);
406 		*bp++ = c;
407 		i++;
408 	}
409 	*bp = 0;
410 	DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
411 	xfree(op);
412 	return (char *) tostring((char *) buf);
413 }
414 
415 void overflo(const char *s)
416 {
417 	FATAL("regular expression too big: out of space in %.30s...", s);
418 }
419 
420 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
421 {
422 	int i;
423 	int *p;
424 
425 	switch (type(v)) {
426 	ELEAF
427 	LEAF
428 		f->re[info(v)].ltype = type(v);
429 		f->re[info(v)].lval.np = right(v);
430 		while (f->accept >= maxsetvec) {	/* guessing here! */
431 			resizesetvec(__func__);
432 		}
433 		for (i = 0; i <= f->accept; i++)
434 			setvec[i] = 0;
435 		setcnt = 0;
436 		follow(v);	/* computes setvec and setcnt */
437 		p = intalloc(setcnt + 1, __func__);
438 		f->re[info(v)].lfollow = p;
439 		*p = setcnt;
440 		for (i = f->accept; i >= 0; i--)
441 			if (setvec[i] == 1)
442 				*++p = i;
443 		break;
444 	UNARY
445 		cfoll(f,left(v));
446 		break;
447 	case CAT:
448 	case OR:
449 		cfoll(f,left(v));
450 		cfoll(f,right(v));
451 		break;
452 	case ZERO:
453 		break;
454 	default:	/* can't happen */
455 		FATAL("can't happen: unknown type %d in cfoll", type(v));
456 	}
457 }
458 
459 int first(Node *p)	/* collects initially active leaves of p into setvec */
460 			/* returns 0 if p matches empty string */
461 {
462 	int b, lp;
463 
464 	switch (type(p)) {
465 	ELEAF
466 	LEAF
467 		lp = info(p);	/* look for high-water mark of subscripts */
468 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
469 			resizesetvec(__func__);
470 		}
471 		if (type(p) == EMPTYRE) {
472 			setvec[lp] = 0;
473 			return(0);
474 		}
475 		if (setvec[lp] != 1) {
476 			setvec[lp] = 1;
477 			setcnt++;
478 		}
479 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
480 			return(0);		/* empty CCL */
481 		return(1);
482 	case PLUS:
483 		if (first(left(p)) == 0)
484 			return(0);
485 		return(1);
486 	case STAR:
487 	case QUEST:
488 		first(left(p));
489 		return(0);
490 	case CAT:
491 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
492 		return(1);
493 	case OR:
494 		b = first(right(p));
495 		if (first(left(p)) == 0 || b == 0) return(0);
496 		return(1);
497 	case ZERO:
498 		return 0;
499 	}
500 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
501 	return(-1);
502 }
503 
504 void follow(Node *v)	/* collects leaves that can follow v into setvec */
505 {
506 	Node *p;
507 
508 	if (type(v) == FINAL)
509 		return;
510 	p = parent(v);
511 	switch (type(p)) {
512 	case STAR:
513 	case PLUS:
514 		first(v);
515 		follow(p);
516 		return;
517 
518 	case OR:
519 	case QUEST:
520 		follow(p);
521 		return;
522 
523 	case CAT:
524 		if (v == left(p)) {	/* v is left child of p */
525 			if (first(right(p)) == 0) {
526 				follow(p);
527 				return;
528 			}
529 		} else		/* v is right child */
530 			follow(p);
531 		return;
532 	}
533 }
534 
535 int member(int c, const char *sarg)	/* is c in s? */
536 {
537 	const uschar *s = (const uschar *) sarg;
538 
539 	while (*s)
540 		if (c == *s++)
541 			return(1);
542 	return(0);
543 }
544 
545 int match(fa *f, const char *p0)	/* shortest match ? */
546 {
547 	int s, ns;
548 	const uschar *p = (const uschar *) p0;
549 
550 	s = f->initstat;
551 	assert (s < f->state_count);
552 
553 	if (f->out[s])
554 		return(1);
555 	do {
556 		/* assert(*p < NCHARS); */
557 		if ((ns = f->gototab[s][*p]) != 0)
558 			s = ns;
559 		else
560 			s = cgoto(f, s, *p);
561 		if (f->out[s])
562 			return(1);
563 	} while (*p++ != 0);
564 	return(0);
565 }
566 
567 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
568 {
569 	int s, ns;
570 	const uschar *p = (const uschar *) p0;
571 	const uschar *q;
572 
573 	s = f->initstat;
574 	assert(s < f->state_count);
575 
576 	patbeg = (const char *)p;
577 	patlen = -1;
578 	do {
579 		q = p;
580 		do {
581 			if (f->out[s])		/* final state */
582 				patlen = q-p;
583 			/* assert(*q < NCHARS); */
584 			if ((ns = f->gototab[s][*q]) != 0)
585 				s = ns;
586 			else
587 				s = cgoto(f, s, *q);
588 
589 			assert(s < f->state_count);
590 
591 			if (s == 1) {	/* no transition */
592 				if (patlen >= 0) {
593 					patbeg = (const char *) p;
594 					return(1);
595 				}
596 				else
597 					goto nextin;	/* no match */
598 			}
599 		} while (*q++ != 0);
600 		if (f->out[s])
601 			patlen = q-p-1;	/* don't count $ */
602 		if (patlen >= 0) {
603 			patbeg = (const char *) p;
604 			return(1);
605 		}
606 	nextin:
607 		s = 2;
608 	} while (*p++);
609 	return (0);
610 }
611 
612 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
613 {
614 	int s, ns;
615 	const uschar *p = (const uschar *) p0;
616 	const uschar *q;
617 
618 	s = f->initstat;
619 	assert(s < f->state_count);
620 
621 	patbeg = (const char *)p;
622 	patlen = -1;
623 	while (*p) {
624 		q = p;
625 		do {
626 			if (f->out[s])		/* final state */
627 				patlen = q-p;
628 			/* assert(*q < NCHARS); */
629 			if ((ns = f->gototab[s][*q]) != 0)
630 				s = ns;
631 			else
632 				s = cgoto(f, s, *q);
633 			if (s == 1) {	/* no transition */
634 				if (patlen > 0) {
635 					patbeg = (const char *) p;
636 					return(1);
637 				} else
638 					goto nnextin;	/* no nonempty match */
639 			}
640 		} while (*q++ != 0);
641 		if (f->out[s])
642 			patlen = q-p-1;	/* don't count $ */
643 		if (patlen > 0 ) {
644 			patbeg = (const char *) p;
645 			return(1);
646 		}
647 	nnextin:
648 		s = 2;
649 		p++;
650 	}
651 	return (0);
652 }
653 
654 
655 /*
656  * NAME
657  *     fnematch
658  *
659  * DESCRIPTION
660  *     A stream-fed version of nematch which transfers characters to a
661  *     null-terminated buffer. All characters up to and including the last
662  *     character of the matching text or EOF are placed in the buffer. If
663  *     a match is found, patbeg and patlen are set appropriately.
664  *
665  * RETURN VALUES
666  *     false    No match found.
667  *     true     Match found.
668  */
669 
670 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
671 {
672 	char *buf = *pbuf;
673 	int bufsize = *pbufsize;
674 	int c, i, j, k, ns, s;
675 
676 	s = pfa->initstat;
677 	patlen = 0;
678 
679 	/*
680 	 * All indices relative to buf.
681 	 * i <= j <= k <= bufsize
682 	 *
683 	 * i: origin of active substring
684 	 * j: current character
685 	 * k: destination of next getc()
686 	 */
687 	i = -1, k = 0;
688         do {
689 		j = i++;
690 		do {
691 			if (++j == k) {
692 				if (k == bufsize)
693 					if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
694 						FATAL("stream '%.30s...' too long", buf);
695 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
696 			}
697 			c = (uschar)buf[j];
698 			/* assert(c < NCHARS); */
699 
700 			if ((ns = pfa->gototab[s][c]) != 0)
701 				s = ns;
702 			else
703 				s = cgoto(pfa, s, c);
704 
705 			if (pfa->out[s]) {	/* final state */
706 				patlen = j - i + 1;
707 				if (c == 0)	/* don't count $ */
708 					patlen--;
709 			}
710 		} while (buf[j] && s != 1);
711 		s = 2;
712 	} while (buf[i] && !patlen);
713 
714 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
715 	*pbuf = buf;
716 	*pbufsize = bufsize;
717 
718 	if (patlen) {
719 		patbeg = buf + i;
720 		/*
721 		 * Under no circumstances is the last character fed to
722 		 * the automaton part of the match. It is EOF's nullbyte,
723 		 * or it sent the automaton into a state with no further
724 		 * transitions available (s==1), or both. Room for a
725 		 * terminating nullbyte is guaranteed.
726 		 *
727 		 * ungetc any chars after the end of matching text
728 		 * (except for EOF's nullbyte, if present) and null
729 		 * terminate the buffer.
730 		 */
731 		do
732 			if (buf[--k] && ungetc(buf[k], f) == EOF)
733 				FATAL("unable to ungetc '%c'", buf[k]);
734 		while (k > i + patlen);
735 		buf[k] = '\0';
736 		return true;
737 	}
738 	else
739 		return false;
740 }
741 
742 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
743 {			/* uses relex() to scan regular expression */
744 	Node *np;
745 
746 	DPRINTF("reparse <%s>\n", p);
747 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
748 	rtok = relex();
749 	/* GNU compatibility: an empty regexp matches anything */
750 	if (rtok == '\0') {
751 		/* FATAL("empty regular expression"); previous */
752 		return(op2(EMPTYRE, NIL, NIL));
753 	}
754 	np = regexp();
755 	if (rtok != '\0')
756 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
757 	return(np);
758 }
759 
760 Node *regexp(void)	/* top-level parse of reg expr */
761 {
762 	return (alt(concat(primary())));
763 }
764 
765 Node *primary(void)
766 {
767 	Node *np;
768 	int savelastatom;
769 
770 	switch (rtok) {
771 	case CHAR:
772 		lastatom = starttok;
773 		np = op2(CHAR, NIL, itonp(rlxval));
774 		rtok = relex();
775 		return (unary(np));
776 	case ALL:
777 		rtok = relex();
778 		return (unary(op2(ALL, NIL, NIL)));
779 	case EMPTYRE:
780 		rtok = relex();
781 		return (unary(op2(EMPTYRE, NIL, NIL)));
782 	case DOT:
783 		lastatom = starttok;
784 		rtok = relex();
785 		return (unary(op2(DOT, NIL, NIL)));
786 	case CCL:
787 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
788 		lastatom = starttok;
789 		rtok = relex();
790 		return (unary(np));
791 	case NCCL:
792 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
793 		lastatom = starttok;
794 		rtok = relex();
795 		return (unary(np));
796 	case '^':
797 		rtok = relex();
798 		return (unary(op2(CHAR, NIL, itonp(HAT))));
799 	case '$':
800 		rtok = relex();
801 		return (unary(op2(CHAR, NIL, NIL)));
802 	case '(':
803 		lastatom = starttok;
804 		savelastatom = starttok - basestr; /* Retain over recursion */
805 		rtok = relex();
806 		if (rtok == ')') {	/* special pleading for () */
807 			rtok = relex();
808 			return unary(op2(CCL, NIL, (Node *) tostring("")));
809 		}
810 		np = regexp();
811 		if (rtok == ')') {
812 			lastatom = basestr + savelastatom; /* Restore */
813 			rtok = relex();
814 			return (unary(np));
815 		}
816 		else
817 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
818 	default:
819 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
820 	}
821 	return 0;	/*NOTREACHED*/
822 }
823 
824 Node *concat(Node *np)
825 {
826 	switch (rtok) {
827 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
828 		return (concat(op2(CAT, np, primary())));
829 	case EMPTYRE:
830 		rtok = relex();
831 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
832 				primary())));
833 	}
834 	return (np);
835 }
836 
837 Node *alt(Node *np)
838 {
839 	if (rtok == OR) {
840 		rtok = relex();
841 		return (alt(op2(OR, np, concat(primary()))));
842 	}
843 	return (np);
844 }
845 
846 Node *unary(Node *np)
847 {
848 	switch (rtok) {
849 	case STAR:
850 		rtok = relex();
851 		return (unary(op2(STAR, np, NIL)));
852 	case PLUS:
853 		rtok = relex();
854 		return (unary(op2(PLUS, np, NIL)));
855 	case QUEST:
856 		rtok = relex();
857 		return (unary(op2(QUEST, np, NIL)));
858 	case ZERO:
859 		rtok = relex();
860 		return (unary(op2(ZERO, np, NIL)));
861 	default:
862 		return (np);
863 	}
864 }
865 
866 /*
867  * Character class definitions conformant to the POSIX locale as
868  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
869  * and operating character sets are both ASCII (ISO646) or supersets
870  * thereof.
871  *
872  * Note that to avoid overflowing the temporary buffer used in
873  * relex(), the expanded character class (prior to range expansion)
874  * must be less than twice the size of their full name.
875  */
876 
877 /* Because isblank doesn't show up in any of the header files on any
878  * system i use, it's defined here.  if some other locale has a richer
879  * definition of "blank", define HAS_ISBLANK and provide your own
880  * version.
881  * the parentheses here are an attempt to find a path through the maze
882  * of macro definition and/or function and/or version provided.  thanks
883  * to nelson beebe for the suggestion; let's see if it works everywhere.
884  */
885 
886 #ifndef HAS_ISBLANK
887 
888 int (xisblank)(int c)
889 {
890 	return c==' ' || c=='\t';
891 }
892 
893 #endif
894 
895 static const struct charclass {
896 	const char *cc_name;
897 	int cc_namelen;
898 	int (*cc_func)(int);
899 } charclasses[] = {
900 	{ "alnum",	5,	isalnum },
901 	{ "alpha",	5,	isalpha },
902 #ifndef HAS_ISBLANK
903 	{ "blank",	5,	xisblank },
904 #else
905 	{ "blank",	5,	isblank },
906 #endif
907 	{ "cntrl",	5,	iscntrl },
908 	{ "digit",	5,	isdigit },
909 	{ "graph",	5,	isgraph },
910 	{ "lower",	5,	islower },
911 	{ "print",	5,	isprint },
912 	{ "punct",	5,	ispunct },
913 	{ "space",	5,	isspace },
914 	{ "upper",	5,	isupper },
915 	{ "xdigit",	6,	isxdigit },
916 	{ NULL,		0,	NULL },
917 };
918 
919 #define REPEAT_SIMPLE		0
920 #define REPEAT_PLUS_APPENDED	1
921 #define REPEAT_WITH_Q		2
922 #define REPEAT_ZERO		3
923 
924 static int
925 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
926 	       int atomlen, int firstnum, int secondnum, int special_case)
927 {
928 	int i, j;
929 	uschar *buf = NULL;
930 	int ret = 1;
931 	int init_q = (firstnum == 0);		/* first added char will be ? */
932 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
933 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
934 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
935 	int size = prefix_length +  suffix_length;
936 
937 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
938 		size += atomlen*(firstnum-1);
939 	}
940 
941 	/* Adjust size of buffer for special cases */
942 	if (special_case == REPEAT_PLUS_APPENDED) {
943 		size++;		/* for the final + */
944 	} else if (special_case == REPEAT_WITH_Q) {
945 		size += init_q + (atomlen+1)* n_q_reps;
946 	} else if (special_case == REPEAT_ZERO) {
947 		size += 2;	/* just a null ERE: () */
948 	}
949 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
950 		FATAL("out of space in reg expr %.10s..", lastre);
951 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
952 	j = prefix_length;
953 	if (special_case == REPEAT_ZERO) {
954 		j -= atomlen;
955 		buf[j++] = '(';
956 		buf[j++] = ')';
957 	}
958 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
959 		memcpy(&buf[j], atom, atomlen);
960 		j += atomlen;
961 	}
962 	if (special_case == REPEAT_PLUS_APPENDED) {
963 		buf[j++] = '+';
964 	} else if (special_case == REPEAT_WITH_Q) {
965 		if (init_q)
966 			buf[j++] = '?';
967 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
968 			memcpy(&buf[j], atom, atomlen);
969 			j += atomlen;
970 			buf[j++] = '?';
971 		}
972 	}
973 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
974 	if (special_case == REPEAT_ZERO) {
975 		buf[j+suffix_length] = '\0';
976 	} else {
977 		buf[size] = '\0';
978 	}
979 	/* free old basestr */
980 	if (firstbasestr != basestr) {
981 		if (basestr)
982 			xfree(basestr);
983 	}
984 	basestr = buf;
985 	prestr  = buf + prefix_length;
986 	if (special_case == REPEAT_ZERO) {
987 		prestr  -= atomlen;
988 		ret++;
989 	}
990 	return ret;
991 }
992 
993 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
994 		  int atomlen, int firstnum, int secondnum)
995 {
996 	/*
997 	   In general, the repetition specifier or "bound" is replaced here
998 	   by an equivalent ERE string, repeating the immediately previous atom
999 	   and appending ? and + as needed. Note that the first copy of the
1000 	   atom is left in place, except in the special_case of a zero-repeat
1001 	   (i.e., {0}).
1002 	 */
1003 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1004 		if (firstnum < 2) {
1005 			/* 0 or 1: should be handled before you get here */
1006 			FATAL("internal error");
1007 		} else {
1008 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1009 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1010 		}
1011 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1012 		if (firstnum == 0) {	/* {0} or {0,0} */
1013 			/* This case is unusual because the resulting
1014 			   replacement string might actually be SMALLER than
1015 			   the original ERE */
1016 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1017 					firstnum, secondnum, REPEAT_ZERO);
1018 		} else {		/* (firstnum >= 1) */
1019 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1020 					firstnum, secondnum, REPEAT_SIMPLE);
1021 		}
1022 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1023 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1024 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1025 					firstnum, secondnum, REPEAT_WITH_Q);
1026 	} else {	/* Error - shouldn't be here (n>m) */
1027 		FATAL("internal error");
1028 	}
1029 	return 0;
1030 }
1031 
1032 int relex(void)		/* lexical analyzer for reparse */
1033 {
1034 	int c, n;
1035 	int cflag;
1036 	static uschar *buf = NULL;
1037 	static int bufsz = 100;
1038 	uschar *bp;
1039 	const struct charclass *cc;
1040 	int i;
1041 	int num, m;
1042 	bool commafound, digitfound;
1043 	const uschar *startreptok;
1044 	static int parens = 0;
1045 
1046 rescan:
1047 	starttok = prestr;
1048 
1049 	switch (c = *prestr++) {
1050 	case '|': return OR;
1051 	case '*': return STAR;
1052 	case '+': return PLUS;
1053 	case '?': return QUEST;
1054 	case '.': return DOT;
1055 	case '\0': prestr--; return '\0';
1056 	case '^':
1057 	case '$':
1058 		return c;
1059 	case '(':
1060 		parens++;
1061 		return c;
1062 	case ')':
1063 		if (parens) {
1064 			parens--;
1065 			return c;
1066 		}
1067 		/* unmatched close parenthesis; per POSIX, treat as literal */
1068 		rlxval = c;
1069 		return CHAR;
1070 	case '\\':
1071 		rlxval = quoted(&prestr);
1072 		return CHAR;
1073 	default:
1074 		rlxval = c;
1075 		return CHAR;
1076 	case '[':
1077 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1078 			FATAL("out of space in reg expr %.10s..", lastre);
1079 		bp = buf;
1080 		if (*prestr == '^') {
1081 			cflag = 1;
1082 			prestr++;
1083 		}
1084 		else
1085 			cflag = 0;
1086 		n = 2 * strlen((const char *) prestr)+1;
1087 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1088 			FATAL("out of space for reg expr %.10s...", lastre);
1089 		for (; ; ) {
1090 			if ((c = *prestr++) == '\\') {
1091 				*bp++ = '\\';
1092 				if ((c = *prestr++) == '\0')
1093 					FATAL("nonterminated character class %.20s...", lastre);
1094 				*bp++ = c;
1095 			/* } else if (c == '\n') { */
1096 			/* 	FATAL("newline in character class %.20s...", lastre); */
1097 			} else if (c == '[' && *prestr == ':') {
1098 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1099 				for (cc = charclasses; cc->cc_name; cc++)
1100 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1101 						break;
1102 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1103 				    prestr[2 + cc->cc_namelen] == ']') {
1104 					prestr += cc->cc_namelen + 3;
1105 					/*
1106 					 * BUG: We begin at 1, instead of 0, since we
1107 					 * would otherwise prematurely terminate the
1108 					 * string for classes like [[:cntrl:]]. This
1109 					 * means that we can't match the NUL character,
1110 					 * not without first adapting the entire
1111 					 * program to track each string's length.
1112 					 */
1113 					for (i = 1; i <= UCHAR_MAX; i++) {
1114 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1115 						    FATAL("out of space for reg expr %.10s...", lastre);
1116 						if (cc->cc_func(i)) {
1117 							/* escape backslash */
1118 							if (i == '\\') {
1119 								*bp++ = '\\';
1120 								n++;
1121 							}
1122 
1123 							*bp++ = i;
1124 							n++;
1125 						}
1126 					}
1127 				} else
1128 					*bp++ = c;
1129 			} else if (c == '[' && *prestr == '.') {
1130 				char collate_char;
1131 				prestr++;
1132 				collate_char = *prestr++;
1133 				if (*prestr == '.' && prestr[1] == ']') {
1134 					prestr += 2;
1135 					/* Found it: map via locale TBD: for
1136 					   now, simply return this char.  This
1137 					   is sufficient to pass conformance
1138 					   test awk.ex 156
1139 					 */
1140 					if (*prestr == ']') {
1141 						prestr++;
1142 						rlxval = collate_char;
1143 						return CHAR;
1144 					}
1145 				}
1146 			} else if (c == '[' && *prestr == '=') {
1147 				char equiv_char;
1148 				prestr++;
1149 				equiv_char = *prestr++;
1150 				if (*prestr == '=' && prestr[1] == ']') {
1151 					prestr += 2;
1152 					/* Found it: map via locale TBD: for now
1153 					   simply return this char. This is
1154 					   sufficient to pass conformance test
1155 					   awk.ex 156
1156 					 */
1157 					if (*prestr == ']') {
1158 						prestr++;
1159 						rlxval = equiv_char;
1160 						return CHAR;
1161 					}
1162 				}
1163 			} else if (c == '\0') {
1164 				FATAL("nonterminated character class %.20s", lastre);
1165 			} else if (bp == buf) {	/* 1st char is special */
1166 				*bp++ = c;
1167 			} else if (c == ']') {
1168 				*bp++ = 0;
1169 				rlxstr = (uschar *) tostring((char *) buf);
1170 				if (cflag == 0)
1171 					return CCL;
1172 				else
1173 					return NCCL;
1174 			} else
1175 				*bp++ = c;
1176 		}
1177 		break;
1178 	case '{':
1179 		if (isdigit(*(prestr))) {
1180 			num = 0;	/* Process as a repetition */
1181 			n = -1; m = -1;
1182 			commafound = false;
1183 			digitfound = false;
1184 			startreptok = prestr-1;
1185 			/* Remember start of previous atom here ? */
1186 		} else {        	/* just a { char, not a repetition */
1187 			rlxval = c;
1188 			return CHAR;
1189                 }
1190 		for (; ; ) {
1191 			if ((c = *prestr++) == '}') {
1192 				if (commafound) {
1193 					if (digitfound) { /* {n,m} */
1194 						m = num;
1195 						if (m < n)
1196 							FATAL("illegal repetition expression: class %.20s",
1197 								lastre);
1198 						if (n == 0 && m == 1) {
1199 							return QUEST;
1200 						}
1201 					} else {	/* {n,} */
1202 						if (n == 0)
1203 							return STAR;
1204 						else if (n == 1)
1205 							return PLUS;
1206 					}
1207 				} else {
1208 					if (digitfound) { /* {n} same as {n,n} */
1209 						n = num;
1210 						m = num;
1211 					} else {	/* {} */
1212 						FATAL("illegal repetition expression: class %.20s",
1213 							lastre);
1214 					}
1215 				}
1216 				if (repeat(starttok, prestr-starttok, lastatom,
1217 					   startreptok - lastatom, n, m) > 0) {
1218 					if (n == 0 && m == 0) {
1219 						return ZERO;
1220 					}
1221 					/* must rescan input for next token */
1222 					goto rescan;
1223 				}
1224 				/* Failed to replace: eat up {...} characters
1225 				   and treat like just PLUS */
1226 				return PLUS;
1227 			} else if (c == '\0') {
1228 				FATAL("nonterminated character class %.20s",
1229 					lastre);
1230 			} else if (isdigit(c)) {
1231 				num = 10 * num + c - '0';
1232 				digitfound = true;
1233 			} else if (c == ',') {
1234 				if (commafound)
1235 					FATAL("illegal repetition expression: class %.20s",
1236 						lastre);
1237 				/* looking for {n,} or {n,m} */
1238 				commafound = true;
1239 				n = num;
1240 				digitfound = false; /* reset */
1241 				num = 0;
1242 			} else {
1243 				FATAL("illegal repetition expression: class %.20s",
1244 					lastre);
1245 			}
1246 		}
1247 		break;
1248 	}
1249 }
1250 
1251 int cgoto(fa *f, int s, int c)
1252 {
1253 	int *p, *q;
1254 	int i, j, k;
1255 
1256 	assert(c == HAT || c < NCHARS);
1257 	while (f->accept >= maxsetvec) {	/* guessing here! */
1258 		resizesetvec(__func__);
1259 	}
1260 	for (i = 0; i <= f->accept; i++)
1261 		setvec[i] = 0;
1262 	setcnt = 0;
1263 	resize_state(f, s);
1264 	/* compute positions of gototab[s,c] into setvec */
1265 	p = f->posns[s];
1266 	for (i = 1; i <= *p; i++) {
1267 		if ((k = f->re[p[i]].ltype) != FINAL) {
1268 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1269 			 || (k == DOT && c != 0 && c != HAT)
1270 			 || (k == ALL && c != 0)
1271 			 || (k == EMPTYRE && c != 0)
1272 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1273 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1274 				q = f->re[p[i]].lfollow;
1275 				for (j = 1; j <= *q; j++) {
1276 					if (q[j] >= maxsetvec) {
1277 						resizesetvec(__func__);
1278 					}
1279 					if (setvec[q[j]] == 0) {
1280 						setcnt++;
1281 						setvec[q[j]] = 1;
1282 					}
1283 				}
1284 			}
1285 		}
1286 	}
1287 	/* determine if setvec is a previous state */
1288 	tmpset[0] = setcnt;
1289 	j = 1;
1290 	for (i = f->accept; i >= 0; i--)
1291 		if (setvec[i]) {
1292 			tmpset[j++] = i;
1293 		}
1294 	resize_state(f, f->curstat > s ? f->curstat : s);
1295 	/* tmpset == previous state? */
1296 	for (i = 1; i <= f->curstat; i++) {
1297 		p = f->posns[i];
1298 		if ((k = tmpset[0]) != p[0])
1299 			goto different;
1300 		for (j = 1; j <= k; j++)
1301 			if (tmpset[j] != p[j])
1302 				goto different;
1303 		/* setvec is state i */
1304 		if (c != HAT)
1305 			f->gototab[s][c] = i;
1306 		return i;
1307 	  different:;
1308 	}
1309 
1310 	/* add tmpset to current set of states */
1311 	++(f->curstat);
1312 	resize_state(f, f->curstat);
1313 	for (i = 0; i < NCHARS; i++)
1314 		f->gototab[f->curstat][i] = 0;
1315 	xfree(f->posns[f->curstat]);
1316 	p = intalloc(setcnt + 1, __func__);
1317 
1318 	f->posns[f->curstat] = p;
1319 	if (c != HAT)
1320 		f->gototab[s][c] = f->curstat;
1321 	for (i = 0; i <= setcnt; i++)
1322 		p[i] = tmpset[i];
1323 	if (setvec[f->accept])
1324 		f->out[f->curstat] = 1;
1325 	else
1326 		f->out[f->curstat] = 0;
1327 	return f->curstat;
1328 }
1329 
1330 
1331 void freefa(fa *f)	/* free a finite automaton */
1332 {
1333 	int i;
1334 
1335 	if (f == NULL)
1336 		return;
1337 	for (i = 0; i < f->state_count; i++)
1338 		xfree(f->gototab[i])
1339 	for (i = 0; i <= f->curstat; i++)
1340 		xfree(f->posns[i]);
1341 	for (i = 0; i <= f->accept; i++) {
1342 		xfree(f->re[i].lfollow);
1343 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1344 			xfree(f->re[i].lval.np);
1345 	}
1346 	xfree(f->restr);
1347 	xfree(f->out);
1348 	xfree(f->posns);
1349 	xfree(f->gototab);
1350 	xfree(f);
1351 }
1352