xref: /netbsd/external/historical/nawk/dist/b.c (revision 8eaba782)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #if HAVE_NBTOOL_CONFIG_H
28 #include "nbtool_config.h"
29 #endif
30 
31 #define	DEBUG
32 
33 #include <ctype.h>
34 #include <limits.h>
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38 #include "awk.h"
39 #include "awkgram.h"
40 
41 #define MAXLIN 22
42 
43 #define type(v)		(v)->nobj	/* badly overloaded here */
44 #define info(v)		(v)->ntype	/* badly overloaded here */
45 #define left(v)		(v)->narg[0]
46 #define right(v)	(v)->narg[1]
47 #define parent(v)	(v)->nnext
48 
49 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
50 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
51 #define UNARY	case STAR: case PLUS: case QUEST:
52 
53 /* encoding in tree Nodes:
54 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
55 		left is index, right contains value or pointer to value
56 	unary (STAR, PLUS, QUEST): left is child, right is null
57 	binary (CAT, OR): left and right are children
58 	parent contains pointer to parent
59 */
60 
61 
62 int	*setvec;
63 int	*tmpset;
64 int	maxsetvec = 0;
65 
66 int	rtok;		/* next token in current re */
67 int	rlxval;
68 static const uschar	*rlxstr;
69 static const uschar	*prestr;	/* current position in current re */
70 static const uschar	*lastre;	/* origin of last re */
71 static const uschar	*lastatom;	/* origin of last Atom */
72 static const uschar	*starttok;
73 static const uschar 	*basestr;	/* starts with original, replaced during
74 				   repetition processing */
75 static const uschar 	*firstbasestr;
76 
77 static	int setcnt;
78 static	int poscnt;
79 
80 const char	*patbeg;
81 int	patlen;
82 
83 #define	NFA	128	/* cache this many dynamic fa's */
84 fa	*fatab[NFA];
85 int	nfatab	= 0;	/* entries in fatab */
86 
87 static int *
intalloc(size_t n,const char * f)88 intalloc(size_t n, const char *f)
89 {
90 	void *p = calloc(n, sizeof(int));
91 	if (p == NULL)
92 		overflo(f);
93 	return p;
94 }
95 
96 static void
resizesetvec(const char * f)97 resizesetvec(const char *f)
98 {
99 	if (maxsetvec == 0)
100 		maxsetvec = MAXLIN;
101 	else
102 		maxsetvec *= 4;
103 	setvec = realloc(setvec, maxsetvec * sizeof(*setvec));
104 	tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset));
105 	if (setvec == NULL || tmpset == NULL)
106 		overflo(f);
107 }
108 
109 static void
resize_state(fa * f,int state)110 resize_state(fa *f, int state)
111 {
112 	void *p;
113 	int i, new_count;
114 
115 	if (++state < f->state_count)
116 		return;
117 
118 	new_count = state + 10; /* needs to be tuned */
119 
120 	p = realloc(f->gototab, new_count * sizeof(f->gototab[0]));
121 	if (p == NULL)
122 		goto out;
123 	f->gototab = p;
124 
125 	p = realloc(f->out, new_count * sizeof(f->out[0]));
126 	if (p == NULL)
127 		goto out;
128 	f->out = p;
129 
130 	p = realloc(f->posns, new_count * sizeof(f->posns[0]));
131 	if (p == NULL)
132 		goto out;
133 	f->posns = p;
134 
135 	for (i = f->state_count; i < new_count; ++i) {
136 		f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab));
137 		if (f->gototab[i] == NULL)
138 			goto out;
139 		f->out[i]  = 0;
140 		f->posns[i] = NULL;
141 	}
142 	f->state_count = new_count;
143 	return;
144 out:
145 	overflo(__func__);
146 }
147 
makedfa(const char * s,bool anchor)148 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
149 {
150 	int i, use, nuse;
151 	fa *pfa;
152 	static int now = 1;
153 
154 	if (setvec == NULL) {	/* first time through any RE */
155 		resizesetvec(__func__);
156 	}
157 
158 	if (compile_time != RUNNING)	/* a constant for sure */
159 		return mkdfa(s, anchor);
160 	for (i = 0; i < nfatab; i++)	/* is it there already? */
161 		if (fatab[i]->anchor == anchor
162 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
163 			fatab[i]->use = now++;
164 			return fatab[i];
165 		}
166 	pfa = mkdfa(s, anchor);
167 	if (nfatab < NFA) {	/* room for another */
168 		fatab[nfatab] = pfa;
169 		fatab[nfatab]->use = now++;
170 		nfatab++;
171 		return pfa;
172 	}
173 	use = fatab[0]->use;	/* replace least-recently used */
174 	nuse = 0;
175 	for (i = 1; i < nfatab; i++)
176 		if (fatab[i]->use < use) {
177 			use = fatab[i]->use;
178 			nuse = i;
179 		}
180 	freefa(fatab[nuse]);
181 	fatab[nuse] = pfa;
182 	pfa->use = now++;
183 	return pfa;
184 }
185 
mkdfa(const char * s,bool anchor)186 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
187 				/* anchor = true for anchored matches, else false */
188 {
189 	Node *p, *p1;
190 	fa *f;
191 
192 	firstbasestr = (const uschar *) s;
193 	basestr = firstbasestr;
194 	p = reparse(s);
195 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
196 		/* put ALL STAR in front of reg.  exp. */
197 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
198 		/* put FINAL after reg.  exp. */
199 
200 	poscnt = 0;
201 	penter(p1);	/* enter parent pointers and leaf indices */
202 	if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
203 		overflo(__func__);
204 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
205 	cfoll(f, p1);	/* set up follow sets */
206 	freetr(p1);
207 	resize_state(f, 1);
208 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
209 	f->posns[1] = intalloc(1, __func__);
210 	*f->posns[1] = 0;
211 	f->initstat = makeinit(f, anchor);
212 	f->anchor = anchor;
213 	f->restr = (uschar *) tostring(s);
214 	if (firstbasestr != basestr) {
215 		if (basestr)
216 			xfree(basestr);
217 	}
218 	return f;
219 }
220 
makeinit(fa * f,bool anchor)221 int makeinit(fa *f, bool anchor)
222 {
223 	int i, k;
224 
225 	f->curstat = 2;
226 	f->out[2] = 0;
227 	k = *(f->re[0].lfollow);
228 	xfree(f->posns[2]);
229 	f->posns[2] = intalloc(k + 1,  __func__);
230 	for (i = 0; i <= k; i++) {
231 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
232 	}
233 	if ((f->posns[2])[1] == f->accept)
234 		f->out[2] = 1;
235 	for (i = 0; i < NCHARS; i++)
236 		f->gototab[2][i] = 0;
237 	f->curstat = cgoto(f, 2, HAT);
238 	if (anchor) {
239 		*f->posns[2] = k-1;	/* leave out position 0 */
240 		for (i = 0; i < k; i++) {
241 			(f->posns[0])[i] = (f->posns[2])[i];
242 		}
243 
244 		f->out[0] = f->out[2];
245 		if (f->curstat != 2)
246 			--(*f->posns[f->curstat]);
247 	}
248 	return f->curstat;
249 }
250 
penter(Node * p)251 void penter(Node *p)	/* set up parent pointers and leaf indices */
252 {
253 	switch (type(p)) {
254 	ELEAF
255 	LEAF
256 		info(p) = poscnt;
257 		poscnt++;
258 		break;
259 	UNARY
260 		penter(left(p));
261 		parent(left(p)) = p;
262 		break;
263 	case CAT:
264 	case OR:
265 		penter(left(p));
266 		penter(right(p));
267 		parent(left(p)) = p;
268 		parent(right(p)) = p;
269 		break;
270 	case ZERO:
271 		break;
272 	default:	/* can't happen */
273 		FATAL("can't happen: unknown type %d in penter", type(p));
274 		break;
275 	}
276 }
277 
freetr(Node * p)278 void freetr(Node *p)	/* free parse tree */
279 {
280 	switch (type(p)) {
281 	ELEAF
282 	LEAF
283 		xfree(p);
284 		break;
285 	UNARY
286 	case ZERO:
287 		freetr(left(p));
288 		xfree(p);
289 		break;
290 	case CAT:
291 	case OR:
292 		freetr(left(p));
293 		freetr(right(p));
294 		xfree(p);
295 		break;
296 	default:	/* can't happen */
297 		FATAL("can't happen: unknown type %d in freetr", type(p));
298 		break;
299 	}
300 }
301 
302 /* in the parsing of regular expressions, metacharacters like . have */
303 /* to be seen literally;  \056 is not a metacharacter. */
304 
hexstr(const uschar ** pp)305 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
306 {			/* only pick up one 8-bit byte (2 chars) */
307 	const uschar *p;
308 	int n = 0;
309 	int i;
310 
311 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
312 		if (isdigit(*p))
313 			n = 16 * n + *p - '0';
314 		else if (*p >= 'a' && *p <= 'f')
315 			n = 16 * n + *p - 'a' + 10;
316 		else if (*p >= 'A' && *p <= 'F')
317 			n = 16 * n + *p - 'A' + 10;
318 	}
319 	*pp = p;
320 	return n;
321 }
322 
323 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
324 
quoted(const uschar ** pp)325 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
326 			/* and increment *pp */
327 {
328 	const uschar *p = *pp;
329 	int c;
330 
331 	if ((c = *p++) == 't')
332 		c = '\t';
333 	else if (c == 'n')
334 		c = '\n';
335 	else if (c == 'f')
336 		c = '\f';
337 	else if (c == 'r')
338 		c = '\r';
339 	else if (c == 'b')
340 		c = '\b';
341 	else if (c == 'v')
342 		c = '\v';
343 	else if (c == 'a')
344 		c = '\a';
345 	else if (c == '\\')
346 		c = '\\';
347 	else if (c == 'x') {	/* hexadecimal goo follows */
348 		c = hexstr(&p);	/* this adds a null if number is invalid */
349 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
350 		int n = c - '0';
351 		if (isoctdigit(*p)) {
352 			n = 8 * n + *p++ - '0';
353 			if (isoctdigit(*p))
354 				n = 8 * n + *p++ - '0';
355 		}
356 		c = n;
357 	} /* else */
358 		/* c = c; */
359 	*pp = p;
360 	return c;
361 }
362 
cclenter(const char * argp)363 char *cclenter(const char *argp)	/* add a character class */
364 {
365 	int i, c, c2;
366 	const uschar *op, *p = (const uschar *) argp;
367 	uschar *bp;
368 	static uschar *buf = NULL;
369 	static int bufsz = 100;
370 
371 	op = p;
372 	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
373 		FATAL("out of space for character class [%.10s...] 1", p);
374 	bp = buf;
375 	for (i = 0; (c = *p++) != 0; ) {
376 		if (c == '\\') {
377 			c = quoted(&p);
378 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
379 			if (*p != 0) {
380 				c = bp[-1];
381 				c2 = *p++;
382 				if (c2 == '\\')
383 					c2 = quoted(&p);
384 				if (c > c2) {	/* empty; ignore */
385 					bp--;
386 					i--;
387 					continue;
388 				}
389 				while (c < c2) {
390 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
391 						FATAL("out of space for character class [%.10s...] 2", p);
392 					*bp++ = ++c;
393 					i++;
394 				}
395 				continue;
396 			}
397 		}
398 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
399 			FATAL("out of space for character class [%.10s...] 3", p);
400 		*bp++ = c;
401 		i++;
402 	}
403 	*bp = 0;
404 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
405 	xfree(op);
406 	return (char *) tostring((char *) buf);
407 }
408 
overflo(const char * s)409 void overflo(const char *s)
410 {
411 	FATAL("regular expression too big: out of space in %.30s...", s);
412 }
413 
cfoll(fa * f,Node * v)414 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
415 {
416 	int i;
417 	int *p;
418 
419 	switch (type(v)) {
420 	ELEAF
421 	LEAF
422 		f->re[info(v)].ltype = type(v);
423 		f->re[info(v)].lval.np = right(v);
424 		while (f->accept >= maxsetvec) {	/* guessing here! */
425 			resizesetvec(__func__);
426 		}
427 		for (i = 0; i <= f->accept; i++)
428 			setvec[i] = 0;
429 		setcnt = 0;
430 		follow(v);	/* computes setvec and setcnt */
431 		p = intalloc(setcnt + 1, __func__);
432 		f->re[info(v)].lfollow = p;
433 		*p = setcnt;
434 		for (i = f->accept; i >= 0; i--)
435 			if (setvec[i] == 1)
436 				*++p = i;
437 		break;
438 	UNARY
439 		cfoll(f,left(v));
440 		break;
441 	case CAT:
442 	case OR:
443 		cfoll(f,left(v));
444 		cfoll(f,right(v));
445 		break;
446 	case ZERO:
447 		break;
448 	default:	/* can't happen */
449 		FATAL("can't happen: unknown type %d in cfoll", type(v));
450 	}
451 }
452 
first(Node * p)453 int first(Node *p)	/* collects initially active leaves of p into setvec */
454 			/* returns 0 if p matches empty string */
455 {
456 	int b, lp;
457 
458 	switch (type(p)) {
459 	ELEAF
460 	LEAF
461 		lp = info(p);	/* look for high-water mark of subscripts */
462 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
463 			resizesetvec(__func__);
464 		}
465 		if (type(p) == EMPTYRE) {
466 			setvec[lp] = 0;
467 			return(0);
468 		}
469 		if (setvec[lp] != 1) {
470 			setvec[lp] = 1;
471 			setcnt++;
472 		}
473 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
474 			return(0);		/* empty CCL */
475 		return(1);
476 	case PLUS:
477 		if (first(left(p)) == 0)
478 			return(0);
479 		return(1);
480 	case STAR:
481 	case QUEST:
482 		first(left(p));
483 		return(0);
484 	case CAT:
485 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
486 		return(1);
487 	case OR:
488 		b = first(right(p));
489 		if (first(left(p)) == 0 || b == 0) return(0);
490 		return(1);
491 	case ZERO:
492 		return 0;
493 	}
494 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
495 	return(-1);
496 }
497 
follow(Node * v)498 void follow(Node *v)	/* collects leaves that can follow v into setvec */
499 {
500 	Node *p;
501 
502 	if (type(v) == FINAL)
503 		return;
504 	p = parent(v);
505 	switch (type(p)) {
506 	case STAR:
507 	case PLUS:
508 		first(v);
509 		follow(p);
510 		return;
511 
512 	case OR:
513 	case QUEST:
514 		follow(p);
515 		return;
516 
517 	case CAT:
518 		if (v == left(p)) {	/* v is left child of p */
519 			if (first(right(p)) == 0) {
520 				follow(p);
521 				return;
522 			}
523 		} else		/* v is right child */
524 			follow(p);
525 		return;
526 	}
527 }
528 
member(int c,const char * sarg)529 int member(int c, const char *sarg)	/* is c in s? */
530 {
531 	const uschar *s = (const uschar *) sarg;
532 
533 	while (*s)
534 		if (c == *s++)
535 			return(1);
536 	return(0);
537 }
538 
match(fa * f,const char * p0)539 int match(fa *f, const char *p0)	/* shortest match ? */
540 {
541 	int s, ns;
542 	const uschar *p = (const uschar *) p0;
543 
544 	s = f->initstat;
545 	assert (s < f->state_count);
546 
547 	if (f->out[s])
548 		return(1);
549 	do {
550 		/* assert(*p < NCHARS); */
551 		if ((ns = f->gototab[s][*p]) != 0)
552 			s = ns;
553 		else
554 			s = cgoto(f, s, *p);
555 		if (f->out[s])
556 			return(1);
557 	} while (*p++ != 0);
558 	return(0);
559 }
560 
pmatch(fa * f,const char * p0)561 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
562 {
563 	int s, ns;
564 	const uschar *p = (const uschar *) p0;
565 	const uschar *q;
566 
567 	s = f->initstat;
568 	assert(s < f->state_count);
569 
570 	patbeg = (const char *)p;
571 	patlen = -1;
572 	do {
573 		q = p;
574 		do {
575 			if (f->out[s])		/* final state */
576 				patlen = q-p;
577 			/* assert(*q < NCHARS); */
578 			if ((ns = f->gototab[s][*q]) != 0)
579 				s = ns;
580 			else
581 				s = cgoto(f, s, *q);
582 
583 			assert(s < f->state_count);
584 
585 			if (s == 1) {	/* no transition */
586 				if (patlen >= 0) {
587 					patbeg = (const char *) p;
588 					return(1);
589 				}
590 				else
591 					goto nextin;	/* no match */
592 			}
593 		} while (*q++ != 0);
594 		if (f->out[s])
595 			patlen = q-p-1;	/* don't count $ */
596 		if (patlen >= 0) {
597 			patbeg = (const char *) p;
598 			return(1);
599 		}
600 	nextin:
601 		s = 2;
602 	} while (*p++);
603 	return (0);
604 }
605 
nematch(fa * f,const char * p0)606 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
607 {
608 	int s, ns;
609 	const uschar *p = (const uschar *) p0;
610 	const uschar *q;
611 
612 	s = f->initstat;
613 	assert(s < f->state_count);
614 
615 	patbeg = (const char *)p;
616 	patlen = -1;
617 	while (*p) {
618 		q = p;
619 		do {
620 			if (f->out[s])		/* final state */
621 				patlen = q-p;
622 			/* assert(*q < NCHARS); */
623 			if ((ns = f->gototab[s][*q]) != 0)
624 				s = ns;
625 			else
626 				s = cgoto(f, s, *q);
627 			if (s == 1) {	/* no transition */
628 				if (patlen > 0) {
629 					patbeg = (const char *) p;
630 					return(1);
631 				} else
632 					goto nnextin;	/* no nonempty match */
633 			}
634 		} while (*q++ != 0);
635 		if (f->out[s])
636 			patlen = q-p-1;	/* don't count $ */
637 		if (patlen > 0 ) {
638 			patbeg = (const char *) p;
639 			return(1);
640 		}
641 	nnextin:
642 		s = 2;
643 		p++;
644 	}
645 	return (0);
646 }
647 
648 
649 /*
650  * NAME
651  *     fnematch
652  *
653  * DESCRIPTION
654  *     A stream-fed version of nematch which transfers characters to a
655  *     null-terminated buffer. All characters up to and including the last
656  *     character of the matching text or EOF are placed in the buffer. If
657  *     a match is found, patbeg and patlen are set appropriately.
658  *
659  * RETURN VALUES
660  *     false    No match found.
661  *     true     Match found.
662  */
663 
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)664 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
665 {
666 	char *buf = *pbuf;
667 	int bufsize = *pbufsize;
668 	int c, i, j, k, ns, s;
669 
670 	s = pfa->initstat;
671 	patlen = 0;
672 
673 	/*
674 	 * All indices relative to buf.
675 	 * i <= j <= k <= bufsize
676 	 *
677 	 * i: origin of active substring
678 	 * j: current character
679 	 * k: destination of next getc()
680 	 */
681 	i = -1, k = 0;
682         do {
683 		j = i++;
684 		do {
685 			if (++j == k) {
686 				if (k == bufsize)
687 					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
688 						FATAL("stream '%.30s...' too long", buf);
689 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
690 			}
691 			c = buf[j];
692 			/* assert(c < NCHARS); */
693 
694 			if ((ns = pfa->gototab[s][c]) != 0)
695 				s = ns;
696 			else
697 				s = cgoto(pfa, s, c);
698 
699 			if (pfa->out[s]) {	/* final state */
700 				patlen = j - i + 1;
701 				if (c == 0)	/* don't count $ */
702 					patlen--;
703 			}
704 		} while (buf[j] && s != 1);
705 		s = 2;
706 	} while (buf[i] && !patlen);
707 
708 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
709 	*pbuf = buf;
710 	*pbufsize = bufsize;
711 
712 	if (patlen) {
713 		patbeg = (char *) buf + i;
714 		/*
715 		 * Under no circumstances is the last character fed to
716 		 * the automaton part of the match. It is EOF's nullbyte,
717 		 * or it sent the automaton into a state with no further
718 		 * transitions available (s==1), or both. Room for a
719 		 * terminating nullbyte is guaranteed.
720 		 *
721 		 * ungetc any chars after the end of matching text
722 		 * (except for EOF's nullbyte, if present) and null
723 		 * terminate the buffer.
724 		 */
725 		do
726 			if (buf[--k] && ungetc(buf[k], f) == EOF)
727 				FATAL("unable to ungetc '%c'", buf[k]);
728 		while (k > i + patlen);
729 		buf[k] = '\0';
730 		return true;
731 	}
732 	else
733 		return false;
734 }
735 
reparse(const char * p)736 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
737 {			/* uses relex() to scan regular expression */
738 	Node *np;
739 
740 	dprintf( ("reparse <%s>\n", p) );
741 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
742 	rtok = relex();
743 	/* GNU compatibility: an empty regexp matches anything */
744 	if (rtok == '\0') {
745 		/* FATAL("empty regular expression"); previous */
746 		return(op2(EMPTYRE, NIL, NIL));
747 	}
748 	np = regexp();
749 	if (rtok != '\0')
750 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
751 	return(np);
752 }
753 
regexp(void)754 Node *regexp(void)	/* top-level parse of reg expr */
755 {
756 	return (alt(concat(primary())));
757 }
758 
primary(void)759 Node *primary(void)
760 {
761 	Node *np;
762 	int savelastatom;
763 
764 	switch (rtok) {
765 	case CHAR:
766 		lastatom = starttok;
767 		np = op2(CHAR, NIL, itonp(rlxval));
768 		rtok = relex();
769 		return (unary(np));
770 	case ALL:
771 		rtok = relex();
772 		return (unary(op2(ALL, NIL, NIL)));
773 	case EMPTYRE:
774 		rtok = relex();
775 		return (unary(op2(EMPTYRE, NIL, NIL)));
776 	case DOT:
777 		lastatom = starttok;
778 		rtok = relex();
779 		return (unary(op2(DOT, NIL, NIL)));
780 	case CCL:
781 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
782 		lastatom = starttok;
783 		rtok = relex();
784 		return (unary(np));
785 	case NCCL:
786 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
787 		lastatom = starttok;
788 		rtok = relex();
789 		return (unary(np));
790 	case '^':
791 		rtok = relex();
792 		return (unary(op2(CHAR, NIL, itonp(HAT))));
793 	case '$':
794 		rtok = relex();
795 		return (unary(op2(CHAR, NIL, NIL)));
796 	case '(':
797 		lastatom = starttok;
798 		savelastatom = starttok - basestr; /* Retain over recursion */
799 		rtok = relex();
800 		if (rtok == ')') {	/* special pleading for () */
801 			rtok = relex();
802 			return unary(op2(CCL, NIL, (Node *) tostring("")));
803 		}
804 		np = regexp();
805 		if (rtok == ')') {
806 			lastatom = basestr + savelastatom; /* Restore */
807 			rtok = relex();
808 			return (unary(np));
809 		}
810 		else
811 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
812 	default:
813 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
814 	}
815 	return 0;	/*NOTREACHED*/
816 }
817 
concat(Node * np)818 Node *concat(Node *np)
819 {
820 	switch (rtok) {
821 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
822 		return (concat(op2(CAT, np, primary())));
823 	case EMPTYRE:
824 		rtok = relex();
825 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
826 				primary())));
827 	}
828 	return (np);
829 }
830 
alt(Node * np)831 Node *alt(Node *np)
832 {
833 	if (rtok == OR) {
834 		rtok = relex();
835 		return (alt(op2(OR, np, concat(primary()))));
836 	}
837 	return (np);
838 }
839 
unary(Node * np)840 Node *unary(Node *np)
841 {
842 	switch (rtok) {
843 	case STAR:
844 		rtok = relex();
845 		return (unary(op2(STAR, np, NIL)));
846 	case PLUS:
847 		rtok = relex();
848 		return (unary(op2(PLUS, np, NIL)));
849 	case QUEST:
850 		rtok = relex();
851 		return (unary(op2(QUEST, np, NIL)));
852 	case ZERO:
853 		rtok = relex();
854 		return (unary(op2(ZERO, np, NIL)));
855 	default:
856 		return (np);
857 	}
858 }
859 
860 /*
861  * Character class definitions conformant to the POSIX locale as
862  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
863  * and operating character sets are both ASCII (ISO646) or supersets
864  * thereof.
865  *
866  * Note that to avoid overflowing the temporary buffer used in
867  * relex(), the expanded character class (prior to range expansion)
868  * must be less than twice the size of their full name.
869  */
870 
871 /* Because isblank doesn't show up in any of the header files on any
872  * system i use, it's defined here.  if some other locale has a richer
873  * definition of "blank", define HAS_ISBLANK and provide your own
874  * version.
875  * the parentheses here are an attempt to find a path through the maze
876  * of macro definition and/or function and/or version provided.  thanks
877  * to nelson beebe for the suggestion; let's see if it works everywhere.
878  */
879 
880 /* #define HAS_ISBLANK */
881 #ifndef HAS_ISBLANK
882 
883 int (xisblank)(int c)
884 {
885 	return c==' ' || c=='\t';
886 }
887 
888 #endif
889 
890 static const struct charclass {
891 	const char *cc_name;
892 	int cc_namelen;
893 	int (*cc_func)(int);
894 } charclasses[] = {
895 	{ "alnum",	5,	isalnum },
896 	{ "alpha",	5,	isalpha },
897 #ifndef HAS_ISBLANK
898 	{ "blank",	5,	xisblank },
899 #else
900 	{ "blank",	5,	isblank },
901 #endif
902 	{ "cntrl",	5,	iscntrl },
903 	{ "digit",	5,	isdigit },
904 	{ "graph",	5,	isgraph },
905 	{ "lower",	5,	islower },
906 	{ "print",	5,	isprint },
907 	{ "punct",	5,	ispunct },
908 	{ "space",	5,	isspace },
909 	{ "upper",	5,	isupper },
910 	{ "xdigit",	6,	isxdigit },
911 	{ NULL,		0,	NULL },
912 };
913 
914 #define REPEAT_SIMPLE		0
915 #define REPEAT_PLUS_APPENDED	1
916 #define REPEAT_WITH_Q		2
917 #define REPEAT_ZERO		3
918 
919 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)920 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
921 	       int atomlen, int firstnum, int secondnum, int special_case)
922 {
923 	int i, j;
924 	uschar *buf = 0;
925 	int ret = 1;
926 	int init_q = (firstnum == 0);		/* first added char will be ? */
927 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
928 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
929 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
930 	int size = prefix_length +  suffix_length;
931 
932 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
933 		size += atomlen*(firstnum-1);
934 	}
935 
936 	/* Adjust size of buffer for special cases */
937 	if (special_case == REPEAT_PLUS_APPENDED) {
938 		size++;		/* for the final + */
939 	} else if (special_case == REPEAT_WITH_Q) {
940 		size += init_q + (atomlen+1)* n_q_reps;
941 	} else if (special_case == REPEAT_ZERO) {
942 		size += 2;	/* just a null ERE: () */
943 	}
944 	if ((buf = malloc(size + 1)) == NULL)
945 		FATAL("out of space in reg expr %.10s..", lastre);
946 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
947 	j = prefix_length;
948 	if (special_case == REPEAT_ZERO) {
949 		j -= atomlen;
950 		buf[j++] = '(';
951 		buf[j++] = ')';
952 	}
953 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
954 		memcpy(&buf[j], atom, atomlen);
955 		j += atomlen;
956 	}
957 	if (special_case == REPEAT_PLUS_APPENDED) {
958 		buf[j++] = '+';
959 	} else if (special_case == REPEAT_WITH_Q) {
960 		if (init_q)
961 			buf[j++] = '?';
962 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
963 			memcpy(&buf[j], atom, atomlen);
964 			j += atomlen;
965 			buf[j++] = '?';
966 		}
967 	}
968 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
969 	if (special_case == REPEAT_ZERO) {
970 		buf[j+suffix_length] = '\0';
971 	} else {
972 		buf[size] = '\0';
973 	}
974 	/* free old basestr */
975 	if (firstbasestr != basestr) {
976 		if (basestr)
977 			xfree(basestr);
978 	}
979 	basestr = buf;
980 	prestr  = buf + prefix_length;
981 	if (special_case == REPEAT_ZERO) {
982 		prestr  -= atomlen;
983 		ret++;
984 	}
985 	return ret;
986 }
987 
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)988 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
989 		  int atomlen, int firstnum, int secondnum)
990 {
991 	/*
992 	   In general, the repetition specifier or "bound" is replaced here
993 	   by an equivalent ERE string, repeating the immediately previous atom
994 	   and appending ? and + as needed. Note that the first copy of the
995 	   atom is left in place, except in the special_case of a zero-repeat
996 	   (i.e., {0}).
997 	 */
998 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
999 		if (firstnum < 2) {
1000 			/* 0 or 1: should be handled before you get here */
1001 			FATAL("internal error");
1002 		} else {
1003 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1004 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1005 		}
1006 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1007 		if (firstnum == 0) {	/* {0} or {0,0} */
1008 			/* This case is unusual because the resulting
1009 			   replacement string might actually be SMALLER than
1010 			   the original ERE */
1011 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1012 					firstnum, secondnum, REPEAT_ZERO);
1013 		} else {		/* (firstnum >= 1) */
1014 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1015 					firstnum, secondnum, REPEAT_SIMPLE);
1016 		}
1017 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1018 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1019 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1020 					firstnum, secondnum, REPEAT_WITH_Q);
1021 	} else {	/* Error - shouldn't be here (n>m) */
1022 		FATAL("internal error");
1023 	}
1024 	return 0;
1025 }
1026 
relex(void)1027 int relex(void)		/* lexical analyzer for reparse */
1028 {
1029 	int c, n;
1030 	int cflag;
1031 	static uschar *buf = NULL;
1032 	static int bufsz = 100;
1033 	uschar *bp;
1034 	const struct charclass *cc;
1035 	int i;
1036 	int num, m;
1037 	bool commafound, digitfound;
1038 	const uschar *startreptok;
1039 	static int parens = 0;
1040 
1041 rescan:
1042 	starttok = prestr;
1043 
1044 	switch (c = *prestr++) {
1045 	case '|': return OR;
1046 	case '*': return STAR;
1047 	case '+': return PLUS;
1048 	case '?': return QUEST;
1049 	case '.': return DOT;
1050 	case '\0': prestr--; return '\0';
1051 	case '^':
1052 	case '$':
1053 		return c;
1054 	case '(':
1055 		parens++;
1056 		return c;
1057 	case ')':
1058 		if (parens) {
1059 			parens--;
1060 			return c;
1061 		}
1062 		/* unmatched close parenthesis; per POSIX, treat as literal */
1063 		rlxval = c;
1064 		return CHAR;
1065 	case '\\':
1066 		rlxval = quoted(&prestr);
1067 		return CHAR;
1068 	default:
1069 		rlxval = c;
1070 		return CHAR;
1071 	case '[':
1072 		if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1073 			FATAL("out of space in reg expr %.10s..", lastre);
1074 		bp = buf;
1075 		if (*prestr == '^') {
1076 			cflag = 1;
1077 			prestr++;
1078 		}
1079 		else
1080 			cflag = 0;
1081 		n = 2 * strlen((const char *) prestr)+1;
1082 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1083 			FATAL("out of space for reg expr %.10s...", lastre);
1084 		for (; ; ) {
1085 			if ((c = *prestr++) == '\\') {
1086 				*bp++ = '\\';
1087 				if ((c = *prestr++) == '\0')
1088 					FATAL("nonterminated character class %.20s...", lastre);
1089 				*bp++ = c;
1090 			/* } else if (c == '\n') { */
1091 			/* 	FATAL("newline in character class %.20s...", lastre); */
1092 			} else if (c == '[' && *prestr == ':') {
1093 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1094 				for (cc = charclasses; cc->cc_name; cc++)
1095 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1096 						break;
1097 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1098 				    prestr[2 + cc->cc_namelen] == ']') {
1099 					prestr += cc->cc_namelen + 3;
1100 					/*
1101 					 * BUG: We begin at 1, instead of 0, since we
1102 					 * would otherwise prematurely terminate the
1103 					 * string for classes like [[:cntrl:]]. This
1104 					 * means that we can't match the NUL character,
1105 					 * not without first adapting the entire
1106 					 * program to track each string's length.
1107 					 */
1108 					for (i = 1; i <= UCHAR_MAX; i++) {
1109 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1110 						    FATAL("out of space for reg expr %.10s...", lastre);
1111 						if (cc->cc_func(i)) {
1112 							*bp++ = i;
1113 							n++;
1114 						}
1115 					}
1116 				} else
1117 					*bp++ = c;
1118 			} else if (c == '[' && *prestr == '.') {
1119 				char collate_char;
1120 				prestr++;
1121 				collate_char = *prestr++;
1122 				if (*prestr == '.' && prestr[1] == ']') {
1123 					prestr += 2;
1124 					/* Found it: map via locale TBD: for
1125 					   now, simply return this char.  This
1126 					   is sufficient to pass conformance
1127 					   test awk.ex 156
1128 					 */
1129 					if (*prestr == ']') {
1130 						prestr++;
1131 						rlxval = collate_char;
1132 						return CHAR;
1133 					}
1134 				}
1135 			} else if (c == '[' && *prestr == '=') {
1136 				char equiv_char;
1137 				prestr++;
1138 				equiv_char = *prestr++;
1139 				if (*prestr == '=' && prestr[1] == ']') {
1140 					prestr += 2;
1141 					/* Found it: map via locale TBD: for now
1142 					   simply return this char. This is
1143 					   sufficient to pass conformance test
1144 					   awk.ex 156
1145 					 */
1146 					if (*prestr == ']') {
1147 						prestr++;
1148 						rlxval = equiv_char;
1149 						return CHAR;
1150 					}
1151 				}
1152 			} else if (c == '\0') {
1153 				FATAL("nonterminated character class %.20s", lastre);
1154 			} else if (bp == buf) {	/* 1st char is special */
1155 				*bp++ = c;
1156 			} else if (c == ']') {
1157 				*bp++ = 0;
1158 				rlxstr = (uschar *) tostring((char *) buf);
1159 				if (cflag == 0)
1160 					return CCL;
1161 				else
1162 					return NCCL;
1163 			} else
1164 				*bp++ = c;
1165 		}
1166 		break;
1167 	case '{':
1168 		if (isdigit(*(prestr))) {
1169 			num = 0;	/* Process as a repetition */
1170 			n = -1; m = -1;
1171 			commafound = false;
1172 			digitfound = false;
1173 			startreptok = prestr-1;
1174 			/* Remember start of previous atom here ? */
1175 		} else {        	/* just a { char, not a repetition */
1176 			rlxval = c;
1177 			return CHAR;
1178                 }
1179 		for (; ; ) {
1180 			if ((c = *prestr++) == '}') {
1181 				if (commafound) {
1182 					if (digitfound) { /* {n,m} */
1183 						m = num;
1184 						if (m < n)
1185 							FATAL("illegal repetition expression: class %.20s",
1186 								lastre);
1187 						if (n == 0 && m == 1) {
1188 							return QUEST;
1189 						}
1190 					} else {	/* {n,} */
1191 						if (n == 0)
1192 							return STAR;
1193 						else if (n == 1)
1194 							return PLUS;
1195 					}
1196 				} else {
1197 					if (digitfound) { /* {n} same as {n,n} */
1198 						n = num;
1199 						m = num;
1200 					} else {	/* {} */
1201 						FATAL("illegal repetition expression: class %.20s",
1202 							lastre);
1203 					}
1204 				}
1205 				if (repeat(starttok, prestr-starttok, lastatom,
1206 					   startreptok - lastatom, n, m) > 0) {
1207 					if (n == 0 && m == 0) {
1208 						return ZERO;
1209 					}
1210 					/* must rescan input for next token */
1211 					goto rescan;
1212 				}
1213 				/* Failed to replace: eat up {...} characters
1214 				   and treat like just PLUS */
1215 				return PLUS;
1216 			} else if (c == '\0') {
1217 				FATAL("nonterminated character class %.20s",
1218 					lastre);
1219 			} else if (isdigit(c)) {
1220 				num = 10 * num + c - '0';
1221 				digitfound = true;
1222 			} else if (c == ',') {
1223 				if (commafound)
1224 					FATAL("illegal repetition expression: class %.20s",
1225 						lastre);
1226 				/* looking for {n,} or {n,m} */
1227 				commafound = true;
1228 				n = num;
1229 				digitfound = false; /* reset */
1230 				num = 0;
1231 			} else {
1232 				FATAL("illegal repetition expression: class %.20s",
1233 					lastre);
1234 			}
1235 		}
1236 		break;
1237 	}
1238 }
1239 
cgoto(fa * f,int s,int c)1240 int cgoto(fa *f, int s, int c)
1241 {
1242 	int *p, *q;
1243 	int i, j, k;
1244 
1245 	assert(c == HAT || c < NCHARS);
1246 	while (f->accept >= maxsetvec) {	/* guessing here! */
1247 		resizesetvec(__func__);
1248 	}
1249 	for (i = 0; i <= f->accept; i++)
1250 		setvec[i] = 0;
1251 	setcnt = 0;
1252 	resize_state(f, s);
1253 	/* compute positions of gototab[s,c] into setvec */
1254 	p = f->posns[s];
1255 	for (i = 1; i <= *p; i++) {
1256 		if ((k = f->re[p[i]].ltype) != FINAL) {
1257 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1258 			 || (k == DOT && c != 0 && c != HAT)
1259 			 || (k == ALL && c != 0)
1260 			 || (k == EMPTYRE && c != 0)
1261 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1262 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1263 				q = f->re[p[i]].lfollow;
1264 				for (j = 1; j <= *q; j++) {
1265 					if (q[j] >= maxsetvec) {
1266 						resizesetvec(__func__);
1267 					}
1268 					if (setvec[q[j]] == 0) {
1269 						setcnt++;
1270 						setvec[q[j]] = 1;
1271 					}
1272 				}
1273 			}
1274 		}
1275 	}
1276 	/* determine if setvec is a previous state */
1277 	tmpset[0] = setcnt;
1278 	j = 1;
1279 	for (i = f->accept; i >= 0; i--)
1280 		if (setvec[i]) {
1281 			tmpset[j++] = i;
1282 		}
1283 	resize_state(f, f->curstat > s ? f->curstat : s);
1284 	/* tmpset == previous state? */
1285 	for (i = 1; i <= f->curstat; i++) {
1286 		p = f->posns[i];
1287 		if ((k = tmpset[0]) != p[0])
1288 			goto different;
1289 		for (j = 1; j <= k; j++)
1290 			if (tmpset[j] != p[j])
1291 				goto different;
1292 		/* setvec is state i */
1293 		if (c != HAT)
1294 			f->gototab[s][c] = i;
1295 		return i;
1296 	  different:;
1297 	}
1298 
1299 	/* add tmpset to current set of states */
1300 	++(f->curstat);
1301 	resize_state(f, f->curstat);
1302 	for (i = 0; i < NCHARS; i++)
1303 		f->gototab[f->curstat][i] = 0;
1304 	xfree(f->posns[f->curstat]);
1305 	p = intalloc(setcnt + 1, __func__);
1306 
1307 	f->posns[f->curstat] = p;
1308 	if (c != HAT)
1309 		f->gototab[s][c] = f->curstat;
1310 	for (i = 0; i <= setcnt; i++)
1311 		p[i] = tmpset[i];
1312 	if (setvec[f->accept])
1313 		f->out[f->curstat] = 1;
1314 	else
1315 		f->out[f->curstat] = 0;
1316 	return f->curstat;
1317 }
1318 
1319 
freefa(fa * f)1320 void freefa(fa *f)	/* free a finite automaton */
1321 {
1322 	int i;
1323 
1324 	if (f == NULL)
1325 		return;
1326 	for (i = 0; i < f->state_count; i++)
1327 		xfree(f->gototab[i])
1328 	for (i = 0; i <= f->curstat; i++)
1329 		xfree(f->posns[i]);
1330 	for (i = 0; i <= f->accept; i++) {
1331 		xfree(f->re[i].lfollow);
1332 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1333 			xfree(f->re[i].lval.np);
1334 	}
1335 	xfree(f->restr);
1336 	xfree(f->out);
1337 	xfree(f->posns);
1338 	xfree(f->gototab);
1339 	xfree(f);
1340 }
1341