xref: /minix/external/historical/nawk/dist/b.c (revision 84d9c625)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #if HAVE_NBTOOL_CONFIG_H
28 #include "nbtool_config.h"
29 #endif
30 
31 #define	DEBUG
32 
33 #include <ctype.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include <assert.h>
38 #include "awk.h"
39 #include "awkgram.h"
40 
41 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
42 				/* NCHARS is 2**n */
43 #define MAXLIN 22
44 
45 #define type(v)		(v)->nobj	/* badly overloaded here */
46 #define info(v)		(v)->ntype	/* badly overloaded here */
47 #define left(v)		(v)->narg[0]
48 #define right(v)	(v)->narg[1]
49 #define parent(v)	(v)->nnext
50 
51 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
52 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
53 #define UNARY	case STAR: case PLUS: case QUEST:
54 
55 /* encoding in tree Nodes:
56 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
57 		left is index, right contains value or pointer to value
58 	unary (STAR, PLUS, QUEST): left is child, right is null
59 	binary (CAT, OR): left and right are children
60 	parent contains pointer to parent
61 */
62 
63 
64 int	*setvec;
65 int	*tmpset;
66 int	maxsetvec = 0;
67 
68 int	rtok;		/* next token in current re */
69 int	rlxval;
70 static const uschar	*rlxstr;
71 static const uschar	*prestr;	/* current position in current re */
72 static const uschar	*lastre;	/* origin of last re */
73 
74 static	int setcnt;
75 static	int poscnt;
76 
77 uschar	*patbeg;
78 int	patlen;
79 
80 #define	NFA	128	/* cache this many dynamic fa's */
81 fa	*fatab[NFA];
82 int	nfatab	= 0;	/* entries in fatab */
83 
84 static void
resizesetvec(const char * msg)85 resizesetvec(const char *msg)
86 {
87 	if (maxsetvec == 0)
88 		maxsetvec = MAXLIN;
89 	else
90 		maxsetvec *= 4;
91 	setvec = realloc(setvec, maxsetvec * sizeof(*setvec));
92 	tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset));
93 	if (setvec == 0 || tmpset == 0)
94 	    overflo(msg);
95 }
96 
97 static void
resize_state(fa * f,int state)98 resize_state(fa *f, int state)
99 {
100 	void *p;
101 	int i, new_count;
102 
103 	if (++state < f->state_count)
104 		return;
105 
106 	new_count = state + 10; /* needs to be tuned */
107 
108 	p = realloc(f->gototab, new_count * sizeof(f->gototab[0]));
109 	if (p == NULL)
110 		goto out;
111 	f->gototab = p;
112 
113 	p = realloc(f->out, new_count * sizeof(f->out[0]));
114 	if (p == NULL)
115 		goto out;
116 	f->out = p;
117 
118 	p = realloc(f->posns, new_count * sizeof(f->posns[0]));
119 	if (p == NULL)
120 		goto out;
121 	f->posns = p;
122 
123 	for (i = f->state_count; i < new_count; ++i) {
124 		f->gototab[i] = calloc(1, NCHARS * sizeof (**f->gototab));
125 		if (f->gototab[i] == NULL)
126 			goto out;
127 		f->out[i]  = 0;
128 		f->posns[i] = NULL;
129 	}
130 	f->state_count = new_count;
131 	return;
132 out:
133 	overflo("out of memory in resize_state");
134 }
135 
makedfa(const char * s,int anchor)136 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
137 {
138 	int i, use, nuse;
139 	fa *pfa;
140 	static int now = 1;
141 
142 	if (setvec == 0)	/* first time through any RE */
143 		resizesetvec("out of space initializing makedfa");
144 
145 	if (compile_time)	/* a constant for sure */
146 		return mkdfa(s, anchor);
147 	for (i = 0; i < nfatab; i++)	/* is it there already? */
148 		if (fatab[i]->anchor == anchor
149 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
150 			fatab[i]->use = now++;
151 			return fatab[i];
152 		}
153 	pfa = mkdfa(s, anchor);
154 	if (nfatab < NFA) {	/* room for another */
155 		fatab[nfatab] = pfa;
156 		fatab[nfatab]->use = now++;
157 		nfatab++;
158 		return pfa;
159 	}
160 	use = fatab[0]->use;	/* replace least-recently used */
161 	nuse = 0;
162 	for (i = 1; i < nfatab; i++)
163 		if (fatab[i]->use < use) {
164 			use = fatab[i]->use;
165 			nuse = i;
166 		}
167 	freefa(fatab[nuse]);
168 	fatab[nuse] = pfa;
169 	pfa->use = now++;
170 	return pfa;
171 }
172 
mkdfa(const char * s,int anchor)173 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
174 				/* anchor = 1 for anchored matches, else 0 */
175 {
176 	Node *p, *p1;
177 	fa *f;
178 
179 	p = reparse(s);
180 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
181 		/* put ALL STAR in front of reg.  exp. */
182 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
183 		/* put FINAL after reg.  exp. */
184 
185 	poscnt = 0;
186 	penter(p1);	/* enter parent pointers and leaf indices */
187 	if ((f = calloc(1, sizeof(*f) + poscnt*sizeof(rrow))) == NULL)
188 		overflo("out of space for fa");
189 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
190 	cfoll(f, p1);	/* set up follow sets */
191 	freetr(p1);
192 	resize_state(f, 1);
193 	if ((f->posns[0] = calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
194 			overflo("out of space in makedfa");
195 	if ((f->posns[1] = calloc(1, sizeof(int))) == NULL)
196 		overflo("out of space in makedfa");
197 	*f->posns[1] = 0;
198 	f->initstat = makeinit(f, anchor);
199 	f->anchor = anchor;
200 	f->restr = (uschar *) tostring(s);
201 	return f;
202 }
203 
makeinit(fa * f,int anchor)204 int makeinit(fa *f, int anchor)
205 {
206 	int i, k;
207 
208 	resize_state(f, 2);
209 	f->curstat = 2;
210 	f->out[2] = 0;
211 	k = *(f->re[0].lfollow);
212 	xfree(f->posns[2]);
213 	if ((f->posns[2] = calloc(1, (k+1)*sizeof(int))) == NULL)
214 		overflo("out of space in makeinit");
215 	for (i=0; i <= k; i++) {
216 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
217 	}
218 	if ((f->posns[2])[1] == f->accept)
219 		f->out[2] = 1;
220 	for (i=0; i < NCHARS; i++)
221 		f->gototab[2][i] = 0;
222 	f->curstat = cgoto(f, 2, HAT);
223 	if (anchor) {
224 		*f->posns[2] = k-1;	/* leave out position 0 */
225 		for (i=0; i < k; i++) {
226 			(f->posns[0])[i] = (f->posns[2])[i];
227 		}
228 
229 		f->out[0] = f->out[2];
230 		if (f->curstat != 2) {
231 			resize_state(f, f->curstat);
232 			--(*f->posns[f->curstat]);
233 		}
234 	}
235 	return f->curstat;
236 }
237 
penter(Node * p)238 void penter(Node *p)	/* set up parent pointers and leaf indices */
239 {
240 	switch (type(p)) {
241 	ELEAF
242 	LEAF
243 		info(p) = poscnt;
244 		poscnt++;
245 		break;
246 	UNARY
247 		penter(left(p));
248 		parent(left(p)) = p;
249 		break;
250 	case CAT:
251 	case OR:
252 		penter(left(p));
253 		penter(right(p));
254 		parent(left(p)) = p;
255 		parent(right(p)) = p;
256 		break;
257 	default:	/* can't happen */
258 		FATAL("can't happen: unknown type %d in penter", type(p));
259 		break;
260 	}
261 }
262 
freetr(Node * p)263 void freetr(Node *p)	/* free parse tree */
264 {
265 	switch (type(p)) {
266 	ELEAF
267 	LEAF
268 		xfree(p);
269 		break;
270 	UNARY
271 		freetr(left(p));
272 		xfree(p);
273 		break;
274 	case CAT:
275 	case OR:
276 		freetr(left(p));
277 		freetr(right(p));
278 		xfree(p);
279 		break;
280 	default:	/* can't happen */
281 		FATAL("can't happen: unknown type %d in freetr", type(p));
282 		break;
283 	}
284 }
285 
286 /* in the parsing of regular expressions, metacharacters like . have */
287 /* to be seen literally;  \056 is not a metacharacter. */
288 
hexstr(const uschar ** pp)289 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
290 {			/* only pick up one 8-bit byte (2 chars) */
291 	const uschar *p;
292 	int n = 0;
293 	int i;
294 
295 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
296 		if (isdigit(*p))
297 			n = 16 * n + *p - '0';
298 		else if (*p >= 'a' && *p <= 'f')
299 			n = 16 * n + *p - 'a' + 10;
300 		else if (*p >= 'A' && *p <= 'F')
301 			n = 16 * n + *p - 'A' + 10;
302 	}
303 	*pp = p;
304 	return n;
305 }
306 
307 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
308 
quoted(const uschar ** pp)309 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
310 			/* and increment *pp */
311 {
312 	const uschar *p = *pp;
313 	int c;
314 
315 	if ((c = *p++) == 't')
316 		c = '\t';
317 	else if (c == 'n')
318 		c = '\n';
319 	else if (c == 'f')
320 		c = '\f';
321 	else if (c == 'r')
322 		c = '\r';
323 	else if (c == 'b')
324 		c = '\b';
325 	else if (c == '\\')
326 		c = '\\';
327 	else if (c == 'x') {	/* hexadecimal goo follows */
328 		c = hexstr(&p);	/* this adds a null if number is invalid */
329 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
330 		int n = c - '0';
331 		if (isoctdigit(*p)) {
332 			n = 8 * n + *p++ - '0';
333 			if (isoctdigit(*p))
334 				n = 8 * n + *p++ - '0';
335 		}
336 		c = n;
337 	} /* else */
338 		/* c = c; */
339 	*pp = p;
340 	return c;
341 }
342 
cclenter(const char * argp)343 char *cclenter(const char *argp)	/* add a character class */
344 {
345 	int i, c, c2;
346 	const uschar *p = (const uschar *) argp;
347 	const uschar *op;
348 	uschar *bp;
349 	static uschar *buf = 0;
350 	static int bufsz = 100;
351 
352 	op = p;
353 	if (buf == 0 && (buf = malloc(bufsz)) == NULL)
354 		FATAL("out of space for character class [%.10s...] 1", p);
355 	bp = buf;
356 	for (i = 0; (c = *p++) != 0; ) {
357 		if (c == '\\') {
358 			c = quoted(&p);
359 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
360 			if (*p != 0) {
361 				c = bp[-1];
362 				c2 = *p++;
363 				if (c2 == '\\')
364 					c2 = quoted(&p);
365 				if (c > c2) {	/* empty; ignore */
366 					bp--;
367 					i--;
368 					continue;
369 				}
370 				while (c < c2) {
371 					if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter1"))
372 						FATAL("out of space for character class [%.10s...] 2", p);
373 					*bp++ = ++c;
374 					i++;
375 				}
376 				continue;
377 			}
378 		}
379 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2"))
380 			FATAL("out of space for character class [%.10s...] 3", p);
381 		*bp++ = c;
382 		i++;
383 	}
384 	*bp = 0;
385 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
386 	free(__UNCONST(op));
387 	return (char *) tostring((char *) buf);
388 }
389 
overflo(const char * s)390 void overflo(const char *s)
391 {
392 	FATAL("regular expression too big: %.30s...", s);
393 }
394 
cfoll(fa * f,Node * v)395 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
396 {
397 	int i;
398 	int *p;
399 
400 	switch (type(v)) {
401 	ELEAF
402 	LEAF
403 		f->re[info(v)].ltype = type(v);
404 		f->re[info(v)].lval.np = right(v);
405 		while (f->accept >= maxsetvec) /* guessing here! */
406 			resizesetvec("out of space in cfoll()");
407 		for (i = 0; i <= f->accept; i++)
408 			setvec[i] = 0;
409 		setcnt = 0;
410 		follow(v);	/* computes setvec and setcnt */
411 		if ((p = calloc(1, (setcnt+1)*sizeof(int))) == NULL)
412 			overflo("out of space building follow set");
413 		f->re[info(v)].lfollow = p;
414 		*p = setcnt;
415 		for (i = f->accept; i >= 0; i--)
416 			if (setvec[i] == 1)
417 				*++p = i;
418 		break;
419 	UNARY
420 		cfoll(f,left(v));
421 		break;
422 	case CAT:
423 	case OR:
424 		cfoll(f,left(v));
425 		cfoll(f,right(v));
426 		break;
427 	default:	/* can't happen */
428 		FATAL("can't happen: unknown type %d in cfoll", type(v));
429 	}
430 }
431 
first(Node * p)432 int first(Node *p)	/* collects initially active leaves of p into setvec */
433 			/* returns 0 if p matches empty string */
434 {
435 	int b, lp;
436 
437 	switch (type(p)) {
438 	ELEAF
439 	LEAF
440 		lp = info(p);	/* look for high-water mark of subscripts */
441 		while (setcnt >= maxsetvec || lp >= maxsetvec) /* guessing here! */
442 			resizesetvec("out of space in first()");
443 		if (type(p) == EMPTYRE) {
444 			setvec[lp] = 0;
445 			return(0);
446 		}
447 		if (setvec[lp] != 1) {
448 			setvec[lp] = 1;
449 			setcnt++;
450 		}
451 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
452 			return(0);		/* empty CCL */
453 		else return(1);
454 	case PLUS:
455 		if (first(left(p)) == 0) return(0);
456 		return(1);
457 	case STAR:
458 	case QUEST:
459 		first(left(p));
460 		return(0);
461 	case CAT:
462 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
463 		return(1);
464 	case OR:
465 		b = first(right(p));
466 		if (first(left(p)) == 0 || b == 0) return(0);
467 		return(1);
468 	}
469 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
470 	return(-1);
471 }
472 
follow(Node * v)473 void follow(Node *v)	/* collects leaves that can follow v into setvec */
474 {
475 	Node *p;
476 
477 	if (type(v) == FINAL)
478 		return;
479 	p = parent(v);
480 	switch (type(p)) {
481 	case STAR:
482 	case PLUS:
483 		first(v);
484 		follow(p);
485 		return;
486 
487 	case OR:
488 	case QUEST:
489 		follow(p);
490 		return;
491 
492 	case CAT:
493 		if (v == left(p)) {	/* v is left child of p */
494 			if (first(right(p)) == 0) {
495 				follow(p);
496 				return;
497 			}
498 		} else		/* v is right child */
499 			follow(p);
500 		return;
501 	}
502 }
503 
member(int c,const char * sarg)504 int member(int c, const char *sarg)	/* is c in s? */
505 {
506 	const uschar *s = (const uschar *) sarg;
507 
508 	while (*s)
509 		if (c == *s++)
510 			return(1);
511 	return(0);
512 }
513 
match(fa * f,const char * p0)514 int match(fa *f, const char *p0)	/* shortest match ? */
515 {
516 	int s, ns;
517 	const uschar *p = (const uschar *) p0;
518 
519 	s = f->initstat;
520 	assert (s < f->state_count);
521 
522 	if (f->out[s])
523 		return(1);
524 	do {
525 		/* assert(*p < NCHARS); */
526 		if ((ns = f->gototab[s][*p]) != 0)
527 			s = ns;
528 		else
529 			s = cgoto(f, s, *p);
530 
531 		assert (s < f->state_count);
532 
533 		if (f->out[s])
534 			return(1);
535 	} while (*p++ != 0);
536 	return(0);
537 }
538 
pmatch(fa * f,const char * p0)539 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
540 {
541 	int s, ns;
542 	uschar *p = __UNCONST(p0);
543 	uschar *q;
544 
545 	s = f->initstat;
546 	assert(s < f->state_count);
547 	patbeg = p;
548 	patlen = -1;
549 	do {
550 		q = p;
551 		do {
552 			if (f->out[s])		/* final state */
553 				patlen = q-p;
554 			/* assert(*q < NCHARS); */
555 			if ((ns = f->gototab[s][*q]) != 0)
556 				s = ns;
557 			else
558 				s = cgoto(f, s, *q);
559 
560 			assert(s < f->state_count);
561 
562 			if (s == 1) {	/* no transition */
563 				if (patlen >= 0) {
564 					patbeg = p;
565 					return(1);
566 				}
567 				else
568 					goto nextin;	/* no match */
569 			}
570 		} while (*q++ != 0);
571 		if (f->out[s])
572 			patlen = q-p-1;	/* don't count $ */
573 		if (patlen >= 0) {
574 			patbeg = p;
575 			return(1);
576 		}
577 	nextin:
578 		s = 2;
579 	} while (*p++ != 0);
580 	return (0);
581 }
582 
nematch(fa * f,const char * p0)583 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
584 {
585 	int s, ns;
586 	uschar *p = __UNCONST(p0);
587 	uschar *q;
588 
589 	s = f->initstat;
590 	assert(s < f->state_count);
591 
592 	patlen = -1;
593 	while (*p) {
594 		q = p;
595 		do {
596 			if (f->out[s])		/* final state */
597 				patlen = q-p;
598 			/* assert(*q < NCHARS); */
599 			if ((ns = f->gototab[s][*q]) != 0)
600 				s = ns;
601 			else
602 				s = cgoto(f, s, *q);
603 
604 			assert(s < f->state_count);
605 
606 			if (s == 1) {	/* no transition */
607 				if (patlen > 0) {
608 					patbeg = p;
609 					return(1);
610 				} else
611 					goto nnextin;	/* no nonempty match */
612 			}
613 		} while (*q++ != 0);
614 		if (f->out[s])
615 			patlen = q-p-1;	/* don't count $ */
616 		if (patlen > 0 ) {
617 			patbeg = p;
618 			return(1);
619 		}
620 	nnextin:
621 		s = 2;
622 		p++;
623 	}
624 	return (0);
625 }
626 
627 
628 /*
629  * NAME
630  *     fnematch
631  *
632  * DESCRIPTION
633  *     A stream-fed version of nematch which transfers characters to a
634  *     null-terminated buffer. All characters up to and including the last
635  *     character of the matching text or EOF are placed in the buffer. If
636  *     a match is found, patbeg and patlen are set appropriately.
637  *
638  * RETURN VALUES
639  *     0    No match found.
640  *     1    Match found.
641  */
642 
fnematch(fa * pfa,FILE * f,uschar ** pbuf,int * pbufsize,int quantum)643 int fnematch(fa *pfa, FILE *f, uschar **pbuf, int *pbufsize, int quantum)
644 {
645 	uschar *buf = *pbuf;
646 	int bufsize = *pbufsize;
647 	int c, i, j, k, ns, s;
648 
649 	s = pfa->initstat;
650 	assert(s < pfa->state_count);
651 	patlen = 0;
652 
653 	/*
654 	 * All indices relative to buf.
655 	 * i <= j <= k <= bufsize
656 	 *
657 	 * i: origin of active substring
658 	 * j: current character
659 	 * k: destination of next getc()
660 	 */
661 	i = -1, k = 0;
662         do {
663 		j = i++;
664 		do {
665 			if (++j == k) {
666 				if (k == bufsize)
667 					if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
668 						FATAL("stream '%.30s...' too long", buf);
669 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
670 			}
671 			c = buf[j];
672 			/* assert(c < NCHARS); */
673 
674 			if ((ns = pfa->gototab[s][c]) != 0)
675 				s = ns;
676 			else
677 				s = cgoto(pfa, s, c);
678 			assert(s < pfa->state_count);
679 
680 			if (pfa->out[s]) {	/* final state */
681 				patlen = j - i + 1;
682 				if (c == 0)	/* don't count $ */
683 					patlen--;
684 			}
685 		} while (buf[j] && s != 1);
686 		s = 2;
687 	} while (buf[i] && !patlen);
688 
689 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
690 	*pbuf = buf;
691 	*pbufsize = bufsize;
692 
693 	if (patlen) {
694 		patbeg = buf + i;
695 		/*
696 		 * Under no circumstances is the last character fed to
697 		 * the automaton part of the match. It is EOF's nullbyte,
698 		 * or it sent the automaton into a state with no further
699 		 * transitions available (s==1), or both. Room for a
700 		 * terminating nullbyte is guaranteed.
701 		 *
702 		 * ungetc any chars after the end of matching text
703 		 * (except for EOF's nullbyte, if present) and null
704 		 * terminate the buffer.
705 		 */
706 		do
707 			if (buf[--k] && ungetc(buf[k], f) == EOF)
708 				FATAL("unable to ungetc '%c'", buf[k]);
709 		while (k > i + patlen);
710 		buf[k] = 0;
711 		return 1;
712 	}
713 	else
714 		return 0;
715 }
716 
reparse(const char * p)717 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
718 {			/* uses relex() to scan regular expression */
719 	Node *np;
720 
721 	dprintf( ("reparse <%s>\n", p) );
722 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
723 	rtok = relex();
724 	/* GNU compatibility: an empty regexp matches anything */
725 	if (rtok == '\0') {
726 		/* FATAL("empty regular expression"); previous */
727 		return(op2(EMPTYRE, NIL, NIL));
728 	}
729 	np = regexp();
730 	if (rtok != '\0')
731 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
732 	return(np);
733 }
734 
regexp(void)735 Node *regexp(void)	/* top-level parse of reg expr */
736 {
737 	return (alt(concat(primary())));
738 }
739 
primary(void)740 Node *primary(void)
741 {
742 	Node *np;
743 
744 	switch (rtok) {
745 	case CHAR:
746 		np = op2(CHAR, NIL, itonp(rlxval));
747 		rtok = relex();
748 		return (unary(np));
749 	case ALL:
750 		rtok = relex();
751 		return (unary(op2(ALL, NIL, NIL)));
752 	case EMPTYRE:
753 		rtok = relex();
754 		return (unary(op2(ALL, NIL, NIL)));
755 	case DOT:
756 		rtok = relex();
757 		return (unary(op2(DOT, NIL, NIL)));
758 	case CCL:
759 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
760 		rtok = relex();
761 		return (unary(np));
762 	case NCCL:
763 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
764 		rtok = relex();
765 		return (unary(np));
766 	case '^':
767 		rtok = relex();
768 		return (unary(op2(CHAR, NIL, itonp(HAT))));
769 	case '$':
770 		rtok = relex();
771 		return (unary(op2(CHAR, NIL, NIL)));
772 	case '(':
773 		rtok = relex();
774 		if (rtok == ')') {	/* special pleading for () */
775 			rtok = relex();
776 			return unary(op2(CCL, NIL, (Node *) tostring("")));
777 		}
778 		np = regexp();
779 		if (rtok == ')') {
780 			rtok = relex();
781 			return (unary(np));
782 		}
783 		else
784 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
785 	default:
786 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
787 	}
788 	return 0;	/*NOTREACHED*/
789 }
790 
concat(Node * np)791 Node *concat(Node *np)
792 {
793 	switch (rtok) {
794 	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
795 		return (concat(op2(CAT, np, primary())));
796 	}
797 	return (np);
798 }
799 
alt(Node * np)800 Node *alt(Node *np)
801 {
802 	if (rtok == OR) {
803 		rtok = relex();
804 		return (alt(op2(OR, np, concat(primary()))));
805 	}
806 	return (np);
807 }
808 
unary(Node * np)809 Node *unary(Node *np)
810 {
811 	switch (rtok) {
812 	case STAR:
813 		rtok = relex();
814 		return (unary(op2(STAR, np, NIL)));
815 	case PLUS:
816 		rtok = relex();
817 		return (unary(op2(PLUS, np, NIL)));
818 	case QUEST:
819 		rtok = relex();
820 		return (unary(op2(QUEST, np, NIL)));
821 	default:
822 		return (np);
823 	}
824 }
825 
826 /*
827  * Character class definitions conformant to the POSIX locale as
828  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
829  * and operating character sets are both ASCII (ISO646) or supersets
830  * thereof.
831  *
832  * Note that to avoid overflowing the temporary buffer used in
833  * relex(), the expanded character class (prior to range expansion)
834  * must be less than twice the size of their full name.
835  */
836 
837 /* Because isblank doesn't show up in any of the header files on any
838  * system i use, it's defined here.  if some other locale has a richer
839  * definition of "blank", define HAS_ISBLANK and provide your own
840  * version.
841  * the parentheses here are an attempt to find a path through the maze
842  * of macro definition and/or function and/or version provided.  thanks
843  * to nelson beebe for the suggestion; let's see if it works everywhere.
844  */
845 
846 /* #define HAS_ISBLANK */
847 
848 static const struct charclass {
849 	const char *cc_name;
850 	int cc_namelen;
851 	int (*cc_func)(int);
852 } charclasses[] = {
853 	{ "alnum",	5,	isalnum },
854 	{ "alpha",	5,	isalpha },
855 #ifndef HAS_ISBLANK
856 	{ "blank",	5,	isspace }, /* was isblank */
857 #else
858 	{ "blank",	5,	isblank },
859 #endif
860 	{ "cntrl",	5,	iscntrl },
861 	{ "digit",	5,	isdigit },
862 	{ "graph",	5,	isgraph },
863 	{ "lower",	5,	islower },
864 	{ "print",	5,	isprint },
865 	{ "punct",	5,	ispunct },
866 	{ "space",	5,	isspace },
867 	{ "upper",	5,	isupper },
868 	{ "xdigit",	6,	isxdigit },
869 	{ NULL,		0,	NULL },
870 };
871 
872 
relex(void)873 int relex(void)		/* lexical analyzer for reparse */
874 {
875 	int c, n;
876 	int cflag;
877 	static uschar *buf = 0;
878 	static int bufsz = 100;
879 	uschar *bp;
880 	const struct charclass *cc;
881 	int i;
882 
883 	switch (c = *prestr++) {
884 	case '|': return OR;
885 	case '*': return STAR;
886 	case '+': return PLUS;
887 	case '?': return QUEST;
888 	case '.': return DOT;
889 	case '\0': prestr--; return '\0';
890 	case '^':
891 	case '$':
892 	case '(':
893 	case ')':
894 		return c;
895 	case '\\':
896 		rlxval = quoted(&prestr);
897 		return CHAR;
898 	default:
899 		rlxval = c;
900 		return CHAR;
901 	case '[':
902 		if (buf == 0 && (buf = malloc(bufsz)) == NULL)
903 			FATAL("out of space in reg expr %.10s..", lastre);
904 		bp = buf;
905 		if (*prestr == '^') {
906 			cflag = 1;
907 			prestr++;
908 		}
909 		else
910 			cflag = 0;
911 		n = 2 * strlen((const char *) prestr)+1;
912 		if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1"))
913 			FATAL("out of space for reg expr %.10s...", lastre);
914 		for (; ; ) {
915 			if ((c = *prestr++) == '\\') {
916 				*bp++ = '\\';
917 				if ((c = *prestr++) == '\0')
918 					FATAL("nonterminated character class %.20s...", lastre);
919 				*bp++ = c;
920 			/* } else if (c == '\n') { */
921 			/* 	FATAL("newline in character class %.20s...", lastre); */
922 			} else if (c == '[' && *prestr == ':') {
923 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
924 				for (cc = charclasses; cc->cc_name; cc++)
925 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
926 						break;
927 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
928 				    prestr[2 + cc->cc_namelen] == ']') {
929 					prestr += cc->cc_namelen + 3;
930 					for (i = 1; i < NCHARS; i++) {
931 						if (!adjbuf(&buf, &bufsz, bp-buf+1, 100, &bp, "relex2"))
932 						    FATAL("out of space for reg expr %.10s...", lastre);
933 						if (cc->cc_func(i)) {
934 							*bp++ = i;
935 							n++;
936 						}
937 					}
938 				} else
939 					*bp++ = c;
940 			} else if (c == '\0') {
941 				FATAL("nonterminated character class %.20s", lastre);
942 			} else if (bp == buf) {	/* 1st char is special */
943 				*bp++ = c;
944 			} else if (c == ']') {
945 				*bp++ = 0;
946 				rlxstr = (uschar *) tostring((char *) buf);
947 				if (cflag == 0)
948 					return CCL;
949 				else
950 					return NCCL;
951 			} else
952 				*bp++ = c;
953 		}
954 	}
955 }
956 
cgoto(fa * f,int s,int c)957 int cgoto(fa *f, int s, int c)
958 {
959 	int i, j, k;
960 	int *p, *q;
961 
962 	assert(c == HAT || c < NCHARS);
963 	while (f->accept >= maxsetvec) 	/* guessing here! */
964 		resizesetvec("out of space in cgoto()");
965 	for (i = 0; i <= f->accept; i++)
966 		setvec[i] = 0;
967 	setcnt = 0;
968 	resize_state(f, s);
969 	/* compute positions of gototab[s,c] into setvec */
970 	p = f->posns[s];
971 	for (i = 1; i <= *p; i++) {
972 		if ((k = f->re[p[i]].ltype) != FINAL) {
973 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
974 			 || (k == DOT && c != 0 && c != HAT)
975 			 || (k == ALL && c != 0)
976 			 || (k == EMPTYRE && c != 0)
977 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
978 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
979 				q = f->re[p[i]].lfollow;
980 				for (j = 1; j <= *q; j++) {
981 					if (q[j] >= maxsetvec)
982 						resizesetvec("cgoto overflow");
983 					if (setvec[q[j]] == 0) {
984 						setcnt++;
985 						setvec[q[j]] = 1;
986 					}
987 				}
988 			}
989 		}
990 	}
991 	/* determine if setvec is a previous state */
992 	tmpset[0] = setcnt;
993 	j = 1;
994 	for (i = f->accept; i >= 0; i--)
995 		if (setvec[i]) {
996 			tmpset[j++] = i;
997 		}
998 
999 	resize_state(f, f->curstat > s ? f->curstat : s);
1000 	/* tmpset == previous state? */
1001 	for (i = 1; i <= f->curstat; i++) {
1002 		p = f->posns[i];
1003 		if ((k = tmpset[0]) != p[0])
1004 			goto different;
1005 		for (j = 1; j <= k; j++)
1006 			if (tmpset[j] != p[j])
1007 				goto different;
1008 		/* setvec is state i */
1009 		if (c != HAT)
1010 			f->gototab[s][c] = i;
1011 		return i;
1012 	  different:;
1013 	}
1014 
1015 	/* add tmpset to current set of states */
1016 	++(f->curstat);
1017 	resize_state(f, f->curstat);
1018 	for (i = 0; i < NCHARS; i++)
1019 		f->gototab[f->curstat][i] = 0;
1020 	xfree(f->posns[f->curstat]);
1021 	if ((p = calloc(1, (setcnt+1)*sizeof(int))) == NULL)
1022 		overflo("out of space in cgoto");
1023 
1024 	f->posns[f->curstat] = p;
1025 	if (c != HAT)
1026 		f->gototab[s][c] = f->curstat;
1027 	for (i = 0; i <= setcnt; i++)
1028 		p[i] = tmpset[i];
1029 	if (setvec[f->accept])
1030 		f->out[f->curstat] = 1;
1031 	else
1032 		f->out[f->curstat] = 0;
1033 	return f->curstat;
1034 }
1035 
1036 
freefa(fa * f)1037 void freefa(fa *f)	/* free a finite automaton */
1038 {
1039 	int i;
1040 
1041 	if (f == NULL)
1042 		return;
1043 	for (i = 0; i < f->state_count; i++) {
1044 		xfree(f->gototab[i])
1045 		xfree(f->posns[i]);
1046 	}
1047 	for (i = 0; i <= f->accept; i++) {
1048 		xfree(f->re[i].lfollow);
1049 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1050 			xfree((f->re[i].lval.np));
1051 	}
1052 	xfree(f->restr);
1053 	xfree(f->out);
1054 	xfree(f->posns);
1055 	xfree(f->gototab);
1056 	xfree(f);
1057 }
1058