xref: /freebsd/contrib/one-true-awk/b.c (revision addad6af)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #define	DEBUG
28 
29 #include <ctype.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <stdlib.h>
33 #include "awk.h"
34 #include "ytab.h"
35 
36 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
37 				/* NCHARS is 2**n */
38 #define MAXLIN 22
39 
40 #define type(v)		(v)->nobj	/* badly overloaded here */
41 #define info(v)		(v)->ntype	/* badly overloaded here */
42 #define left(v)		(v)->narg[0]
43 #define right(v)	(v)->narg[1]
44 #define parent(v)	(v)->nnext
45 
46 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
48 #define UNARY	case STAR: case PLUS: case QUEST:
49 
50 /* encoding in tree Nodes:
51 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 		left is index, right contains value or pointer to value
53 	unary (STAR, PLUS, QUEST): left is child, right is null
54 	binary (CAT, OR): left and right are children
55 	parent contains pointer to parent
56 */
57 
58 
59 int	*setvec;
60 int	*tmpset;
61 int	maxsetvec = 0;
62 
63 int	rtok;		/* next token in current re */
64 int	rlxval;
65 static uschar	*rlxstr;
66 static uschar	*prestr;	/* current position in current re */
67 static uschar	*lastre;	/* origin of last re */
68 
69 static	int setcnt;
70 static	int poscnt;
71 
72 char	*patbeg;
73 int	patlen;
74 
75 #define	NFA	20	/* cache this many dynamic fa's */
76 fa	*fatab[NFA];
77 int	nfatab	= 0;	/* entries in fatab */
78 
79 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
80 {
81 	int i, use, nuse;
82 	fa *pfa;
83 	static int now = 1;
84 
85 	if (setvec == 0) {	/* first time through any RE */
86 		maxsetvec = MAXLIN;
87 		setvec = (int *) malloc(maxsetvec * sizeof(int));
88 		tmpset = (int *) malloc(maxsetvec * sizeof(int));
89 		if (setvec == 0 || tmpset == 0)
90 			overflo("out of space initializing makedfa");
91 	}
92 
93 	if (compile_time)	/* a constant for sure */
94 		return mkdfa(s, anchor);
95 	for (i = 0; i < nfatab; i++)	/* is it there already? */
96 		if (fatab[i]->anchor == anchor
97 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
98 			fatab[i]->use = now++;
99 			return fatab[i];
100 		}
101 	pfa = mkdfa(s, anchor);
102 	if (nfatab < NFA) {	/* room for another */
103 		fatab[nfatab] = pfa;
104 		fatab[nfatab]->use = now++;
105 		nfatab++;
106 		return pfa;
107 	}
108 	use = fatab[0]->use;	/* replace least-recently used */
109 	nuse = 0;
110 	for (i = 1; i < nfatab; i++)
111 		if (fatab[i]->use < use) {
112 			use = fatab[i]->use;
113 			nuse = i;
114 		}
115 	freefa(fatab[nuse]);
116 	fatab[nuse] = pfa;
117 	pfa->use = now++;
118 	return pfa;
119 }
120 
121 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
122 				/* anchor = 1 for anchored matches, else 0 */
123 {
124 	Node *p, *p1;
125 	fa *f;
126 
127 	p = reparse(s);
128 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
129 		/* put ALL STAR in front of reg.  exp. */
130 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
131 		/* put FINAL after reg.  exp. */
132 
133 	poscnt = 0;
134 	penter(p1);	/* enter parent pointers and leaf indices */
135 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
136 		overflo("out of space for fa");
137 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
138 	cfoll(f, p1);	/* set up follow sets */
139 	freetr(p1);
140 	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
141 			overflo("out of space in makedfa");
142 	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
143 		overflo("out of space in makedfa");
144 	*f->posns[1] = 0;
145 	f->initstat = makeinit(f, anchor);
146 	f->anchor = anchor;
147 	f->restr = (uschar *) tostring(s);
148 	return f;
149 }
150 
151 int makeinit(fa *f, int anchor)
152 {
153 	int i, k;
154 
155 	f->curstat = 2;
156 	f->out[2] = 0;
157 	f->reset = 0;
158 	k = *(f->re[0].lfollow);
159 	xfree(f->posns[2]);
160 	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
161 		overflo("out of space in makeinit");
162 	for (i=0; i <= k; i++) {
163 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
164 	}
165 	if ((f->posns[2])[1] == f->accept)
166 		f->out[2] = 1;
167 	for (i=0; i < NCHARS; i++)
168 		f->gototab[2][i] = 0;
169 	f->curstat = cgoto(f, 2, HAT);
170 	if (anchor) {
171 		*f->posns[2] = k-1;	/* leave out position 0 */
172 		for (i=0; i < k; i++) {
173 			(f->posns[0])[i] = (f->posns[2])[i];
174 		}
175 
176 		f->out[0] = f->out[2];
177 		if (f->curstat != 2)
178 			--(*f->posns[f->curstat]);
179 	}
180 	return f->curstat;
181 }
182 
183 void penter(Node *p)	/* set up parent pointers and leaf indices */
184 {
185 	switch (type(p)) {
186 	ELEAF
187 	LEAF
188 		info(p) = poscnt;
189 		poscnt++;
190 		break;
191 	UNARY
192 		penter(left(p));
193 		parent(left(p)) = p;
194 		break;
195 	case CAT:
196 	case OR:
197 		penter(left(p));
198 		penter(right(p));
199 		parent(left(p)) = p;
200 		parent(right(p)) = p;
201 		break;
202 	default:	/* can't happen */
203 		FATAL("can't happen: unknown type %d in penter", type(p));
204 		break;
205 	}
206 }
207 
208 void freetr(Node *p)	/* free parse tree */
209 {
210 	switch (type(p)) {
211 	ELEAF
212 	LEAF
213 		xfree(p);
214 		break;
215 	UNARY
216 		freetr(left(p));
217 		xfree(p);
218 		break;
219 	case CAT:
220 	case OR:
221 		freetr(left(p));
222 		freetr(right(p));
223 		xfree(p);
224 		break;
225 	default:	/* can't happen */
226 		FATAL("can't happen: unknown type %d in freetr", type(p));
227 		break;
228 	}
229 }
230 
231 /* in the parsing of regular expressions, metacharacters like . have */
232 /* to be seen literally;  \056 is not a metacharacter. */
233 
234 int hexstr(char **pp)	/* find and eval hex string at pp, return new p */
235 {			/* only pick up one 8-bit byte (2 chars) */
236 	uschar *p;
237 	int n = 0;
238 	int i;
239 
240 	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
241 		if (isdigit(*p))
242 			n = 16 * n + *p - '0';
243 		else if (*p >= 'a' && *p <= 'f')
244 			n = 16 * n + *p - 'a' + 10;
245 		else if (*p >= 'A' && *p <= 'F')
246 			n = 16 * n + *p - 'A' + 10;
247 	}
248 	*pp = (char *) p;
249 	return n;
250 }
251 
252 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
253 
254 int quoted(char **pp)	/* pick up next thing after a \\ */
255 			/* and increment *pp */
256 {
257 	char *p = *pp;
258 	int c;
259 
260 	if ((c = *p++) == 't')
261 		c = '\t';
262 	else if (c == 'n')
263 		c = '\n';
264 	else if (c == 'f')
265 		c = '\f';
266 	else if (c == 'r')
267 		c = '\r';
268 	else if (c == 'b')
269 		c = '\b';
270 	else if (c == '\\')
271 		c = '\\';
272 	else if (c == 'x') {	/* hexadecimal goo follows */
273 		c = hexstr(&p);	/* this adds a null if number is invalid */
274 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
275 		int n = c - '0';
276 		if (isoctdigit(*p)) {
277 			n = 8 * n + *p++ - '0';
278 			if (isoctdigit(*p))
279 				n = 8 * n + *p++ - '0';
280 		}
281 		c = n;
282 	} /* else */
283 		/* c = c; */
284 	*pp = p;
285 	return c;
286 }
287 
288 char *cclenter(const char *argp)	/* add a character class */
289 {
290 	int i, c, c2;
291 	uschar *p = (uschar *) argp;
292 	uschar *op, *bp;
293 	static uschar *buf = 0;
294 	static int bufsz = 100;
295 
296 	op = p;
297 	if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
298 		FATAL("out of space for character class [%.10s...] 1", p);
299 	bp = buf;
300 	for (i = 0; (c = *p++) != 0; ) {
301 		if (c == '\\') {
302 			c = quoted((char **) &p);
303 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
304 			if (*p != 0) {
305 				c = bp[-1];
306 				c2 = *p++;
307 				if (c2 == '\\')
308 					c2 = quoted((char **) &p);
309 				if (c > c2) {	/* empty; ignore */
310 					bp--;
311 					i--;
312 					continue;
313 				}
314 				while (c < c2) {
315 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
316 						FATAL("out of space for character class [%.10s...] 2", p);
317 					*bp++ = ++c;
318 					i++;
319 				}
320 				continue;
321 			}
322 		}
323 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
324 			FATAL("out of space for character class [%.10s...] 3", p);
325 		*bp++ = c;
326 		i++;
327 	}
328 	*bp = 0;
329 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
330 	xfree(op);
331 	return (char *) tostring((char *) buf);
332 }
333 
334 void overflo(const char *s)
335 {
336 	FATAL("regular expression too big: %.30s...", s);
337 }
338 
339 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
340 {
341 	int i;
342 	int *p;
343 
344 	switch (type(v)) {
345 	ELEAF
346 	LEAF
347 		f->re[info(v)].ltype = type(v);
348 		f->re[info(v)].lval.np = right(v);
349 		while (f->accept >= maxsetvec) {	/* guessing here! */
350 			maxsetvec *= 4;
351 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
352 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
353 			if (setvec == 0 || tmpset == 0)
354 				overflo("out of space in cfoll()");
355 		}
356 		for (i = 0; i <= f->accept; i++)
357 			setvec[i] = 0;
358 		setcnt = 0;
359 		follow(v);	/* computes setvec and setcnt */
360 		if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
361 			overflo("out of space building follow set");
362 		f->re[info(v)].lfollow = p;
363 		*p = setcnt;
364 		for (i = f->accept; i >= 0; i--)
365 			if (setvec[i] == 1)
366 				*++p = i;
367 		break;
368 	UNARY
369 		cfoll(f,left(v));
370 		break;
371 	case CAT:
372 	case OR:
373 		cfoll(f,left(v));
374 		cfoll(f,right(v));
375 		break;
376 	default:	/* can't happen */
377 		FATAL("can't happen: unknown type %d in cfoll", type(v));
378 	}
379 }
380 
381 int first(Node *p)	/* collects initially active leaves of p into setvec */
382 			/* returns 0 if p matches empty string */
383 {
384 	int b, lp;
385 
386 	switch (type(p)) {
387 	ELEAF
388 	LEAF
389 		lp = info(p);	/* look for high-water mark of subscripts */
390 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
391 			maxsetvec *= 4;
392 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
393 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
394 			if (setvec == 0 || tmpset == 0)
395 				overflo("out of space in first()");
396 		}
397 		if (type(p) == EMPTYRE) {
398 			setvec[lp] = 0;
399 			return(0);
400 		}
401 		if (setvec[lp] != 1) {
402 			setvec[lp] = 1;
403 			setcnt++;
404 		}
405 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
406 			return(0);		/* empty CCL */
407 		else return(1);
408 	case PLUS:
409 		if (first(left(p)) == 0) return(0);
410 		return(1);
411 	case STAR:
412 	case QUEST:
413 		first(left(p));
414 		return(0);
415 	case CAT:
416 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
417 		return(1);
418 	case OR:
419 		b = first(right(p));
420 		if (first(left(p)) == 0 || b == 0) return(0);
421 		return(1);
422 	}
423 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
424 	return(-1);
425 }
426 
427 void follow(Node *v)	/* collects leaves that can follow v into setvec */
428 {
429 	Node *p;
430 
431 	if (type(v) == FINAL)
432 		return;
433 	p = parent(v);
434 	switch (type(p)) {
435 	case STAR:
436 	case PLUS:
437 		first(v);
438 		follow(p);
439 		return;
440 
441 	case OR:
442 	case QUEST:
443 		follow(p);
444 		return;
445 
446 	case CAT:
447 		if (v == left(p)) {	/* v is left child of p */
448 			if (first(right(p)) == 0) {
449 				follow(p);
450 				return;
451 			}
452 		} else		/* v is right child */
453 			follow(p);
454 		return;
455 	}
456 }
457 
458 int member(int c, const char *sarg)	/* is c in s? */
459 {
460 	uschar *s = (uschar *) sarg;
461 
462 	while (*s)
463 		if (c == *s++)
464 			return(1);
465 	return(0);
466 }
467 
468 int match(fa *f, const char *p0)	/* shortest match ? */
469 {
470 	int s, ns;
471 	uschar *p = (uschar *) p0;
472 
473 	s = f->reset ? makeinit(f,0) : f->initstat;
474 	if (f->out[s])
475 		return(1);
476 	do {
477 		/* assert(*p < NCHARS); */
478 		if ((ns = f->gototab[s][*p]) != 0)
479 			s = ns;
480 		else
481 			s = cgoto(f, s, *p);
482 		if (f->out[s])
483 			return(1);
484 	} while (*p++ != 0);
485 	return(0);
486 }
487 
488 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
489 {
490 	int s, ns;
491 	uschar *p = (uschar *) p0;
492 	uschar *q;
493 	int i, k;
494 
495 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
496 	if (f->reset) {
497 		f->initstat = s = makeinit(f,1);
498 	} else {
499 		s = f->initstat;
500 	}
501 	patbeg = (char *) p;
502 	patlen = -1;
503 	do {
504 		q = p;
505 		do {
506 			if (f->out[s])		/* final state */
507 				patlen = q-p;
508 			/* assert(*q < NCHARS); */
509 			if ((ns = f->gototab[s][*q]) != 0)
510 				s = ns;
511 			else
512 				s = cgoto(f, s, *q);
513 			if (s == 1) {	/* no transition */
514 				if (patlen >= 0) {
515 					patbeg = (char *) p;
516 					return(1);
517 				}
518 				else
519 					goto nextin;	/* no match */
520 			}
521 		} while (*q++ != 0);
522 		if (f->out[s])
523 			patlen = q-p-1;	/* don't count $ */
524 		if (patlen >= 0) {
525 			patbeg = (char *) p;
526 			return(1);
527 		}
528 	nextin:
529 		s = 2;
530 		if (f->reset) {
531 			for (i = 2; i <= f->curstat; i++)
532 				xfree(f->posns[i]);
533 			k = *f->posns[0];
534 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
535 				overflo("out of space in pmatch");
536 			for (i = 0; i <= k; i++)
537 				(f->posns[2])[i] = (f->posns[0])[i];
538 			f->initstat = f->curstat = 2;
539 			f->out[2] = f->out[0];
540 			for (i = 0; i < NCHARS; i++)
541 				f->gototab[2][i] = 0;
542 		}
543 	} while (*p++ != 0);
544 	return (0);
545 }
546 
547 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
548 {
549 	int s, ns;
550 	uschar *p = (uschar *) p0;
551 	uschar *q;
552 	int i, k;
553 
554 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
555 	if (f->reset) {
556 		f->initstat = s = makeinit(f,1);
557 	} else {
558 		s = f->initstat;
559 	}
560 	patlen = -1;
561 	while (*p) {
562 		q = p;
563 		do {
564 			if (f->out[s])		/* final state */
565 				patlen = q-p;
566 			/* assert(*q < NCHARS); */
567 			if ((ns = f->gototab[s][*q]) != 0)
568 				s = ns;
569 			else
570 				s = cgoto(f, s, *q);
571 			if (s == 1) {	/* no transition */
572 				if (patlen > 0) {
573 					patbeg = (char *) p;
574 					return(1);
575 				} else
576 					goto nnextin;	/* no nonempty match */
577 			}
578 		} while (*q++ != 0);
579 		if (f->out[s])
580 			patlen = q-p-1;	/* don't count $ */
581 		if (patlen > 0 ) {
582 			patbeg = (char *) p;
583 			return(1);
584 		}
585 	nnextin:
586 		s = 2;
587 		if (f->reset) {
588 			for (i = 2; i <= f->curstat; i++)
589 				xfree(f->posns[i]);
590 			k = *f->posns[0];
591 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
592 				overflo("out of state space");
593 			for (i = 0; i <= k; i++)
594 				(f->posns[2])[i] = (f->posns[0])[i];
595 			f->initstat = f->curstat = 2;
596 			f->out[2] = f->out[0];
597 			for (i = 0; i < NCHARS; i++)
598 				f->gototab[2][i] = 0;
599 		}
600 		p++;
601 	}
602 	return (0);
603 }
604 
605 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
606 {			/* uses relex() to scan regular expression */
607 	Node *np;
608 
609 	dprintf( ("reparse <%s>\n", p) );
610 	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
611 	rtok = relex();
612 	/* GNU compatibility: an empty regexp matches anything */
613 	if (rtok == '\0') {
614 		/* FATAL("empty regular expression"); previous */
615 		return(op2(EMPTYRE, NIL, NIL));
616 	}
617 	np = regexp();
618 	if (rtok != '\0')
619 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
620 	return(np);
621 }
622 
623 Node *regexp(void)	/* top-level parse of reg expr */
624 {
625 	return (alt(concat(primary())));
626 }
627 
628 Node *primary(void)
629 {
630 	Node *np;
631 
632 	switch (rtok) {
633 	case CHAR:
634 		np = op2(CHAR, NIL, itonp(rlxval));
635 		rtok = relex();
636 		return (unary(np));
637 	case ALL:
638 		rtok = relex();
639 		return (unary(op2(ALL, NIL, NIL)));
640 	case EMPTYRE:
641 		rtok = relex();
642 		return (unary(op2(ALL, NIL, NIL)));
643 	case DOT:
644 		rtok = relex();
645 		return (unary(op2(DOT, NIL, NIL)));
646 	case CCL:
647 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
648 		rtok = relex();
649 		return (unary(np));
650 	case NCCL:
651 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
652 		rtok = relex();
653 		return (unary(np));
654 	case '^':
655 		rtok = relex();
656 		return (unary(op2(CHAR, NIL, itonp(HAT))));
657 	case '$':
658 		rtok = relex();
659 		return (unary(op2(CHAR, NIL, NIL)));
660 	case '(':
661 		rtok = relex();
662 		if (rtok == ')') {	/* special pleading for () */
663 			rtok = relex();
664 			return unary(op2(CCL, NIL, (Node *) tostring("")));
665 		}
666 		np = regexp();
667 		if (rtok == ')') {
668 			rtok = relex();
669 			return (unary(np));
670 		}
671 		else
672 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
673 	default:
674 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
675 	}
676 	return 0;	/*NOTREACHED*/
677 }
678 
679 Node *concat(Node *np)
680 {
681 	switch (rtok) {
682 	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
683 		return (concat(op2(CAT, np, primary())));
684 	}
685 	return (np);
686 }
687 
688 Node *alt(Node *np)
689 {
690 	if (rtok == OR) {
691 		rtok = relex();
692 		return (alt(op2(OR, np, concat(primary()))));
693 	}
694 	return (np);
695 }
696 
697 Node *unary(Node *np)
698 {
699 	switch (rtok) {
700 	case STAR:
701 		rtok = relex();
702 		return (unary(op2(STAR, np, NIL)));
703 	case PLUS:
704 		rtok = relex();
705 		return (unary(op2(PLUS, np, NIL)));
706 	case QUEST:
707 		rtok = relex();
708 		return (unary(op2(QUEST, np, NIL)));
709 	default:
710 		return (np);
711 	}
712 }
713 
714 /*
715  * Character class definitions conformant to the POSIX locale as
716  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
717  * and operating character sets are both ASCII (ISO646) or supersets
718  * thereof.
719  *
720  * Note that to avoid overflowing the temporary buffer used in
721  * relex(), the expanded character class (prior to range expansion)
722  * must be less than twice the size of their full name.
723  */
724 
725 /* Because isblank doesn't show up in any of the header files on any
726  * system i use, it's defined here.  if some other locale has a richer
727  * definition of "blank", define HAS_ISBLANK and provide your own
728  * version.
729  * the parentheses here are an attempt to find a path through the maze
730  * of macro definition and/or function and/or version provided.  thanks
731  * to nelson beebe for the suggestion; let's see if it works everywhere.
732  */
733 
734 #ifndef HAS_ISBLANK
735 
736 int (isblank)(int c)
737 {
738 	return c==' ' || c=='\t';
739 }
740 
741 #endif
742 
743 struct charclass {
744 	const char *cc_name;
745 	int cc_namelen;
746 	int (*cc_func)(int);
747 } charclasses[] = {
748 	{ "alnum",	5,	isalnum },
749 	{ "alpha",	5,	isalpha },
750 	{ "blank",	5,	isblank },
751 	{ "cntrl",	5,	iscntrl },
752 	{ "digit",	5,	isdigit },
753 	{ "graph",	5,	isgraph },
754 	{ "lower",	5,	islower },
755 	{ "print",	5,	isprint },
756 	{ "punct",	5,	ispunct },
757 	{ "space",	5,	isspace },
758 	{ "upper",	5,	isupper },
759 	{ "xdigit",	6,	isxdigit },
760 	{ NULL,		0,	NULL },
761 };
762 
763 
764 int relex(void)		/* lexical analyzer for reparse */
765 {
766 	int c, n;
767 	int cflag;
768 	static uschar *buf = 0;
769 	static int bufsz = 100;
770 	uschar *bp;
771 	struct charclass *cc;
772 	int i;
773 
774 	switch (c = *prestr++) {
775 	case '|': return OR;
776 	case '*': return STAR;
777 	case '+': return PLUS;
778 	case '?': return QUEST;
779 	case '.': return DOT;
780 	case '\0': prestr--; return '\0';
781 	case '^':
782 	case '$':
783 	case '(':
784 	case ')':
785 		return c;
786 	case '\\':
787 		rlxval = quoted((char **) &prestr);
788 		return CHAR;
789 	default:
790 		rlxval = c;
791 		return CHAR;
792 	case '[':
793 		if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
794 			FATAL("out of space in reg expr %.10s..", lastre);
795 		bp = buf;
796 		if (*prestr == '^') {
797 			cflag = 1;
798 			prestr++;
799 		}
800 		else
801 			cflag = 0;
802 		n = 2 * strlen((const char *) prestr)+1;
803 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
804 			FATAL("out of space for reg expr %.10s...", lastre);
805 		for (; ; ) {
806 			if ((c = *prestr++) == '\\') {
807 				*bp++ = '\\';
808 				if ((c = *prestr++) == '\0')
809 					FATAL("nonterminated character class %.20s...", lastre);
810 				*bp++ = c;
811 			/* } else if (c == '\n') { */
812 			/* 	FATAL("newline in character class %.20s...", lastre); */
813 			} else if (c == '[' && *prestr == ':') {
814 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
815 				for (cc = charclasses; cc->cc_name; cc++)
816 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
817 						break;
818 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
819 				    prestr[2 + cc->cc_namelen] == ']') {
820 					prestr += cc->cc_namelen + 3;
821 					for (i = 0; i < NCHARS; i++) {
822 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
823 						    FATAL("out of space for reg expr %.10s...", lastre);
824 						if (cc->cc_func(i)) {
825 							*bp++ = i;
826 							n++;
827 						}
828 					}
829 				} else
830 					*bp++ = c;
831 			} else if (c == '\0') {
832 				FATAL("nonterminated character class %.20s", lastre);
833 			} else if (bp == buf) {	/* 1st char is special */
834 				*bp++ = c;
835 			} else if (c == ']') {
836 				*bp++ = 0;
837 				rlxstr = (uschar *) tostring((char *) buf);
838 				if (cflag == 0)
839 					return CCL;
840 				else
841 					return NCCL;
842 			} else
843 				*bp++ = c;
844 		}
845 	}
846 }
847 
848 int cgoto(fa *f, int s, int c)
849 {
850 	int i, j, k;
851 	int *p, *q;
852 
853 	assert(c == HAT || c < NCHARS);
854 	while (f->accept >= maxsetvec) {	/* guessing here! */
855 		maxsetvec *= 4;
856 		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
857 		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
858 		if (setvec == 0 || tmpset == 0)
859 			overflo("out of space in cgoto()");
860 	}
861 	for (i = 0; i <= f->accept; i++)
862 		setvec[i] = 0;
863 	setcnt = 0;
864 	/* compute positions of gototab[s,c] into setvec */
865 	p = f->posns[s];
866 	for (i = 1; i <= *p; i++) {
867 		if ((k = f->re[p[i]].ltype) != FINAL) {
868 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
869 			 || (k == DOT && c != 0 && c != HAT)
870 			 || (k == ALL && c != 0)
871 			 || (k == EMPTYRE && c != 0)
872 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
873 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
874 				q = f->re[p[i]].lfollow;
875 				for (j = 1; j <= *q; j++) {
876 					if (q[j] >= maxsetvec) {
877 						maxsetvec *= 4;
878 						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
879 						tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int));
880 						if (setvec == 0 || tmpset == 0)
881 							overflo("cgoto overflow");
882 					}
883 					if (setvec[q[j]] == 0) {
884 						setcnt++;
885 						setvec[q[j]] = 1;
886 					}
887 				}
888 			}
889 		}
890 	}
891 	/* determine if setvec is a previous state */
892 	tmpset[0] = setcnt;
893 	j = 1;
894 	for (i = f->accept; i >= 0; i--)
895 		if (setvec[i]) {
896 			tmpset[j++] = i;
897 		}
898 	/* tmpset == previous state? */
899 	for (i = 1; i <= f->curstat; i++) {
900 		p = f->posns[i];
901 		if ((k = tmpset[0]) != p[0])
902 			goto different;
903 		for (j = 1; j <= k; j++)
904 			if (tmpset[j] != p[j])
905 				goto different;
906 		/* setvec is state i */
907 		f->gototab[s][c] = i;
908 		return i;
909 	  different:;
910 	}
911 
912 	/* add tmpset to current set of states */
913 	if (f->curstat >= NSTATES-1) {
914 		f->curstat = 2;
915 		f->reset = 1;
916 		for (i = 2; i < NSTATES; i++)
917 			xfree(f->posns[i]);
918 	} else
919 		++(f->curstat);
920 	for (i = 0; i < NCHARS; i++)
921 		f->gototab[f->curstat][i] = 0;
922 	xfree(f->posns[f->curstat]);
923 	if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
924 		overflo("out of space in cgoto");
925 
926 	f->posns[f->curstat] = p;
927 	f->gototab[s][c] = f->curstat;
928 	for (i = 0; i <= setcnt; i++)
929 		p[i] = tmpset[i];
930 	if (setvec[f->accept])
931 		f->out[f->curstat] = 1;
932 	else
933 		f->out[f->curstat] = 0;
934 	return f->curstat;
935 }
936 
937 
938 void freefa(fa *f)	/* free a finite automaton */
939 {
940 	int i;
941 
942 	if (f == NULL)
943 		return;
944 	for (i = 0; i <= f->curstat; i++)
945 		xfree(f->posns[i]);
946 	for (i = 0; i <= f->accept; i++) {
947 		xfree(f->re[i].lfollow);
948 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
949 			xfree((f->re[i].lval.np));
950 	}
951 	xfree(f->restr);
952 	xfree(f);
953 }
954