xref: /dragonfly/contrib/awk/b.c (revision 6e278935)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #define	DEBUG
28 
29 #include <ctype.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <stdlib.h>
33 #include "awk.h"
34 #include "ytab.h"
35 
36 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
37 				/* NCHARS is 2**n */
38 #define MAXLIN 22
39 
40 #define type(v)		(v)->nobj	/* badly overloaded here */
41 #define info(v)		(v)->ntype	/* badly overloaded here */
42 #define left(v)		(v)->narg[0]
43 #define right(v)	(v)->narg[1]
44 #define parent(v)	(v)->nnext
45 
46 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
48 #define UNARY	case STAR: case PLUS: case QUEST:
49 
50 /* encoding in tree Nodes:
51 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 		left is index, right contains value or pointer to value
53 	unary (STAR, PLUS, QUEST): left is child, right is null
54 	binary (CAT, OR): left and right are children
55 	parent contains pointer to parent
56 */
57 
58 
59 int	*setvec;
60 int	*tmpset;
61 int	maxsetvec = 0;
62 
63 int	rtok;		/* next token in current re */
64 int	rlxval;
65 static uschar	*rlxstr;
66 static uschar	*prestr;	/* current position in current re */
67 static uschar	*lastre;	/* origin of last re */
68 
69 static	int setcnt;
70 static	int poscnt;
71 
72 char	*patbeg;
73 int	patlen;
74 
75 #define	NFA	20	/* cache this many dynamic fa's */
76 fa	*fatab[NFA];
77 int	nfatab	= 0;	/* entries in fatab */
78 
79 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
80 {
81 	int i, use, nuse;
82 	fa *pfa;
83 	static int now = 1;
84 
85 	if (setvec == 0) {	/* first time through any RE */
86 		maxsetvec = MAXLIN;
87 		setvec = (int *) malloc(maxsetvec * sizeof(int));
88 		tmpset = (int *) malloc(maxsetvec * sizeof(int));
89 		if (setvec == 0 || tmpset == 0)
90 			overflo("out of space initializing makedfa");
91 	}
92 
93 	if (compile_time)	/* a constant for sure */
94 		return mkdfa(s, anchor);
95 	for (i = 0; i < nfatab; i++)	/* is it there already? */
96 		if (fatab[i]->anchor == anchor
97 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
98 			fatab[i]->use = now++;
99 			return fatab[i];
100 		}
101 	pfa = mkdfa(s, anchor);
102 	if (nfatab < NFA) {	/* room for another */
103 		fatab[nfatab] = pfa;
104 		fatab[nfatab]->use = now++;
105 		nfatab++;
106 		return pfa;
107 	}
108 	use = fatab[0]->use;	/* replace least-recently used */
109 	nuse = 0;
110 	for (i = 1; i < nfatab; i++)
111 		if (fatab[i]->use < use) {
112 			use = fatab[i]->use;
113 			nuse = i;
114 		}
115 	freefa(fatab[nuse]);
116 	fatab[nuse] = pfa;
117 	pfa->use = now++;
118 	return pfa;
119 }
120 
121 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
122 				/* anchor = 1 for anchored matches, else 0 */
123 {
124 	Node *p, *p1;
125 	fa *f;
126 
127 	p = reparse(s);
128 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
129 		/* put ALL STAR in front of reg.  exp. */
130 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
131 		/* put FINAL after reg.  exp. */
132 
133 	poscnt = 0;
134 	penter(p1);	/* enter parent pointers and leaf indices */
135 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
136 		overflo("out of space for fa");
137 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
138 	cfoll(f, p1);	/* set up follow sets */
139 	freetr(p1);
140 	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
141 			overflo("out of space in makedfa");
142 	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
143 		overflo("out of space in makedfa");
144 	*f->posns[1] = 0;
145 	f->initstat = makeinit(f, anchor);
146 	f->anchor = anchor;
147 	f->restr = (uschar *) tostring(s);
148 	return f;
149 }
150 
151 int makeinit(fa *f, int anchor)
152 {
153 	int i, k;
154 
155 	f->curstat = 2;
156 	f->out[2] = 0;
157 	f->reset = 0;
158 	k = *(f->re[0].lfollow);
159 	xfree(f->posns[2]);
160 	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
161 		overflo("out of space in makeinit");
162 	for (i=0; i <= k; i++) {
163 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
164 	}
165 	if ((f->posns[2])[1] == f->accept)
166 		f->out[2] = 1;
167 	for (i=0; i < NCHARS; i++)
168 		f->gototab[2][i] = 0;
169 	f->curstat = cgoto(f, 2, HAT);
170 	if (anchor) {
171 		*f->posns[2] = k-1;	/* leave out position 0 */
172 		for (i=0; i < k; i++) {
173 			(f->posns[0])[i] = (f->posns[2])[i];
174 		}
175 
176 		f->out[0] = f->out[2];
177 		if (f->curstat != 2)
178 			--(*f->posns[f->curstat]);
179 	}
180 	return f->curstat;
181 }
182 
183 void penter(Node *p)	/* set up parent pointers and leaf indices */
184 {
185 	switch (type(p)) {
186 	ELEAF
187 	LEAF
188 		info(p) = poscnt;
189 		poscnt++;
190 		break;
191 	UNARY
192 		penter(left(p));
193 		parent(left(p)) = p;
194 		break;
195 	case CAT:
196 	case OR:
197 		penter(left(p));
198 		penter(right(p));
199 		parent(left(p)) = p;
200 		parent(right(p)) = p;
201 		break;
202 	default:	/* can't happen */
203 		FATAL("can't happen: unknown type %d in penter", type(p));
204 		break;
205 	}
206 }
207 
208 void freetr(Node *p)	/* free parse tree */
209 {
210 	switch (type(p)) {
211 	ELEAF
212 	LEAF
213 		xfree(p);
214 		break;
215 	UNARY
216 		freetr(left(p));
217 		xfree(p);
218 		break;
219 	case CAT:
220 	case OR:
221 		freetr(left(p));
222 		freetr(right(p));
223 		xfree(p);
224 		break;
225 	default:	/* can't happen */
226 		FATAL("can't happen: unknown type %d in freetr", type(p));
227 		break;
228 	}
229 }
230 
231 /* in the parsing of regular expressions, metacharacters like . have */
232 /* to be seen literally;  \056 is not a metacharacter. */
233 
234 int hexstr(uschar **pp)	/* find and eval hex string at pp, return new p */
235 {			/* only pick up one 8-bit byte (2 chars) */
236 	uschar *p;
237 	int n = 0;
238 	int i;
239 
240 	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
241 		if (isdigit(*p))
242 			n = 16 * n + *p - '0';
243 		else if (*p >= 'a' && *p <= 'f')
244 			n = 16 * n + *p - 'a' + 10;
245 		else if (*p >= 'A' && *p <= 'F')
246 			n = 16 * n + *p - 'A' + 10;
247 	}
248 	*pp = (uschar *) p;
249 	return n;
250 }
251 
252 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
253 
254 int quoted(uschar **pp)	/* pick up next thing after a \\ */
255 			/* and increment *pp */
256 {
257 	uschar *p = *pp;
258 	int c;
259 
260 	if ((c = *p++) == 't')
261 		c = '\t';
262 	else if (c == 'n')
263 		c = '\n';
264 	else if (c == 'f')
265 		c = '\f';
266 	else if (c == 'r')
267 		c = '\r';
268 	else if (c == 'b')
269 		c = '\b';
270 	else if (c == '\\')
271 		c = '\\';
272 	else if (c == 'x') {	/* hexadecimal goo follows */
273 		c = hexstr(&p);	/* this adds a null if number is invalid */
274 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
275 		int n = c - '0';
276 		if (isoctdigit(*p)) {
277 			n = 8 * n + *p++ - '0';
278 			if (isoctdigit(*p))
279 				n = 8 * n + *p++ - '0';
280 		}
281 		c = n;
282 	} /* else */
283 		/* c = c; */
284 	*pp = p;
285 	return c;
286 }
287 
288 char *cclenter(const char *argp)	/* add a character class */
289 {
290 	int i, c, c2;
291 	uschar *p = (uschar *) argp;
292 	uschar *op, *bp;
293 	static uschar *buf = 0;
294 	static int bufsz = 100;
295 
296 	op = p;
297 	if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
298 		FATAL("out of space for character class [%.10s...] 1", p);
299 	bp = buf;
300 	for (i = 0; (c = *p++) != 0; ) {
301 		if (c == '\\') {
302 			c = quoted(&p);
303 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
304 			if (*p != 0) {
305 				c = bp[-1];
306 				c2 = *p++;
307 				if (c2 == '\\')
308 					c2 = quoted(&p);
309 				if (c > c2) {	/* empty; ignore */
310 					bp--;
311 					i--;
312 					continue;
313 				}
314 				while (c < c2) {
315 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
316 						FATAL("out of space for character class [%.10s...] 2", p);
317 					*bp++ = ++c;
318 					i++;
319 				}
320 				continue;
321 			}
322 		}
323 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
324 			FATAL("out of space for character class [%.10s...] 3", p);
325 		*bp++ = c;
326 		i++;
327 	}
328 	*bp = 0;
329 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
330 	xfree(op);
331 	return (char *) tostring((char *) buf);
332 }
333 
334 void overflo(const char *s)
335 {
336 	FATAL("regular expression too big: %.30s...", s);
337 }
338 
339 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
340 {
341 	int i;
342 	int *p;
343 
344 	switch (type(v)) {
345 	ELEAF
346 	LEAF
347 		f->re[info(v)].ltype = type(v);
348 		f->re[info(v)].lval.np = right(v);
349 		while (f->accept >= maxsetvec) {	/* guessing here! */
350 			maxsetvec *= 4;
351 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
352 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
353 			if (setvec == 0 || tmpset == 0)
354 				overflo("out of space in cfoll()");
355 		}
356 		for (i = 0; i <= f->accept; i++)
357 			setvec[i] = 0;
358 		setcnt = 0;
359 		follow(v);	/* computes setvec and setcnt */
360 		if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
361 			overflo("out of space building follow set");
362 		f->re[info(v)].lfollow = p;
363 		*p = setcnt;
364 		for (i = f->accept; i >= 0; i--)
365 			if (setvec[i] == 1)
366 				*++p = i;
367 		break;
368 	UNARY
369 		cfoll(f,left(v));
370 		break;
371 	case CAT:
372 	case OR:
373 		cfoll(f,left(v));
374 		cfoll(f,right(v));
375 		break;
376 	default:	/* can't happen */
377 		FATAL("can't happen: unknown type %d in cfoll", type(v));
378 	}
379 }
380 
381 int first(Node *p)	/* collects initially active leaves of p into setvec */
382 			/* returns 0 if p matches empty string */
383 {
384 	int b, lp;
385 
386 	switch (type(p)) {
387 	ELEAF
388 	LEAF
389 		lp = info(p);	/* look for high-water mark of subscripts */
390 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
391 			maxsetvec *= 4;
392 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
393 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
394 			if (setvec == 0 || tmpset == 0)
395 				overflo("out of space in first()");
396 		}
397 		if (type(p) == EMPTYRE) {
398 			setvec[lp] = 0;
399 			return(0);
400 		}
401 		if (setvec[lp] != 1) {
402 			setvec[lp] = 1;
403 			setcnt++;
404 		}
405 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
406 			return(0);		/* empty CCL */
407 		else return(1);
408 	case PLUS:
409 		if (first(left(p)) == 0) return(0);
410 		return(1);
411 	case STAR:
412 	case QUEST:
413 		first(left(p));
414 		return(0);
415 	case CAT:
416 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
417 		return(1);
418 	case OR:
419 		b = first(right(p));
420 		if (first(left(p)) == 0 || b == 0) return(0);
421 		return(1);
422 	}
423 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
424 	return(-1);
425 }
426 
427 void follow(Node *v)	/* collects leaves that can follow v into setvec */
428 {
429 	Node *p;
430 
431 	if (type(v) == FINAL)
432 		return;
433 	p = parent(v);
434 	switch (type(p)) {
435 	case STAR:
436 	case PLUS:
437 		first(v);
438 		follow(p);
439 		return;
440 
441 	case OR:
442 	case QUEST:
443 		follow(p);
444 		return;
445 
446 	case CAT:
447 		if (v == left(p)) {	/* v is left child of p */
448 			if (first(right(p)) == 0) {
449 				follow(p);
450 				return;
451 			}
452 		} else		/* v is right child */
453 			follow(p);
454 		return;
455 	}
456 }
457 
458 int member(int c, const char *sarg)	/* is c in s? */
459 {
460 	uschar *s = (uschar *) sarg;
461 
462 	while (*s)
463 		if (c == *s++)
464 			return(1);
465 	return(0);
466 }
467 
468 int match(fa *f, const char *p0)	/* shortest match ? */
469 {
470 	int s, ns;
471 	uschar *p = (uschar *) p0;
472 
473 	s = f->reset ? makeinit(f,0) : f->initstat;
474 	if (f->out[s])
475 		return(1);
476 	do {
477 		/* assert(*p < NCHARS); */
478 		if ((ns = f->gototab[s][*p]) != 0)
479 			s = ns;
480 		else
481 			s = cgoto(f, s, *p);
482 		if (f->out[s])
483 			return(1);
484 	} while (*p++ != 0);
485 	return(0);
486 }
487 
488 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
489 {
490 	int s, ns;
491 	uschar *p = (uschar *) p0;
492 	uschar *q;
493 	int i, k;
494 
495 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
496 	if (f->reset) {
497 		f->initstat = s = makeinit(f,1);
498 	} else {
499 		s = f->initstat;
500 	}
501 	patbeg = (char *) p;
502 	patlen = -1;
503 	do {
504 		q = p;
505 		do {
506 			if (f->out[s])		/* final state */
507 				patlen = q-p;
508 			/* assert(*q < NCHARS); */
509 			if ((ns = f->gototab[s][*q]) != 0)
510 				s = ns;
511 			else
512 				s = cgoto(f, s, *q);
513 			if (s == 1) {	/* no transition */
514 				if (patlen >= 0) {
515 					patbeg = (char *) p;
516 					return(1);
517 				}
518 				else
519 					goto nextin;	/* no match */
520 			}
521 		} while (*q++ != 0);
522 		if (f->out[s])
523 			patlen = q-p-1;	/* don't count $ */
524 		if (patlen >= 0) {
525 			patbeg = (char *) p;
526 			return(1);
527 		}
528 	nextin:
529 		s = 2;
530 		if (f->reset) {
531 			for (i = 2; i <= f->curstat; i++)
532 				xfree(f->posns[i]);
533 			k = *f->posns[0];
534 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
535 				overflo("out of space in pmatch");
536 			for (i = 0; i <= k; i++)
537 				(f->posns[2])[i] = (f->posns[0])[i];
538 			f->initstat = f->curstat = 2;
539 			f->out[2] = f->out[0];
540 			for (i = 0; i < NCHARS; i++)
541 				f->gototab[2][i] = 0;
542 		}
543 	} while (*p++ != 0);
544 	return (0);
545 }
546 
547 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
548 {
549 	int s, ns;
550 	uschar *p = (uschar *) p0;
551 	uschar *q;
552 	int i, k;
553 
554 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
555 	if (f->reset) {
556 		f->initstat = s = makeinit(f,1);
557 	} else {
558 		s = f->initstat;
559 	}
560 	patlen = -1;
561 	while (*p) {
562 		q = p;
563 		do {
564 			if (f->out[s])		/* final state */
565 				patlen = q-p;
566 			/* assert(*q < NCHARS); */
567 			if ((ns = f->gototab[s][*q]) != 0)
568 				s = ns;
569 			else
570 				s = cgoto(f, s, *q);
571 			if (s == 1) {	/* no transition */
572 				if (patlen > 0) {
573 					patbeg = (char *) p;
574 					return(1);
575 				} else
576 					goto nnextin;	/* no nonempty match */
577 			}
578 		} while (*q++ != 0);
579 		if (f->out[s])
580 			patlen = q-p-1;	/* don't count $ */
581 		if (patlen > 0 ) {
582 			patbeg = (char *) p;
583 			return(1);
584 		}
585 	nnextin:
586 		s = 2;
587 		if (f->reset) {
588 			for (i = 2; i <= f->curstat; i++)
589 				xfree(f->posns[i]);
590 			k = *f->posns[0];
591 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
592 				overflo("out of state space");
593 			for (i = 0; i <= k; i++)
594 				(f->posns[2])[i] = (f->posns[0])[i];
595 			f->initstat = f->curstat = 2;
596 			f->out[2] = f->out[0];
597 			for (i = 0; i < NCHARS; i++)
598 				f->gototab[2][i] = 0;
599 		}
600 		p++;
601 	}
602 	return (0);
603 }
604 
605 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
606 {			/* uses relex() to scan regular expression */
607 	Node *np;
608 
609 	dprintf( ("reparse <%s>\n", p) );
610 	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
611 	rtok = relex();
612 	/* GNU compatibility: an empty regexp matches anything */
613 	if (rtok == '\0') {
614 		/* FATAL("empty regular expression"); previous */
615 		return(op2(EMPTYRE, NIL, NIL));
616 	}
617 	np = regexp();
618 	if (rtok != '\0')
619 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
620 	return(np);
621 }
622 
623 Node *regexp(void)	/* top-level parse of reg expr */
624 {
625 	return (alt(concat(primary())));
626 }
627 
628 Node *primary(void)
629 {
630 	Node *np;
631 
632 	switch (rtok) {
633 	case CHAR:
634 		np = op2(CHAR, NIL, itonp(rlxval));
635 		rtok = relex();
636 		return (unary(np));
637 	case ALL:
638 		rtok = relex();
639 		return (unary(op2(ALL, NIL, NIL)));
640 	case EMPTYRE:
641 		rtok = relex();
642 		return (unary(op2(ALL, NIL, NIL)));
643 	case DOT:
644 		rtok = relex();
645 		return (unary(op2(DOT, NIL, NIL)));
646 	case CCL:
647 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
648 		rtok = relex();
649 		return (unary(np));
650 	case NCCL:
651 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
652 		rtok = relex();
653 		return (unary(np));
654 	case '^':
655 		rtok = relex();
656 		return (unary(op2(CHAR, NIL, itonp(HAT))));
657 	case '$':
658 		rtok = relex();
659 		return (unary(op2(CHAR, NIL, NIL)));
660 	case '(':
661 		rtok = relex();
662 		if (rtok == ')') {	/* special pleading for () */
663 			rtok = relex();
664 			return unary(op2(CCL, NIL, (Node *) tostring("")));
665 		}
666 		np = regexp();
667 		if (rtok == ')') {
668 			rtok = relex();
669 			return (unary(np));
670 		}
671 		else
672 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
673 	default:
674 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
675 	}
676 	return 0;	/*NOTREACHED*/
677 }
678 
679 Node *concat(Node *np)
680 {
681 	switch (rtok) {
682 	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
683 		return (concat(op2(CAT, np, primary())));
684 	}
685 	return (np);
686 }
687 
688 Node *alt(Node *np)
689 {
690 	if (rtok == OR) {
691 		rtok = relex();
692 		return (alt(op2(OR, np, concat(primary()))));
693 	}
694 	return (np);
695 }
696 
697 Node *unary(Node *np)
698 {
699 	switch (rtok) {
700 	case STAR:
701 		rtok = relex();
702 		return (unary(op2(STAR, np, NIL)));
703 	case PLUS:
704 		rtok = relex();
705 		return (unary(op2(PLUS, np, NIL)));
706 	case QUEST:
707 		rtok = relex();
708 		return (unary(op2(QUEST, np, NIL)));
709 	default:
710 		return (np);
711 	}
712 }
713 
714 /*
715  * Character class definitions conformant to the POSIX locale as
716  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
717  * and operating character sets are both ASCII (ISO646) or supersets
718  * thereof.
719  *
720  * Note that to avoid overflowing the temporary buffer used in
721  * relex(), the expanded character class (prior to range expansion)
722  * must be less than twice the size of their full name.
723  */
724 
725 /* Because isblank doesn't show up in any of the header files on any
726  * system i use, it's defined here.  if some other locale has a richer
727  * definition of "blank", define HAS_ISBLANK and provide your own
728  * version.
729  * the parentheses here are an attempt to find a path through the maze
730  * of macro definition and/or function and/or version provided.  thanks
731  * to nelson beebe for the suggestion; let's see if it works everywhere.
732  */
733 
734 /* #define HAS_ISBLANK */
735 #ifndef HAS_ISBLANK
736 
737 int (xisblank)(int c)
738 {
739 	return c==' ' || c=='\t';
740 }
741 
742 #endif
743 
744 struct charclass {
745 	const char *cc_name;
746 	int cc_namelen;
747 	int (*cc_func)(int);
748 } charclasses[] = {
749 	{ "alnum",	5,	isalnum },
750 	{ "alpha",	5,	isalpha },
751 #ifndef HAS_ISBLANK
752 	{ "blank",	5,	isspace }, /* was isblank */
753 #else
754 	{ "blank",	5,	isblank },
755 #endif
756 	{ "cntrl",	5,	iscntrl },
757 	{ "digit",	5,	isdigit },
758 	{ "graph",	5,	isgraph },
759 	{ "lower",	5,	islower },
760 	{ "print",	5,	isprint },
761 	{ "punct",	5,	ispunct },
762 	{ "space",	5,	isspace },
763 	{ "upper",	5,	isupper },
764 	{ "xdigit",	6,	isxdigit },
765 	{ NULL,		0,	NULL },
766 };
767 
768 
769 int relex(void)		/* lexical analyzer for reparse */
770 {
771 	int c, n;
772 	int cflag;
773 	static uschar *buf = 0;
774 	static int bufsz = 100;
775 	uschar *bp;
776 	struct charclass *cc;
777 	int i;
778 
779 	switch (c = *prestr++) {
780 	case '|': return OR;
781 	case '*': return STAR;
782 	case '+': return PLUS;
783 	case '?': return QUEST;
784 	case '.': return DOT;
785 	case '\0': prestr--; return '\0';
786 	case '^':
787 	case '$':
788 	case '(':
789 	case ')':
790 		return c;
791 	case '\\':
792 		rlxval = quoted(&prestr);
793 		return CHAR;
794 	default:
795 		rlxval = c;
796 		return CHAR;
797 	case '[':
798 		if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
799 			FATAL("out of space in reg expr %.10s..", lastre);
800 		bp = buf;
801 		if (*prestr == '^') {
802 			cflag = 1;
803 			prestr++;
804 		}
805 		else
806 			cflag = 0;
807 		n = 2 * strlen((const char *) prestr)+1;
808 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
809 			FATAL("out of space for reg expr %.10s...", lastre);
810 		for (; ; ) {
811 			if ((c = *prestr++) == '\\') {
812 				*bp++ = '\\';
813 				if ((c = *prestr++) == '\0')
814 					FATAL("nonterminated character class %.20s...", lastre);
815 				*bp++ = c;
816 			/* } else if (c == '\n') { */
817 			/* 	FATAL("newline in character class %.20s...", lastre); */
818 			} else if (c == '[' && *prestr == ':') {
819 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
820 				for (cc = charclasses; cc->cc_name; cc++)
821 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
822 						break;
823 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
824 				    prestr[2 + cc->cc_namelen] == ']') {
825 					prestr += cc->cc_namelen + 3;
826 					for (i = 0; i < NCHARS; i++) {
827 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
828 						    FATAL("out of space for reg expr %.10s...", lastre);
829 						if (cc->cc_func(i)) {
830 							*bp++ = i;
831 							n++;
832 						}
833 					}
834 				} else
835 					*bp++ = c;
836 			} else if (c == '\0') {
837 				FATAL("nonterminated character class %.20s", lastre);
838 			} else if (bp == buf) {	/* 1st char is special */
839 				*bp++ = c;
840 			} else if (c == ']') {
841 				*bp++ = 0;
842 				rlxstr = (uschar *) tostring((char *) buf);
843 				if (cflag == 0)
844 					return CCL;
845 				else
846 					return NCCL;
847 			} else
848 				*bp++ = c;
849 		}
850 	}
851 }
852 
853 int cgoto(fa *f, int s, int c)
854 {
855 	int i, j, k;
856 	int *p, *q;
857 
858 	assert(c == HAT || c < NCHARS);
859 	while (f->accept >= maxsetvec) {	/* guessing here! */
860 		maxsetvec *= 4;
861 		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
862 		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
863 		if (setvec == 0 || tmpset == 0)
864 			overflo("out of space in cgoto()");
865 	}
866 	for (i = 0; i <= f->accept; i++)
867 		setvec[i] = 0;
868 	setcnt = 0;
869 	/* compute positions of gototab[s,c] into setvec */
870 	p = f->posns[s];
871 	for (i = 1; i <= *p; i++) {
872 		if ((k = f->re[p[i]].ltype) != FINAL) {
873 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
874 			 || (k == DOT && c != 0 && c != HAT)
875 			 || (k == ALL && c != 0)
876 			 || (k == EMPTYRE && c != 0)
877 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
878 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
879 				q = f->re[p[i]].lfollow;
880 				for (j = 1; j <= *q; j++) {
881 					if (q[j] >= maxsetvec) {
882 						maxsetvec *= 4;
883 						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
884 						tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
885 						if (setvec == 0 || tmpset == 0)
886 							overflo("cgoto overflow");
887 					}
888 					if (setvec[q[j]] == 0) {
889 						setcnt++;
890 						setvec[q[j]] = 1;
891 					}
892 				}
893 			}
894 		}
895 	}
896 	/* determine if setvec is a previous state */
897 	tmpset[0] = setcnt;
898 	j = 1;
899 	for (i = f->accept; i >= 0; i--)
900 		if (setvec[i]) {
901 			tmpset[j++] = i;
902 		}
903 	/* tmpset == previous state? */
904 	for (i = 1; i <= f->curstat; i++) {
905 		p = f->posns[i];
906 		if ((k = tmpset[0]) != p[0])
907 			goto different;
908 		for (j = 1; j <= k; j++)
909 			if (tmpset[j] != p[j])
910 				goto different;
911 		/* setvec is state i */
912 		f->gototab[s][c] = i;
913 		return i;
914 	  different:;
915 	}
916 
917 	/* add tmpset to current set of states */
918 	if (f->curstat >= NSTATES-1) {
919 		f->curstat = 2;
920 		f->reset = 1;
921 		for (i = 2; i < NSTATES; i++)
922 			xfree(f->posns[i]);
923 	} else
924 		++(f->curstat);
925 	for (i = 0; i < NCHARS; i++)
926 		f->gototab[f->curstat][i] = 0;
927 	xfree(f->posns[f->curstat]);
928 	if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
929 		overflo("out of space in cgoto");
930 
931 	f->posns[f->curstat] = p;
932 	f->gototab[s][c] = f->curstat;
933 	for (i = 0; i <= setcnt; i++)
934 		p[i] = tmpset[i];
935 	if (setvec[f->accept])
936 		f->out[f->curstat] = 1;
937 	else
938 		f->out[f->curstat] = 0;
939 	return f->curstat;
940 }
941 
942 
943 void freefa(fa *f)	/* free a finite automaton */
944 {
945 	int i;
946 
947 	if (f == NULL)
948 		return;
949 	for (i = 0; i <= f->curstat; i++)
950 		xfree(f->posns[i]);
951 	for (i = 0; i <= f->accept; i++) {
952 		xfree(f->re[i].lfollow);
953 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
954 			xfree((f->re[i].lval.np));
955 	}
956 	xfree(f->restr);
957 	xfree(f);
958 }
959