xref: /netbsd/external/historical/nawk/dist/b.c (revision 6550d01e)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #if HAVE_NBTOOL_CONFIG_H
28 #include "nbtool_config.h"
29 #endif
30 
31 #define	DEBUG
32 
33 #include <ctype.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include <assert.h>
38 #include "awk.h"
39 #include "awkgram.h"
40 
41 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
42 				/* NCHARS is 2**n */
43 #define MAXLIN 22
44 
45 #define type(v)		(v)->nobj	/* badly overloaded here */
46 #define info(v)		(v)->ntype	/* badly overloaded here */
47 #define left(v)		(v)->narg[0]
48 #define right(v)	(v)->narg[1]
49 #define parent(v)	(v)->nnext
50 
51 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
52 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
53 #define UNARY	case STAR: case PLUS: case QUEST:
54 
55 /* encoding in tree Nodes:
56 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
57 		left is index, right contains value or pointer to value
58 	unary (STAR, PLUS, QUEST): left is child, right is null
59 	binary (CAT, OR): left and right are children
60 	parent contains pointer to parent
61 */
62 
63 
64 int	*setvec;
65 int	*tmpset;
66 int	maxsetvec = 0;
67 
68 int	rtok;		/* next token in current re */
69 int	rlxval;
70 static const uschar	*rlxstr;
71 static const uschar	*prestr;	/* current position in current re */
72 static const uschar	*lastre;	/* origin of last re */
73 
74 static	int setcnt;
75 static	int poscnt;
76 
77 uschar	*patbeg;
78 int	patlen;
79 
80 #define	NFA	128	/* cache this many dynamic fa's */
81 fa	*fatab[NFA];
82 int	nfatab	= 0;	/* entries in fatab */
83 
84 static void
85 resizesetvec(const char *msg)
86 {
87 	if (maxsetvec == 0)
88 		maxsetvec = MAXLIN;
89 	else
90 		maxsetvec *= 4;
91 	setvec = realloc(setvec, maxsetvec * sizeof(*setvec));
92 	tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset));
93 	if (setvec == 0 || tmpset == 0)
94 	    overflo(msg);
95 }
96 
97 static void
98 resize_state(fa *f, int state)
99 {
100 	void *p;
101 	int i, new_count;
102 
103 	if (++state < f->state_count)
104 		return;
105 
106 	new_count = state + 10; /* needs to be tuned */
107 
108 	p = realloc(f->gototab, new_count * sizeof(f->gototab[0]));
109 	if (p == NULL)
110 		goto out;
111 	f->gototab = p;
112 
113 	p = realloc(f->out, new_count * sizeof(f->out[0]));
114 	if (p == NULL)
115 		goto out;
116 	f->out = p;
117 
118 	p = realloc(f->posns, new_count * sizeof(f->posns[0]));
119 	if (p == NULL)
120 		goto out;
121 	f->posns = p;
122 
123 	for (i = f->state_count; i < new_count; ++i) {
124 		f->gototab[i] = calloc(1, NCHARS * sizeof (**f->gototab));
125 		if (f->gototab[i] == NULL)
126 			goto out;
127 		f->out[i]  = 0;
128 		f->posns[i] = NULL;
129 	}
130 	f->state_count = new_count;
131 	return;
132 out:
133 	overflo("out of memory in resize_state");
134 }
135 
136 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
137 {
138 	int i, use, nuse;
139 	fa *pfa;
140 	static int now = 1;
141 
142 	if (setvec == 0)	/* first time through any RE */
143 		resizesetvec("out of space initializing makedfa");
144 
145 	if (compile_time)	/* a constant for sure */
146 		return mkdfa(s, anchor);
147 	for (i = 0; i < nfatab; i++)	/* is it there already? */
148 		if (fatab[i]->anchor == anchor
149 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
150 			fatab[i]->use = now++;
151 			return fatab[i];
152 		}
153 	pfa = mkdfa(s, anchor);
154 	if (nfatab < NFA) {	/* room for another */
155 		fatab[nfatab] = pfa;
156 		fatab[nfatab]->use = now++;
157 		nfatab++;
158 		return pfa;
159 	}
160 	use = fatab[0]->use;	/* replace least-recently used */
161 	nuse = 0;
162 	for (i = 1; i < nfatab; i++)
163 		if (fatab[i]->use < use) {
164 			use = fatab[i]->use;
165 			nuse = i;
166 		}
167 	freefa(fatab[nuse]);
168 	fatab[nuse] = pfa;
169 	pfa->use = now++;
170 	return pfa;
171 }
172 
173 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
174 				/* anchor = 1 for anchored matches, else 0 */
175 {
176 	Node *p, *p1;
177 	fa *f;
178 
179 	p = reparse(s);
180 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
181 		/* put ALL STAR in front of reg.  exp. */
182 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
183 		/* put FINAL after reg.  exp. */
184 
185 	poscnt = 0;
186 	penter(p1);	/* enter parent pointers and leaf indices */
187 	if ((f = calloc(1, sizeof(*f) + poscnt*sizeof(rrow))) == NULL)
188 		overflo("out of space for fa");
189 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
190 	cfoll(f, p1);	/* set up follow sets */
191 	freetr(p1);
192 	resize_state(f, 1);
193 	if ((f->posns[0] = calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
194 			overflo("out of space in makedfa");
195 	if ((f->posns[1] = calloc(1, sizeof(int))) == NULL)
196 		overflo("out of space in makedfa");
197 	*f->posns[1] = 0;
198 	f->initstat = makeinit(f, anchor);
199 	f->anchor = anchor;
200 	f->restr = (uschar *) tostring(s);
201 	return f;
202 }
203 
204 int makeinit(fa *f, int anchor)
205 {
206 	int i, k;
207 
208 	resize_state(f, 2);
209 	f->curstat = 2;
210 	f->out[2] = 0;
211 	k = *(f->re[0].lfollow);
212 	xfree(f->posns[2]);
213 	if ((f->posns[2] = calloc(1, (k+1)*sizeof(int))) == NULL)
214 		overflo("out of space in makeinit");
215 	for (i=0; i <= k; i++) {
216 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
217 	}
218 	if ((f->posns[2])[1] == f->accept)
219 		f->out[2] = 1;
220 	for (i=0; i < NCHARS; i++)
221 		f->gototab[2][i] = 0;
222 	f->curstat = cgoto(f, 2, HAT);
223 	if (anchor) {
224 		*f->posns[2] = k-1;	/* leave out position 0 */
225 		for (i=0; i < k; i++) {
226 			(f->posns[0])[i] = (f->posns[2])[i];
227 		}
228 
229 		f->out[0] = f->out[2];
230 		if (f->curstat != 2) {
231 			resize_state(f, f->curstat);
232 			--(*f->posns[f->curstat]);
233 		}
234 	}
235 	return f->curstat;
236 }
237 
238 void penter(Node *p)	/* set up parent pointers and leaf indices */
239 {
240 	switch (type(p)) {
241 	ELEAF
242 	LEAF
243 		info(p) = poscnt;
244 		poscnt++;
245 		break;
246 	UNARY
247 		penter(left(p));
248 		parent(left(p)) = p;
249 		break;
250 	case CAT:
251 	case OR:
252 		penter(left(p));
253 		penter(right(p));
254 		parent(left(p)) = p;
255 		parent(right(p)) = p;
256 		break;
257 	default:	/* can't happen */
258 		FATAL("can't happen: unknown type %d in penter", type(p));
259 		break;
260 	}
261 }
262 
263 void freetr(Node *p)	/* free parse tree */
264 {
265 	switch (type(p)) {
266 	ELEAF
267 	LEAF
268 		xfree(p);
269 		break;
270 	UNARY
271 		freetr(left(p));
272 		xfree(p);
273 		break;
274 	case CAT:
275 	case OR:
276 		freetr(left(p));
277 		freetr(right(p));
278 		xfree(p);
279 		break;
280 	default:	/* can't happen */
281 		FATAL("can't happen: unknown type %d in freetr", type(p));
282 		break;
283 	}
284 }
285 
286 /* in the parsing of regular expressions, metacharacters like . have */
287 /* to be seen literally;  \056 is not a metacharacter. */
288 
289 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
290 {			/* only pick up one 8-bit byte (2 chars) */
291 	const uschar *p;
292 	int n = 0;
293 	int i;
294 
295 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
296 		if (isdigit(*p))
297 			n = 16 * n + *p - '0';
298 		else if (*p >= 'a' && *p <= 'f')
299 			n = 16 * n + *p - 'a' + 10;
300 		else if (*p >= 'A' && *p <= 'F')
301 			n = 16 * n + *p - 'A' + 10;
302 	}
303 	*pp = p;
304 	return n;
305 }
306 
307 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
308 
309 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
310 			/* and increment *pp */
311 {
312 	const uschar *p = *pp;
313 	int c;
314 
315 	if ((c = *p++) == 't')
316 		c = '\t';
317 	else if (c == 'n')
318 		c = '\n';
319 	else if (c == 'f')
320 		c = '\f';
321 	else if (c == 'r')
322 		c = '\r';
323 	else if (c == 'b')
324 		c = '\b';
325 	else if (c == '\\')
326 		c = '\\';
327 	else if (c == 'x') {	/* hexadecimal goo follows */
328 		c = hexstr(&p);	/* this adds a null if number is invalid */
329 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
330 		int n = c - '0';
331 		if (isoctdigit(*p)) {
332 			n = 8 * n + *p++ - '0';
333 			if (isoctdigit(*p))
334 				n = 8 * n + *p++ - '0';
335 		}
336 		c = n;
337 	} /* else */
338 		/* c = c; */
339 	*pp = p;
340 	return c;
341 }
342 
343 char *cclenter(const char *argp)	/* add a character class */
344 {
345 	int i, c, c2;
346 	const uschar *p = (const uschar *) argp;
347 	const uschar *op;
348 	uschar *bp;
349 	static uschar *buf = 0;
350 	static int bufsz = 100;
351 
352 	op = p;
353 	if (buf == 0 && (buf = malloc(bufsz)) == NULL)
354 		FATAL("out of space for character class [%.10s...] 1", p);
355 	bp = buf;
356 	for (i = 0; (c = *p++) != 0; ) {
357 		if (c == '\\') {
358 			c = quoted(&p);
359 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
360 			if (*p != 0) {
361 				c = bp[-1];
362 				c2 = *p++;
363 				if (c2 == '\\')
364 					c2 = quoted(&p);
365 				if (c > c2) {	/* empty; ignore */
366 					bp--;
367 					i--;
368 					continue;
369 				}
370 				while (c < c2) {
371 					if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter1"))
372 						FATAL("out of space for character class [%.10s...] 2", p);
373 					*bp++ = ++c;
374 					i++;
375 				}
376 				continue;
377 			}
378 		}
379 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2"))
380 			FATAL("out of space for character class [%.10s...] 3", p);
381 		*bp++ = c;
382 		i++;
383 	}
384 	*bp = 0;
385 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
386 	free(__UNCONST(op));
387 	return (char *) tostring((char *) buf);
388 }
389 
390 void overflo(const char *s)
391 {
392 	FATAL("regular expression too big: %.30s...", s);
393 }
394 
395 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
396 {
397 	int i;
398 	int *p;
399 
400 	switch (type(v)) {
401 	ELEAF
402 	LEAF
403 		f->re[info(v)].ltype = type(v);
404 		f->re[info(v)].lval.np = right(v);
405 		while (f->accept >= maxsetvec) /* guessing here! */
406 			resizesetvec("out of space in cfoll()");
407 		for (i = 0; i <= f->accept; i++)
408 			setvec[i] = 0;
409 		setcnt = 0;
410 		follow(v);	/* computes setvec and setcnt */
411 		if ((p = calloc(1, (setcnt+1)*sizeof(int))) == NULL)
412 			overflo("out of space building follow set");
413 		f->re[info(v)].lfollow = p;
414 		*p = setcnt;
415 		for (i = f->accept; i >= 0; i--)
416 			if (setvec[i] == 1)
417 				*++p = i;
418 		break;
419 	UNARY
420 		cfoll(f,left(v));
421 		break;
422 	case CAT:
423 	case OR:
424 		cfoll(f,left(v));
425 		cfoll(f,right(v));
426 		break;
427 	default:	/* can't happen */
428 		FATAL("can't happen: unknown type %d in cfoll", type(v));
429 	}
430 }
431 
432 int first(Node *p)	/* collects initially active leaves of p into setvec */
433 			/* returns 0 if p matches empty string */
434 {
435 	int b, lp;
436 
437 	switch (type(p)) {
438 	ELEAF
439 	LEAF
440 		lp = info(p);	/* look for high-water mark of subscripts */
441 		while (setcnt >= maxsetvec || lp >= maxsetvec) /* guessing here! */
442 			resizesetvec("out of space in first()");
443 		if (type(p) == EMPTYRE) {
444 			setvec[lp] = 0;
445 			return(0);
446 		}
447 		if (setvec[lp] != 1) {
448 			setvec[lp] = 1;
449 			setcnt++;
450 		}
451 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
452 			return(0);		/* empty CCL */
453 		else return(1);
454 	case PLUS:
455 		if (first(left(p)) == 0) return(0);
456 		return(1);
457 	case STAR:
458 	case QUEST:
459 		first(left(p));
460 		return(0);
461 	case CAT:
462 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
463 		return(1);
464 	case OR:
465 		b = first(right(p));
466 		if (first(left(p)) == 0 || b == 0) return(0);
467 		return(1);
468 	}
469 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
470 	return(-1);
471 }
472 
473 void follow(Node *v)	/* collects leaves that can follow v into setvec */
474 {
475 	Node *p;
476 
477 	if (type(v) == FINAL)
478 		return;
479 	p = parent(v);
480 	switch (type(p)) {
481 	case STAR:
482 	case PLUS:
483 		first(v);
484 		follow(p);
485 		return;
486 
487 	case OR:
488 	case QUEST:
489 		follow(p);
490 		return;
491 
492 	case CAT:
493 		if (v == left(p)) {	/* v is left child of p */
494 			if (first(right(p)) == 0) {
495 				follow(p);
496 				return;
497 			}
498 		} else		/* v is right child */
499 			follow(p);
500 		return;
501 	}
502 }
503 
504 int member(int c, const char *sarg)	/* is c in s? */
505 {
506 	const uschar *s = (const uschar *) sarg;
507 
508 	while (*s)
509 		if (c == *s++)
510 			return(1);
511 	return(0);
512 }
513 
514 int match(fa *f, const char *p0)	/* shortest match ? */
515 {
516 	int s, ns;
517 	const uschar *p = (const uschar *) p0;
518 
519 	s = f->initstat;
520 	assert (s < f->state_count);
521 
522 	if (f->out[s])
523 		return(1);
524 	do {
525 		/* assert(*p < NCHARS); */
526 		if ((ns = f->gototab[s][*p]) != 0)
527 			s = ns;
528 		else
529 			s = cgoto(f, s, *p);
530 
531 		assert (s < f->state_count);
532 
533 		if (f->out[s])
534 			return(1);
535 	} while (*p++ != 0);
536 	return(0);
537 }
538 
539 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
540 {
541 	int s, ns;
542 	uschar *p = __UNCONST(p0);
543 	uschar *q;
544 
545 	s = f->initstat;
546 	assert(s < f->state_count);
547 	patbeg = p;
548 	patlen = -1;
549 	do {
550 		q = p;
551 		do {
552 			if (f->out[s])		/* final state */
553 				patlen = q-p;
554 			/* assert(*q < NCHARS); */
555 			if ((ns = f->gototab[s][*q]) != 0)
556 				s = ns;
557 			else
558 				s = cgoto(f, s, *q);
559 
560 			assert(s < f->state_count);
561 
562 			if (s == 1) {	/* no transition */
563 				if (patlen >= 0) {
564 					patbeg = p;
565 					return(1);
566 				}
567 				else
568 					goto nextin;	/* no match */
569 			}
570 		} while (*q++ != 0);
571 		if (f->out[s])
572 			patlen = q-p-1;	/* don't count $ */
573 		if (patlen >= 0) {
574 			patbeg = p;
575 			return(1);
576 		}
577 	nextin:
578 		s = 2;
579 	} while (*p++ != 0);
580 	return (0);
581 }
582 
583 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
584 {
585 	int s, ns;
586 	uschar *p = __UNCONST(p0);
587 	uschar *q;
588 
589 	s = f->initstat;
590 	assert(s < f->state_count);
591 
592 	patlen = -1;
593 	while (*p) {
594 		q = p;
595 		do {
596 			if (f->out[s])		/* final state */
597 				patlen = q-p;
598 			/* assert(*q < NCHARS); */
599 			if ((ns = f->gototab[s][*q]) != 0)
600 				s = ns;
601 			else
602 				s = cgoto(f, s, *q);
603 
604 			assert(s < f->state_count);
605 
606 			if (s == 1) {	/* no transition */
607 				if (patlen > 0) {
608 					patbeg = p;
609 					return(1);
610 				} else
611 					goto nnextin;	/* no nonempty match */
612 			}
613 		} while (*q++ != 0);
614 		if (f->out[s])
615 			patlen = q-p-1;	/* don't count $ */
616 		if (patlen > 0 ) {
617 			patbeg = p;
618 			return(1);
619 		}
620 	nnextin:
621 		s = 2;
622 		p++;
623 	}
624 	return (0);
625 }
626 
627 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
628 {			/* uses relex() to scan regular expression */
629 	Node *np;
630 
631 	dprintf( ("reparse <%s>\n", p) );
632 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
633 	rtok = relex();
634 	/* GNU compatibility: an empty regexp matches anything */
635 	if (rtok == '\0') {
636 		/* FATAL("empty regular expression"); previous */
637 		return(op2(EMPTYRE, NIL, NIL));
638 	}
639 	np = regexp();
640 	if (rtok != '\0')
641 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
642 	return(np);
643 }
644 
645 Node *regexp(void)	/* top-level parse of reg expr */
646 {
647 	return (alt(concat(primary())));
648 }
649 
650 Node *primary(void)
651 {
652 	Node *np;
653 
654 	switch (rtok) {
655 	case CHAR:
656 		np = op2(CHAR, NIL, itonp(rlxval));
657 		rtok = relex();
658 		return (unary(np));
659 	case ALL:
660 		rtok = relex();
661 		return (unary(op2(ALL, NIL, NIL)));
662 	case EMPTYRE:
663 		rtok = relex();
664 		return (unary(op2(ALL, NIL, NIL)));
665 	case DOT:
666 		rtok = relex();
667 		return (unary(op2(DOT, NIL, NIL)));
668 	case CCL:
669 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
670 		rtok = relex();
671 		return (unary(np));
672 	case NCCL:
673 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
674 		rtok = relex();
675 		return (unary(np));
676 	case '^':
677 		rtok = relex();
678 		return (unary(op2(CHAR, NIL, itonp(HAT))));
679 	case '$':
680 		rtok = relex();
681 		return (unary(op2(CHAR, NIL, NIL)));
682 	case '(':
683 		rtok = relex();
684 		if (rtok == ')') {	/* special pleading for () */
685 			rtok = relex();
686 			return unary(op2(CCL, NIL, (Node *) tostring("")));
687 		}
688 		np = regexp();
689 		if (rtok == ')') {
690 			rtok = relex();
691 			return (unary(np));
692 		}
693 		else
694 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
695 	default:
696 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
697 	}
698 	return 0;	/*NOTREACHED*/
699 }
700 
701 Node *concat(Node *np)
702 {
703 	switch (rtok) {
704 	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
705 		return (concat(op2(CAT, np, primary())));
706 	}
707 	return (np);
708 }
709 
710 Node *alt(Node *np)
711 {
712 	if (rtok == OR) {
713 		rtok = relex();
714 		return (alt(op2(OR, np, concat(primary()))));
715 	}
716 	return (np);
717 }
718 
719 Node *unary(Node *np)
720 {
721 	switch (rtok) {
722 	case STAR:
723 		rtok = relex();
724 		return (unary(op2(STAR, np, NIL)));
725 	case PLUS:
726 		rtok = relex();
727 		return (unary(op2(PLUS, np, NIL)));
728 	case QUEST:
729 		rtok = relex();
730 		return (unary(op2(QUEST, np, NIL)));
731 	default:
732 		return (np);
733 	}
734 }
735 
736 /*
737  * Character class definitions conformant to the POSIX locale as
738  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
739  * and operating character sets are both ASCII (ISO646) or supersets
740  * thereof.
741  *
742  * Note that to avoid overflowing the temporary buffer used in
743  * relex(), the expanded character class (prior to range expansion)
744  * must be less than twice the size of their full name.
745  */
746 
747 /* Because isblank doesn't show up in any of the header files on any
748  * system i use, it's defined here.  if some other locale has a richer
749  * definition of "blank", define HAS_ISBLANK and provide your own
750  * version.
751  * the parentheses here are an attempt to find a path through the maze
752  * of macro definition and/or function and/or version provided.  thanks
753  * to nelson beebe for the suggestion; let's see if it works everywhere.
754  */
755 
756 /* #define HAS_ISBLANK */
757 
758 static const struct charclass {
759 	const char *cc_name;
760 	int cc_namelen;
761 	int (*cc_func)(int);
762 } charclasses[] = {
763 	{ "alnum",	5,	isalnum },
764 	{ "alpha",	5,	isalpha },
765 	{ "blank",	5,	isblank },
766 	{ "cntrl",	5,	iscntrl },
767 	{ "digit",	5,	isdigit },
768 	{ "graph",	5,	isgraph },
769 	{ "lower",	5,	islower },
770 	{ "print",	5,	isprint },
771 	{ "punct",	5,	ispunct },
772 	{ "space",	5,	isspace },
773 	{ "upper",	5,	isupper },
774 	{ "xdigit",	6,	isxdigit },
775 	{ NULL,		0,	NULL },
776 };
777 
778 
779 int relex(void)		/* lexical analyzer for reparse */
780 {
781 	int c, n;
782 	int cflag;
783 	static uschar *buf = 0;
784 	static int bufsz = 100;
785 	uschar *bp;
786 	const struct charclass *cc;
787 	int i;
788 
789 	switch (c = *prestr++) {
790 	case '|': return OR;
791 	case '*': return STAR;
792 	case '+': return PLUS;
793 	case '?': return QUEST;
794 	case '.': return DOT;
795 	case '\0': prestr--; return '\0';
796 	case '^':
797 	case '$':
798 	case '(':
799 	case ')':
800 		return c;
801 	case '\\':
802 		rlxval = quoted(&prestr);
803 		return CHAR;
804 	default:
805 		rlxval = c;
806 		return CHAR;
807 	case '[':
808 		if (buf == 0 && (buf = malloc(bufsz)) == NULL)
809 			FATAL("out of space in reg expr %.10s..", lastre);
810 		bp = buf;
811 		if (*prestr == '^') {
812 			cflag = 1;
813 			prestr++;
814 		}
815 		else
816 			cflag = 0;
817 		n = 2 * strlen((const char *) prestr)+1;
818 		if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1"))
819 			FATAL("out of space for reg expr %.10s...", lastre);
820 		for (; ; ) {
821 			if ((c = *prestr++) == '\\') {
822 				*bp++ = '\\';
823 				if ((c = *prestr++) == '\0')
824 					FATAL("nonterminated character class %.20s...", lastre);
825 				*bp++ = c;
826 			/* } else if (c == '\n') { */
827 			/* 	FATAL("newline in character class %.20s...", lastre); */
828 			} else if (c == '[' && *prestr == ':') {
829 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
830 				for (cc = charclasses; cc->cc_name; cc++)
831 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
832 						break;
833 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
834 				    prestr[2 + cc->cc_namelen] == ']') {
835 					prestr += cc->cc_namelen + 3;
836 					for (i = 1; i < NCHARS; i++) {
837 						if (!adjbuf(&buf, &bufsz, bp-buf+1, 100, &bp, "relex2"))
838 						    FATAL("out of space for reg expr %.10s...", lastre);
839 						if (cc->cc_func(i)) {
840 							*bp++ = i;
841 							n++;
842 						}
843 					}
844 				} else
845 					*bp++ = c;
846 			} else if (c == '\0') {
847 				FATAL("nonterminated character class %.20s", lastre);
848 			} else if (bp == buf) {	/* 1st char is special */
849 				*bp++ = c;
850 			} else if (c == ']') {
851 				*bp++ = 0;
852 				rlxstr = (uschar *) tostring((char *) buf);
853 				if (cflag == 0)
854 					return CCL;
855 				else
856 					return NCCL;
857 			} else
858 				*bp++ = c;
859 		}
860 	}
861 }
862 
863 int cgoto(fa *f, int s, int c)
864 {
865 	int i, j, k;
866 	int *p, *q;
867 
868 	assert(c == HAT || c < NCHARS);
869 	while (f->accept >= maxsetvec) 	/* guessing here! */
870 		resizesetvec("out of space in cgoto()");
871 	for (i = 0; i <= f->accept; i++)
872 		setvec[i] = 0;
873 	setcnt = 0;
874 	resize_state(f, s);
875 	/* compute positions of gototab[s,c] into setvec */
876 	p = f->posns[s];
877 	for (i = 1; i <= *p; i++) {
878 		if ((k = f->re[p[i]].ltype) != FINAL) {
879 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
880 			 || (k == DOT && c != 0 && c != HAT)
881 			 || (k == ALL && c != 0)
882 			 || (k == EMPTYRE && c != 0)
883 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
884 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
885 				q = f->re[p[i]].lfollow;
886 				for (j = 1; j <= *q; j++) {
887 					if (q[j] >= maxsetvec)
888 						resizesetvec("cgoto overflow");
889 					if (setvec[q[j]] == 0) {
890 						setcnt++;
891 						setvec[q[j]] = 1;
892 					}
893 				}
894 			}
895 		}
896 	}
897 	/* determine if setvec is a previous state */
898 	tmpset[0] = setcnt;
899 	j = 1;
900 	for (i = f->accept; i >= 0; i--)
901 		if (setvec[i]) {
902 			tmpset[j++] = i;
903 		}
904 
905 	resize_state(f, f->curstat > s ? f->curstat : s);
906 	/* tmpset == previous state? */
907 	for (i = 1; i <= f->curstat; i++) {
908 		p = f->posns[i];
909 		if ((k = tmpset[0]) != p[0])
910 			goto different;
911 		for (j = 1; j <= k; j++)
912 			if (tmpset[j] != p[j])
913 				goto different;
914 		/* setvec is state i */
915 		if (c != HAT)
916 			f->gototab[s][c] = i;
917 		return i;
918 	  different:;
919 	}
920 
921 	/* add tmpset to current set of states */
922 	++(f->curstat);
923 	resize_state(f, f->curstat);
924 	for (i = 0; i < NCHARS; i++)
925 		f->gototab[f->curstat][i] = 0;
926 	xfree(f->posns[f->curstat]);
927 	if ((p = calloc(1, (setcnt+1)*sizeof(int))) == NULL)
928 		overflo("out of space in cgoto");
929 
930 	f->posns[f->curstat] = p;
931 	if (c != HAT)
932 		f->gototab[s][c] = f->curstat;
933 	for (i = 0; i <= setcnt; i++)
934 		p[i] = tmpset[i];
935 	if (setvec[f->accept])
936 		f->out[f->curstat] = 1;
937 	else
938 		f->out[f->curstat] = 0;
939 	return f->curstat;
940 }
941 
942 
943 void freefa(fa *f)	/* free a finite automaton */
944 {
945 	int i;
946 
947 	if (f == NULL)
948 		return;
949 	for (i = 0; i < f->state_count; i++) {
950 		xfree(f->gototab[i])
951 		xfree(f->posns[i]);
952 	}
953 	for (i = 0; i <= f->accept; i++) {
954 		xfree(f->re[i].lfollow);
955 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
956 			xfree((f->re[i].lval.np));
957 	}
958 	xfree(f->restr);
959 	xfree(f->out);
960 	xfree(f->posns);
961 	xfree(f->gototab);
962 	xfree(f);
963 }
964