xref: /original-bsd/old/awk/b.c (revision 894d3702)
1 /*-
2  * Copyright (c) 1991 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * %sccs.include.proprietary.c%
6  */
7 
8 #ifndef lint
9 static char sccsid[] = "@(#)b.c	4.4 (Berkeley) 04/17/91";
10 #endif /* not lint */
11 
12 #include "awk.def"
13 #include "stdio.h"
14 #include "awk.h"
15 
16 extern node *op2();
17 extern struct fa *cgotofn();
18 #define MAXLIN 256
19 #define NCHARS 128
20 #define NSTATES 256
21 
22 #define type(v)	v->nobj
23 #define left(v)	v->narg[0]
24 #define right(v)	v->narg[1]
25 #define parent(v)	v->nnext
26 
27 #define LEAF	case CCL: case NCCL: case CHAR: case DOT:
28 #define UNARY	case FINAL: case STAR: case PLUS: case QUEST:
29 
30 /* encoding in tree nodes:
31 	leaf (CCL, NCCL, CHAR, DOT): left is index, right contains value or pointer to value
32 	unary (FINAL, STAR, PLUS, QUEST): left is child, right is null
33 	binary (CAT, OR): left and right are children
34 	parent contains pointer to parent
35 */
36 
37 struct fa {
38 	int cch;
39 	struct fa *st;
40 };
41 
42 int	*state[NSTATES];
43 int	*foll[MAXLIN];
44 char	chars[MAXLIN];
45 int	setvec[MAXLIN];
46 node	*point[MAXLIN];
47 
48 int	setcnt;
49 int	line;
50 int	maxfoll;  /* index of highest foll[] entry set by cfoll() */
51 
52 
makedfa(p)53 struct fa *makedfa(p)	/* returns dfa for tree pointed to by p */
54 node *p;
55 {
56 	node *p1;
57 	struct fa *fap;
58 	p1 = op2(CAT, op2(STAR, op2(DOT, (node *) 0, (node *) 0), (node *) 0), p);
59 		/* put DOT STAR in front of reg. exp. */
60 	p1 = op2(FINAL, p1, (node *) 0);		/* install FINAL node */
61 
62 	line = 0;
63 	penter(p1);	/* enter parent pointers and leaf indices */
64 	point[line] = p1;	/* FINAL node */
65 	setvec[0] = 1;		/* for initial DOT STAR */
66 	cfoll(p1);	/* set up follow sets */
67 	fap = cgotofn();
68 	freetr(p1);	/* add this when alloc works */
69 	return(fap);
70 }
71 
penter(p)72 penter(p)	/* set up parent pointers and leaf indices */
73 node *p;
74 {
75 	switch(type(p)) {
76 		LEAF
77 			left(p) = (node *) line;
78 			point[line++] = p;
79 			break;
80 		UNARY
81 			penter(left(p));
82 			parent(left(p)) = p;
83 			break;
84 		case CAT:
85 		case OR:
86 			penter(left(p));
87 			penter(right(p));
88 			parent(left(p)) = p;
89 			parent(right(p)) = p;
90 			break;
91 		default:
92 			error(FATAL, "unknown type %d in penter\n", type(p));
93 			break;
94 	}
95 }
96 
freetr(p)97 freetr(p)	/* free parse tree and follow sets */
98 node *p;
99 {
100 	switch(type(p)) {
101 		LEAF
102 			foll_free((int) left(p));
103 			xfree(p);
104 			break;
105 		UNARY
106 			freetr(left(p));
107 			xfree(p);
108 			break;
109 		case CAT:
110 		case OR:
111 			freetr(left(p));
112 			freetr(right(p));
113 			xfree(p);
114 			break;
115 		default:
116 			error(FATAL, "unknown type %d in freetr", type(p));
117 			break;
118 	}
119 }
cclenter(p)120 char *cclenter(p)
121 register char *p;
122 {
123 	register i, c;
124 	char *op;
125 
126 	op = p;
127 	i = 0;
128 	while ((c = *p++) != 0) {
129 		if (c == '-' && i > 0 && chars[i-1] != 0) {
130 			if (*p != 0) {
131 				c = chars[i-1];
132 				while (c < *p) {
133 					if (i >= MAXLIN)
134 						overflo();
135 					chars[i++] = ++c;
136 				}
137 				p++;
138 				continue;
139 			}
140 		}
141 		if (i >= MAXLIN)
142 			overflo();
143 		chars[i++] = c;
144 	}
145 	chars[i++] = '\0';
146 	dprintf("cclenter: in = |%s|, out = |%s|\n", op, chars, NULL);
147 	xfree(op);
148 	return(tostring(chars));
149 }
150 
overflo()151 overflo()
152 {
153 	error(FATAL, "regular expression too long\n");
154 }
155 
cfoll(v)156 cfoll(v)		/* enter follow set of each leaf of vertex v into foll[leaf] */
157 register node *v;
158 {
159 	register i;
160 	int prev;
161 	int *add();
162 
163 	maxfoll = -1;
164 	switch(type(v)) {
165 		LEAF
166 			setcnt = 0;
167 			for (i=1; i<=line; i++)
168 				setvec[i] = 0;
169 			follow(v);
170 			if (notin(foll, ( (int) left(v))-1, &prev)) {
171 				foll[(int) left(v)] = add(setcnt);
172 			}
173 			else
174 				foll[ (int) left(v)] = foll[prev];
175 			if ((int)left(v) > maxfoll)
176 				maxfoll = (int)left(v);
177 			break;
178 		UNARY
179 			cfoll(left(v));
180 			break;
181 		case CAT:
182 		case OR:
183 			cfoll(left(v));
184 			cfoll(right(v));
185 			break;
186 		default:
187 			error(FATAL, "unknown type %d in cfoll", type(v));
188 	}
189 }
190 
first(p)191 first(p)			/* collects initially active leaves of p into setvec */
192 register node *p;		/* returns 0 or 1 depending on whether p matches empty string */
193 {
194 	register b;
195 
196 	switch(type(p)) {
197 		LEAF
198 			if (setvec[(int) left(p)] != 1) {
199 				setvec[(int) left(p)] = 1;
200 				setcnt++;
201 			}
202 			if (type(p) == CCL && (*(char *) right(p)) == '\0')
203 				return(0);		/* empty CCL */
204 			else return(1);
205 		case FINAL:
206 		case PLUS:
207 			if (first(left(p)) == 0) return(0);
208 			return(1);
209 		case STAR:
210 		case QUEST:
211 			first(left(p));
212 			return(0);
213 		case CAT:
214 			if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
215 			return(1);
216 		case OR:
217 			b = first(right(p));
218 			if (first(left(p)) == 0 || b == 0) return(0);
219 			return(1);
220 	}
221 	error(FATAL, "unknown type %d in first\n", type(p));
222 	return(-1);
223 }
224 
follow(v)225 follow(v)
226 node *v;		/* collects leaves that can follow v into setvec */
227 {
228 	node *p;
229 
230 	if (type(v) == FINAL)
231 		return;
232 	p = parent(v);
233 	switch (type(p)) {
234 		case STAR:
235 		case PLUS:	first(v);
236 				follow(p);
237 				return;
238 
239 		case OR:
240 		case QUEST:	follow(p);
241 				return;
242 
243 		case CAT:	if (v == left(p)) {	/* v is left child of p */
244 					if (first(right(p)) == 0) {
245 						follow(p);
246 						return;
247 					}
248 				}
249 				else		/* v is right child */
250 					follow(p);
251 				return;
252 		case FINAL:	if (setvec[line] != 1) {
253 					setvec[line] = 1;
254 					setcnt++;
255 				}
256 				return;
257 	}
258 }
259 
member(c,s)260 member(c, s)	/* is c in s? */
261 register char c, *s;
262 {
263 	while (*s)
264 		if (c == *s++)
265 			return(1);
266 	return(0);
267 }
268 
notin(array,n,prev)269 notin(array, n, prev)		/* is setvec in array[0] thru array[n]? */
270 int **array;
271 int *prev; {
272 	register i, j;
273 	int *ptr;
274 	for (i=0; i<=n; i++) {
275 		ptr = array[i];
276 		if (*ptr == setcnt) {
277 			for (j=0; j < setcnt; j++)
278 				if (setvec[*(++ptr)] != 1) goto nxt;
279 			*prev = i;
280 			return(0);
281 		}
282 		nxt: ;
283 	}
284 	return(1);
285 }
286 
add(n)287 int *add(n) {		/* remember setvec */
288 	int *ptr, *p;
289 	register i;
290 	if ((p = ptr = (int *) malloc((n+1)*sizeof(int))) == NULL)
291 		overflo();
292 	*ptr = n;
293 	dprintf("add(%d)\n", n, NULL, NULL);
294 	for (i=1; i <= line; i++)
295 		if (setvec[i] == 1) {
296 			*(++ptr) = i;
297 			dprintf("  ptr = %o, *ptr = %d, i = %d\n", ptr, *ptr, i);
298 		}
299 	dprintf("\n", NULL, NULL, NULL);
300 	return(p);
301 }
302 
cgotofn()303 struct fa *cgotofn()
304 {
305 	register i, k;
306 	register int *ptr;
307 	char c;
308 	char *p;
309 	node *cp;
310 	int j, n, s, ind, numtrans;
311 	int finflg;
312 	int curpos, num, prev;
313 	struct fa *where[NSTATES];
314 
315 	int fatab[257];
316 	struct fa *pfa;
317 
318 	char index[MAXLIN];
319 	char iposns[MAXLIN];
320 	int sposns[MAXLIN];
321 	int spmax, spinit;
322 
323 	char symbol[NCHARS];
324 	char isyms[NCHARS];
325 	char ssyms[NCHARS];
326 	int ssmax, ssinit;
327 
328 	for (i=0; i<=line; i++) index[i] = iposns[i] = setvec[i] = 0;
329 	for (i=0; i<NCHARS; i++)  isyms[i] = symbol[i] = 0;
330 	setcnt = 0;
331 	/* compute initial positions and symbols of state 0 */
332 	ssmax = 0;
333 	ptr = state[0] = foll[0];
334 	spinit = *ptr;
335 	for (i=0; i<spinit; i++) {
336 		curpos = *(++ptr);
337 		sposns[i] = curpos;
338 		iposns[curpos] = 1;
339 		cp = point[curpos];
340 		dprintf("i = %d, spinit = %d, curpos = %d\n", i, spinit, curpos);
341 		switch (type(cp)) {
342 			case CHAR:
343 				k = (int) right(cp);
344 				if (isyms[k] != 1) {
345 					isyms[k] = 1;
346 					ssyms[ssmax++] = k;
347 				}
348 				break;
349 			case DOT:
350 				for (k=1; k<NCHARS; k++) {
351 					if (k != HAT) {
352 						if (isyms[k] != 1) {
353 							isyms[k] = 1;
354 							ssyms[ssmax++] = k;
355 						}
356 					}
357 				}
358 				break;
359 			case CCL:
360 				for (p = (char *) right(cp); *p; p++) {
361 					if (*p != HAT) {
362 						if (isyms[*p] != 1) {
363 							isyms[*p] = 1;
364 							ssyms[ssmax++] = *p;
365 						}
366 					}
367 				}
368 				break;
369 			case NCCL:
370 				for (k=1; k<NCHARS; k++) {
371 					if (k != HAT && !member(k, (char *) right(cp))) {
372 						if (isyms[k] != 1) {
373 							isyms[k] = 1;
374 							ssyms[ssmax++] = k;
375 						}
376 					}
377 				}
378 		}
379 	}
380 	ssinit = ssmax;
381 	n = 0;
382 	for (s=0; s<=n; s++)  {
383 	dprintf("s = %d\n", s, NULL, NULL);
384 		ind = 0;
385 		numtrans = 0;
386 		finflg = 0;
387 		if (*(state[s] + *state[s]) == line) {		/* s final? */
388 			finflg = 1;
389 			goto tenter;
390 		}
391 		spmax = spinit;
392 		ssmax = ssinit;
393 		ptr = state[s];
394 		num = *ptr;
395 		for (i=0; i<num; i++) {
396 			curpos = *(++ptr);
397 			if (iposns[curpos] != 1 && index[curpos] != 1) {
398 				index[curpos] = 1;
399 				sposns[spmax++] = curpos;
400 			}
401 			cp = point[curpos];
402 			switch (type(cp)) {
403 				case CHAR:
404 					k = (int) right(cp);
405 					if (isyms[k] == 0 && symbol[k] == 0) {
406 						symbol[k] = 1;
407 						ssyms[ssmax++] = k;
408 					}
409 					break;
410 				case DOT:
411 					for (k=1; k<NCHARS; k++) {
412 						if (k != HAT) {
413 							if (isyms[k] == 0 && symbol[k] == 0) {
414 								symbol[k] = 1;
415 								ssyms[ssmax++] = k;
416 							}
417 						}
418 					}
419 					break;
420 				case CCL:
421 					for (p = (char *) right(cp); *p; p++) {
422 						if (*p != HAT) {
423 							if (isyms[*p] == 0 && symbol[*p] == 0) {
424 								symbol[*p] = 1;
425 								ssyms[ssmax++] = *p;
426 							}
427 						}
428 					}
429 					break;
430 				case NCCL:
431 					for (k=1; k<NCHARS; k++) {
432 						if (k != HAT && !member(k, (char *) right(cp))) {
433 							if (isyms[k] == 0 && symbol[k] == 0) {
434 								symbol[k] = 1;
435 								ssyms[ssmax++] = k;
436 							}
437 						}
438 					}
439 			}
440 		}
441 		for (j=0; j<ssmax; j++) {	/* nextstate(s, ssyms[j]) */
442 			c = ssyms[j];
443 			symbol[c] = 0;
444 			setcnt = 0;
445 			for (k=0; k<=line; k++) setvec[k] = 0;
446 			for (i=0; i<spmax; i++) {
447 				index[sposns[i]] = 0;
448 				cp = point[sposns[i]];
449 				if ((k = type(cp)) != FINAL)
450 					if (k == CHAR && c == (int) right(cp)
451 					 || k == DOT
452 					 || k == CCL && member(c, (char *) right(cp))
453 					 || k == NCCL && !member(c, (char *) right(cp))) {
454 						ptr = foll[sposns[i]];
455 						num = *ptr;
456 						for (k=0; k<num; k++) {
457 							if (setvec[*(++ptr)] != 1
458 								&& iposns[*ptr] != 1) {
459 								setvec[*ptr] = 1;
460 								setcnt++;
461 							}
462 						}
463 					}
464 			} /* end nextstate */
465 			if (notin(state, n, &prev)) {
466 				if (n >= NSTATES) {
467 					dprintf("cgotofn: notin; state = %d, n = %d\n", state, n, NULL);
468 					overflo();
469 				}
470 				state[++n] = add(setcnt);
471 				dprintf("	delta(%d,%o) = %d", s,c,n);
472 				dprintf(", ind = %d\n", ind+1, NULL, NULL);
473 				fatab[++ind] = c;
474 				fatab[++ind] = n;
475 				numtrans++;
476 			}
477 			else {
478 				if (prev != 0) {
479 					dprintf("	delta(%d,%o) = %d", s,c,prev);
480 					dprintf(", ind = %d\n", ind+1, NULL, NULL);
481 					fatab[++ind] = c;
482 					fatab[++ind] = prev;
483 					numtrans++;
484 				}
485 			}
486 		}
487 	tenter:
488 		if ((pfa = (struct fa *) malloc((numtrans + 1) * sizeof(struct fa))) == NULL)
489 			overflo();
490 		where[s] = pfa;
491 		if (finflg)
492 			pfa->cch = -1;		/* s is a final state */
493 		else
494 			pfa->cch = numtrans;
495 		pfa->st = 0;
496 		for (i=1, pfa += 1; i<=numtrans; i++, pfa++) {
497 			pfa->cch = fatab[2*i-1];
498 			pfa->st = (struct fa *) fatab[2*i];
499 		}
500 	}
501 	for (i=0; i<=n; i++) {
502 		/* N.b. state[0] == foll[0], not separately allocated */
503 		if (i>0)
504 			xfree(state[i]);       /* free state[i] */
505 		pfa = where[i];
506 		pfa->st = where[0];
507 		dprintf("state %d: (%o)\n", i, pfa, NULL);
508 		dprintf("	numtrans = %d,	default = %o\n", pfa->cch, pfa->st, NULL);
509 		for (k=1; k<=pfa->cch; k++) {
510 			(pfa+k)->st = where[ (int) (pfa+k)->st];
511 			dprintf("	char = %o,	nextstate = %o\n",(pfa+k)->cch, (pfa+k)->st, NULL);
512 		}
513 	}
514 	pfa = where[0];
515 	if ((num = pfa->cch) < 0)
516 		return(where[0]);
517 	for (pfa += num; num; num--, pfa--)
518 		if (pfa->cch == HAT) {
519 			return(pfa->st);
520 		}
521 	return(where[0]);
522 }
523 
match(pfa,p)524 match(pfa, p)
525 register struct fa *pfa;
526 register char *p;
527 {
528 	register count;
529 	char c;
530 	if (p == 0) return(0);
531 	if (pfa->cch == 1) {		/* fast test for first character, if possible */
532 		c = (++pfa)->cch;
533 		do
534 			if (c == *p) {
535 				p++;
536 				pfa = pfa->st;
537 				goto adv;
538 			}
539 		while (*p++ != 0);
540 		return(0);
541 	}
542    adv: if ((count = pfa->cch) < 0) return(1);
543 	do {
544 		for (pfa += count; count; count--, pfa--)
545 			if (pfa->cch == *p) {
546 				break;
547 			}
548 		pfa = pfa->st;
549 		if ((count = pfa->cch) < 0) return(1);
550 	} while(*p++ != 0);
551 	return(0);
552 }
553 
554 /*
555  * Free foll[i], taking into account identical foll[] entries.
556  * This is necessary because cfoll() uses the same physical follow set for
557  * several foll[] entries when the set is identical.  Called by freetr().
558  */
foll_free(i)559 foll_free(i)
560 int i;
561 {
562 	register int j;
563 	int *p = foll[i];
564 	if (p==NULL) return;
565 	for (j=0; j<=maxfoll; j++)
566 		if (foll[j]==p) foll[j]=NULL;
567 	xfree(p);
568 }
569