1 /*-
2 * Copyright (c) 1991 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * %sccs.include.proprietary.c%
6 */
7
8 #ifndef lint
9 static char sccsid[] = "@(#)b.c 4.4 (Berkeley) 04/17/91";
10 #endif /* not lint */
11
12 #include "awk.def"
13 #include "stdio.h"
14 #include "awk.h"
15
16 extern node *op2();
17 extern struct fa *cgotofn();
18 #define MAXLIN 256
19 #define NCHARS 128
20 #define NSTATES 256
21
22 #define type(v) v->nobj
23 #define left(v) v->narg[0]
24 #define right(v) v->narg[1]
25 #define parent(v) v->nnext
26
27 #define LEAF case CCL: case NCCL: case CHAR: case DOT:
28 #define UNARY case FINAL: case STAR: case PLUS: case QUEST:
29
30 /* encoding in tree nodes:
31 leaf (CCL, NCCL, CHAR, DOT): left is index, right contains value or pointer to value
32 unary (FINAL, STAR, PLUS, QUEST): left is child, right is null
33 binary (CAT, OR): left and right are children
34 parent contains pointer to parent
35 */
36
37 struct fa {
38 int cch;
39 struct fa *st;
40 };
41
42 int *state[NSTATES];
43 int *foll[MAXLIN];
44 char chars[MAXLIN];
45 int setvec[MAXLIN];
46 node *point[MAXLIN];
47
48 int setcnt;
49 int line;
50 int maxfoll; /* index of highest foll[] entry set by cfoll() */
51
52
makedfa(p)53 struct fa *makedfa(p) /* returns dfa for tree pointed to by p */
54 node *p;
55 {
56 node *p1;
57 struct fa *fap;
58 p1 = op2(CAT, op2(STAR, op2(DOT, (node *) 0, (node *) 0), (node *) 0), p);
59 /* put DOT STAR in front of reg. exp. */
60 p1 = op2(FINAL, p1, (node *) 0); /* install FINAL node */
61
62 line = 0;
63 penter(p1); /* enter parent pointers and leaf indices */
64 point[line] = p1; /* FINAL node */
65 setvec[0] = 1; /* for initial DOT STAR */
66 cfoll(p1); /* set up follow sets */
67 fap = cgotofn();
68 freetr(p1); /* add this when alloc works */
69 return(fap);
70 }
71
penter(p)72 penter(p) /* set up parent pointers and leaf indices */
73 node *p;
74 {
75 switch(type(p)) {
76 LEAF
77 left(p) = (node *) line;
78 point[line++] = p;
79 break;
80 UNARY
81 penter(left(p));
82 parent(left(p)) = p;
83 break;
84 case CAT:
85 case OR:
86 penter(left(p));
87 penter(right(p));
88 parent(left(p)) = p;
89 parent(right(p)) = p;
90 break;
91 default:
92 error(FATAL, "unknown type %d in penter\n", type(p));
93 break;
94 }
95 }
96
freetr(p)97 freetr(p) /* free parse tree and follow sets */
98 node *p;
99 {
100 switch(type(p)) {
101 LEAF
102 foll_free((int) left(p));
103 xfree(p);
104 break;
105 UNARY
106 freetr(left(p));
107 xfree(p);
108 break;
109 case CAT:
110 case OR:
111 freetr(left(p));
112 freetr(right(p));
113 xfree(p);
114 break;
115 default:
116 error(FATAL, "unknown type %d in freetr", type(p));
117 break;
118 }
119 }
cclenter(p)120 char *cclenter(p)
121 register char *p;
122 {
123 register i, c;
124 char *op;
125
126 op = p;
127 i = 0;
128 while ((c = *p++) != 0) {
129 if (c == '-' && i > 0 && chars[i-1] != 0) {
130 if (*p != 0) {
131 c = chars[i-1];
132 while (c < *p) {
133 if (i >= MAXLIN)
134 overflo();
135 chars[i++] = ++c;
136 }
137 p++;
138 continue;
139 }
140 }
141 if (i >= MAXLIN)
142 overflo();
143 chars[i++] = c;
144 }
145 chars[i++] = '\0';
146 dprintf("cclenter: in = |%s|, out = |%s|\n", op, chars, NULL);
147 xfree(op);
148 return(tostring(chars));
149 }
150
overflo()151 overflo()
152 {
153 error(FATAL, "regular expression too long\n");
154 }
155
cfoll(v)156 cfoll(v) /* enter follow set of each leaf of vertex v into foll[leaf] */
157 register node *v;
158 {
159 register i;
160 int prev;
161 int *add();
162
163 maxfoll = -1;
164 switch(type(v)) {
165 LEAF
166 setcnt = 0;
167 for (i=1; i<=line; i++)
168 setvec[i] = 0;
169 follow(v);
170 if (notin(foll, ( (int) left(v))-1, &prev)) {
171 foll[(int) left(v)] = add(setcnt);
172 }
173 else
174 foll[ (int) left(v)] = foll[prev];
175 if ((int)left(v) > maxfoll)
176 maxfoll = (int)left(v);
177 break;
178 UNARY
179 cfoll(left(v));
180 break;
181 case CAT:
182 case OR:
183 cfoll(left(v));
184 cfoll(right(v));
185 break;
186 default:
187 error(FATAL, "unknown type %d in cfoll", type(v));
188 }
189 }
190
first(p)191 first(p) /* collects initially active leaves of p into setvec */
192 register node *p; /* returns 0 or 1 depending on whether p matches empty string */
193 {
194 register b;
195
196 switch(type(p)) {
197 LEAF
198 if (setvec[(int) left(p)] != 1) {
199 setvec[(int) left(p)] = 1;
200 setcnt++;
201 }
202 if (type(p) == CCL && (*(char *) right(p)) == '\0')
203 return(0); /* empty CCL */
204 else return(1);
205 case FINAL:
206 case PLUS:
207 if (first(left(p)) == 0) return(0);
208 return(1);
209 case STAR:
210 case QUEST:
211 first(left(p));
212 return(0);
213 case CAT:
214 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
215 return(1);
216 case OR:
217 b = first(right(p));
218 if (first(left(p)) == 0 || b == 0) return(0);
219 return(1);
220 }
221 error(FATAL, "unknown type %d in first\n", type(p));
222 return(-1);
223 }
224
follow(v)225 follow(v)
226 node *v; /* collects leaves that can follow v into setvec */
227 {
228 node *p;
229
230 if (type(v) == FINAL)
231 return;
232 p = parent(v);
233 switch (type(p)) {
234 case STAR:
235 case PLUS: first(v);
236 follow(p);
237 return;
238
239 case OR:
240 case QUEST: follow(p);
241 return;
242
243 case CAT: if (v == left(p)) { /* v is left child of p */
244 if (first(right(p)) == 0) {
245 follow(p);
246 return;
247 }
248 }
249 else /* v is right child */
250 follow(p);
251 return;
252 case FINAL: if (setvec[line] != 1) {
253 setvec[line] = 1;
254 setcnt++;
255 }
256 return;
257 }
258 }
259
member(c,s)260 member(c, s) /* is c in s? */
261 register char c, *s;
262 {
263 while (*s)
264 if (c == *s++)
265 return(1);
266 return(0);
267 }
268
notin(array,n,prev)269 notin(array, n, prev) /* is setvec in array[0] thru array[n]? */
270 int **array;
271 int *prev; {
272 register i, j;
273 int *ptr;
274 for (i=0; i<=n; i++) {
275 ptr = array[i];
276 if (*ptr == setcnt) {
277 for (j=0; j < setcnt; j++)
278 if (setvec[*(++ptr)] != 1) goto nxt;
279 *prev = i;
280 return(0);
281 }
282 nxt: ;
283 }
284 return(1);
285 }
286
add(n)287 int *add(n) { /* remember setvec */
288 int *ptr, *p;
289 register i;
290 if ((p = ptr = (int *) malloc((n+1)*sizeof(int))) == NULL)
291 overflo();
292 *ptr = n;
293 dprintf("add(%d)\n", n, NULL, NULL);
294 for (i=1; i <= line; i++)
295 if (setvec[i] == 1) {
296 *(++ptr) = i;
297 dprintf(" ptr = %o, *ptr = %d, i = %d\n", ptr, *ptr, i);
298 }
299 dprintf("\n", NULL, NULL, NULL);
300 return(p);
301 }
302
cgotofn()303 struct fa *cgotofn()
304 {
305 register i, k;
306 register int *ptr;
307 char c;
308 char *p;
309 node *cp;
310 int j, n, s, ind, numtrans;
311 int finflg;
312 int curpos, num, prev;
313 struct fa *where[NSTATES];
314
315 int fatab[257];
316 struct fa *pfa;
317
318 char index[MAXLIN];
319 char iposns[MAXLIN];
320 int sposns[MAXLIN];
321 int spmax, spinit;
322
323 char symbol[NCHARS];
324 char isyms[NCHARS];
325 char ssyms[NCHARS];
326 int ssmax, ssinit;
327
328 for (i=0; i<=line; i++) index[i] = iposns[i] = setvec[i] = 0;
329 for (i=0; i<NCHARS; i++) isyms[i] = symbol[i] = 0;
330 setcnt = 0;
331 /* compute initial positions and symbols of state 0 */
332 ssmax = 0;
333 ptr = state[0] = foll[0];
334 spinit = *ptr;
335 for (i=0; i<spinit; i++) {
336 curpos = *(++ptr);
337 sposns[i] = curpos;
338 iposns[curpos] = 1;
339 cp = point[curpos];
340 dprintf("i = %d, spinit = %d, curpos = %d\n", i, spinit, curpos);
341 switch (type(cp)) {
342 case CHAR:
343 k = (int) right(cp);
344 if (isyms[k] != 1) {
345 isyms[k] = 1;
346 ssyms[ssmax++] = k;
347 }
348 break;
349 case DOT:
350 for (k=1; k<NCHARS; k++) {
351 if (k != HAT) {
352 if (isyms[k] != 1) {
353 isyms[k] = 1;
354 ssyms[ssmax++] = k;
355 }
356 }
357 }
358 break;
359 case CCL:
360 for (p = (char *) right(cp); *p; p++) {
361 if (*p != HAT) {
362 if (isyms[*p] != 1) {
363 isyms[*p] = 1;
364 ssyms[ssmax++] = *p;
365 }
366 }
367 }
368 break;
369 case NCCL:
370 for (k=1; k<NCHARS; k++) {
371 if (k != HAT && !member(k, (char *) right(cp))) {
372 if (isyms[k] != 1) {
373 isyms[k] = 1;
374 ssyms[ssmax++] = k;
375 }
376 }
377 }
378 }
379 }
380 ssinit = ssmax;
381 n = 0;
382 for (s=0; s<=n; s++) {
383 dprintf("s = %d\n", s, NULL, NULL);
384 ind = 0;
385 numtrans = 0;
386 finflg = 0;
387 if (*(state[s] + *state[s]) == line) { /* s final? */
388 finflg = 1;
389 goto tenter;
390 }
391 spmax = spinit;
392 ssmax = ssinit;
393 ptr = state[s];
394 num = *ptr;
395 for (i=0; i<num; i++) {
396 curpos = *(++ptr);
397 if (iposns[curpos] != 1 && index[curpos] != 1) {
398 index[curpos] = 1;
399 sposns[spmax++] = curpos;
400 }
401 cp = point[curpos];
402 switch (type(cp)) {
403 case CHAR:
404 k = (int) right(cp);
405 if (isyms[k] == 0 && symbol[k] == 0) {
406 symbol[k] = 1;
407 ssyms[ssmax++] = k;
408 }
409 break;
410 case DOT:
411 for (k=1; k<NCHARS; k++) {
412 if (k != HAT) {
413 if (isyms[k] == 0 && symbol[k] == 0) {
414 symbol[k] = 1;
415 ssyms[ssmax++] = k;
416 }
417 }
418 }
419 break;
420 case CCL:
421 for (p = (char *) right(cp); *p; p++) {
422 if (*p != HAT) {
423 if (isyms[*p] == 0 && symbol[*p] == 0) {
424 symbol[*p] = 1;
425 ssyms[ssmax++] = *p;
426 }
427 }
428 }
429 break;
430 case NCCL:
431 for (k=1; k<NCHARS; k++) {
432 if (k != HAT && !member(k, (char *) right(cp))) {
433 if (isyms[k] == 0 && symbol[k] == 0) {
434 symbol[k] = 1;
435 ssyms[ssmax++] = k;
436 }
437 }
438 }
439 }
440 }
441 for (j=0; j<ssmax; j++) { /* nextstate(s, ssyms[j]) */
442 c = ssyms[j];
443 symbol[c] = 0;
444 setcnt = 0;
445 for (k=0; k<=line; k++) setvec[k] = 0;
446 for (i=0; i<spmax; i++) {
447 index[sposns[i]] = 0;
448 cp = point[sposns[i]];
449 if ((k = type(cp)) != FINAL)
450 if (k == CHAR && c == (int) right(cp)
451 || k == DOT
452 || k == CCL && member(c, (char *) right(cp))
453 || k == NCCL && !member(c, (char *) right(cp))) {
454 ptr = foll[sposns[i]];
455 num = *ptr;
456 for (k=0; k<num; k++) {
457 if (setvec[*(++ptr)] != 1
458 && iposns[*ptr] != 1) {
459 setvec[*ptr] = 1;
460 setcnt++;
461 }
462 }
463 }
464 } /* end nextstate */
465 if (notin(state, n, &prev)) {
466 if (n >= NSTATES) {
467 dprintf("cgotofn: notin; state = %d, n = %d\n", state, n, NULL);
468 overflo();
469 }
470 state[++n] = add(setcnt);
471 dprintf(" delta(%d,%o) = %d", s,c,n);
472 dprintf(", ind = %d\n", ind+1, NULL, NULL);
473 fatab[++ind] = c;
474 fatab[++ind] = n;
475 numtrans++;
476 }
477 else {
478 if (prev != 0) {
479 dprintf(" delta(%d,%o) = %d", s,c,prev);
480 dprintf(", ind = %d\n", ind+1, NULL, NULL);
481 fatab[++ind] = c;
482 fatab[++ind] = prev;
483 numtrans++;
484 }
485 }
486 }
487 tenter:
488 if ((pfa = (struct fa *) malloc((numtrans + 1) * sizeof(struct fa))) == NULL)
489 overflo();
490 where[s] = pfa;
491 if (finflg)
492 pfa->cch = -1; /* s is a final state */
493 else
494 pfa->cch = numtrans;
495 pfa->st = 0;
496 for (i=1, pfa += 1; i<=numtrans; i++, pfa++) {
497 pfa->cch = fatab[2*i-1];
498 pfa->st = (struct fa *) fatab[2*i];
499 }
500 }
501 for (i=0; i<=n; i++) {
502 /* N.b. state[0] == foll[0], not separately allocated */
503 if (i>0)
504 xfree(state[i]); /* free state[i] */
505 pfa = where[i];
506 pfa->st = where[0];
507 dprintf("state %d: (%o)\n", i, pfa, NULL);
508 dprintf(" numtrans = %d, default = %o\n", pfa->cch, pfa->st, NULL);
509 for (k=1; k<=pfa->cch; k++) {
510 (pfa+k)->st = where[ (int) (pfa+k)->st];
511 dprintf(" char = %o, nextstate = %o\n",(pfa+k)->cch, (pfa+k)->st, NULL);
512 }
513 }
514 pfa = where[0];
515 if ((num = pfa->cch) < 0)
516 return(where[0]);
517 for (pfa += num; num; num--, pfa--)
518 if (pfa->cch == HAT) {
519 return(pfa->st);
520 }
521 return(where[0]);
522 }
523
match(pfa,p)524 match(pfa, p)
525 register struct fa *pfa;
526 register char *p;
527 {
528 register count;
529 char c;
530 if (p == 0) return(0);
531 if (pfa->cch == 1) { /* fast test for first character, if possible */
532 c = (++pfa)->cch;
533 do
534 if (c == *p) {
535 p++;
536 pfa = pfa->st;
537 goto adv;
538 }
539 while (*p++ != 0);
540 return(0);
541 }
542 adv: if ((count = pfa->cch) < 0) return(1);
543 do {
544 for (pfa += count; count; count--, pfa--)
545 if (pfa->cch == *p) {
546 break;
547 }
548 pfa = pfa->st;
549 if ((count = pfa->cch) < 0) return(1);
550 } while(*p++ != 0);
551 return(0);
552 }
553
554 /*
555 * Free foll[i], taking into account identical foll[] entries.
556 * This is necessary because cfoll() uses the same physical follow set for
557 * several foll[] entries when the set is identical. Called by freetr().
558 */
foll_free(i)559 foll_free(i)
560 int i;
561 {
562 register int j;
563 int *p = foll[i];
564 if (p==NULL) return;
565 for (j=0; j<=maxfoll; j++)
566 if (foll[j]==p) foll[j]=NULL;
567 xfree(p);
568 }
569