1 /*
2   Copyright 2002-2004 John Plevyak, All Rights Reserved
3 */
4 
5 #include "d.h"
6 
7 typedef struct NFAState {
8   uint index;
9   Vec(struct NFAState *) chars[256];
10   Vec(struct NFAState *) epsilon;
11   Vec(Action *) accepts;
12   Vec(Action *) live;
13 } NFAState;
14 
15 typedef struct DFAState {
16   Vec(struct NFAState *) states;
17   struct DFAState *chars[256];
18   ScanState *scan;
19 } DFAState;
20 
21 typedef Vec(DFAState *) VecDFAState;
22 typedef Vec(NFAState *) VecNFAState;
23 
24 typedef struct LexState {
25   uint nfa_index;
26   VecNFAState allnfas;
27   uint transitions;
28   uint scanners;
29   uint ignore_case;
30 } LexState;
31 
new_NFAState(LexState * ls)32 static NFAState *new_NFAState(LexState *ls) {
33   NFAState *n = MALLOC(sizeof(NFAState));
34   memset(n, 0, sizeof(NFAState));
35   n->index = ls->nfa_index++;
36   vec_add(&ls->allnfas, n);
37   return n;
38 }
39 
new_DFAState()40 static DFAState *new_DFAState() {
41   DFAState *n = MALLOC(sizeof(DFAState));
42   memset(n, 0, sizeof(DFAState));
43   return n;
44 }
45 
free_DFAState(DFAState * y)46 static void free_DFAState(DFAState *y) {
47   vec_free(&y->states);
48   FREE(y);
49 }
50 
free_VecDFAState(VecDFAState * dfas)51 static void free_VecDFAState(VecDFAState *dfas) {
52   uint i;
53   for (i = 0; i < dfas->n; i++) free_DFAState(dfas->v[i]);
54   vec_free(dfas);
55 }
56 
free_NFAState(NFAState * y)57 static void free_NFAState(NFAState *y) {
58   uint i;
59   for (i = 0; i < 256; i++) vec_free(&y->chars[i]);
60   vec_free(&y->epsilon);
61   vec_free(&y->accepts);
62   FREE(y);
63 }
64 
free_VecNFAState(VecNFAState * nfas)65 static void free_VecNFAState(VecNFAState *nfas) {
66   uint i;
67   for (i = 0; i < nfas->n; i++) free_NFAState(nfas->v[i]);
68   vec_free(nfas);
69 }
70 
new_ScanState()71 static ScanState *new_ScanState() {
72   ScanState *n = MALLOC(sizeof(ScanState));
73   memset(n, 0, sizeof(ScanState));
74   return n;
75 }
76 
nfacmp(const void * ai,const void * aj)77 static int nfacmp(const void *ai, const void *aj) {
78   uint32 i = (*(NFAState **)ai)->index;
79   uint32 j = (*(NFAState **)aj)->index;
80   return (i > j) ? 1 : ((i < j) ? -1 : 0);
81 }
82 
nfa_closure(DFAState * x)83 static void nfa_closure(DFAState *x) {
84   uint i, j, k;
85 
86   for (i = 0; i < x->states.n; i++) {
87     NFAState *s = x->states.v[i];
88     for (j = 0; j < x->states.v[i]->epsilon.n; j++) {
89       for (k = 0; k < x->states.n; k++)
90         if (x->states.v[i]->epsilon.v[j] == x->states.v[k]) goto Lbreak;
91       vec_add(&x->states, s->epsilon.v[j]);
92     Lbreak:;
93     }
94   }
95   qsort(x->states.v, x->states.n, sizeof(x->states.v[0]), nfacmp);
96 }
97 
eq_dfa_state(DFAState * x,DFAState * y)98 static int eq_dfa_state(DFAState *x, DFAState *y) {
99   uint i;
100 
101   if (x->states.n != y->states.n) return 0;
102   for (i = 0; i < x->states.n; i++)
103     if (x->states.v[i] != y->states.v[i]) return 0;
104   return 1;
105 }
106 
dfa_to_scanner(VecDFAState * alldfas,VecScanState * scanner)107 static void dfa_to_scanner(VecDFAState *alldfas, VecScanState *scanner) {
108   uint i, j, k;
109   int highest, p;
110 
111   vec_clear(scanner);
112   for (i = 0; i < alldfas->n; i++) {
113     alldfas->v[i]->scan = new_ScanState();
114     alldfas->v[i]->scan->index = i;
115     vec_add(scanner, alldfas->v[i]->scan);
116   }
117   for (i = 0; i < alldfas->n; i++) {
118     for (j = 0; j < 256; j++)
119       if (alldfas->v[i]->chars[j]) alldfas->v[i]->scan->chars[j] = alldfas->v[i]->chars[j]->scan;
120     highest = INT_MIN;
121     for (j = 0; j < alldfas->v[i]->states.n; j++)
122       for (k = 0; k < alldfas->v[i]->states.v[j]->accepts.n; k++) {
123         p = alldfas->v[i]->states.v[j]->accepts.v[k]->term->term_priority;
124         if (highest < p) highest = p;
125       }
126     for (j = 0; j < alldfas->v[i]->states.n; j++)
127       for (k = 0; k < alldfas->v[i]->states.v[j]->accepts.n; k++) {
128         p = alldfas->v[i]->states.v[j]->accepts.v[k]->term->term_priority;
129         if (p == highest) vec_add(&alldfas->v[i]->scan->accepts, alldfas->v[i]->states.v[j]->accepts.v[k]);
130       }
131   }
132 }
133 
nfa_to_scanner(NFAState * n,Scanner * s)134 static void nfa_to_scanner(NFAState *n, Scanner *s) {
135   DFAState *x = new_DFAState(), *y;
136   VecDFAState alldfas;
137   uint i, i_alldfas, i_states, i_char;
138   VecScanState *scanner = &s->states;
139 
140   memset(&alldfas, 0, sizeof(alldfas));
141   vec_add(&x->states, n);
142   nfa_closure(x);
143   vec_add(&alldfas, x);
144   for (i_alldfas = 0; i_alldfas < alldfas.n; i_alldfas++) {
145     x = alldfas.v[i_alldfas];
146     for (i_char = 0; i_char < 256; i_char++) {
147       y = NULL;
148       for (i_states = 0; i_states < x->states.n; i_states++) {
149         for (i = 0; i < x->states.v[i_states]->chars[i_char].n; i++) {
150           if (!y) y = new_DFAState();
151           set_add(&y->states, x->states.v[i_states]->chars[i_char].v[i]);
152         }
153       }
154       if (y) {
155         set_to_vec(&y->states);
156         nfa_closure(y);
157         for (i = 0; i < alldfas.n; i++)
158           if (eq_dfa_state(y, alldfas.v[i])) {
159             free_DFAState(y);
160             y = alldfas.v[i];
161             goto Lnext;
162           }
163         vec_add(&alldfas, y);
164       Lnext:
165         x->chars[i_char] = y;
166       }
167     }
168   }
169   dfa_to_scanner(&alldfas, scanner);
170   free_VecDFAState(&alldfas);
171 }
172 
173 /* build a NFA for the regular expression */
build_regex_nfa(LexState * ls,uint8 ** areg,NFAState * pp,NFAState * nn,Action * trailing)174 static int build_regex_nfa(LexState *ls, uint8 **areg, NFAState *pp, NFAState *nn, Action *trailing) {
175   uint8 c, pc, *reg = *areg;
176   NFAState *p = pp, *s, *x, *n = nn;
177   int reversed, i, has_trailing = 0;
178   uint8 mark[256];
179 
180   s = p;
181   while ((c = *reg++)) {
182     switch (c) {
183       case '(':
184         has_trailing = build_regex_nfa(ls, &reg, s, (x = new_NFAState(ls)), trailing) || has_trailing;
185         p = s;
186         s = x;
187         break;
188       case ')':
189         goto Lreturn;
190       case '|':
191         vec_add(&s->epsilon, nn);
192         vec_add(&pp->epsilon, (s = new_NFAState(ls)));
193         break;
194       case '[':
195         if (*reg == '^') {
196           reg++;
197           reversed = 1;
198         } else
199           reversed = 0;
200         memset(mark, 0, sizeof(mark));
201         pc = UCHAR_MAX;
202         while ((c = *reg++)) {
203           switch (c) {
204             case ']':
205               goto Lsetdone;
206             case '-':
207               c = *reg++;
208               if (!c) goto Lerror;
209               if (c == '\\') c = *reg++;
210               if (!c) goto Lerror;
211               for (; pc <= c; pc++) mark[pc] = 1;
212               break;
213             case '\\':
214               c = *reg++;
215               /* fall through */
216             default:
217               pc = c;
218               mark[c] = 1;
219               break;
220           }
221         }
222       Lsetdone:
223         x = new_NFAState(ls);
224         for (i = 1; i < 256; i++)
225           if ((!reversed && mark[i]) || (reversed && !mark[i])) vec_add(&s->chars[i], x);
226         p = s;
227         s = x;
228         break;
229       case '?':
230         vec_add(&p->epsilon, s);
231         break;
232       case '*':
233         vec_add(&p->epsilon, s);
234         vec_add(&s->epsilon, p);
235         break;
236       case '+':
237         vec_add(&s->epsilon, p);
238         break;
239       case '/':
240         vec_add(&s->accepts, trailing);
241         has_trailing = 1;
242         break;
243       case '\\':
244         c = *reg++;
245         if (!c) goto Lerror;
246         /* fall through */
247       default:
248         if (!ls->ignore_case || !isalpha(c))
249           vec_add(&s->chars[c], (x = new_NFAState(ls)));
250         else {
251           vec_add(&s->chars[tolower(c)], (x = new_NFAState(ls)));
252           vec_add(&s->chars[toupper(c)], x);
253         }
254         p = s;
255         s = x;
256         break;
257     }
258   }
259 Lreturn:
260   vec_add(&s->epsilon, n);
261   *areg = reg;
262   return has_trailing;
263 Lerror:
264   d_fail("bad (part of) regex: %s\n", *areg);
265   return has_trailing;
266 }
267 
action_diff(VecAction * a,VecAction * b,VecAction * c)268 static void action_diff(VecAction *a, VecAction *b, VecAction *c) {
269   uint bb = 0, cc = 0;
270   while (1) {
271     if (bb >= b->n) break;
272   Lagainc:
273     if (cc >= c->n) {
274       while (bb < b->n) vec_add(a, b->v[bb++]);
275       break;
276     }
277   Lagainb:
278     if (b->v[bb]->index == c->v[cc]->index) {
279       bb++;
280       cc++;
281       continue;
282     }
283     if (b->v[bb]->index < c->v[cc]->index) {
284       vec_add(a, b->v[bb++]);
285       if (bb >= b->n) break;
286       goto Lagainb;
287     }
288     cc++;
289     goto Lagainc;
290   }
291 }
292 
action_intersect(VecAction * a,VecAction * b,VecAction * c)293 static void action_intersect(VecAction *a, VecAction *b, VecAction *c) {
294   uint bb = 0, cc = 0;
295   while (1) {
296     if (bb >= b->n) break;
297   Lagainc:
298     if (cc >= c->n) break;
299   Lagainb:
300     if (b->v[bb]->index == c->v[cc]->index) {
301       vec_add(a, b->v[bb++]);
302       cc++;
303       continue;
304     }
305     if (b->v[bb]->index < c->v[cc]->index) {
306       bb++;
307       if (bb >= b->n) break;
308       goto Lagainb;
309     }
310     cc++;
311     goto Lagainc;
312   }
313 }
314 
compute_liveness(Scanner * scanner)315 static void compute_liveness(Scanner *scanner) {
316   uint i, j, changed = 1;
317   ScanState *ss, *sss;
318   VecScanState *states = &scanner->states;
319 
320   /* basis */
321   for (i = 0; i < states->n; i++) {
322     ss = states->v[i];
323     set_union(&ss->live, &ss->accepts);
324   }
325   while (changed) {
326     changed = 0;
327     for (i = 0; i < states->n; i++) {
328       ss = states->v[i];
329       for (j = 0; j < 256; j++) {
330         if ((sss = ss->chars[j])) {
331           if (ss != sss)
332             if (set_union(&ss->live, &sss->live)) changed = 1;
333         }
334       }
335     }
336   }
337   for (i = 0; i < states->n; i++) {
338     ss = states->v[i];
339     set_to_vec(&ss->live);
340     sort_VecAction(&ss->live);
341   }
342 }
343 
trans_hash_fn(ScanStateTransition * a,hash_fns_t * fns)344 static uint32 trans_hash_fn(ScanStateTransition *a, hash_fns_t *fns) {
345   uint h = 0, i;
346 
347   if (!fns->data[0])
348     for (i = 0; i < a->live_diff.n; i++) h += 3 * a->live_diff.v[i]->index;
349   for (i = 0; i < a->accepts_diff.n; i++) h += 3 * a->accepts_diff.v[i]->index;
350   return h;
351 }
352 
trans_cmp_fn(ScanStateTransition * a,ScanStateTransition * b,hash_fns_t * fns)353 static int trans_cmp_fn(ScanStateTransition *a, ScanStateTransition *b, hash_fns_t *fns) {
354   uint i;
355 
356   if (!fns->data[0])
357     if (a->live_diff.n != b->live_diff.n) return 1;
358   if (a->accepts_diff.n != b->accepts_diff.n) return 1;
359   if (!fns->data[0])
360     for (i = 0; i < a->live_diff.n; i++)
361       if (a->live_diff.v[i] != b->live_diff.v[i]) return 1;
362   for (i = 0; i < a->accepts_diff.n; i++)
363     if (a->accepts_diff.v[i] != b->accepts_diff.v[i]) return 1;
364   return 0;
365 }
366 
367 static hash_fns_t trans_hash_fns = {(hash_fn_t)trans_hash_fn, (cmp_fn_t)trans_cmp_fn, {0, 0}};
368 
build_transitions(LexState * ls,Scanner * s)369 static void build_transitions(LexState *ls, Scanner *s) {
370   uint i, j;
371   ScanState *ss;
372   ScanStateTransition *trans = NULL, *x;
373   VecScanState *states = &s->states;
374 
375 #ifdef LIVE_DIFF_IN_TRANSITIONS
376   trans_hash_fns.data[0] = (void *)0;
377 #else
378   trans_hash_fns.data[0] = (void *)1;
379 #endif
380   for (i = 0; i < states->n; i++) {
381     ss = states->v[i];
382     for (j = 0; j < 256; j++) {
383       if (!trans) {
384         trans = MALLOC(sizeof(*trans));
385         memset(trans, 0, sizeof(*trans));
386       }
387       if (ss->chars[j]) {
388         action_diff(&trans->live_diff, &ss->live, &ss->chars[j]->live);
389         action_intersect(&trans->accepts_diff, &ss->accepts, &trans->live_diff);
390       }
391       if ((x = set_add_fn(&s->transitions, trans, &trans_hash_fns)) == trans)
392         trans = NULL;
393       else {
394         vec_free(&trans->live_diff);
395         vec_free(&trans->accepts_diff);
396       }
397       ss->transition[j] = x;
398     }
399   }
400   if (trans) FREE(trans);
401   j = 0;
402   set_to_vec(&s->transitions);
403   for (i = 0; i < s->transitions.n; i++) s->transitions.v[i]->index = i;
404   ls->transitions += s->transitions.n;
405 }
406 
compute_transitions(LexState * ls,Scanner * s)407 static void compute_transitions(LexState *ls, Scanner *s) {
408   compute_liveness(s);
409   build_transitions(ls, s);
410 }
411 
build_state_scanner(Grammar * g,LexState * ls,State * s)412 static void build_state_scanner(Grammar *g, LexState *ls, State *s) {
413   NFAState *n, *nn, *nnn;
414   Action *a;
415   uint8 *c, *reg;
416   uint j, one;
417 
418   one = 0;
419   n = new_NFAState(ls);
420   /* first strings since they can be trivially combined as a tree */
421   for (j = 0; j < s->shift_actions.n; j++) {
422     a = s->shift_actions.v[j];
423     if (a->kind == ACTION_ACCEPT) {
424       one = 1;
425       if (!n->chars[0].n)
426         vec_add(&n->chars[0], (nnn = new_NFAState(ls)));
427       else
428         nnn = n->chars[0].v[0];
429       vec_add(&nnn->accepts, a);
430     } else if (a->kind == ACTION_SHIFT && a->term->kind == TERM_STRING) {
431       one = 1;
432       nn = n;
433       if (!a->term->ignore_case) {
434         for (c = (uint8 *)a->term->string; *c; c++) {
435           if (!nn->chars[*c].n)
436             vec_add(&nn->chars[*c], (nnn = new_NFAState(ls)));
437           else
438             nnn = nn->chars[*c].v[0];
439           nn = nnn;
440         }
441       } else { /* use new states */
442         for (c = (uint8 *)a->term->string; *c; c++) {
443           if (isalpha(*c)) {
444             vec_add(&nn->chars[toupper(*c)], (nnn = new_NFAState(ls)));
445             vec_add(&nn->chars[tolower(*c)], nnn);
446           } else
447             vec_add(&nn->chars[*c], (nnn = new_NFAState(ls)));
448           nn = nnn;
449         }
450       }
451       vec_add(&nn->accepts, a);
452     }
453   }
454   /* now regexes */
455   for (j = 0; j < s->shift_actions.n; j++) {
456     a = s->shift_actions.v[j];
457     if (a->kind == ACTION_SHIFT && a->term->kind == TERM_REGEX) {
458       Action *trailing_context = (Action *)MALLOC(sizeof(Action));
459       memcpy(trailing_context, a, sizeof(Action));
460       trailing_context->kind = ACTION_SHIFT_TRAILING;
461       trailing_context->index = g->action_count++;
462       one = 1;
463       reg = (uint8 *)a->term->string;
464       vec_add(&n->epsilon, (nnn = new_NFAState(ls)));
465       nn = new_NFAState(ls);
466       ls->ignore_case = a->term->ignore_case;
467       if (build_regex_nfa(ls, &reg, nnn, nn, trailing_context)) {
468         a->term->trailing_context = 1;
469         s->trailing_context = 1;
470         vec_add(&g->actions, trailing_context);
471       } else
472         FREE(trailing_context);
473       vec_add(&nn->accepts, a);
474     }
475   }
476   if (one) {
477     nfa_to_scanner(n, &s->scanner);
478     compute_transitions(ls, &s->scanner);
479   }
480   free_VecNFAState(&ls->allnfas);
481   ls->scanners++;
482 }
483 
new_LexState()484 static LexState *new_LexState() {
485   LexState *ls = MALLOC(sizeof(LexState));
486   memset(ls, 0, sizeof(LexState));
487   vec_clear(&ls->allnfas);
488   return ls;
489 }
490 
build_scanners(Grammar * g)491 void build_scanners(Grammar *g) {
492   uint i, j, k;
493   State *s;
494   LexState *ls = new_LexState();
495 
496   /* detect identical scanners */
497   for (i = 0; i < g->states.n; i++) {
498     s = g->states.v[i];
499     if (s->same_shifts) continue;
500     for (j = 0; j < i; j++) {
501       if (g->states.v[j]->same_shifts) continue;
502       if (g->states.v[j]->shift_actions.n != s->shift_actions.n) continue;
503       if (g->states.v[j]->scan_kind != s->scan_kind) continue;
504       for (k = 0; k < g->states.v[j]->shift_actions.n; k++)
505         if (s->shift_actions.v[k]->term != g->states.v[j]->shift_actions.v[k]->term) break;
506       if (k >= g->states.v[j]->shift_actions.n) {
507         s->same_shifts = g->states.v[j];
508         break;
509       }
510     }
511   }
512   /* build scanners */
513   for (i = 0; i < g->states.n; i++) {
514     s = g->states.v[i];
515     if (s->shift_actions.n) {
516       if (s->same_shifts)
517         s->scanner = s->same_shifts->scanner;
518       else
519         build_state_scanner(g, ls, s);
520     }
521   }
522   if (d_verbose_level) printf("%d scanners %d transitions\n", ls->scanners, ls->transitions);
523   FREE(ls);
524 }
525