1 /*
2 Copyright 2002-2004 John Plevyak, All Rights Reserved
3 */
4
5 #include "d.h"
6
7 typedef struct NFAState {
8 uint index;
9 Vec(struct NFAState *) chars[256];
10 Vec(struct NFAState *) epsilon;
11 Vec(Action *) accepts;
12 Vec(Action *) live;
13 } NFAState;
14
15 typedef struct DFAState {
16 Vec(struct NFAState *) states;
17 struct DFAState *chars[256];
18 ScanState *scan;
19 } DFAState;
20
21 typedef Vec(DFAState *) VecDFAState;
22 typedef Vec(NFAState *) VecNFAState;
23
24 typedef struct LexState {
25 uint nfa_index;
26 VecNFAState allnfas;
27 uint transitions;
28 uint scanners;
29 uint ignore_case;
30 } LexState;
31
new_NFAState(LexState * ls)32 static NFAState *new_NFAState(LexState *ls) {
33 NFAState *n = MALLOC(sizeof(NFAState));
34 memset(n, 0, sizeof(NFAState));
35 n->index = ls->nfa_index++;
36 vec_add(&ls->allnfas, n);
37 return n;
38 }
39
new_DFAState()40 static DFAState *new_DFAState() {
41 DFAState *n = MALLOC(sizeof(DFAState));
42 memset(n, 0, sizeof(DFAState));
43 return n;
44 }
45
free_DFAState(DFAState * y)46 static void free_DFAState(DFAState *y) {
47 vec_free(&y->states);
48 FREE(y);
49 }
50
free_VecDFAState(VecDFAState * dfas)51 static void free_VecDFAState(VecDFAState *dfas) {
52 uint i;
53 for (i = 0; i < dfas->n; i++) free_DFAState(dfas->v[i]);
54 vec_free(dfas);
55 }
56
free_NFAState(NFAState * y)57 static void free_NFAState(NFAState *y) {
58 uint i;
59 for (i = 0; i < 256; i++) vec_free(&y->chars[i]);
60 vec_free(&y->epsilon);
61 vec_free(&y->accepts);
62 FREE(y);
63 }
64
free_VecNFAState(VecNFAState * nfas)65 static void free_VecNFAState(VecNFAState *nfas) {
66 uint i;
67 for (i = 0; i < nfas->n; i++) free_NFAState(nfas->v[i]);
68 vec_free(nfas);
69 }
70
new_ScanState()71 static ScanState *new_ScanState() {
72 ScanState *n = MALLOC(sizeof(ScanState));
73 memset(n, 0, sizeof(ScanState));
74 return n;
75 }
76
nfacmp(const void * ai,const void * aj)77 static int nfacmp(const void *ai, const void *aj) {
78 uint32 i = (*(NFAState **)ai)->index;
79 uint32 j = (*(NFAState **)aj)->index;
80 return (i > j) ? 1 : ((i < j) ? -1 : 0);
81 }
82
nfa_closure(DFAState * x)83 static void nfa_closure(DFAState *x) {
84 uint i, j, k;
85
86 for (i = 0; i < x->states.n; i++) {
87 NFAState *s = x->states.v[i];
88 for (j = 0; j < x->states.v[i]->epsilon.n; j++) {
89 for (k = 0; k < x->states.n; k++)
90 if (x->states.v[i]->epsilon.v[j] == x->states.v[k]) goto Lbreak;
91 vec_add(&x->states, s->epsilon.v[j]);
92 Lbreak:;
93 }
94 }
95 qsort(x->states.v, x->states.n, sizeof(x->states.v[0]), nfacmp);
96 }
97
eq_dfa_state(DFAState * x,DFAState * y)98 static int eq_dfa_state(DFAState *x, DFAState *y) {
99 uint i;
100
101 if (x->states.n != y->states.n) return 0;
102 for (i = 0; i < x->states.n; i++)
103 if (x->states.v[i] != y->states.v[i]) return 0;
104 return 1;
105 }
106
dfa_to_scanner(VecDFAState * alldfas,VecScanState * scanner)107 static void dfa_to_scanner(VecDFAState *alldfas, VecScanState *scanner) {
108 uint i, j, k;
109 int highest, p;
110
111 vec_clear(scanner);
112 for (i = 0; i < alldfas->n; i++) {
113 alldfas->v[i]->scan = new_ScanState();
114 alldfas->v[i]->scan->index = i;
115 vec_add(scanner, alldfas->v[i]->scan);
116 }
117 for (i = 0; i < alldfas->n; i++) {
118 for (j = 0; j < 256; j++)
119 if (alldfas->v[i]->chars[j]) alldfas->v[i]->scan->chars[j] = alldfas->v[i]->chars[j]->scan;
120 highest = INT_MIN;
121 for (j = 0; j < alldfas->v[i]->states.n; j++)
122 for (k = 0; k < alldfas->v[i]->states.v[j]->accepts.n; k++) {
123 p = alldfas->v[i]->states.v[j]->accepts.v[k]->term->term_priority;
124 if (highest < p) highest = p;
125 }
126 for (j = 0; j < alldfas->v[i]->states.n; j++)
127 for (k = 0; k < alldfas->v[i]->states.v[j]->accepts.n; k++) {
128 p = alldfas->v[i]->states.v[j]->accepts.v[k]->term->term_priority;
129 if (p == highest) vec_add(&alldfas->v[i]->scan->accepts, alldfas->v[i]->states.v[j]->accepts.v[k]);
130 }
131 }
132 }
133
nfa_to_scanner(NFAState * n,Scanner * s)134 static void nfa_to_scanner(NFAState *n, Scanner *s) {
135 DFAState *x = new_DFAState(), *y;
136 VecDFAState alldfas;
137 uint i, i_alldfas, i_states, i_char;
138 VecScanState *scanner = &s->states;
139
140 memset(&alldfas, 0, sizeof(alldfas));
141 vec_add(&x->states, n);
142 nfa_closure(x);
143 vec_add(&alldfas, x);
144 for (i_alldfas = 0; i_alldfas < alldfas.n; i_alldfas++) {
145 x = alldfas.v[i_alldfas];
146 for (i_char = 0; i_char < 256; i_char++) {
147 y = NULL;
148 for (i_states = 0; i_states < x->states.n; i_states++) {
149 for (i = 0; i < x->states.v[i_states]->chars[i_char].n; i++) {
150 if (!y) y = new_DFAState();
151 set_add(&y->states, x->states.v[i_states]->chars[i_char].v[i]);
152 }
153 }
154 if (y) {
155 set_to_vec(&y->states);
156 nfa_closure(y);
157 for (i = 0; i < alldfas.n; i++)
158 if (eq_dfa_state(y, alldfas.v[i])) {
159 free_DFAState(y);
160 y = alldfas.v[i];
161 goto Lnext;
162 }
163 vec_add(&alldfas, y);
164 Lnext:
165 x->chars[i_char] = y;
166 }
167 }
168 }
169 dfa_to_scanner(&alldfas, scanner);
170 free_VecDFAState(&alldfas);
171 }
172
173 /* build a NFA for the regular expression */
build_regex_nfa(LexState * ls,uint8 ** areg,NFAState * pp,NFAState * nn,Action * trailing)174 static int build_regex_nfa(LexState *ls, uint8 **areg, NFAState *pp, NFAState *nn, Action *trailing) {
175 uint8 c, pc, *reg = *areg;
176 NFAState *p = pp, *s, *x, *n = nn;
177 int reversed, i, has_trailing = 0;
178 uint8 mark[256];
179
180 s = p;
181 while ((c = *reg++)) {
182 switch (c) {
183 case '(':
184 has_trailing = build_regex_nfa(ls, ®, s, (x = new_NFAState(ls)), trailing) || has_trailing;
185 p = s;
186 s = x;
187 break;
188 case ')':
189 goto Lreturn;
190 case '|':
191 vec_add(&s->epsilon, nn);
192 vec_add(&pp->epsilon, (s = new_NFAState(ls)));
193 break;
194 case '[':
195 if (*reg == '^') {
196 reg++;
197 reversed = 1;
198 } else
199 reversed = 0;
200 memset(mark, 0, sizeof(mark));
201 pc = UCHAR_MAX;
202 while ((c = *reg++)) {
203 switch (c) {
204 case ']':
205 goto Lsetdone;
206 case '-':
207 c = *reg++;
208 if (!c) goto Lerror;
209 if (c == '\\') c = *reg++;
210 if (!c) goto Lerror;
211 for (; pc <= c; pc++) mark[pc] = 1;
212 break;
213 case '\\':
214 c = *reg++;
215 /* fall through */
216 default:
217 pc = c;
218 mark[c] = 1;
219 break;
220 }
221 }
222 Lsetdone:
223 x = new_NFAState(ls);
224 for (i = 1; i < 256; i++)
225 if ((!reversed && mark[i]) || (reversed && !mark[i])) vec_add(&s->chars[i], x);
226 p = s;
227 s = x;
228 break;
229 case '?':
230 vec_add(&p->epsilon, s);
231 break;
232 case '*':
233 vec_add(&p->epsilon, s);
234 vec_add(&s->epsilon, p);
235 break;
236 case '+':
237 vec_add(&s->epsilon, p);
238 break;
239 case '/':
240 vec_add(&s->accepts, trailing);
241 has_trailing = 1;
242 break;
243 case '\\':
244 c = *reg++;
245 if (!c) goto Lerror;
246 /* fall through */
247 default:
248 if (!ls->ignore_case || !isalpha(c))
249 vec_add(&s->chars[c], (x = new_NFAState(ls)));
250 else {
251 vec_add(&s->chars[tolower(c)], (x = new_NFAState(ls)));
252 vec_add(&s->chars[toupper(c)], x);
253 }
254 p = s;
255 s = x;
256 break;
257 }
258 }
259 Lreturn:
260 vec_add(&s->epsilon, n);
261 *areg = reg;
262 return has_trailing;
263 Lerror:
264 d_fail("bad (part of) regex: %s\n", *areg);
265 return has_trailing;
266 }
267
action_diff(VecAction * a,VecAction * b,VecAction * c)268 static void action_diff(VecAction *a, VecAction *b, VecAction *c) {
269 uint bb = 0, cc = 0;
270 while (1) {
271 if (bb >= b->n) break;
272 Lagainc:
273 if (cc >= c->n) {
274 while (bb < b->n) vec_add(a, b->v[bb++]);
275 break;
276 }
277 Lagainb:
278 if (b->v[bb]->index == c->v[cc]->index) {
279 bb++;
280 cc++;
281 continue;
282 }
283 if (b->v[bb]->index < c->v[cc]->index) {
284 vec_add(a, b->v[bb++]);
285 if (bb >= b->n) break;
286 goto Lagainb;
287 }
288 cc++;
289 goto Lagainc;
290 }
291 }
292
action_intersect(VecAction * a,VecAction * b,VecAction * c)293 static void action_intersect(VecAction *a, VecAction *b, VecAction *c) {
294 uint bb = 0, cc = 0;
295 while (1) {
296 if (bb >= b->n) break;
297 Lagainc:
298 if (cc >= c->n) break;
299 Lagainb:
300 if (b->v[bb]->index == c->v[cc]->index) {
301 vec_add(a, b->v[bb++]);
302 cc++;
303 continue;
304 }
305 if (b->v[bb]->index < c->v[cc]->index) {
306 bb++;
307 if (bb >= b->n) break;
308 goto Lagainb;
309 }
310 cc++;
311 goto Lagainc;
312 }
313 }
314
compute_liveness(Scanner * scanner)315 static void compute_liveness(Scanner *scanner) {
316 uint i, j, changed = 1;
317 ScanState *ss, *sss;
318 VecScanState *states = &scanner->states;
319
320 /* basis */
321 for (i = 0; i < states->n; i++) {
322 ss = states->v[i];
323 set_union(&ss->live, &ss->accepts);
324 }
325 while (changed) {
326 changed = 0;
327 for (i = 0; i < states->n; i++) {
328 ss = states->v[i];
329 for (j = 0; j < 256; j++) {
330 if ((sss = ss->chars[j])) {
331 if (ss != sss)
332 if (set_union(&ss->live, &sss->live)) changed = 1;
333 }
334 }
335 }
336 }
337 for (i = 0; i < states->n; i++) {
338 ss = states->v[i];
339 set_to_vec(&ss->live);
340 sort_VecAction(&ss->live);
341 }
342 }
343
trans_hash_fn(ScanStateTransition * a,hash_fns_t * fns)344 static uint32 trans_hash_fn(ScanStateTransition *a, hash_fns_t *fns) {
345 uint h = 0, i;
346
347 if (!fns->data[0])
348 for (i = 0; i < a->live_diff.n; i++) h += 3 * a->live_diff.v[i]->index;
349 for (i = 0; i < a->accepts_diff.n; i++) h += 3 * a->accepts_diff.v[i]->index;
350 return h;
351 }
352
trans_cmp_fn(ScanStateTransition * a,ScanStateTransition * b,hash_fns_t * fns)353 static int trans_cmp_fn(ScanStateTransition *a, ScanStateTransition *b, hash_fns_t *fns) {
354 uint i;
355
356 if (!fns->data[0])
357 if (a->live_diff.n != b->live_diff.n) return 1;
358 if (a->accepts_diff.n != b->accepts_diff.n) return 1;
359 if (!fns->data[0])
360 for (i = 0; i < a->live_diff.n; i++)
361 if (a->live_diff.v[i] != b->live_diff.v[i]) return 1;
362 for (i = 0; i < a->accepts_diff.n; i++)
363 if (a->accepts_diff.v[i] != b->accepts_diff.v[i]) return 1;
364 return 0;
365 }
366
367 static hash_fns_t trans_hash_fns = {(hash_fn_t)trans_hash_fn, (cmp_fn_t)trans_cmp_fn, {0, 0}};
368
build_transitions(LexState * ls,Scanner * s)369 static void build_transitions(LexState *ls, Scanner *s) {
370 uint i, j;
371 ScanState *ss;
372 ScanStateTransition *trans = NULL, *x;
373 VecScanState *states = &s->states;
374
375 #ifdef LIVE_DIFF_IN_TRANSITIONS
376 trans_hash_fns.data[0] = (void *)0;
377 #else
378 trans_hash_fns.data[0] = (void *)1;
379 #endif
380 for (i = 0; i < states->n; i++) {
381 ss = states->v[i];
382 for (j = 0; j < 256; j++) {
383 if (!trans) {
384 trans = MALLOC(sizeof(*trans));
385 memset(trans, 0, sizeof(*trans));
386 }
387 if (ss->chars[j]) {
388 action_diff(&trans->live_diff, &ss->live, &ss->chars[j]->live);
389 action_intersect(&trans->accepts_diff, &ss->accepts, &trans->live_diff);
390 }
391 if ((x = set_add_fn(&s->transitions, trans, &trans_hash_fns)) == trans)
392 trans = NULL;
393 else {
394 vec_free(&trans->live_diff);
395 vec_free(&trans->accepts_diff);
396 }
397 ss->transition[j] = x;
398 }
399 }
400 if (trans) FREE(trans);
401 j = 0;
402 set_to_vec(&s->transitions);
403 for (i = 0; i < s->transitions.n; i++) s->transitions.v[i]->index = i;
404 ls->transitions += s->transitions.n;
405 }
406
compute_transitions(LexState * ls,Scanner * s)407 static void compute_transitions(LexState *ls, Scanner *s) {
408 compute_liveness(s);
409 build_transitions(ls, s);
410 }
411
build_state_scanner(Grammar * g,LexState * ls,State * s)412 static void build_state_scanner(Grammar *g, LexState *ls, State *s) {
413 NFAState *n, *nn, *nnn;
414 Action *a;
415 uint8 *c, *reg;
416 uint j, one;
417
418 one = 0;
419 n = new_NFAState(ls);
420 /* first strings since they can be trivially combined as a tree */
421 for (j = 0; j < s->shift_actions.n; j++) {
422 a = s->shift_actions.v[j];
423 if (a->kind == ACTION_ACCEPT) {
424 one = 1;
425 if (!n->chars[0].n)
426 vec_add(&n->chars[0], (nnn = new_NFAState(ls)));
427 else
428 nnn = n->chars[0].v[0];
429 vec_add(&nnn->accepts, a);
430 } else if (a->kind == ACTION_SHIFT && a->term->kind == TERM_STRING) {
431 one = 1;
432 nn = n;
433 if (!a->term->ignore_case) {
434 for (c = (uint8 *)a->term->string; *c; c++) {
435 if (!nn->chars[*c].n)
436 vec_add(&nn->chars[*c], (nnn = new_NFAState(ls)));
437 else
438 nnn = nn->chars[*c].v[0];
439 nn = nnn;
440 }
441 } else { /* use new states */
442 for (c = (uint8 *)a->term->string; *c; c++) {
443 if (isalpha(*c)) {
444 vec_add(&nn->chars[toupper(*c)], (nnn = new_NFAState(ls)));
445 vec_add(&nn->chars[tolower(*c)], nnn);
446 } else
447 vec_add(&nn->chars[*c], (nnn = new_NFAState(ls)));
448 nn = nnn;
449 }
450 }
451 vec_add(&nn->accepts, a);
452 }
453 }
454 /* now regexes */
455 for (j = 0; j < s->shift_actions.n; j++) {
456 a = s->shift_actions.v[j];
457 if (a->kind == ACTION_SHIFT && a->term->kind == TERM_REGEX) {
458 Action *trailing_context = (Action *)MALLOC(sizeof(Action));
459 memcpy(trailing_context, a, sizeof(Action));
460 trailing_context->kind = ACTION_SHIFT_TRAILING;
461 trailing_context->index = g->action_count++;
462 one = 1;
463 reg = (uint8 *)a->term->string;
464 vec_add(&n->epsilon, (nnn = new_NFAState(ls)));
465 nn = new_NFAState(ls);
466 ls->ignore_case = a->term->ignore_case;
467 if (build_regex_nfa(ls, ®, nnn, nn, trailing_context)) {
468 a->term->trailing_context = 1;
469 s->trailing_context = 1;
470 vec_add(&g->actions, trailing_context);
471 } else
472 FREE(trailing_context);
473 vec_add(&nn->accepts, a);
474 }
475 }
476 if (one) {
477 nfa_to_scanner(n, &s->scanner);
478 compute_transitions(ls, &s->scanner);
479 }
480 free_VecNFAState(&ls->allnfas);
481 ls->scanners++;
482 }
483
new_LexState()484 static LexState *new_LexState() {
485 LexState *ls = MALLOC(sizeof(LexState));
486 memset(ls, 0, sizeof(LexState));
487 vec_clear(&ls->allnfas);
488 return ls;
489 }
490
build_scanners(Grammar * g)491 void build_scanners(Grammar *g) {
492 uint i, j, k;
493 State *s;
494 LexState *ls = new_LexState();
495
496 /* detect identical scanners */
497 for (i = 0; i < g->states.n; i++) {
498 s = g->states.v[i];
499 if (s->same_shifts) continue;
500 for (j = 0; j < i; j++) {
501 if (g->states.v[j]->same_shifts) continue;
502 if (g->states.v[j]->shift_actions.n != s->shift_actions.n) continue;
503 if (g->states.v[j]->scan_kind != s->scan_kind) continue;
504 for (k = 0; k < g->states.v[j]->shift_actions.n; k++)
505 if (s->shift_actions.v[k]->term != g->states.v[j]->shift_actions.v[k]->term) break;
506 if (k >= g->states.v[j]->shift_actions.n) {
507 s->same_shifts = g->states.v[j];
508 break;
509 }
510 }
511 }
512 /* build scanners */
513 for (i = 0; i < g->states.n; i++) {
514 s = g->states.v[i];
515 if (s->shift_actions.n) {
516 if (s->same_shifts)
517 s->scanner = s->same_shifts->scanner;
518 else
519 build_state_scanner(g, ls, s);
520 }
521 }
522 if (d_verbose_level) printf("%d scanners %d transitions\n", ls->scanners, ls->transitions);
523 FREE(ls);
524 }
525