1 /*
2 Copyright 2002-2004 John Plevyak, All Rights Reserved
3 */
4
5 #include "gramgram.h"
6 #include "d.h"
7
8 extern D_ParserTables parser_tables_dparser_gram;
9
10 static char *action_types[] = {"ACCEPT", "SHIFT", "REDUCE"};
11
12 static void print_state(State *s);
13
new_production(Grammar * g,char * name)14 Production *new_production(Grammar *g, char *name) {
15 Production *p;
16 if ((p = lookup_production(g, name, strlen(name)))) {
17 FREE(name);
18 return p;
19 }
20 p = MALLOC(sizeof(Production));
21 memset(p, 0, sizeof(Production));
22 vec_add(&g->productions, p);
23 p->name = name;
24 p->name_len = strlen(name);
25 return p;
26 }
27
new_elem()28 static Elem *new_elem() {
29 Elem *e = MALLOC(sizeof(Elem));
30 memset(e, 0, sizeof(Elem));
31 return e;
32 }
33
new_rule(Grammar * g,Production * p)34 Rule *new_rule(Grammar *g, Production *p) {
35 Rule *r = MALLOC(sizeof(Rule));
36 memset(r, 0, sizeof(Rule));
37 r->prod = p;
38 r->end = new_elem();
39 r->end->kind = ELEM_END;
40 r->end->rule = r;
41 r->action_index = g->action_index;
42 return r;
43 }
44
new_term()45 static Term *new_term() {
46 Term *term = MALLOC(sizeof(Term));
47 memset(term, 0, sizeof(Term));
48 return term;
49 }
50
new_elem_term(Term * t,Rule * r)51 static Elem *new_elem_term(Term *t, Rule *r) {
52 Elem *e = new_elem();
53 e->kind = ELEM_TERM;
54 e->e.term = t;
55 e->rule = r;
56 vec_add(&r->elems, e);
57 return e;
58 }
59
new_elem_nterm(Production * p,Rule * r)60 Elem *new_elem_nterm(Production *p, Rule *r) {
61 Elem *e = new_elem();
62 e->kind = ELEM_NTERM;
63 e->e.nterm = p;
64 e->rule = r;
65 return e;
66 }
67
new_term_string(Grammar * g,char * s,char * e,Rule * r)68 static Elem *new_term_string(Grammar *g, char *s, char *e, Rule *r) {
69 Term *t = new_term();
70 Elem *elem;
71
72 t->string = MALLOC(e - s + 1);
73 memcpy(t->string, s, e - s);
74 t->string[e - s] = 0;
75 t->string_len = e - s;
76 vec_add(&g->terminals, t);
77 elem = new_elem_term(t, r);
78 return elem;
79 }
80
escape_string_for_regex(const char * s)81 char *escape_string_for_regex(const char *s) {
82 char *ss = (char *)MALLOC((strlen(s) + 1) * 2), *sss = ss;
83 for (; *s; s++) {
84 switch (*s) {
85 case '(':
86 case ')':
87 case '[':
88 case ']':
89 case '-':
90 case '^':
91 case '*':
92 case '?':
93 case '+':
94 *ss++ = '\\';
95 /* fall through */
96 default:
97 *ss++ = *s;
98 break;
99 }
100 }
101 *ss = 0;
102 return sss;
103 }
104
unescape_term_string(Term * t)105 static void unescape_term_string(Term *t) {
106 char *s, *start = 0, *ss;
107 int length, base = 0;
108
109 for (ss = s = t->string; *s; s++) {
110 if (*s == '\\') {
111 switch (s[1]) {
112 case '\\':
113 if (t->kind == TERM_STRING) {
114 *ss = '\\';
115 s++;
116 break;
117 } else
118 goto Ldefault;
119 case 'b':
120 *ss = '\b';
121 s++;
122 break;
123 case 'f':
124 *ss = '\f';
125 s++;
126 break;
127 case 'n':
128 *ss = '\n';
129 s++;
130 break;
131 case 'r':
132 *ss = '\r';
133 s++;
134 break;
135 case 't':
136 *ss = '\t';
137 s++;
138 break;
139 case 'v':
140 *ss = '\v';
141 s++;
142 break;
143 case 'a':
144 *ss = '\a';
145 s++;
146 break;
147 case 'c':
148 *ss = 0;
149 return;
150 case '\"':
151 if (t->kind == TERM_REGEX) {
152 *ss = '\"';
153 s++;
154 break;
155 } else
156 goto Ldefault;
157 case '\'':
158 if (t->kind == TERM_STRING) {
159 *ss = '\'';
160 s++;
161 break;
162 } else
163 goto Ldefault;
164 case 'x':
165 length = 0;
166 if (isxdigit_(s[2])) {
167 base = 16;
168 start = s + 2;
169 length++;
170 if (isxdigit_(s[3])) length++;
171 }
172 s += length + 1;
173 goto Lncont;
174 case 'd':
175 length = 0;
176 if (isdigit_(s[2])) {
177 base = 10;
178 start = s + 2;
179 length++;
180 if (isdigit_(s[3])) {
181 length++;
182 if (isdigit_(s[4]) &&
183 ((s[2] < '2') || ((s[2] == '2') && ((s[3] < '5') || ((s[3] == '5') && (s[4] < '6'))))))
184 length++;
185 }
186 }
187 s += length + 1;
188 goto Lncont;
189 case '0':
190 case '1':
191 case '2':
192 case '3':
193 case '4':
194 case '5':
195 case '6':
196 case '7':
197 length = 1;
198 base = 8;
199 start = s + 1;
200 if (isdigit_(s[2]) && (s[2] != '8') && (s[2] != '9')) {
201 length++;
202 if (isdigit_(s[3]) && (s[3] != '8') && (s[3] != '9')) {
203 length++;
204 }
205 }
206 s += length;
207 /* fall through */
208 Lncont:
209 if (length > 0) {
210 char saved_c = start[length];
211 start[length] = '\0';
212 *ss = (unsigned char)strtol(start, NULL, base);
213 start[length] = saved_c;
214 if (*s > 0) break;
215 d_fail("encountered an escaped NULL while processing '%s'", t->string);
216 } else
217 goto next;
218 Ldefault:
219 default:
220 *ss++ = *s;
221 *ss = s[1];
222 s++;
223 break;
224 }
225 } else
226 *ss = *s;
227 ss++;
228 next:;
229 }
230 *ss = 0;
231 t->string_len = strlen(t->string);
232 if (!t->string_len) d_fail("empty string after unescape '%s'", t->string);
233 }
234
new_string(Grammar * g,char * s,char * e,Rule * r)235 Elem *new_string(Grammar *g, char *s, char *e, Rule *r) {
236 Elem *x = new_term_string(g, s + 1, e - 1, r);
237 x->e.term->kind = (*s == '"') ? TERM_REGEX : TERM_STRING;
238 unescape_term_string(x->e.term);
239 return x;
240 }
241
new_utf8_char(Grammar * g,char * s,char * e,Rule * r)242 Elem *new_utf8_char(Grammar *g, char *s, char *e, Rule *r) {
243 char utf8_code[4];
244 unsigned long utf32_code, base, len = 0;
245 Elem *x;
246 for (utf32_code = 0, base = 1; e >= s + 3; base *= 16) {
247 e--;
248 if (*e >= '0' && *e <= '9')
249 utf32_code += base * (*e - '0');
250 else if (*e >= 'a' && *e <= 'f')
251 utf32_code += base * (*e - 'a' + 10);
252 else if (*e >= 'A' && *e <= 'F')
253 utf32_code += base * (*e - 'A' + 10);
254 }
255 if (utf32_code < 0x80) {
256 utf8_code[0] = utf32_code;
257 len = 1;
258 } else if (utf32_code < 0x800) {
259 utf8_code[0] = 0xc0 | ((utf32_code >> 6) & 0x1f);
260 utf8_code[1] = 0x80 | (utf32_code & 0x3f);
261 len = 2;
262 } else if (utf32_code < 0x10000) {
263 utf8_code[0] = 0xe0 | ((utf32_code >> 12) & 0x0f);
264 utf8_code[1] = 0x80 | ((utf32_code >> 6) & 0x3f);
265 utf8_code[2] = 0x80 | (utf32_code & 0x3f);
266 len = 3;
267 } else if (utf32_code < 0x02000000) {
268 utf8_code[0] = 0xf0 | ((utf32_code >> 18) & 0x07);
269 utf8_code[1] = 0x80 | ((utf32_code >> 12) & 0x3f);
270 utf8_code[2] = 0x80 | ((utf32_code >> 6) & 0x3f);
271 utf8_code[3] = 0x80 | (utf32_code & 0x3f);
272 len = 4;
273 } else {
274 d_fail("UTF32 Unicode value U+%8X too large for valid UTF-8 encoding (cf. Unicode Spec 4.0, section 3.9)",
275 utf32_code);
276 }
277 x = new_term_string(g, utf8_code, utf8_code + len, r);
278 x->e.term->kind = TERM_STRING;
279 return x;
280 }
281
new_ident(char * s,char * e,Rule * r)282 Elem *new_ident(char *s, char *e, Rule *r) {
283 Elem *x = new_elem();
284 x->kind = ELEM_UNRESOLVED;
285 x->e.unresolved.string = dup_str(s, e);
286 x->e.unresolved.len = strlen(x->e.unresolved.string);
287 x->rule = r;
288 if (r) vec_add(&r->elems, x);
289 return x;
290 }
291
new_token(Grammar * g,char * s,char * e)292 void new_token(Grammar *g, char *s, char *e) {
293 Term *t = new_term();
294 t->string = MALLOC(e - s + 1);
295 memcpy(t->string, s, e - s);
296 t->string[e - s] = 0;
297 t->string_len = e - s;
298 vec_add(&g->terminals, t);
299 t->kind = TERM_TOKEN;
300 }
301
new_code(Grammar * g,char * s,char * e,Rule * r)302 Elem *new_code(Grammar *g, char *s, char *e, Rule *r) {
303 Elem *x = new_term_string(g, s, e, r);
304 x->e.term->kind = TERM_CODE;
305 return x;
306 }
307
dup_elem(Elem * e,Rule * r)308 Elem *dup_elem(Elem *e, Rule *r) {
309 Elem *ee = MALLOC(sizeof(Elem));
310 memcpy(ee, e, sizeof(Elem));
311 if (ee->kind == ELEM_UNRESOLVED) ee->e.unresolved.string = dup_str(e->e.unresolved.string, 0);
312 ee->rule = r;
313 return ee;
314 }
315
add_global_code(Grammar * g,char * start,char * end,int line)316 void add_global_code(Grammar *g, char *start, char *end, int line) {
317 if (!g->code)
318 g->code = MALLOC(sizeof(Code) * 4);
319 else if (!((g->ncode + 1) & 4))
320 g->code = REALLOC(g->code, sizeof(Code) * (g->ncode + 4));
321 g->code[g->ncode].code = dup_str(start, end);
322 g->code[g->ncode].line = line;
323 g->ncode++;
324 }
325
new_declaration(Grammar * g,Elem * e,uint kind)326 void new_declaration(Grammar *g, Elem *e, uint kind) {
327 Declaration *d = MALLOC(sizeof(*d));
328 d->elem = e;
329 d->kind = kind;
330 d->index = g->declarations.n;
331 vec_add(&g->declarations, d);
332 }
333
add_declaration(Grammar * g,char * start,char * end,uint kind,uint line)334 void add_declaration(Grammar *g, char *start, char *end, uint kind, uint line) {
335 if (start == end) {
336 switch (kind) {
337 case DECLARE_SET_OP_PRIORITY:
338 g->set_op_priority_from_rule = 1;
339 return;
340 case DECLARE_STATES_FOR_ALL_NTERMS:
341 g->states_for_all_nterms = 1;
342 return;
343 case DECLARE_LONGEST_MATCH:
344 g->longest_match = 1;
345 break;
346 case DECLARE_ALL_MATCHES:
347 g->longest_match = 0;
348 break;
349 case DECLARE_TOKENIZE:
350 g->tokenizer = 1;
351 break;
352 case DECLARE_SAVE_PARSE_TREE:
353 g->save_parse_tree = 1;
354 return;
355 default:
356 d_fail("declare expects argument, line %d", line);
357 }
358 }
359 switch (kind) {
360 case DECLARE_WHITESPACE:
361 g->default_white_space = dup_str(start, end);
362 return;
363 case DECLARE_SET_OP_PRIORITY:
364 d_fail("declare does not expect argument, line %d", line);
365 default:
366 new_declaration(g, new_ident(start, end, NULL), kind);
367 break;
368 }
369 }
370
find_pass(Grammar * g,char * start,char * end)371 D_Pass *find_pass(Grammar *g, char *start, char *end) {
372 uint i, l;
373 while (*start && isspace_(*start)) start++;
374 l = end - start;
375 for (i = 0; i < g->passes.n; i++)
376 if (l == g->passes.v[i]->name_len && !strncmp(g->passes.v[i]->name, start, l)) return g->passes.v[i];
377 return NULL;
378 }
379
add_pass(Grammar * g,char * start,char * end,uint kind,uint line)380 void add_pass(Grammar *g, char *start, char *end, uint kind, uint line) {
381 if (find_pass(g, start, end))
382 d_fail("duplicate pass '%s' line %d", dup_str(start, end), line);
383 else {
384 D_Pass *p = MALLOC(sizeof(*p));
385 p->name = dup_str(start, end);
386 p->name_len = end - start;
387 p->kind = kind;
388 p->index = g->pass_index++;
389 vec_add(&g->passes, p);
390 }
391 }
392
add_pass_code(Grammar * g,Rule * r,char * pass_start,char * pass_end,char * code_start,char * code_end,uint pass_line,uint code_line)393 void add_pass_code(Grammar *g, Rule *r, char *pass_start, char *pass_end, char *code_start, char *code_end,
394 uint pass_line, uint code_line) {
395 D_Pass *p = find_pass(g, pass_start, pass_end);
396 if (!p) d_fail("unknown pass '%s' line %d", dup_str(pass_start, pass_end), pass_line);
397 while (r->pass_code.n <= p->index) vec_add(&r->pass_code, NULL);
398 r->pass_code.v[p->index] = MALLOC(sizeof(Code));
399 r->pass_code.v[p->index]->code = dup_str(code_start, code_end);
400 r->pass_code.v[p->index]->line = code_line;
401 }
402
new_internal_production(Grammar * g,Production * p)403 Production *new_internal_production(Grammar *g, Production *p) {
404 char *n = p ? p->name : " _synthetic";
405 char *name = MALLOC(strlen(n) + 21);
406 Production *pp = NULL, *tp = NULL, *ttp;
407 uint i, found = 0;
408 sprintf(name, "%s__%d", n, g->productions.n);
409 pp = new_production(g, name);
410 pp->internal = INTERNAL_HIDDEN;
411 pp->regex = p ? p->regex : 0;
412 if (p) {
413 for (i = 0; i < g->productions.n; i++) {
414 if (found) {
415 ttp = g->productions.v[i];
416 g->productions.v[i] = tp;
417 tp = ttp;
418 } else if (p == g->productions.v[i]) {
419 found = 1;
420 tp = g->productions.v[i + 1];
421 g->productions.v[i + 1] = pp;
422 i++;
423 }
424 }
425 }
426 return pp;
427 }
428
conditional_EBNF(Grammar * g)429 void conditional_EBNF(Grammar *g) {
430 Production *pp;
431 Rule *rr;
432
433 pp = new_internal_production(g, g->p);
434 pp->internal = INTERNAL_CONDITIONAL;
435 rr = new_rule(g, pp);
436 vec_add(&rr->elems, last_elem(g->r));
437 last_elem(g->r)->rule = rr;
438 rr->elems.v[rr->elems.n - 1]->rule = rr;
439 vec_add(&pp->rules, rr);
440 vec_add(&pp->rules, new_rule(g, pp));
441 last_elem(g->r) = new_elem_nterm(pp, g->r);
442 }
443
star_EBNF(Grammar * g)444 void star_EBNF(Grammar *g) {
445 Production *pp;
446 Rule *rr;
447
448 pp = new_internal_production(g, g->p);
449 pp->internal = INTERNAL_STAR;
450 rr = new_rule(g, pp);
451 if (!g->right_recursive_BNF) {
452 vec_add(&rr->elems, new_elem_nterm(pp, rr));
453 vec_add(&rr->elems, last_elem(g->r));
454 last_elem(g->r) = new_elem_nterm(pp, g->r);
455 last_elem(rr)->rule = rr;
456 } else {
457 vec_add(&rr->elems, last_elem(g->r));
458 last_elem(g->r) = new_elem_nterm(pp, g->r);
459 last_elem(rr)->rule = rr;
460 vec_add(&rr->elems, new_elem_nterm(pp, rr));
461 }
462 vec_add(&pp->rules, rr);
463 vec_add(&pp->rules, new_rule(g, pp));
464 }
465
plus_EBNF(Grammar * g)466 void plus_EBNF(Grammar *g) {
467 Production *pp;
468 Rule *rr;
469 Elem *elem;
470
471 pp = new_internal_production(g, g->p);
472 pp->internal = INTERNAL_PLUS;
473 rr = new_rule(g, pp);
474 if (!g->right_recursive_BNF) {
475 elem = last_elem(g->r);
476 vec_add(&rr->elems, new_elem_nterm(pp, rr));
477 vec_add(&rr->elems, dup_elem(elem, rr));
478 last_elem(g->r) = new_elem_nterm(pp, g->r);
479 if (g->r->rule_priority) {
480 rr->rule_priority = g->r->rule_priority;
481 rr->rule_assoc = ASSOC_NARY_LEFT;
482 }
483 } else {
484 elem = last_elem(g->r);
485 vec_add(&rr->elems, dup_elem(elem, rr));
486 last_elem(g->r) = new_elem_nterm(pp, g->r);
487 vec_add(&rr->elems, new_elem_nterm(pp, rr));
488 if (g->r->rule_priority) {
489 rr->rule_priority = g->r->rule_priority;
490 rr->rule_assoc = ASSOC_NARY_RIGHT;
491 }
492 }
493 vec_add(&pp->rules, rr);
494 rr = new_rule(g, pp);
495 vec_add(&rr->elems, elem);
496 elem->rule = rr;
497 vec_add(&pp->rules, rr);
498 }
499
rep_EBNF(Grammar * g,int min,int max)500 void rep_EBNF(Grammar *g, int min, int max) {
501 Production *pp;
502 Rule *rr;
503 Elem *elem;
504 int i, j;
505 if (max < min) max = min;
506
507 pp = new_internal_production(g, g->p);
508 elem = last_elem(g->r);
509 for (i = min; i <= max; i++) {
510 rr = new_rule(g, pp);
511 for (j = 0; j < i; j++) vec_add(&rr->elems, dup_elem(elem, rr));
512 vec_add(&pp->rules, rr);
513 }
514 last_elem(g->r) = new_elem_nterm(pp, g->r);
515 FREE(elem);
516 }
517
initialize_productions(Grammar * g)518 void initialize_productions(Grammar *g) {
519 Production *pp = new_production(g, dup_str("0 Start", 0));
520 pp->internal = INTERNAL_HIDDEN;
521 }
522
finish_productions(Grammar * g)523 void finish_productions(Grammar *g) {
524 Production *pp = g->productions.v[0];
525 Rule *rr = new_rule(g, pp);
526 vec_add(&rr->elems, new_elem_nterm(NULL, rr));
527 vec_add(&pp->rules, rr);
528 rr->elems.v[0]->e.nterm = g->productions.v[1];
529 }
530
lookup_production(Grammar * g,char * name,uint l)531 Production *lookup_production(Grammar *g, char *name, uint l) {
532 uint i;
533
534 for (i = 0; i < g->productions.n; i++) {
535 Production *pp = g->productions.v[i];
536 if (pp->name_len != l || strncmp(pp->name, name, l)) continue;
537 return pp;
538 }
539 return NULL;
540 }
541
lookup_token(Grammar * g,char * name,int l)542 static Term *lookup_token(Grammar *g, char *name, int l) {
543 uint i;
544
545 for (i = 0; i < g->terminals.n; i++) {
546 Term *t = g->terminals.v[i];
547 if (t->kind != TERM_TOKEN || t->string_len != l || strncmp(t->string, name, l)) continue;
548 return t;
549 }
550 return NULL;
551 }
552
unique_term(Grammar * g,Term * t)553 static Term *unique_term(Grammar *g, Term *t) {
554 uint i;
555 for (i = 0; i < g->terminals.n; i++)
556 if (t->kind == g->terminals.v[i]->kind && t->string_len == g->terminals.v[i]->string_len &&
557 t->term_priority == g->terminals.v[i]->term_priority && t->term_name == g->terminals.v[i]->term_name &&
558 (!g->set_op_priority_from_rule ||
559 (t->op_assoc == g->terminals.v[i]->op_assoc && t->op_priority == g->terminals.v[i]->op_priority)) &&
560 !strncmp(t->string, g->terminals.v[i]->string, t->string_len))
561 return g->terminals.v[i];
562 return t;
563 }
564
compute_nullable(Grammar * g)565 static void compute_nullable(Grammar *g) {
566 uint i, j, k, changed = 1;
567 Elem *e;
568
569 /* ensure that the trivial case is the first cause */
570 for (i = 0; i < g->productions.n; i++) {
571 for (j = 0; j < g->productions.v[i]->rules.n; j++)
572 if (!g->productions.v[i]->rules.v[j]->elems.n) {
573 g->productions.v[i]->nullable = g->productions.v[i]->rules.v[j];
574 break;
575 }
576 }
577 /* transitive closure */
578 while (changed) {
579 changed = 0;
580 for (i = 0; i < g->productions.n; i++) {
581 if (!g->productions.v[i]->nullable)
582 for (j = 0; j < g->productions.v[i]->rules.n; j++) {
583 for (k = 0; k < g->productions.v[i]->rules.v[j]->elems.n; k++) {
584 e = g->productions.v[i]->rules.v[j]->elems.v[k];
585 if (e->kind != ELEM_NTERM || !e->e.nterm->nullable) goto Lnot_nullable;
586 }
587 changed = 1;
588 g->productions.v[i]->nullable = g->productions.v[i]->rules.v[j];
589 break;
590 }
591 Lnot_nullable:;
592 }
593 }
594 }
595
596 /*
597 verify and cleanup the grammar datastructures
598 - resolve non-terminals
599 - set element indexes
600 */
resolve_grammar(Grammar * g)601 static void resolve_grammar(Grammar *g) {
602 uint i, j, k, l;
603 Production *p, *pp;
604 Rule *r;
605 Elem *e;
606 Term *last_term, *t;
607
608 g->rule_index = 0;
609 for (i = 0; i < g->productions.n; i++) {
610 p = g->productions.v[i];
611 if (p != lookup_production(g, p->name, p->name_len)) d_fail("duplicate production '%s'", p->name);
612 p->index = i;
613 for (j = 0; j < p->rules.n; j++) {
614 r = p->rules.v[j];
615 r->index = g->rule_index++;
616 last_term = NULL;
617 for (k = 0; k < r->elems.n; k++) {
618 e = r->elems.v[k];
619 e->index = k;
620 if (e->kind == ELEM_UNRESOLVED) {
621 l = e->e.unresolved.len;
622 if ((pp = lookup_production(g, e->e.unresolved.string, l))) {
623 FREE(e->e.unresolved.string);
624 e->e.unresolved.string = 0;
625 e->kind = ELEM_NTERM;
626 e->e.nterm = pp;
627 } else if ((t = lookup_token(g, e->e.unresolved.string, l))) {
628 FREE(e->e.unresolved.string);
629 e->e.unresolved.string = 0;
630 e->kind = ELEM_TERM;
631 e->e.term = t;
632 } else {
633 char str[256];
634 strncpy(str, e->e.unresolved.string, l);
635 str[l < 255 ? l : 255] = 0;
636 d_fail("unresolved identifier: '%s'", str);
637 }
638 }
639 if (e->kind == ELEM_TERM) last_term = e->e.term;
640 }
641 r->end->index = r->elems.n;
642 if (g->set_op_priority_from_rule) {
643 if (last_term && r->rule_assoc) {
644 last_term->op_assoc = r->rule_assoc;
645 last_term->op_priority = r->rule_priority;
646 }
647 }
648 }
649 }
650 for (i = 0; i < g->terminals.n; i++) g->terminals.v[i]->index = i;
651 compute_nullable(g);
652 }
653
merge_identical_terminals(Grammar * g)654 static void merge_identical_terminals(Grammar *g) {
655 uint i, j, k;
656 Production *p;
657 Rule *r;
658 Elem *e;
659
660 for (i = 0; i < g->productions.n; i++) {
661 p = g->productions.v[i];
662 for (j = 0; j < p->rules.n; j++) {
663 r = p->rules.v[j];
664 for (k = 0; k < r->elems.n; k++) {
665 e = r->elems.v[k];
666 if (e->kind == ELEM_TERM) e->e.term = unique_term(g, e->e.term);
667 }
668 }
669 }
670 }
671
print_term(Term * t)672 void print_term(Term *t) {
673 char *s = t->string ? escape_string(t->string) : NULL;
674 if (t->term_name)
675 printf("term_name(\"%s\") ", t->term_name);
676 else if (t->kind == TERM_STRING) {
677 if (!t->string || !*t->string)
678 printf("<EOF> ");
679 else
680 printf("string(\"%s\") ", s);
681 } else if (t->kind == TERM_REGEX) {
682 printf("regex(\"%s\") ", s);
683 } else if (t->kind == TERM_CODE)
684 printf("code(\"%s\") ", s);
685 else if (t->kind == TERM_TOKEN)
686 printf("token(\"%s\") ", s);
687 else
688 d_fail("unknown token kind");
689 if (s) FREE(s);
690 }
691
print_elem(Elem * ee)692 void print_elem(Elem *ee) {
693 if (ee->kind == ELEM_TERM)
694 print_term(ee->e.term);
695 else if (ee->kind == ELEM_UNRESOLVED)
696 printf("%s ", ee->e.unresolved.string);
697 else
698 printf("%s ", ee->e.nterm->name);
699 }
700
701 struct EnumStr {
702 uint e;
703 char *s;
704 } assoc_strings[] = {{ASSOC_NONE, "$none"},
705 {ASSOC_NARY_LEFT, "$left"},
706 {ASSOC_NARY_RIGHT, "$right"},
707 {ASSOC_UNARY_LEFT, "$unary_left"},
708 {ASSOC_UNARY_RIGHT, "$unary_right"},
709 {ASSOC_BINARY_LEFT, "$binary_left"},
710 {ASSOC_BINARY_RIGHT, "$binary_right"},
711 {ASSOC_NO, "$noassoc"}};
712
assoc_str(uint e)713 static char *assoc_str(uint e) {
714 uint i;
715
716 for (i = 0; i < sizeof(assoc_strings) / sizeof(assoc_strings[0]); i++)
717 if (e == assoc_strings[i].e) return assoc_strings[i].s;
718 return assoc_strings[0].s;
719 }
720
print_rule(Rule * r)721 void print_rule(Rule *r) {
722 uint k;
723
724 printf("%s: ", r->prod->name);
725 for (k = 0; k < r->elems.n; k++) print_elem(r->elems.v[k]);
726 if (r->speculative_code.code) printf("SPECULATIVE_CODE\n%s\nEND CODE\n", r->speculative_code.code);
727 if (r->final_code.code) printf("FINAL_CODE\n%s\nEND CODE\n", r->final_code.code);
728 }
729
print_grammar(Grammar * g)730 void print_grammar(Grammar *g) {
731 uint i, j, k;
732 Production *pp;
733 Rule *rr;
734
735 if (!g->productions.n) return;
736 printf("PRODUCTIONS\n\n");
737 for (i = 0; i < g->productions.n; i++) {
738 pp = g->productions.v[i];
739 printf("%s (%d)\n", pp->name, i);
740 for (j = 0; j < pp->rules.n; j++) {
741 rr = pp->rules.v[j];
742 if (!j)
743 printf("\t: ");
744 else
745 printf("\t| ");
746 for (k = 0; k < rr->elems.n; k++) print_elem(rr->elems.v[k]);
747 if (rr->op_priority) printf("op %d ", rr->op_priority);
748 if (rr->op_assoc) printf("%s ", assoc_str(rr->op_assoc));
749 if (rr->rule_priority) printf("rule %d ", rr->rule_priority);
750 if (rr->rule_assoc) printf("%s ", assoc_str(rr->rule_assoc));
751 if (rr->speculative_code.code) printf("%s ", rr->speculative_code.code);
752 if (rr->final_code.code) printf("%s ", rr->final_code.code);
753 printf("\n");
754 }
755 printf("\t;\n");
756 printf("\n");
757 }
758 printf("TERMINALS\n\n");
759 for (i = 0; i < g->terminals.n; i++) {
760 printf("\t");
761 print_term(g->terminals.v[i]);
762 printf("(%d)\n", i + g->productions.n);
763 }
764 printf("\n");
765 }
766
print_item(Item * i)767 static void print_item(Item *i) {
768 uint j, end = 1;
769
770 printf("\t%s: ", i->rule->prod->name);
771 for (j = 0; j < i->rule->elems.n; j++) {
772 Elem *e = i->rule->elems.v[j];
773 if (i == e) {
774 printf(". ");
775 end = 0;
776 }
777 print_elem(e);
778 }
779 if (end) printf(". ");
780 printf("\n");
781 }
782
print_conflict(char * kind,uint * conflict)783 static void print_conflict(char *kind, uint *conflict) {
784 if (!*conflict) {
785 printf(" CONFLICT (before precedence and associativity)\n");
786 *conflict = 1;
787 }
788 printf("\t%s conflict ", kind);
789 printf("\n");
790 }
791
print_state(State * s)792 static void print_state(State *s) {
793 uint j, conflict = 0;
794
795 printf("STATE %d (%d ITEMS)%s\n", s->index, s->items.n, s->accept ? " ACCEPT" : "");
796 for (j = 0; j < s->items.n; j++) print_item(s->items.v[j]);
797 if (s->gotos.n) printf(" GOTO\n");
798 for (j = 0; j < s->gotos.n; j++) {
799 printf("\t");
800 print_elem(s->gotos.v[j]->elem);
801 printf(" : %d\n", s->gotos.v[j]->state->index);
802 }
803 printf(" ACTION\n");
804 for (j = 0; j < s->reduce_actions.n; j++) {
805 Action *a = s->reduce_actions.v[j];
806 printf("\t%s\t", action_types[a->kind]);
807 print_rule(a->rule);
808 printf("\n");
809 }
810 for (j = 0; j < s->shift_actions.n; j++) {
811 Action *a = s->shift_actions.v[j];
812 printf("\t%s\t", action_types[a->kind]);
813 if (a->kind == ACTION_SHIFT) {
814 print_term(a->term);
815 printf("%d", a->state->index);
816 }
817 printf("\n");
818 }
819 if (s->reduce_actions.n > 1) print_conflict("reduce/reduce", &conflict);
820 if (s->reduce_actions.n && s->shift_actions.n) print_conflict("shift/reduce", &conflict);
821 printf("\n");
822 }
823
print_states(Grammar * g)824 void print_states(Grammar *g) {
825 uint i;
826
827 for (i = 0; i < g->states.n; i++) print_state(g->states.v[i]);
828 }
829
state_for_declaration(Grammar * g,uint iproduction)830 uint state_for_declaration(Grammar *g, uint iproduction) {
831 uint i;
832
833 for (i = 0; i < g->declarations.n; i++)
834 if (g->declarations.v[i]->kind == DECLARE_STATE_FOR && g->declarations.v[i]->elem->e.nterm->index == iproduction)
835 return 1;
836 return 0;
837 }
838
make_elems_for_productions(Grammar * g)839 static void make_elems_for_productions(Grammar *g) {
840 uint i, j, k, l;
841 Rule *rr;
842 Production *pp, *ppp;
843
844 pp = g->productions.v[0];
845 for (i = 0; i < g->productions.n; i++)
846 if (!g->productions.v[i]->internal) {
847 if (g->states_for_all_nterms || state_for_declaration(g, i)) {
848 /* try to find an existing elem */
849 for (j = 0; j < g->productions.n; j++)
850 for (k = 0; k < g->productions.v[j]->rules.n; k++) {
851 rr = g->productions.v[j]->rules.v[k];
852 for (l = 0; l < rr->elems.n; l++)
853 if (rr->elems.v[l]->e.term_or_nterm == g->productions.v[i]) {
854 g->productions.v[i]->elem = rr->elems.v[l];
855 break;
856 }
857 }
858 if (j >= g->productions.n) { /* not found */
859 g->productions.v[i]->elem = new_elem_nterm(g->productions.v[i], new_rule(g, pp));
860 g->productions.v[i]->elem->rule->index = g->rule_index++; /* fake */
861 }
862 }
863 }
864 if (!g->states_for_all_nterms && g->states_for_whitespace &&
865 (ppp = lookup_production(g, "whitespace", sizeof("whitespace") - 1))) {
866 ppp->elem = new_elem_nterm(ppp, new_rule(g, pp));
867 ppp->elem->rule->index = g->rule_index++; /* fake */
868 }
869 }
870
convert_regex_production_one(Grammar * g,Production * p)871 static void convert_regex_production_one(Grammar *g, Production *p) {
872 uint j, k, l;
873 Production *pp;
874 Rule *r, *rr;
875 Elem *e;
876 Term *t;
877 int circular = 0;
878 char *buf = 0, *b, *s;
879 int buf_len = 0;
880
881 if (p->regex_term) /* already done */
882 return;
883 if (p->in_regex) d_fail("circular regex production '%s'", p->name);
884 p->in_regex = 1;
885 for (j = 0; j < p->rules.n; j++) {
886 r = p->rules.v[j];
887 if (r->final_code.code || (r->speculative_code.code && p->rules.n > 1))
888 d_fail("final and/or multi-rule code not permitted in regex productions '%s'", p->name);
889 for (k = 0; k < r->elems.n; k++) {
890 e = r->elems.v[k];
891 if (e->kind == ELEM_NTERM) {
892 if (!e->e.nterm->regex)
893 d_fail("regex production '%s' cannot invoke non-regex production '%s'", p->name, e->e.nterm->name);
894 pp = e->e.nterm;
895 for (l = 0; l < pp->rules.n; l++)
896 if (pp->rules.v[l]->speculative_code.code || pp->rules.v[l]->final_code.code)
897 d_fail("code not permitted in rule %d of regex productions '%s'", l, p->name);
898 if (p != pp) {
899 convert_regex_production_one(g, pp);
900 buf_len += pp->regex_term->string_len + 5;
901 } else {
902 circular = 1;
903 buf_len += 5;
904 }
905 } else { /* e->kind == ELEM_TERM */
906 if (e->e.term->kind == TERM_CODE || e->e.term->kind == TERM_TOKEN)
907 d_fail("regex production '%s' cannot include scanners or tokens");
908 buf_len += e->e.term->string_len + 5;
909 }
910 }
911 }
912 b = buf = (char *)MALLOC(buf_len + 1);
913 t = new_term();
914 t->kind = TERM_REGEX;
915 t->string = buf;
916 t->index = g->terminals.n;
917 t->regex_production = p;
918 vec_add(&g->terminals, t);
919 p->regex_term = t;
920 p->regex_term->term_name = dup_str(p->name, 0);
921 if (circular) { /* attempt to match to regex operators */
922 if (p->rules.n != 2)
923 Lfail:
924 d_fail("unable to resolve circular regex production: '%s'", p->name);
925 l = p->rules.v[0]->elems.n + p->rules.v[1]->elems.n;
926 if (l == 2 || l == 3) {
927 if (p->rules.v[0]->elems.n != 2 && p->rules.v[1]->elems.n != 2) goto Lfail;
928 r = p->rules.v[0]->elems.n == 2 ? p->rules.v[0] : p->rules.v[1];
929 rr = p->rules.v[0] == r ? p->rules.v[1] : p->rules.v[0];
930 if (r->elems.v[0]->e.nterm != p && r->elems.v[1]->e.nterm != p) goto Lfail;
931 e = r->elems.v[0]->e.nterm == p ? r->elems.v[1] : r->elems.v[1];
932 if (rr->elems.n && e->e.term_or_nterm != rr->elems.v[0]->e.term_or_nterm) goto Lfail;
933 t = e->kind == ELEM_TERM ? e->e.term : e->e.nterm->regex_term;
934 *b++ = '(';
935 if (t->kind == TERM_STRING)
936 s = escape_string_for_regex(t->string);
937 else
938 s = t->string;
939 memcpy(b, s, strlen(s));
940 b += strlen(s);
941 if (t->kind == TERM_STRING) FREE(s);
942 *b++ = ')';
943 if (l == 2)
944 *b++ = '*';
945 else
946 *b++ = '+';
947 *b = 0;
948 p->regex_term->string_len = strlen(p->regex_term->string);
949 } else
950 goto Lfail;
951 } else { /* handle the base case, p = (r | r'), r = (e e') */
952 if (p->rules.n > 1) *b++ = '(';
953 for (j = 0; j < p->rules.n; j++) {
954 r = p->rules.v[j];
955 if (r->elems.n > 1) *b++ = '(';
956 for (k = 0; k < r->elems.n; k++) {
957 e = r->elems.v[k];
958 t = e->kind == ELEM_TERM ? e->e.term : e->e.nterm->regex_term;
959 if (t->kind == TERM_STRING)
960 s = escape_string_for_regex(t->string);
961 else
962 s = t->string;
963 memcpy(b, s, strlen(s));
964 b += strlen(s);
965 if (t->kind == TERM_STRING) FREE(s);
966 }
967 if (r->elems.n > 1) *b++ = ')';
968 if (j != p->rules.n - 1) *b++ = '|';
969 }
970 if (p->rules.n > 1) *b++ = ')';
971 *b = 0;
972 p->regex_term->string_len = strlen(p->regex_term->string);
973 }
974 p->in_regex = 0;
975 }
976
convert_regex_productions(Grammar * g)977 static void convert_regex_productions(Grammar *g) {
978 uint i, j, k;
979 Production *p;
980 Rule *r;
981
982 for (i = 0; i < g->productions.n; i++) {
983 p = g->productions.v[i];
984 if (!p->regex) continue;
985 convert_regex_production_one(g, p);
986 }
987 for (i = 0; i < g->productions.n; i++) {
988 p = g->productions.v[i];
989 for (j = 0; j < p->rules.n; j++) {
990 r = p->rules.v[j];
991 for (k = 0; k < r->elems.n; k++) {
992 if (r->elems.v[k]->kind == ELEM_NTERM && r->elems.v[k]->e.nterm->regex_term) {
993 r->elems.v[k]->e.term = r->elems.v[k]->e.nterm->regex_term;
994 r->elems.v[k]->kind = ELEM_TERM;
995 }
996 }
997 }
998 }
999 }
1000
check_default_actions(Grammar * g)1001 static void check_default_actions(Grammar *g) {
1002 Production *pdefault;
1003
1004 pdefault = lookup_production(g, "_", 1);
1005 if (pdefault && pdefault->rules.n > 1) d_fail("number of rules in default action != 1");
1006 }
1007
1008 typedef struct {
1009 struct State *eq;
1010 struct Rule *diff_rule;
1011 struct State *diff_state;
1012 } EqState;
1013
build_eq(Grammar * g)1014 void build_eq(Grammar *g) {
1015 uint i, j, k, changed = 1, x, xx;
1016 State *s, *ss;
1017 EqState *eq, *e, *ee;
1018
1019 eq = (EqState *)MALLOC(sizeof(EqState) * g->states.n);
1020 memset(eq, 0, sizeof(EqState) * g->states.n);
1021 while (changed) {
1022 changed = 0;
1023 for (i = 0; i < g->states.n; i++) {
1024 s = g->states.v[i];
1025 e = &eq[s->index];
1026 for (j = i + 1; j < g->states.n; j++) {
1027 ss = g->states.v[j];
1028 ee = &eq[ss->index];
1029 if (e->eq || ee->eq) continue;
1030 if (s->same_shifts != ss->same_shifts && ss->same_shifts != s) continue;
1031 /* check gotos */
1032 if (s->gotos.n != ss->gotos.n) continue;
1033 for (k = 0; k < s->gotos.n; k++) {
1034 if (elem_symbol(g, s->gotos.v[k]->elem) != elem_symbol(g, ss->gotos.v[k]->elem)) goto Lcontinue;
1035 if (s->gotos.v[k]->state != ss->gotos.v[k]->state) {
1036 EqState *ge = &eq[s->gotos.v[k]->state->index];
1037 EqState *gee = &eq[ss->gotos.v[k]->state->index];
1038 if (ge->eq != ss->gotos.v[k]->state && gee->eq != s->gotos.v[k]->state) goto Lcontinue;
1039 if ((ee->diff_state && ee->diff_state != eq[ss->gotos.v[k]->state->index].eq) ||
1040 (e->diff_state && e->diff_state != eq[s->gotos.v[k]->state->index].eq))
1041 goto Lcontinue;
1042 /* allow one different state */
1043 ee->diff_state = ss->gotos.v[k]->state;
1044 e->diff_state = s->gotos.v[k]->state;
1045 }
1046 }
1047 /* check reductions */
1048 if (s->reduce_actions.n != ss->reduce_actions.n) continue;
1049 for (k = 0; k < s->reduce_actions.n; k++) {
1050 if (s->reduce_actions.v[k]->rule == ss->reduce_actions.v[k]->rule) continue;
1051 if (s->reduce_actions.v[k]->rule->prod != ss->reduce_actions.v[k]->rule->prod) goto Lcontinue;
1052 if ((x = s->reduce_actions.v[k]->rule->elems.n) != (xx = ss->reduce_actions.v[k]->rule->elems.n)) {
1053 if ((ee->diff_rule && ee->diff_rule != ss->reduce_actions.v[k]->rule) ||
1054 (e->diff_rule && e->diff_rule != s->reduce_actions.v[k]->rule))
1055 goto Lcontinue;
1056 /* allow one different rule */
1057 ee->diff_rule = ss->reduce_actions.v[k]->rule;
1058 e->diff_rule = s->reduce_actions.v[k]->rule;
1059 }
1060 }
1061 ee->eq = s;
1062 changed = 1;
1063 Lcontinue:;
1064 }
1065 }
1066 }
1067 for (i = 0; i < g->states.n; i++) {
1068 s = g->states.v[i];
1069 e = &eq[s->index];
1070 if (e->eq) {
1071 if (d_verbose_level > 2) {
1072 printf("eq %d %d ", s->index, e->eq->index);
1073 if (e->diff_state) printf("diff state (%d %d) ", e->diff_state->index, eq[e->eq->index].diff_state->index);
1074 if (e->diff_rule) {
1075 printf("diff rule ");
1076 printf("[ ");
1077 print_rule(e->diff_rule);
1078 printf("][ ");
1079 print_rule(eq[e->eq->index].diff_rule);
1080 printf("]");
1081 }
1082 printf("\n");
1083 }
1084 }
1085 }
1086 for (i = 0; i < g->states.n; i++) {
1087 s = g->states.v[i];
1088 e = &eq[s->index];
1089 if (e->eq && e->diff_state) {
1090 if (eq[e->diff_state->index].diff_rule && eq[e->diff_state->index].diff_rule->elems.n == 2) {
1091 s->reduces_to = e->eq;
1092 s->reduces_with = eq[e->eq->index].diff_rule;
1093 s->reduces_to_then_with = e->diff_rule;
1094 } else if (eq[eq[e->eq->index].diff_state->index].diff_rule &&
1095 eq[eq[e->eq->index].diff_state->index].diff_rule->elems.n == 2) {
1096 e->eq->reduces_to = s;
1097 s->reduces_with = e->diff_rule;
1098 s->reduces_to_then_with = eq[e->eq->index].diff_rule;
1099 }
1100 }
1101 }
1102 for (i = 0; i < g->states.n; i++) {
1103 s = g->states.v[i];
1104 if (s->reduces_to)
1105 if (d_verbose_level) printf("reduces_to %d %d\n", s->index, s->reduces_to->index);
1106 }
1107 FREE(eq);
1108 }
1109
new_D_Grammar(char * pathname)1110 Grammar *new_D_Grammar(char *pathname) {
1111 Grammar *g = (Grammar *)MALLOC(sizeof(Grammar));
1112 memset(g, 0, sizeof(Grammar));
1113 g->pathname = dup_str(pathname, pathname + strlen(pathname));
1114 return g;
1115 }
1116
free_rule(Rule * r)1117 static void free_rule(Rule *r) {
1118 uint i;
1119 FREE(r->end);
1120 if (r->final_code.code) FREE(r->final_code.code);
1121 if (r->speculative_code.code) FREE(r->speculative_code.code);
1122 vec_free(&r->elems);
1123 for (i = 0; i < r->pass_code.n; i++) {
1124 FREE(r->pass_code.v[i]->code);
1125 FREE(r->pass_code.v[i]);
1126 }
1127 vec_free(&r->pass_code);
1128 FREE(r);
1129 }
1130
free_D_Grammar(Grammar * g)1131 void free_D_Grammar(Grammar *g) {
1132 uint i, j, k;
1133
1134 for (i = 0; i < g->productions.n; i++) {
1135 Production *p = g->productions.v[i];
1136 for (j = 0; j < p->rules.n; j++) {
1137 Rule *r = p->rules.v[j];
1138 if (r == g->r) g->r = 0;
1139 for (k = 0; k < r->elems.n; k++) {
1140 Elem *e = r->elems.v[k];
1141 if (e == p->elem) p->elem = 0;
1142 FREE(e);
1143 }
1144 if (r->end == p->elem) p->elem = 0;
1145 free_rule(r);
1146 }
1147 vec_free(&p->rules);
1148 FREE(p->name);
1149 if (p->elem) {
1150 free_rule(p->elem->rule);
1151 FREE(p->elem);
1152 }
1153 FREE(p);
1154 }
1155 vec_free(&g->productions);
1156 for (i = 0; i < g->terminals.n; i++) {
1157 Term *t = g->terminals.v[i];
1158 if (t->string) FREE(t->string);
1159 if (t->term_name) FREE(t->term_name);
1160 FREE(t);
1161 }
1162 vec_free(&g->terminals);
1163 for (i = 0; i < g->actions.n; i++) free_Action(g->actions.v[i]);
1164 vec_free(&g->actions);
1165 if (g->scanner.code) FREE(g->scanner.code);
1166 for (i = 0; i < g->states.n; i++) {
1167 State *s = g->states.v[i];
1168 vec_free(&s->items);
1169 vec_free(&s->items_hash);
1170 for (j = 0; j < s->gotos.n; j++) {
1171 FREE(s->gotos.v[j]->elem);
1172 FREE(s->gotos.v[j]);
1173 }
1174 vec_free(&s->gotos);
1175 vec_free(&s->shift_actions);
1176 vec_free(&s->reduce_actions);
1177 for (j = 0; j < s->right_epsilon_hints.n; j++) FREE(s->right_epsilon_hints.v[j]);
1178 vec_free(&s->right_epsilon_hints);
1179 for (j = 0; j < s->error_recovery_hints.n; j++) FREE(s->error_recovery_hints.v[j]);
1180 vec_free(&s->error_recovery_hints);
1181 if (!s->same_shifts) {
1182 for (j = 0; j < s->scanner.states.n; j++) {
1183 vec_free(&s->scanner.states.v[j]->accepts);
1184 vec_free(&s->scanner.states.v[j]->live);
1185 FREE(s->scanner.states.v[j]);
1186 }
1187 vec_free(&s->scanner.states);
1188 for (j = 0; j < s->scanner.transitions.n; j++)
1189 if (s->scanner.transitions.v[j]) {
1190 vec_free(&s->scanner.transitions.v[j]->live_diff);
1191 vec_free(&s->scanner.transitions.v[j]->accepts_diff);
1192 FREE(s->scanner.transitions.v[j]);
1193 }
1194 vec_free(&s->scanner.transitions);
1195 }
1196 FREE(s->goto_valid);
1197 FREE(s);
1198 }
1199 vec_free(&g->states);
1200 for (i = 0; i < g->ncode; i++) FREE(g->code[i].code);
1201 FREE(g->code);
1202 for (i = 0; i < g->declarations.n; i++) {
1203 FREE(g->declarations.v[i]->elem);
1204 FREE(g->declarations.v[i]);
1205 }
1206 vec_free(&g->declarations);
1207 for (i = 0; i < g->passes.n; i++) {
1208 FREE(g->passes.v[i]->name);
1209 FREE(g->passes.v[i]);
1210 }
1211 vec_free(&g->passes);
1212 for (i = 0; i < g->all_pathnames.n; i++) FREE(g->all_pathnames.v[i]);
1213 FREE(g->pathname);
1214 if (g->default_white_space) FREE(g->default_white_space);
1215 FREE(g);
1216 }
1217
parse_grammar(Grammar * g,char * pathname,char * sarg)1218 int parse_grammar(Grammar *g, char *pathname, char *sarg) {
1219 D_Parser *p;
1220 int res = 0;
1221 char *s = sarg;
1222
1223 vec_add(&g->all_pathnames, dup_str(pathname, 0));
1224 if (!s)
1225 if (!(s = sbuf_read(pathname))) return -1;
1226 if (!g->productions.n) initialize_productions(g);
1227 p = new_D_Parser(&parser_tables_dparser_gram, sizeof(D_ParseNode_User));
1228 p->initial_globals = g;
1229 p->loc.pathname = pathname;
1230 if (dparse(p, s, strlen(s))) {
1231 if (g->productions.n > 1) finish_productions(g);
1232 } else
1233 res = -1;
1234 if (!sarg) FREE(s);
1235 free_D_Parser(p);
1236 return res;
1237 }
1238
scanner_declaration(Declaration * d)1239 static int scanner_declaration(Declaration *d) {
1240 switch (d->kind) {
1241 case DECLARE_TOKENIZE:
1242 case DECLARE_LONGEST_MATCH:
1243 case DECLARE_ALL_MATCHES:
1244 return 1;
1245 default:
1246 return 0;
1247 }
1248 }
1249
set_declaration_group(Production * p,Production * root,Declaration * d)1250 static void set_declaration_group(Production *p, Production *root, Declaration *d) {
1251 uint i, j;
1252 if (p->declaration_group[d->kind] == root) return;
1253 if (d->kind == DECLARE_TOKENIZE && p->declaration_group[d->kind]) {
1254 d_fail("shared tokenize subtrees");
1255 return;
1256 }
1257 p->declaration_group[d->kind] = root;
1258 p->last_declaration[d->kind] = d;
1259 for (i = 0; i < p->rules.n; i++) {
1260 for (j = 0; j < p->rules.v[i]->elems.n; j++)
1261 if (p->rules.v[i]->elems.v[j]->kind == ELEM_NTERM)
1262 set_declaration_group(p->rules.v[i]->elems.v[j]->e.nterm, root, d);
1263 }
1264 }
1265
propogate_declarations(Grammar * g)1266 static void propogate_declarations(Grammar *g) {
1267 uint i, j, k;
1268 Production *p, *start = g->productions.v[0];
1269 Rule *r;
1270 Elem *e;
1271
1272 /* global defaults */
1273 if (g->tokenizer) new_declaration(g, new_elem_nterm(g->productions.v[0], NULL), DECLARE_TOKENIZE);
1274 if (g->longest_match) new_declaration(g, new_elem_nterm(g->productions.v[0], NULL), DECLARE_LONGEST_MATCH);
1275 /* resolve declarations */
1276 for (i = 0; i < g->declarations.n; i++) {
1277 e = g->declarations.v[i]->elem;
1278 if (e->kind == ELEM_UNRESOLVED) {
1279 if (e->e.unresolved.len == 0)
1280 p = g->productions.v[0];
1281 else if (!(p = lookup_production(g, e->e.unresolved.string, e->e.unresolved.len)))
1282 d_fail("unresolved declaration '%s'", e->e.unresolved.string);
1283 FREE(e->e.unresolved.string);
1284 e->e.unresolved.string = 0;
1285 e->kind = ELEM_NTERM;
1286 e->e.nterm = p;
1287 }
1288 }
1289 /* build declaration groups (covering a production subtrees) */
1290 for (i = 0; i < g->declarations.n; i++) {
1291 if (scanner_declaration(g->declarations.v[i])) {
1292 p = g->declarations.v[i]->elem->e.nterm;
1293 if (p == start) {
1294 for (j = 0; j < g->productions.n; j++) {
1295 g->productions.v[j]->declaration_group[g->declarations.v[i]->kind] = start;
1296 g->productions.v[j]->last_declaration[g->declarations.v[i]->kind] = g->declarations.v[i];
1297 }
1298 } else
1299 set_declaration_group(p, p, g->declarations.v[i]);
1300 }
1301 }
1302 /* set terminal scan_kind */
1303 for (i = 0; i < g->productions.n; i++) {
1304 p = g->productions.v[i];
1305 for (j = 0; j < p->rules.n; j++) {
1306 r = p->rules.v[j];
1307 for (k = 0; k < r->elems.n; k++) {
1308 e = r->elems.v[k];
1309 if (e->kind == ELEM_TERM) {
1310 if (!p->declaration_group[DECLARE_LONGEST_MATCH] && !p->declaration_group[DECLARE_ALL_MATCHES])
1311 e->e.term->scan_kind = D_SCAN_DEFAULT;
1312 else if (p->declaration_group[DECLARE_LONGEST_MATCH] && !p->declaration_group[DECLARE_ALL_MATCHES])
1313 e->e.term->scan_kind = D_SCAN_LONGEST;
1314 else if (!p->declaration_group[DECLARE_LONGEST_MATCH] && p->declaration_group[DECLARE_ALL_MATCHES])
1315 e->e.term->scan_kind = D_SCAN_ALL;
1316 else {
1317 if (p->last_declaration[DECLARE_LONGEST_MATCH]->index > p->last_declaration[DECLARE_ALL_MATCHES]->index)
1318 e->e.term->scan_kind = D_SCAN_LONGEST;
1319 else
1320 e->e.term->scan_kind = D_SCAN_ALL;
1321 }
1322 }
1323 }
1324 }
1325 }
1326 }
1327
merge_shift_actions(State * to,State * from)1328 static void merge_shift_actions(State *to, State *from) {
1329 uint i, j;
1330 for (i = 0; i < from->shift_actions.n; i++) {
1331 for (j = 0; j < to->shift_actions.n; j++)
1332 if (from->shift_actions.v[i]->term == to->shift_actions.v[j]->term) goto Lnext;
1333 vec_add(&to->shift_actions, from->shift_actions.v[i]);
1334 Lnext:;
1335 }
1336 }
1337
compute_declaration_states(Grammar * g,Production * p,Declaration * d)1338 static void compute_declaration_states(Grammar *g, Production *p, Declaration *d) {
1339 State *s, *base_s = NULL;
1340 uint j, k, scanner = scanner_declaration(d);
1341 (void)p;
1342
1343 for (j = 0; j < g->states.n; j++) {
1344 s = g->states.v[j];
1345 if (d->kind == DECLARE_TOKENIZE) {
1346 if (!base_s)
1347 base_s = s;
1348 else {
1349 s->same_shifts = base_s;
1350 merge_shift_actions(base_s, s);
1351 }
1352 }
1353 if (scanner) {
1354 for (k = 0; k < s->items.n; k++)
1355 if (s->items.v[k]->kind == ELEM_TERM) switch (s->items.v[k]->e.term->scan_kind) {
1356 case D_SCAN_LONGEST:
1357 if (s->scan_kind == D_SCAN_RESERVED || s->scan_kind == D_SCAN_LONGEST)
1358 s->scan_kind = D_SCAN_LONGEST;
1359 else
1360 s->scan_kind = D_SCAN_MIXED;
1361 break;
1362 case D_SCAN_ALL:
1363 if (s->scan_kind == D_SCAN_RESERVED || s->scan_kind == D_SCAN_ALL)
1364 s->scan_kind = D_SCAN_ALL;
1365 else
1366 s->scan_kind = D_SCAN_MIXED;
1367 break;
1368 default:
1369 break;
1370 }
1371 }
1372 }
1373 }
1374
map_declarations_to_states(Grammar * g)1375 static void map_declarations_to_states(Grammar *g) {
1376 uint i;
1377 State *s;
1378
1379 for (i = 0; i < g->states.n; i++) {
1380 s = g->states.v[i];
1381 s->scan_kind = D_SCAN_RESERVED;
1382 }
1383 /* map groups to sets of states */
1384 for (i = 0; i < g->declarations.n; i++)
1385 if (scanner_declaration(g->declarations.v[i]))
1386 compute_declaration_states(g, g->declarations.v[i]->elem->e.nterm, g->declarations.v[i]);
1387 for (i = 0; i < g->states.n; i++) {
1388 s = g->states.v[i];
1389 if (s->scan_kind == D_SCAN_RESERVED) s->scan_kind = D_SCAN_DEFAULT; /* set the default */
1390 }
1391 }
1392
build_grammar(Grammar * g)1393 int build_grammar(Grammar *g) {
1394 resolve_grammar(g);
1395 convert_regex_productions(g);
1396 propogate_declarations(g);
1397 merge_identical_terminals(g);
1398 make_elems_for_productions(g);
1399 check_default_actions(g);
1400 build_LR_tables(g);
1401 map_declarations_to_states(g);
1402 if (d_verbose_level) {
1403 printf("%d productions %d terminals %d states %d declarations\n", g->productions.n, g->terminals.n, g->states.n,
1404 g->declarations.n);
1405 }
1406 if (d_verbose_level > 1) {
1407 print_grammar(g);
1408 print_states(g);
1409 }
1410 build_scanners(g);
1411 build_eq(g);
1412 return 0;
1413 }
1414
1415 /* Wlodek Bzyl, <matwb@univ.gda.pl>
1416 */
1417
print_term_escaped(Term * t,int double_escaped)1418 static void print_term_escaped(Term *t, int double_escaped) {
1419 char *s = 0;
1420 if (t->term_name) {
1421 printf("%s ", t->term_name);
1422 } else if (t->kind == TERM_STRING) {
1423 s = t->string ? escape_string_single_quote(t->string) : NULL;
1424 if (!t->string || !*t->string)
1425 printf("<EOF> ");
1426 else {
1427 printf("'%s' ", double_escaped ? escape_string_single_quote(s) : s);
1428 if (t->ignore_case) printf("/i ");
1429 if (t->term_priority) printf("%sterm %d ", double_escaped ? "#" : "$", t->term_priority);
1430 }
1431 } else if (t->kind == TERM_REGEX) {
1432 char *quote = double_escaped ? "\\\"" : "\"";
1433 s = t->string ? escape_string(t->string) : NULL;
1434 /* char *s = t->string; // ? escape_string(t->string) : NULL; */
1435 printf("%s%s%s ", quote, double_escaped ? escape_string(s) : s, quote);
1436 if (t->ignore_case) printf("/i ");
1437 if (t->term_priority) printf("%sterm %d ", double_escaped ? "#" : "$", t->term_priority);
1438 } else if (t->kind == TERM_CODE) {
1439 s = t->string ? escape_string(t->string) : NULL;
1440 printf("code(\"%s\") ", s);
1441 } else if (t->kind == TERM_TOKEN) {
1442 s = t->string ? escape_string(t->string) : NULL;
1443 printf("%s ", s);
1444 } else
1445 d_fail("unknown token kind");
1446 if (s) FREE(s);
1447 }
1448
1449 /* print_elem changed to call print_term_escaped */
print_element_escaped(Elem * ee,int double_escaped)1450 static void print_element_escaped(Elem *ee, int double_escaped) {
1451 if (ee->kind == ELEM_TERM)
1452 print_term_escaped(ee->e.term, double_escaped);
1453 else if (ee->kind == ELEM_UNRESOLVED)
1454 printf("%s ", ee->e.unresolved.string);
1455 else
1456 printf("%s ", ee->e.nterm->name);
1457 }
1458
print_global_code(Grammar * g)1459 static void print_global_code(Grammar *g) {
1460 uint i;
1461 int print_stdio_h = 1;
1462 printf("%%<");
1463 for (i = 0; i < g->ncode; i++) {
1464 printf("%s", g->code[i].code);
1465 if (strstr(g->code[i].code, "<stdio.h>") != NULL) print_stdio_h = 0;
1466 }
1467 if (print_stdio_h) printf("\n #include <stdio.h>\n");
1468 printf("%%>\n\n");
1469 }
1470
print_production(Production * p)1471 static void print_production(Production *p) {
1472 uint j, k;
1473 Rule *r;
1474 char *opening[] = {"", "\n\t [ printf(\"", "\n\t { printf(\""};
1475 char *closing[] = {"\n", "\\n\"); ]\n", "\\n\"); }\n"};
1476 char *middle[] = {"\n\t: ", " <- ", " <= "};
1477 char *assoc[] = {"$", "#", "#"};
1478 char *speculative_final_closing = "\\n\"); ]"; /* closing[1] without final newline */
1479 char *next_or_rule = "\t| ";
1480 /* char *regex_production = " ::= "; */
1481
1482 uint variant = 0;
1483
1484 for (j = 0; j < p->rules.n; j++) {
1485 Lmore:
1486 r = p->rules.v[j];
1487 if (!j) {
1488 /*
1489 if (p->regex) {
1490 printf("%s%s%s", opening[variant], p->name, regex_production);
1491 } else {
1492 */
1493 printf("%s%s%s", opening[variant], p->name, middle[variant]);
1494 /* } */
1495 } else {
1496 if (variant == 0)
1497 printf("%s", next_or_rule);
1498 else
1499 printf("%s%s%s", opening[variant], p->name, middle[variant]);
1500 }
1501
1502 for (k = 0; k < r->elems.n; k++) print_element_escaped(r->elems.v[k], variant);
1503
1504 if (r->op_assoc) printf(" %s%s ", assoc[variant], assoc_str(r->op_assoc) + 1);
1505 if (r->op_priority) printf("%d ", r->op_priority);
1506 if (r->rule_assoc) printf(" %s%s ", assoc[variant], assoc_str(r->rule_assoc) + 1);
1507 if (r->rule_priority) printf("%d ", r->rule_priority);
1508
1509 if ((d_rdebug_grammar_level == 1 && variant == 0) || (d_rdebug_grammar_level == 3 && variant == 0)) {
1510 variant = 1;
1511 goto Lmore;
1512 }
1513 if ((d_rdebug_grammar_level == 2 && variant == 0) || (d_rdebug_grammar_level == 3 && variant == 1)) {
1514 if (variant == 1) printf("%s", speculative_final_closing);
1515 variant = 2;
1516 goto Lmore;
1517 }
1518
1519 printf("%s", closing[variant]);
1520 variant = 0;
1521 }
1522 printf("\t;\n");
1523 printf("\n");
1524 }
1525
print_productions(Grammar * g,char * pathname)1526 static void print_productions(Grammar *g, char *pathname) {
1527 uint i;
1528 if (!g->productions.n) {
1529 printf("/*\n There were no productions in the grammar %s\n*/\n", pathname);
1530 return;
1531 }
1532 for (i = 1; i < g->productions.n; i++) print_production(g->productions.v[i]);
1533 }
1534
print_declare(char * s,char * n)1535 static void print_declare(char *s, char *n) {
1536 while (*n && (isspace_(*n) || isdigit_(*n))) n++;
1537 printf(s, n);
1538 }
1539
print_declarations(Grammar * g)1540 static void print_declarations(Grammar *g) {
1541 uint i;
1542
1543 if (g->tokenizer) printf("${declare tokenize}\n");
1544 for (i = 0; i < g->declarations.n; i++) {
1545 Declaration *dd = g->declarations.v[i];
1546 Elem *ee = dd->elem;
1547 switch (dd->kind) {
1548 case DECLARE_LONGEST_MATCH:
1549 if (g->longest_match)
1550 printf("${declare longest_match}\n");
1551 else
1552 print_declare("${declare longest_match %s}\n", ee->e.nterm->name);
1553 break;
1554 case DECLARE_ALL_MATCHES:
1555 if (!g->longest_match)
1556 printf("${declare all_matches}\n");
1557 else
1558 print_declare("${declare all_matches %s}\n", ee->e.nterm->name);
1559 break;
1560 default:
1561 printf("\n/*\nDeclaration->kind: %d", dd->kind);
1562 printf("\nElem->kind: %d\n*/\n", ee->kind);
1563 }
1564 }
1565 if (g->set_op_priority_from_rule) printf("${declare set_op_priority_from_rule}\n");
1566 if (g->states_for_all_nterms) printf("${declare all_subparsers}\n");
1567 /* todo: DECLARE_STATE_FOR */
1568 if (g->default_white_space) printf("${declare whitespace %s}\n", g->default_white_space);
1569 if (g->save_parse_tree) printf("${declare save_parse_tree}\n");
1570 /* todo: DECLARE_NUM */
1571
1572 if (g->scanner.code) printf("${scanner %s}\n", g->scanner.code);
1573
1574 {
1575 int token_exists = 0;
1576 for (i = 0; i < g->terminals.n; i++) {
1577 Term *t = g->terminals.v[i];
1578 if (t->kind == TERM_TOKEN) {
1579 printf("%s %s", token_exists ? "" : "${token", t->string);
1580 token_exists = 1;
1581 }
1582 }
1583 if (token_exists) printf("}\n");
1584 }
1585
1586 printf("\n");
1587 }
1588
print_rdebug_grammar(Grammar * g,char * pathname)1589 void print_rdebug_grammar(Grammar *g, char *pathname) {
1590 char ver[30];
1591 d_version(ver);
1592
1593 printf("/*\n Generated by Make DParser Version %s\n", ver);
1594 printf(" Available at http://dparser.sf.net\n*/\n\n");
1595
1596 print_global_code(g);
1597 print_declarations(g);
1598 print_productions(g, pathname);
1599 }
1600