1 /*
2   Copyright 2002-2004 John Plevyak, All Rights Reserved
3 */
4 
5 #include "gramgram.h"
6 #include "d.h"
7 
8 extern D_ParserTables parser_tables_dparser_gram;
9 
10 static char *action_types[] = {"ACCEPT", "SHIFT", "REDUCE"};
11 
12 static void print_state(State *s);
13 
new_production(Grammar * g,char * name)14 Production *new_production(Grammar *g, char *name) {
15   Production *p;
16   if ((p = lookup_production(g, name, strlen(name)))) {
17     FREE(name);
18     return p;
19   }
20   p = MALLOC(sizeof(Production));
21   memset(p, 0, sizeof(Production));
22   vec_add(&g->productions, p);
23   p->name = name;
24   p->name_len = strlen(name);
25   return p;
26 }
27 
new_elem()28 static Elem *new_elem() {
29   Elem *e = MALLOC(sizeof(Elem));
30   memset(e, 0, sizeof(Elem));
31   return e;
32 }
33 
new_rule(Grammar * g,Production * p)34 Rule *new_rule(Grammar *g, Production *p) {
35   Rule *r = MALLOC(sizeof(Rule));
36   memset(r, 0, sizeof(Rule));
37   r->prod = p;
38   r->end = new_elem();
39   r->end->kind = ELEM_END;
40   r->end->rule = r;
41   r->action_index = g->action_index;
42   return r;
43 }
44 
new_term()45 static Term *new_term() {
46   Term *term = MALLOC(sizeof(Term));
47   memset(term, 0, sizeof(Term));
48   return term;
49 }
50 
new_elem_term(Term * t,Rule * r)51 static Elem *new_elem_term(Term *t, Rule *r) {
52   Elem *e = new_elem();
53   e->kind = ELEM_TERM;
54   e->e.term = t;
55   e->rule = r;
56   vec_add(&r->elems, e);
57   return e;
58 }
59 
new_elem_nterm(Production * p,Rule * r)60 Elem *new_elem_nterm(Production *p, Rule *r) {
61   Elem *e = new_elem();
62   e->kind = ELEM_NTERM;
63   e->e.nterm = p;
64   e->rule = r;
65   return e;
66 }
67 
new_term_string(Grammar * g,char * s,char * e,Rule * r)68 static Elem *new_term_string(Grammar *g, char *s, char *e, Rule *r) {
69   Term *t = new_term();
70   Elem *elem;
71 
72   t->string = MALLOC(e - s + 1);
73   memcpy(t->string, s, e - s);
74   t->string[e - s] = 0;
75   t->string_len = e - s;
76   vec_add(&g->terminals, t);
77   elem = new_elem_term(t, r);
78   return elem;
79 }
80 
escape_string_for_regex(const char * s)81 char *escape_string_for_regex(const char *s) {
82   char *ss = (char *)MALLOC((strlen(s) + 1) * 2), *sss = ss;
83   for (; *s; s++) {
84     switch (*s) {
85       case '(':
86       case ')':
87       case '[':
88       case ']':
89       case '-':
90       case '^':
91       case '*':
92       case '?':
93       case '+':
94         *ss++ = '\\';
95         /* fall through */
96       default:
97         *ss++ = *s;
98         break;
99     }
100   }
101   *ss = 0;
102   return sss;
103 }
104 
unescape_term_string(Term * t)105 static void unescape_term_string(Term *t) {
106   char *s, *start = 0, *ss;
107   int length, base = 0;
108 
109   for (ss = s = t->string; *s; s++) {
110     if (*s == '\\') {
111       switch (s[1]) {
112         case '\\':
113           if (t->kind == TERM_STRING) {
114             *ss = '\\';
115             s++;
116             break;
117           } else
118             goto Ldefault;
119         case 'b':
120           *ss = '\b';
121           s++;
122           break;
123         case 'f':
124           *ss = '\f';
125           s++;
126           break;
127         case 'n':
128           *ss = '\n';
129           s++;
130           break;
131         case 'r':
132           *ss = '\r';
133           s++;
134           break;
135         case 't':
136           *ss = '\t';
137           s++;
138           break;
139         case 'v':
140           *ss = '\v';
141           s++;
142           break;
143         case 'a':
144           *ss = '\a';
145           s++;
146           break;
147         case 'c':
148           *ss = 0;
149           return;
150         case '\"':
151           if (t->kind == TERM_REGEX) {
152             *ss = '\"';
153             s++;
154             break;
155           } else
156             goto Ldefault;
157         case '\'':
158           if (t->kind == TERM_STRING) {
159             *ss = '\'';
160             s++;
161             break;
162           } else
163             goto Ldefault;
164         case 'x':
165           length = 0;
166           if (isxdigit_(s[2])) {
167             base = 16;
168             start = s + 2;
169             length++;
170             if (isxdigit_(s[3])) length++;
171           }
172           s += length + 1;
173           goto Lncont;
174         case 'd':
175           length = 0;
176           if (isdigit_(s[2])) {
177             base = 10;
178             start = s + 2;
179             length++;
180             if (isdigit_(s[3])) {
181               length++;
182               if (isdigit_(s[4]) &&
183                   ((s[2] < '2') || ((s[2] == '2') && ((s[3] < '5') || ((s[3] == '5') && (s[4] < '6'))))))
184                 length++;
185             }
186           }
187           s += length + 1;
188           goto Lncont;
189         case '0':
190         case '1':
191         case '2':
192         case '3':
193         case '4':
194         case '5':
195         case '6':
196         case '7':
197           length = 1;
198           base = 8;
199           start = s + 1;
200           if (isdigit_(s[2]) && (s[2] != '8') && (s[2] != '9')) {
201             length++;
202             if (isdigit_(s[3]) && (s[3] != '8') && (s[3] != '9')) {
203               length++;
204             }
205           }
206           s += length;
207           /* fall through */
208         Lncont:
209           if (length > 0) {
210             char saved_c = start[length];
211             start[length] = '\0';
212             *ss = (unsigned char)strtol(start, NULL, base);
213             start[length] = saved_c;
214             if (*s > 0) break;
215             d_fail("encountered an escaped NULL while processing '%s'", t->string);
216           } else
217             goto next;
218         Ldefault:
219         default:
220           *ss++ = *s;
221           *ss = s[1];
222           s++;
223           break;
224       }
225     } else
226       *ss = *s;
227     ss++;
228   next:;
229   }
230   *ss = 0;
231   t->string_len = strlen(t->string);
232   if (!t->string_len) d_fail("empty string after unescape '%s'", t->string);
233 }
234 
new_string(Grammar * g,char * s,char * e,Rule * r)235 Elem *new_string(Grammar *g, char *s, char *e, Rule *r) {
236   Elem *x = new_term_string(g, s + 1, e - 1, r);
237   x->e.term->kind = (*s == '"') ? TERM_REGEX : TERM_STRING;
238   unescape_term_string(x->e.term);
239   return x;
240 }
241 
new_utf8_char(Grammar * g,char * s,char * e,Rule * r)242 Elem *new_utf8_char(Grammar *g, char *s, char *e, Rule *r) {
243   char utf8_code[4];
244   unsigned long utf32_code, base, len = 0;
245   Elem *x;
246   for (utf32_code = 0, base = 1; e >= s + 3; base *= 16) {
247     e--;
248     if (*e >= '0' && *e <= '9')
249       utf32_code += base * (*e - '0');
250     else if (*e >= 'a' && *e <= 'f')
251       utf32_code += base * (*e - 'a' + 10);
252     else if (*e >= 'A' && *e <= 'F')
253       utf32_code += base * (*e - 'A' + 10);
254   }
255   if (utf32_code < 0x80) {
256     utf8_code[0] = utf32_code;
257     len = 1;
258   } else if (utf32_code < 0x800) {
259     utf8_code[0] = 0xc0 | ((utf32_code >> 6) & 0x1f);
260     utf8_code[1] = 0x80 | (utf32_code & 0x3f);
261     len = 2;
262   } else if (utf32_code < 0x10000) {
263     utf8_code[0] = 0xe0 | ((utf32_code >> 12) & 0x0f);
264     utf8_code[1] = 0x80 | ((utf32_code >> 6) & 0x3f);
265     utf8_code[2] = 0x80 | (utf32_code & 0x3f);
266     len = 3;
267   } else if (utf32_code < 0x02000000) {
268     utf8_code[0] = 0xf0 | ((utf32_code >> 18) & 0x07);
269     utf8_code[1] = 0x80 | ((utf32_code >> 12) & 0x3f);
270     utf8_code[2] = 0x80 | ((utf32_code >> 6) & 0x3f);
271     utf8_code[3] = 0x80 | (utf32_code & 0x3f);
272     len = 4;
273   } else {
274     d_fail("UTF32 Unicode value U+%8X too large for valid UTF-8 encoding (cf. Unicode Spec 4.0, section 3.9)",
275            utf32_code);
276   }
277   x = new_term_string(g, utf8_code, utf8_code + len, r);
278   x->e.term->kind = TERM_STRING;
279   return x;
280 }
281 
new_ident(char * s,char * e,Rule * r)282 Elem *new_ident(char *s, char *e, Rule *r) {
283   Elem *x = new_elem();
284   x->kind = ELEM_UNRESOLVED;
285   x->e.unresolved.string = dup_str(s, e);
286   x->e.unresolved.len = strlen(x->e.unresolved.string);
287   x->rule = r;
288   if (r) vec_add(&r->elems, x);
289   return x;
290 }
291 
new_token(Grammar * g,char * s,char * e)292 void new_token(Grammar *g, char *s, char *e) {
293   Term *t = new_term();
294   t->string = MALLOC(e - s + 1);
295   memcpy(t->string, s, e - s);
296   t->string[e - s] = 0;
297   t->string_len = e - s;
298   vec_add(&g->terminals, t);
299   t->kind = TERM_TOKEN;
300 }
301 
new_code(Grammar * g,char * s,char * e,Rule * r)302 Elem *new_code(Grammar *g, char *s, char *e, Rule *r) {
303   Elem *x = new_term_string(g, s, e, r);
304   x->e.term->kind = TERM_CODE;
305   return x;
306 }
307 
dup_elem(Elem * e,Rule * r)308 Elem *dup_elem(Elem *e, Rule *r) {
309   Elem *ee = MALLOC(sizeof(Elem));
310   memcpy(ee, e, sizeof(Elem));
311   if (ee->kind == ELEM_UNRESOLVED) ee->e.unresolved.string = dup_str(e->e.unresolved.string, 0);
312   ee->rule = r;
313   return ee;
314 }
315 
add_global_code(Grammar * g,char * start,char * end,int line)316 void add_global_code(Grammar *g, char *start, char *end, int line) {
317   if (!g->code)
318     g->code = MALLOC(sizeof(Code) * 4);
319   else if (!((g->ncode + 1) & 4))
320     g->code = REALLOC(g->code, sizeof(Code) * (g->ncode + 4));
321   g->code[g->ncode].code = dup_str(start, end);
322   g->code[g->ncode].line = line;
323   g->ncode++;
324 }
325 
new_declaration(Grammar * g,Elem * e,uint kind)326 void new_declaration(Grammar *g, Elem *e, uint kind) {
327   Declaration *d = MALLOC(sizeof(*d));
328   d->elem = e;
329   d->kind = kind;
330   d->index = g->declarations.n;
331   vec_add(&g->declarations, d);
332 }
333 
add_declaration(Grammar * g,char * start,char * end,uint kind,uint line)334 void add_declaration(Grammar *g, char *start, char *end, uint kind, uint line) {
335   if (start == end) {
336     switch (kind) {
337       case DECLARE_SET_OP_PRIORITY:
338         g->set_op_priority_from_rule = 1;
339         return;
340       case DECLARE_STATES_FOR_ALL_NTERMS:
341         g->states_for_all_nterms = 1;
342         return;
343       case DECLARE_LONGEST_MATCH:
344         g->longest_match = 1;
345         break;
346       case DECLARE_ALL_MATCHES:
347         g->longest_match = 0;
348         break;
349       case DECLARE_TOKENIZE:
350         g->tokenizer = 1;
351         break;
352       case DECLARE_SAVE_PARSE_TREE:
353         g->save_parse_tree = 1;
354         return;
355       default:
356         d_fail("declare expects argument, line %d", line);
357     }
358   }
359   switch (kind) {
360     case DECLARE_WHITESPACE:
361       g->default_white_space = dup_str(start, end);
362       return;
363     case DECLARE_SET_OP_PRIORITY:
364       d_fail("declare does not expect argument, line %d", line);
365     default:
366       new_declaration(g, new_ident(start, end, NULL), kind);
367       break;
368   }
369 }
370 
find_pass(Grammar * g,char * start,char * end)371 D_Pass *find_pass(Grammar *g, char *start, char *end) {
372   uint i, l;
373   while (*start && isspace_(*start)) start++;
374   l = end - start;
375   for (i = 0; i < g->passes.n; i++)
376     if (l == g->passes.v[i]->name_len && !strncmp(g->passes.v[i]->name, start, l)) return g->passes.v[i];
377   return NULL;
378 }
379 
add_pass(Grammar * g,char * start,char * end,uint kind,uint line)380 void add_pass(Grammar *g, char *start, char *end, uint kind, uint line) {
381   if (find_pass(g, start, end))
382     d_fail("duplicate pass '%s' line %d", dup_str(start, end), line);
383   else {
384     D_Pass *p = MALLOC(sizeof(*p));
385     p->name = dup_str(start, end);
386     p->name_len = end - start;
387     p->kind = kind;
388     p->index = g->pass_index++;
389     vec_add(&g->passes, p);
390   }
391 }
392 
add_pass_code(Grammar * g,Rule * r,char * pass_start,char * pass_end,char * code_start,char * code_end,uint pass_line,uint code_line)393 void add_pass_code(Grammar *g, Rule *r, char *pass_start, char *pass_end, char *code_start, char *code_end,
394                    uint pass_line, uint code_line) {
395   D_Pass *p = find_pass(g, pass_start, pass_end);
396   if (!p) d_fail("unknown pass '%s' line %d", dup_str(pass_start, pass_end), pass_line);
397   while (r->pass_code.n <= p->index) vec_add(&r->pass_code, NULL);
398   r->pass_code.v[p->index] = MALLOC(sizeof(Code));
399   r->pass_code.v[p->index]->code = dup_str(code_start, code_end);
400   r->pass_code.v[p->index]->line = code_line;
401 }
402 
new_internal_production(Grammar * g,Production * p)403 Production *new_internal_production(Grammar *g, Production *p) {
404   char *n = p ? p->name : " _synthetic";
405   char *name = MALLOC(strlen(n) + 21);
406   Production *pp = NULL, *tp = NULL, *ttp;
407   uint i, found = 0;
408   sprintf(name, "%s__%d", n, g->productions.n);
409   pp = new_production(g, name);
410   pp->internal = INTERNAL_HIDDEN;
411   pp->regex = p ? p->regex : 0;
412   if (p) {
413     for (i = 0; i < g->productions.n; i++) {
414       if (found) {
415         ttp = g->productions.v[i];
416         g->productions.v[i] = tp;
417         tp = ttp;
418       } else if (p == g->productions.v[i]) {
419         found = 1;
420         tp = g->productions.v[i + 1];
421         g->productions.v[i + 1] = pp;
422         i++;
423       }
424     }
425   }
426   return pp;
427 }
428 
conditional_EBNF(Grammar * g)429 void conditional_EBNF(Grammar *g) {
430   Production *pp;
431   Rule *rr;
432 
433   pp = new_internal_production(g, g->p);
434   pp->internal = INTERNAL_CONDITIONAL;
435   rr = new_rule(g, pp);
436   vec_add(&rr->elems, last_elem(g->r));
437   last_elem(g->r)->rule = rr;
438   rr->elems.v[rr->elems.n - 1]->rule = rr;
439   vec_add(&pp->rules, rr);
440   vec_add(&pp->rules, new_rule(g, pp));
441   last_elem(g->r) = new_elem_nterm(pp, g->r);
442 }
443 
star_EBNF(Grammar * g)444 void star_EBNF(Grammar *g) {
445   Production *pp;
446   Rule *rr;
447 
448   pp = new_internal_production(g, g->p);
449   pp->internal = INTERNAL_STAR;
450   rr = new_rule(g, pp);
451   if (!g->right_recursive_BNF) {
452     vec_add(&rr->elems, new_elem_nterm(pp, rr));
453     vec_add(&rr->elems, last_elem(g->r));
454     last_elem(g->r) = new_elem_nterm(pp, g->r);
455     last_elem(rr)->rule = rr;
456   } else {
457     vec_add(&rr->elems, last_elem(g->r));
458     last_elem(g->r) = new_elem_nterm(pp, g->r);
459     last_elem(rr)->rule = rr;
460     vec_add(&rr->elems, new_elem_nterm(pp, rr));
461   }
462   vec_add(&pp->rules, rr);
463   vec_add(&pp->rules, new_rule(g, pp));
464 }
465 
plus_EBNF(Grammar * g)466 void plus_EBNF(Grammar *g) {
467   Production *pp;
468   Rule *rr;
469   Elem *elem;
470 
471   pp = new_internal_production(g, g->p);
472   pp->internal = INTERNAL_PLUS;
473   rr = new_rule(g, pp);
474   if (!g->right_recursive_BNF) {
475     elem = last_elem(g->r);
476     vec_add(&rr->elems, new_elem_nterm(pp, rr));
477     vec_add(&rr->elems, dup_elem(elem, rr));
478     last_elem(g->r) = new_elem_nterm(pp, g->r);
479     if (g->r->rule_priority) {
480       rr->rule_priority = g->r->rule_priority;
481       rr->rule_assoc = ASSOC_NARY_LEFT;
482     }
483   } else {
484     elem = last_elem(g->r);
485     vec_add(&rr->elems, dup_elem(elem, rr));
486     last_elem(g->r) = new_elem_nterm(pp, g->r);
487     vec_add(&rr->elems, new_elem_nterm(pp, rr));
488     if (g->r->rule_priority) {
489       rr->rule_priority = g->r->rule_priority;
490       rr->rule_assoc = ASSOC_NARY_RIGHT;
491     }
492   }
493   vec_add(&pp->rules, rr);
494   rr = new_rule(g, pp);
495   vec_add(&rr->elems, elem);
496   elem->rule = rr;
497   vec_add(&pp->rules, rr);
498 }
499 
rep_EBNF(Grammar * g,int min,int max)500 void rep_EBNF(Grammar *g, int min, int max) {
501   Production *pp;
502   Rule *rr;
503   Elem *elem;
504   int i, j;
505   if (max < min) max = min;
506 
507   pp = new_internal_production(g, g->p);
508   elem = last_elem(g->r);
509   for (i = min; i <= max; i++) {
510     rr = new_rule(g, pp);
511     for (j = 0; j < i; j++) vec_add(&rr->elems, dup_elem(elem, rr));
512     vec_add(&pp->rules, rr);
513   }
514   last_elem(g->r) = new_elem_nterm(pp, g->r);
515   FREE(elem);
516 }
517 
initialize_productions(Grammar * g)518 void initialize_productions(Grammar *g) {
519   Production *pp = new_production(g, dup_str("0 Start", 0));
520   pp->internal = INTERNAL_HIDDEN;
521 }
522 
finish_productions(Grammar * g)523 void finish_productions(Grammar *g) {
524   Production *pp = g->productions.v[0];
525   Rule *rr = new_rule(g, pp);
526   vec_add(&rr->elems, new_elem_nterm(NULL, rr));
527   vec_add(&pp->rules, rr);
528   rr->elems.v[0]->e.nterm = g->productions.v[1];
529 }
530 
lookup_production(Grammar * g,char * name,uint l)531 Production *lookup_production(Grammar *g, char *name, uint l) {
532   uint i;
533 
534   for (i = 0; i < g->productions.n; i++) {
535     Production *pp = g->productions.v[i];
536     if (pp->name_len != l || strncmp(pp->name, name, l)) continue;
537     return pp;
538   }
539   return NULL;
540 }
541 
lookup_token(Grammar * g,char * name,int l)542 static Term *lookup_token(Grammar *g, char *name, int l) {
543   uint i;
544 
545   for (i = 0; i < g->terminals.n; i++) {
546     Term *t = g->terminals.v[i];
547     if (t->kind != TERM_TOKEN || t->string_len != l || strncmp(t->string, name, l)) continue;
548     return t;
549   }
550   return NULL;
551 }
552 
unique_term(Grammar * g,Term * t)553 static Term *unique_term(Grammar *g, Term *t) {
554   uint i;
555   for (i = 0; i < g->terminals.n; i++)
556     if (t->kind == g->terminals.v[i]->kind && t->string_len == g->terminals.v[i]->string_len &&
557         t->term_priority == g->terminals.v[i]->term_priority && t->term_name == g->terminals.v[i]->term_name &&
558         (!g->set_op_priority_from_rule ||
559          (t->op_assoc == g->terminals.v[i]->op_assoc && t->op_priority == g->terminals.v[i]->op_priority)) &&
560         !strncmp(t->string, g->terminals.v[i]->string, t->string_len))
561       return g->terminals.v[i];
562   return t;
563 }
564 
compute_nullable(Grammar * g)565 static void compute_nullable(Grammar *g) {
566   uint i, j, k, changed = 1;
567   Elem *e;
568 
569   /* ensure that the trivial case is the first cause */
570   for (i = 0; i < g->productions.n; i++) {
571     for (j = 0; j < g->productions.v[i]->rules.n; j++)
572       if (!g->productions.v[i]->rules.v[j]->elems.n) {
573         g->productions.v[i]->nullable = g->productions.v[i]->rules.v[j];
574         break;
575       }
576   }
577   /* transitive closure */
578   while (changed) {
579     changed = 0;
580     for (i = 0; i < g->productions.n; i++) {
581       if (!g->productions.v[i]->nullable)
582         for (j = 0; j < g->productions.v[i]->rules.n; j++) {
583           for (k = 0; k < g->productions.v[i]->rules.v[j]->elems.n; k++) {
584             e = g->productions.v[i]->rules.v[j]->elems.v[k];
585             if (e->kind != ELEM_NTERM || !e->e.nterm->nullable) goto Lnot_nullable;
586           }
587           changed = 1;
588           g->productions.v[i]->nullable = g->productions.v[i]->rules.v[j];
589           break;
590         }
591     Lnot_nullable:;
592     }
593   }
594 }
595 
596 /*
597   verify and cleanup the grammar datastructures
598   - resolve non-terminals
599   - set element indexes
600 */
resolve_grammar(Grammar * g)601 static void resolve_grammar(Grammar *g) {
602   uint i, j, k, l;
603   Production *p, *pp;
604   Rule *r;
605   Elem *e;
606   Term *last_term, *t;
607 
608   g->rule_index = 0;
609   for (i = 0; i < g->productions.n; i++) {
610     p = g->productions.v[i];
611     if (p != lookup_production(g, p->name, p->name_len)) d_fail("duplicate production '%s'", p->name);
612     p->index = i;
613     for (j = 0; j < p->rules.n; j++) {
614       r = p->rules.v[j];
615       r->index = g->rule_index++;
616       last_term = NULL;
617       for (k = 0; k < r->elems.n; k++) {
618         e = r->elems.v[k];
619         e->index = k;
620         if (e->kind == ELEM_UNRESOLVED) {
621           l = e->e.unresolved.len;
622           if ((pp = lookup_production(g, e->e.unresolved.string, l))) {
623             FREE(e->e.unresolved.string);
624             e->e.unresolved.string = 0;
625             e->kind = ELEM_NTERM;
626             e->e.nterm = pp;
627           } else if ((t = lookup_token(g, e->e.unresolved.string, l))) {
628             FREE(e->e.unresolved.string);
629             e->e.unresolved.string = 0;
630             e->kind = ELEM_TERM;
631             e->e.term = t;
632           } else {
633             char str[256];
634             strncpy(str, e->e.unresolved.string, l);
635             str[l < 255 ? l : 255] = 0;
636             d_fail("unresolved identifier: '%s'", str);
637           }
638         }
639         if (e->kind == ELEM_TERM) last_term = e->e.term;
640       }
641       r->end->index = r->elems.n;
642       if (g->set_op_priority_from_rule) {
643         if (last_term && r->rule_assoc) {
644           last_term->op_assoc = r->rule_assoc;
645           last_term->op_priority = r->rule_priority;
646         }
647       }
648     }
649   }
650   for (i = 0; i < g->terminals.n; i++) g->terminals.v[i]->index = i;
651   compute_nullable(g);
652 }
653 
merge_identical_terminals(Grammar * g)654 static void merge_identical_terminals(Grammar *g) {
655   uint i, j, k;
656   Production *p;
657   Rule *r;
658   Elem *e;
659 
660   for (i = 0; i < g->productions.n; i++) {
661     p = g->productions.v[i];
662     for (j = 0; j < p->rules.n; j++) {
663       r = p->rules.v[j];
664       for (k = 0; k < r->elems.n; k++) {
665         e = r->elems.v[k];
666         if (e->kind == ELEM_TERM) e->e.term = unique_term(g, e->e.term);
667       }
668     }
669   }
670 }
671 
print_term(Term * t)672 void print_term(Term *t) {
673   char *s = t->string ? escape_string(t->string) : NULL;
674   if (t->term_name)
675     printf("term_name(\"%s\") ", t->term_name);
676   else if (t->kind == TERM_STRING) {
677     if (!t->string || !*t->string)
678       printf("<EOF> ");
679     else
680       printf("string(\"%s\") ", s);
681   } else if (t->kind == TERM_REGEX) {
682     printf("regex(\"%s\") ", s);
683   } else if (t->kind == TERM_CODE)
684     printf("code(\"%s\") ", s);
685   else if (t->kind == TERM_TOKEN)
686     printf("token(\"%s\") ", s);
687   else
688     d_fail("unknown token kind");
689   if (s) FREE(s);
690 }
691 
print_elem(Elem * ee)692 void print_elem(Elem *ee) {
693   if (ee->kind == ELEM_TERM)
694     print_term(ee->e.term);
695   else if (ee->kind == ELEM_UNRESOLVED)
696     printf("%s ", ee->e.unresolved.string);
697   else
698     printf("%s ", ee->e.nterm->name);
699 }
700 
701 struct EnumStr {
702   uint e;
703   char *s;
704 } assoc_strings[] = {{ASSOC_NONE, "$none"},
705                      {ASSOC_NARY_LEFT, "$left"},
706                      {ASSOC_NARY_RIGHT, "$right"},
707                      {ASSOC_UNARY_LEFT, "$unary_left"},
708                      {ASSOC_UNARY_RIGHT, "$unary_right"},
709                      {ASSOC_BINARY_LEFT, "$binary_left"},
710                      {ASSOC_BINARY_RIGHT, "$binary_right"},
711                      {ASSOC_NO, "$noassoc"}};
712 
assoc_str(uint e)713 static char *assoc_str(uint e) {
714   uint i;
715 
716   for (i = 0; i < sizeof(assoc_strings) / sizeof(assoc_strings[0]); i++)
717     if (e == assoc_strings[i].e) return assoc_strings[i].s;
718   return assoc_strings[0].s;
719 }
720 
print_rule(Rule * r)721 void print_rule(Rule *r) {
722   uint k;
723 
724   printf("%s: ", r->prod->name);
725   for (k = 0; k < r->elems.n; k++) print_elem(r->elems.v[k]);
726   if (r->speculative_code.code) printf("SPECULATIVE_CODE\n%s\nEND CODE\n", r->speculative_code.code);
727   if (r->final_code.code) printf("FINAL_CODE\n%s\nEND CODE\n", r->final_code.code);
728 }
729 
print_grammar(Grammar * g)730 void print_grammar(Grammar *g) {
731   uint i, j, k;
732   Production *pp;
733   Rule *rr;
734 
735   if (!g->productions.n) return;
736   printf("PRODUCTIONS\n\n");
737   for (i = 0; i < g->productions.n; i++) {
738     pp = g->productions.v[i];
739     printf("%s (%d)\n", pp->name, i);
740     for (j = 0; j < pp->rules.n; j++) {
741       rr = pp->rules.v[j];
742       if (!j)
743         printf("\t: ");
744       else
745         printf("\t| ");
746       for (k = 0; k < rr->elems.n; k++) print_elem(rr->elems.v[k]);
747       if (rr->op_priority) printf("op %d ", rr->op_priority);
748       if (rr->op_assoc) printf("%s ", assoc_str(rr->op_assoc));
749       if (rr->rule_priority) printf("rule %d ", rr->rule_priority);
750       if (rr->rule_assoc) printf("%s ", assoc_str(rr->rule_assoc));
751       if (rr->speculative_code.code) printf("%s ", rr->speculative_code.code);
752       if (rr->final_code.code) printf("%s ", rr->final_code.code);
753       printf("\n");
754     }
755     printf("\t;\n");
756     printf("\n");
757   }
758   printf("TERMINALS\n\n");
759   for (i = 0; i < g->terminals.n; i++) {
760     printf("\t");
761     print_term(g->terminals.v[i]);
762     printf("(%d)\n", i + g->productions.n);
763   }
764   printf("\n");
765 }
766 
print_item(Item * i)767 static void print_item(Item *i) {
768   uint j, end = 1;
769 
770   printf("\t%s: ", i->rule->prod->name);
771   for (j = 0; j < i->rule->elems.n; j++) {
772     Elem *e = i->rule->elems.v[j];
773     if (i == e) {
774       printf(". ");
775       end = 0;
776     }
777     print_elem(e);
778   }
779   if (end) printf(". ");
780   printf("\n");
781 }
782 
print_conflict(char * kind,uint * conflict)783 static void print_conflict(char *kind, uint *conflict) {
784   if (!*conflict) {
785     printf("  CONFLICT (before precedence and associativity)\n");
786     *conflict = 1;
787   }
788   printf("\t%s conflict ", kind);
789   printf("\n");
790 }
791 
print_state(State * s)792 static void print_state(State *s) {
793   uint j, conflict = 0;
794 
795   printf("STATE %d (%d ITEMS)%s\n", s->index, s->items.n, s->accept ? " ACCEPT" : "");
796   for (j = 0; j < s->items.n; j++) print_item(s->items.v[j]);
797   if (s->gotos.n) printf("  GOTO\n");
798   for (j = 0; j < s->gotos.n; j++) {
799     printf("\t");
800     print_elem(s->gotos.v[j]->elem);
801     printf(" : %d\n", s->gotos.v[j]->state->index);
802   }
803   printf("  ACTION\n");
804   for (j = 0; j < s->reduce_actions.n; j++) {
805     Action *a = s->reduce_actions.v[j];
806     printf("\t%s\t", action_types[a->kind]);
807     print_rule(a->rule);
808     printf("\n");
809   }
810   for (j = 0; j < s->shift_actions.n; j++) {
811     Action *a = s->shift_actions.v[j];
812     printf("\t%s\t", action_types[a->kind]);
813     if (a->kind == ACTION_SHIFT) {
814       print_term(a->term);
815       printf("%d", a->state->index);
816     }
817     printf("\n");
818   }
819   if (s->reduce_actions.n > 1) print_conflict("reduce/reduce", &conflict);
820   if (s->reduce_actions.n && s->shift_actions.n) print_conflict("shift/reduce", &conflict);
821   printf("\n");
822 }
823 
print_states(Grammar * g)824 void print_states(Grammar *g) {
825   uint i;
826 
827   for (i = 0; i < g->states.n; i++) print_state(g->states.v[i]);
828 }
829 
state_for_declaration(Grammar * g,uint iproduction)830 uint state_for_declaration(Grammar *g, uint iproduction) {
831   uint i;
832 
833   for (i = 0; i < g->declarations.n; i++)
834     if (g->declarations.v[i]->kind == DECLARE_STATE_FOR && g->declarations.v[i]->elem->e.nterm->index == iproduction)
835       return 1;
836   return 0;
837 }
838 
make_elems_for_productions(Grammar * g)839 static void make_elems_for_productions(Grammar *g) {
840   uint i, j, k, l;
841   Rule *rr;
842   Production *pp, *ppp;
843 
844   pp = g->productions.v[0];
845   for (i = 0; i < g->productions.n; i++)
846     if (!g->productions.v[i]->internal) {
847       if (g->states_for_all_nterms || state_for_declaration(g, i)) {
848         /* try to find an existing elem */
849         for (j = 0; j < g->productions.n; j++)
850           for (k = 0; k < g->productions.v[j]->rules.n; k++) {
851             rr = g->productions.v[j]->rules.v[k];
852             for (l = 0; l < rr->elems.n; l++)
853               if (rr->elems.v[l]->e.term_or_nterm == g->productions.v[i]) {
854                 g->productions.v[i]->elem = rr->elems.v[l];
855                 break;
856               }
857           }
858         if (j >= g->productions.n) { /* not found */
859           g->productions.v[i]->elem = new_elem_nterm(g->productions.v[i], new_rule(g, pp));
860           g->productions.v[i]->elem->rule->index = g->rule_index++; /* fake */
861         }
862       }
863     }
864   if (!g->states_for_all_nterms && g->states_for_whitespace &&
865       (ppp = lookup_production(g, "whitespace", sizeof("whitespace") - 1))) {
866     ppp->elem = new_elem_nterm(ppp, new_rule(g, pp));
867     ppp->elem->rule->index = g->rule_index++; /* fake */
868   }
869 }
870 
convert_regex_production_one(Grammar * g,Production * p)871 static void convert_regex_production_one(Grammar *g, Production *p) {
872   uint j, k, l;
873   Production *pp;
874   Rule *r, *rr;
875   Elem *e;
876   Term *t;
877   int circular = 0;
878   char *buf = 0, *b, *s;
879   int buf_len = 0;
880 
881   if (p->regex_term) /* already done */
882     return;
883   if (p->in_regex) d_fail("circular regex production '%s'", p->name);
884   p->in_regex = 1;
885   for (j = 0; j < p->rules.n; j++) {
886     r = p->rules.v[j];
887     if (r->final_code.code || (r->speculative_code.code && p->rules.n > 1))
888       d_fail("final and/or multi-rule code not permitted in regex productions '%s'", p->name);
889     for (k = 0; k < r->elems.n; k++) {
890       e = r->elems.v[k];
891       if (e->kind == ELEM_NTERM) {
892         if (!e->e.nterm->regex)
893           d_fail("regex production '%s' cannot invoke non-regex production '%s'", p->name, e->e.nterm->name);
894         pp = e->e.nterm;
895         for (l = 0; l < pp->rules.n; l++)
896           if (pp->rules.v[l]->speculative_code.code || pp->rules.v[l]->final_code.code)
897             d_fail("code not permitted in rule %d of regex productions '%s'", l, p->name);
898         if (p != pp) {
899           convert_regex_production_one(g, pp);
900           buf_len += pp->regex_term->string_len + 5;
901         } else {
902           circular = 1;
903           buf_len += 5;
904         }
905       } else { /* e->kind == ELEM_TERM */
906         if (e->e.term->kind == TERM_CODE || e->e.term->kind == TERM_TOKEN)
907           d_fail("regex production '%s' cannot include scanners or tokens");
908         buf_len += e->e.term->string_len + 5;
909       }
910     }
911   }
912   b = buf = (char *)MALLOC(buf_len + 1);
913   t = new_term();
914   t->kind = TERM_REGEX;
915   t->string = buf;
916   t->index = g->terminals.n;
917   t->regex_production = p;
918   vec_add(&g->terminals, t);
919   p->regex_term = t;
920   p->regex_term->term_name = dup_str(p->name, 0);
921   if (circular) { /* attempt to match to regex operators */
922     if (p->rules.n != 2)
923     Lfail:
924       d_fail("unable to resolve circular regex production: '%s'", p->name);
925     l = p->rules.v[0]->elems.n + p->rules.v[1]->elems.n;
926     if (l == 2 || l == 3) {
927       if (p->rules.v[0]->elems.n != 2 && p->rules.v[1]->elems.n != 2) goto Lfail;
928       r = p->rules.v[0]->elems.n == 2 ? p->rules.v[0] : p->rules.v[1];
929       rr = p->rules.v[0] == r ? p->rules.v[1] : p->rules.v[0];
930       if (r->elems.v[0]->e.nterm != p && r->elems.v[1]->e.nterm != p) goto Lfail;
931       e = r->elems.v[0]->e.nterm == p ? r->elems.v[1] : r->elems.v[1];
932       if (rr->elems.n && e->e.term_or_nterm != rr->elems.v[0]->e.term_or_nterm) goto Lfail;
933       t = e->kind == ELEM_TERM ? e->e.term : e->e.nterm->regex_term;
934       *b++ = '(';
935       if (t->kind == TERM_STRING)
936         s = escape_string_for_regex(t->string);
937       else
938         s = t->string;
939       memcpy(b, s, strlen(s));
940       b += strlen(s);
941       if (t->kind == TERM_STRING) FREE(s);
942       *b++ = ')';
943       if (l == 2)
944         *b++ = '*';
945       else
946         *b++ = '+';
947       *b = 0;
948       p->regex_term->string_len = strlen(p->regex_term->string);
949     } else
950       goto Lfail;
951   } else { /* handle the base case, p = (r | r'), r = (e e') */
952     if (p->rules.n > 1) *b++ = '(';
953     for (j = 0; j < p->rules.n; j++) {
954       r = p->rules.v[j];
955       if (r->elems.n > 1) *b++ = '(';
956       for (k = 0; k < r->elems.n; k++) {
957         e = r->elems.v[k];
958         t = e->kind == ELEM_TERM ? e->e.term : e->e.nterm->regex_term;
959         if (t->kind == TERM_STRING)
960           s = escape_string_for_regex(t->string);
961         else
962           s = t->string;
963         memcpy(b, s, strlen(s));
964         b += strlen(s);
965         if (t->kind == TERM_STRING) FREE(s);
966       }
967       if (r->elems.n > 1) *b++ = ')';
968       if (j != p->rules.n - 1) *b++ = '|';
969     }
970     if (p->rules.n > 1) *b++ = ')';
971     *b = 0;
972     p->regex_term->string_len = strlen(p->regex_term->string);
973   }
974   p->in_regex = 0;
975 }
976 
convert_regex_productions(Grammar * g)977 static void convert_regex_productions(Grammar *g) {
978   uint i, j, k;
979   Production *p;
980   Rule *r;
981 
982   for (i = 0; i < g->productions.n; i++) {
983     p = g->productions.v[i];
984     if (!p->regex) continue;
985     convert_regex_production_one(g, p);
986   }
987   for (i = 0; i < g->productions.n; i++) {
988     p = g->productions.v[i];
989     for (j = 0; j < p->rules.n; j++) {
990       r = p->rules.v[j];
991       for (k = 0; k < r->elems.n; k++) {
992         if (r->elems.v[k]->kind == ELEM_NTERM && r->elems.v[k]->e.nterm->regex_term) {
993           r->elems.v[k]->e.term = r->elems.v[k]->e.nterm->regex_term;
994           r->elems.v[k]->kind = ELEM_TERM;
995         }
996       }
997     }
998   }
999 }
1000 
check_default_actions(Grammar * g)1001 static void check_default_actions(Grammar *g) {
1002   Production *pdefault;
1003 
1004   pdefault = lookup_production(g, "_", 1);
1005   if (pdefault && pdefault->rules.n > 1) d_fail("number of rules in default action != 1");
1006 }
1007 
1008 typedef struct {
1009   struct State *eq;
1010   struct Rule *diff_rule;
1011   struct State *diff_state;
1012 } EqState;
1013 
build_eq(Grammar * g)1014 void build_eq(Grammar *g) {
1015   uint i, j, k, changed = 1, x, xx;
1016   State *s, *ss;
1017   EqState *eq, *e, *ee;
1018 
1019   eq = (EqState *)MALLOC(sizeof(EqState) * g->states.n);
1020   memset(eq, 0, sizeof(EqState) * g->states.n);
1021   while (changed) {
1022     changed = 0;
1023     for (i = 0; i < g->states.n; i++) {
1024       s = g->states.v[i];
1025       e = &eq[s->index];
1026       for (j = i + 1; j < g->states.n; j++) {
1027         ss = g->states.v[j];
1028         ee = &eq[ss->index];
1029         if (e->eq || ee->eq) continue;
1030         if (s->same_shifts != ss->same_shifts && ss->same_shifts != s) continue;
1031         /* check gotos */
1032         if (s->gotos.n != ss->gotos.n) continue;
1033         for (k = 0; k < s->gotos.n; k++) {
1034           if (elem_symbol(g, s->gotos.v[k]->elem) != elem_symbol(g, ss->gotos.v[k]->elem)) goto Lcontinue;
1035           if (s->gotos.v[k]->state != ss->gotos.v[k]->state) {
1036             EqState *ge = &eq[s->gotos.v[k]->state->index];
1037             EqState *gee = &eq[ss->gotos.v[k]->state->index];
1038             if (ge->eq != ss->gotos.v[k]->state && gee->eq != s->gotos.v[k]->state) goto Lcontinue;
1039             if ((ee->diff_state && ee->diff_state != eq[ss->gotos.v[k]->state->index].eq) ||
1040                 (e->diff_state && e->diff_state != eq[s->gotos.v[k]->state->index].eq))
1041               goto Lcontinue;
1042             /* allow one different state */
1043             ee->diff_state = ss->gotos.v[k]->state;
1044             e->diff_state = s->gotos.v[k]->state;
1045           }
1046         }
1047         /* check reductions */
1048         if (s->reduce_actions.n != ss->reduce_actions.n) continue;
1049         for (k = 0; k < s->reduce_actions.n; k++) {
1050           if (s->reduce_actions.v[k]->rule == ss->reduce_actions.v[k]->rule) continue;
1051           if (s->reduce_actions.v[k]->rule->prod != ss->reduce_actions.v[k]->rule->prod) goto Lcontinue;
1052           if ((x = s->reduce_actions.v[k]->rule->elems.n) != (xx = ss->reduce_actions.v[k]->rule->elems.n)) {
1053             if ((ee->diff_rule && ee->diff_rule != ss->reduce_actions.v[k]->rule) ||
1054                 (e->diff_rule && e->diff_rule != s->reduce_actions.v[k]->rule))
1055               goto Lcontinue;
1056             /* allow one different rule */
1057             ee->diff_rule = ss->reduce_actions.v[k]->rule;
1058             e->diff_rule = s->reduce_actions.v[k]->rule;
1059           }
1060         }
1061         ee->eq = s;
1062         changed = 1;
1063       Lcontinue:;
1064       }
1065     }
1066   }
1067   for (i = 0; i < g->states.n; i++) {
1068     s = g->states.v[i];
1069     e = &eq[s->index];
1070     if (e->eq) {
1071       if (d_verbose_level > 2) {
1072         printf("eq %d %d ", s->index, e->eq->index);
1073         if (e->diff_state) printf("diff state (%d %d) ", e->diff_state->index, eq[e->eq->index].diff_state->index);
1074         if (e->diff_rule) {
1075           printf("diff rule ");
1076           printf("[ ");
1077           print_rule(e->diff_rule);
1078           printf("][ ");
1079           print_rule(eq[e->eq->index].diff_rule);
1080           printf("]");
1081         }
1082         printf("\n");
1083       }
1084     }
1085   }
1086   for (i = 0; i < g->states.n; i++) {
1087     s = g->states.v[i];
1088     e = &eq[s->index];
1089     if (e->eq && e->diff_state) {
1090       if (eq[e->diff_state->index].diff_rule && eq[e->diff_state->index].diff_rule->elems.n == 2) {
1091         s->reduces_to = e->eq;
1092         s->reduces_with = eq[e->eq->index].diff_rule;
1093         s->reduces_to_then_with = e->diff_rule;
1094       } else if (eq[eq[e->eq->index].diff_state->index].diff_rule &&
1095                  eq[eq[e->eq->index].diff_state->index].diff_rule->elems.n == 2) {
1096         e->eq->reduces_to = s;
1097         s->reduces_with = e->diff_rule;
1098         s->reduces_to_then_with = eq[e->eq->index].diff_rule;
1099       }
1100     }
1101   }
1102   for (i = 0; i < g->states.n; i++) {
1103     s = g->states.v[i];
1104     if (s->reduces_to)
1105       if (d_verbose_level) printf("reduces_to %d %d\n", s->index, s->reduces_to->index);
1106   }
1107   FREE(eq);
1108 }
1109 
new_D_Grammar(char * pathname)1110 Grammar *new_D_Grammar(char *pathname) {
1111   Grammar *g = (Grammar *)MALLOC(sizeof(Grammar));
1112   memset(g, 0, sizeof(Grammar));
1113   g->pathname = dup_str(pathname, pathname + strlen(pathname));
1114   return g;
1115 }
1116 
free_rule(Rule * r)1117 static void free_rule(Rule *r) {
1118   uint i;
1119   FREE(r->end);
1120   if (r->final_code.code) FREE(r->final_code.code);
1121   if (r->speculative_code.code) FREE(r->speculative_code.code);
1122   vec_free(&r->elems);
1123   for (i = 0; i < r->pass_code.n; i++) {
1124     FREE(r->pass_code.v[i]->code);
1125     FREE(r->pass_code.v[i]);
1126   }
1127   vec_free(&r->pass_code);
1128   FREE(r);
1129 }
1130 
free_D_Grammar(Grammar * g)1131 void free_D_Grammar(Grammar *g) {
1132   uint i, j, k;
1133 
1134   for (i = 0; i < g->productions.n; i++) {
1135     Production *p = g->productions.v[i];
1136     for (j = 0; j < p->rules.n; j++) {
1137       Rule *r = p->rules.v[j];
1138       if (r == g->r) g->r = 0;
1139       for (k = 0; k < r->elems.n; k++) {
1140         Elem *e = r->elems.v[k];
1141         if (e == p->elem) p->elem = 0;
1142         FREE(e);
1143       }
1144       if (r->end == p->elem) p->elem = 0;
1145       free_rule(r);
1146     }
1147     vec_free(&p->rules);
1148     FREE(p->name);
1149     if (p->elem) {
1150       free_rule(p->elem->rule);
1151       FREE(p->elem);
1152     }
1153     FREE(p);
1154   }
1155   vec_free(&g->productions);
1156   for (i = 0; i < g->terminals.n; i++) {
1157     Term *t = g->terminals.v[i];
1158     if (t->string) FREE(t->string);
1159     if (t->term_name) FREE(t->term_name);
1160     FREE(t);
1161   }
1162   vec_free(&g->terminals);
1163   for (i = 0; i < g->actions.n; i++) free_Action(g->actions.v[i]);
1164   vec_free(&g->actions);
1165   if (g->scanner.code) FREE(g->scanner.code);
1166   for (i = 0; i < g->states.n; i++) {
1167     State *s = g->states.v[i];
1168     vec_free(&s->items);
1169     vec_free(&s->items_hash);
1170     for (j = 0; j < s->gotos.n; j++) {
1171       FREE(s->gotos.v[j]->elem);
1172       FREE(s->gotos.v[j]);
1173     }
1174     vec_free(&s->gotos);
1175     vec_free(&s->shift_actions);
1176     vec_free(&s->reduce_actions);
1177     for (j = 0; j < s->right_epsilon_hints.n; j++) FREE(s->right_epsilon_hints.v[j]);
1178     vec_free(&s->right_epsilon_hints);
1179     for (j = 0; j < s->error_recovery_hints.n; j++) FREE(s->error_recovery_hints.v[j]);
1180     vec_free(&s->error_recovery_hints);
1181     if (!s->same_shifts) {
1182       for (j = 0; j < s->scanner.states.n; j++) {
1183         vec_free(&s->scanner.states.v[j]->accepts);
1184         vec_free(&s->scanner.states.v[j]->live);
1185         FREE(s->scanner.states.v[j]);
1186       }
1187       vec_free(&s->scanner.states);
1188       for (j = 0; j < s->scanner.transitions.n; j++)
1189         if (s->scanner.transitions.v[j]) {
1190           vec_free(&s->scanner.transitions.v[j]->live_diff);
1191           vec_free(&s->scanner.transitions.v[j]->accepts_diff);
1192           FREE(s->scanner.transitions.v[j]);
1193         }
1194       vec_free(&s->scanner.transitions);
1195     }
1196     FREE(s->goto_valid);
1197     FREE(s);
1198   }
1199   vec_free(&g->states);
1200   for (i = 0; i < g->ncode; i++) FREE(g->code[i].code);
1201   FREE(g->code);
1202   for (i = 0; i < g->declarations.n; i++) {
1203     FREE(g->declarations.v[i]->elem);
1204     FREE(g->declarations.v[i]);
1205   }
1206   vec_free(&g->declarations);
1207   for (i = 0; i < g->passes.n; i++) {
1208     FREE(g->passes.v[i]->name);
1209     FREE(g->passes.v[i]);
1210   }
1211   vec_free(&g->passes);
1212   for (i = 0; i < g->all_pathnames.n; i++) FREE(g->all_pathnames.v[i]);
1213   FREE(g->pathname);
1214   if (g->default_white_space) FREE(g->default_white_space);
1215   FREE(g);
1216 }
1217 
parse_grammar(Grammar * g,char * pathname,char * sarg)1218 int parse_grammar(Grammar *g, char *pathname, char *sarg) {
1219   D_Parser *p;
1220   int res = 0;
1221   char *s = sarg;
1222 
1223   vec_add(&g->all_pathnames, dup_str(pathname, 0));
1224   if (!s)
1225     if (!(s = sbuf_read(pathname))) return -1;
1226   if (!g->productions.n) initialize_productions(g);
1227   p = new_D_Parser(&parser_tables_dparser_gram, sizeof(D_ParseNode_User));
1228   p->initial_globals = g;
1229   p->loc.pathname = pathname;
1230   if (dparse(p, s, strlen(s))) {
1231     if (g->productions.n > 1) finish_productions(g);
1232   } else
1233     res = -1;
1234   if (!sarg) FREE(s);
1235   free_D_Parser(p);
1236   return res;
1237 }
1238 
scanner_declaration(Declaration * d)1239 static int scanner_declaration(Declaration *d) {
1240   switch (d->kind) {
1241     case DECLARE_TOKENIZE:
1242     case DECLARE_LONGEST_MATCH:
1243     case DECLARE_ALL_MATCHES:
1244       return 1;
1245     default:
1246       return 0;
1247   }
1248 }
1249 
set_declaration_group(Production * p,Production * root,Declaration * d)1250 static void set_declaration_group(Production *p, Production *root, Declaration *d) {
1251   uint i, j;
1252   if (p->declaration_group[d->kind] == root) return;
1253   if (d->kind == DECLARE_TOKENIZE && p->declaration_group[d->kind]) {
1254     d_fail("shared tokenize subtrees");
1255     return;
1256   }
1257   p->declaration_group[d->kind] = root;
1258   p->last_declaration[d->kind] = d;
1259   for (i = 0; i < p->rules.n; i++) {
1260     for (j = 0; j < p->rules.v[i]->elems.n; j++)
1261       if (p->rules.v[i]->elems.v[j]->kind == ELEM_NTERM)
1262         set_declaration_group(p->rules.v[i]->elems.v[j]->e.nterm, root, d);
1263   }
1264 }
1265 
propogate_declarations(Grammar * g)1266 static void propogate_declarations(Grammar *g) {
1267   uint i, j, k;
1268   Production *p, *start = g->productions.v[0];
1269   Rule *r;
1270   Elem *e;
1271 
1272   /* global defaults */
1273   if (g->tokenizer) new_declaration(g, new_elem_nterm(g->productions.v[0], NULL), DECLARE_TOKENIZE);
1274   if (g->longest_match) new_declaration(g, new_elem_nterm(g->productions.v[0], NULL), DECLARE_LONGEST_MATCH);
1275   /* resolve declarations */
1276   for (i = 0; i < g->declarations.n; i++) {
1277     e = g->declarations.v[i]->elem;
1278     if (e->kind == ELEM_UNRESOLVED) {
1279       if (e->e.unresolved.len == 0)
1280         p = g->productions.v[0];
1281       else if (!(p = lookup_production(g, e->e.unresolved.string, e->e.unresolved.len)))
1282         d_fail("unresolved declaration '%s'", e->e.unresolved.string);
1283       FREE(e->e.unresolved.string);
1284       e->e.unresolved.string = 0;
1285       e->kind = ELEM_NTERM;
1286       e->e.nterm = p;
1287     }
1288   }
1289   /* build declaration groups (covering a production subtrees) */
1290   for (i = 0; i < g->declarations.n; i++) {
1291     if (scanner_declaration(g->declarations.v[i])) {
1292       p = g->declarations.v[i]->elem->e.nterm;
1293       if (p == start) {
1294         for (j = 0; j < g->productions.n; j++) {
1295           g->productions.v[j]->declaration_group[g->declarations.v[i]->kind] = start;
1296           g->productions.v[j]->last_declaration[g->declarations.v[i]->kind] = g->declarations.v[i];
1297         }
1298       } else
1299         set_declaration_group(p, p, g->declarations.v[i]);
1300     }
1301   }
1302   /* set terminal scan_kind */
1303   for (i = 0; i < g->productions.n; i++) {
1304     p = g->productions.v[i];
1305     for (j = 0; j < p->rules.n; j++) {
1306       r = p->rules.v[j];
1307       for (k = 0; k < r->elems.n; k++) {
1308         e = r->elems.v[k];
1309         if (e->kind == ELEM_TERM) {
1310           if (!p->declaration_group[DECLARE_LONGEST_MATCH] && !p->declaration_group[DECLARE_ALL_MATCHES])
1311             e->e.term->scan_kind = D_SCAN_DEFAULT;
1312           else if (p->declaration_group[DECLARE_LONGEST_MATCH] && !p->declaration_group[DECLARE_ALL_MATCHES])
1313             e->e.term->scan_kind = D_SCAN_LONGEST;
1314           else if (!p->declaration_group[DECLARE_LONGEST_MATCH] && p->declaration_group[DECLARE_ALL_MATCHES])
1315             e->e.term->scan_kind = D_SCAN_ALL;
1316           else {
1317             if (p->last_declaration[DECLARE_LONGEST_MATCH]->index > p->last_declaration[DECLARE_ALL_MATCHES]->index)
1318               e->e.term->scan_kind = D_SCAN_LONGEST;
1319             else
1320               e->e.term->scan_kind = D_SCAN_ALL;
1321           }
1322         }
1323       }
1324     }
1325   }
1326 }
1327 
merge_shift_actions(State * to,State * from)1328 static void merge_shift_actions(State *to, State *from) {
1329   uint i, j;
1330   for (i = 0; i < from->shift_actions.n; i++) {
1331     for (j = 0; j < to->shift_actions.n; j++)
1332       if (from->shift_actions.v[i]->term == to->shift_actions.v[j]->term) goto Lnext;
1333     vec_add(&to->shift_actions, from->shift_actions.v[i]);
1334   Lnext:;
1335   }
1336 }
1337 
compute_declaration_states(Grammar * g,Production * p,Declaration * d)1338 static void compute_declaration_states(Grammar *g, Production *p, Declaration *d) {
1339   State *s, *base_s = NULL;
1340   uint j, k, scanner = scanner_declaration(d);
1341   (void)p;
1342 
1343   for (j = 0; j < g->states.n; j++) {
1344     s = g->states.v[j];
1345     if (d->kind == DECLARE_TOKENIZE) {
1346       if (!base_s)
1347         base_s = s;
1348       else {
1349         s->same_shifts = base_s;
1350         merge_shift_actions(base_s, s);
1351       }
1352     }
1353     if (scanner) {
1354       for (k = 0; k < s->items.n; k++)
1355         if (s->items.v[k]->kind == ELEM_TERM) switch (s->items.v[k]->e.term->scan_kind) {
1356             case D_SCAN_LONGEST:
1357               if (s->scan_kind == D_SCAN_RESERVED || s->scan_kind == D_SCAN_LONGEST)
1358                 s->scan_kind = D_SCAN_LONGEST;
1359               else
1360                 s->scan_kind = D_SCAN_MIXED;
1361               break;
1362             case D_SCAN_ALL:
1363               if (s->scan_kind == D_SCAN_RESERVED || s->scan_kind == D_SCAN_ALL)
1364                 s->scan_kind = D_SCAN_ALL;
1365               else
1366                 s->scan_kind = D_SCAN_MIXED;
1367               break;
1368             default:
1369               break;
1370           }
1371     }
1372   }
1373 }
1374 
map_declarations_to_states(Grammar * g)1375 static void map_declarations_to_states(Grammar *g) {
1376   uint i;
1377   State *s;
1378 
1379   for (i = 0; i < g->states.n; i++) {
1380     s = g->states.v[i];
1381     s->scan_kind = D_SCAN_RESERVED;
1382   }
1383   /* map groups to sets of states */
1384   for (i = 0; i < g->declarations.n; i++)
1385     if (scanner_declaration(g->declarations.v[i]))
1386       compute_declaration_states(g, g->declarations.v[i]->elem->e.nterm, g->declarations.v[i]);
1387   for (i = 0; i < g->states.n; i++) {
1388     s = g->states.v[i];
1389     if (s->scan_kind == D_SCAN_RESERVED) s->scan_kind = D_SCAN_DEFAULT; /* set the default */
1390   }
1391 }
1392 
build_grammar(Grammar * g)1393 int build_grammar(Grammar *g) {
1394   resolve_grammar(g);
1395   convert_regex_productions(g);
1396   propogate_declarations(g);
1397   merge_identical_terminals(g);
1398   make_elems_for_productions(g);
1399   check_default_actions(g);
1400   build_LR_tables(g);
1401   map_declarations_to_states(g);
1402   if (d_verbose_level) {
1403     printf("%d productions %d terminals %d states %d declarations\n", g->productions.n, g->terminals.n, g->states.n,
1404            g->declarations.n);
1405   }
1406   if (d_verbose_level > 1) {
1407     print_grammar(g);
1408     print_states(g);
1409   }
1410   build_scanners(g);
1411   build_eq(g);
1412   return 0;
1413 }
1414 
1415 /*   Wlodek Bzyl, <matwb@univ.gda.pl>
1416  */
1417 
print_term_escaped(Term * t,int double_escaped)1418 static void print_term_escaped(Term *t, int double_escaped) {
1419   char *s = 0;
1420   if (t->term_name) {
1421     printf("%s ", t->term_name);
1422   } else if (t->kind == TERM_STRING) {
1423     s = t->string ? escape_string_single_quote(t->string) : NULL;
1424     if (!t->string || !*t->string)
1425       printf("<EOF> ");
1426     else {
1427       printf("'%s' ", double_escaped ? escape_string_single_quote(s) : s);
1428       if (t->ignore_case) printf("/i ");
1429       if (t->term_priority) printf("%sterm %d ", double_escaped ? "#" : "$", t->term_priority);
1430     }
1431   } else if (t->kind == TERM_REGEX) {
1432     char *quote = double_escaped ? "\\\"" : "\"";
1433     s = t->string ? escape_string(t->string) : NULL;
1434     /* char *s = t->string; // ? escape_string(t->string) : NULL; */
1435     printf("%s%s%s ", quote, double_escaped ? escape_string(s) : s, quote);
1436     if (t->ignore_case) printf("/i ");
1437     if (t->term_priority) printf("%sterm %d ", double_escaped ? "#" : "$", t->term_priority);
1438   } else if (t->kind == TERM_CODE) {
1439     s = t->string ? escape_string(t->string) : NULL;
1440     printf("code(\"%s\") ", s);
1441   } else if (t->kind == TERM_TOKEN) {
1442     s = t->string ? escape_string(t->string) : NULL;
1443     printf("%s ", s);
1444   } else
1445     d_fail("unknown token kind");
1446   if (s) FREE(s);
1447 }
1448 
1449 /* print_elem changed to call print_term_escaped */
print_element_escaped(Elem * ee,int double_escaped)1450 static void print_element_escaped(Elem *ee, int double_escaped) {
1451   if (ee->kind == ELEM_TERM)
1452     print_term_escaped(ee->e.term, double_escaped);
1453   else if (ee->kind == ELEM_UNRESOLVED)
1454     printf("%s ", ee->e.unresolved.string);
1455   else
1456     printf("%s ", ee->e.nterm->name);
1457 }
1458 
print_global_code(Grammar * g)1459 static void print_global_code(Grammar *g) {
1460   uint i;
1461   int print_stdio_h = 1;
1462   printf("%%<");
1463   for (i = 0; i < g->ncode; i++) {
1464     printf("%s", g->code[i].code);
1465     if (strstr(g->code[i].code, "<stdio.h>") != NULL) print_stdio_h = 0;
1466   }
1467   if (print_stdio_h) printf("\n  #include <stdio.h>\n");
1468   printf("%%>\n\n");
1469 }
1470 
print_production(Production * p)1471 static void print_production(Production *p) {
1472   uint j, k;
1473   Rule *r;
1474   char *opening[] = {"", "\n\t  [ printf(\"", "\n\t  { printf(\""};
1475   char *closing[] = {"\n", "\\n\"); ]\n", "\\n\"); }\n"};
1476   char *middle[] = {"\n\t:   ", "  <-  ", "  <=  "};
1477   char *assoc[] = {"$", "#", "#"};
1478   char *speculative_final_closing = "\\n\"); ]"; /* closing[1] without final newline */
1479   char *next_or_rule = "\t|   ";
1480   /*  char *regex_production = "  ::=  "; */
1481 
1482   uint variant = 0;
1483 
1484   for (j = 0; j < p->rules.n; j++) {
1485   Lmore:
1486     r = p->rules.v[j];
1487     if (!j) {
1488       /*
1489               if (p->regex) {
1490                 printf("%s%s%s", opening[variant], p->name, regex_production);
1491               } else {
1492       */
1493       printf("%s%s%s", opening[variant], p->name, middle[variant]);
1494       /*      } */
1495     } else {
1496       if (variant == 0)
1497         printf("%s", next_or_rule);
1498       else
1499         printf("%s%s%s", opening[variant], p->name, middle[variant]);
1500     }
1501 
1502     for (k = 0; k < r->elems.n; k++) print_element_escaped(r->elems.v[k], variant);
1503 
1504     if (r->op_assoc) printf(" %s%s ", assoc[variant], assoc_str(r->op_assoc) + 1);
1505     if (r->op_priority) printf("%d ", r->op_priority);
1506     if (r->rule_assoc) printf(" %s%s ", assoc[variant], assoc_str(r->rule_assoc) + 1);
1507     if (r->rule_priority) printf("%d ", r->rule_priority);
1508 
1509     if ((d_rdebug_grammar_level == 1 && variant == 0) || (d_rdebug_grammar_level == 3 && variant == 0)) {
1510       variant = 1;
1511       goto Lmore;
1512     }
1513     if ((d_rdebug_grammar_level == 2 && variant == 0) || (d_rdebug_grammar_level == 3 && variant == 1)) {
1514       if (variant == 1) printf("%s", speculative_final_closing);
1515       variant = 2;
1516       goto Lmore;
1517     }
1518 
1519     printf("%s", closing[variant]);
1520     variant = 0;
1521   }
1522   printf("\t;\n");
1523   printf("\n");
1524 }
1525 
print_productions(Grammar * g,char * pathname)1526 static void print_productions(Grammar *g, char *pathname) {
1527   uint i;
1528   if (!g->productions.n) {
1529     printf("/*\n  There were no productions in the grammar %s\n*/\n", pathname);
1530     return;
1531   }
1532   for (i = 1; i < g->productions.n; i++) print_production(g->productions.v[i]);
1533 }
1534 
print_declare(char * s,char * n)1535 static void print_declare(char *s, char *n) {
1536   while (*n && (isspace_(*n) || isdigit_(*n))) n++;
1537   printf(s, n);
1538 }
1539 
print_declarations(Grammar * g)1540 static void print_declarations(Grammar *g) {
1541   uint i;
1542 
1543   if (g->tokenizer) printf("${declare tokenize}\n");
1544   for (i = 0; i < g->declarations.n; i++) {
1545     Declaration *dd = g->declarations.v[i];
1546     Elem *ee = dd->elem;
1547     switch (dd->kind) {
1548       case DECLARE_LONGEST_MATCH:
1549         if (g->longest_match)
1550           printf("${declare longest_match}\n");
1551         else
1552           print_declare("${declare longest_match %s}\n", ee->e.nterm->name);
1553         break;
1554       case DECLARE_ALL_MATCHES:
1555         if (!g->longest_match)
1556           printf("${declare all_matches}\n");
1557         else
1558           print_declare("${declare all_matches %s}\n", ee->e.nterm->name);
1559         break;
1560       default:
1561         printf("\n/*\nDeclaration->kind: %d", dd->kind);
1562         printf("\nElem->kind:        %d\n*/\n", ee->kind);
1563     }
1564   }
1565   if (g->set_op_priority_from_rule) printf("${declare set_op_priority_from_rule}\n");
1566   if (g->states_for_all_nterms) printf("${declare all_subparsers}\n");
1567   /* todo: DECLARE_STATE_FOR */
1568   if (g->default_white_space) printf("${declare whitespace %s}\n", g->default_white_space);
1569   if (g->save_parse_tree) printf("${declare save_parse_tree}\n");
1570   /* todo: DECLARE_NUM */
1571 
1572   if (g->scanner.code) printf("${scanner %s}\n", g->scanner.code);
1573 
1574   {
1575     int token_exists = 0;
1576     for (i = 0; i < g->terminals.n; i++) {
1577       Term *t = g->terminals.v[i];
1578       if (t->kind == TERM_TOKEN) {
1579         printf("%s %s", token_exists ? "" : "${token", t->string);
1580         token_exists = 1;
1581       }
1582     }
1583     if (token_exists) printf("}\n");
1584   }
1585 
1586   printf("\n");
1587 }
1588 
print_rdebug_grammar(Grammar * g,char * pathname)1589 void print_rdebug_grammar(Grammar *g, char *pathname) {
1590   char ver[30];
1591   d_version(ver);
1592 
1593   printf("/*\n  Generated by Make DParser Version %s\n", ver);
1594   printf("  Available at http://dparser.sf.net\n*/\n\n");
1595 
1596   print_global_code(g);
1597   print_declarations(g);
1598   print_productions(g, pathname);
1599 }
1600