1 
2 /* Parser implementation */
3 
4 /* For a description, see the comments at end of this file */
5 
6 /* XXX To do: error recovery */
7 
8 #include "Python.h"
9 #include "../Include/pgenheaders.h"
10 #include "../Include/token.h"
11 #include "../Include/grammar.h"
12 #include "../Include/node.h"
13 #include "parser.h"
14 #include "../Include/errcode.h"
15 
16 
17 #ifdef Py_DEBUG
18 PyAPI_DATA(int) Py_DebugFlag;
19 #define D(x) if (!Py_DebugFlag); else x
20 #else
21 #define D(x)
22 #endif
23 
24 
25 /* STACK DATA TYPE */
26 
27 static void s_reset(stack *);
28 
29 static void
s_reset(stack * s)30 s_reset(stack *s)
31 {
32     s->s_top = &s->s_base[MAXSTACK];
33 }
34 
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
36 
37 static int
s_push(stack * s,dfa * d,node * parent)38 s_push(stack *s, dfa *d, node *parent)
39 {
40     stackentry *top;
41     if (s->s_top == s->s_base) {
42         fprintf(stderr, "s_push: parser stack overflow\n");
43         return E_NOMEM;
44     }
45     top = --s->s_top;
46     top->s_dfa = d;
47     top->s_parent = parent;
48     top->s_state = 0;
49     return 0;
50 }
51 
52 #ifdef Py_DEBUG
53 
54 static void
s_pop(stack * s)55 s_pop(stack *s)
56 {
57     if (s_empty(s))
58         Py_FatalError("s_pop: parser stack underflow -- FATAL");
59     s->s_top++;
60 }
61 
62 #else /* !Py_DEBUG */
63 
64 #define s_pop(s) (s)->s_top++
65 
66 #endif
67 
68 
69 /* PARSER CREATION */
70 
71 parser_state *
Ta3Parser_New(grammar * g,int start)72 Ta3Parser_New(grammar *g, int start)
73 {
74     parser_state *ps;
75 
76     if (!g->g_accel)
77         Ta3Grammar_AddAccelerators(g);
78     ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
79     if (ps == NULL)
80         return NULL;
81     ps->p_grammar = g;
82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
83     ps->p_flags = 0;
84 #endif
85     ps->p_tree = Ta3Node_New(start);
86     if (ps->p_tree == NULL) {
87         PyMem_FREE(ps);
88         return NULL;
89     }
90     s_reset(&ps->p_stack);
91     (void) s_push(&ps->p_stack, Ta3Grammar_FindDFA(g, start), ps->p_tree);
92     return ps;
93 }
94 
95 void
Ta3Parser_Delete(parser_state * ps)96 Ta3Parser_Delete(parser_state *ps)
97 {
98     /* NB If you want to save the parse tree,
99        you must set p_tree to NULL before calling delparser! */
100     Ta3Node_Free(ps->p_tree);
101     PyMem_FREE(ps);
102 }
103 
104 
105 /* PARSER STACK OPERATIONS */
106 
107 static int
shift(stack * s,int type,char * str,int newstate,int lineno,int col_offset)108 shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset)
109 {
110     int err;
111     assert(!s_empty(s));
112     err = Ta3Node_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
113     if (err)
114         return err;
115     s->s_top->s_state = newstate;
116     return 0;
117 }
118 
119 static int
push(stack * s,int type,dfa * d,int newstate,int lineno,int col_offset)120 push(stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
121 {
122     int err;
123     node *n;
124     n = s->s_top->s_parent;
125     assert(!s_empty(s));
126     err = Ta3Node_AddChild(n, type, (char *)NULL, lineno, col_offset);
127     if (err)
128         return err;
129     s->s_top->s_state = newstate;
130     return s_push(s, d, CHILD(n, NCH(n)-1));
131 }
132 
133 
134 /* PARSER PROPER */
135 
136 static int
classify(parser_state * ps,int type,const char * str)137 classify(parser_state *ps, int type, const char *str)
138 {
139     grammar *g = ps->p_grammar;
140     int n = g->g_ll.ll_nlabels;
141 
142     if (type == NAME) {
143         label *l = g->g_ll.ll_label;
144         int i;
145         for (i = n; i > 0; i--, l++) {
146             if (l->lb_type != NAME || l->lb_str == NULL ||
147                 l->lb_str[0] != str[0] ||
148                 strcmp(l->lb_str, str) != 0)
149                 continue;
150 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
151 #if 0
152             /* Leaving this in as an example */
153             if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
154                 if (str[0] == 'w' && strcmp(str, "with") == 0)
155                     break; /* not a keyword yet */
156                 else if (str[0] == 'a' && strcmp(str, "as") == 0)
157                     break; /* not a keyword yet */
158             }
159 #endif
160 #endif
161             D(printf("It's a keyword\n"));
162             return n - i;
163         }
164     }
165 
166     {
167         label *l = g->g_ll.ll_label;
168         int i;
169         for (i = n; i > 0; i--, l++) {
170             if (l->lb_type == type && l->lb_str == NULL) {
171                 D(printf("It's a token we know\n"));
172                 return n - i;
173             }
174         }
175     }
176 
177     D(printf("Illegal token\n"));
178     return -1;
179 }
180 
181 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
182 #if 0
183 /* Leaving this in as an example */
184 static void
185 future_hack(parser_state *ps)
186 {
187     node *n = ps->p_stack.s_top->s_parent;
188     node *ch, *cch;
189     int i;
190 
191     /* from __future__ import ..., must have at least 4 children */
192     n = CHILD(n, 0);
193     if (NCH(n) < 4)
194         return;
195     ch = CHILD(n, 0);
196     if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
197         return;
198     ch = CHILD(n, 1);
199     if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
200         strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
201         return;
202     ch = CHILD(n, 3);
203     /* ch can be a star, a parenthesis or import_as_names */
204     if (TYPE(ch) == STAR)
205         return;
206     if (TYPE(ch) == LPAR)
207         ch = CHILD(n, 4);
208 
209     for (i = 0; i < NCH(ch); i += 2) {
210         cch = CHILD(ch, i);
211         if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
212             char *str_ch = STR(CHILD(cch, 0));
213             if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
214                 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
215             } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
216                 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
217             } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
218                 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
219             }
220         }
221     }
222 }
223 #endif
224 #endif /* future keyword */
225 
226 int
Ta3Parser_AddToken(parser_state * ps,int type,char * str,int lineno,int col_offset,int * expected_ret)227 Ta3Parser_AddToken(parser_state *ps, int type, char *str,
228                   int lineno, int col_offset, int *expected_ret)
229 {
230     int ilabel;
231     int err;
232 
233     D(printf("Token %s/'%s' ... ", _Ta3Parser_TokenNames[type], str));
234 
235     /* Find out which label this token is */
236     ilabel = classify(ps, type, str);
237     if (ilabel < 0)
238         return E_SYNTAX;
239 
240     /* Loop until the token is shifted or an error occurred */
241     for (;;) {
242         /* Fetch the current dfa and state */
243         dfa *d = ps->p_stack.s_top->s_dfa;
244         state *s = &d->d_state[ps->p_stack.s_top->s_state];
245 
246         D(printf(" DFA '%s', state %d:",
247             d->d_name, ps->p_stack.s_top->s_state));
248 
249         /* Check accelerator */
250         if (s->s_lower <= ilabel && ilabel < s->s_upper) {
251             int x = s->s_accel[ilabel - s->s_lower];
252             if (x != -1) {
253                 if (x & (1<<7)) {
254                     /* Push non-terminal */
255                     int nt = (x >> 8) + NT_OFFSET;
256                     int arrow = x & ((1<<7)-1);
257                     dfa *d1 = Ta3Grammar_FindDFA(
258                         ps->p_grammar, nt);
259                     if ((err = push(&ps->p_stack, nt, d1,
260                         arrow, lineno, col_offset)) > 0) {
261                         D(printf(" MemError: push\n"));
262                         return err;
263                     }
264                     D(printf(" Push ...\n"));
265                     continue;
266                 }
267 
268                 /* Shift the token */
269                 if ((err = shift(&ps->p_stack, type, str,
270                                 x, lineno, col_offset)) > 0) {
271                     D(printf(" MemError: shift.\n"));
272                     return err;
273                 }
274                 D(printf(" Shift.\n"));
275                 /* Pop while we are in an accept-only state */
276                 while (s = &d->d_state
277                                 [ps->p_stack.s_top->s_state],
278                     s->s_accept && s->s_narcs == 1) {
279                     D(printf("  DFA '%s', state %d: "
280                              "Direct pop.\n",
281                              d->d_name,
282                              ps->p_stack.s_top->s_state));
283 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
284 #if 0
285                     if (d->d_name[0] == 'i' &&
286                         strcmp(d->d_name,
287                            "import_stmt") == 0)
288                         future_hack(ps);
289 #endif
290 #endif
291                     s_pop(&ps->p_stack);
292                     if (s_empty(&ps->p_stack)) {
293                         D(printf("  ACCEPT.\n"));
294                         return E_DONE;
295                     }
296                     d = ps->p_stack.s_top->s_dfa;
297                 }
298                 return E_OK;
299             }
300         }
301 
302         if (s->s_accept) {
303 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
304 #if 0
305             if (d->d_name[0] == 'i' &&
306                 strcmp(d->d_name, "import_stmt") == 0)
307                 future_hack(ps);
308 #endif
309 #endif
310             /* Pop this dfa and try again */
311             s_pop(&ps->p_stack);
312             D(printf(" Pop ...\n"));
313             if (s_empty(&ps->p_stack)) {
314                 D(printf(" Error: bottom of stack.\n"));
315                 return E_SYNTAX;
316             }
317             continue;
318         }
319 
320         /* Stuck, report syntax error */
321         D(printf(" Error.\n"));
322         if (expected_ret) {
323             if (s->s_lower == s->s_upper - 1) {
324                 /* Only one possible expected token */
325                 *expected_ret = ps->p_grammar->
326                     g_ll.ll_label[s->s_lower].lb_type;
327             }
328             else
329                 *expected_ret = -1;
330         }
331         return E_SYNTAX;
332     }
333 }
334 
335 
336 #ifdef Py_DEBUG
337 
338 /* DEBUG OUTPUT */
339 
340 void
dumptree(grammar * g,node * n)341 dumptree(grammar *g, node *n)
342 {
343     int i;
344 
345     if (n == NULL)
346         printf("NIL");
347     else {
348         label l;
349         l.lb_type = TYPE(n);
350         l.lb_str = STR(n);
351         printf("%s", Ta3Grammar_LabelRepr(&l));
352         if (ISNONTERMINAL(TYPE(n))) {
353             printf("(");
354             for (i = 0; i < NCH(n); i++) {
355                 if (i > 0)
356                     printf(",");
357                 dumptree(g, CHILD(n, i));
358             }
359             printf(")");
360         }
361     }
362 }
363 
364 void
showtree(grammar * g,node * n)365 showtree(grammar *g, node *n)
366 {
367     int i;
368 
369     if (n == NULL)
370         return;
371     if (ISNONTERMINAL(TYPE(n))) {
372         for (i = 0; i < NCH(n); i++)
373             showtree(g, CHILD(n, i));
374     }
375     else if (ISTERMINAL(TYPE(n))) {
376         printf("%s", _Ta3Parser_TokenNames[TYPE(n)]);
377         if (TYPE(n) == NUMBER || TYPE(n) == NAME)
378             printf("(%s)", STR(n));
379         printf(" ");
380     }
381     else
382         printf("? ");
383 }
384 
385 #endif /* Py_DEBUG */
386 
387 /*
388 
389 Description
390 -----------
391 
392 The parser's interface is different than usual: the function addtoken()
393 must be called for each token in the input.  This makes it possible to
394 turn it into an incremental parsing system later.  The parsing system
395 constructs a parse tree as it goes.
396 
397 A parsing rule is represented as a Deterministic Finite-state Automaton
398 (DFA).  A node in a DFA represents a state of the parser; an arc represents
399 a transition.  Transitions are either labeled with terminal symbols or
400 with non-terminals.  When the parser decides to follow an arc labeled
401 with a non-terminal, it is invoked recursively with the DFA representing
402 the parsing rule for that as its initial state; when that DFA accepts,
403 the parser that invoked it continues.  The parse tree constructed by the
404 recursively called parser is inserted as a child in the current parse tree.
405 
406 The DFA's can be constructed automatically from a more conventional
407 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
408 Certain restrictions make the parser's life easier: rules that can produce
409 the empty string should be outlawed (there are other ways to put loops
410 or optional parts in the language).  To avoid the need to construct
411 FIRST sets, we can require that all but the last alternative of a rule
412 (really: arc going out of a DFA's state) must begin with a terminal
413 symbol.
414 
415 As an example, consider this grammar:
416 
417 expr:   term (OP term)*
418 term:   CONSTANT | '(' expr ')'
419 
420 The DFA corresponding to the rule for expr is:
421 
422 ------->.---term-->.------->
423     ^          |
424     |          |
425     \----OP----/
426 
427 The parse tree generated for the input a+b is:
428 
429 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
430 
431 */
432