1 
2 /* Parser implementation */
3 
4 /* For a description, see the comments at end of this file */
5 
6 /* XXX To do: error recovery */
7 
8 #include "Python.h"
9 #include "token.h"
10 #include "grammar.h"
11 #include "node.h"
12 #include "parser.h"
13 #include "errcode.h"
14 #include "graminit.h"
15 
16 
17 #ifdef Py_DEBUG
18 extern int Py_DebugFlag;
19 #define D(x) if (!Py_DebugFlag); else x
20 #else
21 #define D(x)
22 #endif
23 
24 
25 /* STACK DATA TYPE */
26 
27 static void s_reset(stack *);
28 
29 static void
s_reset(stack * s)30 s_reset(stack *s)
31 {
32     s->s_top = &s->s_base[MAXSTACK];
33 }
34 
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
36 
37 static int
s_push(stack * s,const dfa * d,node * parent)38 s_push(stack *s, const dfa *d, node *parent)
39 {
40     stackentry *top;
41     if (s->s_top == s->s_base) {
42         fprintf(stderr, "s_push: parser stack overflow\n");
43         return E_NOMEM;
44     }
45     top = --s->s_top;
46     top->s_dfa = d;
47     top->s_parent = parent;
48     top->s_state = 0;
49     return 0;
50 }
51 
52 #ifdef Py_DEBUG
53 
54 static void
s_pop(stack * s)55 s_pop(stack *s)
56 {
57     if (s_empty(s)) {
58         Py_FatalError("parser stack underflow");
59     }
60     s->s_top++;
61 }
62 
63 #else /* !Py_DEBUG */
64 
65 #define s_pop(s) (s)->s_top++
66 
67 #endif
68 
69 
70 /* PARSER CREATION */
71 
72 parser_state *
PyParser_New(grammar * g,int start)73 PyParser_New(grammar *g, int start)
74 {
75     parser_state *ps;
76 
77     if (!g->g_accel)
78         PyGrammar_AddAccelerators(g);
79     ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
80     if (ps == NULL)
81         return NULL;
82     ps->p_grammar = g;
83 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
84     ps->p_flags = 0;
85 #endif
86     ps->p_tree = PyNode_New(start);
87     if (ps->p_tree == NULL) {
88         PyMem_FREE(ps);
89         return NULL;
90     }
91     s_reset(&ps->p_stack);
92     (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
93     return ps;
94 }
95 
96 void
PyParser_Delete(parser_state * ps)97 PyParser_Delete(parser_state *ps)
98 {
99     /* NB If you want to save the parse tree,
100        you must set p_tree to NULL before calling delparser! */
101     PyNode_Free(ps->p_tree);
102     PyMem_FREE(ps);
103 }
104 
105 
106 /* PARSER STACK OPERATIONS */
107 
108 static int
shift(stack * s,int type,char * str,int newstate,int lineno,int col_offset,int end_lineno,int end_col_offset)109 shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset,
110       int end_lineno, int end_col_offset)
111 {
112     int err;
113     assert(!s_empty(s));
114     err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset,
115                           end_lineno, end_col_offset);
116     if (err)
117         return err;
118     s->s_top->s_state = newstate;
119     return 0;
120 }
121 
122 static int
push(stack * s,int type,const dfa * d,int newstate,int lineno,int col_offset,int end_lineno,int end_col_offset)123 push(stack *s, int type, const dfa *d, int newstate, int lineno, int col_offset,
124      int end_lineno, int end_col_offset)
125 {
126     int err;
127     node *n;
128     n = s->s_top->s_parent;
129     assert(!s_empty(s));
130     err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset,
131                           end_lineno, end_col_offset);
132     if (err)
133         return err;
134     s->s_top->s_state = newstate;
135     return s_push(s, d, CHILD(n, NCH(n)-1));
136 }
137 
138 
139 /* PARSER PROPER */
140 
141 static int
classify(parser_state * ps,int type,const char * str)142 classify(parser_state *ps, int type, const char *str)
143 {
144     grammar *g = ps->p_grammar;
145     int n = g->g_ll.ll_nlabels;
146 
147     if (type == NAME) {
148         const label *l = g->g_ll.ll_label;
149         int i;
150         for (i = n; i > 0; i--, l++) {
151             if (l->lb_type != NAME || l->lb_str == NULL ||
152                 l->lb_str[0] != str[0] ||
153                 strcmp(l->lb_str, str) != 0)
154                 continue;
155 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
156 #if 0
157             /* Leaving this in as an example */
158             if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
159                 if (str[0] == 'w' && strcmp(str, "with") == 0)
160                     break; /* not a keyword yet */
161                 else if (str[0] == 'a' && strcmp(str, "as") == 0)
162                     break; /* not a keyword yet */
163             }
164 #endif
165 #endif
166             D(printf("It's a keyword\n"));
167             return n - i;
168         }
169     }
170 
171     {
172         const label *l = g->g_ll.ll_label;
173         int i;
174         for (i = n; i > 0; i--, l++) {
175             if (l->lb_type == type && l->lb_str == NULL) {
176                 D(printf("It's a token we know\n"));
177                 return n - i;
178             }
179         }
180     }
181 
182     D(printf("Illegal token\n"));
183     return -1;
184 }
185 
186 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
187 #if 0
188 /* Leaving this in as an example */
189 static void
190 future_hack(parser_state *ps)
191 {
192     node *n = ps->p_stack.s_top->s_parent;
193     node *ch, *cch;
194     int i;
195 
196     /* from __future__ import ..., must have at least 4 children */
197     n = CHILD(n, 0);
198     if (NCH(n) < 4)
199         return;
200     ch = CHILD(n, 0);
201     if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
202         return;
203     ch = CHILD(n, 1);
204     if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
205         strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
206         return;
207     ch = CHILD(n, 3);
208     /* ch can be a star, a parenthesis or import_as_names */
209     if (TYPE(ch) == STAR)
210         return;
211     if (TYPE(ch) == LPAR)
212         ch = CHILD(n, 4);
213 
214     for (i = 0; i < NCH(ch); i += 2) {
215         cch = CHILD(ch, i);
216         if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
217             char *str_ch = STR(CHILD(cch, 0));
218             if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
219                 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
220             } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
221                 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
222             } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
223                 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
224             }
225         }
226     }
227 }
228 #endif
229 #endif /* future keyword */
230 
231 int
PyParser_AddToken(parser_state * ps,int type,char * str,int lineno,int col_offset,int end_lineno,int end_col_offset,int * expected_ret)232 PyParser_AddToken(parser_state *ps, int type, char *str,
233                   int lineno, int col_offset,
234                   int end_lineno, int end_col_offset,
235                   int *expected_ret)
236 {
237     int ilabel;
238     int err;
239 
240     D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
241 
242     /* Find out which label this token is */
243     ilabel = classify(ps, type, str);
244     if (ilabel < 0)
245         return E_SYNTAX;
246 
247     /* Loop until the token is shifted or an error occurred */
248     for (;;) {
249         /* Fetch the current dfa and state */
250         const dfa *d = ps->p_stack.s_top->s_dfa;
251         state *s = &d->d_state[ps->p_stack.s_top->s_state];
252 
253         D(printf(" DFA '%s', state %d:",
254             d->d_name, ps->p_stack.s_top->s_state));
255 
256         /* Check accelerator */
257         if (s->s_lower <= ilabel && ilabel < s->s_upper) {
258             int x = s->s_accel[ilabel - s->s_lower];
259             if (x != -1) {
260                 if (x & (1<<7)) {
261                     /* Push non-terminal */
262                     int nt = (x >> 8) + NT_OFFSET;
263                     int arrow = x & ((1<<7)-1);
264                     if (nt == func_body_suite && !(ps->p_flags & PyCF_TYPE_COMMENTS)) {
265                         /* When parsing type comments is not requested,
266                            we can provide better errors about bad indentation
267                            by using 'suite' for the body of a funcdef */
268                         D(printf(" [switch func_body_suite to suite]"));
269                         nt = suite;
270                     }
271                     const dfa *d1 = PyGrammar_FindDFA(
272                         ps->p_grammar, nt);
273                     if ((err = push(&ps->p_stack, nt, d1,
274                         arrow, lineno, col_offset,
275                         end_lineno, end_col_offset)) > 0) {
276                         D(printf(" MemError: push\n"));
277                         return err;
278                     }
279                     D(printf(" Push '%s'\n", d1->d_name));
280                     continue;
281                 }
282 
283                 /* Shift the token */
284                 if ((err = shift(&ps->p_stack, type, str,
285                                 x, lineno, col_offset,
286                                 end_lineno, end_col_offset)) > 0) {
287                     D(printf(" MemError: shift.\n"));
288                     return err;
289                 }
290                 D(printf(" Shift.\n"));
291                 /* Pop while we are in an accept-only state */
292                 while (s = &d->d_state
293                                 [ps->p_stack.s_top->s_state],
294                     s->s_accept && s->s_narcs == 1) {
295                     D(printf("  DFA '%s', state %d: "
296                              "Direct pop.\n",
297                              d->d_name,
298                              ps->p_stack.s_top->s_state));
299 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
300 #if 0
301                     if (d->d_name[0] == 'i' &&
302                         strcmp(d->d_name,
303                            "import_stmt") == 0)
304                         future_hack(ps);
305 #endif
306 #endif
307                     s_pop(&ps->p_stack);
308                     if (s_empty(&ps->p_stack)) {
309                         D(printf("  ACCEPT.\n"));
310                         return E_DONE;
311                     }
312                     d = ps->p_stack.s_top->s_dfa;
313                 }
314                 return E_OK;
315             }
316         }
317 
318         if (s->s_accept) {
319 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
320 #if 0
321             if (d->d_name[0] == 'i' &&
322                 strcmp(d->d_name, "import_stmt") == 0)
323                 future_hack(ps);
324 #endif
325 #endif
326             /* Pop this dfa and try again */
327             s_pop(&ps->p_stack);
328             D(printf(" Pop ...\n"));
329             if (s_empty(&ps->p_stack)) {
330                 D(printf(" Error: bottom of stack.\n"));
331                 return E_SYNTAX;
332             }
333             continue;
334         }
335 
336         /* Stuck, report syntax error */
337         D(printf(" Error.\n"));
338         if (expected_ret) {
339             if (s->s_lower == s->s_upper - 1) {
340                 /* Only one possible expected token */
341                 *expected_ret = ps->p_grammar->
342                     g_ll.ll_label[s->s_lower].lb_type;
343             }
344             else
345                 *expected_ret = -1;
346         }
347         return E_SYNTAX;
348     }
349 }
350 
351 
352 #ifdef Py_DEBUG
353 
354 /* DEBUG OUTPUT */
355 
356 void
dumptree(grammar * g,node * n)357 dumptree(grammar *g, node *n)
358 {
359     int i;
360 
361     if (n == NULL)
362         printf("NIL");
363     else {
364         label l;
365         l.lb_type = TYPE(n);
366         l.lb_str = STR(n);
367         printf("%s", PyGrammar_LabelRepr(&l));
368         if (ISNONTERMINAL(TYPE(n))) {
369             printf("(");
370             for (i = 0; i < NCH(n); i++) {
371                 if (i > 0)
372                     printf(",");
373                 dumptree(g, CHILD(n, i));
374             }
375             printf(")");
376         }
377     }
378 }
379 
380 void
showtree(grammar * g,node * n)381 showtree(grammar *g, node *n)
382 {
383     int i;
384 
385     if (n == NULL)
386         return;
387     if (ISNONTERMINAL(TYPE(n))) {
388         for (i = 0; i < NCH(n); i++)
389             showtree(g, CHILD(n, i));
390     }
391     else if (ISTERMINAL(TYPE(n))) {
392         printf("%s", _PyParser_TokenNames[TYPE(n)]);
393         if (TYPE(n) == NUMBER || TYPE(n) == NAME)
394             printf("(%s)", STR(n));
395         printf(" ");
396     }
397     else
398         printf("? ");
399 }
400 
401 void
printtree(parser_state * ps)402 printtree(parser_state *ps)
403 {
404     if (Py_DebugFlag) {
405         printf("Parse tree:\n");
406         dumptree(ps->p_grammar, ps->p_tree);
407         printf("\n");
408         printf("Tokens:\n");
409         showtree(ps->p_grammar, ps->p_tree);
410         printf("\n");
411     }
412     printf("Listing:\n");
413     PyNode_ListTree(ps->p_tree);
414     printf("\n");
415 }
416 
417 #endif /* Py_DEBUG */
418 
419 /*
420 
421 Description
422 -----------
423 
424 The parser's interface is different than usual: the function addtoken()
425 must be called for each token in the input.  This makes it possible to
426 turn it into an incremental parsing system later.  The parsing system
427 constructs a parse tree as it goes.
428 
429 A parsing rule is represented as a Deterministic Finite-state Automaton
430 (DFA).  A node in a DFA represents a state of the parser; an arc represents
431 a transition.  Transitions are either labeled with terminal symbols or
432 with non-terminals.  When the parser decides to follow an arc labeled
433 with a non-terminal, it is invoked recursively with the DFA representing
434 the parsing rule for that as its initial state; when that DFA accepts,
435 the parser that invoked it continues.  The parse tree constructed by the
436 recursively called parser is inserted as a child in the current parse tree.
437 
438 The DFA's can be constructed automatically from a more conventional
439 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
440 Certain restrictions make the parser's life easier: rules that can produce
441 the empty string should be outlawed (there are other ways to put loops
442 or optional parts in the language).  To avoid the need to construct
443 FIRST sets, we can require that all but the last alternative of a rule
444 (really: arc going out of a DFA's state) must begin with a terminal
445 symbol.
446 
447 As an example, consider this grammar:
448 
449 expr:   term (OP term)*
450 term:   CONSTANT | '(' expr ')'
451 
452 The DFA corresponding to the rule for expr is:
453 
454 ------->.---term-->.------->
455     ^          |
456     |          |
457     \----OP----/
458 
459 The parse tree generated for the input a+b is:
460 
461 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
462 
463 */
464