1 
2 /* Parser implementation */
3 
4 /* For a description, see the comments at end of this file */
5 
6 /* XXX To do: error recovery */
7 
8 #include "Python.h"
9 #include "token.h"
10 #include "grammar.h"
11 #include "node.h"
12 #include "parser.h"
13 #include "errcode.h"
14 #include "graminit.h"
15 
16 
17 #ifdef Py_DEBUG
18 extern int Py_DebugFlag;
19 #define D(x) if (!Py_DebugFlag); else x
20 #else
21 #define D(x)
22 #endif
23 
24 
25 /* STACK DATA TYPE */
26 
27 static void s_reset(stack *);
28 
29 static void
s_reset(stack * s)30 s_reset(stack *s)
31 {
32     s->s_top = &s->s_base[MAXSTACK];
33 }
34 
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
36 
37 static int
s_push(stack * s,const dfa * d,node * parent)38 s_push(stack *s, const dfa *d, node *parent)
39 {
40     stackentry *top;
41     if (s->s_top == s->s_base) {
42         fprintf(stderr, "s_push: parser stack overflow\n");
43         return E_NOMEM;
44     }
45     top = --s->s_top;
46     top->s_dfa = d;
47     top->s_parent = parent;
48     top->s_state = 0;
49     return 0;
50 }
51 
52 #ifdef Py_DEBUG
53 
54 static void
s_pop(stack * s)55 s_pop(stack *s)
56 {
57     if (s_empty(s))
58         Py_FatalError("s_pop: parser stack underflow -- FATAL");
59     s->s_top++;
60 }
61 
62 #else /* !Py_DEBUG */
63 
64 #define s_pop(s) (s)->s_top++
65 
66 #endif
67 
68 
69 /* PARSER CREATION */
70 
71 parser_state *
PyParser_New(grammar * g,int start)72 PyParser_New(grammar *g, int start)
73 {
74     parser_state *ps;
75 
76     if (!g->g_accel)
77         PyGrammar_AddAccelerators(g);
78     ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
79     if (ps == NULL)
80         return NULL;
81     ps->p_grammar = g;
82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
83     ps->p_flags = 0;
84 #endif
85     ps->p_tree = PyNode_New(start);
86     if (ps->p_tree == NULL) {
87         PyMem_FREE(ps);
88         return NULL;
89     }
90     s_reset(&ps->p_stack);
91     (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
92     return ps;
93 }
94 
95 void
PyParser_Delete(parser_state * ps)96 PyParser_Delete(parser_state *ps)
97 {
98     /* NB If you want to save the parse tree,
99        you must set p_tree to NULL before calling delparser! */
100     PyNode_Free(ps->p_tree);
101     PyMem_FREE(ps);
102 }
103 
104 
105 /* PARSER STACK OPERATIONS */
106 
107 static int
shift(stack * s,int type,char * str,int newstate,int lineno,int col_offset,int end_lineno,int end_col_offset)108 shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset,
109       int end_lineno, int end_col_offset)
110 {
111     int err;
112     assert(!s_empty(s));
113     err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset,
114                           end_lineno, end_col_offset);
115     if (err)
116         return err;
117     s->s_top->s_state = newstate;
118     return 0;
119 }
120 
121 static int
push(stack * s,int type,const dfa * d,int newstate,int lineno,int col_offset,int end_lineno,int end_col_offset)122 push(stack *s, int type, const dfa *d, int newstate, int lineno, int col_offset,
123      int end_lineno, int end_col_offset)
124 {
125     int err;
126     node *n;
127     n = s->s_top->s_parent;
128     assert(!s_empty(s));
129     err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset,
130                           end_lineno, end_col_offset);
131     if (err)
132         return err;
133     s->s_top->s_state = newstate;
134     return s_push(s, d, CHILD(n, NCH(n)-1));
135 }
136 
137 
138 /* PARSER PROPER */
139 
140 static int
classify(parser_state * ps,int type,const char * str)141 classify(parser_state *ps, int type, const char *str)
142 {
143     grammar *g = ps->p_grammar;
144     int n = g->g_ll.ll_nlabels;
145 
146     if (type == NAME) {
147         const label *l = g->g_ll.ll_label;
148         int i;
149         for (i = n; i > 0; i--, l++) {
150             if (l->lb_type != NAME || l->lb_str == NULL ||
151                 l->lb_str[0] != str[0] ||
152                 strcmp(l->lb_str, str) != 0)
153                 continue;
154 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
155 #if 0
156             /* Leaving this in as an example */
157             if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
158                 if (str[0] == 'w' && strcmp(str, "with") == 0)
159                     break; /* not a keyword yet */
160                 else if (str[0] == 'a' && strcmp(str, "as") == 0)
161                     break; /* not a keyword yet */
162             }
163 #endif
164 #endif
165             D(printf("It's a keyword\n"));
166             return n - i;
167         }
168     }
169 
170     {
171         const label *l = g->g_ll.ll_label;
172         int i;
173         for (i = n; i > 0; i--, l++) {
174             if (l->lb_type == type && l->lb_str == NULL) {
175                 D(printf("It's a token we know\n"));
176                 return n - i;
177             }
178         }
179     }
180 
181     D(printf("Illegal token\n"));
182     return -1;
183 }
184 
185 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
186 #if 0
187 /* Leaving this in as an example */
188 static void
189 future_hack(parser_state *ps)
190 {
191     node *n = ps->p_stack.s_top->s_parent;
192     node *ch, *cch;
193     int i;
194 
195     /* from __future__ import ..., must have at least 4 children */
196     n = CHILD(n, 0);
197     if (NCH(n) < 4)
198         return;
199     ch = CHILD(n, 0);
200     if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
201         return;
202     ch = CHILD(n, 1);
203     if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
204         strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
205         return;
206     ch = CHILD(n, 3);
207     /* ch can be a star, a parenthesis or import_as_names */
208     if (TYPE(ch) == STAR)
209         return;
210     if (TYPE(ch) == LPAR)
211         ch = CHILD(n, 4);
212 
213     for (i = 0; i < NCH(ch); i += 2) {
214         cch = CHILD(ch, i);
215         if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
216             char *str_ch = STR(CHILD(cch, 0));
217             if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
218                 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
219             } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
220                 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
221             } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
222                 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
223             }
224         }
225     }
226 }
227 #endif
228 #endif /* future keyword */
229 
230 int
PyParser_AddToken(parser_state * ps,int type,char * str,int lineno,int col_offset,int end_lineno,int end_col_offset,int * expected_ret)231 PyParser_AddToken(parser_state *ps, int type, char *str,
232                   int lineno, int col_offset,
233                   int end_lineno, int end_col_offset,
234                   int *expected_ret)
235 {
236     int ilabel;
237     int err;
238 
239     D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
240 
241     /* Find out which label this token is */
242     ilabel = classify(ps, type, str);
243     if (ilabel < 0)
244         return E_SYNTAX;
245 
246     /* Loop until the token is shifted or an error occurred */
247     for (;;) {
248         /* Fetch the current dfa and state */
249         const dfa *d = ps->p_stack.s_top->s_dfa;
250         state *s = &d->d_state[ps->p_stack.s_top->s_state];
251 
252         D(printf(" DFA '%s', state %d:",
253             d->d_name, ps->p_stack.s_top->s_state));
254 
255         /* Check accelerator */
256         if (s->s_lower <= ilabel && ilabel < s->s_upper) {
257             int x = s->s_accel[ilabel - s->s_lower];
258             if (x != -1) {
259                 if (x & (1<<7)) {
260                     /* Push non-terminal */
261                     int nt = (x >> 8) + NT_OFFSET;
262                     int arrow = x & ((1<<7)-1);
263                     if (nt == func_body_suite && !(ps->p_flags & PyCF_TYPE_COMMENTS)) {
264                         /* When parsing type comments is not requested,
265                            we can provide better errors about bad indentation
266                            by using 'suite' for the body of a funcdef */
267                         D(printf(" [switch func_body_suite to suite]"));
268                         nt = suite;
269                     }
270                     const dfa *d1 = PyGrammar_FindDFA(
271                         ps->p_grammar, nt);
272                     if ((err = push(&ps->p_stack, nt, d1,
273                         arrow, lineno, col_offset,
274                         end_lineno, end_col_offset)) > 0) {
275                         D(printf(" MemError: push\n"));
276                         return err;
277                     }
278                     D(printf(" Push '%s'\n", d1->d_name));
279                     continue;
280                 }
281 
282                 /* Shift the token */
283                 if ((err = shift(&ps->p_stack, type, str,
284                                 x, lineno, col_offset,
285                                 end_lineno, end_col_offset)) > 0) {
286                     D(printf(" MemError: shift.\n"));
287                     return err;
288                 }
289                 D(printf(" Shift.\n"));
290                 /* Pop while we are in an accept-only state */
291                 while (s = &d->d_state
292                                 [ps->p_stack.s_top->s_state],
293                     s->s_accept && s->s_narcs == 1) {
294                     D(printf("  DFA '%s', state %d: "
295                              "Direct pop.\n",
296                              d->d_name,
297                              ps->p_stack.s_top->s_state));
298 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
299 #if 0
300                     if (d->d_name[0] == 'i' &&
301                         strcmp(d->d_name,
302                            "import_stmt") == 0)
303                         future_hack(ps);
304 #endif
305 #endif
306                     s_pop(&ps->p_stack);
307                     if (s_empty(&ps->p_stack)) {
308                         D(printf("  ACCEPT.\n"));
309                         return E_DONE;
310                     }
311                     d = ps->p_stack.s_top->s_dfa;
312                 }
313                 return E_OK;
314             }
315         }
316 
317         if (s->s_accept) {
318 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
319 #if 0
320             if (d->d_name[0] == 'i' &&
321                 strcmp(d->d_name, "import_stmt") == 0)
322                 future_hack(ps);
323 #endif
324 #endif
325             /* Pop this dfa and try again */
326             s_pop(&ps->p_stack);
327             D(printf(" Pop ...\n"));
328             if (s_empty(&ps->p_stack)) {
329                 D(printf(" Error: bottom of stack.\n"));
330                 return E_SYNTAX;
331             }
332             continue;
333         }
334 
335         /* Stuck, report syntax error */
336         D(printf(" Error.\n"));
337         if (expected_ret) {
338             if (s->s_lower == s->s_upper - 1) {
339                 /* Only one possible expected token */
340                 *expected_ret = ps->p_grammar->
341                     g_ll.ll_label[s->s_lower].lb_type;
342             }
343             else
344                 *expected_ret = -1;
345         }
346         return E_SYNTAX;
347     }
348 }
349 
350 
351 #ifdef Py_DEBUG
352 
353 /* DEBUG OUTPUT */
354 
355 void
dumptree(grammar * g,node * n)356 dumptree(grammar *g, node *n)
357 {
358     int i;
359 
360     if (n == NULL)
361         printf("NIL");
362     else {
363         label l;
364         l.lb_type = TYPE(n);
365         l.lb_str = STR(n);
366         printf("%s", PyGrammar_LabelRepr(&l));
367         if (ISNONTERMINAL(TYPE(n))) {
368             printf("(");
369             for (i = 0; i < NCH(n); i++) {
370                 if (i > 0)
371                     printf(",");
372                 dumptree(g, CHILD(n, i));
373             }
374             printf(")");
375         }
376     }
377 }
378 
379 void
showtree(grammar * g,node * n)380 showtree(grammar *g, node *n)
381 {
382     int i;
383 
384     if (n == NULL)
385         return;
386     if (ISNONTERMINAL(TYPE(n))) {
387         for (i = 0; i < NCH(n); i++)
388             showtree(g, CHILD(n, i));
389     }
390     else if (ISTERMINAL(TYPE(n))) {
391         printf("%s", _PyParser_TokenNames[TYPE(n)]);
392         if (TYPE(n) == NUMBER || TYPE(n) == NAME)
393             printf("(%s)", STR(n));
394         printf(" ");
395     }
396     else
397         printf("? ");
398 }
399 
400 void
printtree(parser_state * ps)401 printtree(parser_state *ps)
402 {
403     if (Py_DebugFlag) {
404         printf("Parse tree:\n");
405         dumptree(ps->p_grammar, ps->p_tree);
406         printf("\n");
407         printf("Tokens:\n");
408         showtree(ps->p_grammar, ps->p_tree);
409         printf("\n");
410     }
411     printf("Listing:\n");
412     PyNode_ListTree(ps->p_tree);
413     printf("\n");
414 }
415 
416 #endif /* Py_DEBUG */
417 
418 /*
419 
420 Description
421 -----------
422 
423 The parser's interface is different than usual: the function addtoken()
424 must be called for each token in the input.  This makes it possible to
425 turn it into an incremental parsing system later.  The parsing system
426 constructs a parse tree as it goes.
427 
428 A parsing rule is represented as a Deterministic Finite-state Automaton
429 (DFA).  A node in a DFA represents a state of the parser; an arc represents
430 a transition.  Transitions are either labeled with terminal symbols or
431 with non-terminals.  When the parser decides to follow an arc labeled
432 with a non-terminal, it is invoked recursively with the DFA representing
433 the parsing rule for that as its initial state; when that DFA accepts,
434 the parser that invoked it continues.  The parse tree constructed by the
435 recursively called parser is inserted as a child in the current parse tree.
436 
437 The DFA's can be constructed automatically from a more conventional
438 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
439 Certain restrictions make the parser's life easier: rules that can produce
440 the empty string should be outlawed (there are other ways to put loops
441 or optional parts in the language).  To avoid the need to construct
442 FIRST sets, we can require that all but the last alternative of a rule
443 (really: arc going out of a DFA's state) must begin with a terminal
444 symbol.
445 
446 As an example, consider this grammar:
447 
448 expr:   term (OP term)*
449 term:   CONSTANT | '(' expr ')'
450 
451 The DFA corresponding to the rule for expr is:
452 
453 ------->.---term-->.------->
454     ^          |
455     |          |
456     \----OP----/
457 
458 The parse tree generated for the input a+b is:
459 
460 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
461 
462 */
463