1
2 /* Parser implementation */
3
4 /* For a description, see the comments at end of this file */
5
6 /* XXX To do: error recovery */
7
8 #include "Python.h"
9 #include "token.h"
10 #include "grammar.h"
11 #include "node.h"
12 #include "parser.h"
13 #include "errcode.h"
14 #include "graminit.h"
15
16
17 #ifdef Py_DEBUG
18 extern int Py_DebugFlag;
19 #define D(x) if (!Py_DebugFlag); else x
20 #else
21 #define D(x)
22 #endif
23
24
25 /* STACK DATA TYPE */
26
27 static void s_reset(stack *);
28
29 static void
s_reset(stack * s)30 s_reset(stack *s)
31 {
32 s->s_top = &s->s_base[MAXSTACK];
33 }
34
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
36
37 static int
s_push(stack * s,const dfa * d,node * parent)38 s_push(stack *s, const dfa *d, node *parent)
39 {
40 stackentry *top;
41 if (s->s_top == s->s_base) {
42 fprintf(stderr, "s_push: parser stack overflow\n");
43 return E_NOMEM;
44 }
45 top = --s->s_top;
46 top->s_dfa = d;
47 top->s_parent = parent;
48 top->s_state = 0;
49 return 0;
50 }
51
52 #ifdef Py_DEBUG
53
54 static void
s_pop(stack * s)55 s_pop(stack *s)
56 {
57 if (s_empty(s))
58 Py_FatalError("s_pop: parser stack underflow -- FATAL");
59 s->s_top++;
60 }
61
62 #else /* !Py_DEBUG */
63
64 #define s_pop(s) (s)->s_top++
65
66 #endif
67
68
69 /* PARSER CREATION */
70
71 parser_state *
PyParser_New(grammar * g,int start)72 PyParser_New(grammar *g, int start)
73 {
74 parser_state *ps;
75
76 if (!g->g_accel)
77 PyGrammar_AddAccelerators(g);
78 ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
79 if (ps == NULL)
80 return NULL;
81 ps->p_grammar = g;
82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
83 ps->p_flags = 0;
84 #endif
85 ps->p_tree = PyNode_New(start);
86 if (ps->p_tree == NULL) {
87 PyMem_FREE(ps);
88 return NULL;
89 }
90 s_reset(&ps->p_stack);
91 (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
92 return ps;
93 }
94
95 void
PyParser_Delete(parser_state * ps)96 PyParser_Delete(parser_state *ps)
97 {
98 /* NB If you want to save the parse tree,
99 you must set p_tree to NULL before calling delparser! */
100 PyNode_Free(ps->p_tree);
101 PyMem_FREE(ps);
102 }
103
104
105 /* PARSER STACK OPERATIONS */
106
107 static int
shift(stack * s,int type,char * str,int newstate,int lineno,int col_offset,int end_lineno,int end_col_offset)108 shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset,
109 int end_lineno, int end_col_offset)
110 {
111 int err;
112 assert(!s_empty(s));
113 err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset,
114 end_lineno, end_col_offset);
115 if (err)
116 return err;
117 s->s_top->s_state = newstate;
118 return 0;
119 }
120
121 static int
push(stack * s,int type,const dfa * d,int newstate,int lineno,int col_offset,int end_lineno,int end_col_offset)122 push(stack *s, int type, const dfa *d, int newstate, int lineno, int col_offset,
123 int end_lineno, int end_col_offset)
124 {
125 int err;
126 node *n;
127 n = s->s_top->s_parent;
128 assert(!s_empty(s));
129 err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset,
130 end_lineno, end_col_offset);
131 if (err)
132 return err;
133 s->s_top->s_state = newstate;
134 return s_push(s, d, CHILD(n, NCH(n)-1));
135 }
136
137
138 /* PARSER PROPER */
139
140 static int
classify(parser_state * ps,int type,const char * str)141 classify(parser_state *ps, int type, const char *str)
142 {
143 grammar *g = ps->p_grammar;
144 int n = g->g_ll.ll_nlabels;
145
146 if (type == NAME) {
147 const label *l = g->g_ll.ll_label;
148 int i;
149 for (i = n; i > 0; i--, l++) {
150 if (l->lb_type != NAME || l->lb_str == NULL ||
151 l->lb_str[0] != str[0] ||
152 strcmp(l->lb_str, str) != 0)
153 continue;
154 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
155 #if 0
156 /* Leaving this in as an example */
157 if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
158 if (str[0] == 'w' && strcmp(str, "with") == 0)
159 break; /* not a keyword yet */
160 else if (str[0] == 'a' && strcmp(str, "as") == 0)
161 break; /* not a keyword yet */
162 }
163 #endif
164 #endif
165 D(printf("It's a keyword\n"));
166 return n - i;
167 }
168 }
169
170 {
171 const label *l = g->g_ll.ll_label;
172 int i;
173 for (i = n; i > 0; i--, l++) {
174 if (l->lb_type == type && l->lb_str == NULL) {
175 D(printf("It's a token we know\n"));
176 return n - i;
177 }
178 }
179 }
180
181 D(printf("Illegal token\n"));
182 return -1;
183 }
184
185 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
186 #if 0
187 /* Leaving this in as an example */
188 static void
189 future_hack(parser_state *ps)
190 {
191 node *n = ps->p_stack.s_top->s_parent;
192 node *ch, *cch;
193 int i;
194
195 /* from __future__ import ..., must have at least 4 children */
196 n = CHILD(n, 0);
197 if (NCH(n) < 4)
198 return;
199 ch = CHILD(n, 0);
200 if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
201 return;
202 ch = CHILD(n, 1);
203 if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
204 strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
205 return;
206 ch = CHILD(n, 3);
207 /* ch can be a star, a parenthesis or import_as_names */
208 if (TYPE(ch) == STAR)
209 return;
210 if (TYPE(ch) == LPAR)
211 ch = CHILD(n, 4);
212
213 for (i = 0; i < NCH(ch); i += 2) {
214 cch = CHILD(ch, i);
215 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
216 char *str_ch = STR(CHILD(cch, 0));
217 if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
218 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
219 } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
220 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
221 } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
222 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
223 }
224 }
225 }
226 }
227 #endif
228 #endif /* future keyword */
229
230 int
PyParser_AddToken(parser_state * ps,int type,char * str,int lineno,int col_offset,int end_lineno,int end_col_offset,int * expected_ret)231 PyParser_AddToken(parser_state *ps, int type, char *str,
232 int lineno, int col_offset,
233 int end_lineno, int end_col_offset,
234 int *expected_ret)
235 {
236 int ilabel;
237 int err;
238
239 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
240
241 /* Find out which label this token is */
242 ilabel = classify(ps, type, str);
243 if (ilabel < 0)
244 return E_SYNTAX;
245
246 /* Loop until the token is shifted or an error occurred */
247 for (;;) {
248 /* Fetch the current dfa and state */
249 const dfa *d = ps->p_stack.s_top->s_dfa;
250 state *s = &d->d_state[ps->p_stack.s_top->s_state];
251
252 D(printf(" DFA '%s', state %d:",
253 d->d_name, ps->p_stack.s_top->s_state));
254
255 /* Check accelerator */
256 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
257 int x = s->s_accel[ilabel - s->s_lower];
258 if (x != -1) {
259 if (x & (1<<7)) {
260 /* Push non-terminal */
261 int nt = (x >> 8) + NT_OFFSET;
262 int arrow = x & ((1<<7)-1);
263 if (nt == func_body_suite && !(ps->p_flags & PyCF_TYPE_COMMENTS)) {
264 /* When parsing type comments is not requested,
265 we can provide better errors about bad indentation
266 by using 'suite' for the body of a funcdef */
267 D(printf(" [switch func_body_suite to suite]"));
268 nt = suite;
269 }
270 const dfa *d1 = PyGrammar_FindDFA(
271 ps->p_grammar, nt);
272 if ((err = push(&ps->p_stack, nt, d1,
273 arrow, lineno, col_offset,
274 end_lineno, end_col_offset)) > 0) {
275 D(printf(" MemError: push\n"));
276 return err;
277 }
278 D(printf(" Push '%s'\n", d1->d_name));
279 continue;
280 }
281
282 /* Shift the token */
283 if ((err = shift(&ps->p_stack, type, str,
284 x, lineno, col_offset,
285 end_lineno, end_col_offset)) > 0) {
286 D(printf(" MemError: shift.\n"));
287 return err;
288 }
289 D(printf(" Shift.\n"));
290 /* Pop while we are in an accept-only state */
291 while (s = &d->d_state
292 [ps->p_stack.s_top->s_state],
293 s->s_accept && s->s_narcs == 1) {
294 D(printf(" DFA '%s', state %d: "
295 "Direct pop.\n",
296 d->d_name,
297 ps->p_stack.s_top->s_state));
298 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
299 #if 0
300 if (d->d_name[0] == 'i' &&
301 strcmp(d->d_name,
302 "import_stmt") == 0)
303 future_hack(ps);
304 #endif
305 #endif
306 s_pop(&ps->p_stack);
307 if (s_empty(&ps->p_stack)) {
308 D(printf(" ACCEPT.\n"));
309 return E_DONE;
310 }
311 d = ps->p_stack.s_top->s_dfa;
312 }
313 return E_OK;
314 }
315 }
316
317 if (s->s_accept) {
318 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
319 #if 0
320 if (d->d_name[0] == 'i' &&
321 strcmp(d->d_name, "import_stmt") == 0)
322 future_hack(ps);
323 #endif
324 #endif
325 /* Pop this dfa and try again */
326 s_pop(&ps->p_stack);
327 D(printf(" Pop ...\n"));
328 if (s_empty(&ps->p_stack)) {
329 D(printf(" Error: bottom of stack.\n"));
330 return E_SYNTAX;
331 }
332 continue;
333 }
334
335 /* Stuck, report syntax error */
336 D(printf(" Error.\n"));
337 if (expected_ret) {
338 if (s->s_lower == s->s_upper - 1) {
339 /* Only one possible expected token */
340 *expected_ret = ps->p_grammar->
341 g_ll.ll_label[s->s_lower].lb_type;
342 }
343 else
344 *expected_ret = -1;
345 }
346 return E_SYNTAX;
347 }
348 }
349
350
351 #ifdef Py_DEBUG
352
353 /* DEBUG OUTPUT */
354
355 void
dumptree(grammar * g,node * n)356 dumptree(grammar *g, node *n)
357 {
358 int i;
359
360 if (n == NULL)
361 printf("NIL");
362 else {
363 label l;
364 l.lb_type = TYPE(n);
365 l.lb_str = STR(n);
366 printf("%s", PyGrammar_LabelRepr(&l));
367 if (ISNONTERMINAL(TYPE(n))) {
368 printf("(");
369 for (i = 0; i < NCH(n); i++) {
370 if (i > 0)
371 printf(",");
372 dumptree(g, CHILD(n, i));
373 }
374 printf(")");
375 }
376 }
377 }
378
379 void
showtree(grammar * g,node * n)380 showtree(grammar *g, node *n)
381 {
382 int i;
383
384 if (n == NULL)
385 return;
386 if (ISNONTERMINAL(TYPE(n))) {
387 for (i = 0; i < NCH(n); i++)
388 showtree(g, CHILD(n, i));
389 }
390 else if (ISTERMINAL(TYPE(n))) {
391 printf("%s", _PyParser_TokenNames[TYPE(n)]);
392 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
393 printf("(%s)", STR(n));
394 printf(" ");
395 }
396 else
397 printf("? ");
398 }
399
400 void
printtree(parser_state * ps)401 printtree(parser_state *ps)
402 {
403 if (Py_DebugFlag) {
404 printf("Parse tree:\n");
405 dumptree(ps->p_grammar, ps->p_tree);
406 printf("\n");
407 printf("Tokens:\n");
408 showtree(ps->p_grammar, ps->p_tree);
409 printf("\n");
410 }
411 printf("Listing:\n");
412 PyNode_ListTree(ps->p_tree);
413 printf("\n");
414 }
415
416 #endif /* Py_DEBUG */
417
418 /*
419
420 Description
421 -----------
422
423 The parser's interface is different than usual: the function addtoken()
424 must be called for each token in the input. This makes it possible to
425 turn it into an incremental parsing system later. The parsing system
426 constructs a parse tree as it goes.
427
428 A parsing rule is represented as a Deterministic Finite-state Automaton
429 (DFA). A node in a DFA represents a state of the parser; an arc represents
430 a transition. Transitions are either labeled with terminal symbols or
431 with non-terminals. When the parser decides to follow an arc labeled
432 with a non-terminal, it is invoked recursively with the DFA representing
433 the parsing rule for that as its initial state; when that DFA accepts,
434 the parser that invoked it continues. The parse tree constructed by the
435 recursively called parser is inserted as a child in the current parse tree.
436
437 The DFA's can be constructed automatically from a more conventional
438 language description. An extended LL(1) grammar (ELL(1)) is suitable.
439 Certain restrictions make the parser's life easier: rules that can produce
440 the empty string should be outlawed (there are other ways to put loops
441 or optional parts in the language). To avoid the need to construct
442 FIRST sets, we can require that all but the last alternative of a rule
443 (really: arc going out of a DFA's state) must begin with a terminal
444 symbol.
445
446 As an example, consider this grammar:
447
448 expr: term (OP term)*
449 term: CONSTANT | '(' expr ')'
450
451 The DFA corresponding to the rule for expr is:
452
453 ------->.---term-->.------->
454 ^ |
455 | |
456 \----OP----/
457
458 The parse tree generated for the input a+b is:
459
460 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
461
462 */
463