1
2 /* Parser implementation */
3
4 /* For a description, see the comments at end of this file */
5
6 /* XXX To do: error recovery */
7
8 #include "Python.h"
9 #include "pgenheaders.h"
10 #include "token.h"
11 #include "grammar.h"
12 #include "node.h"
13 #include "parser.h"
14 #include "errcode.h"
15
16
17 #ifdef Py_DEBUG
18 extern int Py_DebugFlag;
19 #define D(x) if (!Py_DebugFlag); else x
20 #else
21 #define D(x)
22 #endif
23
24
25 /* STACK DATA TYPE */
26
27 static void s_reset(stack *);
28
29 static void
s_reset(stack * s)30 s_reset(stack *s)
31 {
32 s->s_top = &s->s_base[MAXSTACK];
33 }
34
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
36
37 static int
s_push(stack * s,dfa * d,node * parent)38 s_push(stack *s, dfa *d, node *parent)
39 {
40 stackentry *top;
41 if (s->s_top == s->s_base) {
42 fprintf(stderr, "s_push: parser stack overflow\n");
43 return E_NOMEM;
44 }
45 top = --s->s_top;
46 top->s_dfa = d;
47 top->s_parent = parent;
48 top->s_state = 0;
49 return 0;
50 }
51
52 #ifdef Py_DEBUG
53
54 static void
s_pop(stack * s)55 s_pop(stack *s)
56 {
57 if (s_empty(s))
58 Py_FatalError("s_pop: parser stack underflow -- FATAL");
59 s->s_top++;
60 }
61
62 #else /* !Py_DEBUG */
63
64 #define s_pop(s) (s)->s_top++
65
66 #endif
67
68
69 /* PARSER CREATION */
70
71 parser_state *
PyParser_New(grammar * g,int start)72 PyParser_New(grammar *g, int start)
73 {
74 parser_state *ps;
75
76 if (!g->g_accel)
77 PyGrammar_AddAccelerators(g);
78 ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
79 if (ps == NULL)
80 return NULL;
81 ps->p_grammar = g;
82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
83 ps->p_flags = 0;
84 #endif
85 ps->p_tree = PyNode_New(start);
86 if (ps->p_tree == NULL) {
87 PyMem_FREE(ps);
88 return NULL;
89 }
90 s_reset(&ps->p_stack);
91 (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
92 return ps;
93 }
94
95 void
PyParser_Delete(parser_state * ps)96 PyParser_Delete(parser_state *ps)
97 {
98 /* NB If you want to save the parse tree,
99 you must set p_tree to NULL before calling delparser! */
100 PyNode_Free(ps->p_tree);
101 PyMem_FREE(ps);
102 }
103
104
105 /* PARSER STACK OPERATIONS */
106
107 static int
shift(stack * s,int type,char * str,int newstate,int lineno,int col_offset)108 shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset)
109 {
110 int err;
111 assert(!s_empty(s));
112 err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
113 if (err)
114 return err;
115 s->s_top->s_state = newstate;
116 return 0;
117 }
118
119 static int
push(stack * s,int type,dfa * d,int newstate,int lineno,int col_offset)120 push(stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
121 {
122 int err;
123 node *n;
124 n = s->s_top->s_parent;
125 assert(!s_empty(s));
126 err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
127 if (err)
128 return err;
129 s->s_top->s_state = newstate;
130 return s_push(s, d, CHILD(n, NCH(n)-1));
131 }
132
133
134 /* PARSER PROPER */
135
136 static int
classify(parser_state * ps,int type,const char * str)137 classify(parser_state *ps, int type, const char *str)
138 {
139 grammar *g = ps->p_grammar;
140 int n = g->g_ll.ll_nlabels;
141
142 if (type == NAME) {
143 label *l = g->g_ll.ll_label;
144 int i;
145 for (i = n; i > 0; i--, l++) {
146 if (l->lb_type != NAME || l->lb_str == NULL ||
147 l->lb_str[0] != str[0] ||
148 strcmp(l->lb_str, str) != 0)
149 continue;
150 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
151 #if 0
152 /* Leaving this in as an example */
153 if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
154 if (str[0] == 'w' && strcmp(str, "with") == 0)
155 break; /* not a keyword yet */
156 else if (str[0] == 'a' && strcmp(str, "as") == 0)
157 break; /* not a keyword yet */
158 }
159 #endif
160 #endif
161 D(printf("It's a keyword\n"));
162 return n - i;
163 }
164 }
165
166 {
167 label *l = g->g_ll.ll_label;
168 int i;
169 for (i = n; i > 0; i--, l++) {
170 if (l->lb_type == type && l->lb_str == NULL) {
171 D(printf("It's a token we know\n"));
172 return n - i;
173 }
174 }
175 }
176
177 D(printf("Illegal token\n"));
178 return -1;
179 }
180
181 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
182 #if 0
183 /* Leaving this in as an example */
184 static void
185 future_hack(parser_state *ps)
186 {
187 node *n = ps->p_stack.s_top->s_parent;
188 node *ch, *cch;
189 int i;
190
191 /* from __future__ import ..., must have at least 4 children */
192 n = CHILD(n, 0);
193 if (NCH(n) < 4)
194 return;
195 ch = CHILD(n, 0);
196 if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
197 return;
198 ch = CHILD(n, 1);
199 if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
200 strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
201 return;
202 ch = CHILD(n, 3);
203 /* ch can be a star, a parenthesis or import_as_names */
204 if (TYPE(ch) == STAR)
205 return;
206 if (TYPE(ch) == LPAR)
207 ch = CHILD(n, 4);
208
209 for (i = 0; i < NCH(ch); i += 2) {
210 cch = CHILD(ch, i);
211 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
212 char *str_ch = STR(CHILD(cch, 0));
213 if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
214 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
215 } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
216 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
217 } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
218 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
219 }
220 }
221 }
222 }
223 #endif
224 #endif /* future keyword */
225
226 int
PyParser_AddToken(parser_state * ps,int type,char * str,int lineno,int col_offset,int * expected_ret)227 PyParser_AddToken(parser_state *ps, int type, char *str,
228 int lineno, int col_offset, int *expected_ret)
229 {
230 int ilabel;
231 int err;
232
233 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
234
235 /* Find out which label this token is */
236 ilabel = classify(ps, type, str);
237 if (ilabel < 0)
238 return E_SYNTAX;
239
240 /* Loop until the token is shifted or an error occurred */
241 for (;;) {
242 /* Fetch the current dfa and state */
243 dfa *d = ps->p_stack.s_top->s_dfa;
244 state *s = &d->d_state[ps->p_stack.s_top->s_state];
245
246 D(printf(" DFA '%s', state %d:",
247 d->d_name, ps->p_stack.s_top->s_state));
248
249 /* Check accelerator */
250 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
251 int x = s->s_accel[ilabel - s->s_lower];
252 if (x != -1) {
253 if (x & (1<<7)) {
254 /* Push non-terminal */
255 int nt = (x >> 8) + NT_OFFSET;
256 int arrow = x & ((1<<7)-1);
257 dfa *d1 = PyGrammar_FindDFA(
258 ps->p_grammar, nt);
259 if ((err = push(&ps->p_stack, nt, d1,
260 arrow, lineno, col_offset)) > 0) {
261 D(printf(" MemError: push\n"));
262 return err;
263 }
264 D(printf(" Push ...\n"));
265 continue;
266 }
267
268 /* Shift the token */
269 if ((err = shift(&ps->p_stack, type, str,
270 x, lineno, col_offset)) > 0) {
271 D(printf(" MemError: shift.\n"));
272 return err;
273 }
274 D(printf(" Shift.\n"));
275 /* Pop while we are in an accept-only state */
276 while (s = &d->d_state
277 [ps->p_stack.s_top->s_state],
278 s->s_accept && s->s_narcs == 1) {
279 D(printf(" DFA '%s', state %d: "
280 "Direct pop.\n",
281 d->d_name,
282 ps->p_stack.s_top->s_state));
283 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
284 #if 0
285 if (d->d_name[0] == 'i' &&
286 strcmp(d->d_name,
287 "import_stmt") == 0)
288 future_hack(ps);
289 #endif
290 #endif
291 s_pop(&ps->p_stack);
292 if (s_empty(&ps->p_stack)) {
293 D(printf(" ACCEPT.\n"));
294 return E_DONE;
295 }
296 d = ps->p_stack.s_top->s_dfa;
297 }
298 return E_OK;
299 }
300 }
301
302 if (s->s_accept) {
303 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
304 #if 0
305 if (d->d_name[0] == 'i' &&
306 strcmp(d->d_name, "import_stmt") == 0)
307 future_hack(ps);
308 #endif
309 #endif
310 /* Pop this dfa and try again */
311 s_pop(&ps->p_stack);
312 D(printf(" Pop ...\n"));
313 if (s_empty(&ps->p_stack)) {
314 D(printf(" Error: bottom of stack.\n"));
315 return E_SYNTAX;
316 }
317 continue;
318 }
319
320 /* Stuck, report syntax error */
321 D(printf(" Error.\n"));
322 if (expected_ret) {
323 if (s->s_lower == s->s_upper - 1) {
324 /* Only one possible expected token */
325 *expected_ret = ps->p_grammar->
326 g_ll.ll_label[s->s_lower].lb_type;
327 }
328 else
329 *expected_ret = -1;
330 }
331 return E_SYNTAX;
332 }
333 }
334
335
336 #ifdef Py_DEBUG
337
338 /* DEBUG OUTPUT */
339
340 void
dumptree(grammar * g,node * n)341 dumptree(grammar *g, node *n)
342 {
343 int i;
344
345 if (n == NULL)
346 printf("NIL");
347 else {
348 label l;
349 l.lb_type = TYPE(n);
350 l.lb_str = STR(n);
351 printf("%s", PyGrammar_LabelRepr(&l));
352 if (ISNONTERMINAL(TYPE(n))) {
353 printf("(");
354 for (i = 0; i < NCH(n); i++) {
355 if (i > 0)
356 printf(",");
357 dumptree(g, CHILD(n, i));
358 }
359 printf(")");
360 }
361 }
362 }
363
364 void
showtree(grammar * g,node * n)365 showtree(grammar *g, node *n)
366 {
367 int i;
368
369 if (n == NULL)
370 return;
371 if (ISNONTERMINAL(TYPE(n))) {
372 for (i = 0; i < NCH(n); i++)
373 showtree(g, CHILD(n, i));
374 }
375 else if (ISTERMINAL(TYPE(n))) {
376 printf("%s", _PyParser_TokenNames[TYPE(n)]);
377 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
378 printf("(%s)", STR(n));
379 printf(" ");
380 }
381 else
382 printf("? ");
383 }
384
385 void
printtree(parser_state * ps)386 printtree(parser_state *ps)
387 {
388 if (Py_DebugFlag) {
389 printf("Parse tree:\n");
390 dumptree(ps->p_grammar, ps->p_tree);
391 printf("\n");
392 printf("Tokens:\n");
393 showtree(ps->p_grammar, ps->p_tree);
394 printf("\n");
395 }
396 printf("Listing:\n");
397 PyNode_ListTree(ps->p_tree);
398 printf("\n");
399 }
400
401 #endif /* Py_DEBUG */
402
403 /*
404
405 Description
406 -----------
407
408 The parser's interface is different than usual: the function addtoken()
409 must be called for each token in the input. This makes it possible to
410 turn it into an incremental parsing system later. The parsing system
411 constructs a parse tree as it goes.
412
413 A parsing rule is represented as a Deterministic Finite-state Automaton
414 (DFA). A node in a DFA represents a state of the parser; an arc represents
415 a transition. Transitions are either labeled with terminal symbols or
416 with non-terminals. When the parser decides to follow an arc labeled
417 with a non-terminal, it is invoked recursively with the DFA representing
418 the parsing rule for that as its initial state; when that DFA accepts,
419 the parser that invoked it continues. The parse tree constructed by the
420 recursively called parser is inserted as a child in the current parse tree.
421
422 The DFA's can be constructed automatically from a more conventional
423 language description. An extended LL(1) grammar (ELL(1)) is suitable.
424 Certain restrictions make the parser's life easier: rules that can produce
425 the empty string should be outlawed (there are other ways to put loops
426 or optional parts in the language). To avoid the need to construct
427 FIRST sets, we can require that all but the last alternative of a rule
428 (really: arc going out of a DFA's state) must begin with a terminal
429 symbol.
430
431 As an example, consider this grammar:
432
433 expr: term (OP term)*
434 term: CONSTANT | '(' expr ')'
435
436 The DFA corresponding to the rule for expr is:
437
438 ------->.---term-->.------->
439 ^ |
440 | |
441 \----OP----/
442
443 The parse tree generated for the input a+b is:
444
445 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
446
447 */
448