1 
2 /* Parser-tokenizer link implementation */
3 
4 #include "Python.h"
5 #include "tokenizer.h"
6 #include "node.h"
7 #include "grammar.h"
8 #include "parser.h"
9 #include "parsetok.h"
10 #include "errcode.h"
11 #include "graminit.h"
12 
13 
14 /* Forward */
15 static node *parsetok(struct tok_state *, grammar *, int, perrdetail *, int *);
16 static int initerr(perrdetail *err_ret, PyObject * filename);
17 
/* Growable array of (line number, comment text) pairs.  parsetok() uses it
   to collect TYPE_IGNORE tokens while tokenizing, so that they can be
   attached to the parse tree once parsing has finished. */
typedef struct {
    struct {
        int lineno;     /* line number recorded for the comment */
        char *comment;  /* comment text; released with PyObject_FREE by
                           growable_comment_array_deallocate() */
    } *items;           /* element storage obtained from malloc()/realloc() */
    size_t size;        /* allocated capacity of `items`, in elements */
    size_t num_items;   /* number of elements currently in use */
} growable_comment_array;
26 
27 static int
growable_comment_array_init(growable_comment_array * arr,size_t initial_size)28 growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
29     assert(initial_size > 0);
30     arr->items = malloc(initial_size * sizeof(*arr->items));
31     arr->size = initial_size;
32     arr->num_items = 0;
33 
34     return arr->items != NULL;
35 }
36 
37 static int
growable_comment_array_add(growable_comment_array * arr,int lineno,char * comment)38 growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
39     if (arr->num_items >= arr->size) {
40         size_t new_size = arr->size * 2;
41         void *new_items_array = realloc(arr->items, new_size * sizeof(*arr->items));
42         if (!new_items_array) {
43             return 0;
44         }
45         arr->items = new_items_array;
46         arr->size = new_size;
47     }
48 
49     arr->items[arr->num_items].lineno = lineno;
50     arr->items[arr->num_items].comment = comment;
51     arr->num_items++;
52     return 1;
53 }
54 
55 static void
growable_comment_array_deallocate(growable_comment_array * arr)56 growable_comment_array_deallocate(growable_comment_array *arr) {
57     for (unsigned i = 0; i < arr->num_items; i++) {
58         PyObject_FREE(arr->items[i].comment);
59     }
60     free(arr->items);
61 }
62 
63 /* Parse input coming from a string.  Return error code, print some errors. */
64 node *
PyParser_ParseString(const char * s,grammar * g,int start,perrdetail * err_ret)65 PyParser_ParseString(const char *s, grammar *g, int start, perrdetail *err_ret)
66 {
67     return PyParser_ParseStringFlagsFilename(s, NULL, g, start, err_ret, 0);
68 }
69 
70 node *
PyParser_ParseStringFlags(const char * s,grammar * g,int start,perrdetail * err_ret,int flags)71 PyParser_ParseStringFlags(const char *s, grammar *g, int start,
72                           perrdetail *err_ret, int flags)
73 {
74     return PyParser_ParseStringFlagsFilename(s, NULL,
75                                              g, start, err_ret, flags);
76 }
77 
78 node *
PyParser_ParseStringFlagsFilename(const char * s,const char * filename,grammar * g,int start,perrdetail * err_ret,int flags)79 PyParser_ParseStringFlagsFilename(const char *s, const char *filename,
80                           grammar *g, int start,
81                           perrdetail *err_ret, int flags)
82 {
83     int iflags = flags;
84     return PyParser_ParseStringFlagsFilenameEx(s, filename, g, start,
85                                                err_ret, &iflags);
86 }
87 
88 node *
PyParser_ParseStringObject(const char * s,PyObject * filename,grammar * g,int start,perrdetail * err_ret,int * flags)89 PyParser_ParseStringObject(const char *s, PyObject *filename,
90                            grammar *g, int start,
91                            perrdetail *err_ret, int *flags)
92 {
93     struct tok_state *tok;
94     int exec_input = start == file_input;
95 
96     if (initerr(err_ret, filename) < 0)
97         return NULL;
98 
99     if (PySys_Audit("compile", "yO", s, err_ret->filename) < 0) {
100         err_ret->error = E_ERROR;
101         return NULL;
102     }
103 
104     if (*flags & PyPARSE_IGNORE_COOKIE)
105         tok = PyTokenizer_FromUTF8(s, exec_input);
106     else
107         tok = PyTokenizer_FromString(s, exec_input);
108     if (tok == NULL) {
109         err_ret->error = PyErr_Occurred() ? E_DECODE : E_NOMEM;
110         return NULL;
111     }
112     if (*flags & PyPARSE_TYPE_COMMENTS) {
113         tok->type_comments = 1;
114     }
115 
116     Py_INCREF(err_ret->filename);
117     tok->filename = err_ret->filename;
118     if (*flags & PyPARSE_ASYNC_HACKS)
119         tok->async_hacks = 1;
120     return parsetok(tok, g, start, err_ret, flags);
121 }
122 
123 node *
PyParser_ParseStringFlagsFilenameEx(const char * s,const char * filename_str,grammar * g,int start,perrdetail * err_ret,int * flags)124 PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
125                           grammar *g, int start,
126                           perrdetail *err_ret, int *flags)
127 {
128     node *n;
129     PyObject *filename = NULL;
130     if (filename_str != NULL) {
131         filename = PyUnicode_DecodeFSDefault(filename_str);
132         if (filename == NULL) {
133             err_ret->error = E_ERROR;
134             return NULL;
135         }
136     }
137     n = PyParser_ParseStringObject(s, filename, g, start, err_ret, flags);
138     Py_XDECREF(filename);
139     return n;
140 }
141 
142 /* Parse input coming from a file.  Return error code, print some errors. */
143 
144 node *
PyParser_ParseFile(FILE * fp,const char * filename,grammar * g,int start,const char * ps1,const char * ps2,perrdetail * err_ret)145 PyParser_ParseFile(FILE *fp, const char *filename, grammar *g, int start,
146                    const char *ps1, const char *ps2,
147                    perrdetail *err_ret)
148 {
149     return PyParser_ParseFileFlags(fp, filename, NULL,
150                                    g, start, ps1, ps2, err_ret, 0);
151 }
152 
153 node *
PyParser_ParseFileFlags(FILE * fp,const char * filename,const char * enc,grammar * g,int start,const char * ps1,const char * ps2,perrdetail * err_ret,int flags)154 PyParser_ParseFileFlags(FILE *fp, const char *filename, const char *enc,
155                         grammar *g, int start,
156                         const char *ps1, const char *ps2,
157                         perrdetail *err_ret, int flags)
158 {
159     int iflags = flags;
160     return PyParser_ParseFileFlagsEx(fp, filename, enc, g, start, ps1,
161                                      ps2, err_ret, &iflags);
162 }
163 
164 node *
PyParser_ParseFileObject(FILE * fp,PyObject * filename,const char * enc,grammar * g,int start,const char * ps1,const char * ps2,perrdetail * err_ret,int * flags)165 PyParser_ParseFileObject(FILE *fp, PyObject *filename,
166                          const char *enc, grammar *g, int start,
167                          const char *ps1, const char *ps2,
168                          perrdetail *err_ret, int *flags)
169 {
170     struct tok_state *tok;
171 
172     if (initerr(err_ret, filename) < 0)
173         return NULL;
174 
175     if (PySys_Audit("compile", "OO", Py_None, err_ret->filename) < 0) {
176         return NULL;
177     }
178 
179     if ((tok = PyTokenizer_FromFile(fp, enc, ps1, ps2)) == NULL) {
180         err_ret->error = E_NOMEM;
181         return NULL;
182     }
183     if (*flags & PyPARSE_TYPE_COMMENTS) {
184         tok->type_comments = 1;
185     }
186     Py_INCREF(err_ret->filename);
187     tok->filename = err_ret->filename;
188     return parsetok(tok, g, start, err_ret, flags);
189 }
190 
191 node *
PyParser_ParseFileFlagsEx(FILE * fp,const char * filename,const char * enc,grammar * g,int start,const char * ps1,const char * ps2,perrdetail * err_ret,int * flags)192 PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
193                           const char *enc, grammar *g, int start,
194                           const char *ps1, const char *ps2,
195                           perrdetail *err_ret, int *flags)
196 {
197     node *n;
198     PyObject *fileobj = NULL;
199     if (filename != NULL) {
200         fileobj = PyUnicode_DecodeFSDefault(filename);
201         if (fileobj == NULL) {
202             err_ret->error = E_ERROR;
203             return NULL;
204         }
205     }
206     n = PyParser_ParseFileObject(fp, fileobj, enc, g,
207                                  start, ps1, ps2, err_ret, flags);
208     Py_XDECREF(fileobj);
209     return n;
210 }
211 
212 /* Parse input coming from the given tokenizer structure.
213    Return error code. */
214 
215 static node *
parsetok(struct tok_state * tok,grammar * g,int start,perrdetail * err_ret,int * flags)216 parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
217          int *flags)
218 {
219     parser_state *ps;
220     node *n;
221     int started = 0;
222     int col_offset, end_col_offset;
223     growable_comment_array type_ignores;
224 
225     if (!growable_comment_array_init(&type_ignores, 10)) {
226         err_ret->error = E_NOMEM;
227         PyTokenizer_Free(tok);
228         return NULL;
229     }
230 
231     if ((ps = PyParser_New(g, start)) == NULL) {
232         err_ret->error = E_NOMEM;
233         growable_comment_array_deallocate(&type_ignores);
234         PyTokenizer_Free(tok);
235         return NULL;
236     }
237 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
238     if (*flags & PyPARSE_BARRY_AS_BDFL)
239         ps->p_flags |= CO_FUTURE_BARRY_AS_BDFL;
240     if (*flags & PyPARSE_TYPE_COMMENTS)
241         ps->p_flags |= PyCF_TYPE_COMMENTS;
242 #endif
243 
244     for (;;) {
245         const char *a, *b;
246         int type;
247         size_t len;
248         char *str;
249         col_offset = -1;
250         int lineno;
251         const char *line_start;
252 
253         type = PyTokenizer_Get(tok, &a, &b);
254 
255         len = (a != NULL && b != NULL) ? b - a : 0;
256         str = (char *) PyObject_MALLOC(len + 1);
257         if (str == NULL) {
258             err_ret->error = E_NOMEM;
259             break;
260         }
261         if (len > 0)
262             strncpy(str, a, len);
263         str[len] = '\0';
264 
265 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
266         if (type == NOTEQUAL) {
267             if (!(ps->p_flags & CO_FUTURE_BARRY_AS_BDFL) &&
268                             strcmp(str, "!=")) {
269                 PyObject_FREE(str);
270                 err_ret->error = E_SYNTAX;
271                 break;
272             }
273             else if ((ps->p_flags & CO_FUTURE_BARRY_AS_BDFL) &&
274                             strcmp(str, "<>")) {
275                 PyObject_FREE(str);
276                 err_ret->expected = NOTEQUAL;
277                 err_ret->error = E_SYNTAX;
278                 break;
279             }
280         }
281 #endif
282 
283         /* Nodes of type STRING, especially multi line strings
284            must be handled differently in order to get both
285            the starting line number and the column offset right.
286            (cf. issue 16806) */
287         lineno = type == STRING ? tok->first_lineno : tok->lineno;
288         line_start = type == STRING ? tok->multi_line_start : tok->line_start;
289         if (a != NULL && a >= line_start) {
290             col_offset = Py_SAFE_DOWNCAST(a - line_start,
291                                           intptr_t, int);
292         }
293         else {
294             col_offset = -1;
295         }
296 
297         if (b != NULL && b >= tok->line_start) {
298             end_col_offset = Py_SAFE_DOWNCAST(b - tok->line_start,
299                                               intptr_t, int);
300         }
301         else {
302             end_col_offset = -1;
303         }
304 
305         if (type == TYPE_IGNORE) {
306             if (!growable_comment_array_add(&type_ignores, tok->lineno, str)) {
307                 err_ret->error = E_NOMEM;
308                 break;
309             }
310             continue;
311         }
312 
313         if (type == ERRORTOKEN) {
314             err_ret->error = tok->done;
315             break;
316         }
317         if (type == ENDMARKER && started) {
318             type = NEWLINE; /* Add an extra newline */
319             started = 0;
320             /* Add the right number of dedent tokens,
321                except if a certain flag is given --
322                codeop.py uses this. */
323             if (tok->indent &&
324                 !(*flags & PyPARSE_DONT_IMPLY_DEDENT))
325             {
326                 tok->pendin = -tok->indent;
327                 tok->indent = 0;
328             }
329         }
330         else {
331             started = 1;
332         }
333 
334         if ((err_ret->error =
335              PyParser_AddToken(ps, (int)type, str,
336                                lineno, col_offset, tok->lineno, end_col_offset,
337                                &(err_ret->expected))) != E_OK) {
338             if (tok->done == E_EOF && !ISWHITESPACE(type)) {
339                 tok->done = E_SYNTAX;
340             }
341             if (err_ret->error != E_DONE) {
342                 PyObject_FREE(str);
343                 err_ret->token = type;
344             }
345             break;
346         }
347     }
348 
349     if (err_ret->error == E_DONE) {
350         n = ps->p_tree;
351         ps->p_tree = NULL;
352 
353         if (n->n_type == file_input) {
354             /* Put type_ignore nodes in the ENDMARKER of file_input. */
355             int num;
356             node *ch;
357             size_t i;
358 
359             num = NCH(n);
360             ch = CHILD(n, num - 1);
361             REQ(ch, ENDMARKER);
362 
363             for (i = 0; i < type_ignores.num_items; i++) {
364                 int res = PyNode_AddChild(ch, TYPE_IGNORE, type_ignores.items[i].comment,
365                                           type_ignores.items[i].lineno, 0,
366                                           type_ignores.items[i].lineno, 0);
367                 if (res != 0) {
368                     err_ret->error = res;
369                     PyNode_Free(n);
370                     n = NULL;
371                     break;
372                 }
373                 type_ignores.items[i].comment = NULL;
374             }
375         }
376 
377         /* Check that the source for a single input statement really
378            is a single statement by looking at what is left in the
379            buffer after parsing.  Trailing whitespace and comments
380            are OK.  */
381         if (err_ret->error == E_DONE && start == single_input) {
382             const char *cur = tok->cur;
383             char c = *tok->cur;
384 
385             for (;;) {
386                 while (c == ' ' || c == '\t' || c == '\n' || c == '\014')
387                     c = *++cur;
388 
389                 if (!c)
390                     break;
391 
392                 if (c != '#') {
393                     err_ret->error = E_BADSINGLE;
394                     PyNode_Free(n);
395                     n = NULL;
396                     break;
397                 }
398 
399                 /* Suck up comment. */
400                 while (c && c != '\n')
401                     c = *++cur;
402             }
403         }
404     }
405     else
406         n = NULL;
407 
408     growable_comment_array_deallocate(&type_ignores);
409 
410 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
411     *flags = ps->p_flags;
412 #endif
413     PyParser_Delete(ps);
414 
415     if (n == NULL) {
416         if (tok->done == E_EOF)
417             err_ret->error = E_EOF;
418         err_ret->lineno = tok->lineno;
419         if (tok->buf != NULL) {
420             size_t len;
421             assert(tok->cur - tok->buf < INT_MAX);
422             /* if we've managed to parse a token, point the offset to its start,
423              * else use the current reading position of the tokenizer
424              */
425             err_ret->offset = col_offset != -1 ? col_offset + 1 : ((int)(tok->cur - tok->buf));
426             len = tok->inp - tok->buf;
427             err_ret->text = (char *) PyObject_MALLOC(len + 1);
428             if (err_ret->text != NULL) {
429                 if (len > 0)
430                     strncpy(err_ret->text, tok->buf, len);
431                 err_ret->text[len] = '\0';
432             }
433         }
434     } else if (tok->encoding != NULL) {
435         /* 'nodes->n_str' uses PyObject_*, while 'tok->encoding' was
436          * allocated using PyMem_
437          */
438         node* r = PyNode_New(encoding_decl);
439         if (r)
440             r->n_str = PyObject_MALLOC(strlen(tok->encoding)+1);
441         if (!r || !r->n_str) {
442             err_ret->error = E_NOMEM;
443             if (r)
444                 PyObject_FREE(r);
445             n = NULL;
446             goto done;
447         }
448         strcpy(r->n_str, tok->encoding);
449         PyMem_FREE(tok->encoding);
450         tok->encoding = NULL;
451         r->n_nchildren = 1;
452         r->n_child = n;
453         n = r;
454     }
455 
456 done:
457     PyTokenizer_Free(tok);
458 
459     if (n != NULL) {
460         _PyNode_FinalizeEndPos(n);
461     }
462     return n;
463 }
464 
465 static int
initerr(perrdetail * err_ret,PyObject * filename)466 initerr(perrdetail *err_ret, PyObject *filename)
467 {
468     err_ret->error = E_OK;
469     err_ret->lineno = 0;
470     err_ret->offset = 0;
471     err_ret->text = NULL;
472     err_ret->token = -1;
473     err_ret->expected = -1;
474     if (filename) {
475         Py_INCREF(filename);
476         err_ret->filename = filename;
477     }
478     else {
479         err_ret->filename = PyUnicode_FromString("<string>");
480         if (err_ret->filename == NULL) {
481             err_ret->error = E_ERROR;
482             return -1;
483         }
484     }
485     return 0;
486 }
487