1 
2 /* Tokenizer implementation */
3 
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 #include "pycore_call.h"          // _PyObject_CallNoArgs()
7 
8 #include <ctype.h>
9 #include <assert.h>
10 
11 #include "tokenizer.h"
12 #include "errcode.h"
13 
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "abstract.h"
18 
19 /* Alternate tab spacing */
20 #define ALTTABSIZE 1
21 
/* Non-zero if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.  Every use of the argument is parenthesized so that
   an expression argument is parsed as a unit.  C is still evaluated more
   than once, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
27 
/* Non-zero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.  Every use of the argument is
   parenthesized so that an expression argument is parsed as a unit.  C is
   still evaluated more than once, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
34 
35 
36 /* Don't ever change this -- it would break the portability of Python code */
37 #define TABSIZE 8
38 
39 /* Forward */
40 static struct tok_state *tok_new(void);
41 static int tok_nextc(struct tok_state *tok);
42 static void tok_backup(struct tok_state *tok, int c);
43 
44 
45 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
46    tokenizing. */
47 static const char* type_comment_prefix = "# type: ";
48 
49 /* Create and initialize a new tok_state structure */
50 
51 static struct tok_state *
tok_new(void)52 tok_new(void)
53 {
54     struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
55                                             sizeof(struct tok_state));
56     if (tok == NULL)
57         return NULL;
58     tok->buf = tok->cur = tok->inp = NULL;
59     tok->fp_interactive = 0;
60     tok->interactive_src_start = NULL;
61     tok->interactive_src_end = NULL;
62     tok->start = NULL;
63     tok->end = NULL;
64     tok->done = E_OK;
65     tok->fp = NULL;
66     tok->input = NULL;
67     tok->tabsize = TABSIZE;
68     tok->indent = 0;
69     tok->indstack[0] = 0;
70     tok->atbol = 1;
71     tok->pendin = 0;
72     tok->prompt = tok->nextprompt = NULL;
73     tok->lineno = 0;
74     tok->level = 0;
75     tok->altindstack[0] = 0;
76     tok->decoding_state = STATE_INIT;
77     tok->decoding_erred = 0;
78     tok->enc = NULL;
79     tok->encoding = NULL;
80     tok->cont_line = 0;
81     tok->filename = NULL;
82     tok->decoding_readline = NULL;
83     tok->decoding_buffer = NULL;
84     tok->type_comments = 0;
85     tok->async_hacks = 0;
86     tok->async_def = 0;
87     tok->async_def_indent = 0;
88     tok->async_def_nl = 0;
89     tok->interactive_underflow = IUNDERFLOW_NORMAL;
90     tok->str = NULL;
91     return tok;
92 }
93 
94 static char *
new_string(const char * s,Py_ssize_t len,struct tok_state * tok)95 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
96 {
97     char* result = (char *)PyMem_Malloc(len + 1);
98     if (!result) {
99         tok->done = E_NOMEM;
100         return NULL;
101     }
102     memcpy(result, s, len);
103     result[len] = '\0';
104     return result;
105 }
106 
107 static char *
error_ret(struct tok_state * tok)108 error_ret(struct tok_state *tok) /* XXX */
109 {
110     tok->decoding_erred = 1;
111     if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
112         PyMem_Free(tok->buf);
113     tok->buf = tok->cur = tok->inp = NULL;
114     tok->start = NULL;
115     tok->end = NULL;
116     tok->done = E_DECODE;
117     return NULL;                /* as if it were EOF */
118 }
119 
120 
/* Map common spellings of the utf-8 and latin-1 codec names to a canonical
   name.

   At most the first 12 characters of S are examined, with '_' treated as
   '-' and letters lower-cased.  Returns the literal "utf-8" or "iso-8859-1"
   when S is one of the recognized spellings (optionally followed by a
   '-' suffix); otherwise returns S itself, unchanged. */

static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* tolower() requires a value representable as unsigned char (or
           EOF); a plain char with the high bit set may be negative. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
149 
150 /* Return the coding spec in S, or NULL if none is found.  */
151 
152 static int
get_coding_spec(const char * s,char ** spec,Py_ssize_t size,struct tok_state * tok)153 get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
154 {
155     Py_ssize_t i;
156     *spec = NULL;
157     /* Coding spec must be in a comment, and that comment must be
158      * the only statement on the source code line. */
159     for (i = 0; i < size - 6; i++) {
160         if (s[i] == '#')
161             break;
162         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
163             return 1;
164     }
165     for (; i < size - 6; i++) { /* XXX inefficient search */
166         const char* t = s + i;
167         if (memcmp(t, "coding", 6) == 0) {
168             const char* begin = NULL;
169             t += 6;
170             if (t[0] != ':' && t[0] != '=')
171                 continue;
172             do {
173                 t++;
174             } while (t[0] == ' ' || t[0] == '\t');
175 
176             begin = t;
177             while (Py_ISALNUM(t[0]) ||
178                    t[0] == '-' || t[0] == '_' || t[0] == '.')
179                 t++;
180 
181             if (begin < t) {
182                 char* r = new_string(begin, t - begin, tok);
183                 const char* q;
184                 if (!r)
185                     return 0;
186                 q = get_normal_name(r);
187                 if (r != q) {
188                     PyMem_Free(r);
189                     r = new_string(q, strlen(q), tok);
190                     if (!r)
191                         return 0;
192                 }
193                 *spec = r;
194                 break;
195             }
196         }
197     }
198     return 1;
199 }
200 
201 /* Check whether the line contains a coding spec. If it does,
202    invoke the set_readline function for the new encoding.
203    This function receives the tok_state and the new encoding.
204    Return 1 on success, 0 on failure.  */
205 
206 static int
check_coding_spec(const char * line,Py_ssize_t size,struct tok_state * tok,int set_readline (struct tok_state *,const char *))207 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
208                   int set_readline(struct tok_state *, const char *))
209 {
210     char *cs;
211     if (tok->cont_line) {
212         /* It's a continuation line, so it can't be a coding spec. */
213         tok->decoding_state = STATE_NORMAL;
214         return 1;
215     }
216     if (!get_coding_spec(line, &cs, size, tok)) {
217         return 0;
218     }
219     if (!cs) {
220         Py_ssize_t i;
221         for (i = 0; i < size; i++) {
222             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
223                 break;
224             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
225                 /* Stop checking coding spec after a line containing
226                  * anything except a comment. */
227                 tok->decoding_state = STATE_NORMAL;
228                 break;
229             }
230         }
231         return 1;
232     }
233     tok->decoding_state = STATE_NORMAL;
234     if (tok->encoding == NULL) {
235         assert(tok->decoding_readline == NULL);
236         if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
237             error_ret(tok);
238             PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
239             PyMem_Free(cs);
240             return 0;
241         }
242         tok->encoding = cs;
243     } else {                /* then, compare cs with BOM */
244         if (strcmp(tok->encoding, cs) != 0) {
245             error_ret(tok);
246             PyErr_Format(PyExc_SyntaxError,
247                          "encoding problem: %s with BOM", cs);
248             PyMem_Free(cs);
249             return 0;
250         }
251         PyMem_Free(cs);
252     }
253     return 1;
254 }
255 
256 /* See whether the file starts with a BOM. If it does,
257    invoke the set_readline function with the new encoding.
258    Return 1 on success, 0 on failure.  */
259 
260 static int
check_bom(int get_char (struct tok_state *),void unget_char (int,struct tok_state *),int set_readline (struct tok_state *,const char *),struct tok_state * tok)261 check_bom(int get_char(struct tok_state *),
262           void unget_char(int, struct tok_state *),
263           int set_readline(struct tok_state *, const char *),
264           struct tok_state *tok)
265 {
266     int ch1, ch2, ch3;
267     ch1 = get_char(tok);
268     tok->decoding_state = STATE_SEEK_CODING;
269     if (ch1 == EOF) {
270         return 1;
271     } else if (ch1 == 0xEF) {
272         ch2 = get_char(tok);
273         if (ch2 != 0xBB) {
274             unget_char(ch2, tok);
275             unget_char(ch1, tok);
276             return 1;
277         }
278         ch3 = get_char(tok);
279         if (ch3 != 0xBF) {
280             unget_char(ch3, tok);
281             unget_char(ch2, tok);
282             unget_char(ch1, tok);
283             return 1;
284         }
285 #if 0
286     /* Disable support for UTF-16 BOMs until a decision
287        is made whether this needs to be supported.  */
288     } else if (ch1 == 0xFE) {
289         ch2 = get_char(tok);
290         if (ch2 != 0xFF) {
291             unget_char(ch2, tok);
292             unget_char(ch1, tok);
293             return 1;
294         }
295         if (!set_readline(tok, "utf-16-be"))
296             return 0;
297         tok->decoding_state = STATE_NORMAL;
298     } else if (ch1 == 0xFF) {
299         ch2 = get_char(tok);
300         if (ch2 != 0xFE) {
301             unget_char(ch2, tok);
302             unget_char(ch1, tok);
303             return 1;
304         }
305         if (!set_readline(tok, "utf-16-le"))
306             return 0;
307         tok->decoding_state = STATE_NORMAL;
308 #endif
309     } else {
310         unget_char(ch1, tok);
311         return 1;
312     }
313     if (tok->encoding != NULL)
314         PyMem_Free(tok->encoding);
315     tok->encoding = new_string("utf-8", 5, tok);
316     if (!tok->encoding)
317         return 0;
318     /* No need to set_readline: input is already utf-8 */
319     return 1;
320 }
321 
322 static int
tok_concatenate_interactive_new_line(struct tok_state * tok,const char * line)323 tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
324     assert(tok->fp_interactive);
325 
326     if (!line) {
327         return 0;
328     }
329 
330     Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
331     Py_ssize_t line_size = strlen(line);
332     char* new_str = tok->interactive_src_start;
333 
334     new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
335     if (!new_str) {
336         if (tok->interactive_src_start) {
337             PyMem_Free(tok->interactive_src_start);
338         }
339         tok->interactive_src_start = NULL;
340         tok->interactive_src_end = NULL;
341         tok->done = E_NOMEM;
342         return -1;
343     }
344     strcpy(new_str + current_size, line);
345 
346     tok->interactive_src_start = new_str;
347     tok->interactive_src_end = new_str + current_size + line_size;
348     return 0;
349 }
350 
351 
352 /* Read a line of text from TOK into S, using the stream in TOK.
353    Return NULL on failure, else S.
354 
355    On entry, tok->decoding_buffer will be one of:
356      1) NULL: need to call tok->decoding_readline to get a new line
357      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
358        stored the result in tok->decoding_buffer
359      3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
360        (in the s buffer) to copy entire contents of the line read
361        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
362        In this case, tok_readline_recode is called in a loop (with an expanded buffer)
363        until the buffer ends with a '\n' (or until the end of the file is
364        reached): see tok_nextc and its calls to tok_reserve_buf.
365 */
366 
367 static int
tok_reserve_buf(struct tok_state * tok,Py_ssize_t size)368 tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
369 {
370     Py_ssize_t cur = tok->cur - tok->buf;
371     Py_ssize_t oldsize = tok->inp - tok->buf;
372     Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
373     if (newsize > tok->end - tok->buf) {
374         char *newbuf = tok->buf;
375         Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
376         Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
377         Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
378         newbuf = (char *)PyMem_Realloc(newbuf, newsize);
379         if (newbuf == NULL) {
380             tok->done = E_NOMEM;
381             return 0;
382         }
383         tok->buf = newbuf;
384         tok->cur = tok->buf + cur;
385         tok->inp = tok->buf + oldsize;
386         tok->end = tok->buf + newsize;
387         tok->start = start < 0 ? NULL : tok->buf + start;
388         tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
389         tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
390     }
391     return 1;
392 }
393 
394 static int
tok_readline_recode(struct tok_state * tok)395 tok_readline_recode(struct tok_state *tok) {
396     PyObject *line;
397     const  char *buf;
398     Py_ssize_t buflen;
399     line = tok->decoding_buffer;
400     if (line == NULL) {
401         line = PyObject_CallNoArgs(tok->decoding_readline);
402         if (line == NULL) {
403             error_ret(tok);
404             goto error;
405         }
406     }
407     else {
408         tok->decoding_buffer = NULL;
409     }
410     buf = PyUnicode_AsUTF8AndSize(line, &buflen);
411     if (buf == NULL) {
412         error_ret(tok);
413         goto error;
414     }
415     if (!tok_reserve_buf(tok, buflen + 1)) {
416         goto error;
417     }
418     memcpy(tok->inp, buf, buflen);
419     tok->inp += buflen;
420     *tok->inp = '\0';
421     if (tok->fp_interactive &&
422         tok_concatenate_interactive_new_line(tok, buf) == -1) {
423         goto error;
424     }
425     Py_DECREF(line);
426     return 1;
427 error:
428     Py_XDECREF(line);
429     return 0;
430 }
431 
432 /* Set the readline function for TOK to a StreamReader's
433    readline function. The StreamReader is named ENC.
434 
435    This function is called from check_bom and check_coding_spec.
436 
437    ENC is usually identical to the future value of tok->encoding,
438    except for the (currently unsupported) case of UTF-16.
439 
440    Return 1 on success, 0 on failure. */
441 
442 static int
fp_setreadl(struct tok_state * tok,const char * enc)443 fp_setreadl(struct tok_state *tok, const char* enc)
444 {
445     PyObject *readline, *io, *stream;
446     _Py_IDENTIFIER(open);
447     _Py_IDENTIFIER(readline);
448     int fd;
449     long pos;
450 
451     fd = fileno(tok->fp);
452     /* Due to buffering the file offset for fd can be different from the file
453      * position of tok->fp.  If tok->fp was opened in text mode on Windows,
454      * its file position counts CRLF as one char and can't be directly mapped
455      * to the file offset for fd.  Instead we step back one byte and read to
456      * the end of line.*/
457     pos = ftell(tok->fp);
458     if (pos == -1 ||
459         lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
460         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
461         return 0;
462     }
463 
464     io = PyImport_ImportModuleNoBlock("io");
465     if (io == NULL)
466         return 0;
467 
468     stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
469                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
470     Py_DECREF(io);
471     if (stream == NULL)
472         return 0;
473 
474     readline = _PyObject_GetAttrId(stream, &PyId_readline);
475     Py_DECREF(stream);
476     if (readline == NULL)
477         return 0;
478     Py_XSETREF(tok->decoding_readline, readline);
479 
480     if (pos > 0) {
481         PyObject *bufobj = _PyObject_CallNoArgs(readline);
482         if (bufobj == NULL)
483             return 0;
484         Py_DECREF(bufobj);
485     }
486 
487     return 1;
488 }
489 
490 /* Fetch the next byte from TOK. */
491 
fp_getc(struct tok_state * tok)492 static int fp_getc(struct tok_state *tok) {
493     return getc(tok->fp);
494 }
495 
496 /* Unfetch the last byte back into TOK.  */
497 
fp_ungetc(int c,struct tok_state * tok)498 static void fp_ungetc(int c, struct tok_state *tok) {
499     ungetc(c, tok->fp);
500 }
501 
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.

   S must be NUL-terminated.  Bytes are examined strictly in order and the
   scan stops at the first byte that is not acceptable, so a sequence cut
   short by the terminator never causes a read past it.

   Besides stray continuation bytes and bad lead bytes, the following are
   rejected:
     - overlong forms (lead bytes C0/C1, E0 followed by 80..9F,
       F0 followed by 80..8F);
     - surrogates D800..DFFF (ED followed by A0..BF);
     - values above U+10FFFF (F4 followed by 90..BF, lead bytes F5..FF). */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (s[0] < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (s[0] < 0xC2) {
        /* 80..BF: continuation byte; C0..C1: overlong two-byte form */
        return 0;
    }
    if (s[0] < 0xE0) {
        expected = 1;
    }
    else if (s[0] < 0xF0) {
        if (s[0] == 0xE0 && s[1] < 0xA0) {
            /* overlong three-byte form (or truncated sequence) */
            return 0;
        }
        if (s[0] == 0xED && s[1] >= 0xA0) {
            /* would decode to a surrogate */
            return 0;
        }
        expected = 2;
    }
    else if (s[0] < 0xF5) {
        if (s[0] == 0xF0 && s[1] < 0x90) {
            /* overlong four-byte form (or truncated sequence) */
            return 0;
        }
        if (s[0] == 0xF4 && s[1] >= 0x90) {
            /* beyond U+10FFFF */
            return 0;
        }
        expected = 3;
    }
    else {
        /* F5..FF can never start a sequence */
        return 0;
    }

    /* Walk forward so that a NUL terminator (which is < 0x80) ends the
       scan before any byte after it is touched. */
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return expected + 1;
}
529 
530 static int
ensure_utf8(char * line,struct tok_state * tok)531 ensure_utf8(char *line, struct tok_state *tok)
532 {
533     int badchar = 0;
534     unsigned char *c;
535     int length;
536     for (c = (unsigned char *)line; *c; c += length) {
537         if (!(length = valid_utf8(c))) {
538             badchar = *c;
539             break;
540         }
541     }
542     if (badchar) {
543         /* Need to add 1 to the line number, since this line
544        has not been counted, yet.  */
545         PyErr_Format(PyExc_SyntaxError,
546                      "Non-UTF-8 code starting with '\\x%.2x' "
547                      "in file %U on line %i, "
548                      "but no encoding declared; "
549                      "see https://python.org/dev/peps/pep-0263/ for details",
550                      badchar, tok->filename, tok->lineno + 1);
551         return 0;
552     }
553     return 1;
554 }
555 
556 /* Fetch a byte from TOK, using the string buffer. */
557 
558 static int
buf_getc(struct tok_state * tok)559 buf_getc(struct tok_state *tok) {
560     return Py_CHARMASK(*tok->str++);
561 }
562 
563 /* Unfetch a byte from TOK, using the string buffer. */
564 
565 static void
buf_ungetc(int c,struct tok_state * tok)566 buf_ungetc(int c, struct tok_state *tok) {
567     tok->str--;
568     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
569 }
570 
571 /* Set the readline function for TOK to ENC. For the string-based
572    tokenizer, this means to just record the encoding. */
573 
574 static int
buf_setreadl(struct tok_state * tok,const char * enc)575 buf_setreadl(struct tok_state *tok, const char* enc) {
576     tok->enc = enc;
577     return 1;
578 }
579 
580 /* Return a UTF-8 encoding Python string object from the
581    C byte string STR, which is encoded with ENC. */
582 
583 static PyObject *
translate_into_utf8(const char * str,const char * enc)584 translate_into_utf8(const char* str, const char* enc) {
585     PyObject *utf8;
586     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
587     if (buf == NULL)
588         return NULL;
589     utf8 = PyUnicode_AsUTF8String(buf);
590     Py_DECREF(buf);
591     return utf8;
592 }
593 
594 
595 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)596 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
597     int skip_next_lf = 0;
598     size_t needed_length = strlen(s) + 2, final_length;
599     char *buf, *current;
600     char c = '\0';
601     buf = PyMem_Malloc(needed_length);
602     if (buf == NULL) {
603         tok->done = E_NOMEM;
604         return NULL;
605     }
606     for (current = buf; *s; s++, current++) {
607         c = *s;
608         if (skip_next_lf) {
609             skip_next_lf = 0;
610             if (c == '\n') {
611                 c = *++s;
612                 if (!c)
613                     break;
614             }
615         }
616         if (c == '\r') {
617             skip_next_lf = 1;
618             c = '\n';
619         }
620         *current = c;
621     }
622     /* If this is exec input, add a newline to the end of the string if
623        there isn't one already. */
624     if (exec_input && c != '\n') {
625         *current = '\n';
626         current++;
627     }
628     *current = '\0';
629     final_length = current - buf + 1;
630     if (final_length < needed_length && final_length) {
631         /* should never fail */
632         char* result = PyMem_Realloc(buf, final_length);
633         if (result == NULL) {
634             PyMem_Free(buf);
635         }
636         buf = result;
637     }
638     return buf;
639 }
640 
641 /* Decode a byte string STR for use as the buffer of TOK.
642    Look for encoding declarations inside STR, and record them
643    inside TOK.  */
644 
645 static char *
decode_str(const char * input,int single,struct tok_state * tok)646 decode_str(const char *input, int single, struct tok_state *tok)
647 {
648     PyObject* utf8 = NULL;
649     char *str;
650     const char *s;
651     const char *newl[2] = {NULL, NULL};
652     int lineno = 0;
653     tok->input = str = translate_newlines(input, single, tok);
654     if (str == NULL)
655         return NULL;
656     tok->enc = NULL;
657     tok->str = str;
658     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
659         return error_ret(tok);
660     str = tok->str;             /* string after BOM if any */
661     assert(str);
662     if (tok->enc != NULL) {
663         utf8 = translate_into_utf8(str, tok->enc);
664         if (utf8 == NULL)
665             return error_ret(tok);
666         str = PyBytes_AsString(utf8);
667     }
668     for (s = str;; s++) {
669         if (*s == '\0') break;
670         else if (*s == '\n') {
671             assert(lineno < 2);
672             newl[lineno] = s;
673             lineno++;
674             if (lineno == 2) break;
675         }
676     }
677     tok->enc = NULL;
678     /* need to check line 1 and 2 separately since check_coding_spec
679        assumes a single line as input */
680     if (newl[0]) {
681         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
682             return NULL;
683         }
684         if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
685             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
686                                    tok, buf_setreadl))
687                 return NULL;
688         }
689     }
690     if (tok->enc != NULL) {
691         assert(utf8 == NULL);
692         utf8 = translate_into_utf8(str, tok->enc);
693         if (utf8 == NULL)
694             return error_ret(tok);
695         str = PyBytes_AS_STRING(utf8);
696     }
697     assert(tok->decoding_buffer == NULL);
698     tok->decoding_buffer = utf8; /* CAUTION */
699     return str;
700 }
701 
702 /* Set up tokenizer for string */
703 
704 struct tok_state *
_PyTokenizer_FromString(const char * str,int exec_input)705 _PyTokenizer_FromString(const char *str, int exec_input)
706 {
707     struct tok_state *tok = tok_new();
708     char *decoded;
709 
710     if (tok == NULL)
711         return NULL;
712     decoded = decode_str(str, exec_input, tok);
713     if (decoded == NULL) {
714         _PyTokenizer_Free(tok);
715         return NULL;
716     }
717 
718     tok->buf = tok->cur = tok->inp = decoded;
719     tok->end = decoded;
720     return tok;
721 }
722 
723 /* Set up tokenizer for UTF-8 string */
724 
725 struct tok_state *
_PyTokenizer_FromUTF8(const char * str,int exec_input)726 _PyTokenizer_FromUTF8(const char *str, int exec_input)
727 {
728     struct tok_state *tok = tok_new();
729     char *translated;
730     if (tok == NULL)
731         return NULL;
732     tok->input = translated = translate_newlines(str, exec_input, tok);
733     if (translated == NULL) {
734         _PyTokenizer_Free(tok);
735         return NULL;
736     }
737     tok->decoding_state = STATE_NORMAL;
738     tok->enc = NULL;
739     tok->str = translated;
740     tok->encoding = new_string("utf-8", 5, tok);
741     if (!tok->encoding) {
742         _PyTokenizer_Free(tok);
743         return NULL;
744     }
745 
746     tok->buf = tok->cur = tok->inp = translated;
747     tok->end = translated;
748     return tok;
749 }
750 
751 /* Set up tokenizer for file */
752 
753 struct tok_state *
_PyTokenizer_FromFile(FILE * fp,const char * enc,const char * ps1,const char * ps2)754 _PyTokenizer_FromFile(FILE *fp, const char* enc,
755                       const char *ps1, const char *ps2)
756 {
757     struct tok_state *tok = tok_new();
758     if (tok == NULL)
759         return NULL;
760     if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
761         _PyTokenizer_Free(tok);
762         return NULL;
763     }
764     tok->cur = tok->inp = tok->buf;
765     tok->end = tok->buf + BUFSIZ;
766     tok->fp = fp;
767     tok->prompt = ps1;
768     tok->nextprompt = ps2;
769     if (enc != NULL) {
770         /* Must copy encoding declaration since it
771            gets copied into the parse tree. */
772         tok->encoding = new_string(enc, strlen(enc), tok);
773         if (!tok->encoding) {
774             _PyTokenizer_Free(tok);
775             return NULL;
776         }
777         tok->decoding_state = STATE_NORMAL;
778     }
779     return tok;
780 }
781 
782 /* Free a tok_state structure */
783 
784 void
_PyTokenizer_Free(struct tok_state * tok)785 _PyTokenizer_Free(struct tok_state *tok)
786 {
787     if (tok->encoding != NULL) {
788         PyMem_Free(tok->encoding);
789     }
790     Py_XDECREF(tok->decoding_readline);
791     Py_XDECREF(tok->decoding_buffer);
792     Py_XDECREF(tok->filename);
793     if (tok->fp != NULL && tok->buf != NULL) {
794         PyMem_Free(tok->buf);
795     }
796     if (tok->input) {
797         PyMem_Free(tok->input);
798     }
799     if (tok->interactive_src_start != NULL) {
800         PyMem_Free(tok->interactive_src_start);
801     }
802     PyMem_Free(tok);
803 }
804 
805 static int
tok_readline_raw(struct tok_state * tok)806 tok_readline_raw(struct tok_state *tok)
807 {
808     do {
809         if (!tok_reserve_buf(tok, BUFSIZ)) {
810             return 0;
811         }
812         char *line = Py_UniversalNewlineFgets(tok->inp,
813                                               (int)(tok->end - tok->inp),
814                                               tok->fp, NULL);
815         if (line == NULL) {
816             return 1;
817         }
818         if (tok->fp_interactive &&
819             tok_concatenate_interactive_new_line(tok, line) == -1) {
820             return 0;
821         }
822         if (*tok->inp == '\0') {
823             return 0;
824         }
825         tok->inp = strchr(tok->inp, '\0');
826     } while (tok->inp[-1] != '\n');
827     return 1;
828 }
829 
830 static int
tok_underflow_string(struct tok_state * tok)831 tok_underflow_string(struct tok_state *tok) {
832     char *end = strchr(tok->inp, '\n');
833     if (end != NULL) {
834         end++;
835     }
836     else {
837         end = strchr(tok->inp, '\0');
838         if (end == tok->inp) {
839             tok->done = E_EOF;
840             return 0;
841         }
842     }
843     if (tok->start == NULL) {
844         tok->buf = tok->cur;
845     }
846     tok->line_start = tok->cur;
847     tok->lineno++;
848     tok->inp = end;
849     return 1;
850 }
851 
852 static int
tok_underflow_interactive(struct tok_state * tok)853 tok_underflow_interactive(struct tok_state *tok) {
854     if (tok->interactive_underflow == IUNDERFLOW_STOP) {
855         tok->done = E_INTERACT_STOP;
856         return 1;
857     }
858     char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
859     if (newtok != NULL) {
860         char *translated = translate_newlines(newtok, 0, tok);
861         PyMem_Free(newtok);
862         if (translated == NULL) {
863             return 0;
864         }
865         newtok = translated;
866     }
867     if (tok->encoding && newtok && *newtok) {
868         /* Recode to UTF-8 */
869         Py_ssize_t buflen;
870         const char* buf;
871         PyObject *u = translate_into_utf8(newtok, tok->encoding);
872         PyMem_Free(newtok);
873         if (u == NULL) {
874             tok->done = E_DECODE;
875             return 0;
876         }
877         buflen = PyBytes_GET_SIZE(u);
878         buf = PyBytes_AS_STRING(u);
879         newtok = PyMem_Malloc(buflen+1);
880         if (newtok == NULL) {
881             Py_DECREF(u);
882             tok->done = E_NOMEM;
883             return 0;
884         }
885         strcpy(newtok, buf);
886         Py_DECREF(u);
887     }
888     if (tok->fp_interactive &&
889         tok_concatenate_interactive_new_line(tok, newtok) == -1) {
890         PyMem_Free(newtok);
891         return 0;
892     }
893     if (tok->nextprompt != NULL) {
894         tok->prompt = tok->nextprompt;
895     }
896     if (newtok == NULL) {
897         tok->done = E_INTR;
898     }
899     else if (*newtok == '\0') {
900         PyMem_Free(newtok);
901         tok->done = E_EOF;
902     }
903     else if (tok->start != NULL) {
904         Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
905         size_t size = strlen(newtok);
906         tok->lineno++;
907         if (!tok_reserve_buf(tok, size + 1)) {
908             PyMem_Free(tok->buf);
909             tok->buf = NULL;
910             PyMem_Free(newtok);
911             return 0;
912         }
913         memcpy(tok->cur, newtok, size + 1);
914         PyMem_Free(newtok);
915         tok->inp += size;
916         tok->multi_line_start = tok->buf + cur_multi_line_start;
917     }
918     else {
919         tok->lineno++;
920         PyMem_Free(tok->buf);
921         tok->buf = newtok;
922         tok->cur = tok->buf;
923         tok->line_start = tok->buf;
924         tok->inp = strchr(tok->buf, '\0');
925         tok->end = tok->inp + 1;
926     }
927     if (tok->done != E_OK) {
928         if (tok->prompt != NULL) {
929             PySys_WriteStderr("\n");
930         }
931         return 0;
932     }
933     return 1;
934 }
935 
936 static int
tok_underflow_file(struct tok_state * tok)937 tok_underflow_file(struct tok_state *tok) {
938     if (tok->start == NULL) {
939         tok->cur = tok->inp = tok->buf;
940     }
941     if (tok->decoding_state == STATE_INIT) {
942         /* We have not yet determined the encoding.
943            If an encoding is found, use the file-pointer
944            reader functions from now on. */
945         if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
946             error_ret(tok);
947             return 0;
948         }
949         assert(tok->decoding_state != STATE_INIT);
950     }
951     /* Read until '\n' or EOF */
952     if (tok->decoding_readline != NULL) {
953         /* We already have a codec associated with this input. */
954         if (!tok_readline_recode(tok)) {
955             return 0;
956         }
957     }
958     else {
959         /* We want a 'raw' read. */
960         if (!tok_readline_raw(tok)) {
961             return 0;
962         }
963     }
964     if (tok->inp == tok->cur) {
965         tok->done = E_EOF;
966         return 0;
967     }
968     if (tok->inp[-1] != '\n') {
969         /* Last line does not end in \n, fake one */
970         *tok->inp++ = '\n';
971         *tok->inp = '\0';
972     }
973 
974     tok->lineno++;
975     if (tok->decoding_state != STATE_NORMAL) {
976         if (tok->lineno > 2) {
977             tok->decoding_state = STATE_NORMAL;
978         }
979         else if (!check_coding_spec(tok->cur, strlen(tok->cur),
980                                     tok, fp_setreadl))
981         {
982             return 0;
983         }
984     }
985     /* The default encoding is UTF-8, so make sure we don't have any
986        non-UTF-8 sequences in it. */
987     if (!tok->encoding
988         && (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
989         if (!ensure_utf8(tok->cur, tok)) {
990             error_ret(tok);
991             return 0;
992         }
993     }
994     assert(tok->done == E_OK);
995     return tok->done == E_OK;
996 }
997 
998 #if defined(Py_DEBUG)
/* Debug helper: write SIZE bytes starting at S to F as a double-quoted,
   C-style escaped string.

   f    -- stream to write to.
   s    -- bytes to dump; may be NULL, in which case the bare word NULL is
           written (without quotes) and SIZE is ignored.
   size -- number of bytes to dump; the data need not be NUL-terminated.

   Newline, carriage return, tab, form feed, both quote characters and the
   backslash get their usual backslash escapes.  Other bytes in the printable
   ASCII range 0x20..0x7e are written as-is; everything else, including DEL
   (0x7f), is written as \xNN so the output never contains raw control
   characters.  Escaping the backslash itself keeps the dump unambiguous:
   a literal backslash followed by 'n' no longer looks like a newline. */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        /* Work on an unsigned value so bytes >= 0x80 print as two hex
           digits instead of a sign-extended number. */
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            case '\\': fputs("\\\\", f); break;
            default:
                if (0x20 <= c && c < 0x7f) {
                    putc(c, f);
                }
                else {
                    fprintf(f, "\\x%02x", c);
                }
        }
    }
    putc('"', f);
}
1025 #endif
1026 
1027 /* Get next char, updating state; error code goes into tok->done */
1028 
1029 static int
tok_nextc(struct tok_state * tok)1030 tok_nextc(struct tok_state *tok)
1031 {
1032     int rc;
1033     for (;;) {
1034         if (tok->cur != tok->inp) {
1035             return Py_CHARMASK(*tok->cur++); /* Fast path */
1036         }
1037         if (tok->done != E_OK)
1038             return EOF;
1039         if (tok->fp == NULL) {
1040             rc = tok_underflow_string(tok);
1041         }
1042         else if (tok->prompt != NULL) {
1043             rc = tok_underflow_interactive(tok);
1044         }
1045         else {
1046             rc = tok_underflow_file(tok);
1047         }
1048 #if defined(Py_DEBUG)
1049         if (Py_DebugFlag) {
1050             fprintf(stderr, "line[%d] = ", tok->lineno);
1051             print_escape(stderr, tok->cur, tok->inp - tok->cur);
1052             fprintf(stderr, "  tok->done = %d\n", tok->done);
1053         }
1054 #endif
1055         if (!rc) {
1056             tok->cur = tok->inp;
1057             return EOF;
1058         }
1059         tok->line_start = tok->cur;
1060     }
1061     Py_UNREACHABLE();
1062 }
1063 
1064 /* Back-up one character */
1065 
1066 static void
tok_backup(struct tok_state * tok,int c)1067 tok_backup(struct tok_state *tok, int c)
1068 {
1069     if (c != EOF) {
1070         if (--tok->cur < tok->buf) {
1071             Py_FatalError("tokenizer beginning of buffer");
1072         }
1073         if ((int)(unsigned char)*tok->cur != c) {
1074             Py_FatalError("tok_backup: wrong character");
1075         }
1076     }
1077 }
1078 
1079 static int
_syntaxerror_range(struct tok_state * tok,const char * format,int col_offset,int end_col_offset,va_list vargs)1080 _syntaxerror_range(struct tok_state *tok, const char *format,
1081                    int col_offset, int end_col_offset,
1082                    va_list vargs)
1083 {
1084     PyObject *errmsg, *errtext, *args;
1085     errmsg = PyUnicode_FromFormatV(format, vargs);
1086     if (!errmsg) {
1087         goto error;
1088     }
1089 
1090     errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1091                                    "replace");
1092     if (!errtext) {
1093         goto error;
1094     }
1095 
1096     if (col_offset == -1) {
1097         col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1098     }
1099     if (end_col_offset == -1) {
1100         end_col_offset = col_offset;
1101     }
1102 
1103     Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1104     if (line_len != tok->cur - tok->line_start) {
1105         Py_DECREF(errtext);
1106         errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1107                                        "replace");
1108     }
1109     if (!errtext) {
1110         goto error;
1111     }
1112 
1113     args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1114                          col_offset, errtext, tok->lineno, end_col_offset);
1115     if (args) {
1116         PyErr_SetObject(PyExc_SyntaxError, args);
1117         Py_DECREF(args);
1118     }
1119 
1120 error:
1121     Py_XDECREF(errmsg);
1122     tok->done = E_ERROR;
1123     return ERRORTOKEN;
1124 }
1125 
/* Report a syntax error at the tokenizer's current position.

   FORMAT and the variadic arguments are forwarded to _syntaxerror_range()
   with both column offsets set to -1, which makes it derive the column
   from tok->cur.  Returns the value of _syntaxerror_range(). */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    va_list vargs;
    /* Builds without HAVE_STDARG_PROTOTYPES use the one-argument form of
       va_start(). */
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
    va_end(vargs);
    return ret;
}
1139 
/* Report a syntax error covering an explicit column range.

   COL_OFFSET and END_COL_OFFSET are passed through unchanged to
   _syntaxerror_range() together with FORMAT and the variadic arguments.
   Returns the value of _syntaxerror_range(). */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    va_list vargs;
    /* Builds without HAVE_STDARG_PROTOTYPES use the one-argument form of
       va_start(). */
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
    va_end(vargs);
    return ret;
}
1155 
1156 
1157 
/* Flag an indentation error: store E_TABSPACE in tok->done, skip the rest
   of the buffered input by moving tok->cur to tok->inp, and return
   ERRORTOKEN for the caller to hand back. */
static int
indenterror(struct tok_state *tok)
{
    tok->done = E_TABSPACE;
    tok->cur = tok->inp;
    return ERRORTOKEN;
}
1165 
1166 static int
parser_warn(struct tok_state * tok,const char * format,...)1167 parser_warn(struct tok_state *tok, const char *format, ...)
1168 {
1169     PyObject *errmsg;
1170     va_list vargs;
1171 #ifdef HAVE_STDARG_PROTOTYPES
1172     va_start(vargs, format);
1173 #else
1174     va_start(vargs);
1175 #endif
1176     errmsg = PyUnicode_FromFormatV(format, vargs);
1177     va_end(vargs);
1178     if (!errmsg) {
1179         goto error;
1180     }
1181 
1182     if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
1183                                  tok->lineno, NULL, NULL) < 0) {
1184         if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
1185             /* Replace the DeprecationWarning exception with a SyntaxError
1186                to get a more accurate error report */
1187             PyErr_Clear();
1188             syntaxerror(tok, "%U", errmsg);
1189         }
1190         goto error;
1191     }
1192     Py_DECREF(errmsg);
1193     return 0;
1194 
1195 error:
1196     Py_XDECREF(errmsg);
1197     tok->done = E_ERROR;
1198     return -1;
1199 }
1200 
/* Peek at the upcoming input without consuming it.

   Returns 1 when the next characters are exactly TEST and the character
   after them cannot continue an identifier, 0 otherwise.  Every character
   read is pushed back before returning, so tok->cur ends where it
   started. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *pos = test;
    int matched;
    int c = tok_nextc(tok);

    /* Consume input for as long as it agrees with TEST. */
    while (*pos != 0 && c == *pos) {
        pos++;
        c = tok_nextc(tok);
    }

    /* c is now the first character that was not matched against TEST. */
    if (*pos == 0) {
        matched = !is_potential_identifier_char(c);
    }
    else {
        matched = 0;
    }

    /* Undo the reads in reverse order. */
    tok_backup(tok, c);
    while (pos != test) {
        pos--;
        tok_backup(tok, *pos);
    }
    return matched;
}
1223 
/* Check the character C that immediately follows a numeric literal.
   KIND names the literal type for the message ("decimal", "octal", ...).

   A deprecation warning is emitted only if the literal is immediately
   followed by one of the keywords that can follow a number in valid code:
   "and", "else", "for", "if", "in", "is" and "or".  That allows existing
   valid code to be deprecated gradually without a warning preceding the
   error in most cases of invalid numeric literals (which would be confusing
   and break existing tests).  If the literal is immediately followed by any
   other keyword or identifier, a syntax error with a slightly better
   message than plain "invalid syntax" is raised.

   Returns 1 when tokenizing may continue and 0 when an error has been
   recorded. */
static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
{
    int keyword_follows = 0;

    switch (c) {
    case 'a':
        keyword_follows = lookahead(tok, "nd");
        break;
    case 'e':
        keyword_follows = lookahead(tok, "lse");
        break;
    case 'f':
        keyword_follows = lookahead(tok, "or");
        break;
    case 'i': {
        /* "if", "in" and "is": only the second letter is examined. */
        int second = tok_nextc(tok);
        keyword_follows = (second == 'f' || second == 'n' || second == 's');
        tok_backup(tok, second);
        break;
    }
    case 'o':
        keyword_follows = lookahead(tok, "r");
        break;
    default:
        break;
    }

    if (keyword_follows) {
        /* Push C back while warning, then read it again afterwards. */
        tok_backup(tok, c);
        if (parser_warn(tok, "invalid %s literal", kind)) {
            return 0;
        }
        tok_nextc(tok);
        return 1;
    }

    /* In future releases, only error will remain. */
    if (is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
1272 
1273 /* Verify that the identifier follows PEP 3131.
1274    All identifier strings are guaranteed to be "ready" unicode objects.
1275  */
1276 static int
verify_identifier(struct tok_state * tok)1277 verify_identifier(struct tok_state *tok)
1278 {
1279     PyObject *s;
1280     if (tok->decoding_erred)
1281         return 0;
1282     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1283     if (s == NULL) {
1284         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1285             tok->done = E_DECODE;
1286         }
1287         else {
1288             tok->done = E_ERROR;
1289         }
1290         return 0;
1291     }
1292     Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1293     if (invalid < 0) {
1294         Py_DECREF(s);
1295         tok->done = E_ERROR;
1296         return 0;
1297     }
1298     assert(PyUnicode_GET_LENGTH(s) > 0);
1299     if (invalid < PyUnicode_GET_LENGTH(s)) {
1300         Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1301         if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1302             /* Determine the offset in UTF-8 encoded input */
1303             Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1304             if (s != NULL) {
1305                 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1306             }
1307             if (s == NULL) {
1308                 tok->done = E_ERROR;
1309                 return 0;
1310             }
1311             tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1312         }
1313         Py_DECREF(s);
1314         // PyUnicode_FromFormatV() does not support %X
1315         char hex[9];
1316         (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
1317         if (Py_UNICODE_ISPRINTABLE(ch)) {
1318             syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1319         }
1320         else {
1321             syntaxerror(tok, "invalid non-printable character U+%s", hex);
1322         }
1323         return 0;
1324     }
1325     Py_DECREF(s);
1326     return 1;
1327 }
1328 
/* Consume the remainder of a run of decimal digits in which groups of
   digits may be separated by single underscores.

   Returns the first character after the run; that character has been read
   and is not pushed back.  If an underscore is not followed by a digit, the
   offending character is pushed back, a syntax error is raised and 0 is
   returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int c = tok_nextc(tok);
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }

        /* An underscore must be followed by another digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1350 
1351 /* Get next token, after space stripping etc. */
1352 
1353 static int
tok_get(struct tok_state * tok,const char ** p_start,const char ** p_end)1354 tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
1355 {
1356     int c;
1357     int blankline, nonascii;
1358 
1359     *p_start = *p_end = NULL;
1360   nextline:
1361     tok->start = NULL;
1362     blankline = 0;
1363 
1364     /* Get indentation level */
1365     if (tok->atbol) {
1366         int col = 0;
1367         int altcol = 0;
1368         tok->atbol = 0;
1369         for (;;) {
1370             c = tok_nextc(tok);
1371             if (c == ' ') {
1372                 col++, altcol++;
1373             }
1374             else if (c == '\t') {
1375                 col = (col / tok->tabsize + 1) * tok->tabsize;
1376                 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1377             }
1378             else if (c == '\014')  {/* Control-L (formfeed) */
1379                 col = altcol = 0; /* For Emacs users */
1380             }
1381             else {
1382                 break;
1383             }
1384         }
1385         tok_backup(tok, c);
1386         if (c == '#' || c == '\n' || c == '\\') {
1387             /* Lines with only whitespace and/or comments
1388                and/or a line continuation character
1389                shouldn't affect the indentation and are
1390                not passed to the parser as NEWLINE tokens,
1391                except *totally* empty lines in interactive
1392                mode, which signal the end of a command group. */
1393             if (col == 0 && c == '\n' && tok->prompt != NULL) {
1394                 blankline = 0; /* Let it through */
1395             }
1396             else if (tok->prompt != NULL && tok->lineno == 1) {
1397                 /* In interactive mode, if the first line contains
1398                    only spaces and/or a comment, let it through. */
1399                 blankline = 0;
1400                 col = altcol = 0;
1401             }
1402             else {
1403                 blankline = 1; /* Ignore completely */
1404             }
1405             /* We can't jump back right here since we still
1406                may need to skip to the end of a comment */
1407         }
1408         if (!blankline && tok->level == 0) {
1409             if (col == tok->indstack[tok->indent]) {
1410                 /* No change */
1411                 if (altcol != tok->altindstack[tok->indent]) {
1412                     return indenterror(tok);
1413                 }
1414             }
1415             else if (col > tok->indstack[tok->indent]) {
1416                 /* Indent -- always one */
1417                 if (tok->indent+1 >= MAXINDENT) {
1418                     tok->done = E_TOODEEP;
1419                     tok->cur = tok->inp;
1420                     return ERRORTOKEN;
1421                 }
1422                 if (altcol <= tok->altindstack[tok->indent]) {
1423                     return indenterror(tok);
1424                 }
1425                 tok->pendin++;
1426                 tok->indstack[++tok->indent] = col;
1427                 tok->altindstack[tok->indent] = altcol;
1428             }
1429             else /* col < tok->indstack[tok->indent] */ {
1430                 /* Dedent -- any number, must be consistent */
1431                 while (tok->indent > 0 &&
1432                     col < tok->indstack[tok->indent]) {
1433                     tok->pendin--;
1434                     tok->indent--;
1435                 }
1436                 if (col != tok->indstack[tok->indent]) {
1437                     tok->done = E_DEDENT;
1438                     tok->cur = tok->inp;
1439                     return ERRORTOKEN;
1440                 }
1441                 if (altcol != tok->altindstack[tok->indent]) {
1442                     return indenterror(tok);
1443                 }
1444             }
1445         }
1446     }
1447 
1448     tok->start = tok->cur;
1449 
1450     /* Return pending indents/dedents */
1451     if (tok->pendin != 0) {
1452         if (tok->pendin < 0) {
1453             tok->pendin++;
1454             return DEDENT;
1455         }
1456         else {
1457             tok->pendin--;
1458             return INDENT;
1459         }
1460     }
1461 
1462     /* Peek ahead at the next character */
1463     c = tok_nextc(tok);
1464     tok_backup(tok, c);
1465     /* Check if we are closing an async function */
1466     if (tok->async_def
1467         && !blankline
1468         /* Due to some implementation artifacts of type comments,
1469          * a TYPE_COMMENT at the start of a function won't set an
1470          * indentation level and it will produce a NEWLINE after it.
1471          * To avoid spuriously ending an async function due to this,
1472          * wait until we have some non-newline char in front of us. */
1473         && c != '\n'
1474         && tok->level == 0
1475         /* There was a NEWLINE after ASYNC DEF,
1476            so we're past the signature. */
1477         && tok->async_def_nl
1478         /* Current indentation level is less than where
1479            the async function was defined */
1480         && tok->async_def_indent >= tok->indent)
1481     {
1482         tok->async_def = 0;
1483         tok->async_def_indent = 0;
1484         tok->async_def_nl = 0;
1485     }
1486 
1487  again:
1488     tok->start = NULL;
1489     /* Skip spaces */
1490     do {
1491         c = tok_nextc(tok);
1492     } while (c == ' ' || c == '\t' || c == '\014');
1493 
1494     /* Set start of current token */
1495     tok->start = tok->cur - 1;
1496 
1497     /* Skip comment, unless it's a type comment */
1498     if (c == '#') {
1499         const char *prefix, *p, *type_start;
1500 
1501         while (c != EOF && c != '\n') {
1502             c = tok_nextc(tok);
1503         }
1504 
1505         if (tok->type_comments) {
1506             p = tok->start;
1507             prefix = type_comment_prefix;
1508             while (*prefix && p < tok->cur) {
1509                 if (*prefix == ' ') {
1510                     while (*p == ' ' || *p == '\t') {
1511                         p++;
1512                     }
1513                 } else if (*prefix == *p) {
1514                     p++;
1515                 } else {
1516                     break;
1517                 }
1518 
1519                 prefix++;
1520             }
1521 
1522             /* This is a type comment if we matched all of type_comment_prefix. */
1523             if (!*prefix) {
1524                 int is_type_ignore = 1;
1525                 const char *ignore_end = p + 6;
1526                 tok_backup(tok, c);  /* don't eat the newline or EOF */
1527 
1528                 type_start = p;
1529 
1530                 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1531                  * or anything ASCII and non-alphanumeric. */
1532                 is_type_ignore = (
1533                     tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1534                     && !(tok->cur > ignore_end
1535                          && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1536 
1537                 if (is_type_ignore) {
1538                     *p_start = ignore_end;
1539                     *p_end = tok->cur;
1540 
1541                     /* If this type ignore is the only thing on the line, consume the newline also. */
1542                     if (blankline) {
1543                         tok_nextc(tok);
1544                         tok->atbol = 1;
1545                     }
1546                     return TYPE_IGNORE;
1547                 } else {
1548                     *p_start = type_start;  /* after type_comment_prefix */
1549                     *p_end = tok->cur;
1550                     return TYPE_COMMENT;
1551                 }
1552             }
1553         }
1554     }
1555 
1556     if (tok->done == E_INTERACT_STOP) {
1557         return ENDMARKER;
1558     }
1559 
1560     /* Check for EOF and errors now */
1561     if (c == EOF) {
1562         if (tok->level) {
1563             return ERRORTOKEN;
1564         }
1565         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1566     }
1567 
1568     /* Identifier (most frequent token!) */
1569     nonascii = 0;
1570     if (is_potential_identifier_start(c)) {
1571         /* Process the various legal combinations of b"", r"", u"", and f"". */
1572         int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1573         while (1) {
1574             if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1575                 saw_b = 1;
1576             /* Since this is a backwards compatibility support literal we don't
1577                want to support it in arbitrary order like byte literals. */
1578             else if (!(saw_b || saw_u || saw_r || saw_f)
1579                      && (c == 'u'|| c == 'U')) {
1580                 saw_u = 1;
1581             }
1582             /* ur"" and ru"" are not supported */
1583             else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1584                 saw_r = 1;
1585             }
1586             else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1587                 saw_f = 1;
1588             }
1589             else {
1590                 break;
1591             }
1592             c = tok_nextc(tok);
1593             if (c == '"' || c == '\'') {
1594                 goto letter_quote;
1595             }
1596         }
1597         while (is_potential_identifier_char(c)) {
1598             if (c >= 128) {
1599                 nonascii = 1;
1600             }
1601             c = tok_nextc(tok);
1602         }
1603         tok_backup(tok, c);
1604         if (nonascii && !verify_identifier(tok)) {
1605             return ERRORTOKEN;
1606         }
1607 
1608         *p_start = tok->start;
1609         *p_end = tok->cur;
1610 
1611         /* async/await parsing block. */
1612         if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1613             /* May be an 'async' or 'await' token.  For Python 3.7 or
1614                later we recognize them unconditionally.  For Python
1615                3.5 or 3.6 we recognize 'async' in front of 'def', and
1616                either one inside of 'async def'.  (Technically we
1617                shouldn't recognize these at all for 3.4 or earlier,
1618                but there's no *valid* Python 3.4 code that would be
1619                rejected, and async functions will be rejected in a
1620                later phase.) */
1621             if (!tok->async_hacks || tok->async_def) {
1622                 /* Always recognize the keywords. */
1623                 if (memcmp(tok->start, "async", 5) == 0) {
1624                     return ASYNC;
1625                 }
1626                 if (memcmp(tok->start, "await", 5) == 0) {
1627                     return AWAIT;
1628                 }
1629             }
1630             else if (memcmp(tok->start, "async", 5) == 0) {
1631                 /* The current token is 'async'.
1632                    Look ahead one token to see if that is 'def'. */
1633 
1634                 struct tok_state ahead_tok;
1635                 const char *ahead_tok_start = NULL;
1636                 const char *ahead_tok_end = NULL;
1637                 int ahead_tok_kind;
1638 
1639                 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1640                 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1641                                          &ahead_tok_end);
1642 
1643                 if (ahead_tok_kind == NAME
1644                     && ahead_tok.cur - ahead_tok.start == 3
1645                     && memcmp(ahead_tok.start, "def", 3) == 0)
1646                 {
1647                     /* The next token is going to be 'def', so instead of
1648                        returning a plain NAME token, return ASYNC. */
1649                     tok->async_def_indent = tok->indent;
1650                     tok->async_def = 1;
1651                     return ASYNC;
1652                 }
1653             }
1654         }
1655 
1656         return NAME;
1657     }
1658 
1659     /* Newline */
1660     if (c == '\n') {
1661         tok->atbol = 1;
1662         if (blankline || tok->level > 0) {
1663             goto nextline;
1664         }
1665         *p_start = tok->start;
1666         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1667         tok->cont_line = 0;
1668         if (tok->async_def) {
1669             /* We're somewhere inside an 'async def' function, and
1670                we've encountered a NEWLINE after its signature. */
1671             tok->async_def_nl = 1;
1672         }
1673         return NEWLINE;
1674     }
1675 
1676     /* Period or number starting with period? */
1677     if (c == '.') {
1678         c = tok_nextc(tok);
1679         if (isdigit(c)) {
1680             goto fraction;
1681         } else if (c == '.') {
1682             c = tok_nextc(tok);
1683             if (c == '.') {
1684                 *p_start = tok->start;
1685                 *p_end = tok->cur;
1686                 return ELLIPSIS;
1687             }
1688             else {
1689                 tok_backup(tok, c);
1690             }
1691             tok_backup(tok, '.');
1692         }
1693         else {
1694             tok_backup(tok, c);
1695         }
1696         *p_start = tok->start;
1697         *p_end = tok->cur;
1698         return DOT;
1699     }
1700 
1701     /* Number */
1702     if (isdigit(c)) {
1703         if (c == '0') {
1704             /* Hex, octal or binary -- maybe. */
1705             c = tok_nextc(tok);
1706             if (c == 'x' || c == 'X') {
1707                 /* Hex */
1708                 c = tok_nextc(tok);
1709                 do {
1710                     if (c == '_') {
1711                         c = tok_nextc(tok);
1712                     }
1713                     if (!isxdigit(c)) {
1714                         tok_backup(tok, c);
1715                         return syntaxerror(tok, "invalid hexadecimal literal");
1716                     }
1717                     do {
1718                         c = tok_nextc(tok);
1719                     } while (isxdigit(c));
1720                 } while (c == '_');
1721                 if (!verify_end_of_number(tok, c, "hexadecimal")) {
1722                     return ERRORTOKEN;
1723                 }
1724             }
1725             else if (c == 'o' || c == 'O') {
1726                 /* Octal */
1727                 c = tok_nextc(tok);
1728                 do {
1729                     if (c == '_') {
1730                         c = tok_nextc(tok);
1731                     }
1732                     if (c < '0' || c >= '8') {
1733                         if (isdigit(c)) {
1734                             return syntaxerror(tok,
1735                                     "invalid digit '%c' in octal literal", c);
1736                         }
1737                         else {
1738                             tok_backup(tok, c);
1739                             return syntaxerror(tok, "invalid octal literal");
1740                         }
1741                     }
1742                     do {
1743                         c = tok_nextc(tok);
1744                     } while ('0' <= c && c < '8');
1745                 } while (c == '_');
1746                 if (isdigit(c)) {
1747                     return syntaxerror(tok,
1748                             "invalid digit '%c' in octal literal", c);
1749                 }
1750                 if (!verify_end_of_number(tok, c, "octal")) {
1751                     return ERRORTOKEN;
1752                 }
1753             }
1754             else if (c == 'b' || c == 'B') {
1755                 /* Binary */
1756                 c = tok_nextc(tok);
1757                 do {
1758                     if (c == '_') {
1759                         c = tok_nextc(tok);
1760                     }
1761                     if (c != '0' && c != '1') {
1762                         if (isdigit(c)) {
1763                             return syntaxerror(tok,
1764                                     "invalid digit '%c' in binary literal", c);
1765                         }
1766                         else {
1767                             tok_backup(tok, c);
1768                             return syntaxerror(tok, "invalid binary literal");
1769                         }
1770                     }
1771                     do {
1772                         c = tok_nextc(tok);
1773                     } while (c == '0' || c == '1');
1774                 } while (c == '_');
1775                 if (isdigit(c)) {
1776                     return syntaxerror(tok,
1777                             "invalid digit '%c' in binary literal", c);
1778                 }
1779                 if (!verify_end_of_number(tok, c, "binary")) {
1780                     return ERRORTOKEN;
1781                 }
1782             }
1783             else {
1784                 int nonzero = 0;
1785                 /* maybe old-style octal; c is first char of it */
1786                 /* in any case, allow '0' as a literal */
1787                 while (1) {
1788                     if (c == '_') {
1789                         c = tok_nextc(tok);
1790                         if (!isdigit(c)) {
1791                             tok_backup(tok, c);
1792                             return syntaxerror(tok, "invalid decimal literal");
1793                         }
1794                     }
1795                     if (c != '0') {
1796                         break;
1797                     }
1798                     c = tok_nextc(tok);
1799                 }
1800                 char* zeros_end = tok->cur;
1801                 if (isdigit(c)) {
1802                     nonzero = 1;
1803                     c = tok_decimal_tail(tok);
1804                     if (c == 0) {
1805                         return ERRORTOKEN;
1806                     }
1807                 }
1808                 if (c == '.') {
1809                     c = tok_nextc(tok);
1810                     goto fraction;
1811                 }
1812                 else if (c == 'e' || c == 'E') {
1813                     goto exponent;
1814                 }
1815                 else if (c == 'j' || c == 'J') {
1816                     goto imaginary;
1817                 }
1818                 else if (nonzero) {
1819                     /* Old-style octal: now disallowed. */
1820                     tok_backup(tok, c);
1821                     return syntaxerror_known_range(
1822                             tok, (int)(tok->start + 1 - tok->line_start),
1823                             (int)(zeros_end - tok->line_start),
1824                             "leading zeros in decimal integer "
1825                             "literals are not permitted; "
1826                             "use an 0o prefix for octal integers");
1827                 }
1828                 if (!verify_end_of_number(tok, c, "decimal")) {
1829                     return ERRORTOKEN;
1830                 }
1831             }
1832         }
1833         else {
1834             /* Decimal */
1835             c = tok_decimal_tail(tok);
1836             if (c == 0) {
1837                 return ERRORTOKEN;
1838             }
1839             {
1840                 /* Accept floating point numbers. */
1841                 if (c == '.') {
1842                     c = tok_nextc(tok);
1843         fraction:
1844                     /* Fraction */
1845                     if (isdigit(c)) {
1846                         c = tok_decimal_tail(tok);
1847                         if (c == 0) {
1848                             return ERRORTOKEN;
1849                         }
1850                     }
1851                 }
1852                 if (c == 'e' || c == 'E') {
1853                     int e;
1854                   exponent:
1855                     e = c;
1856                     /* Exponent part */
1857                     c = tok_nextc(tok);
1858                     if (c == '+' || c == '-') {
1859                         c = tok_nextc(tok);
1860                         if (!isdigit(c)) {
1861                             tok_backup(tok, c);
1862                             return syntaxerror(tok, "invalid decimal literal");
1863                         }
1864                     } else if (!isdigit(c)) {
1865                         tok_backup(tok, c);
1866                         if (!verify_end_of_number(tok, e, "decimal")) {
1867                             return ERRORTOKEN;
1868                         }
1869                         tok_backup(tok, e);
1870                         *p_start = tok->start;
1871                         *p_end = tok->cur;
1872                         return NUMBER;
1873                     }
1874                     c = tok_decimal_tail(tok);
1875                     if (c == 0) {
1876                         return ERRORTOKEN;
1877                     }
1878                 }
1879                 if (c == 'j' || c == 'J') {
1880                     /* Imaginary part */
1881         imaginary:
1882                     c = tok_nextc(tok);
1883                     if (!verify_end_of_number(tok, c, "imaginary")) {
1884                         return ERRORTOKEN;
1885                     }
1886                 }
1887                 else if (!verify_end_of_number(tok, c, "decimal")) {
1888                     return ERRORTOKEN;
1889                 }
1890             }
1891         }
1892         tok_backup(tok, c);
1893         *p_start = tok->start;
1894         *p_end = tok->cur;
1895         return NUMBER;
1896     }
1897 
1898   letter_quote:
1899     /* String */
1900     if (c == '\'' || c == '"') {
1901         int quote = c;
1902         int quote_size = 1;             /* 1 or 3 */
1903         int end_quote_size = 0;
1904 
1905         /* Nodes of type STRING, especially multi line strings
1906            must be handled differently in order to get both
1907            the starting line number and the column offset right.
1908            (cf. issue 16806) */
1909         tok->first_lineno = tok->lineno;
1910         tok->multi_line_start = tok->line_start;
1911 
1912         /* Find the quote size and start of string */
1913         c = tok_nextc(tok);
1914         if (c == quote) {
1915             c = tok_nextc(tok);
1916             if (c == quote) {
1917                 quote_size = 3;
1918             }
1919             else {
1920                 end_quote_size = 1;     /* empty string found */
1921             }
1922         }
1923         if (c != quote) {
1924             tok_backup(tok, c);
1925         }
1926 
1927         /* Get rest of string */
1928         while (end_quote_size != quote_size) {
1929             c = tok_nextc(tok);
1930             if (c == EOF || (quote_size == 1 && c == '\n')) {
1931                 assert(tok->multi_line_start != NULL);
1932                 // shift the tok_state's location into
1933                 // the start of string, and report the error
1934                 // from the initial quote character
1935                 tok->cur = (char *)tok->start;
1936                 tok->cur++;
1937                 tok->line_start = tok->multi_line_start;
1938                 int start = tok->lineno;
1939                 tok->lineno = tok->first_lineno;
1940 
1941                 if (quote_size == 3) {
1942                     return syntaxerror(tok,
1943                                        "unterminated triple-quoted string literal"
1944                                        " (detected at line %d)", start);
1945                 }
1946                 else {
1947                     return syntaxerror(tok,
1948                                        "unterminated string literal (detected at"
1949                                        " line %d)", start);
1950                 }
1951             }
1952             if (c == quote) {
1953                 end_quote_size += 1;
1954             }
1955             else {
1956                 end_quote_size = 0;
1957                 if (c == '\\') {
1958                     tok_nextc(tok);  /* skip escaped char */
1959                 }
1960             }
1961         }
1962 
1963         *p_start = tok->start;
1964         *p_end = tok->cur;
1965         return STRING;
1966     }
1967 
1968     /* Line continuation */
1969     if (c == '\\') {
1970         c = tok_nextc(tok);
1971         if (c != '\n') {
1972             tok->done = E_LINECONT;
1973             return ERRORTOKEN;
1974         }
1975         c = tok_nextc(tok);
1976         if (c == EOF) {
1977             tok->done = E_EOF;
1978             tok->cur = tok->inp;
1979             return ERRORTOKEN;
1980         } else {
1981             tok_backup(tok, c);
1982         }
1983         tok->cont_line = 1;
1984         goto again; /* Read next line */
1985     }
1986 
1987     /* Check for two-character token */
1988     {
1989         int c2 = tok_nextc(tok);
1990         int token = PyToken_TwoChars(c, c2);
1991         if (token != OP) {
1992             int c3 = tok_nextc(tok);
1993             int token3 = PyToken_ThreeChars(c, c2, c3);
1994             if (token3 != OP) {
1995                 token = token3;
1996             }
1997             else {
1998                 tok_backup(tok, c3);
1999             }
2000             *p_start = tok->start;
2001             *p_end = tok->cur;
2002             return token;
2003         }
2004         tok_backup(tok, c2);
2005     }
2006 
2007     /* Keep track of parentheses nesting level */
2008     switch (c) {
2009     case '(':
2010     case '[':
2011     case '{':
2012         if (tok->level >= MAXLEVEL) {
2013             return syntaxerror(tok, "too many nested parentheses");
2014         }
2015         tok->parenstack[tok->level] = c;
2016         tok->parenlinenostack[tok->level] = tok->lineno;
2017         tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2018         tok->level++;
2019         break;
2020     case ')':
2021     case ']':
2022     case '}':
2023         if (!tok->level) {
2024             return syntaxerror(tok, "unmatched '%c'", c);
2025         }
2026         tok->level--;
2027         int opening = tok->parenstack[tok->level];
2028         if (!((opening == '(' && c == ')') ||
2029               (opening == '[' && c == ']') ||
2030               (opening == '{' && c == '}')))
2031         {
2032             if (tok->parenlinenostack[tok->level] != tok->lineno) {
2033                 return syntaxerror(tok,
2034                         "closing parenthesis '%c' does not match "
2035                         "opening parenthesis '%c' on line %d",
2036                         c, opening, tok->parenlinenostack[tok->level]);
2037             }
2038             else {
2039                 return syntaxerror(tok,
2040                         "closing parenthesis '%c' does not match "
2041                         "opening parenthesis '%c'",
2042                         c, opening);
2043             }
2044         }
2045         break;
2046     }
2047 
2048     if (!Py_UNICODE_ISPRINTABLE(c)) {
2049         char hex[9];
2050         (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
2051         return syntaxerror(tok, "invalid non-printable character U+%s", hex);
2052     }
2053 
2054     /* Punctuation character */
2055     *p_start = tok->start;
2056     *p_end = tok->cur;
2057     return PyToken_OneChar(c);
2058 }
2059 
2060 int
_PyTokenizer_Get(struct tok_state * tok,const char ** p_start,const char ** p_end)2061 _PyTokenizer_Get(struct tok_state *tok,
2062                  const char **p_start, const char **p_end)
2063 {
2064     int result = tok_get(tok, p_start, p_end);
2065     if (tok->decoding_erred) {
2066         result = ERRORTOKEN;
2067         tok->done = E_DECODE;
2068     }
2069     return result;
2070 }
2071 
2072 /* Get the encoding of a Python file. Check for the coding cookie and check if
2073    the file starts with a BOM.
2074 
2075    _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2076    encoding in the first or second line of the file (in which case the encoding
2077    should be assumed to be UTF-8).
2078 
2079    The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
2080    by the caller. */
2081 
2082 char *
_PyTokenizer_FindEncodingFilename(int fd,PyObject * filename)2083 _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2084 {
2085     struct tok_state *tok;
2086     FILE *fp;
2087     const char *p_start = NULL;
2088     const char *p_end = NULL;
2089     char *encoding = NULL;
2090 
2091     fd = _Py_dup(fd);
2092     if (fd < 0) {
2093         return NULL;
2094     }
2095 
2096     fp = fdopen(fd, "r");
2097     if (fp == NULL) {
2098         return NULL;
2099     }
2100     tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2101     if (tok == NULL) {
2102         fclose(fp);
2103         return NULL;
2104     }
2105     if (filename != NULL) {
2106         Py_INCREF(filename);
2107         tok->filename = filename;
2108     }
2109     else {
2110         tok->filename = PyUnicode_FromString("<string>");
2111         if (tok->filename == NULL) {
2112             fclose(fp);
2113             _PyTokenizer_Free(tok);
2114             return encoding;
2115         }
2116     }
2117     while (tok->lineno < 2 && tok->done == E_OK) {
2118         _PyTokenizer_Get(tok, &p_start, &p_end);
2119     }
2120     fclose(fp);
2121     if (tok->encoding) {
2122         encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2123         if (encoding) {
2124             strcpy(encoding, tok->encoding);
2125         }
2126     }
2127     _PyTokenizer_Free(tok);
2128     return encoding;
2129 }
2130 
2131 #ifdef Py_DEBUG
2132 void
tok_dump(int type,char * start,char * end)2133 tok_dump(int type, char *start, char *end)
2134 {
2135     fprintf(stderr, "%s", _PyParser_TokenNames[type]);
2136     if (type == NAME || type == NUMBER || type == STRING || type == OP)
2137         fprintf(stderr, "(%.*s)", (int)(end - start), start);
2138 }
2139 #endif  // Py_DEBUG
2140