1 
2 /* Tokenizer implementation */
3 
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 
7 #include <ctype.h>
8 #include <assert.h>
9 
10 #include "tokenizer.h"
11 #include "errcode.h"
12 
13 #include "unicodeobject.h"
14 #include "bytesobject.h"
15 #include "fileobject.h"
16 #include "codecs.h"
17 #include "abstract.h"
18 
19 /* Alternate tab spacing */
20 #define ALTTABSIZE 1
21 
22 #define is_potential_identifier_start(c) (\
23               (c >= 'a' && c <= 'z')\
24                || (c >= 'A' && c <= 'Z')\
25                || c == '_'\
26                || (c >= 128))
27 
28 #define is_potential_identifier_char(c) (\
29               (c >= 'a' && c <= 'z')\
30                || (c >= 'A' && c <= 'Z')\
31                || (c >= '0' && c <= '9')\
32                || c == '_'\
33                || (c >= 128))
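/* A minimal sketch (illustration only, not compiled; the helper name is
   hypothetical): how the two macros above are typically combined.  Any byte
   >= 128 is only *potentially* part of an identifier; real validation of
   non-ASCII names happens later in verify_identifier(). */
#if 0
static int
looks_like_identifier(const char *s)
{
    if (!is_potential_identifier_start(Py_CHARMASK(*s)))
        return 0;
    for (s++; *s != '\0'; s++) {
        if (!is_potential_identifier_char(Py_CHARMASK(*s)))
            return 0;
    }
    return 1;
}
#endif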
34 
35 extern char *PyOS_Readline(FILE *, FILE *, const char *);
36 /* Return malloc'ed string including trailing \n;
37    empty malloc'ed string for EOF;
38    NULL if interrupted */
39 
40 /* Don't ever change this -- it would break the portability of Python code */
41 #define TABSIZE 8
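/* Worked example: with TABSIZE == 8, a line starting with "\t  x" puts 'x'
   at column (0/8 + 1)*8 + 2 == 10, while the "alternate" column kept with
   ALTTABSIZE == 1 would be 3; comparing both stacks is how tok_get()
   detects indentation that mixes tabs and spaces inconsistently. */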
42 
43 /* Forward */
44 static struct tok_state *tok_new(void);
45 static int tok_nextc(struct tok_state *tok);
46 static void tok_backup(struct tok_state *tok, int c);
47 
48 
49 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
50    tokenizing. */
51 static const char* type_comment_prefix = "# type: ";
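/* For example, "# type: int" and "#type:List[int]" both match this prefix
   (each space in the prefix matches zero or more spaces or tabs), whereas
   "# types: int" does not. */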
52 
53 /* Create and initialize a new tok_state structure */
54 
55 static struct tok_state *
tok_new(void)
57 {
58     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
59                                             sizeof(struct tok_state));
60     if (tok == NULL)
61         return NULL;
62     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
63     tok->done = E_OK;
64     tok->fp = NULL;
65     tok->input = NULL;
66     tok->tabsize = TABSIZE;
67     tok->indent = 0;
68     tok->indstack[0] = 0;
69 
70     tok->atbol = 1;
71     tok->pendin = 0;
72     tok->prompt = tok->nextprompt = NULL;
73     tok->lineno = 0;
74     tok->level = 0;
75     tok->altindstack[0] = 0;
76     tok->decoding_state = STATE_INIT;
77     tok->decoding_erred = 0;
78     tok->read_coding_spec = 0;
79     tok->enc = NULL;
80     tok->encoding = NULL;
81     tok->cont_line = 0;
82     tok->filename = NULL;
83     tok->decoding_readline = NULL;
84     tok->decoding_buffer = NULL;
85     tok->type_comments = 0;
86 
87     tok->async_hacks = 0;
88     tok->async_def = 0;
89     tok->async_def_indent = 0;
90     tok->async_def_nl = 0;
91 
92     return tok;
93 }
94 
95 static char *
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
97 {
98     char* result = (char *)PyMem_MALLOC(len + 1);
99     if (!result) {
100         tok->done = E_NOMEM;
101         return NULL;
102     }
103     memcpy(result, s, len);
104     result[len] = '\0';
105     return result;
106 }
107 
108 static char *
error_ret(struct tok_state *tok) /* XXX */
110 {
111     tok->decoding_erred = 1;
112     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
113         PyMem_FREE(tok->buf);
114     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
115     tok->done = E_DECODE;
116     return NULL;                /* as if it were EOF */
117 }
118 
119 
120 static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
122 {
123     char buf[13];
124     int i;
125     for (i = 0; i < 12; i++) {
126         int c = s[i];
127         if (c == '\0')
128             break;
129         else if (c == '_')
130             buf[i] = '-';
131         else
132             buf[i] = tolower(c);
133     }
134     buf[i] = '\0';
135     if (strcmp(buf, "utf-8") == 0 ||
136         strncmp(buf, "utf-8-", 6) == 0)
137         return "utf-8";
138     else if (strcmp(buf, "latin-1") == 0 ||
139              strcmp(buf, "iso-8859-1") == 0 ||
140              strcmp(buf, "iso-latin-1") == 0 ||
141              strncmp(buf, "latin-1-", 8) == 0 ||
142              strncmp(buf, "iso-8859-1-", 11) == 0 ||
143              strncmp(buf, "iso-latin-1-", 12) == 0)
144         return "iso-8859-1";
145     else
146         return s;
147 }
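/* Examples: "UTF_8" normalizes to "utf-8", and "Latin-1" or "ISO_8859_1"
   normalize to "iso-8859-1"; any other name is returned unchanged. */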
148 
/* Check whether line S contains a coding spec.  If one is found, store a
   malloc'ed copy in *SPEC; otherwise leave *SPEC set to NULL.  Return 0 on
   allocation failure, 1 otherwise.  */
150 
151 static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
153 {
154     Py_ssize_t i;
155     *spec = NULL;
156     /* Coding spec must be in a comment, and that comment must be
157      * the only statement on the source code line. */
158     for (i = 0; i < size - 6; i++) {
159         if (s[i] == '#')
160             break;
161         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
162             return 1;
163     }
164     for (; i < size - 6; i++) { /* XXX inefficient search */
165         const char* t = s + i;
166         if (strncmp(t, "coding", 6) == 0) {
167             const char* begin = NULL;
168             t += 6;
169             if (t[0] != ':' && t[0] != '=')
170                 continue;
171             do {
172                 t++;
173             } while (t[0] == '\x20' || t[0] == '\t');
174 
175             begin = t;
176             while (Py_ISALNUM(t[0]) ||
177                    t[0] == '-' || t[0] == '_' || t[0] == '.')
178                 t++;
179 
180             if (begin < t) {
181                 char* r = new_string(begin, t - begin, tok);
182                 const char* q;
183                 if (!r)
184                     return 0;
185                 q = get_normal_name(r);
186                 if (r != q) {
187                     PyMem_FREE(r);
188                     r = new_string(q, strlen(q), tok);
189                     if (!r)
190                         return 0;
191                 }
192                 *spec = r;
193                 break;
194             }
195         }
196     }
197     return 1;
198 }
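/* Examples of comment-only lines that yield a spec: "# -*- coding: utf-8 -*-"
   and "# vim: set fileencoding=latin-1 :"; anything matching
   "coding[:=]\s*<name>" inside such a line qualifies (PEP 263). */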
199 
200 /* Check whether the line contains a coding spec. If it does,
201    invoke the set_readline function for the new encoding.
202    This function receives the tok_state and the new encoding.
203    Return 1 on success, 0 on failure.  */
204 
205 static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
208 {
209     char *cs;
210     int r = 1;
211 
212     if (tok->cont_line) {
213         /* It's a continuation line, so it can't be a coding spec. */
214         tok->read_coding_spec = 1;
215         return 1;
216     }
217     if (!get_coding_spec(line, &cs, size, tok))
218         return 0;
219     if (!cs) {
220         Py_ssize_t i;
221         for (i = 0; i < size; i++) {
222             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
223                 break;
224             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
225                 /* Stop checking coding spec after a line containing
226                  * anything except a comment. */
227                 tok->read_coding_spec = 1;
228                 break;
229             }
230         }
231         return 1;
232     }
233     tok->read_coding_spec = 1;
234     if (tok->encoding == NULL) {
235         assert(tok->decoding_state == STATE_RAW);
236         if (strcmp(cs, "utf-8") == 0) {
237             tok->encoding = cs;
238         } else {
239             r = set_readline(tok, cs);
240             if (r) {
241                 tok->encoding = cs;
242                 tok->decoding_state = STATE_NORMAL;
243             }
244             else {
245                 PyErr_Format(PyExc_SyntaxError,
246                              "encoding problem: %s", cs);
247                 PyMem_FREE(cs);
248             }
249         }
250     } else {                /* then, compare cs with BOM */
251         r = (strcmp(tok->encoding, cs) == 0);
252         if (!r)
253             PyErr_Format(PyExc_SyntaxError,
254                          "encoding problem: %s with BOM", cs);
255         PyMem_FREE(cs);
256     }
257     return r;
258 }
259 
260 /* See whether the file starts with a BOM. If it does,
261    invoke the set_readline function with the new encoding.
262    Return 1 on success, 0 on failure.  */
263 
264 static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
269 {
270     int ch1, ch2, ch3;
271     ch1 = get_char(tok);
272     tok->decoding_state = STATE_RAW;
273     if (ch1 == EOF) {
274         return 1;
275     } else if (ch1 == 0xEF) {
276         ch2 = get_char(tok);
277         if (ch2 != 0xBB) {
278             unget_char(ch2, tok);
279             unget_char(ch1, tok);
280             return 1;
281         }
282         ch3 = get_char(tok);
283         if (ch3 != 0xBF) {
284             unget_char(ch3, tok);
285             unget_char(ch2, tok);
286             unget_char(ch1, tok);
287             return 1;
288         }
289 #if 0
290     /* Disable support for UTF-16 BOMs until a decision
291        is made whether this needs to be supported.  */
292     } else if (ch1 == 0xFE) {
293         ch2 = get_char(tok);
294         if (ch2 != 0xFF) {
295             unget_char(ch2, tok);
296             unget_char(ch1, tok);
297             return 1;
298         }
299         if (!set_readline(tok, "utf-16-be"))
300             return 0;
301         tok->decoding_state = STATE_NORMAL;
302     } else if (ch1 == 0xFF) {
303         ch2 = get_char(tok);
304         if (ch2 != 0xFE) {
305             unget_char(ch2, tok);
306             unget_char(ch1, tok);
307             return 1;
308         }
309         if (!set_readline(tok, "utf-16-le"))
310             return 0;
311         tok->decoding_state = STATE_NORMAL;
312 #endif
313     } else {
314         unget_char(ch1, tok);
315         return 1;
316     }
317     if (tok->encoding != NULL)
318         PyMem_FREE(tok->encoding);
319     tok->encoding = new_string("utf-8", 5, tok);
320     if (!tok->encoding)
321         return 0;
322     /* No need to set_readline: input is already utf-8 */
323     return 1;
324 }
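/* Example: input starting with the bytes 0xEF 0xBB 0xBF (the UTF-8 BOM)
   has the BOM consumed and tok->encoding set to "utf-8"; any other first
   byte is pushed back and the input is read unchanged. */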
325 
326 /* Read a line of text from TOK into S, using the stream in TOK.
327    Return NULL on failure, else S.
328 
329    On entry, tok->decoding_buffer will be one of:
330      1) NULL: need to call tok->decoding_readline to get a new line
331      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
332        stored the result in tok->decoding_buffer
333      3) PyByteArrayObject *: previous call to fp_readl did not have enough room
334        (in the s buffer) to copy entire contents of the line read
335        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
336        In this case, fp_readl is called in a loop (with an expanded buffer)
337        until the buffer ends with a '\n' (or until the end of the file is
338        reached): see tok_nextc and its calls to decoding_fgets.
339 */
340 
341 static char *
fp_readl(char *s, int size, struct tok_state *tok)
343 {
344     PyObject* bufobj;
345     const char *buf;
346     Py_ssize_t buflen;
347 
348     /* Ask for one less byte so we can terminate it */
349     assert(size > 0);
350     size--;
351 
352     if (tok->decoding_buffer) {
353         bufobj = tok->decoding_buffer;
354         Py_INCREF(bufobj);
355     }
356     else
357     {
358         bufobj = _PyObject_CallNoArg(tok->decoding_readline);
359         if (bufobj == NULL)
360             goto error;
361     }
362     if (PyUnicode_CheckExact(bufobj))
363     {
364         buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
365         if (buf == NULL) {
366             goto error;
367         }
368     }
369     else
370     {
371         buf = PyByteArray_AsString(bufobj);
372         if (buf == NULL) {
373             goto error;
374         }
375         buflen = PyByteArray_GET_SIZE(bufobj);
376     }
377 
378     Py_XDECREF(tok->decoding_buffer);
379     if (buflen > size) {
380         /* Too many chars, the rest goes into tok->decoding_buffer */
381         tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
382                                                          buflen-size);
383         if (tok->decoding_buffer == NULL)
384             goto error;
385         buflen = size;
386     }
387     else
388         tok->decoding_buffer = NULL;
389 
390     memcpy(s, buf, buflen);
391     s[buflen] = '\0';
392     if (buflen == 0) /* EOF */
393         s = NULL;
394     Py_DECREF(bufobj);
395     return s;
396 
397 error:
398     Py_XDECREF(bufobj);
399     return error_ret(tok);
400 }
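/* Example: if readline yields a 300-byte line but only 120 bytes of room
   were passed in, the first 119 bytes are copied (one byte is reserved for
   the terminating NUL) and the remaining 181 bytes are parked in
   tok->decoding_buffer as a bytearray, to be drained by the next call. */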
401 
402 /* Set the readline function for TOK to a StreamReader's
403    readline function. The StreamReader is named ENC.
404 
405    This function is called from check_bom and check_coding_spec.
406 
407    ENC is usually identical to the future value of tok->encoding,
408    except for the (currently unsupported) case of UTF-16.
409 
410    Return 1 on success, 0 on failure. */
411 
412 static int
fp_setreadl(struct tok_state *tok, const char* enc)
414 {
415     PyObject *readline, *io, *stream;
416     _Py_IDENTIFIER(open);
417     _Py_IDENTIFIER(readline);
418     int fd;
419     long pos;
420 
421     fd = fileno(tok->fp);
422     /* Due to buffering the file offset for fd can be different from the file
423      * position of tok->fp.  If tok->fp was opened in text mode on Windows,
424      * its file position counts CRLF as one char and can't be directly mapped
425      * to the file offset for fd.  Instead we step back one byte and read to
426      * the end of line.*/
427     pos = ftell(tok->fp);
428     if (pos == -1 ||
429         lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
430         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
431         return 0;
432     }
433 
434     io = PyImport_ImportModuleNoBlock("io");
435     if (io == NULL)
436         return 0;
437 
438     stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
439                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
440     Py_DECREF(io);
441     if (stream == NULL)
442         return 0;
443 
444     readline = _PyObject_GetAttrId(stream, &PyId_readline);
445     Py_DECREF(stream);
446     if (readline == NULL)
447         return 0;
448     Py_XSETREF(tok->decoding_readline, readline);
449 
450     if (pos > 0) {
451         PyObject *bufobj = _PyObject_CallNoArg(readline);
452         if (bufobj == NULL)
453             return 0;
454         Py_DECREF(bufobj);
455     }
456 
457     return 1;
458 }
459 
460 /* Fetch the next byte from TOK. */
461 
static int fp_getc(struct tok_state *tok) {
463     return getc(tok->fp);
464 }
465 
466 /* Unfetch the last byte back into TOK.  */
467 
static void fp_ungetc(int c, struct tok_state *tok) {
469     ungetc(c, tok->fp);
470 }
471 
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of bytes forming
   the sequence if yes, 0 if not.  */
static int valid_utf8(const unsigned char* s)
476 {
477     int expected = 0;
478     int length;
479     if (*s < 0x80)
480         /* single-byte code */
481         return 1;
482     if (*s < 0xc0)
483         /* following byte */
484         return 0;
485     if (*s < 0xE0)
486         expected = 1;
487     else if (*s < 0xF0)
488         expected = 2;
489     else if (*s < 0xF8)
490         expected = 3;
491     else
492         return 0;
493     length = expected + 1;
494     for (; expected; expected--)
495         if (s[expected] < 0x80 || s[expected] >= 0xC0)
496             return 0;
497     return length;
498 }
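/* Examples: for "A" (0x41) this returns 1; for the two-byte sequence
   0xC3 0xA9 (U+00E9) it returns 2; for a stray continuation byte such
   as 0x80 it returns 0. */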
499 
500 /* Read a line of input from TOK. Determine encoding
501    if necessary.  */
502 
503 static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
505 {
506     char *line = NULL;
507     int badchar = 0;
508     for (;;) {
509         if (tok->decoding_state == STATE_NORMAL) {
510             /* We already have a codec associated with
511                this input. */
512             line = fp_readl(s, size, tok);
513             break;
514         } else if (tok->decoding_state == STATE_RAW) {
515             /* We want a 'raw' read. */
516             line = Py_UniversalNewlineFgets(s, size,
517                                             tok->fp, NULL);
518             break;
519         } else {
520             /* We have not yet determined the encoding.
521                If an encoding is found, use the file-pointer
522                reader functions from now on. */
523             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
524                 return error_ret(tok);
525             assert(tok->decoding_state != STATE_INIT);
526         }
527     }
528     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
529         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
530             return error_ret(tok);
531         }
532     }
533     /* The default encoding is UTF-8, so make sure we don't have any
534        non-UTF-8 sequences in it. */
535     if (line && !tok->encoding) {
536         unsigned char *c;
537         int length;
538         for (c = (unsigned char *)line; *c; c += length)
539             if (!(length = valid_utf8(c))) {
540                 badchar = *c;
541                 break;
542             }
543     }
544     if (badchar) {
545         /* Need to add 1 to the line number, since this line
546            has not been counted, yet.  */
547         PyErr_Format(PyExc_SyntaxError,
548                 "Non-UTF-8 code starting with '\\x%.2x' "
549                 "in file %U on line %i, "
550                 "but no encoding declared; "
551                 "see http://python.org/dev/peps/pep-0263/ for details",
552                 badchar, tok->filename, tok->lineno + 1);
553         return error_ret(tok);
554     }
555     return line;
556 }
557 
558 static int
decoding_feof(struct tok_state *tok)
560 {
561     if (tok->decoding_state != STATE_NORMAL) {
562         return feof(tok->fp);
563     } else {
564         PyObject* buf = tok->decoding_buffer;
565         if (buf == NULL) {
566             buf = _PyObject_CallNoArg(tok->decoding_readline);
567             if (buf == NULL) {
568                 error_ret(tok);
569                 return 1;
570             } else {
571                 tok->decoding_buffer = buf;
572             }
573         }
574         return PyObject_Length(buf) == 0;
575     }
576 }
577 
578 /* Fetch a byte from TOK, using the string buffer. */
579 
580 static int
buf_getc(struct tok_state *tok) {
582     return Py_CHARMASK(*tok->str++);
583 }
584 
585 /* Unfetch a byte from TOK, using the string buffer. */
586 
587 static void
buf_ungetc(int c, struct tok_state *tok) {
589     tok->str--;
590     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
591 }
592 
593 /* Set the readline function for TOK to ENC. For the string-based
594    tokenizer, this means to just record the encoding. */
595 
596 static int
buf_setreadl(struct tok_state *tok, const char* enc) {
598     tok->enc = enc;
599     return 1;
600 }
601 
/* Return a UTF-8 encoded Python bytes object translated from the
   C byte string STR, which is encoded with ENC. */
604 
605 static PyObject *
translate_into_utf8(const char* str, const char* enc) {
607     PyObject *utf8;
608     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
609     if (buf == NULL)
610         return NULL;
611     utf8 = PyUnicode_AsUTF8String(buf);
612     Py_DECREF(buf);
613     return utf8;
614 }
615 
616 
617 static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
619     int skip_next_lf = 0;
620     size_t needed_length = strlen(s) + 2, final_length;
621     char *buf, *current;
622     char c = '\0';
623     buf = PyMem_MALLOC(needed_length);
624     if (buf == NULL) {
625         tok->done = E_NOMEM;
626         return NULL;
627     }
628     for (current = buf; *s; s++, current++) {
629         c = *s;
630         if (skip_next_lf) {
631             skip_next_lf = 0;
632             if (c == '\n') {
633                 c = *++s;
634                 if (!c)
635                     break;
636             }
637         }
638         if (c == '\r') {
639             skip_next_lf = 1;
640             c = '\n';
641         }
642         *current = c;
643     }
644     /* If this is exec input, add a newline to the end of the string if
645        there isn't one already. */
646     if (exec_input && c != '\n') {
647         *current = '\n';
648         current++;
649     }
650     *current = '\0';
651     final_length = current - buf + 1;
652     if (final_length < needed_length && final_length) {
653         /* should never fail */
654         char* result = PyMem_REALLOC(buf, final_length);
655         if (result == NULL) {
656             PyMem_FREE(buf);
657         }
658         buf = result;
659     }
660     return buf;
661 }
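/* Example: with exec_input true, "a = 1\r\nb = 2\r" becomes "a = 1\nb = 2\n";
   a final newline is appended only when the input does not already end in
   one, so the tokenizer always sees a newline-terminated last line. */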
662 
663 /* Decode a byte string STR for use as the buffer of TOK.
664    Look for encoding declarations inside STR, and record them
665    inside TOK.  */
666 
667 static const char *
decode_str(const char *input, int single, struct tok_state *tok)
669 {
670     PyObject* utf8 = NULL;
671     const char *str;
672     const char *s;
673     const char *newl[2] = {NULL, NULL};
674     int lineno = 0;
675     tok->input = str = translate_newlines(input, single, tok);
676     if (str == NULL)
677         return NULL;
678     tok->enc = NULL;
679     tok->str = str;
680     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
681         return error_ret(tok);
682     str = tok->str;             /* string after BOM if any */
683     assert(str);
684     if (tok->enc != NULL) {
685         utf8 = translate_into_utf8(str, tok->enc);
686         if (utf8 == NULL)
687             return error_ret(tok);
688         str = PyBytes_AsString(utf8);
689     }
690     for (s = str;; s++) {
691         if (*s == '\0') break;
692         else if (*s == '\n') {
693             assert(lineno < 2);
694             newl[lineno] = s;
695             lineno++;
696             if (lineno == 2) break;
697         }
698     }
699     tok->enc = NULL;
700     /* need to check line 1 and 2 separately since check_coding_spec
701        assumes a single line as input */
702     if (newl[0]) {
703         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
704             return error_ret(tok);
705         if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
706             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
707                                    tok, buf_setreadl))
708                 return error_ret(tok);
709         }
710     }
711     if (tok->enc != NULL) {
712         assert(utf8 == NULL);
713         utf8 = translate_into_utf8(str, tok->enc);
714         if (utf8 == NULL)
715             return error_ret(tok);
716         str = PyBytes_AS_STRING(utf8);
717     }
718     assert(tok->decoding_buffer == NULL);
719     tok->decoding_buffer = utf8; /* CAUTION */
720     return str;
721 }
722 
723 /* Set up tokenizer for string */
724 
725 struct tok_state *
PyTokenizer_FromString(const char *str, int exec_input)
727 {
728     struct tok_state *tok = tok_new();
729     if (tok == NULL)
730         return NULL;
731     str = decode_str(str, exec_input, tok);
732     if (str == NULL) {
733         PyTokenizer_Free(tok);
734         return NULL;
735     }
736 
737     /* XXX: constify members. */
738     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
739     return tok;
740 }
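/* Minimal usage sketch (illustration only, not compiled; the helper name is
   hypothetical).  Callers outside this file normally go through
   PyTokenizer_Get(), which wraps the static tok_get() defined below. */
#if 0
static void
dump_tokens(const char *source)
{
    char *start, *end;
    struct tok_state *tok = PyTokenizer_FromString(source, /* exec_input */ 1);
    if (tok == NULL)
        return;
    while (tok_get(tok, &start, &end) != ENDMARKER) {
        if (tok->done != E_OK)
            break;
    }
    PyTokenizer_Free(tok);
}
#endif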
741 
742 struct tok_state *
PyTokenizer_FromUTF8(const char *str, int exec_input)
744 {
745     struct tok_state *tok = tok_new();
746     if (tok == NULL)
747         return NULL;
748     tok->input = str = translate_newlines(str, exec_input, tok);
749     if (str == NULL) {
750         PyTokenizer_Free(tok);
751         return NULL;
752     }
753     tok->decoding_state = STATE_RAW;
754     tok->read_coding_spec = 1;
755     tok->enc = NULL;
756     tok->str = str;
757     tok->encoding = (char *)PyMem_MALLOC(6);
758     if (!tok->encoding) {
759         PyTokenizer_Free(tok);
760         return NULL;
761     }
762     strcpy(tok->encoding, "utf-8");
763 
764     /* XXX: constify members. */
765     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
766     return tok;
767 }
768 
769 /* Set up tokenizer for file */
770 
771 struct tok_state *
PyTokenizer_FromFile(FILE *fp, const char* enc,
                     const char *ps1, const char *ps2)
774 {
775     struct tok_state *tok = tok_new();
776     if (tok == NULL)
777         return NULL;
778     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
779         PyTokenizer_Free(tok);
780         return NULL;
781     }
782     tok->cur = tok->inp = tok->buf;
783     tok->end = tok->buf + BUFSIZ;
784     tok->fp = fp;
785     tok->prompt = ps1;
786     tok->nextprompt = ps2;
787     if (enc != NULL) {
788         /* Must copy encoding declaration since it
789            gets copied into the parse tree. */
790         tok->encoding = PyMem_MALLOC(strlen(enc)+1);
791         if (!tok->encoding) {
792             PyTokenizer_Free(tok);
793             return NULL;
794         }
795         strcpy(tok->encoding, enc);
796         tok->decoding_state = STATE_NORMAL;
797     }
798     return tok;
799 }
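/* Sketch (illustration only): the interactive path passes prompts, e.g.
   PyTokenizer_FromFile(stdin, NULL, ">>> ", "... "), while compiling a file
   passes NULL prompts and lets decoding_fgets() pick the encoding from a
   BOM or coding cookie unless ENC is given explicitly. */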
800 
801 
802 /* Free a tok_state structure */
803 
804 void
PyTokenizer_Free(struct tok_state *tok)
806 {
807     if (tok->encoding != NULL)
808         PyMem_FREE(tok->encoding);
809     Py_XDECREF(tok->decoding_readline);
810     Py_XDECREF(tok->decoding_buffer);
811     Py_XDECREF(tok->filename);
812     if (tok->fp != NULL && tok->buf != NULL)
813         PyMem_FREE(tok->buf);
814     if (tok->input)
815         PyMem_FREE((char *)tok->input);
816     PyMem_FREE(tok);
817 }
818 
819 /* Get next char, updating state; error code goes into tok->done */
820 
821 static int
tok_nextc(struct tok_state *tok)
823 {
824     for (;;) {
825         if (tok->cur != tok->inp) {
826             return Py_CHARMASK(*tok->cur++); /* Fast path */
827         }
828         if (tok->done != E_OK)
829             return EOF;
830         if (tok->fp == NULL) {
831             char *end = strchr(tok->inp, '\n');
832             if (end != NULL)
833                 end++;
834             else {
835                 end = strchr(tok->inp, '\0');
836                 if (end == tok->inp) {
837                     tok->done = E_EOF;
838                     return EOF;
839                 }
840             }
841             if (tok->start == NULL)
842                 tok->buf = tok->cur;
843             tok->line_start = tok->cur;
844             tok->lineno++;
845             tok->inp = end;
846             return Py_CHARMASK(*tok->cur++);
847         }
848         if (tok->prompt != NULL) {
849             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
850             if (newtok != NULL) {
851                 char *translated = translate_newlines(newtok, 0, tok);
852                 PyMem_FREE(newtok);
853                 if (translated == NULL)
854                     return EOF;
855                 newtok = translated;
856             }
857             if (tok->encoding && newtok && *newtok) {
858                 /* Recode to UTF-8 */
859                 Py_ssize_t buflen;
860                 const char* buf;
861                 PyObject *u = translate_into_utf8(newtok, tok->encoding);
862                 PyMem_FREE(newtok);
863                 if (!u) {
864                     tok->done = E_DECODE;
865                     return EOF;
866                 }
867                 buflen = PyBytes_GET_SIZE(u);
868                 buf = PyBytes_AS_STRING(u);
869                 newtok = PyMem_MALLOC(buflen+1);
870                 if (newtok == NULL) {
871                     Py_DECREF(u);
872                     tok->done = E_NOMEM;
873                     return EOF;
874                 }
875                 strcpy(newtok, buf);
876                 Py_DECREF(u);
877             }
878             if (tok->nextprompt != NULL)
879                 tok->prompt = tok->nextprompt;
880             if (newtok == NULL)
881                 tok->done = E_INTR;
882             else if (*newtok == '\0') {
883                 PyMem_FREE(newtok);
884                 tok->done = E_EOF;
885             }
886             else if (tok->start != NULL) {
887                 size_t start = tok->start - tok->buf;
888                 size_t oldlen = tok->cur - tok->buf;
889                 size_t newlen = oldlen + strlen(newtok);
890                 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
891                 char *buf = tok->buf;
892                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
893                 tok->lineno++;
894                 if (buf == NULL) {
895                     PyMem_FREE(tok->buf);
896                     tok->buf = NULL;
897                     PyMem_FREE(newtok);
898                     tok->done = E_NOMEM;
899                     return EOF;
900                 }
901                 tok->buf = buf;
902                 tok->cur = tok->buf + oldlen;
903                 tok->multi_line_start = tok->buf + cur_multi_line_start;
904                 tok->line_start = tok->cur;
905                 strcpy(tok->buf + oldlen, newtok);
906                 PyMem_FREE(newtok);
907                 tok->inp = tok->buf + newlen;
908                 tok->end = tok->inp + 1;
909                 tok->start = tok->buf + start;
910             }
911             else {
912                 tok->lineno++;
913                 if (tok->buf != NULL)
914                     PyMem_FREE(tok->buf);
915                 tok->buf = newtok;
916                 tok->cur = tok->buf;
917                 tok->line_start = tok->buf;
918                 tok->inp = strchr(tok->buf, '\0');
919                 tok->end = tok->inp + 1;
920             }
921         }
922         else {
923             int done = 0;
924             Py_ssize_t cur = 0;
925             char *pt;
926             if (tok->start == NULL) {
927                 if (tok->buf == NULL) {
928                     tok->buf = (char *)
929                         PyMem_MALLOC(BUFSIZ);
930                     if (tok->buf == NULL) {
931                         tok->done = E_NOMEM;
932                         return EOF;
933                     }
934                     tok->end = tok->buf + BUFSIZ;
935                 }
936                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
937                           tok) == NULL) {
938                     if (!tok->decoding_erred)
939                         tok->done = E_EOF;
940                     done = 1;
941                 }
942                 else {
943                     tok->done = E_OK;
944                     tok->inp = strchr(tok->buf, '\0');
945                     done = tok->inp == tok->buf || tok->inp[-1] == '\n';
946                 }
947             }
948             else {
949                 cur = tok->cur - tok->buf;
950                 if (decoding_feof(tok)) {
951                     tok->done = E_EOF;
952                     done = 1;
953                 }
954                 else
955                     tok->done = E_OK;
956             }
957             tok->lineno++;
958             /* Read until '\n' or EOF */
959             while (!done) {
960                 Py_ssize_t curstart = tok->start == NULL ? -1 :
961                           tok->start - tok->buf;
962                 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
963                 Py_ssize_t curvalid = tok->inp - tok->buf;
964                 Py_ssize_t newsize = curvalid + BUFSIZ;
965                 char *newbuf = tok->buf;
966                 newbuf = (char *)PyMem_REALLOC(newbuf,
967                                                newsize);
968                 if (newbuf == NULL) {
969                     tok->done = E_NOMEM;
970                     tok->cur = tok->inp;
971                     return EOF;
972                 }
973                 tok->buf = newbuf;
974                 tok->cur = tok->buf + cur;
975                 tok->multi_line_start = tok->buf + cur_multi_line_start;
976                 tok->line_start = tok->cur;
977                 tok->inp = tok->buf + curvalid;
978                 tok->end = tok->buf + newsize;
979                 tok->start = curstart < 0 ? NULL :
980                          tok->buf + curstart;
981                 if (decoding_fgets(tok->inp,
982                                (int)(tok->end - tok->inp),
983                                tok) == NULL) {
984                     /* Break out early on decoding
985                        errors, as tok->buf will be NULL
986                      */
987                     if (tok->decoding_erred)
988                         return EOF;
989                     /* Last line does not end in \n,
990                        fake one */
991                     if (tok->inp[-1] != '\n')
992                         strcpy(tok->inp, "\n");
993                 }
994                 tok->inp = strchr(tok->inp, '\0');
995                 done = tok->inp[-1] == '\n';
996             }
997             if (tok->buf != NULL) {
998                 tok->cur = tok->buf + cur;
999                 tok->line_start = tok->cur;
1000                 /* replace "\r\n" with "\n" */
1001                 /* For Mac leave the \r, giving a syntax error */
1002                 pt = tok->inp - 2;
1003                 if (pt >= tok->buf && *pt == '\r') {
1004                     *pt++ = '\n';
1005                     *pt = '\0';
1006                     tok->inp = pt;
1007                 }
1008             }
1009         }
1010         if (tok->done != E_OK) {
1011             if (tok->prompt != NULL)
1012                 PySys_WriteStderr("\n");
1013             tok->cur = tok->inp;
1014             return EOF;
1015         }
1016     }
1017     /*NOTREACHED*/
1018 }
1019 
1020 
1021 /* Back-up one character */
1022 
1023 static void
tok_backup(struct tok_state *tok, int c)
1025 {
1026     if (c != EOF) {
1027         if (--tok->cur < tok->buf)
1028             Py_FatalError("tok_backup: beginning of buffer");
1029         if (*tok->cur != c)
1030             *tok->cur = c;
1031     }
1032 }
1033 
1034 
1035 static int
syntaxerror(struct tok_state *tok, const char *format, ...)
1037 {
1038     PyObject *errmsg, *errtext, *args;
1039     va_list vargs;
1040 #ifdef HAVE_STDARG_PROTOTYPES
1041     va_start(vargs, format);
1042 #else
1043     va_start(vargs);
1044 #endif
1045     errmsg = PyUnicode_FromFormatV(format, vargs);
1046     va_end(vargs);
1047     if (!errmsg) {
1048         goto error;
1049     }
1050 
1051     errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1052                                    "replace");
1053     if (!errtext) {
1054         goto error;
1055     }
1056     int offset = (int)PyUnicode_GET_LENGTH(errtext);
1057     Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1058     if (line_len != tok->cur - tok->line_start) {
1059         Py_DECREF(errtext);
1060         errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1061                                        "replace");
1062     }
1063     if (!errtext) {
1064         goto error;
1065     }
1066 
1067     args = Py_BuildValue("(O(OiiN))", errmsg,
1068                          tok->filename, tok->lineno, offset, errtext);
1069     if (args) {
1070         PyErr_SetObject(PyExc_SyntaxError, args);
1071         Py_DECREF(args);
1072     }
1073 
1074 error:
1075     Py_XDECREF(errmsg);
1076     tok->done = E_ERROR;
1077     return ERRORTOKEN;
1078 }
1079 
1080 static int
indenterror(struct tok_state *tok)
1082 {
1083     tok->done = E_TABSPACE;
1084     tok->cur = tok->inp;
1085     return ERRORTOKEN;
1086 }
1087 
1088 /* Verify that the identifier follows PEP 3131.
1089    All identifier strings are guaranteed to be "ready" unicode objects.
1090  */
1091 static int
verify_identifier(struct tok_state *tok)
1093 {
1094     PyObject *s;
1095     int result;
1096     if (tok->decoding_erred)
1097         return 0;
1098     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1099     if (s == NULL) {
1100         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1101             PyErr_Clear();
1102             tok->done = E_IDENTIFIER;
1103         } else {
1104             tok->done = E_ERROR;
1105         }
1106         return 0;
1107     }
1108     result = PyUnicode_IsIdentifier(s);
1109     Py_DECREF(s);
1110     if (result == 0)
1111         tok->done = E_IDENTIFIER;
1112     return result;
1113 }
1114 
1115 static int
tok_decimal_tail(struct tok_state *tok)
1117 {
1118     int c;
1119 
1120     while (1) {
1121         do {
1122             c = tok_nextc(tok);
1123         } while (isdigit(c));
1124         if (c != '_') {
1125             break;
1126         }
1127         c = tok_nextc(tok);
1128         if (!isdigit(c)) {
1129             tok_backup(tok, c);
1130             syntaxerror(tok, "invalid decimal literal");
1131             return 0;
1132         }
1133     }
1134     return c;
1135 }
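/* Examples: "1_000_000" is consumed as one run of digits; "1__0" and a
   trailing underscore as in "10_" stop with "invalid decimal literal". */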
1136 
1137 /* Get next token, after space stripping etc. */
1138 
1139 static int
tok_get(struct tok_state *tok, char **p_start, char **p_end)
1141 {
1142     int c;
1143     int blankline, nonascii;
1144 
1145     *p_start = *p_end = NULL;
1146   nextline:
1147     tok->start = NULL;
1148     blankline = 0;
1149 
1150     /* Get indentation level */
1151     if (tok->atbol) {
1152         int col = 0;
1153         int altcol = 0;
1154         tok->atbol = 0;
1155         for (;;) {
1156             c = tok_nextc(tok);
1157             if (c == ' ') {
1158                 col++, altcol++;
1159             }
1160             else if (c == '\t') {
1161                 col = (col / tok->tabsize + 1) * tok->tabsize;
1162                 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1163             }
1164             else if (c == '\014')  {/* Control-L (formfeed) */
1165                 col = altcol = 0; /* For Emacs users */
1166             }
1167             else {
1168                 break;
1169             }
1170         }
1171         tok_backup(tok, c);
1172         if (c == '#' || c == '\n') {
1173             /* Lines with only whitespace and/or comments
1174                shouldn't affect the indentation and are
1175                not passed to the parser as NEWLINE tokens,
1176                except *totally* empty lines in interactive
1177                mode, which signal the end of a command group. */
1178             if (col == 0 && c == '\n' && tok->prompt != NULL) {
1179                 blankline = 0; /* Let it through */
1180             }
1181             else if (tok->prompt != NULL && tok->lineno == 1) {
1182                 /* In interactive mode, if the first line contains
1183                    only spaces and/or a comment, let it through. */
1184                 blankline = 0;
1185                 col = altcol = 0;
1186             }
1187             else {
1188                 blankline = 1; /* Ignore completely */
1189             }
1190             /* We can't jump back right here since we still
1191                may need to skip to the end of a comment */
1192         }
1193         if (!blankline && tok->level == 0) {
1194             if (col == tok->indstack[tok->indent]) {
1195                 /* No change */
1196                 if (altcol != tok->altindstack[tok->indent]) {
1197                     return indenterror(tok);
1198                 }
1199             }
1200             else if (col > tok->indstack[tok->indent]) {
1201                 /* Indent -- always one */
1202                 if (tok->indent+1 >= MAXINDENT) {
1203                     tok->done = E_TOODEEP;
1204                     tok->cur = tok->inp;
1205                     return ERRORTOKEN;
1206                 }
1207                 if (altcol <= tok->altindstack[tok->indent]) {
1208                     return indenterror(tok);
1209                 }
1210                 tok->pendin++;
1211                 tok->indstack[++tok->indent] = col;
1212                 tok->altindstack[tok->indent] = altcol;
1213             }
1214             else /* col < tok->indstack[tok->indent] */ {
1215                 /* Dedent -- any number, must be consistent */
1216                 while (tok->indent > 0 &&
1217                     col < tok->indstack[tok->indent]) {
1218                     tok->pendin--;
1219                     tok->indent--;
1220                 }
1221                 if (col != tok->indstack[tok->indent]) {
1222                     tok->done = E_DEDENT;
1223                     tok->cur = tok->inp;
1224                     return ERRORTOKEN;
1225                 }
1226                 if (altcol != tok->altindstack[tok->indent]) {
1227                     return indenterror(tok);
1228                 }
1229             }
1230         }
1231     }
1232 
1233     tok->start = tok->cur;
1234 
1235     /* Return pending indents/dedents */
1236     if (tok->pendin != 0) {
1237         if (tok->pendin < 0) {
1238             tok->pendin++;
1239             return DEDENT;
1240         }
1241         else {
1242             tok->pendin--;
1243             return INDENT;
1244         }
1245     }
1246 
1247     /* Peek ahead at the next character */
1248     c = tok_nextc(tok);
1249     tok_backup(tok, c);
1250     /* Check if we are closing an async function */
1251     if (tok->async_def
1252         && !blankline
1253         /* Due to some implementation artifacts of type comments,
1254          * a TYPE_COMMENT at the start of a function won't set an
1255          * indentation level and it will produce a NEWLINE after it.
1256          * To avoid spuriously ending an async function due to this,
1257          * wait until we have some non-newline char in front of us. */
1258         && c != '\n'
1259         && tok->level == 0
1260         /* There was a NEWLINE after ASYNC DEF,
1261            so we're past the signature. */
1262         && tok->async_def_nl
1263         /* Current indentation level is less than where
1264            the async function was defined */
1265         && tok->async_def_indent >= tok->indent)
1266     {
1267         tok->async_def = 0;
1268         tok->async_def_indent = 0;
1269         tok->async_def_nl = 0;
1270     }
1271 
1272  again:
1273     tok->start = NULL;
1274     /* Skip spaces */
1275     do {
1276         c = tok_nextc(tok);
1277     } while (c == ' ' || c == '\t' || c == '\014');
1278 
1279     /* Set start of current token */
1280     tok->start = tok->cur - 1;
1281 
1282     /* Skip comment, unless it's a type comment */
1283     if (c == '#') {
1284         const char *prefix, *p, *type_start;
1285 
1286         while (c != EOF && c != '\n') {
1287             c = tok_nextc(tok);
1288         }
1289 
1290         if (tok->type_comments) {
1291             p = tok->start;
1292             prefix = type_comment_prefix;
1293             while (*prefix && p < tok->cur) {
1294                 if (*prefix == ' ') {
1295                     while (*p == ' ' || *p == '\t') {
1296                         p++;
1297                     }
1298                 } else if (*prefix == *p) {
1299                     p++;
1300                 } else {
1301                     break;
1302                 }
1303 
1304                 prefix++;
1305             }
1306 
1307             /* This is a type comment if we matched all of type_comment_prefix. */
1308             if (!*prefix) {
1309                 int is_type_ignore = 1;
1310                 const char *ignore_end = p + 6;
1311                 tok_backup(tok, c);  /* don't eat the newline or EOF */
1312 
1313                 type_start = p;
1314 
1315                 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1316                  * or anything ASCII and non-alphanumeric. */
1317                 is_type_ignore = (
1318                     tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1319                     && !(tok->cur > ignore_end
1320                          && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1321 
1322                 if (is_type_ignore) {
1323                     *p_start = (char *) ignore_end;
1324                     *p_end = tok->cur;
1325 
1326                     /* If this type ignore is the only thing on the line, consume the newline also. */
1327                     if (blankline) {
1328                         tok_nextc(tok);
1329                         tok->atbol = 1;
1330                     }
1331                     return TYPE_IGNORE;
1332                 } else {
1333                     *p_start = (char *) type_start;  /* after type_comment_prefix */
1334                     *p_end = tok->cur;
1335                     return TYPE_COMMENT;
1336                 }
1337             }
1338         }
1339     }
1340 
1341     /* Check for EOF and errors now */
1342     if (c == EOF) {
1343         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1344     }
1345 
1346     /* Identifier (most frequent token!) */
1347     nonascii = 0;
1348     if (is_potential_identifier_start(c)) {
1349         /* Process the various legal combinations of b"", r"", u"", and f"". */
1350         int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1351         while (1) {
1352             if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1353                 saw_b = 1;
1354             /* Since this is a backwards compatibility support literal we don't
1355                want to support it in arbitrary order like byte literals. */
1356             else if (!(saw_b || saw_u || saw_r || saw_f)
1357                      && (c == 'u'|| c == 'U')) {
1358                 saw_u = 1;
1359             }
1360             /* ur"" and ru"" are not supported */
1361             else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1362                 saw_r = 1;
1363             }
1364             else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1365                 saw_f = 1;
1366             }
1367             else {
1368                 break;
1369             }
1370             c = tok_nextc(tok);
1371             if (c == '"' || c == '\'') {
1372                 goto letter_quote;
1373             }
1374         }
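        /* Examples: rb"..." and BR'...' reach letter_quote with both flags
           set, while ur"..." breaks out of the loop above and is read as
           the name "ur" followed by a separate string. */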
1375         while (is_potential_identifier_char(c)) {
1376             if (c >= 128) {
1377                 nonascii = 1;
1378             }
1379             c = tok_nextc(tok);
1380         }
1381         tok_backup(tok, c);
1382         if (nonascii && !verify_identifier(tok)) {
1383             return ERRORTOKEN;
1384         }
1385         *p_start = tok->start;
1386         *p_end = tok->cur;
1387 
1388         /* async/await parsing block. */
1389         if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1390             /* May be an 'async' or 'await' token.  For Python 3.7 or
1391                later we recognize them unconditionally.  For Python
1392                3.5 or 3.6 we recognize 'async' in front of 'def', and
1393                either one inside of 'async def'.  (Technically we
1394                shouldn't recognize these at all for 3.4 or earlier,
1395                but there's no *valid* Python 3.4 code that would be
1396                rejected, and async functions will be rejected in a
1397                later phase.) */
1398             if (!tok->async_hacks || tok->async_def) {
1399                 /* Always recognize the keywords. */
1400                 if (memcmp(tok->start, "async", 5) == 0) {
1401                     return ASYNC;
1402                 }
1403                 if (memcmp(tok->start, "await", 5) == 0) {
1404                     return AWAIT;
1405                 }
1406             }
1407             else if (memcmp(tok->start, "async", 5) == 0) {
1408                 /* The current token is 'async'.
1409                    Look ahead one token to see if that is 'def'. */
1410 
1411                 struct tok_state ahead_tok;
1412                 char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1413                 int ahead_tok_kind;
1414 
1415                 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1416                 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1417                                          &ahead_tok_end);
1418 
1419                 if (ahead_tok_kind == NAME
1420                     && ahead_tok.cur - ahead_tok.start == 3
1421                     && memcmp(ahead_tok.start, "def", 3) == 0)
1422                 {
1423                     /* The next token is going to be 'def', so instead of
1424                        returning a plain NAME token, return ASYNC. */
1425                     tok->async_def_indent = tok->indent;
1426                     tok->async_def = 1;
1427                     return ASYNC;
1428                 }
1429             }
1430         }
1431 
1432         return NAME;
1433     }
1434 
1435     /* Newline */
1436     if (c == '\n') {
1437         tok->atbol = 1;
1438         if (blankline || tok->level > 0) {
1439             goto nextline;
1440         }
1441         *p_start = tok->start;
1442         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1443         tok->cont_line = 0;
1444         if (tok->async_def) {
1445             /* We're somewhere inside an 'async def' function, and
1446                we've encountered a NEWLINE after its signature. */
1447             tok->async_def_nl = 1;
1448         }
1449         return NEWLINE;
1450     }
1451 
1452     /* Period or number starting with period? */
1453     if (c == '.') {
1454         c = tok_nextc(tok);
1455         if (isdigit(c)) {
1456             goto fraction;
1457         } else if (c == '.') {
1458             c = tok_nextc(tok);
1459             if (c == '.') {
1460                 *p_start = tok->start;
1461                 *p_end = tok->cur;
1462                 return ELLIPSIS;
1463             }
1464             else {
1465                 tok_backup(tok, c);
1466             }
1467             tok_backup(tok, '.');
1468         }
1469         else {
1470             tok_backup(tok, c);
1471         }
1472         *p_start = tok->start;
1473         *p_end = tok->cur;
1474         return DOT;
1475     }
1476 
1477     /* Number */
1478     if (isdigit(c)) {
1479         if (c == '0') {
1480             /* Hex, octal or binary -- maybe. */
1481             c = tok_nextc(tok);
1482             if (c == 'x' || c == 'X') {
1483                 /* Hex */
1484                 c = tok_nextc(tok);
1485                 do {
1486                     if (c == '_') {
1487                         c = tok_nextc(tok);
1488                     }
1489                     if (!isxdigit(c)) {
1490                         tok_backup(tok, c);
1491                         return syntaxerror(tok, "invalid hexadecimal literal");
1492                     }
1493                     do {
1494                         c = tok_nextc(tok);
1495                     } while (isxdigit(c));
1496                 } while (c == '_');
1497             }
1498             else if (c == 'o' || c == 'O') {
1499                 /* Octal */
1500                 c = tok_nextc(tok);
1501                 do {
1502                     if (c == '_') {
1503                         c = tok_nextc(tok);
1504                     }
1505                     if (c < '0' || c >= '8') {
1506                         tok_backup(tok, c);
1507                         if (isdigit(c)) {
1508                             return syntaxerror(tok,
1509                                     "invalid digit '%c' in octal literal", c);
1510                         }
1511                         else {
1512                             return syntaxerror(tok, "invalid octal literal");
1513                         }
1514                     }
1515                     do {
1516                         c = tok_nextc(tok);
1517                     } while ('0' <= c && c < '8');
1518                 } while (c == '_');
1519                 if (isdigit(c)) {
1520                     return syntaxerror(tok,
1521                             "invalid digit '%c' in octal literal", c);
1522                 }
1523             }
1524             else if (c == 'b' || c == 'B') {
1525                 /* Binary */
1526                 c = tok_nextc(tok);
1527                 do {
1528                     if (c == '_') {
1529                         c = tok_nextc(tok);
1530                     }
1531                     if (c != '0' && c != '1') {
1532                         tok_backup(tok, c);
1533                         if (isdigit(c)) {
1534                             return syntaxerror(tok,
1535                                     "invalid digit '%c' in binary literal", c);
1536                         }
1537                         else {
1538                             return syntaxerror(tok, "invalid binary literal");
1539                         }
1540                     }
1541                     do {
1542                         c = tok_nextc(tok);
1543                     } while (c == '0' || c == '1');
1544                 } while (c == '_');
1545                 if (isdigit(c)) {
1546                     return syntaxerror(tok,
1547                             "invalid digit '%c' in binary literal", c);
1548                 }
1549             }
1550             else {
1551                 int nonzero = 0;
1552                 /* maybe old-style octal; c is first char of it */
1553                 /* in any case, allow '0' as a literal */
1554                 while (1) {
1555                     if (c == '_') {
1556                         c = tok_nextc(tok);
1557                         if (!isdigit(c)) {
1558                             tok_backup(tok, c);
1559                             return syntaxerror(tok, "invalid decimal literal");
1560                         }
1561                     }
1562                     if (c != '0') {
1563                         break;
1564                     }
1565                     c = tok_nextc(tok);
1566                 }
1567                 if (isdigit(c)) {
1568                     nonzero = 1;
1569                     c = tok_decimal_tail(tok);
1570                     if (c == 0) {
1571                         return ERRORTOKEN;
1572                     }
1573                 }
1574                 if (c == '.') {
1575                     c = tok_nextc(tok);
1576                     goto fraction;
1577                 }
1578                 else if (c == 'e' || c == 'E') {
1579                     goto exponent;
1580                 }
1581                 else if (c == 'j' || c == 'J') {
1582                     goto imaginary;
1583                 }
1584                 else if (nonzero) {
1585                     /* Old-style octal: now disallowed. */
1586                     tok_backup(tok, c);
1587                     return syntaxerror(tok,
1588                                        "leading zeros in decimal integer "
1589                                        "literals are not permitted; "
1590                                        "use an 0o prefix for octal integers");
1591                 }
1592             }
1593         }
1594         else {
1595             /* Decimal */
1596             c = tok_decimal_tail(tok);
1597             if (c == 0) {
1598                 return ERRORTOKEN;
1599             }
1600             {
1601                 /* Accept floating point numbers. */
1602                 if (c == '.') {
1603                     c = tok_nextc(tok);
1604         fraction:
1605                     /* Fraction */
1606                     if (isdigit(c)) {
1607                         c = tok_decimal_tail(tok);
1608                         if (c == 0) {
1609                             return ERRORTOKEN;
1610                         }
1611                     }
1612                 }
1613                 if (c == 'e' || c == 'E') {
1614                     int e;
1615                   exponent:
1616                     e = c;
1617                     /* Exponent part */
1618                     c = tok_nextc(tok);
1619                     if (c == '+' || c == '-') {
1620                         c = tok_nextc(tok);
1621                         if (!isdigit(c)) {
1622                             tok_backup(tok, c);
1623                             return syntaxerror(tok, "invalid decimal literal");
1624                         }
1625                     } else if (!isdigit(c)) {
1626                         tok_backup(tok, c);
1627                         tok_backup(tok, e);
1628                         *p_start = tok->start;
1629                         *p_end = tok->cur;
1630                         return NUMBER;
1631                     }
1632                     c = tok_decimal_tail(tok);
1633                     if (c == 0) {
1634                         return ERRORTOKEN;
1635                     }
1636                 }
1637                 if (c == 'j' || c == 'J') {
1638                     /* Imaginary part */
1639         imaginary:
1640                     c = tok_nextc(tok);
1641                 }
1642             }
1643         }
1644         tok_backup(tok, c);
1645         *p_start = tok->start;
1646         *p_end = tok->cur;
1647         return NUMBER;
1648     }
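
    /* For example, the number branch above accepts literals such as
       0o17, 0b1010, 0, 1_000, 3.14, 1e-5 and 3j, while 0o8 and 0b2 are
       rejected with the "invalid digit ... in octal/binary literal"
       messages and an old-style octal such as 0777 is rejected with the
       "leading zeros in decimal integer literals are not permitted"
       message. */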
1649 
1650   letter_quote:
1651     /* String */
1652     if (c == '\'' || c == '"') {
1653         int quote = c;
1654         int quote_size = 1;             /* 1 or 3 */
1655         int end_quote_size = 0;
1656 
1657         /* Nodes of type STRING, especially multi-line strings,
1658            must be handled differently in order to get both
1659            the starting line number and the column offset right.
1660            (cf. issue 16806) */
1661         tok->first_lineno = tok->lineno;
1662         tok->multi_line_start = tok->line_start;
1663 
1664         /* Find the quote size and start of string */
1665         c = tok_nextc(tok);
1666         if (c == quote) {
1667             c = tok_nextc(tok);
1668             if (c == quote) {
1669                 quote_size = 3;
1670             }
1671             else {
1672                 end_quote_size = 1;     /* empty string found */
1673             }
1674         }
1675         if (c != quote) {
1676             tok_backup(tok, c);
1677         }
1678 
1679         /* Get rest of string */
1680         while (end_quote_size != quote_size) {
1681             c = tok_nextc(tok);
1682             if (c == EOF) {
1683                 if (quote_size == 3) {
1684                     tok->done = E_EOFS;
1685                 }
1686                 else {
1687                     tok->done = E_EOLS;
1688                 }
1689                 tok->cur = tok->inp;
1690                 return ERRORTOKEN;
1691             }
1692             if (quote_size == 1 && c == '\n') {
1693                 tok->done = E_EOLS;
1694                 tok->cur = tok->inp;
1695                 return ERRORTOKEN;
1696             }
1697             if (c == quote) {
1698                 end_quote_size += 1;
1699             }
1700             else {
1701                 end_quote_size = 0;
1702                 if (c == '\\') {
1703                     tok_nextc(tok);  /* skip escaped char */
1704                 }
1705             }
1706         }
1707 
1708         *p_start = tok->start;
1709         *p_end = tok->cur;
1710         return STRING;
1711     }
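
    /* For instance, scanning '''abc''' sets quote_size to 3 above, and
       the loop then counts consecutive closing quotes in end_quote_size
       until three in a row are seen; hitting EOF inside a triple-quoted
       string reports E_EOFS, while a raw newline inside a single-quoted
       string reports E_EOLS. */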
1712 
1713     /* Line continuation */
1714     if (c == '\\') {
1715         c = tok_nextc(tok);
1716         if (c != '\n') {
1717             tok->done = E_LINECONT;
1718             tok->cur = tok->inp;
1719             return ERRORTOKEN;
1720         }
1721         c = tok_nextc(tok);
1722         if (c == EOF) {
1723             tok->done = E_EOF;
1724             tok->cur = tok->inp;
1725             return ERRORTOKEN;
1726         } else {
1727             tok_backup(tok, c);
1728         }
1729         tok->cont_line = 1;
1730         goto again; /* Read next line */
1731     }
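
    /* Example: a backslash immediately followed by a newline splices the
       next physical line onto the current logical line, so "x = 1 + \"
       at the end of one line and "2" on the next tokenize as a single
       logical line; a backslash followed by anything other than '\n' is
       reported via E_LINECONT. */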
1732 
1733     /* Check for two-character token */
1734     {
1735         int c2 = tok_nextc(tok);
1736         int token = PyToken_TwoChars(c, c2);
1737         if (token != OP) {
1738             int c3 = tok_nextc(tok);
1739             int token3 = PyToken_ThreeChars(c, c2, c3);
1740             if (token3 != OP) {
1741                 token = token3;
1742             }
1743             else {
1744                 tok_backup(tok, c3);
1745             }
1746             *p_start = tok->start;
1747             *p_end = tok->cur;
1748             return token;
1749         }
1750         tok_backup(tok, c2);
1751     }
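
    /* Example: for ">=" PyToken_TwoChars() yields GREATEREQUAL and the
       token is returned directly; for "**=" the two-character lookup
       yields DOUBLESTAR and the three-character lookup then upgrades it
       to DOUBLESTAREQUAL; otherwise the third character is pushed back
       with tok_backup(). */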
1752 
1753     /* Keep track of parentheses nesting level */
1754     switch (c) {
1755     case '(':
1756     case '[':
1757     case '{':
1758         if (tok->level >= MAXLEVEL) {
1759             return syntaxerror(tok, "too many nested parentheses");
1760         }
1761         tok->parenstack[tok->level] = c;
1762         tok->parenlinenostack[tok->level] = tok->lineno;
1763         tok->level++;
1764         break;
1765     case ')':
1766     case ']':
1767     case '}':
1768         if (!tok->level) {
1769             return syntaxerror(tok, "unmatched '%c'", c);
1770         }
1771         tok->level--;
1772         int opening = tok->parenstack[tok->level];
1773         if (!((opening == '(' && c == ')') ||
1774               (opening == '[' && c == ']') ||
1775               (opening == '{' && c == '}')))
1776         {
1777             if (tok->parenlinenostack[tok->level] != tok->lineno) {
1778                 return syntaxerror(tok,
1779                         "closing parenthesis '%c' does not match "
1780                         "opening parenthesis '%c' on line %d",
1781                         c, opening, tok->parenlinenostack[tok->level]);
1782             }
1783             else {
1784                 return syntaxerror(tok,
1785                         "closing parenthesis '%c' does not match "
1786                         "opening parenthesis '%c'",
1787                         c, opening);
1788             }
1789         }
1790         break;
1791     }
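
    /* Example: tokenizing "(1, 2]" pushes '(' onto parenstack and then,
       on the ']', reports "closing parenthesis ']' does not match
       opening parenthesis '('" (including the opening line number when
       the two are on different lines); a ']' with tok->level == 0
       reports "unmatched ']'". */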
1792 
1793     /* Punctuation character */
1794     *p_start = tok->start;
1795     *p_end = tok->cur;
1796     return PyToken_OneChar(c);
1797 }
1798 
1799 int
1800 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1801 {
1802     int result = tok_get(tok, p_start, p_end);
1803     if (tok->decoding_erred) {
1804         result = ERRORTOKEN;
1805         tok->done = E_DECODE;
1806     }
1807     return result;
1808 }
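
/* A minimal driver loop for PyTokenizer_Get() might look like the
   following sketch (illustrative only; the real consumer is the parser,
   and error handling is omitted):

       struct tok_state *tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
       char *start, *end;
       int type;
       do {
           type = PyTokenizer_Get(tok, &start, &end);
           ... start..end delimit the token text in the tokenizer buffer ...
       } while (type != ENDMARKER && type != ERRORTOKEN);
       PyTokenizer_Free(tok);
*/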
1809 
1810 /* Get the encoding of a Python file. Check for the coding cookie and check if
1811    the file starts with a BOM.
1812 
1813    PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1814    encoding in the first or second line of the file (in which case the encoding
1815    should be assumed to be UTF-8).
1816 
1817    The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1818    by the caller. */
1819 
1820 char *
1821 PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1822 {
1823     struct tok_state *tok;
1824     FILE *fp;
1825     char *p_start = NULL, *p_end = NULL, *encoding = NULL;
1826 
1827     fd = _Py_dup(fd);
1828     if (fd < 0) {
1829         return NULL;
1830     }
1831 
1832     fp = fdopen(fd, "r");
1833     if (fp == NULL) {
1834         return NULL;
1835     }
1836     tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1837     if (tok == NULL) {
1838         fclose(fp);
1839         return NULL;
1840     }
1841     if (filename != NULL) {
1842         Py_INCREF(filename);
1843         tok->filename = filename;
1844     }
1845     else {
1846         tok->filename = PyUnicode_FromString("<string>");
1847         if (tok->filename == NULL) {
1848             fclose(fp);
1849             PyTokenizer_Free(tok);
1850             return encoding;
1851         }
1852     }
1853     while (tok->lineno < 2 && tok->done == E_OK) {
1854         PyTokenizer_Get(tok, &p_start, &p_end);
1855     }
1856     fclose(fp);
1857     if (tok->encoding) {
1858         encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1859         if (encoding)
1860             strcpy(encoding, tok->encoding);
1861     }
1862     PyTokenizer_Free(tok);
1863     return encoding;
1864 }
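
/* Typical use (sketch): the string returned above is owned by the caller
   and must be released with PyMem_FREE():

       char *enc = PyTokenizer_FindEncodingFilename(fd, filename);
       if (enc != NULL) {
           ... look up the codec named by enc ...
           PyMem_FREE(enc);
       }
       else {
           ... no cookie or BOM found (or an error occurred): assume UTF-8 ...
       }
*/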
1865 
1866 char *
1867 PyTokenizer_FindEncoding(int fd)
1868 {
1869     return PyTokenizer_FindEncodingFilename(fd, NULL);
1870 }
1871 
1872 #ifdef Py_DEBUG
1873 
1874 void
1875 tok_dump(int type, char *start, char *end)
1876 {
1877     printf("%s", _PyParser_TokenNames[type]);
1878     if (type == NAME || type == NUMBER || type == STRING || type == OP)
1879         printf("(%.*s)", (int)(end - start), start);
1880 }
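
/* For example, tok_dump(NAME, "print", "print" + 5) writes "NAME(print)"
   to stdout; token types without associated text print only the name. */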
1881 
1882 #endif
1883