1 
2 /* Tokenizer implementation */
3 
4 #include "Python.h"
5 #include "../Include/pgenheaders.h"
6 
7 #include <ctype.h>
8 #include <assert.h>
9 
10 #include "tokenizer.h"
11 #include "../Include/errcode.h"
12 
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #endif /* PGEN */
20 
#ifndef Py_XSETREF
/* Only defined when the included headers do not already provide it.
   Stores OP2 into OP first and releases the previous value of OP
   afterwards, so OP never holds a pointer whose reference has already
   been dropped.  OP is evaluated twice; OP2 once. */
#define Py_XSETREF(op, op2)                         \
    do {                                            \
        PyObject *_py_old_ref = (PyObject *)(op);   \
        (op) = (op2);                               \
        Py_XDECREF(_py_old_ref);                    \
    } while (0)
#endif /* Py_XSETREF */
29 
#ifndef _PyObject_CallNoArg
/* Only defined when the included headers do not already provide it:
   call FUNC with no arguments through PyObject_CallObject(). */
#define _PyObject_CallNoArg(func) PyObject_CallObject((func), NULL)
#endif
33 
34 /* Alternate tab spacing */
35 #define ALTTABSIZE 1
36 
/* Non-zero if C may start an identifier: an ASCII letter, '_', or any
   value >= 128 (a byte of a non-ASCII character).
   C is parenthesized at every use so that an expression argument such as
   `cond ? a : b` is tested as a whole.  C is still evaluated several
   times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
42 
/* Non-zero if C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128 (a byte of a non-ASCII character).
   C is parenthesized at every use so that an expression argument such as
   `cond ? a : b` is tested as a whole.  C is still evaluated several
   times, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
49 
50 PyAPI_FUNC(char *) PyOS_Readline(FILE *, FILE *, const char *);
51 /* Return malloc'ed string including trailing \n;
52    empty malloc'ed string for EOF;
53    NULL if interrupted */
54 
55 /* Don't ever change this -- it would break the portability of Python code */
56 #define TABSIZE 8
57 
58 /* Forward */
59 static struct tok_state *tok_new(void);
60 static int tok_nextc(struct tok_state *tok);
61 static void tok_backup(struct tok_state *tok, int c);
62 
63 
64 /* Token names */
65 
/* The order of the entries must match the #defines in token.h! */
const char *_Ta3Parser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE", "INDENT", "DEDENT",

    "LPAR", "RPAR", "LSQB", "RSQB", "COLON", "COMMA", "SEMI",

    "PLUS", "MINUS", "STAR", "SLASH", "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "LBRACE", "RBRACE",

    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL", "PERCENTEQUAL",
    "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL", "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    "DOUBLESLASH", "DOUBLESLASHEQUAL", "AT", "ATEQUAL", "RARROW", "ELLIPSIS",

    "OP", "AWAIT", "ASYNC", "TYPE_IGNORE", "TYPE_COMMENT",
    "<ERRORTOKEN>", "COMMENT", "NL", "ENCODING", "<N_TOKENS>"
};
132 
/* Prefix that introduces a type comment.  When tokenizing, each space in
   this constant stands for "zero or more spaces or tabs". */
static const char *type_comment_prefix = "# type: ";
136 
137 
138 /* Create and initialize a new tok_state structure */
139 
140 static struct tok_state *
tok_new(void)141 tok_new(void)
142 {
143     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
144                                             sizeof(struct tok_state));
145     if (tok == NULL)
146         return NULL;
147     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
148     tok->done = E_OK;
149     tok->fp = NULL;
150     tok->input = NULL;
151     tok->tabsize = TABSIZE;
152     tok->indent = 0;
153     tok->indstack[0] = 0;
154 
155     tok->atbol = 1;
156     tok->pendin = 0;
157     tok->prompt = tok->nextprompt = NULL;
158     tok->lineno = 0;
159     tok->level = 0;
160     tok->altindstack[0] = 0;
161     tok->decoding_state = STATE_INIT;
162     tok->decoding_erred = 0;
163     tok->read_coding_spec = 0;
164     tok->enc = NULL;
165     tok->encoding = NULL;
166     tok->cont_line = 0;
167 #ifndef PGEN
168     tok->filename = NULL;
169     tok->decoding_readline = NULL;
170     tok->decoding_buffer = NULL;
171 #endif
172 
173     tok->async_def = 0;
174     tok->async_def_indent = 0;
175     tok->async_def_nl = 0;
176     tok->async_always = 0;
177 
178     return tok;
179 }
180 
181 static char *
new_string(const char * s,Py_ssize_t len,struct tok_state * tok)182 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
183 {
184     char* result = (char *)PyMem_MALLOC(len + 1);
185     if (!result) {
186         tok->done = E_NOMEM;
187         return NULL;
188     }
189     memcpy(result, s, len);
190     result[len] = '\0';
191     return result;
192 }
193 
194 #ifdef PGEN
195 
196 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)197 decoding_fgets(char *s, int size, struct tok_state *tok)
198 {
199     return fgets(s, size, tok->fp);
200 }
201 
202 static int
decoding_feof(struct tok_state * tok)203 decoding_feof(struct tok_state *tok)
204 {
205     return feof(tok->fp);
206 }
207 
/* PGEN build: no decoding is done; return a new_string() copy of STR
   (NULL, with tok->done set by new_string, if the allocation fails).
   EXEC_INPUT is not used. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length, tok);
}
213 
214 #else /* PGEN */
215 
216 static char *
error_ret(struct tok_state * tok)217 error_ret(struct tok_state *tok) /* XXX */
218 {
219     tok->decoding_erred = 1;
220     if (tok->fp != NULL && tok->buf != NULL) /* see Ta3Tokenizer_Free */
221         PyMem_FREE(tok->buf);
222     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
223     tok->done = E_DECODE;
224     return NULL;                /* as if it were EOF */
225 }
226 
227 
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   At most the first 12 characters of S are examined, after lower-casing
   them and replacing '_' with '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S is one of the recognized spellings (optionally
   followed by "-<anything>"); otherwise returns S itself, so the caller
   can detect "no change" by pointer comparison. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a non-ASCII
           byte where char is signed) to tolower() is undefined. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
256 
257 /* Return the coding spec in S, or NULL if none is found.  */
258 
259 static int
get_coding_spec(const char * s,char ** spec,Py_ssize_t size,struct tok_state * tok)260 get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
261 {
262     Py_ssize_t i;
263     *spec = NULL;
264     /* Coding spec must be in a comment, and that comment must be
265      * the only statement on the source code line. */
266     for (i = 0; i < size - 6; i++) {
267         if (s[i] == '#')
268             break;
269         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
270             return 1;
271     }
272     for (; i < size - 6; i++) { /* XXX inefficient search */
273         const char* t = s + i;
274         if (strncmp(t, "coding", 6) == 0) {
275             const char* begin = NULL;
276             t += 6;
277             if (t[0] != ':' && t[0] != '=')
278                 continue;
279             do {
280                 t++;
281             } while (t[0] == '\x20' || t[0] == '\t');
282 
283             begin = t;
284             while (Py_ISALNUM(t[0]) ||
285                    t[0] == '-' || t[0] == '_' || t[0] == '.')
286                 t++;
287 
288             if (begin < t) {
289                 char* r = new_string(begin, t - begin, tok);
290                 const char* q;
291                 if (!r)
292                     return 0;
293                 q = get_normal_name(r);
294                 if (r != q) {
295                     PyMem_FREE(r);
296                     r = new_string(q, strlen(q), tok);
297                     if (!r)
298                         return 0;
299                 }
300                 *spec = r;
301                 break;
302             }
303         }
304     }
305     return 1;
306 }
307 
308 /* Check whether the line contains a coding spec. If it does,
309    invoke the set_readline function for the new encoding.
310    This function receives the tok_state and the new encoding.
311    Return 1 on success, 0 on failure.  */
312 
313 static int
check_coding_spec(const char * line,Py_ssize_t size,struct tok_state * tok,int set_readline (struct tok_state *,const char *))314 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
315                   int set_readline(struct tok_state *, const char *))
316 {
317     char *cs;
318     int r = 1;
319 
320     if (tok->cont_line) {
321         /* It's a continuation line, so it can't be a coding spec. */
322         tok->read_coding_spec = 1;
323         return 1;
324     }
325     if (!get_coding_spec(line, &cs, size, tok))
326         return 0;
327     if (!cs) {
328         Py_ssize_t i;
329         for (i = 0; i < size; i++) {
330             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
331                 break;
332             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
333                 /* Stop checking coding spec after a line containing
334                  * anything except a comment. */
335                 tok->read_coding_spec = 1;
336                 break;
337             }
338         }
339         return 1;
340     }
341     tok->read_coding_spec = 1;
342     if (tok->encoding == NULL) {
343         assert(tok->decoding_state == STATE_RAW);
344         if (strcmp(cs, "utf-8") == 0) {
345             tok->encoding = cs;
346         } else {
347             r = set_readline(tok, cs);
348             if (r) {
349                 tok->encoding = cs;
350                 tok->decoding_state = STATE_NORMAL;
351             }
352             else {
353                 PyErr_Format(PyExc_SyntaxError,
354                              "encoding problem: %s", cs);
355                 PyMem_FREE(cs);
356             }
357         }
358     } else {                /* then, compare cs with BOM */
359         r = (strcmp(tok->encoding, cs) == 0);
360         if (!r)
361             PyErr_Format(PyExc_SyntaxError,
362                          "encoding problem: %s with BOM", cs);
363         PyMem_FREE(cs);
364     }
365     return r;
366 }
367 
368 /* See whether the file starts with a BOM. If it does,
369    invoke the set_readline function with the new encoding.
370    Return 1 on success, 0 on failure.  */
371 
372 static int
check_bom(int get_char (struct tok_state *),void unget_char (int,struct tok_state *),int set_readline (struct tok_state *,const char *),struct tok_state * tok)373 check_bom(int get_char(struct tok_state *),
374           void unget_char(int, struct tok_state *),
375           int set_readline(struct tok_state *, const char *),
376           struct tok_state *tok)
377 {
378     int ch1, ch2, ch3;
379     ch1 = get_char(tok);
380     tok->decoding_state = STATE_RAW;
381     if (ch1 == EOF) {
382         return 1;
383     } else if (ch1 == 0xEF) {
384         ch2 = get_char(tok);
385         if (ch2 != 0xBB) {
386             unget_char(ch2, tok);
387             unget_char(ch1, tok);
388             return 1;
389         }
390         ch3 = get_char(tok);
391         if (ch3 != 0xBF) {
392             unget_char(ch3, tok);
393             unget_char(ch2, tok);
394             unget_char(ch1, tok);
395             return 1;
396         }
397 #if 0
398     /* Disable support for UTF-16 BOMs until a decision
399        is made whether this needs to be supported.  */
400     } else if (ch1 == 0xFE) {
401         ch2 = get_char(tok);
402         if (ch2 != 0xFF) {
403             unget_char(ch2, tok);
404             unget_char(ch1, tok);
405             return 1;
406         }
407         if (!set_readline(tok, "utf-16-be"))
408             return 0;
409         tok->decoding_state = STATE_NORMAL;
410     } else if (ch1 == 0xFF) {
411         ch2 = get_char(tok);
412         if (ch2 != 0xFE) {
413             unget_char(ch2, tok);
414             unget_char(ch1, tok);
415             return 1;
416         }
417         if (!set_readline(tok, "utf-16-le"))
418             return 0;
419         tok->decoding_state = STATE_NORMAL;
420 #endif
421     } else {
422         unget_char(ch1, tok);
423         return 1;
424     }
425     if (tok->encoding != NULL)
426         PyMem_FREE(tok->encoding);
427     tok->encoding = new_string("utf-8", 5, tok);
428     if (!tok->encoding)
429         return 0;
430     /* No need to set_readline: input is already utf-8 */
431     return 1;
432 }
433 
434 /* Read a line of text from TOK into S, using the stream in TOK.
435    Return NULL on failure, else S.
436 
437    On entry, tok->decoding_buffer will be one of:
438      1) NULL: need to call tok->decoding_readline to get a new line
439      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
440        stored the result in tok->decoding_buffer
441      3) PyByteArrayObject *: previous call to fp_readl did not have enough room
442        (in the s buffer) to copy entire contents of the line read
443        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
444        In this case, fp_readl is called in a loop (with an expanded buffer)
445        until the buffer ends with a '\n' (or until the end of the file is
446        reached): see tok_nextc and its calls to decoding_fgets.
447 */
448 
449 static char *
fp_readl(char * s,int size,struct tok_state * tok)450 fp_readl(char *s, int size, struct tok_state *tok)
451 {
452     PyObject* bufobj;
453     const char *buf;
454     Py_ssize_t buflen;
455 
456     /* Ask for one less byte so we can terminate it */
457     assert(size > 0);
458     size--;
459 
460     if (tok->decoding_buffer) {
461         bufobj = tok->decoding_buffer;
462         Py_INCREF(bufobj);
463     }
464     else
465     {
466         bufobj = _PyObject_CallNoArg(tok->decoding_readline);
467         if (bufobj == NULL)
468             goto error;
469     }
470     if (PyUnicode_CheckExact(bufobj))
471     {
472         buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
473         if (buf == NULL) {
474             goto error;
475         }
476     }
477     else
478     {
479         buf = PyByteArray_AsString(bufobj);
480         if (buf == NULL) {
481             goto error;
482         }
483         buflen = PyByteArray_GET_SIZE(bufobj);
484     }
485 
486     Py_XDECREF(tok->decoding_buffer);
487     if (buflen > size) {
488         /* Too many chars, the rest goes into tok->decoding_buffer */
489         tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
490                                                          buflen-size);
491         if (tok->decoding_buffer == NULL)
492             goto error;
493         buflen = size;
494     }
495     else
496         tok->decoding_buffer = NULL;
497 
498     memcpy(s, buf, buflen);
499     s[buflen] = '\0';
500     if (buflen == 0) /* EOF */
501         s = NULL;
502     Py_DECREF(bufobj);
503     return s;
504 
505 error:
506     Py_XDECREF(bufobj);
507     return error_ret(tok);
508 }
509 
510 /* Set the readline function for TOK to a StreamReader's
511    readline function. The StreamReader is named ENC.
512 
513    This function is called from check_bom and check_coding_spec.
514 
515    ENC is usually identical to the future value of tok->encoding,
516    except for the (currently unsupported) case of UTF-16.
517 
518    Return 1 on success, 0 on failure. */
519 
520 static int
fp_setreadl(struct tok_state * tok,const char * enc)521 fp_setreadl(struct tok_state *tok, const char* enc)
522 {
523     PyObject *readline, *io, *stream;
524     _Py_IDENTIFIER(open);
525     _Py_IDENTIFIER(readline);
526     int fd;
527     long pos;
528 
529     fd = fileno(tok->fp);
530     /* Due to buffering the file offset for fd can be different from the file
531      * position of tok->fp.  If tok->fp was opened in text mode on Windows,
532      * its file position counts CRLF as one char and can't be directly mapped
533      * to the file offset for fd.  Instead we step back one byte and read to
534      * the end of line.*/
535     pos = ftell(tok->fp);
536     if (pos == -1 ||
537         lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
538         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
539         return 0;
540     }
541 
542     io = PyImport_ImportModuleNoBlock("io");
543     if (io == NULL)
544         return 0;
545 
546     stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
547                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
548     Py_DECREF(io);
549     if (stream == NULL)
550         return 0;
551 
552     readline = _PyObject_GetAttrId(stream, &PyId_readline);
553     Py_DECREF(stream);
554     if (readline == NULL)
555         return 0;
556     Py_XSETREF(tok->decoding_readline, readline);
557 
558     if (pos > 0) {
559         PyObject *bufobj = _PyObject_CallNoArg(readline);
560         if (bufobj == NULL)
561             return 0;
562         Py_DECREF(bufobj);
563     }
564 
565     return 1;
566 }
567 
568 /* Fetch the next byte from TOK. */
569 
fp_getc(struct tok_state * tok)570 static int fp_getc(struct tok_state *tok) {
571     return getc(tok->fp);
572 }
573 
574 /* Unfetch the last byte back into TOK.  */
575 
fp_ungetc(int c,struct tok_state * tok)576 static void fp_ungetc(int c, struct tok_state *tok) {
577     ungetc(c, tok->fp);
578 }
579 
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   Continuation bytes are examined in forward order.  A NUL terminator is
   not a continuation byte, so on a NUL-terminated string the scan stops
   at the terminator and never reads past it when a sequence is cut
   short. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}
607 
608 /* Read a line of input from TOK. Determine encoding
609    if necessary.  */
610 
611 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)612 decoding_fgets(char *s, int size, struct tok_state *tok)
613 {
614     char *line = NULL;
615     int badchar = 0;
616     for (;;) {
617         if (tok->decoding_state == STATE_NORMAL) {
618             /* We already have a codec associated with
619                this input. */
620             line = fp_readl(s, size, tok);
621             break;
622         } else if (tok->decoding_state == STATE_RAW) {
623             /* We want a 'raw' read. */
624             line = Py_UniversalNewlineFgets(s, size,
625                                             tok->fp, NULL);
626             break;
627         } else {
628             /* We have not yet determined the encoding.
629                If an encoding is found, use the file-pointer
630                reader functions from now on. */
631             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
632                 return error_ret(tok);
633             assert(tok->decoding_state != STATE_INIT);
634         }
635     }
636     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
637         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
638             return error_ret(tok);
639         }
640     }
641 #ifndef PGEN
642     /* The default encoding is UTF-8, so make sure we don't have any
643        non-UTF-8 sequences in it. */
644     if (line && !tok->encoding) {
645         unsigned char *c;
646         int length;
647         for (c = (unsigned char *)line; *c; c += length)
648             if (!(length = valid_utf8(c))) {
649                 badchar = *c;
650                 break;
651             }
652     }
653     if (badchar) {
654         /* Need to add 1 to the line number, since this line
655            has not been counted, yet.  */
656         PyErr_Format(PyExc_SyntaxError,
657                 "Non-UTF-8 code starting with '\\x%.2x' "
658                 "in file %U on line %i, "
659                 "but no encoding declared; "
660                 "see http://python.org/dev/peps/pep-0263/ for details",
661                 badchar, tok->filename, tok->lineno + 1);
662         return error_ret(tok);
663     }
664 #endif
665     return line;
666 }
667 
668 static int
decoding_feof(struct tok_state * tok)669 decoding_feof(struct tok_state *tok)
670 {
671     if (tok->decoding_state != STATE_NORMAL) {
672         return feof(tok->fp);
673     } else {
674         PyObject* buf = tok->decoding_buffer;
675         if (buf == NULL) {
676             buf = _PyObject_CallNoArg(tok->decoding_readline);
677             if (buf == NULL) {
678                 error_ret(tok);
679                 return 1;
680             } else {
681                 tok->decoding_buffer = buf;
682             }
683         }
684         return PyObject_Length(buf) == 0;
685     }
686 }
687 
688 /* Fetch a byte from TOK, using the string buffer. */
689 
690 static int
buf_getc(struct tok_state * tok)691 buf_getc(struct tok_state *tok) {
692     return Py_CHARMASK(*tok->str++);
693 }
694 
695 /* Unfetch a byte from TOK, using the string buffer. */
696 
697 static void
buf_ungetc(int c,struct tok_state * tok)698 buf_ungetc(int c, struct tok_state *tok) {
699     tok->str--;
700     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
701 }
702 
703 /* Set the readline function for TOK to ENC. For the string-based
704    tokenizer, this means to just record the encoding. */
705 
706 static int
buf_setreadl(struct tok_state * tok,const char * enc)707 buf_setreadl(struct tok_state *tok, const char* enc) {
708     tok->enc = enc;
709     return 1;
710 }
711 
712 /* Return a UTF-8 encoding Python string object from the
713    C byte string STR, which is encoded with ENC. */
714 
715 static PyObject *
translate_into_utf8(const char * str,const char * enc)716 translate_into_utf8(const char* str, const char* enc) {
717     PyObject *utf8;
718     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
719     if (buf == NULL)
720         return NULL;
721     utf8 = PyUnicode_AsUTF8String(buf);
722     Py_DECREF(buf);
723     return utf8;
724 }
725 
726 
727 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)728 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
729     int skip_next_lf = 0;
730     size_t needed_length = strlen(s) + 2, final_length;
731     char *buf, *current;
732     char c = '\0';
733     buf = PyMem_MALLOC(needed_length);
734     if (buf == NULL) {
735         tok->done = E_NOMEM;
736         return NULL;
737     }
738     for (current = buf; *s; s++, current++) {
739         c = *s;
740         if (skip_next_lf) {
741             skip_next_lf = 0;
742             if (c == '\n') {
743                 c = *++s;
744                 if (!c)
745                     break;
746             }
747         }
748         if (c == '\r') {
749             skip_next_lf = 1;
750             c = '\n';
751         }
752         *current = c;
753     }
754     /* If this is exec input, add a newline to the end of the string if
755        there isn't one already. */
756     if (exec_input && c != '\n') {
757         *current = '\n';
758         current++;
759     }
760     *current = '\0';
761     final_length = current - buf + 1;
762     if (final_length < needed_length && final_length)
763         /* should never fail */
764         buf = PyMem_REALLOC(buf, final_length);
765     return buf;
766 }
767 
768 /* Decode a byte string STR for use as the buffer of TOK.
769    Look for encoding declarations inside STR, and record them
770    inside TOK.  */
771 
772 static const char *
decode_str(const char * input,int single,struct tok_state * tok)773 decode_str(const char *input, int single, struct tok_state *tok)
774 {
775     PyObject* utf8 = NULL;
776     const char *str;
777     const char *s;
778     const char *newl[2] = {NULL, NULL};
779     int lineno = 0;
780     tok->input = str = translate_newlines(input, single, tok);
781     if (str == NULL)
782         return NULL;
783     tok->enc = NULL;
784     tok->str = str;
785     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
786         return error_ret(tok);
787     str = tok->str;             /* string after BOM if any */
788     assert(str);
789     if (tok->enc != NULL) {
790         utf8 = translate_into_utf8(str, tok->enc);
791         if (utf8 == NULL)
792             return error_ret(tok);
793         str = PyBytes_AsString(utf8);
794     }
795     for (s = str;; s++) {
796         if (*s == '\0') break;
797         else if (*s == '\n') {
798             assert(lineno < 2);
799             newl[lineno] = s;
800             lineno++;
801             if (lineno == 2) break;
802         }
803     }
804     tok->enc = NULL;
805     /* need to check line 1 and 2 separately since check_coding_spec
806        assumes a single line as input */
807     if (newl[0]) {
808         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
809             return error_ret(tok);
810         if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
811             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
812                                    tok, buf_setreadl))
813                 return error_ret(tok);
814         }
815     }
816     if (tok->enc != NULL) {
817         assert(utf8 == NULL);
818         utf8 = translate_into_utf8(str, tok->enc);
819         if (utf8 == NULL)
820             return error_ret(tok);
821         str = PyBytes_AS_STRING(utf8);
822     }
823     assert(tok->decoding_buffer == NULL);
824     tok->decoding_buffer = utf8; /* CAUTION */
825     return str;
826 }
827 
828 #endif /* PGEN */
829 
830 /* Set up tokenizer for string */
831 
832 struct tok_state *
Ta3Tokenizer_FromString(const char * str,int exec_input)833 Ta3Tokenizer_FromString(const char *str, int exec_input)
834 {
835     struct tok_state *tok = tok_new();
836     if (tok == NULL)
837         return NULL;
838     str = decode_str(str, exec_input, tok);
839     if (str == NULL) {
840         Ta3Tokenizer_Free(tok);
841         return NULL;
842     }
843 
844     /* XXX: constify members. */
845     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
846     return tok;
847 }
848 
849 struct tok_state *
Ta3Tokenizer_FromUTF8(const char * str,int exec_input)850 Ta3Tokenizer_FromUTF8(const char *str, int exec_input)
851 {
852     struct tok_state *tok = tok_new();
853     if (tok == NULL)
854         return NULL;
855 #ifndef PGEN
856     tok->input = str = translate_newlines(str, exec_input, tok);
857 #endif
858     if (str == NULL) {
859         Ta3Tokenizer_Free(tok);
860         return NULL;
861     }
862     tok->decoding_state = STATE_RAW;
863     tok->read_coding_spec = 1;
864     tok->enc = NULL;
865     tok->str = str;
866     tok->encoding = (char *)PyMem_MALLOC(6);
867     if (!tok->encoding) {
868         Ta3Tokenizer_Free(tok);
869         return NULL;
870     }
871     strcpy(tok->encoding, "utf-8");
872 
873     /* XXX: constify members. */
874     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
875     return tok;
876 }
877 
878 /* Set up tokenizer for file */
879 
880 struct tok_state *
Ta3Tokenizer_FromFile(FILE * fp,const char * enc,const char * ps1,const char * ps2)881 Ta3Tokenizer_FromFile(FILE *fp, const char* enc,
882                      const char *ps1, const char *ps2)
883 {
884     struct tok_state *tok = tok_new();
885     if (tok == NULL)
886         return NULL;
887     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
888         Ta3Tokenizer_Free(tok);
889         return NULL;
890     }
891     tok->cur = tok->inp = tok->buf;
892     tok->end = tok->buf + BUFSIZ;
893     tok->fp = fp;
894     tok->prompt = ps1;
895     tok->nextprompt = ps2;
896     if (enc != NULL) {
897         /* Must copy encoding declaration since it
898            gets copied into the parse tree. */
899         tok->encoding = PyMem_MALLOC(strlen(enc)+1);
900         if (!tok->encoding) {
901             Ta3Tokenizer_Free(tok);
902             return NULL;
903         }
904         strcpy(tok->encoding, enc);
905         tok->decoding_state = STATE_NORMAL;
906     }
907     return tok;
908 }
909 
910 
911 /* Free a tok_state structure */
912 
913 void
Ta3Tokenizer_Free(struct tok_state * tok)914 Ta3Tokenizer_Free(struct tok_state *tok)
915 {
916     if (tok->encoding != NULL)
917         PyMem_FREE(tok->encoding);
918 #ifndef PGEN
919     Py_XDECREF(tok->decoding_readline);
920     Py_XDECREF(tok->decoding_buffer);
921     Py_XDECREF(tok->filename);
922 #endif
923     if (tok->fp != NULL && tok->buf != NULL)
924         PyMem_FREE(tok->buf);
925     if (tok->input)
926         PyMem_FREE((char *)tok->input);
927     PyMem_FREE(tok);
928 }
929 
/* Get next char, updating state; error code goes into tok->done.

   Returns the next input byte (masked with Py_CHARMASK), or EOF when no
   further character can be delivered.  When the buffered data is used up
   (tok->cur == tok->inp) another line is made available from one of three
   sources:
     - tok->fp == NULL:      the next line of the in-memory string at tok->inp;
     - tok->prompt != NULL:  a line obtained from PyOS_Readline();
     - otherwise:            a line read with decoding_fgets().
   While a token is in progress (tok->start != NULL) the new line is added
   after the existing buffer contents, and tok->start is re-based if the
   buffer had to be reallocated.

   NOTE(review): two EOF returns below (translate_newlines() failing, and a
   decoding error inside the read loop) do not store anything in tok->done
   themselves; presumably the helper has already recorded the error. */

static int
tok_nextc(struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        /* Buffer exhausted.  A previously recorded error/EOF is sticky. */
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: the next line runs from tok->inp up to and
               including the next '\n', or to the terminating NUL. */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    /* Nothing left before the NUL: end of input. */
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            /* With no token in progress the buffer start can move up to
               the line about to be handed out. */
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            /* Prompted input: read one line via PyOS_Readline(). */
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
#ifndef PGEN
            if (newtok != NULL) {
                /* The Readline result is replaced by the string returned
                   from translate_newlines() and freed either way. */
                char *translated = translate_newlines(newtok, 0, tok);
                PyMem_FREE(newtok);
                if (translated == NULL)
                    return EOF;
                newtok = translated;
            }
            if (tok->encoding && newtok && *newtok) {
                /* Recode to UTF-8 */
                Py_ssize_t buflen;
                const char* buf;
                PyObject *u = translate_into_utf8(newtok, tok->encoding);
                PyMem_FREE(newtok);
                if (!u) {
                    tok->done = E_DECODE;
                    return EOF;
                }
                /* Copy the bytes object's contents into a PyMem block so
                   that u itself can be released. */
                buflen = PyBytes_GET_SIZE(u);
                buf = PyBytes_AS_STRING(u);
                newtok = PyMem_MALLOC(buflen+1);
                if (newtok == NULL) {
                    Py_DECREF(u);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                strcpy(newtok, buf);
                Py_DECREF(u);
            }
#endif
            /* Subsequent reads use the continuation prompt, if one is set. */
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (newtok == NULL)
                tok->done = E_INTR;     /* PyOS_Readline: NULL if interrupted */
            else if (*newtok == '\0') {
                /* PyOS_Readline: empty string for EOF */
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
            else if (tok->start != NULL) {
                /* A token is in progress: append the new line at tok->cur.
                   Offsets are saved first because the realloc may move
                   the buffer. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    /* Out of memory: drop the old buffer as well. */
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                /* No token in progress: the new line becomes the buffer
                   (tok takes over the newtok allocation). */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* Unprompted input read with decoding_fgets(). */
            int done = 0;       /* set once a whole line (or EOF) was seen */
            Py_ssize_t cur = 0; /* offset of tok->cur within tok->buf */
            char *pt;
            if (tok->start == NULL) {
                /* No token in progress: read into the start of the
                   buffer, allocating BUFSIZ bytes on first use. */
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                          tok) == NULL) {
                    /* Nothing read: EOF unless a decoding error was
                       flagged. */
                    if (!tok->decoding_erred)
                        tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    /* Finished if nothing was stored or the text ends
                       in a newline; otherwise keep reading below. */
                    done = tok->inp == tok->buf || tok->inp[-1] == '\n';
                }
            }
            else {
                /* Token in progress: keep the buffer contents; the loop
                   below appends after tok->inp.  Remember where tok->cur
                   was so it can be restored after reallocation. */
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                /* Grow the buffer so BUFSIZ bytes are available past the
                   valid data, re-base every pointer into it, then read
                   the next piece of the line at tok->inp. */
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                          tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                         tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                               (int)(tok->end - tok->inp),
                               tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                     */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            /* Guard against a NULL buffer (see the decoding-error note
               in the loop above). */
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        /* Common exit for the prompted and file branches when an error
           or EOF was recorded: mark the buffer as consumed and report
           EOF.  Otherwise loop back and take the fast path. */
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}
1127 
1128 
1129 /* Back-up one character */
1130 
1131 static void
tok_backup(struct tok_state * tok,int c)1132 tok_backup(struct tok_state *tok, int c)
1133 {
1134     if (c != EOF) {
1135         if (--tok->cur < tok->buf)
1136             Py_FatalError("tok_backup: beginning of buffer");
1137         if (*tok->cur != c)
1138             *tok->cur = c;
1139     }
1140 }
1141 
1142 
1143 /* Return the token corresponding to a single character */
1144 
1145 int
Ta3Token_OneChar(int c)1146 Ta3Token_OneChar(int c)
1147 {
1148     switch (c) {
1149     case '(':           return LPAR;
1150     case ')':           return RPAR;
1151     case '[':           return LSQB;
1152     case ']':           return RSQB;
1153     case ':':           return COLON;
1154     case ',':           return COMMA;
1155     case ';':           return SEMI;
1156     case '+':           return PLUS;
1157     case '-':           return MINUS;
1158     case '*':           return STAR;
1159     case '/':           return SLASH;
1160     case '|':           return VBAR;
1161     case '&':           return AMPER;
1162     case '<':           return LESS;
1163     case '>':           return GREATER;
1164     case '=':           return EQUAL;
1165     case '.':           return DOT;
1166     case '%':           return PERCENT;
1167     case '{':           return LBRACE;
1168     case '}':           return RBRACE;
1169     case '^':           return CIRCUMFLEX;
1170     case '~':           return TILDE;
1171     case '@':           return AT;
1172     default:            return OP;
1173     }
1174 }
1175 
1176 
1177 int
Ta3Token_TwoChars(int c1,int c2)1178 Ta3Token_TwoChars(int c1, int c2)
1179 {
1180     switch (c1) {
1181     case '=':
1182         switch (c2) {
1183         case '=':               return EQEQUAL;
1184         }
1185         break;
1186     case '!':
1187         switch (c2) {
1188         case '=':               return NOTEQUAL;
1189         }
1190         break;
1191     case '<':
1192         switch (c2) {
1193         case '>':               return NOTEQUAL;
1194         case '=':               return LESSEQUAL;
1195         case '<':               return LEFTSHIFT;
1196         }
1197         break;
1198     case '>':
1199         switch (c2) {
1200         case '=':               return GREATEREQUAL;
1201         case '>':               return RIGHTSHIFT;
1202         }
1203         break;
1204     case '+':
1205         switch (c2) {
1206         case '=':               return PLUSEQUAL;
1207         }
1208         break;
1209     case '-':
1210         switch (c2) {
1211         case '=':               return MINEQUAL;
1212         case '>':               return RARROW;
1213         }
1214         break;
1215     case '*':
1216         switch (c2) {
1217         case '*':               return DOUBLESTAR;
1218         case '=':               return STAREQUAL;
1219         }
1220         break;
1221     case '/':
1222         switch (c2) {
1223         case '/':               return DOUBLESLASH;
1224         case '=':               return SLASHEQUAL;
1225         }
1226         break;
1227     case '|':
1228         switch (c2) {
1229         case '=':               return VBAREQUAL;
1230         }
1231         break;
1232     case '%':
1233         switch (c2) {
1234         case '=':               return PERCENTEQUAL;
1235         }
1236         break;
1237     case '&':
1238         switch (c2) {
1239         case '=':               return AMPEREQUAL;
1240         }
1241         break;
1242     case '^':
1243         switch (c2) {
1244         case '=':               return CIRCUMFLEXEQUAL;
1245         }
1246         break;
1247     case '@':
1248         switch (c2) {
1249         case '=':               return ATEQUAL;
1250         }
1251         break;
1252     }
1253     return OP;
1254 }
1255 
1256 int
Ta3Token_ThreeChars(int c1,int c2,int c3)1257 Ta3Token_ThreeChars(int c1, int c2, int c3)
1258 {
1259     switch (c1) {
1260     case '<':
1261         switch (c2) {
1262         case '<':
1263             switch (c3) {
1264             case '=':
1265                 return LEFTSHIFTEQUAL;
1266             }
1267             break;
1268         }
1269         break;
1270     case '>':
1271         switch (c2) {
1272         case '>':
1273             switch (c3) {
1274             case '=':
1275                 return RIGHTSHIFTEQUAL;
1276             }
1277             break;
1278         }
1279         break;
1280     case '*':
1281         switch (c2) {
1282         case '*':
1283             switch (c3) {
1284             case '=':
1285                 return DOUBLESTAREQUAL;
1286             }
1287             break;
1288         }
1289         break;
1290     case '/':
1291         switch (c2) {
1292         case '/':
1293             switch (c3) {
1294             case '=':
1295                 return DOUBLESLASHEQUAL;
1296             }
1297             break;
1298         }
1299         break;
1300     case '.':
1301         switch (c2) {
1302         case '.':
1303             switch (c3) {
1304             case '.':
1305                 return ELLIPSIS;
1306             }
1307             break;
1308         }
1309         break;
1310     }
1311     return OP;
1312 }
1313 
1314 static int
indenterror(struct tok_state * tok)1315 indenterror(struct tok_state *tok)
1316 {
1317     tok->done = E_TABSPACE;
1318     tok->cur = tok->inp;
1319     return ERRORTOKEN;
1320 }
1321 
1322 #ifdef PGEN
1323 #define verify_identifier(tok) 1
1324 #else
1325 /* Verify that the identifier follows PEP 3131.
1326    All identifier strings are guaranteed to be "ready" unicode objects.
1327  */
1328 static int
verify_identifier(struct tok_state * tok)1329 verify_identifier(struct tok_state *tok)
1330 {
1331     PyObject *s;
1332     int result;
1333     if (tok->decoding_erred)
1334         return 0;
1335     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1336     if (s == NULL || PyUnicode_READY(s) == -1) {
1337         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1338             PyErr_Clear();
1339             tok->done = E_IDENTIFIER;
1340         } else {
1341             tok->done = E_ERROR;
1342         }
1343         return 0;
1344     }
1345     result = PyUnicode_IsIdentifier(s);
1346     Py_DECREF(s);
1347     if (result == 0)
1348         tok->done = E_IDENTIFIER;
1349     return result;
1350 }
1351 #endif
1352 
1353 static int
tok_decimal_tail(struct tok_state * tok)1354 tok_decimal_tail(struct tok_state *tok)
1355 {
1356     int c;
1357 
1358     while (1) {
1359         do {
1360             c = tok_nextc(tok);
1361         } while (isdigit(c));
1362         if (c != '_') {
1363             break;
1364         }
1365         c = tok_nextc(tok);
1366         if (!isdigit(c)) {
1367             tok->done = E_TOKEN;
1368             tok_backup(tok, c);
1369             return 0;
1370         }
1371     }
1372     return c;
1373 }
1374 
1375 /* Get next token, after space stripping etc. */
1376 
1377 static int
tok_get(struct tok_state * tok,char ** p_start,char ** p_end)1378 tok_get(struct tok_state *tok, char **p_start, char **p_end)
1379 {
1380     int c;
1381     int blankline, nonascii;
1382 
1383     *p_start = *p_end = NULL;
1384   nextline:
1385     tok->start = NULL;
1386     blankline = 0;
1387 
1388     /* Get indentation level */
1389     if (tok->atbol) {
1390         int col = 0;
1391         int altcol = 0;
1392         tok->atbol = 0;
1393         for (;;) {
1394             c = tok_nextc(tok);
1395             if (c == ' ') {
1396                 col++, altcol++;
1397             }
1398             else if (c == '\t') {
1399                 col = (col / tok->tabsize + 1) * tok->tabsize;
1400                 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1401             }
1402             else if (c == '\014')  {/* Control-L (formfeed) */
1403                 col = altcol = 0; /* For Emacs users */
1404             }
1405             else {
1406                 break;
1407             }
1408         }
1409         tok_backup(tok, c);
1410         if (c == '#' || c == '\n') {
1411             /* Lines with only whitespace and/or comments
1412                shouldn't affect the indentation and are
1413                not passed to the parser as NEWLINE tokens,
1414                except *totally* empty lines in interactive
1415                mode, which signal the end of a command group. */
1416             if (col == 0 && c == '\n' && tok->prompt != NULL) {
1417                 blankline = 0; /* Let it through */
1418             }
1419             else {
1420                 blankline = 1; /* Ignore completely */
1421             }
1422             /* We can't jump back right here since we still
1423                may need to skip to the end of a comment */
1424         }
1425         if (!blankline && tok->level == 0) {
1426             if (col == tok->indstack[tok->indent]) {
1427                 /* No change */
1428                 if (altcol != tok->altindstack[tok->indent]) {
1429                     return indenterror(tok);
1430                 }
1431             }
1432             else if (col > tok->indstack[tok->indent]) {
1433                 /* Indent -- always one */
1434                 if (tok->indent+1 >= MAXINDENT) {
1435                     tok->done = E_TOODEEP;
1436                     tok->cur = tok->inp;
1437                     return ERRORTOKEN;
1438                 }
1439                 if (altcol <= tok->altindstack[tok->indent]) {
1440                     return indenterror(tok);
1441                 }
1442                 tok->pendin++;
1443                 tok->indstack[++tok->indent] = col;
1444                 tok->altindstack[tok->indent] = altcol;
1445             }
1446             else /* col < tok->indstack[tok->indent] */ {
1447                 /* Dedent -- any number, must be consistent */
1448                 while (tok->indent > 0 &&
1449                     col < tok->indstack[tok->indent]) {
1450                     tok->pendin--;
1451                     tok->indent--;
1452                 }
1453                 if (col != tok->indstack[tok->indent]) {
1454                     tok->done = E_DEDENT;
1455                     tok->cur = tok->inp;
1456                     return ERRORTOKEN;
1457                 }
1458                 if (altcol != tok->altindstack[tok->indent]) {
1459                     return indenterror(tok);
1460                 }
1461             }
1462         }
1463     }
1464 
1465     tok->start = tok->cur;
1466 
1467     /* Return pending indents/dedents */
1468     if (tok->pendin != 0) {
1469         if (tok->pendin < 0) {
1470             tok->pendin++;
1471             return DEDENT;
1472         }
1473         else {
1474             tok->pendin--;
1475             return INDENT;
1476         }
1477     }
1478 
1479     /* Peek ahead at the next character */
1480     c = tok_nextc(tok);
1481     tok_backup(tok, c);
1482     /* Check if we are closing an async function */
1483     if (tok->async_def
1484         && !blankline
1485         /* Due to some implementation artifacts of type comments,
1486          * a TYPE_COMMENT at the start of a function won't set an
1487          * indentation level and it will produce a NEWLINE after it.
1488          * To avoid spuriously ending an async function due to this,
1489          * wait until we have some non-newline char in front of us. */
1490         && c != '\n'
1491         && tok->level == 0
1492         /* There was a NEWLINE after ASYNC DEF,
1493            so we're past the signature. */
1494         && tok->async_def_nl
1495         /* Current indentation level is less than where
1496            the async function was defined */
1497         && tok->async_def_indent >= tok->indent)
1498     {
1499         tok->async_def = 0;
1500         tok->async_def_indent = 0;
1501         tok->async_def_nl = 0;
1502     }
1503 
1504  again:
1505     tok->start = NULL;
1506     /* Skip spaces */
1507     do {
1508         c = tok_nextc(tok);
1509     } while (c == ' ' || c == '\t' || c == '\014');
1510 
1511     /* Set start of current token */
1512     tok->start = tok->cur - 1;
1513 
1514     /* Skip comment, unless it's a type comment */
1515     if (c == '#') {
1516         const char *prefix, *p, *type_start;
1517 
1518         while (c != EOF && c != '\n')
1519             c = tok_nextc(tok);
1520 
1521         p = tok->start;
1522         prefix = type_comment_prefix;
1523         while (*prefix && p < tok->cur) {
1524             if (*prefix == ' ') {
1525                 while (*p == ' ' || *p == '\t')
1526                     p++;
1527             } else if (*prefix == *p) {
1528                 p++;
1529             } else {
1530                 break;
1531             }
1532 
1533             prefix++;
1534         }
1535 
1536         /* This is a type comment if we matched all of type_comment_prefix. */
1537         if (!*prefix) {
1538             int is_type_ignore = 1;
1539             const char *ignore_end = p + 6;
1540             tok_backup(tok, c);  /* don't eat the newline or EOF */
1541 
1542             type_start = p;
1543 
1544             /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1545              * or anything ASCII and non-alphanumeric. */
1546             is_type_ignore = (
1547                 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1548                 && !(tok->cur > ignore_end
1549                      && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1550 
1551             if (is_type_ignore) {
1552                 *p_start = (char *) ignore_end;
1553                 *p_end = tok->cur;
1554 
1555                 /* If this type ignore is the only thing on the line, consume the newline also. */
1556                 if (blankline) {
1557                     tok_nextc(tok);
1558                     tok->atbol = 1;
1559                 }
1560                 return TYPE_IGNORE;
1561             } else {
1562                 *p_start = (char *) type_start;  /* after type_comment_prefix */
1563                 *p_end = tok->cur;
1564                 return TYPE_COMMENT;
1565             }
1566         }
1567     }
1568 
1569     /* Check for EOF and errors now */
1570     if (c == EOF) {
1571         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1572     }
1573 
1574     /* Identifier (most frequent token!) */
1575     nonascii = 0;
1576     if (is_potential_identifier_start(c)) {
1577         /* Process the various legal combinations of b"", r"", u"", and f"". */
1578         int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1579         while (1) {
1580             if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1581                 saw_b = 1;
1582             /* Since this is a backwards compatibility support literal we don't
1583                want to support it in arbitrary order like byte literals. */
1584             else if (!(saw_b || saw_u || saw_r || saw_f)
1585                      && (c == 'u'|| c == 'U')) {
1586                 saw_u = 1;
1587             }
1588             /* ur"" and ru"" are not supported */
1589             else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1590                 saw_r = 1;
1591             }
1592             else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1593                 saw_f = 1;
1594             }
1595             else {
1596                 break;
1597             }
1598             c = tok_nextc(tok);
1599             if (c == '"' || c == '\'') {
1600                 goto letter_quote;
1601             }
1602         }
1603         while (is_potential_identifier_char(c)) {
1604             if (c >= 128) {
1605                 nonascii = 1;
1606             }
1607             c = tok_nextc(tok);
1608         }
1609         tok_backup(tok, c);
1610         if (nonascii && !verify_identifier(tok)) {
1611             return ERRORTOKEN;
1612         }
1613         *p_start = tok->start;
1614         *p_end = tok->cur;
1615 
1616         /* async/await parsing block. */
1617         if (tok->cur - tok->start == 5) {
1618             /* Current token length is 5. */
1619             if (tok->async_always || tok->async_def) {
1620                 /* We're inside an 'async def' function. */
1621                 if (memcmp(tok->start, "async", 5) == 0) {
1622                     return ASYNC;
1623                 }
1624                 if (memcmp(tok->start, "await", 5) == 0) {
1625                     return AWAIT;
1626                 }
1627             }
1628             else if (memcmp(tok->start, "async", 5) == 0) {
1629                 /* The current token is 'async'.
1630                    Look ahead one token.*/
1631 
1632                 struct tok_state ahead_tok;
1633                 char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1634                 int ahead_tok_kind;
1635 
1636                 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1637                 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1638                                          &ahead_tok_end);
1639 
1640                 if (ahead_tok_kind == NAME
1641                     && ahead_tok.cur - ahead_tok.start == 3
1642                     && memcmp(ahead_tok.start, "def", 3) == 0)
1643                 {
1644                     /* The next token is going to be 'def', so instead of
1645                        returning 'async' NAME token, we return ASYNC. */
1646                     tok->async_def_indent = tok->indent;
1647                     tok->async_def = 1;
1648                     return ASYNC;
1649                 }
1650             }
1651         }
1652 
1653         return NAME;
1654     }
1655 
1656     /* Newline */
1657     if (c == '\n') {
1658         tok->atbol = 1;
1659         if (blankline || tok->level > 0) {
1660             goto nextline;
1661         }
1662         *p_start = tok->start;
1663         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1664         tok->cont_line = 0;
1665         if (tok->async_def) {
1666             /* We're somewhere inside an 'async def' function, and
1667                we've encountered a NEWLINE after its signature. */
1668             tok->async_def_nl = 1;
1669         }
1670         return NEWLINE;
1671     }
1672 
1673     /* Period or number starting with period? */
1674     if (c == '.') {
1675         c = tok_nextc(tok);
1676         if (isdigit(c)) {
1677             goto fraction;
1678         } else if (c == '.') {
1679             c = tok_nextc(tok);
1680             if (c == '.') {
1681                 *p_start = tok->start;
1682                 *p_end = tok->cur;
1683                 return ELLIPSIS;
1684             }
1685             else {
1686                 tok_backup(tok, c);
1687             }
1688             tok_backup(tok, '.');
1689         }
1690         else {
1691             tok_backup(tok, c);
1692         }
1693         *p_start = tok->start;
1694         *p_end = tok->cur;
1695         return DOT;
1696     }
1697 
1698     /* Number */
1699     if (isdigit(c)) {
1700         if (c == '0') {
1701             /* Hex, octal or binary -- maybe. */
1702             c = tok_nextc(tok);
1703             if (c == 'x' || c == 'X') {
1704                 /* Hex */
1705                 c = tok_nextc(tok);
1706                 do {
1707                     if (c == '_') {
1708                         c = tok_nextc(tok);
1709                     }
1710                     if (!isxdigit(c)) {
1711                         tok->done = E_TOKEN;
1712                         tok_backup(tok, c);
1713                         return ERRORTOKEN;
1714                     }
1715                     do {
1716                         c = tok_nextc(tok);
1717                     } while (isxdigit(c));
1718                 } while (c == '_');
1719             }
1720             else if (c == 'o' || c == 'O') {
1721                 /* Octal */
1722                 c = tok_nextc(tok);
1723                 do {
1724                     if (c == '_') {
1725                         c = tok_nextc(tok);
1726                     }
1727                     if (c < '0' || c >= '8') {
1728                         tok->done = E_TOKEN;
1729                         tok_backup(tok, c);
1730                         return ERRORTOKEN;
1731                     }
1732                     do {
1733                         c = tok_nextc(tok);
1734                     } while ('0' <= c && c < '8');
1735                 } while (c == '_');
1736             }
1737             else if (c == 'b' || c == 'B') {
1738                 /* Binary */
1739                 c = tok_nextc(tok);
1740                 do {
1741                     if (c == '_') {
1742                         c = tok_nextc(tok);
1743                     }
1744                     if (c != '0' && c != '1') {
1745                         tok->done = E_TOKEN;
1746                         tok_backup(tok, c);
1747                         return ERRORTOKEN;
1748                     }
1749                     do {
1750                         c = tok_nextc(tok);
1751                     } while (c == '0' || c == '1');
1752                 } while (c == '_');
1753             }
1754             else {
1755                 int nonzero = 0;
1756                 /* maybe old-style octal; c is first char of it */
1757                 /* in any case, allow '0' as a literal */
1758                 while (1) {
1759                     if (c == '_') {
1760                         c = tok_nextc(tok);
1761                         if (!isdigit(c)) {
1762                             tok->done = E_TOKEN;
1763                             tok_backup(tok, c);
1764                             return ERRORTOKEN;
1765                         }
1766                     }
1767                     if (c != '0') {
1768                         break;
1769                     }
1770                     c = tok_nextc(tok);
1771                 }
1772                 if (isdigit(c)) {
1773                     nonzero = 1;
1774                     c = tok_decimal_tail(tok);
1775                     if (c == 0) {
1776                         return ERRORTOKEN;
1777                     }
1778                 }
1779                 if (c == '.') {
1780                     c = tok_nextc(tok);
1781                     goto fraction;
1782                 }
1783                 else if (c == 'e' || c == 'E') {
1784                     goto exponent;
1785                 }
1786                 else if (c == 'j' || c == 'J') {
1787                     goto imaginary;
1788                 }
1789                 else if (nonzero) {
1790                     /* Old-style octal: now disallowed. */
1791                     tok->done = E_TOKEN;
1792                     tok_backup(tok, c);
1793                     return ERRORTOKEN;
1794                 }
1795             }
1796         }
1797         else {
1798             /* Decimal */
1799             c = tok_decimal_tail(tok);
1800             if (c == 0) {
1801                 return ERRORTOKEN;
1802             }
1803             {
1804                 /* Accept floating point numbers. */
1805                 if (c == '.') {
1806                     c = tok_nextc(tok);
1807         fraction:
1808                     /* Fraction */
1809                     if (isdigit(c)) {
1810                         c = tok_decimal_tail(tok);
1811                         if (c == 0) {
1812                             return ERRORTOKEN;
1813                         }
1814                     }
1815                 }
1816                 if (c == 'e' || c == 'E') {
1817                     int e;
1818                   exponent:
1819                     e = c;
1820                     /* Exponent part */
1821                     c = tok_nextc(tok);
1822                     if (c == '+' || c == '-') {
1823                         c = tok_nextc(tok);
1824                         if (!isdigit(c)) {
1825                             tok->done = E_TOKEN;
1826                             tok_backup(tok, c);
1827                             return ERRORTOKEN;
1828                         }
1829                     } else if (!isdigit(c)) {
1830                         tok_backup(tok, c);
1831                         tok_backup(tok, e);
1832                         *p_start = tok->start;
1833                         *p_end = tok->cur;
1834                         return NUMBER;
1835                     }
1836                     c = tok_decimal_tail(tok);
1837                     if (c == 0) {
1838                         return ERRORTOKEN;
1839                     }
1840                 }
1841                 if (c == 'j' || c == 'J') {
1842                     /* Imaginary part */
1843         imaginary:
1844                     c = tok_nextc(tok);
1845                 }
1846             }
1847         }
1848         tok_backup(tok, c);
1849         *p_start = tok->start;
1850         *p_end = tok->cur;
1851         return NUMBER;
1852     }
1853 
1854   letter_quote:
1855     /* String */
1856     if (c == '\'' || c == '"') {
1857         int quote = c;
1858         int quote_size = 1;             /* 1 or 3 */
1859         int end_quote_size = 0;
1860 
1861         /* Find the quote size and start of string */
1862         c = tok_nextc(tok);
1863         if (c == quote) {
1864             c = tok_nextc(tok);
1865             if (c == quote) {
1866                 quote_size = 3;
1867             }
1868             else {
1869                 end_quote_size = 1;     /* empty string found */
1870             }
1871         }
1872         if (c != quote) {
1873             tok_backup(tok, c);
1874         }
1875 
1876         /* Get rest of string */
1877         while (end_quote_size != quote_size) {
1878             c = tok_nextc(tok);
1879             if (c == EOF) {
1880                 if (quote_size == 3) {
1881                     tok->done = E_EOFS;
1882                 }
1883                 else {
1884                     tok->done = E_EOLS;
1885                 }
1886                 tok->cur = tok->inp;
1887                 return ERRORTOKEN;
1888             }
1889             if (quote_size == 1 && c == '\n') {
1890                 tok->done = E_EOLS;
1891                 tok->cur = tok->inp;
1892                 return ERRORTOKEN;
1893             }
1894             if (c == quote) {
1895                 end_quote_size += 1;
1896             }
1897             else {
1898                 end_quote_size = 0;
1899                 if (c == '\\') {
1900                     tok_nextc(tok);  /* skip escaped char */
1901                 }
1902             }
1903         }
1904 
1905         *p_start = tok->start;
1906         *p_end = tok->cur;
1907         return STRING;
1908     }
1909 
1910     /* Line continuation */
1911     if (c == '\\') {
1912         c = tok_nextc(tok);
1913         if (c != '\n') {
1914             tok->done = E_LINECONT;
1915             tok->cur = tok->inp;
1916             return ERRORTOKEN;
1917         }
1918         tok->cont_line = 1;
1919         goto again; /* Read next line */
1920     }
1921 
1922     /* Check for two-character token */
1923     {
1924         int c2 = tok_nextc(tok);
1925         int token = Ta3Token_TwoChars(c, c2);
1926         if (token != OP) {
1927             int c3 = tok_nextc(tok);
1928             int token3 = Ta3Token_ThreeChars(c, c2, c3);
1929             if (token3 != OP) {
1930                 token = token3;
1931             }
1932             else {
1933                 tok_backup(tok, c3);
1934             }
1935             *p_start = tok->start;
1936             *p_end = tok->cur;
1937             return token;
1938         }
1939         tok_backup(tok, c2);
1940     }
1941 
1942     /* Keep track of parentheses nesting level */
1943     switch (c) {
1944     case '(':
1945     case '[':
1946     case '{':
1947         tok->level++;
1948         break;
1949     case ')':
1950     case ']':
1951     case '}':
1952         tok->level--;
1953         break;
1954     }
1955 
1956     /* Punctuation character */
1957     *p_start = tok->start;
1958     *p_end = tok->cur;
1959     return Ta3Token_OneChar(c);
1960 }
1961 
1962 int
Ta3Tokenizer_Get(struct tok_state * tok,char ** p_start,char ** p_end)1963 Ta3Tokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1964 {
1965     int result = tok_get(tok, p_start, p_end);
1966     if (tok->decoding_erred) {
1967         result = ERRORTOKEN;
1968         tok->done = E_DECODE;
1969     }
1970     return result;
1971 }
1972 
1973 /* Get the encoding of a Python file. Check for the coding cookie and check if
1974    the file starts with a BOM.
1975 
1976    Ta3Tokenizer_FindEncodingFilename() returns NULL when it can't find the
1977    encoding in the first or second line of the file (in which case the encoding
1978    should be assumed to be UTF-8).
1979 
1980    The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1981    by the caller. */
1982 
1983 char *
Ta3Tokenizer_FindEncodingFilename(int fd,PyObject * filename)1984 Ta3Tokenizer_FindEncodingFilename(int fd, PyObject *filename)
1985 {
1986     struct tok_state *tok;
1987     FILE *fp;
1988     char *p_start =NULL , *p_end =NULL , *encoding = NULL;
1989 
1990 #ifndef PGEN
1991     fd = _Py_dup(fd);
1992 #else
1993     fd = dup(fd);
1994 #endif
1995     if (fd < 0) {
1996         return NULL;
1997     }
1998 
1999     fp = fdopen(fd, "r");
2000     if (fp == NULL) {
2001         return NULL;
2002     }
2003     tok = Ta3Tokenizer_FromFile(fp, NULL, NULL, NULL);
2004     if (tok == NULL) {
2005         fclose(fp);
2006         return NULL;
2007     }
2008 #ifndef PGEN
2009     if (filename != NULL) {
2010         Py_INCREF(filename);
2011         tok->filename = filename;
2012     }
2013     else {
2014         tok->filename = PyUnicode_FromString("<string>");
2015         if (tok->filename == NULL) {
2016             fclose(fp);
2017             Ta3Tokenizer_Free(tok);
2018             return encoding;
2019         }
2020     }
2021 #endif
2022     while (tok->lineno < 2 && tok->done == E_OK) {
2023         Ta3Tokenizer_Get(tok, &p_start, &p_end);
2024     }
2025     fclose(fp);
2026     if (tok->encoding) {
2027         encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
2028         if (encoding)
2029         strcpy(encoding, tok->encoding);
2030     }
2031     Ta3Tokenizer_Free(tok);
2032     return encoding;
2033 }
2034 
2035 char *
Ta3Tokenizer_FindEncoding(int fd)2036 Ta3Tokenizer_FindEncoding(int fd)
2037 {
2038     return Ta3Tokenizer_FindEncodingFilename(fd, NULL);
2039 }
2040 
2041 #ifdef Py_DEBUG
2042 
2043 void
tok_dump(int type,char * start,char * end)2044 tok_dump(int type, char *start, char *end)
2045 {
2046     printf("%s", _Ta3Parser_TokenNames[type]);
2047     if (type == NAME || type == NUMBER || type == STRING || type == OP)
2048         printf("(%.*s)", (int)(end - start), start);
2049 }
2050 
2051 #endif
2052