1
2 /* Tokenizer implementation */
3
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 #include "pycore_call.h" // _PyObject_CallNoArgs()
7
8 #include <ctype.h>
9 #include <assert.h>
10
11 #include "tokenizer.h"
12 #include "errcode.h"
13
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "abstract.h"
18
19 /* Alternate tab spacing */
20 #define ALTTABSIZE 1
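/* Indentation is tracked twice, once using TABSIZE and once using
   ALTTABSIZE.  If the two measurements disagree about how two lines
   compare, the mix of tabs and spaces is ambiguous and indenterror()
   is reported. */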
21
22 #define is_potential_identifier_start(c) (\
23 (c >= 'a' && c <= 'z')\
24 || (c >= 'A' && c <= 'Z')\
25 || c == '_'\
26 || (c >= 128))
27
28 #define is_potential_identifier_char(c) (\
29 (c >= 'a' && c <= 'z')\
30 || (c >= 'A' && c <= 'Z')\
31 || (c >= '0' && c <= '9')\
32 || c == '_'\
33 || (c >= 128))
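/* Any byte >= 128 is accepted here as a potential identifier character;
   non-ASCII identifiers are validated later by verify_identifier(). */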
34
35
36 /* Don't ever change this -- it would break the portability of Python code */
37 #define TABSIZE 8
38
39 /* Forward */
40 static struct tok_state *tok_new(void);
41 static int tok_nextc(struct tok_state *tok);
42 static void tok_backup(struct tok_state *tok, int c);
43
44
45 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
46 tokenizing. */
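/* For example, "# type: int", "#type: int" and "#  type:   int" all
   match this prefix. */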
47 static const char* type_comment_prefix = "# type: ";
48
49 /* Create and initialize a new tok_state structure */
50
51 static struct tok_state *
tok_new(void)
53 {
54 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
55 sizeof(struct tok_state));
56 if (tok == NULL)
57 return NULL;
58 tok->buf = tok->cur = tok->inp = NULL;
59 tok->fp_interactive = 0;
60 tok->interactive_src_start = NULL;
61 tok->interactive_src_end = NULL;
62 tok->start = NULL;
63 tok->end = NULL;
64 tok->done = E_OK;
65 tok->fp = NULL;
66 tok->input = NULL;
67 tok->tabsize = TABSIZE;
68 tok->indent = 0;
69 tok->indstack[0] = 0;
70 tok->atbol = 1;
71 tok->pendin = 0;
72 tok->prompt = tok->nextprompt = NULL;
73 tok->lineno = 0;
74 tok->level = 0;
75 tok->altindstack[0] = 0;
76 tok->decoding_state = STATE_INIT;
77 tok->decoding_erred = 0;
78 tok->enc = NULL;
79 tok->encoding = NULL;
80 tok->cont_line = 0;
81 tok->filename = NULL;
82 tok->decoding_readline = NULL;
83 tok->decoding_buffer = NULL;
84 tok->type_comments = 0;
85 tok->async_hacks = 0;
86 tok->async_def = 0;
87 tok->async_def_indent = 0;
88 tok->async_def_nl = 0;
89 tok->interactive_underflow = IUNDERFLOW_NORMAL;
90 tok->str = NULL;
91 return tok;
92 }
93
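/* Allocate a NUL-terminated copy of the first LEN bytes of S.
   On allocation failure, set tok->done to E_NOMEM and return NULL. */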
94 static char *
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
96 {
97 char* result = (char *)PyMem_Malloc(len + 1);
98 if (!result) {
99 tok->done = E_NOMEM;
100 return NULL;
101 }
102 memcpy(result, s, len);
103 result[len] = '\0';
104 return result;
105 }
106
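/* Record a decoding error: free the file buffer if we own it, reset the
   buffer pointers, set tok->done to E_DECODE and return NULL (callers
   treat the result like EOF). */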
107 static char *
error_ret(struct tok_state *tok) /* XXX */
109 {
110 tok->decoding_erred = 1;
111 if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
112 PyMem_Free(tok->buf);
113 tok->buf = tok->cur = tok->inp = NULL;
114 tok->start = NULL;
115 tok->end = NULL;
116 tok->done = E_DECODE;
117 return NULL; /* as if it were EOF */
118 }
119
120
121 static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
123 {
124 char buf[13];
125 int i;
126 for (i = 0; i < 12; i++) {
127 int c = s[i];
128 if (c == '\0')
129 break;
130 else if (c == '_')
131 buf[i] = '-';
132 else
133 buf[i] = tolower(c);
134 }
135 buf[i] = '\0';
136 if (strcmp(buf, "utf-8") == 0 ||
137 strncmp(buf, "utf-8-", 6) == 0)
138 return "utf-8";
139 else if (strcmp(buf, "latin-1") == 0 ||
140 strcmp(buf, "iso-8859-1") == 0 ||
141 strcmp(buf, "iso-latin-1") == 0 ||
142 strncmp(buf, "latin-1-", 8) == 0 ||
143 strncmp(buf, "iso-8859-1-", 11) == 0 ||
144 strncmp(buf, "iso-latin-1-", 12) == 0)
145 return "iso-8859-1";
146 else
147 return s;
148 }
149
/* Look for a coding spec (PEP 263) in the line S.  If one is found, store a
   heap-allocated copy of it in *SPEC; otherwise set *SPEC to NULL.
   Return 1 on success, 0 on a memory error. */
151
152 static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
154 {
155 Py_ssize_t i;
156 *spec = NULL;
157 /* Coding spec must be in a comment, and that comment must be
158 * the only statement on the source code line. */
159 for (i = 0; i < size - 6; i++) {
160 if (s[i] == '#')
161 break;
162 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
163 return 1;
164 }
165 for (; i < size - 6; i++) { /* XXX inefficient search */
166 const char* t = s + i;
167 if (memcmp(t, "coding", 6) == 0) {
168 const char* begin = NULL;
169 t += 6;
170 if (t[0] != ':' && t[0] != '=')
171 continue;
172 do {
173 t++;
174 } while (t[0] == ' ' || t[0] == '\t');
175
176 begin = t;
177 while (Py_ISALNUM(t[0]) ||
178 t[0] == '-' || t[0] == '_' || t[0] == '.')
179 t++;
180
181 if (begin < t) {
182 char* r = new_string(begin, t - begin, tok);
183 const char* q;
184 if (!r)
185 return 0;
186 q = get_normal_name(r);
187 if (r != q) {
188 PyMem_Free(r);
189 r = new_string(q, strlen(q), tok);
190 if (!r)
191 return 0;
192 }
193 *spec = r;
194 break;
195 }
196 }
197 }
198 return 1;
199 }
200
201 /* Check whether the line contains a coding spec. If it does,
202 invoke the set_readline function for the new encoding.
203 This function receives the tok_state and the new encoding.
204 Return 1 on success, 0 on failure. */
205
206 static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
209 {
210 char *cs;
211 if (tok->cont_line) {
212 /* It's a continuation line, so it can't be a coding spec. */
213 tok->decoding_state = STATE_NORMAL;
214 return 1;
215 }
216 if (!get_coding_spec(line, &cs, size, tok)) {
217 return 0;
218 }
219 if (!cs) {
220 Py_ssize_t i;
221 for (i = 0; i < size; i++) {
222 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
223 break;
224 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
225 /* Stop checking coding spec after a line containing
226 * anything except a comment. */
227 tok->decoding_state = STATE_NORMAL;
228 break;
229 }
230 }
231 return 1;
232 }
233 tok->decoding_state = STATE_NORMAL;
234 if (tok->encoding == NULL) {
235 assert(tok->decoding_readline == NULL);
236 if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
237 error_ret(tok);
238 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
239 PyMem_Free(cs);
240 return 0;
241 }
242 tok->encoding = cs;
243 } else { /* then, compare cs with BOM */
244 if (strcmp(tok->encoding, cs) != 0) {
245 error_ret(tok);
246 PyErr_Format(PyExc_SyntaxError,
247 "encoding problem: %s with BOM", cs);
248 PyMem_Free(cs);
249 return 0;
250 }
251 PyMem_Free(cs);
252 }
253 return 1;
254 }
255
256 /* See whether the file starts with a BOM. If it does,
257 invoke the set_readline function with the new encoding.
258 Return 1 on success, 0 on failure. */
259
260 static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
265 {
266 int ch1, ch2, ch3;
267 ch1 = get_char(tok);
268 tok->decoding_state = STATE_SEEK_CODING;
269 if (ch1 == EOF) {
270 return 1;
271 } else if (ch1 == 0xEF) {
272 ch2 = get_char(tok);
273 if (ch2 != 0xBB) {
274 unget_char(ch2, tok);
275 unget_char(ch1, tok);
276 return 1;
277 }
278 ch3 = get_char(tok);
279 if (ch3 != 0xBF) {
280 unget_char(ch3, tok);
281 unget_char(ch2, tok);
282 unget_char(ch1, tok);
283 return 1;
284 }
285 #if 0
286 /* Disable support for UTF-16 BOMs until a decision
287 is made whether this needs to be supported. */
288 } else if (ch1 == 0xFE) {
289 ch2 = get_char(tok);
290 if (ch2 != 0xFF) {
291 unget_char(ch2, tok);
292 unget_char(ch1, tok);
293 return 1;
294 }
295 if (!set_readline(tok, "utf-16-be"))
296 return 0;
297 tok->decoding_state = STATE_NORMAL;
298 } else if (ch1 == 0xFF) {
299 ch2 = get_char(tok);
300 if (ch2 != 0xFE) {
301 unget_char(ch2, tok);
302 unget_char(ch1, tok);
303 return 1;
304 }
305 if (!set_readline(tok, "utf-16-le"))
306 return 0;
307 tok->decoding_state = STATE_NORMAL;
308 #endif
309 } else {
310 unget_char(ch1, tok);
311 return 1;
312 }
313 if (tok->encoding != NULL)
314 PyMem_Free(tok->encoding);
315 tok->encoding = new_string("utf-8", 5, tok);
316 if (!tok->encoding)
317 return 0;
318 /* No need to set_readline: input is already utf-8 */
319 return 1;
320 }
321
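/* Append LINE to the source accumulated for interactive input in
   tok->interactive_src_start.  Return 0 on success, -1 on memory error
   (in which case the accumulated source is released). */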
322 static int
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
324 assert(tok->fp_interactive);
325
326 if (!line) {
327 return 0;
328 }
329
330 Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
331 Py_ssize_t line_size = strlen(line);
332 char* new_str = tok->interactive_src_start;
333
334 new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
335 if (!new_str) {
336 if (tok->interactive_src_start) {
337 PyMem_Free(tok->interactive_src_start);
338 }
339 tok->interactive_src_start = NULL;
340 tok->interactive_src_end = NULL;
341 tok->done = E_NOMEM;
342 return -1;
343 }
344 strcpy(new_str + current_size, line);
345
346 tok->interactive_src_start = new_str;
347 tok->interactive_src_end = new_str + current_size + line_size;
348 return 0;
349 }
350
351
/* Read a line of text into the buffer of TOK, using the stream in TOK.
   Return 0 on failure, 1 on success.
354
355 On entry, tok->decoding_buffer will be one of:
356 1) NULL: need to call tok->decoding_readline to get a new line
357 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
358 stored the result in tok->decoding_buffer
359 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
      (in the tokenizer buffer) to copy the entire contents of the line read
361 by tok->decoding_readline. tok->decoding_buffer has the overflow.
362 In this case, tok_readline_recode is called in a loop (with an expanded buffer)
363 until the buffer ends with a '\n' (or until the end of the file is
364 reached): see tok_nextc and its calls to tok_reserve_buf.
365 */
366
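/* Make sure at least SIZE more bytes fit after tok->inp, growing
   tok->buf and rebasing the pointers into it if necessary.
   Return 1 on success, 0 on memory error. */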
367 static int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
369 {
370 Py_ssize_t cur = tok->cur - tok->buf;
371 Py_ssize_t oldsize = tok->inp - tok->buf;
372 Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
373 if (newsize > tok->end - tok->buf) {
374 char *newbuf = tok->buf;
375 Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
376 Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
377 Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
378 newbuf = (char *)PyMem_Realloc(newbuf, newsize);
379 if (newbuf == NULL) {
380 tok->done = E_NOMEM;
381 return 0;
382 }
383 tok->buf = newbuf;
384 tok->cur = tok->buf + cur;
385 tok->inp = tok->buf + oldsize;
386 tok->end = tok->buf + newsize;
387 tok->start = start < 0 ? NULL : tok->buf + start;
388 tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
389 tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
390 }
391 return 1;
392 }
393
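/* Fetch the next line from tok->decoding_readline (or reuse the pending
   tok->decoding_buffer), append its UTF-8 form to the token buffer and,
   for interactive input, record it in the interactive source.
   Return 1 on success, 0 on failure. */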
394 static int
tok_readline_recode(struct tok_state *tok) {
396 PyObject *line;
397 const char *buf;
398 Py_ssize_t buflen;
399 line = tok->decoding_buffer;
400 if (line == NULL) {
401 line = PyObject_CallNoArgs(tok->decoding_readline);
402 if (line == NULL) {
403 error_ret(tok);
404 goto error;
405 }
406 }
407 else {
408 tok->decoding_buffer = NULL;
409 }
410 buf = PyUnicode_AsUTF8AndSize(line, &buflen);
411 if (buf == NULL) {
412 error_ret(tok);
413 goto error;
414 }
415 if (!tok_reserve_buf(tok, buflen + 1)) {
416 goto error;
417 }
418 memcpy(tok->inp, buf, buflen);
419 tok->inp += buflen;
420 *tok->inp = '\0';
421 if (tok->fp_interactive &&
422 tok_concatenate_interactive_new_line(tok, buf) == -1) {
423 goto error;
424 }
425 Py_DECREF(line);
426 return 1;
427 error:
428 Py_XDECREF(line);
429 return 0;
430 }
431
432 /* Set the readline function for TOK to a StreamReader's
433 readline function. The StreamReader is named ENC.
434
435 This function is called from check_bom and check_coding_spec.
436
437 ENC is usually identical to the future value of tok->encoding,
438 except for the (currently unsupported) case of UTF-16.
439
440 Return 1 on success, 0 on failure. */
441
442 static int
fp_setreadl(struct tok_state *tok, const char* enc)
444 {
445 PyObject *readline, *io, *stream;
446 _Py_IDENTIFIER(open);
447 _Py_IDENTIFIER(readline);
448 int fd;
449 long pos;
450
451 fd = fileno(tok->fp);
452 /* Due to buffering the file offset for fd can be different from the file
453 * position of tok->fp. If tok->fp was opened in text mode on Windows,
454 * its file position counts CRLF as one char and can't be directly mapped
455 * to the file offset for fd. Instead we step back one byte and read to
456 * the end of line.*/
457 pos = ftell(tok->fp);
458 if (pos == -1 ||
459 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
460 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
461 return 0;
462 }
463
464 io = PyImport_ImportModuleNoBlock("io");
465 if (io == NULL)
466 return 0;
467
468 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
469 fd, "r", -1, enc, Py_None, Py_None, Py_False);
470 Py_DECREF(io);
471 if (stream == NULL)
472 return 0;
473
474 readline = _PyObject_GetAttrId(stream, &PyId_readline);
475 Py_DECREF(stream);
476 if (readline == NULL)
477 return 0;
478 Py_XSETREF(tok->decoding_readline, readline);
479
480 if (pos > 0) {
481 PyObject *bufobj = _PyObject_CallNoArgs(readline);
482 if (bufobj == NULL)
483 return 0;
484 Py_DECREF(bufobj);
485 }
486
487 return 1;
488 }
489
490 /* Fetch the next byte from TOK. */
491
static int fp_getc(struct tok_state *tok) {
493 return getc(tok->fp);
494 }
495
496 /* Unfetch the last byte back into TOK. */
497
static void fp_ungetc(int c, struct tok_state *tok) {
499 ungetc(c, tok->fp);
500 }
501
502 /* Check whether the characters at s start a valid
503 UTF-8 sequence. Return the number of characters forming
504 the sequence if yes, 0 if not. */
static int valid_utf8(const unsigned char* s)
506 {
507 int expected = 0;
508 int length;
509 if (*s < 0x80)
510 /* single-byte code */
511 return 1;
512 if (*s < 0xc0)
513 /* following byte */
514 return 0;
515 if (*s < 0xE0)
516 expected = 1;
517 else if (*s < 0xF0)
518 expected = 2;
519 else if (*s < 0xF8)
520 expected = 3;
521 else
522 return 0;
523 length = expected + 1;
524 for (; expected; expected--)
525 if (s[expected] < 0x80 || s[expected] >= 0xC0)
526 return 0;
527 return length;
528 }
529
530 static int
ensure_utf8(char *line, struct tok_state *tok)
532 {
533 int badchar = 0;
534 unsigned char *c;
535 int length;
536 for (c = (unsigned char *)line; *c; c += length) {
537 if (!(length = valid_utf8(c))) {
538 badchar = *c;
539 break;
540 }
541 }
542 if (badchar) {
543 /* Need to add 1 to the line number, since this line
544 has not been counted, yet. */
545 PyErr_Format(PyExc_SyntaxError,
546 "Non-UTF-8 code starting with '\\x%.2x' "
547 "in file %U on line %i, "
548 "but no encoding declared; "
549 "see https://python.org/dev/peps/pep-0263/ for details",
550 badchar, tok->filename, tok->lineno + 1);
551 return 0;
552 }
553 return 1;
554 }
555
556 /* Fetch a byte from TOK, using the string buffer. */
557
558 static int
buf_getc(struct tok_state *tok) {
560 return Py_CHARMASK(*tok->str++);
561 }
562
563 /* Unfetch a byte from TOK, using the string buffer. */
564
565 static void
buf_ungetc(int c, struct tok_state *tok) {
567 tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->str may point to a read-only segment */
569 }
570
571 /* Set the readline function for TOK to ENC. For the string-based
572 tokenizer, this means to just record the encoding. */
573
574 static int
buf_setreadl(struct tok_state *tok, const char* enc) {
576 tok->enc = enc;
577 return 1;
578 }
579
/* Return a UTF-8 encoded bytes object decoded from the
   C byte string STR, which is encoded with ENC.  Return NULL on error. */
582
583 static PyObject *
translate_into_utf8(const char* str, const char* enc) {
585 PyObject *utf8;
586 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
587 if (buf == NULL)
588 return NULL;
589 utf8 = PyUnicode_AsUTF8String(buf);
590 Py_DECREF(buf);
591 return utf8;
592 }
593
594
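/* Return a heap-allocated copy of S with \r\n and \r translated to \n.
   If EXEC_INPUT is true, also ensure that the copy ends with a newline.
   Return NULL on allocation failure. */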
595 static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
597 int skip_next_lf = 0;
598 size_t needed_length = strlen(s) + 2, final_length;
599 char *buf, *current;
600 char c = '\0';
601 buf = PyMem_Malloc(needed_length);
602 if (buf == NULL) {
603 tok->done = E_NOMEM;
604 return NULL;
605 }
606 for (current = buf; *s; s++, current++) {
607 c = *s;
608 if (skip_next_lf) {
609 skip_next_lf = 0;
610 if (c == '\n') {
611 c = *++s;
612 if (!c)
613 break;
614 }
615 }
616 if (c == '\r') {
617 skip_next_lf = 1;
618 c = '\n';
619 }
620 *current = c;
621 }
622 /* If this is exec input, add a newline to the end of the string if
623 there isn't one already. */
624 if (exec_input && c != '\n') {
625 *current = '\n';
626 current++;
627 }
628 *current = '\0';
629 final_length = current - buf + 1;
630 if (final_length < needed_length && final_length) {
631 /* should never fail */
632 char* result = PyMem_Realloc(buf, final_length);
633 if (result == NULL) {
634 PyMem_Free(buf);
635 }
636 buf = result;
637 }
638 return buf;
639 }
640
641 /* Decode a byte string STR for use as the buffer of TOK.
642 Look for encoding declarations inside STR, and record them
643 inside TOK. */
644
645 static char *
decode_str(const char *input, int single, struct tok_state *tok)
647 {
648 PyObject* utf8 = NULL;
649 char *str;
650 const char *s;
651 const char *newl[2] = {NULL, NULL};
652 int lineno = 0;
653 tok->input = str = translate_newlines(input, single, tok);
654 if (str == NULL)
655 return NULL;
656 tok->enc = NULL;
657 tok->str = str;
658 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
659 return error_ret(tok);
660 str = tok->str; /* string after BOM if any */
661 assert(str);
662 if (tok->enc != NULL) {
663 utf8 = translate_into_utf8(str, tok->enc);
664 if (utf8 == NULL)
665 return error_ret(tok);
666 str = PyBytes_AsString(utf8);
667 }
668 for (s = str;; s++) {
669 if (*s == '\0') break;
670 else if (*s == '\n') {
671 assert(lineno < 2);
672 newl[lineno] = s;
673 lineno++;
674 if (lineno == 2) break;
675 }
676 }
677 tok->enc = NULL;
678 /* need to check line 1 and 2 separately since check_coding_spec
679 assumes a single line as input */
680 if (newl[0]) {
681 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
682 return NULL;
683 }
684 if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
685 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
686 tok, buf_setreadl))
687 return NULL;
688 }
689 }
690 if (tok->enc != NULL) {
691 assert(utf8 == NULL);
692 utf8 = translate_into_utf8(str, tok->enc);
693 if (utf8 == NULL)
694 return error_ret(tok);
695 str = PyBytes_AS_STRING(utf8);
696 }
697 assert(tok->decoding_buffer == NULL);
698 tok->decoding_buffer = utf8; /* CAUTION */
699 return str;
700 }
701
702 /* Set up tokenizer for string */
703
704 struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input)
706 {
707 struct tok_state *tok = tok_new();
708 char *decoded;
709
710 if (tok == NULL)
711 return NULL;
712 decoded = decode_str(str, exec_input, tok);
713 if (decoded == NULL) {
714 _PyTokenizer_Free(tok);
715 return NULL;
716 }
717
718 tok->buf = tok->cur = tok->inp = decoded;
719 tok->end = decoded;
720 return tok;
721 }
722
723 /* Set up tokenizer for UTF-8 string */
724
725 struct tok_state *
_PyTokenizer_FromUTF8(const char *str, int exec_input)
727 {
728 struct tok_state *tok = tok_new();
729 char *translated;
730 if (tok == NULL)
731 return NULL;
732 tok->input = translated = translate_newlines(str, exec_input, tok);
733 if (translated == NULL) {
734 _PyTokenizer_Free(tok);
735 return NULL;
736 }
737 tok->decoding_state = STATE_NORMAL;
738 tok->enc = NULL;
739 tok->str = translated;
740 tok->encoding = new_string("utf-8", 5, tok);
741 if (!tok->encoding) {
742 _PyTokenizer_Free(tok);
743 return NULL;
744 }
745
746 tok->buf = tok->cur = tok->inp = translated;
747 tok->end = translated;
748 return tok;
749 }
750
751 /* Set up tokenizer for file */
752
753 struct tok_state *
_PyTokenizer_FromFile(FILE *fp, const char* enc,
                      const char *ps1, const char *ps2)
756 {
757 struct tok_state *tok = tok_new();
758 if (tok == NULL)
759 return NULL;
760 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
761 _PyTokenizer_Free(tok);
762 return NULL;
763 }
764 tok->cur = tok->inp = tok->buf;
765 tok->end = tok->buf + BUFSIZ;
766 tok->fp = fp;
767 tok->prompt = ps1;
768 tok->nextprompt = ps2;
769 if (enc != NULL) {
770 /* Must copy encoding declaration since it
771 gets copied into the parse tree. */
772 tok->encoding = new_string(enc, strlen(enc), tok);
773 if (!tok->encoding) {
774 _PyTokenizer_Free(tok);
775 return NULL;
776 }
777 tok->decoding_state = STATE_NORMAL;
778 }
779 return tok;
780 }
781
782 /* Free a tok_state structure */
783
784 void
_PyTokenizer_Free(struct tok_state *tok)
786 {
787 if (tok->encoding != NULL) {
788 PyMem_Free(tok->encoding);
789 }
790 Py_XDECREF(tok->decoding_readline);
791 Py_XDECREF(tok->decoding_buffer);
792 Py_XDECREF(tok->filename);
793 if (tok->fp != NULL && tok->buf != NULL) {
794 PyMem_Free(tok->buf);
795 }
796 if (tok->input) {
797 PyMem_Free(tok->input);
798 }
799 if (tok->interactive_src_start != NULL) {
800 PyMem_Free(tok->interactive_src_start);
801 }
802 PyMem_Free(tok);
803 }
804
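/* Read undecoded bytes from tok->fp into the token buffer until a
   complete line (ending in '\n') or EOF has been read.
   Return 1 on success, 0 on error. */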
805 static int
tok_readline_raw(struct tok_state *tok)
807 {
808 do {
809 if (!tok_reserve_buf(tok, BUFSIZ)) {
810 return 0;
811 }
812 char *line = Py_UniversalNewlineFgets(tok->inp,
813 (int)(tok->end - tok->inp),
814 tok->fp, NULL);
815 if (line == NULL) {
816 return 1;
817 }
818 if (tok->fp_interactive &&
819 tok_concatenate_interactive_new_line(tok, line) == -1) {
820 return 0;
821 }
822 if (*tok->inp == '\0') {
823 return 0;
824 }
825 tok->inp = strchr(tok->inp, '\0');
826 } while (tok->inp[-1] != '\n');
827 return 1;
828 }
829
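/* Buffer-refill helper for string input: advance tok->inp past the next
   line of the in-memory source.  Return 1 if more input is available,
   0 at end of input. */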
830 static int
tok_underflow_string(struct tok_state *tok) {
832 char *end = strchr(tok->inp, '\n');
833 if (end != NULL) {
834 end++;
835 }
836 else {
837 end = strchr(tok->inp, '\0');
838 if (end == tok->inp) {
839 tok->done = E_EOF;
840 return 0;
841 }
842 }
843 if (tok->start == NULL) {
844 tok->buf = tok->cur;
845 }
846 tok->line_start = tok->cur;
847 tok->lineno++;
848 tok->inp = end;
849 return 1;
850 }
851
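/* Buffer-refill helper for interactive input: read one more line with
   PyOS_Readline, recode it to UTF-8 if an encoding is set, and splice it
   into the token buffer.  Return 1 on success, 0 on EOF, interrupt or
   error. */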
852 static int
tok_underflow_interactive(struct tok_state *tok) {
854 if (tok->interactive_underflow == IUNDERFLOW_STOP) {
855 tok->done = E_INTERACT_STOP;
856 return 1;
857 }
858 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
859 if (newtok != NULL) {
860 char *translated = translate_newlines(newtok, 0, tok);
861 PyMem_Free(newtok);
862 if (translated == NULL) {
863 return 0;
864 }
865 newtok = translated;
866 }
867 if (tok->encoding && newtok && *newtok) {
868 /* Recode to UTF-8 */
869 Py_ssize_t buflen;
870 const char* buf;
871 PyObject *u = translate_into_utf8(newtok, tok->encoding);
872 PyMem_Free(newtok);
873 if (u == NULL) {
874 tok->done = E_DECODE;
875 return 0;
876 }
877 buflen = PyBytes_GET_SIZE(u);
878 buf = PyBytes_AS_STRING(u);
879 newtok = PyMem_Malloc(buflen+1);
880 if (newtok == NULL) {
881 Py_DECREF(u);
882 tok->done = E_NOMEM;
883 return 0;
884 }
885 strcpy(newtok, buf);
886 Py_DECREF(u);
887 }
888 if (tok->fp_interactive &&
889 tok_concatenate_interactive_new_line(tok, newtok) == -1) {
890 PyMem_Free(newtok);
891 return 0;
892 }
893 if (tok->nextprompt != NULL) {
894 tok->prompt = tok->nextprompt;
895 }
896 if (newtok == NULL) {
897 tok->done = E_INTR;
898 }
899 else if (*newtok == '\0') {
900 PyMem_Free(newtok);
901 tok->done = E_EOF;
902 }
903 else if (tok->start != NULL) {
904 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
905 size_t size = strlen(newtok);
906 tok->lineno++;
907 if (!tok_reserve_buf(tok, size + 1)) {
908 PyMem_Free(tok->buf);
909 tok->buf = NULL;
910 PyMem_Free(newtok);
911 return 0;
912 }
913 memcpy(tok->cur, newtok, size + 1);
914 PyMem_Free(newtok);
915 tok->inp += size;
916 tok->multi_line_start = tok->buf + cur_multi_line_start;
917 }
918 else {
919 tok->lineno++;
920 PyMem_Free(tok->buf);
921 tok->buf = newtok;
922 tok->cur = tok->buf;
923 tok->line_start = tok->buf;
924 tok->inp = strchr(tok->buf, '\0');
925 tok->end = tok->inp + 1;
926 }
927 if (tok->done != E_OK) {
928 if (tok->prompt != NULL) {
929 PySys_WriteStderr("\n");
930 }
931 return 0;
932 }
933 return 1;
934 }
935
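/* Buffer-refill helper for file input: detect the encoding from a BOM or
   coding cookie on the first two lines if necessary, then read the next
   line either raw or through the installed codec.
   Return 1 on success, 0 on EOF or error. */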
936 static int
tok_underflow_file(struct tok_state *tok) {
938 if (tok->start == NULL) {
939 tok->cur = tok->inp = tok->buf;
940 }
941 if (tok->decoding_state == STATE_INIT) {
942 /* We have not yet determined the encoding.
943 If an encoding is found, use the file-pointer
944 reader functions from now on. */
945 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
946 error_ret(tok);
947 return 0;
948 }
949 assert(tok->decoding_state != STATE_INIT);
950 }
951 /* Read until '\n' or EOF */
952 if (tok->decoding_readline != NULL) {
953 /* We already have a codec associated with this input. */
954 if (!tok_readline_recode(tok)) {
955 return 0;
956 }
957 }
958 else {
959 /* We want a 'raw' read. */
960 if (!tok_readline_raw(tok)) {
961 return 0;
962 }
963 }
964 if (tok->inp == tok->cur) {
965 tok->done = E_EOF;
966 return 0;
967 }
968 if (tok->inp[-1] != '\n') {
969 /* Last line does not end in \n, fake one */
970 *tok->inp++ = '\n';
971 *tok->inp = '\0';
972 }
973
974 tok->lineno++;
975 if (tok->decoding_state != STATE_NORMAL) {
976 if (tok->lineno > 2) {
977 tok->decoding_state = STATE_NORMAL;
978 }
979 else if (!check_coding_spec(tok->cur, strlen(tok->cur),
980 tok, fp_setreadl))
981 {
982 return 0;
983 }
984 }
985 /* The default encoding is UTF-8, so make sure we don't have any
986 non-UTF-8 sequences in it. */
987 if (!tok->encoding
988 && (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
989 if (!ensure_utf8(tok->cur, tok)) {
990 error_ret(tok);
991 return 0;
992 }
993 }
994 assert(tok->done == E_OK);
995 return tok->done == E_OK;
996 }
997
998 #if defined(Py_DEBUG)
999 static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
1001 {
1002 if (s == NULL) {
1003 fputs("NULL", f);
1004 return;
1005 }
1006 putc('"', f);
1007 while (size-- > 0) {
1008 unsigned char c = *s++;
1009 switch (c) {
1010 case '\n': fputs("\\n", f); break;
1011 case '\r': fputs("\\r", f); break;
1012 case '\t': fputs("\\t", f); break;
1013 case '\f': fputs("\\f", f); break;
1014 case '\'': fputs("\\'", f); break;
1015 case '"': fputs("\\\"", f); break;
1016 default:
1017 if (0x20 <= c && c <= 0x7f)
1018 putc(c, f);
1019 else
1020 fprintf(f, "\\x%02x", c);
1021 }
1022 }
1023 putc('"', f);
1024 }
1025 #endif
1026
1027 /* Get next char, updating state; error code goes into tok->done */
1028
1029 static int
tok_nextc(struct tok_state *tok)
1031 {
1032 int rc;
1033 for (;;) {
1034 if (tok->cur != tok->inp) {
1035 return Py_CHARMASK(*tok->cur++); /* Fast path */
1036 }
1037 if (tok->done != E_OK)
1038 return EOF;
1039 if (tok->fp == NULL) {
1040 rc = tok_underflow_string(tok);
1041 }
1042 else if (tok->prompt != NULL) {
1043 rc = tok_underflow_interactive(tok);
1044 }
1045 else {
1046 rc = tok_underflow_file(tok);
1047 }
1048 #if defined(Py_DEBUG)
1049 if (Py_DebugFlag) {
1050 fprintf(stderr, "line[%d] = ", tok->lineno);
1051 print_escape(stderr, tok->cur, tok->inp - tok->cur);
1052 fprintf(stderr, " tok->done = %d\n", tok->done);
1053 }
1054 #endif
1055 if (!rc) {
1056 tok->cur = tok->inp;
1057 return EOF;
1058 }
1059 tok->line_start = tok->cur;
1060 }
1061 Py_UNREACHABLE();
1062 }
1063
1064 /* Back-up one character */
1065
1066 static void
tok_backup(struct tok_state *tok, int c)
1068 {
1069 if (c != EOF) {
1070 if (--tok->cur < tok->buf) {
1071 Py_FatalError("tokenizer beginning of buffer");
1072 }
1073 if ((int)(unsigned char)*tok->cur != c) {
1074 Py_FatalError("tok_backup: wrong character");
1075 }
1076 }
1077 }
1078
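/* Set a SyntaxError built from FORMAT and VARGS for the current line,
   using the given column range (-1 means "up to the current position").
   Sets tok->done to E_ERROR and returns ERRORTOKEN. */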
1079 static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
1083 {
1084 PyObject *errmsg, *errtext, *args;
1085 errmsg = PyUnicode_FromFormatV(format, vargs);
1086 if (!errmsg) {
1087 goto error;
1088 }
1089
1090 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1091 "replace");
1092 if (!errtext) {
1093 goto error;
1094 }
1095
1096 if (col_offset == -1) {
1097 col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1098 }
1099 if (end_col_offset == -1) {
1100 end_col_offset = col_offset;
1101 }
1102
1103 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1104 if (line_len != tok->cur - tok->line_start) {
1105 Py_DECREF(errtext);
1106 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1107 "replace");
1108 }
1109 if (!errtext) {
1110 goto error;
1111 }
1112
1113 args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1114 col_offset, errtext, tok->lineno, end_col_offset);
1115 if (args) {
1116 PyErr_SetObject(PyExc_SyntaxError, args);
1117 Py_DECREF(args);
1118 }
1119
1120 error:
1121 Py_XDECREF(errmsg);
1122 tok->done = E_ERROR;
1123 return ERRORTOKEN;
1124 }
1125
1126 static int
syntaxerror(struct tok_state *tok, const char *format, ...)
1128 {
1129 va_list vargs;
1130 #ifdef HAVE_STDARG_PROTOTYPES
1131 va_start(vargs, format);
1132 #else
1133 va_start(vargs);
1134 #endif
1135 int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1136 va_end(vargs);
1137 return ret;
1138 }
1139
1140 static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
1144 {
1145 va_list vargs;
1146 #ifdef HAVE_STDARG_PROTOTYPES
1147 va_start(vargs, format);
1148 #else
1149 va_start(vargs);
1150 #endif
1151 int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1152 va_end(vargs);
1153 return ret;
1154 }
1155
1156
1157
1158 static int
indenterror(struct tok_state *tok)
1160 {
1161 tok->done = E_TABSPACE;
1162 tok->cur = tok->inp;
1163 return ERRORTOKEN;
1164 }
1165
1166 static int
parser_warn(struct tok_state *tok, const char *format, ...)
1168 {
1169 PyObject *errmsg;
1170 va_list vargs;
1171 #ifdef HAVE_STDARG_PROTOTYPES
1172 va_start(vargs, format);
1173 #else
1174 va_start(vargs);
1175 #endif
1176 errmsg = PyUnicode_FromFormatV(format, vargs);
1177 va_end(vargs);
1178 if (!errmsg) {
1179 goto error;
1180 }
1181
1182 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
1183 tok->lineno, NULL, NULL) < 0) {
1184 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
1185 /* Replace the DeprecationWarning exception with a SyntaxError
1186 to get a more accurate error report */
1187 PyErr_Clear();
1188 syntaxerror(tok, "%U", errmsg);
1189 }
1190 goto error;
1191 }
1192 Py_DECREF(errmsg);
1193 return 0;
1194
1195 error:
1196 Py_XDECREF(errmsg);
1197 tok->done = E_ERROR;
1198 return -1;
1199 }
1200
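/* Return 1 if the characters of TEST appear next in the input and are not
   followed by a potential identifier character.  Every character read is
   pushed back before returning. */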
1201 static int
lookahead(struct tok_state *tok, const char *test)
1203 {
1204 const char *s = test;
1205 int res = 0;
1206 while (1) {
1207 int c = tok_nextc(tok);
1208 if (*s == 0) {
1209 res = !is_potential_identifier_char(c);
1210 }
1211 else if (c == *s) {
1212 s++;
1213 continue;
1214 }
1215
1216 tok_backup(tok, c);
1217 while (s != test) {
1218 tok_backup(tok, *--s);
1219 }
1220 return res;
1221 }
1222 }
1223
1224 static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
1226 {
1227 /* Emit a deprecation warning only if the numeric literal is immediately
     * followed by one of the keywords that can occur after a numeric literal
     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
     * This allows existing valid code to be deprecated gradually, without
     * emitting a warning before the error in most cases of an invalid
     * numeric literal (which would be confusing and break existing tests).
     * Raise a syntax error with a slightly better message than plain
     * "invalid syntax" if the numeric literal is immediately followed by
     * another keyword or identifier.
1236 */
1237 int r = 0;
1238 if (c == 'a') {
1239 r = lookahead(tok, "nd");
1240 }
1241 else if (c == 'e') {
1242 r = lookahead(tok, "lse");
1243 }
1244 else if (c == 'f') {
1245 r = lookahead(tok, "or");
1246 }
1247 else if (c == 'i') {
1248 int c2 = tok_nextc(tok);
1249 if (c2 == 'f' || c2 == 'n' || c2 == 's') {
1250 r = 1;
1251 }
1252 tok_backup(tok, c2);
1253 }
1254 else if (c == 'o') {
1255 r = lookahead(tok, "r");
1256 }
1257 if (r) {
1258 tok_backup(tok, c);
1259 if (parser_warn(tok, "invalid %s literal", kind)) {
1260 return 0;
1261 }
1262 tok_nextc(tok);
1263 }
1264 else /* In future releases, only error will remain. */
1265 if (is_potential_identifier_char(c)) {
1266 tok_backup(tok, c);
1267 syntaxerror(tok, "invalid %s literal", kind);
1268 return 0;
1269 }
1270 return 1;
1271 }
1272
1273 /* Verify that the identifier follows PEP 3131.
1274 All identifier strings are guaranteed to be "ready" unicode objects.
1275 */
1276 static int
verify_identifier(struct tok_state *tok)
1278 {
1279 PyObject *s;
1280 if (tok->decoding_erred)
1281 return 0;
1282 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1283 if (s == NULL) {
1284 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1285 tok->done = E_DECODE;
1286 }
1287 else {
1288 tok->done = E_ERROR;
1289 }
1290 return 0;
1291 }
1292 Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1293 if (invalid < 0) {
1294 Py_DECREF(s);
1295 tok->done = E_ERROR;
1296 return 0;
1297 }
1298 assert(PyUnicode_GET_LENGTH(s) > 0);
1299 if (invalid < PyUnicode_GET_LENGTH(s)) {
1300 Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1301 if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1302 /* Determine the offset in UTF-8 encoded input */
1303 Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1304 if (s != NULL) {
1305 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1306 }
1307 if (s == NULL) {
1308 tok->done = E_ERROR;
1309 return 0;
1310 }
1311 tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1312 }
1313 Py_DECREF(s);
1314 // PyUnicode_FromFormatV() does not support %X
1315 char hex[9];
1316 (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
1317 if (Py_UNICODE_ISPRINTABLE(ch)) {
1318 syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1319 }
1320 else {
1321 syntaxerror(tok, "invalid non-printable character U+%s", hex);
1322 }
1323 return 0;
1324 }
1325 Py_DECREF(s);
1326 return 1;
1327 }
1328
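/* Consume the remaining digits of a decimal literal, allowing single
   underscores between groups of digits.  Return the first character after
   the literal, or 0 after reporting a syntax error. */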
1329 static int
tok_decimal_tail(struct tok_state *tok)
1331 {
1332 int c;
1333
1334 while (1) {
1335 do {
1336 c = tok_nextc(tok);
1337 } while (isdigit(c));
1338 if (c != '_') {
1339 break;
1340 }
1341 c = tok_nextc(tok);
1342 if (!isdigit(c)) {
1343 tok_backup(tok, c);
1344 syntaxerror(tok, "invalid decimal literal");
1345 return 0;
1346 }
1347 }
1348 return c;
1349 }
1350
1351 /* Get next token, after space stripping etc. */
1352
1353 static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
1355 {
1356 int c;
1357 int blankline, nonascii;
1358
1359 *p_start = *p_end = NULL;
1360 nextline:
1361 tok->start = NULL;
1362 blankline = 0;
1363
1364 /* Get indentation level */
1365 if (tok->atbol) {
1366 int col = 0;
1367 int altcol = 0;
1368 tok->atbol = 0;
1369 for (;;) {
1370 c = tok_nextc(tok);
1371 if (c == ' ') {
1372 col++, altcol++;
1373 }
1374 else if (c == '\t') {
1375 col = (col / tok->tabsize + 1) * tok->tabsize;
1376 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1377 }
1378 else if (c == '\014') {/* Control-L (formfeed) */
1379 col = altcol = 0; /* For Emacs users */
1380 }
1381 else {
1382 break;
1383 }
1384 }
1385 tok_backup(tok, c);
1386 if (c == '#' || c == '\n' || c == '\\') {
1387 /* Lines with only whitespace and/or comments
1388 and/or a line continuation character
1389 shouldn't affect the indentation and are
1390 not passed to the parser as NEWLINE tokens,
1391 except *totally* empty lines in interactive
1392 mode, which signal the end of a command group. */
1393 if (col == 0 && c == '\n' && tok->prompt != NULL) {
1394 blankline = 0; /* Let it through */
1395 }
1396 else if (tok->prompt != NULL && tok->lineno == 1) {
1397 /* In interactive mode, if the first line contains
1398 only spaces and/or a comment, let it through. */
1399 blankline = 0;
1400 col = altcol = 0;
1401 }
1402 else {
1403 blankline = 1; /* Ignore completely */
1404 }
1405 /* We can't jump back right here since we still
1406 may need to skip to the end of a comment */
1407 }
1408 if (!blankline && tok->level == 0) {
1409 if (col == tok->indstack[tok->indent]) {
1410 /* No change */
1411 if (altcol != tok->altindstack[tok->indent]) {
1412 return indenterror(tok);
1413 }
1414 }
1415 else if (col > tok->indstack[tok->indent]) {
1416 /* Indent -- always one */
1417 if (tok->indent+1 >= MAXINDENT) {
1418 tok->done = E_TOODEEP;
1419 tok->cur = tok->inp;
1420 return ERRORTOKEN;
1421 }
1422 if (altcol <= tok->altindstack[tok->indent]) {
1423 return indenterror(tok);
1424 }
1425 tok->pendin++;
1426 tok->indstack[++tok->indent] = col;
1427 tok->altindstack[tok->indent] = altcol;
1428 }
1429 else /* col < tok->indstack[tok->indent] */ {
1430 /* Dedent -- any number, must be consistent */
1431 while (tok->indent > 0 &&
1432 col < tok->indstack[tok->indent]) {
1433 tok->pendin--;
1434 tok->indent--;
1435 }
1436 if (col != tok->indstack[tok->indent]) {
1437 tok->done = E_DEDENT;
1438 tok->cur = tok->inp;
1439 return ERRORTOKEN;
1440 }
1441 if (altcol != tok->altindstack[tok->indent]) {
1442 return indenterror(tok);
1443 }
1444 }
1445 }
1446 }
1447
1448 tok->start = tok->cur;
1449
1450 /* Return pending indents/dedents */
1451 if (tok->pendin != 0) {
1452 if (tok->pendin < 0) {
1453 tok->pendin++;
1454 return DEDENT;
1455 }
1456 else {
1457 tok->pendin--;
1458 return INDENT;
1459 }
1460 }
1461
1462 /* Peek ahead at the next character */
1463 c = tok_nextc(tok);
1464 tok_backup(tok, c);
1465 /* Check if we are closing an async function */
1466 if (tok->async_def
1467 && !blankline
1468 /* Due to some implementation artifacts of type comments,
1469 * a TYPE_COMMENT at the start of a function won't set an
1470 * indentation level and it will produce a NEWLINE after it.
1471 * To avoid spuriously ending an async function due to this,
1472 * wait until we have some non-newline char in front of us. */
1473 && c != '\n'
1474 && tok->level == 0
1475 /* There was a NEWLINE after ASYNC DEF,
1476 so we're past the signature. */
1477 && tok->async_def_nl
        /* Current indentation level is not greater than where
           the async function was defined */
1480 && tok->async_def_indent >= tok->indent)
1481 {
1482 tok->async_def = 0;
1483 tok->async_def_indent = 0;
1484 tok->async_def_nl = 0;
1485 }
1486
1487 again:
1488 tok->start = NULL;
1489 /* Skip spaces */
1490 do {
1491 c = tok_nextc(tok);
1492 } while (c == ' ' || c == '\t' || c == '\014');
1493
1494 /* Set start of current token */
1495 tok->start = tok->cur - 1;
1496
1497 /* Skip comment, unless it's a type comment */
1498 if (c == '#') {
1499 const char *prefix, *p, *type_start;
1500
1501 while (c != EOF && c != '\n') {
1502 c = tok_nextc(tok);
1503 }
1504
1505 if (tok->type_comments) {
1506 p = tok->start;
1507 prefix = type_comment_prefix;
1508 while (*prefix && p < tok->cur) {
1509 if (*prefix == ' ') {
1510 while (*p == ' ' || *p == '\t') {
1511 p++;
1512 }
1513 } else if (*prefix == *p) {
1514 p++;
1515 } else {
1516 break;
1517 }
1518
1519 prefix++;
1520 }
1521
1522 /* This is a type comment if we matched all of type_comment_prefix. */
1523 if (!*prefix) {
1524 int is_type_ignore = 1;
1525 const char *ignore_end = p + 6;
1526 tok_backup(tok, c); /* don't eat the newline or EOF */
1527
1528 type_start = p;
1529
1530 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1531 * or anything ASCII and non-alphanumeric. */
1532 is_type_ignore = (
1533 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1534 && !(tok->cur > ignore_end
1535 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1536
1537 if (is_type_ignore) {
1538 *p_start = ignore_end;
1539 *p_end = tok->cur;
1540
1541 /* If this type ignore is the only thing on the line, consume the newline also. */
1542 if (blankline) {
1543 tok_nextc(tok);
1544 tok->atbol = 1;
1545 }
1546 return TYPE_IGNORE;
1547 } else {
1548 *p_start = type_start; /* after type_comment_prefix */
1549 *p_end = tok->cur;
1550 return TYPE_COMMENT;
1551 }
1552 }
1553 }
1554 }
1555
1556 if (tok->done == E_INTERACT_STOP) {
1557 return ENDMARKER;
1558 }
1559
1560 /* Check for EOF and errors now */
1561 if (c == EOF) {
1562 if (tok->level) {
1563 return ERRORTOKEN;
1564 }
1565 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1566 }
1567
1568 /* Identifier (most frequent token!) */
1569 nonascii = 0;
1570 if (is_potential_identifier_start(c)) {
1571 /* Process the various legal combinations of b"", r"", u"", and f"". */
1572 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1573 while (1) {
1574 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1575 saw_b = 1;
1576 /* Since this is a backwards compatibility support literal we don't
1577 want to support it in arbitrary order like byte literals. */
1578 else if (!(saw_b || saw_u || saw_r || saw_f)
1579 && (c == 'u'|| c == 'U')) {
1580 saw_u = 1;
1581 }
1582 /* ur"" and ru"" are not supported */
1583 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1584 saw_r = 1;
1585 }
1586 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1587 saw_f = 1;
1588 }
1589 else {
1590 break;
1591 }
1592 c = tok_nextc(tok);
1593 if (c == '"' || c == '\'') {
1594 goto letter_quote;
1595 }
1596 }
1597 while (is_potential_identifier_char(c)) {
1598 if (c >= 128) {
1599 nonascii = 1;
1600 }
1601 c = tok_nextc(tok);
1602 }
1603 tok_backup(tok, c);
1604 if (nonascii && !verify_identifier(tok)) {
1605 return ERRORTOKEN;
1606 }
1607
1608 *p_start = tok->start;
1609 *p_end = tok->cur;
1610
1611 /* async/await parsing block. */
1612 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1613 /* May be an 'async' or 'await' token. For Python 3.7 or
1614 later we recognize them unconditionally. For Python
1615 3.5 or 3.6 we recognize 'async' in front of 'def', and
1616 either one inside of 'async def'. (Technically we
1617 shouldn't recognize these at all for 3.4 or earlier,
1618 but there's no *valid* Python 3.4 code that would be
1619 rejected, and async functions will be rejected in a
1620 later phase.) */
1621 if (!tok->async_hacks || tok->async_def) {
1622 /* Always recognize the keywords. */
1623 if (memcmp(tok->start, "async", 5) == 0) {
1624 return ASYNC;
1625 }
1626 if (memcmp(tok->start, "await", 5) == 0) {
1627 return AWAIT;
1628 }
1629 }
1630 else if (memcmp(tok->start, "async", 5) == 0) {
1631 /* The current token is 'async'.
1632 Look ahead one token to see if that is 'def'. */
1633
1634 struct tok_state ahead_tok;
1635 const char *ahead_tok_start = NULL;
1636 const char *ahead_tok_end = NULL;
1637 int ahead_tok_kind;
1638
1639 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1640 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1641 &ahead_tok_end);
1642
1643 if (ahead_tok_kind == NAME
1644 && ahead_tok.cur - ahead_tok.start == 3
1645 && memcmp(ahead_tok.start, "def", 3) == 0)
1646 {
1647 /* The next token is going to be 'def', so instead of
1648 returning a plain NAME token, return ASYNC. */
1649 tok->async_def_indent = tok->indent;
1650 tok->async_def = 1;
1651 return ASYNC;
1652 }
1653 }
1654 }
1655
1656 return NAME;
1657 }
1658
1659 /* Newline */
1660 if (c == '\n') {
1661 tok->atbol = 1;
1662 if (blankline || tok->level > 0) {
1663 goto nextline;
1664 }
1665 *p_start = tok->start;
1666 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1667 tok->cont_line = 0;
1668 if (tok->async_def) {
1669 /* We're somewhere inside an 'async def' function, and
1670 we've encountered a NEWLINE after its signature. */
1671 tok->async_def_nl = 1;
1672 }
1673 return NEWLINE;
1674 }
1675
1676 /* Period or number starting with period? */
1677 if (c == '.') {
1678 c = tok_nextc(tok);
1679 if (isdigit(c)) {
1680 goto fraction;
1681 } else if (c == '.') {
1682 c = tok_nextc(tok);
1683 if (c == '.') {
1684 *p_start = tok->start;
1685 *p_end = tok->cur;
1686 return ELLIPSIS;
1687 }
1688 else {
1689 tok_backup(tok, c);
1690 }
1691 tok_backup(tok, '.');
1692 }
1693 else {
1694 tok_backup(tok, c);
1695 }
1696 *p_start = tok->start;
1697 *p_end = tok->cur;
1698 return DOT;
1699 }
1700
1701 /* Number */
1702 if (isdigit(c)) {
1703 if (c == '0') {
1704 /* Hex, octal or binary -- maybe. */
1705 c = tok_nextc(tok);
1706 if (c == 'x' || c == 'X') {
1707 /* Hex */
1708 c = tok_nextc(tok);
1709 do {
1710 if (c == '_') {
1711 c = tok_nextc(tok);
1712 }
1713 if (!isxdigit(c)) {
1714 tok_backup(tok, c);
1715 return syntaxerror(tok, "invalid hexadecimal literal");
1716 }
1717 do {
1718 c = tok_nextc(tok);
1719 } while (isxdigit(c));
1720 } while (c == '_');
1721 if (!verify_end_of_number(tok, c, "hexadecimal")) {
1722 return ERRORTOKEN;
1723 }
1724 }
1725 else if (c == 'o' || c == 'O') {
1726 /* Octal */
1727 c = tok_nextc(tok);
1728 do {
1729 if (c == '_') {
1730 c = tok_nextc(tok);
1731 }
1732 if (c < '0' || c >= '8') {
1733 if (isdigit(c)) {
1734 return syntaxerror(tok,
1735 "invalid digit '%c' in octal literal", c);
1736 }
1737 else {
1738 tok_backup(tok, c);
1739 return syntaxerror(tok, "invalid octal literal");
1740 }
1741 }
1742 do {
1743 c = tok_nextc(tok);
1744 } while ('0' <= c && c < '8');
1745 } while (c == '_');
1746 if (isdigit(c)) {
1747 return syntaxerror(tok,
1748 "invalid digit '%c' in octal literal", c);
1749 }
1750 if (!verify_end_of_number(tok, c, "octal")) {
1751 return ERRORTOKEN;
1752 }
1753 }
1754 else if (c == 'b' || c == 'B') {
1755 /* Binary */
1756 c = tok_nextc(tok);
1757 do {
1758 if (c == '_') {
1759 c = tok_nextc(tok);
1760 }
1761 if (c != '0' && c != '1') {
1762 if (isdigit(c)) {
1763 return syntaxerror(tok,
1764 "invalid digit '%c' in binary literal", c);
1765 }
1766 else {
1767 tok_backup(tok, c);
1768 return syntaxerror(tok, "invalid binary literal");
1769 }
1770 }
1771 do {
1772 c = tok_nextc(tok);
1773 } while (c == '0' || c == '1');
1774 } while (c == '_');
1775 if (isdigit(c)) {
1776 return syntaxerror(tok,
1777 "invalid digit '%c' in binary literal", c);
1778 }
1779 if (!verify_end_of_number(tok, c, "binary")) {
1780 return ERRORTOKEN;
1781 }
1782 }
1783 else {
1784 int nonzero = 0;
1785 /* maybe old-style octal; c is first char of it */
1786 /* in any case, allow '0' as a literal */
1787 while (1) {
1788 if (c == '_') {
1789 c = tok_nextc(tok);
1790 if (!isdigit(c)) {
1791 tok_backup(tok, c);
1792 return syntaxerror(tok, "invalid decimal literal");
1793 }
1794 }
1795 if (c != '0') {
1796 break;
1797 }
1798 c = tok_nextc(tok);
1799 }
1800 char* zeros_end = tok->cur;
1801 if (isdigit(c)) {
1802 nonzero = 1;
1803 c = tok_decimal_tail(tok);
1804 if (c == 0) {
1805 return ERRORTOKEN;
1806 }
1807 }
1808 if (c == '.') {
1809 c = tok_nextc(tok);
1810 goto fraction;
1811 }
1812 else if (c == 'e' || c == 'E') {
1813 goto exponent;
1814 }
1815 else if (c == 'j' || c == 'J') {
1816 goto imaginary;
1817 }
1818 else if (nonzero) {
1819 /* Old-style octal: now disallowed. */
1820 tok_backup(tok, c);
1821 return syntaxerror_known_range(
1822 tok, (int)(tok->start + 1 - tok->line_start),
1823 (int)(zeros_end - tok->line_start),
1824 "leading zeros in decimal integer "
1825 "literals are not permitted; "
1826 "use an 0o prefix for octal integers");
1827 }
1828 if (!verify_end_of_number(tok, c, "decimal")) {
1829 return ERRORTOKEN;
1830 }
1831 }
1832 }
1833 else {
1834 /* Decimal */
1835 c = tok_decimal_tail(tok);
1836 if (c == 0) {
1837 return ERRORTOKEN;
1838 }
1839 {
1840 /* Accept floating point numbers. */
1841 if (c == '.') {
1842 c = tok_nextc(tok);
1843 fraction:
1844 /* Fraction */
1845 if (isdigit(c)) {
1846 c = tok_decimal_tail(tok);
1847 if (c == 0) {
1848 return ERRORTOKEN;
1849 }
1850 }
1851 }
1852 if (c == 'e' || c == 'E') {
1853 int e;
1854 exponent:
1855 e = c;
1856 /* Exponent part */
1857 c = tok_nextc(tok);
1858 if (c == '+' || c == '-') {
1859 c = tok_nextc(tok);
1860 if (!isdigit(c)) {
1861 tok_backup(tok, c);
1862 return syntaxerror(tok, "invalid decimal literal");
1863 }
1864 } else if (!isdigit(c)) {
1865 tok_backup(tok, c);
1866 if (!verify_end_of_number(tok, e, "decimal")) {
1867 return ERRORTOKEN;
1868 }
1869 tok_backup(tok, e);
1870 *p_start = tok->start;
1871 *p_end = tok->cur;
1872 return NUMBER;
1873 }
1874 c = tok_decimal_tail(tok);
1875 if (c == 0) {
1876 return ERRORTOKEN;
1877 }
1878 }
1879 if (c == 'j' || c == 'J') {
1880 /* Imaginary part */
1881 imaginary:
1882 c = tok_nextc(tok);
1883 if (!verify_end_of_number(tok, c, "imaginary")) {
1884 return ERRORTOKEN;
1885 }
1886 }
1887 else if (!verify_end_of_number(tok, c, "decimal")) {
1888 return ERRORTOKEN;
1889 }
1890 }
1891 }
1892 tok_backup(tok, c);
1893 *p_start = tok->start;
1894 *p_end = tok->cur;
1895 return NUMBER;
1896 }
1897
1898 letter_quote:
1899 /* String */
1900 if (c == '\'' || c == '"') {
1901 int quote = c;
1902 int quote_size = 1; /* 1 or 3 */
1903 int end_quote_size = 0;
1904
1905 /* Nodes of type STRING, especially multi line strings
1906 must be handled differently in order to get both
1907 the starting line number and the column offset right.
1908 (cf. issue 16806) */
1909 tok->first_lineno = tok->lineno;
1910 tok->multi_line_start = tok->line_start;
1911
1912 /* Find the quote size and start of string */
1913 c = tok_nextc(tok);
1914 if (c == quote) {
1915 c = tok_nextc(tok);
1916 if (c == quote) {
1917 quote_size = 3;
1918 }
1919 else {
1920 end_quote_size = 1; /* empty string found */
1921 }
1922 }
1923 if (c != quote) {
1924 tok_backup(tok, c);
1925 }
1926
1927 /* Get rest of string */
1928 while (end_quote_size != quote_size) {
1929 c = tok_nextc(tok);
1930 if (c == EOF || (quote_size == 1 && c == '\n')) {
1931 assert(tok->multi_line_start != NULL);
1932 // shift the tok_state's location into
1933 // the start of string, and report the error
1934 // from the initial quote character
1935 tok->cur = (char *)tok->start;
1936 tok->cur++;
1937 tok->line_start = tok->multi_line_start;
1938 int start = tok->lineno;
1939 tok->lineno = tok->first_lineno;
1940
1941 if (quote_size == 3) {
1942 return syntaxerror(tok,
1943 "unterminated triple-quoted string literal"
1944 " (detected at line %d)", start);
1945 }
1946 else {
1947 return syntaxerror(tok,
1948 "unterminated string literal (detected at"
1949 " line %d)", start);
1950 }
1951 }
1952 if (c == quote) {
1953 end_quote_size += 1;
1954 }
1955 else {
1956 end_quote_size = 0;
1957 if (c == '\\') {
1958 tok_nextc(tok); /* skip escaped char */
1959 }
1960 }
1961 }
1962
1963 *p_start = tok->start;
1964 *p_end = tok->cur;
1965 return STRING;
1966 }
1967
1968 /* Line continuation */
1969 if (c == '\\') {
1970 c = tok_nextc(tok);
1971 if (c != '\n') {
1972 tok->done = E_LINECONT;
1973 return ERRORTOKEN;
1974 }
1975 c = tok_nextc(tok);
1976 if (c == EOF) {
1977 tok->done = E_EOF;
1978 tok->cur = tok->inp;
1979 return ERRORTOKEN;
1980 } else {
1981 tok_backup(tok, c);
1982 }
1983 tok->cont_line = 1;
1984 goto again; /* Read next line */
1985 }
1986
1987 /* Check for two-character token */
1988 {
1989 int c2 = tok_nextc(tok);
1990 int token = PyToken_TwoChars(c, c2);
1991 if (token != OP) {
1992 int c3 = tok_nextc(tok);
1993 int token3 = PyToken_ThreeChars(c, c2, c3);
1994 if (token3 != OP) {
1995 token = token3;
1996 }
1997 else {
1998 tok_backup(tok, c3);
1999 }
2000 *p_start = tok->start;
2001 *p_end = tok->cur;
2002 return token;
2003 }
2004 tok_backup(tok, c2);
2005 }
2006
2007 /* Keep track of parentheses nesting level */
2008 switch (c) {
2009 case '(':
2010 case '[':
2011 case '{':
2012 if (tok->level >= MAXLEVEL) {
2013 return syntaxerror(tok, "too many nested parentheses");
2014 }
2015 tok->parenstack[tok->level] = c;
2016 tok->parenlinenostack[tok->level] = tok->lineno;
2017 tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2018 tok->level++;
2019 break;
2020 case ')':
2021 case ']':
2022 case '}':
2023 if (!tok->level) {
2024 return syntaxerror(tok, "unmatched '%c'", c);
2025 }
2026 tok->level--;
2027 int opening = tok->parenstack[tok->level];
2028 if (!((opening == '(' && c == ')') ||
2029 (opening == '[' && c == ']') ||
2030 (opening == '{' && c == '}')))
2031 {
2032 if (tok->parenlinenostack[tok->level] != tok->lineno) {
2033 return syntaxerror(tok,
2034 "closing parenthesis '%c' does not match "
2035 "opening parenthesis '%c' on line %d",
2036 c, opening, tok->parenlinenostack[tok->level]);
2037 }
2038 else {
2039 return syntaxerror(tok,
2040 "closing parenthesis '%c' does not match "
2041 "opening parenthesis '%c'",
2042 c, opening);
2043 }
2044 }
2045 break;
2046 }
2047
2048 if (!Py_UNICODE_ISPRINTABLE(c)) {
2049 char hex[9];
2050 (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
2051 return syntaxerror(tok, "invalid non-printable character U+%s", hex);
2052 }
2053
2054 /* Punctuation character */
2055 *p_start = tok->start;
2056 *p_end = tok->cur;
2057 return PyToken_OneChar(c);
2058 }
2059
2060 int
_PyTokenizer_Get(struct tok_state *tok,
                 const char **p_start, const char **p_end)
2063 {
2064 int result = tok_get(tok, p_start, p_end);
2065 if (tok->decoding_erred) {
2066 result = ERRORTOKEN;
2067 tok->done = E_DECODE;
2068 }
2069 return result;
2070 }
2071
2072 /* Get the encoding of a Python file. Check for the coding cookie and check if
2073 the file starts with a BOM.
2074
2075 _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2076 encoding in the first or second line of the file (in which case the encoding
2077 should be assumed to be UTF-8).
2078
2079 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
2080 by the caller. */
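/* A minimal usage sketch (hypothetical caller; error handling elided):

       char *enc = _PyTokenizer_FindEncodingFilename(fd, filename);
       if (enc != NULL) {
           ... use the encoding name ...
           PyMem_Free(enc);
       }
*/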
2081
2082 char *
_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2084 {
2085 struct tok_state *tok;
2086 FILE *fp;
2087 const char *p_start = NULL;
2088 const char *p_end = NULL;
2089 char *encoding = NULL;
2090
2091 fd = _Py_dup(fd);
2092 if (fd < 0) {
2093 return NULL;
2094 }
2095
2096 fp = fdopen(fd, "r");
2097 if (fp == NULL) {
2098 return NULL;
2099 }
2100 tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2101 if (tok == NULL) {
2102 fclose(fp);
2103 return NULL;
2104 }
2105 if (filename != NULL) {
2106 Py_INCREF(filename);
2107 tok->filename = filename;
2108 }
2109 else {
2110 tok->filename = PyUnicode_FromString("<string>");
2111 if (tok->filename == NULL) {
2112 fclose(fp);
2113 _PyTokenizer_Free(tok);
2114 return encoding;
2115 }
2116 }
2117 while (tok->lineno < 2 && tok->done == E_OK) {
2118 _PyTokenizer_Get(tok, &p_start, &p_end);
2119 }
2120 fclose(fp);
2121 if (tok->encoding) {
2122 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2123 if (encoding) {
2124 strcpy(encoding, tok->encoding);
2125 }
2126 }
2127 _PyTokenizer_Free(tok);
2128 return encoding;
2129 }
2130
2131 #ifdef Py_DEBUG
2132 void
tok_dump(int type, char *start, char *end)
2134 {
2135 fprintf(stderr, "%s", _PyParser_TokenNames[type]);
2136 if (type == NAME || type == NUMBER || type == STRING || type == OP)
2137 fprintf(stderr, "(%.*s)", (int)(end - start), start);
2138 }
2139 #endif // Py_DEBUG
2140