1
2 /* Tokenizer implementation */
3
4 #include "Python.h"
5 #include "../Include/pgenheaders.h"
6
7 #include <ctype.h>
8 #include <assert.h>
9
10 #include "tokenizer.h"
11 #include "../Include/errcode.h"
12
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #endif /* PGEN */
20
/* Compatibility shims: provide Py_XSETREF and _PyObject_CallNoArg when
   building against older CPython headers that lack them. */
#ifndef Py_XSETREF
#define Py_XSETREF(op, op2) \
    do { \
        PyObject *_py_tmp = (PyObject *)(op); \
        (op) = (op2); \
        Py_XDECREF(_py_tmp); \
    } while (0)
#endif /* Py_XSETREF */

#ifndef _PyObject_CallNoArg
#define _PyObject_CallNoArg(func) PyObject_CallObject(func, NULL)
#endif

/* Alternate tab spacing */
#define ALTTABSIZE 1

/* An identifier may start with an ASCII letter, '_', or any byte >= 128
   (non-ASCII identifiers are validated later against PEP 3131 by
   verify_identifier). */
#define is_potential_identifier_start(c) (\
              (c >= 'a' && c <= 'z')\
               || (c >= 'A' && c <= 'Z')\
               || c == '_'\
               || (c >= 128))

#define is_potential_identifier_char(c) (\
              (c >= 'a' && c <= 'z')\
               || (c >= 'A' && c <= 'Z')\
               || (c >= '0' && c <= '9')\
               || c == '_'\
               || (c >= 128))

PyAPI_FUNC(char *) PyOS_Readline(FILE *, FILE *, const char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
62
63
/* Token names */

/* Human-readable names indexed by token number.  The ordering of this
   table must match the token #defines in token.h exactly (see the note
   before "OP" below). */
const char *_Ta3Parser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    "ATEQUAL",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "AWAIT",
    "ASYNC",
    "TYPE_IGNORE",
    "TYPE_COMMENT",
    "<ERRORTOKEN>",
    "COMMENT",
    "NL",
    "ENCODING",
    "<N_TOKENS>"
};
132
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
   tokenizing.  Used to recognize "# type: ..." comments when type-comment
   support is enabled. */
static const char* type_comment_prefix = "# type: ";
136
137
138 /* Create and initialize a new tok_state structure */
139
140 static struct tok_state *
tok_new(void)141 tok_new(void)
142 {
143 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
144 sizeof(struct tok_state));
145 if (tok == NULL)
146 return NULL;
147 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
148 tok->done = E_OK;
149 tok->fp = NULL;
150 tok->input = NULL;
151 tok->tabsize = TABSIZE;
152 tok->indent = 0;
153 tok->indstack[0] = 0;
154
155 tok->atbol = 1;
156 tok->pendin = 0;
157 tok->prompt = tok->nextprompt = NULL;
158 tok->lineno = 0;
159 tok->level = 0;
160 tok->altindstack[0] = 0;
161 tok->decoding_state = STATE_INIT;
162 tok->decoding_erred = 0;
163 tok->read_coding_spec = 0;
164 tok->enc = NULL;
165 tok->encoding = NULL;
166 tok->cont_line = 0;
167 #ifndef PGEN
168 tok->filename = NULL;
169 tok->decoding_readline = NULL;
170 tok->decoding_buffer = NULL;
171 #endif
172
173 tok->async_def = 0;
174 tok->async_def_indent = 0;
175 tok->async_def_nl = 0;
176 tok->async_always = 0;
177
178 return tok;
179 }
180
181 static char *
new_string(const char * s,Py_ssize_t len,struct tok_state * tok)182 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
183 {
184 char* result = (char *)PyMem_MALLOC(len + 1);
185 if (!result) {
186 tok->done = E_NOMEM;
187 return NULL;
188 }
189 memcpy(result, s, len);
190 result[len] = '\0';
191 return result;
192 }
193
194 #ifdef PGEN
195
/* PGEN build: read one raw line from the file; no decoding is done. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}
201
/* PGEN build: end-of-file test is just stdio's feof on the raw stream. */
static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}
207
/* PGEN build: no encoding handling; just duplicate the input string.
   exec_input is unused here (the non-PGEN version honors it). */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str), tok);
}
213
214 #else /* PGEN */
215
/* Mark TOK as having a decoding error, release its line buffer (only
   owned by TOK when reading from a file -- see Ta3Tokenizer_Free), and
   return NULL so callers treat the condition like EOF. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see Ta3Tokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_DECODE;
    return NULL; /* as if it were EOF */
}
226
227
/* Normalize an encoding name: lower-case it, map '_' to '-', and fold
   the common aliases of utf-8 and iso-8859-1 onto their canonical
   names.  Only the first 12 characters are significant.  Returns a
   static canonical name, or S itself when no alias matched. */
static const char *
get_normal_name(const char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* Cast before tolower(): passing a negative value (a byte
               >= 0x80 when plain char is signed) is undefined behavior
               per the C standard's <ctype.h> contract. */
            buf[i] = tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
256
/* Look for a PEP 263 coding spec ("coding[:=] <name>") in the line S of
   length SIZE.  Return 1 on success with a malloc'ed copy of the
   normalized encoding name stored in *SPEC (*SPEC stays NULL when the
   line has no spec); return 0 only on memory failure (tok->done is set
   by new_string). */

static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;  /* real code before any comment: no spec here */
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (strncmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* Skip spaces/tabs between "coding[:=]" and the name. */
            do {
                t++;
            } while (t[0] == '\x20' || t[0] == '\t');

            begin = t;
            /* Encoding names consist of alphanumerics, '-', '_', '.'. */
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                /* Canonicalize aliases of utf-8 / iso-8859-1. */
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_FREE(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
307
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.  On failure a SyntaxError may be
   set (unknown encoding, or a declaration conflicting with a BOM). */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    int r = 1;

    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->read_coding_spec = 1;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok))
        return 0;
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->read_coding_spec = 1;
                break;
            }
        }
        return 1;
    }
    tok->read_coding_spec = 1;
    if (tok->encoding == NULL) {
        assert(tok->decoding_state == STATE_RAW);
        if (strcmp(cs, "utf-8") == 0) {
            /* Input is already UTF-8; just record the name (cs is now
               owned by tok->encoding). */
            tok->encoding = cs;
        } else {
            r = set_readline(tok, cs);
            if (r) {
                tok->encoding = cs;
                tok->decoding_state = STATE_NORMAL;
            }
            else {
                PyErr_Format(PyExc_SyntaxError,
                             "encoding problem: %s", cs);
                PyMem_FREE(cs);
            }
        }
    } else {                /* then, compare cs with BOM */
        /* A BOM already fixed the encoding; the declaration must agree. */
        r = (strcmp(tok->encoding, cs) == 0);
        if (!r)
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
        PyMem_FREE(cs);
    }
    return r;
}
367
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.  Bytes that turn out not to be a
   BOM are pushed back via unget_char so the tokenizer sees them. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_RAW;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possible UTF-8 BOM: EF BB BF. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported.  */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
#endif
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
433
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
       stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to fp_readl did not have enough room
       (in the s buffer) to copy entire contents of the line read
       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
       In this case, fp_readl is called in a loop (with an expanded buffer)
       until the buffer ends with a '\n' (or until the end of the file is
       reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
    PyObject* bufobj;
    const char *buf;
    Py_ssize_t buflen;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (tok->decoding_buffer) {
        /* Use the cached/overflow line; INCREF so the common DECREF
           at the end is correct for both paths. */
        bufobj = tok->decoding_buffer;
        Py_INCREF(bufobj);
    }
    else
    {
        bufobj = _PyObject_CallNoArg(tok->decoding_readline);
        if (bufobj == NULL)
            goto error;
    }
    if (PyUnicode_CheckExact(bufobj))
    {
        buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
        if (buf == NULL) {
            goto error;
        }
    }
    else
    {
        /* Overflow from a previous call: a bytearray (case 3 above). */
        buf = PyByteArray_AsString(bufobj);
        if (buf == NULL) {
            goto error;
        }
        buflen = PyByteArray_GET_SIZE(bufobj);
    }

    Py_XDECREF(tok->decoding_buffer);
    if (buflen > size) {
        /* Too many chars, the rest goes into tok->decoding_buffer */
        tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
                                                         buflen-size);
        if (tok->decoding_buffer == NULL)
            goto error;
        buflen = size;
    }
    else
        tok->decoding_buffer = NULL;

    memcpy(s, buf, buflen);
    s[buflen] = '\0';
    if (buflen == 0) /* EOF */
        s = NULL;
    Py_DECREF(bufobj);
    return s;

  error:
    Py_XDECREF(bufobj);
    return error_ret(tok);
}
509
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *io, *stream;
    _Py_IDENTIFIER(open);
    _Py_IDENTIFIER(readline);
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp.  If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd.  Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    io = PyImport_ImportModuleNoBlock("io");
    if (io == NULL)
        return 0;

    /* io.open(fd, "r", -1, enc, None, None, False): text stream over the
       same descriptor, decoding with ENC, closefd=False. */
    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(io);
    if (stream == NULL)
        return 0;

    readline = _PyObject_GetAttrId(stream, &PyId_readline);
    Py_DECREF(stream);
    if (readline == NULL)
        return 0;
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        /* Consume the partial line we stepped back into (see above). */
        PyObject *bufobj = _PyObject_CallNoArg(readline);
        if (bufobj == NULL)
            return 0;
        Py_DECREF(bufobj);
    }

    return 1;
}
567
568 /* Fetch the next byte from TOK. */
569
fp_getc(struct tok_state * tok)570 static int fp_getc(struct tok_state *tok) {
571 return getc(tok->fp);
572 }
573
/* Unfetch the last byte back into TOK (stdio pushback). */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
579
/* Check whether the bytes at S start a valid UTF-8 sequence.
   Return the length of the sequence if so, 0 if not.  A NUL (or any
   non-continuation byte) inside the sequence makes it invalid, so this
   never reads past the end of a NUL-terminated buffer. */
static int valid_utf8(const unsigned char* s)
{
    unsigned char lead = *s;
    int nfollow;
    int i;

    if (lead < 0x80)
        return 1;               /* single-byte (ASCII) code */
    if (lead < 0xC0)
        return 0;               /* stray continuation byte */
    if (lead < 0xE0)
        nfollow = 1;            /* two-byte sequence */
    else if (lead < 0xF0)
        nfollow = 2;            /* three-byte sequence */
    else if (lead < 0xF8)
        nfollow = 3;            /* four-byte sequence */
    else
        return 0;               /* invalid lead byte */

    /* All following bytes must be continuation bytes: 0x80..0xBF. */
    for (i = 1; i <= nfollow; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return nfollow + 1;
}
607
/* Read a line of input from TOK.  Determine encoding
   if necessary.  Returns S on success, NULL on EOF or error (in the
   error case tok->done is set via error_ret). */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state == STATE_RAW) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != STATE_INIT);
        }
    }
    /* A PEP 263 declaration may only appear on the first two lines. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        int length;
        for (c = (unsigned char *)line; *c; c += length)
            if (!(length = valid_utf8(c))) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted, yet.  */
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see http://python.org/dev/peps/pep-0263/ for details",
                     badchar, tok->filename, tok->lineno + 1);
        return error_ret(tok);
    }
#endif
    return line;
}
667
/* Return non-zero when TOK's input is exhausted.  In decoded mode this
   may call tok->decoding_readline and cache the result in
   tok->decoding_buffer so the next fp_readl can consume it. */
static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state != STATE_NORMAL) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = _PyObject_CallNoArg(tok->decoding_readline);
            if (buf == NULL) {
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}
687
/* Fetch a byte from TOK, using the string buffer; advances tok->str. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}
694
/* Unfetch a byte from TOK, using the string buffer.  Only steps the
   pointer back; C must equal the byte being "returned". */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}
702
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding; decode_str
   re-encodes the whole buffer afterwards. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
711
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.  Returns a new bytes
   reference, or NULL with an exception set on decode failure. */

static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
725
726
727 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)728 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
729 int skip_next_lf = 0;
730 size_t needed_length = strlen(s) + 2, final_length;
731 char *buf, *current;
732 char c = '\0';
733 buf = PyMem_MALLOC(needed_length);
734 if (buf == NULL) {
735 tok->done = E_NOMEM;
736 return NULL;
737 }
738 for (current = buf; *s; s++, current++) {
739 c = *s;
740 if (skip_next_lf) {
741 skip_next_lf = 0;
742 if (c == '\n') {
743 c = *++s;
744 if (!c)
745 break;
746 }
747 }
748 if (c == '\r') {
749 skip_next_lf = 1;
750 c = '\n';
751 }
752 *current = c;
753 }
754 /* If this is exec input, add a newline to the end of the string if
755 there isn't one already. */
756 if (exec_input && c != '\n') {
757 *current = '\n';
758 current++;
759 }
760 *current = '\0';
761 final_length = current - buf + 1;
762 if (final_length < needed_length && final_length)
763 /* should never fail */
764 buf = PyMem_REALLOC(buf, final_length);
765 return buf;
766 }
767
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.  INPUT is first newline-normalized; that copy is owned by
   TOK (tok->input).  Returns the (possibly re-encoded to UTF-8) buffer,
   or NULL on error. */

static const char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    /* Consume a UTF-8 BOM if present; may set tok->encoding. */
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the first two line breaks: a PEP 263 coding declaration
       may only appear on line 1 or 2. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
    if (tok->enc != NULL) {
        /* A coding declaration was found: re-encode the whole buffer. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
827
828 #endif /* PGEN */
829
/* Set up tokenizer for string */

struct tok_state *
Ta3Tokenizer_FromString(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    /* Normalize newlines and resolve BOM / coding declarations. */
    str = decode_str(str, exec_input, tok);
    if (str == NULL) {
        Ta3Tokenizer_Free(tok);
        return NULL;
    }

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}
848
/* Set up tokenizer for a string already known to be UTF-8; skips the
   BOM/coding-spec detection and records "utf-8" as the encoding. */
struct tok_state *
Ta3Tokenizer_FromUTF8(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
#ifndef PGEN
    tok->input = str = translate_newlines(str, exec_input, tok);
#endif
    if (str == NULL) {
        Ta3Tokenizer_Free(tok);
        return NULL;
    }
    tok->decoding_state = STATE_RAW;
    tok->read_coding_spec = 1;
    tok->enc = NULL;
    tok->str = str;
    /* 6 bytes: "utf-8" plus the terminating NUL. */
    tok->encoding = (char *)PyMem_MALLOC(6);
    if (!tok->encoding) {
        Ta3Tokenizer_Free(tok);
        return NULL;
    }
    strcpy(tok->encoding, "utf-8");

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}
877
/* Set up tokenizer for file */

struct tok_state *
Ta3Tokenizer_FromFile(FILE *fp, const char* enc,
                      const char *ps1, const char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
        Ta3Tokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    /* ps1/ps2 non-NULL means interactive mode (see tok_nextc). */
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    if (enc != NULL) {
        /* Must copy encoding declaration since it
           gets copied into the parse tree. */
        tok->encoding = PyMem_MALLOC(strlen(enc)+1);
        if (!tok->encoding) {
            Ta3Tokenizer_Free(tok);
            return NULL;
        }
        strcpy(tok->encoding, enc);
        tok->decoding_state = STATE_NORMAL;
    }
    return tok;
}
909
910
/* Free a tok_state structure and every resource it owns.  Note the
   buffer is only owned by TOK when reading from a file; for string
   input tok->buf aliases tok->input/decoding_buffer storage. */

void
Ta3Tokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->filename);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    if (tok->input)
        PyMem_FREE((char *)tok->input);
    PyMem_FREE(tok);
}
929
/* Get next char, updating state; error code goes into tok->done.
   Three input modes: in-memory string (tok->fp == NULL), interactive
   (tok->prompt != NULL), and file.  In all modes, tok->cur..tok->inp is
   the unread part of the current line and tok->start (when non-NULL)
   pins the beginning of the token in progress, so buffer reallocation
   must preserve offsets relative to tok->buf. */

static int
tok_nextc(struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: the whole source already sits in the
               buffer; just advance line bookkeeping to the next '\n'. */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            /* Interactive: fetch one line through PyOS_Readline. */
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
#ifndef PGEN
            if (newtok != NULL) {
                char *translated = translate_newlines(newtok, 0, tok);
                PyMem_FREE(newtok);
                if (translated == NULL)
                    return EOF;
                newtok = translated;
            }
            if (tok->encoding && newtok && *newtok) {
                /* Recode to UTF-8 */
                Py_ssize_t buflen;
                const char* buf;
                PyObject *u = translate_into_utf8(newtok, tok->encoding);
                PyMem_FREE(newtok);
                if (!u) {
                    tok->done = E_DECODE;
                    return EOF;
                }
                buflen = PyBytes_GET_SIZE(u);
                buf = PyBytes_AS_STRING(u);
                newtok = PyMem_MALLOC(buflen+1);
                if (newtok == NULL) {
                    Py_DECREF(u);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                strcpy(newtok, buf);
                Py_DECREF(u);
            }
#endif
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (newtok == NULL)
                tok->done = E_INTR;
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
            else if (tok->start != NULL) {
                /* A token spans the line break: append the new line to
                   the existing buffer, keeping tok->start valid across
                   the realloc by recording its offset. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                /* No token in progress: the new line replaces the buffer. */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* File input: read the next (possibly decoded) line,
               growing the buffer until it ends in '\n' or EOF. */
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                          tok) == NULL) {
                    if (!tok->decoding_erred)
                        tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp == tok->buf || tok->inp[-1] == '\n';
                }
            }
            else {
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                          tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                             tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                               (int)(tok->end - tok->inp),
                               tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                    */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}
1127
1128
/* Back-up one character */

static void
tok_backup(struct tok_state *tok, int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: beginning of buffer");
        /* Restore the byte if it differs (tok_nextc returned it through
           Py_CHARMASK). */
        if (*tok->cur != c)
            *tok->cur = c;
    }
}
1141
1142
1143 /* Return the token corresponding to a single character */
1144
1145 int
Ta3Token_OneChar(int c)1146 Ta3Token_OneChar(int c)
1147 {
1148 switch (c) {
1149 case '(': return LPAR;
1150 case ')': return RPAR;
1151 case '[': return LSQB;
1152 case ']': return RSQB;
1153 case ':': return COLON;
1154 case ',': return COMMA;
1155 case ';': return SEMI;
1156 case '+': return PLUS;
1157 case '-': return MINUS;
1158 case '*': return STAR;
1159 case '/': return SLASH;
1160 case '|': return VBAR;
1161 case '&': return AMPER;
1162 case '<': return LESS;
1163 case '>': return GREATER;
1164 case '=': return EQUAL;
1165 case '.': return DOT;
1166 case '%': return PERCENT;
1167 case '{': return LBRACE;
1168 case '}': return RBRACE;
1169 case '^': return CIRCUMFLEX;
1170 case '~': return TILDE;
1171 case '@': return AT;
1172 default: return OP;
1173 }
1174 }
1175
1176
1177 int
Ta3Token_TwoChars(int c1,int c2)1178 Ta3Token_TwoChars(int c1, int c2)
1179 {
1180 switch (c1) {
1181 case '=':
1182 switch (c2) {
1183 case '=': return EQEQUAL;
1184 }
1185 break;
1186 case '!':
1187 switch (c2) {
1188 case '=': return NOTEQUAL;
1189 }
1190 break;
1191 case '<':
1192 switch (c2) {
1193 case '>': return NOTEQUAL;
1194 case '=': return LESSEQUAL;
1195 case '<': return LEFTSHIFT;
1196 }
1197 break;
1198 case '>':
1199 switch (c2) {
1200 case '=': return GREATEREQUAL;
1201 case '>': return RIGHTSHIFT;
1202 }
1203 break;
1204 case '+':
1205 switch (c2) {
1206 case '=': return PLUSEQUAL;
1207 }
1208 break;
1209 case '-':
1210 switch (c2) {
1211 case '=': return MINEQUAL;
1212 case '>': return RARROW;
1213 }
1214 break;
1215 case '*':
1216 switch (c2) {
1217 case '*': return DOUBLESTAR;
1218 case '=': return STAREQUAL;
1219 }
1220 break;
1221 case '/':
1222 switch (c2) {
1223 case '/': return DOUBLESLASH;
1224 case '=': return SLASHEQUAL;
1225 }
1226 break;
1227 case '|':
1228 switch (c2) {
1229 case '=': return VBAREQUAL;
1230 }
1231 break;
1232 case '%':
1233 switch (c2) {
1234 case '=': return PERCENTEQUAL;
1235 }
1236 break;
1237 case '&':
1238 switch (c2) {
1239 case '=': return AMPEREQUAL;
1240 }
1241 break;
1242 case '^':
1243 switch (c2) {
1244 case '=': return CIRCUMFLEXEQUAL;
1245 }
1246 break;
1247 case '@':
1248 switch (c2) {
1249 case '=': return ATEQUAL;
1250 }
1251 break;
1252 }
1253 return OP;
1254 }
1255
1256 int
Ta3Token_ThreeChars(int c1,int c2,int c3)1257 Ta3Token_ThreeChars(int c1, int c2, int c3)
1258 {
1259 switch (c1) {
1260 case '<':
1261 switch (c2) {
1262 case '<':
1263 switch (c3) {
1264 case '=':
1265 return LEFTSHIFTEQUAL;
1266 }
1267 break;
1268 }
1269 break;
1270 case '>':
1271 switch (c2) {
1272 case '>':
1273 switch (c3) {
1274 case '=':
1275 return RIGHTSHIFTEQUAL;
1276 }
1277 break;
1278 }
1279 break;
1280 case '*':
1281 switch (c2) {
1282 case '*':
1283 switch (c3) {
1284 case '=':
1285 return DOUBLESTAREQUAL;
1286 }
1287 break;
1288 }
1289 break;
1290 case '/':
1291 switch (c2) {
1292 case '/':
1293 switch (c3) {
1294 case '=':
1295 return DOUBLESLASHEQUAL;
1296 }
1297 break;
1298 }
1299 break;
1300 case '.':
1301 switch (c2) {
1302 case '.':
1303 switch (c3) {
1304 case '.':
1305 return ELLIPSIS;
1306 }
1307 break;
1308 }
1309 break;
1310 }
1311 return OP;
1312 }
1313
1314 static int
indenterror(struct tok_state * tok)1315 indenterror(struct tok_state *tok)
1316 {
1317 tok->done = E_TABSPACE;
1318 tok->cur = tok->inp;
1319 return ERRORTOKEN;
1320 }
1321
1322 #ifdef PGEN
1323 #define verify_identifier(tok) 1
1324 #else
1325 /* Verify that the identifier follows PEP 3131.
1326 All identifier strings are guaranteed to be "ready" unicode objects.
1327 */
1328 static int
verify_identifier(struct tok_state * tok)1329 verify_identifier(struct tok_state *tok)
1330 {
1331 PyObject *s;
1332 int result;
1333 if (tok->decoding_erred)
1334 return 0;
1335 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1336 if (s == NULL || PyUnicode_READY(s) == -1) {
1337 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1338 PyErr_Clear();
1339 tok->done = E_IDENTIFIER;
1340 } else {
1341 tok->done = E_ERROR;
1342 }
1343 return 0;
1344 }
1345 result = PyUnicode_IsIdentifier(s);
1346 Py_DECREF(s);
1347 if (result == 0)
1348 tok->done = E_IDENTIFIER;
1349 return result;
1350 }
1351 #endif
1352
1353 static int
tok_decimal_tail(struct tok_state * tok)1354 tok_decimal_tail(struct tok_state *tok)
1355 {
1356 int c;
1357
1358 while (1) {
1359 do {
1360 c = tok_nextc(tok);
1361 } while (isdigit(c));
1362 if (c != '_') {
1363 break;
1364 }
1365 c = tok_nextc(tok);
1366 if (!isdigit(c)) {
1367 tok->done = E_TOKEN;
1368 tok_backup(tok, c);
1369 return 0;
1370 }
1371 }
1372 return c;
1373 }
1374
/* Get next token, after space stripping etc.

   On success, *p_start and *p_end are set to delimit the token text
   inside the tokenizer's buffer and the token type is returned; on
   failure ERRORTOKEN is returned with tok->done describing the error.
   Control restarts at "nextline" after blank lines (and NEWLINEs inside
   brackets) and at "again" after an explicit line continuation. */

static int
tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;
        int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                /* col uses the configurable tab size; altcol uses
                   ALTTABSIZE (1), so comparing the two columns later
                   detects ambiguous tab/space indentation. */
                col = (col / tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
            }
            else if (c == '\014') {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is only significant outside brackets (level 0). */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    /* Peek ahead at the next character */
    c = tok_nextc(tok);
    tok_backup(tok, c);
    /* Check if we are closing an async function */
    if (tok->async_def
        && !blankline
        /* Due to some implementation artifacts of type comments,
         * a TYPE_COMMENT at the start of a function won't set an
         * indentation level and it will produce a NEWLINE after it.
         * To avoid spuriously ending an async function due to this,
         * wait until we have some non-newline char in front of us. */
        && c != '\n'
        && tok->level == 0
        /* There was a NEWLINE after ASYNC DEF,
           so we're past the signature. */
        && tok->async_def_nl
        /* Current indentation level is less than where
           the async function was defined */
        && tok->async_def_indent >= tok->indent)
    {
        tok->async_def = 0;
        tok->async_def_indent = 0;
        tok->async_def_nl = 0;
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, unless it's a type comment */
    if (c == '#') {
        const char *prefix, *p, *type_start;

        while (c != EOF && c != '\n')
            c = tok_nextc(tok);

        /* Match the comment body against type_comment_prefix
           (defined earlier in this file -- presumably "# type: ";
           a ' ' in the prefix matches any run of spaces/tabs). */
        p = tok->start;
        prefix = type_comment_prefix;
        while (*prefix && p < tok->cur) {
            if (*prefix == ' ') {
                while (*p == ' ' || *p == '\t')
                    p++;
            } else if (*prefix == *p) {
                p++;
            } else {
                break;
            }

            prefix++;
        }

        /* This is a type comment if we matched all of type_comment_prefix. */
        if (!*prefix) {
            int is_type_ignore = 1;
            const char *ignore_end = p + 6;
            tok_backup(tok, c); /* don't eat the newline or EOF */

            type_start = p;

            /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
             * or anything ASCII and non-alphanumeric. */
            is_type_ignore = (
                tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
                && !(tok->cur > ignore_end
                     && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

            if (is_type_ignore) {
                *p_start = (char *) ignore_end;
                *p_end = tok->cur;

                /* If this type ignore is the only thing on the line, consume the newline also. */
                if (blankline) {
                    tok_nextc(tok);
                    tok->atbol = 1;
                }
                return TYPE_IGNORE;
            } else {
                *p_start = (char *) type_start;  /* after type_comment_prefix */
                *p_end = tok->cur;
                return TYPE_COMMENT;
            }
        }
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                /* The prefix chars were a string prefix, not an identifier. */
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        /* Non-ASCII identifiers must pass the PEP 3131 check. */
        if (nonascii && !verify_identifier(tok)) {
            return ERRORTOKEN;
        }
        *p_start = tok->start;
        *p_end = tok->cur;

        /* async/await parsing block. */
        if (tok->cur - tok->start == 5) {
            /* Current token length is 5. */
            if (tok->async_always || tok->async_def) {
                /* We're inside an 'async def' function. */
                if (memcmp(tok->start, "async", 5) == 0) {
                    return ASYNC;
                }
                if (memcmp(tok->start, "await", 5) == 0) {
                    return AWAIT;
                }
            }
            else if (memcmp(tok->start, "async", 5) == 0) {
                /* The current token is 'async'.
                   Look ahead one token.*/

                struct tok_state ahead_tok;
                char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
                int ahead_tok_kind;

                /* Clone the whole tokenizer state so the lookahead does
                   not disturb the real position. */
                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
                                         &ahead_tok_end);

                if (ahead_tok_kind == NAME
                    && ahead_tok.cur - ahead_tok.start == 3
                    && memcmp(ahead_tok.start, "def", 3) == 0)
                {
                    /* The next token is going to be 'def', so instead of
                       returning 'async' NAME token, we return ASYNC. */
                    tok->async_def_indent = tok->indent;
                    tok->async_def = 1;
                    return ASYNC;
                }
            }
        }

        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0) {
            goto nextline;
        }
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        if (tok->async_def) {
            /* We're somewhere inside an 'async def' function, and
               we've encountered a NEWLINE after its signature. */
            tok->async_def_nl = 1;
        }
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            }
            else {
                tok_backup(tok, c);
            }
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number.  Underscores between digit groups are accepted in every
       radix; each underscore must be followed by another digit. */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (!isxdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isxdigit(c));
                } while (c == '_');
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero) {
                    /* Old-style octal: now disallowed. */
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return ERRORTOKEN;
            }
            {
                /* Accept floating point numbers. */
                if (c == '.') {
                    c = tok_nextc(tok);
        fraction:
                    /* Fraction */
                    if (isdigit(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return ERRORTOKEN;
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    } else if (!isdigit(c)) {
                        /* Not an exponent after all ("1e" is NUMBER "1"
                           followed by NAME "e"): push both chars back. */
                        tok_backup(tok, c);
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
                }
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */
        int end_quote_size = 0;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1;     /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (c == EOF) {
                if (quote_size == 3) {
                    tok->done = E_EOFS;
                }
                else {
                    tok->done = E_EOLS;
                }
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            if (quote_size == 1 && c == '\n') {
                tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    tok_nextc(tok);     /* skip escaped char */
                }
            }
        }

        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = Ta3Token_TwoChars(c, c2);
        if (token != OP) {
            int c3 = tok_nextc(tok);
            int token3 = Ta3Token_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            }
            else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return Ta3Token_OneChar(c);
}
1961
1962 int
Ta3Tokenizer_Get(struct tok_state * tok,char ** p_start,char ** p_end)1963 Ta3Tokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1964 {
1965 int result = tok_get(tok, p_start, p_end);
1966 if (tok->decoding_erred) {
1967 result = ERRORTOKEN;
1968 tok->done = E_DECODE;
1969 }
1970 return result;
1971 }
1972
1973 /* Get the encoding of a Python file. Check for the coding cookie and check if
1974 the file starts with a BOM.
1975
1976 Ta3Tokenizer_FindEncodingFilename() returns NULL when it can't find the
1977 encoding in the first or second line of the file (in which case the encoding
1978 should be assumed to be UTF-8).
1979
1980 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1981 by the caller. */
1982
1983 char *
Ta3Tokenizer_FindEncodingFilename(int fd,PyObject * filename)1984 Ta3Tokenizer_FindEncodingFilename(int fd, PyObject *filename)
1985 {
1986 struct tok_state *tok;
1987 FILE *fp;
1988 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
1989
1990 #ifndef PGEN
1991 fd = _Py_dup(fd);
1992 #else
1993 fd = dup(fd);
1994 #endif
1995 if (fd < 0) {
1996 return NULL;
1997 }
1998
1999 fp = fdopen(fd, "r");
2000 if (fp == NULL) {
2001 return NULL;
2002 }
2003 tok = Ta3Tokenizer_FromFile(fp, NULL, NULL, NULL);
2004 if (tok == NULL) {
2005 fclose(fp);
2006 return NULL;
2007 }
2008 #ifndef PGEN
2009 if (filename != NULL) {
2010 Py_INCREF(filename);
2011 tok->filename = filename;
2012 }
2013 else {
2014 tok->filename = PyUnicode_FromString("<string>");
2015 if (tok->filename == NULL) {
2016 fclose(fp);
2017 Ta3Tokenizer_Free(tok);
2018 return encoding;
2019 }
2020 }
2021 #endif
2022 while (tok->lineno < 2 && tok->done == E_OK) {
2023 Ta3Tokenizer_Get(tok, &p_start, &p_end);
2024 }
2025 fclose(fp);
2026 if (tok->encoding) {
2027 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
2028 if (encoding)
2029 strcpy(encoding, tok->encoding);
2030 }
2031 Ta3Tokenizer_Free(tok);
2032 return encoding;
2033 }
2034
2035 char *
Ta3Tokenizer_FindEncoding(int fd)2036 Ta3Tokenizer_FindEncoding(int fd)
2037 {
2038 return Ta3Tokenizer_FindEncodingFilename(fd, NULL);
2039 }
2040
2041 #ifdef Py_DEBUG
2042
2043 void
tok_dump(int type,char * start,char * end)2044 tok_dump(int type, char *start, char *end)
2045 {
2046 printf("%s", _Ta3Parser_TokenNames[type]);
2047 if (type == NAME || type == NUMBER || type == STRING || type == OP)
2048 printf("(%.*s)", (int)(end - start), start);
2049 }
2050
2051 #endif
2052