1
2 /* Tokenizer implementation */
3
4 #include "Python.h"
5 #include "../Include/pgenheaders.h"
6
7 #include <ctype.h>
8 #include <assert.h>
9
10 #include "tokenizer.h"
11 #include "../Include/errcode.h"
12
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #endif /* PGEN */
20
21 #if PY_MINOR_VERSION >= 4
22 PyAPI_FUNC(char *) PyOS_Readline(FILE *, FILE *, const char *);
23 #else
24 PyAPI_FUNC(char *) PyOS_Readline(FILE *, FILE *, char *);
25 #endif
26 /* Return malloc'ed string including trailing \n;
27 empty malloc'ed string for EOF;
28 NULL if interrupted */
29
30 /* Don't ever change this -- it would break the portability of Python code */
31 #define TABSIZE 8
32
33 /* Forward */
34 static struct tok_state *tok_new(void);
35 static int tok_nextc(struct tok_state *tok);
36 static void tok_backup(struct tok_state *tok, int c);
37
38 /* Token names */
39
/* Token names, indexed by token type number: the string at index i is
   the printable name of the token whose numeric value is i, so the
   order of entries must exactly follow the token numbering. */
char *_Ta27Parser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "RARROW",
    "TYPE_IGNORE",
    "TYPE_COMMENT",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
100
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
   tokenizing.  Prefix that introduces a "# type:" comment, which the
   tokenizer reports as a TYPE_COMMENT (or TYPE_IGNORE) token. */
static const char* type_comment_prefix = "# type: ";
104
/* Create and initialize a new tok_state structure.
   Returns NULL on allocation failure.  The caller owns the result and
   must release it with Ta27Tokenizer_Free(). */

static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
        sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    /* No buffer yet; the FromString/FromUTF8/FromFile constructors
       attach one. */
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_OK;
    tok->fp = NULL;                 /* set only by Ta27Tokenizer_FromFile */
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;                /* indentation stack holds only level 0 */
    tok->indstack[0] = 0;
    tok->atbol = 1;                 /* we start at the beginning of a line */
    tok->pendin = 0;                /* no pending INDENT/DEDENT tokens */
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;                 /* grouping depth; indentation is ignored
                                       while level > 0 (see tok_get) */
    tok->filename = NULL;
    tok->altwarning = 0;
    tok->alterror = 0;
    tok->alttabsize = 1;            /* alternate tab size, used to detect
                                       inconsistent tab/space indentation */
    tok->altindstack[0] = 0;
    tok->decoding_state = 0;        /* 0: undetermined, >0: raw bytes,
                                       <0: reading through a codec */
    tok->decoding_erred = 0;
    tok->read_coding_spec = 0;
    tok->encoding = NULL;
    tok->cont_line = 0;
#ifndef PGEN
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
#endif
    return tok;
}
142
143 static char *
new_string(const char * s,Py_ssize_t len)144 new_string(const char *s, Py_ssize_t len)
145 {
146 char* result = (char *)PyMem_MALLOC(len + 1);
147 if (result != NULL) {
148 memcpy(result, s, len);
149 result[len] = '\0';
150 }
151 return result;
152 }
153
154 #ifdef PGEN
155
/* PGEN build: no codec machinery, read raw bytes straight from the file. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}
161
/* PGEN build: end-of-file test is plain feof on the underlying stream. */
static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}
167
/* PGEN build: no decoding is performed; just return a malloc'ed copy
   of STR.  exec_input and tok are unused in this configuration. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str));
}
173
174 #else /* PGEN */
175
/* Record a decoding error on TOK and return NULL, which callers treat
   as if EOF had been reached.  The line buffer is freed here only for
   file-based input because Ta27Tokenizer_Free frees tok->buf only when
   tok->fp is set; all buffer pointers are reset to NULL afterwards. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see Ta27Tokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_DECODE;
    return NULL; /* as if it were EOF */
}
186
187
/* Canonicalize an encoding name: the common spellings of utf-8 and
   latin-1 (any case, '_' or '-' separators, optional suffix such as
   "utf-8-sig") map to "utf-8" / "iso-8859-1".  Anything unrecognized
   is returned unchanged.  The result is either a string literal or S
   itself -- never a fresh allocation. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char lowered[13];
    int n = 0;

    /* Lowercase the first 12 characters, folding '_' to '-'. */
    while (n < 12 && s[n] != '\0') {
        lowered[n] = (s[n] == '_') ? '-' : tolower(s[n]);
        n++;
    }
    lowered[n] = '\0';

    if (strcmp(lowered, "utf-8") == 0 ||
        strncmp(lowered, "utf-8-", 6) == 0)
        return "utf-8";
    if (strcmp(lowered, "latin-1") == 0 ||
        strcmp(lowered, "iso-8859-1") == 0 ||
        strcmp(lowered, "iso-latin-1") == 0 ||
        strncmp(lowered, "latin-1-", 8) == 0 ||
        strncmp(lowered, "iso-8859-1-", 11) == 0 ||
        strncmp(lowered, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    return s;
}
216
/* Return the coding spec in S, or NULL if none is found.  S is one
   source line of SIZE bytes.  A spec may only appear inside a comment
   that is the only thing on the line ("coding: NAME" or "coding=NAME").
   On success returns a fresh malloc'ed, canonicalized name owned by
   the caller; NULL means no spec found (or allocation failure). */

static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
    Py_ssize_t i;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;    /* real code precedes the comment */
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (strncmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* Skip spaces/tabs after the separator. */
            do {
                t++;
            } while (t[0] == '\x20' || t[0] == '\t');

            /* The name itself: alphanumerics plus '-', '_', '.'. */
            begin = t;
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin);
                char* q;
                if (!r)
                    return NULL;
                /* Canonicalize; get_normal_name may return a static
                   string, in which case re-copy so the caller always
                   owns a heap string. */
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_FREE(r);
                    r = new_string(q, strlen(q));
                }
                return r;
            }
        }
    }
    return NULL;
}
263
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.

   Side effects: may set tok->encoding (taking ownership of the
   malloc'ed spec string), tok->decoding_state and
   tok->read_coding_spec (which stops further spec searching). */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->read_coding_spec = 1;
        return 1;
    }
    cs = get_coding_spec(line, size);
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->read_coding_spec = 1;
                break;
            }
        }
    } else {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == 1); /* raw */
            /* utf-8 and iso-8859-1 pass bytes through unchanged, so no
               codec needs to be installed for them. */
            if (strcmp(cs, "utf-8") == 0 ||
                strcmp(cs, "iso-8859-1") == 0) {
                tok->encoding = cs;     /* tok takes ownership of cs */
            } else {
#ifdef Py_USING_UNICODE
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs; /* tok takes ownership of cs */
                    tok->decoding_state = -1;
                }
                else {
                    PyErr_Format(PyExc_SyntaxError,
                                 "encoding problem: %s", cs);
                    PyMem_FREE(cs);
                }
#else
                /* Without Unicode support, we cannot
                   process the coding spec. Since there
                   won't be any Unicode literals, that
                   won't matter. */
                PyMem_FREE(cs);
#endif
            }
        } else {                /* then, compare cs with BOM */
            /* A BOM already fixed the encoding; the spec must agree. */
            r = (strcmp(tok->encoding, cs) == 0);
            if (!r)
                PyErr_Format(PyExc_SyntaxError,
                             "encoding problem: %s with BOM", cs);
            PyMem_FREE(cs);
        }
    }
    return r;
}
331
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.

   The get_char/unget_char callbacks abstract over input sources, so
   this works both for files (fp_getc/fp_ungetc) and for in-memory
   strings (buf_getc/buf_ungetc). */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = 1;    /* assume raw bytes until told otherwise */
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possibly a UTF-8 BOM (EF BB BF); if the remaining bytes do
           not match, push everything back in reverse order. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported. */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = -1;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = -1;
#endif
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    /* A UTF-8 BOM was consumed: record the encoding, replacing any
       previously recorded one. */
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
    return 1;
}
394
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyBytesObject *: previous call to fp_readl did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline. tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        /* Case 1: fetch a fresh line from the codec's readline. */
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(buf)) {
            Py_DECREF(buf);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    } else {
        /* Cases 2 and 3: consume the stashed object. */
        tok->decoding_buffer = NULL;
        if (PyBytes_CheckExact(buf))
            utf8 = buf;     /* overflow bytes are already UTF-8 encoded */
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyBytes_AsString(utf8);
    utf8len = PyBytes_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Line longer than the caller's buffer: stash the tail so the
           next call picks it up (case 3 above). */
        tok->decoding_buffer = PyBytes_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}
466
467 /* Set the readline function for TOK to a StreamReader's
468 readline function. The StreamReader is named ENC.
469
470 This function is called from check_bom and check_coding_spec.
471
472 ENC is usually identical to the future value of tok->encoding,
473 except for the (currently unsupported) case of UTF-16.
474
475 Return 1 on success, 0 on failure. */
476
477 /* taken from Python 3.5.1 */
478
479 static int
fp_setreadl(struct tok_state * tok,const char * enc)480 fp_setreadl(struct tok_state *tok, const char* enc)
481 {
482 PyObject *readline = NULL, *stream = NULL, *io = NULL;
483 _Py_IDENTIFIER(open);
484 _Py_IDENTIFIER(readline);
485 int fd;
486 long pos;
487
488 io = PyImport_ImportModuleNoBlock("io");
489 if (io == NULL)
490 goto cleanup;
491
492 fd = fileno(tok->fp);
493 /* Due to buffering the file offset for fd can be different from the file
494 * position of tok->fp. If tok->fp was opened in text mode on Windows,
495 * its file position counts CRLF as one char and can't be directly mapped
496 * to the file offset for fd. Instead we step back one byte and read to
497 * the end of line.*/
498 pos = ftell(tok->fp);
499 if (pos == -1 ||
500 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
501 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
502 goto cleanup;
503 }
504
505 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
506 fd, "r", -1, enc, Py_None, Py_None, Py_False);
507 if (stream == NULL)
508 goto cleanup;
509
510 Py_XDECREF(tok->decoding_readline);
511 readline = _PyObject_GetAttrId(stream, &PyId_readline);
512 tok->decoding_readline = readline;
513 if (pos > 0) {
514 if (PyObject_CallObject(readline, NULL) == NULL) {
515 readline = NULL;
516 goto cleanup;
517 }
518 }
519
520 cleanup:
521 Py_XDECREF(stream);
522 Py_XDECREF(io);
523 return readline != NULL;
524 }
525
/* Fetch the next byte from TOK.  Raw reader used by check_bom for
   file-based input; returns EOF at end of file. */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}
531
/* Unfetch the last byte back into TOK (pushback companion of fp_getc). */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
537
/* Read a line of input from TOK. Determine encoding
   if necessary.  Returns S on success, NULL on EOF or (via error_ret)
   on a decoding error. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    /* Loop until decoding_state is decided, then do one read. */
    for (;;) {
        if (tok->decoding_state < 0) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state > 0) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != 0);
        }
    }
    /* Only the first two lines may carry a coding declaration. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is ASCII, so make sure we don't have any
       non-ASCII bytes in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        for (c = (unsigned char *)line; *c; c++)
            if (*c > 127) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        sprintf(buf,
                "Non-ASCII character '\\x%.2x' "
                "in file %.200s on line %i, "
                "but no encoding declared; "
                "see http://python.org/dev/peps/pep-0263/ for details",
                badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}
598
/* Return non-zero when TOK's input is exhausted.  For raw reads this
   is plain feof(); for codec input we must attempt a readline and
   stash the (possibly empty) result in tok->decoding_buffer so
   fp_readl can consume it later. */
static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state >= 0) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                error_ret(tok);
                return 1;   /* treat a read error as end-of-file */
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}
618
/* Fetch a byte from TOK, using the string buffer.  Advances tok->str;
   Py_CHARMASK keeps the result in the 0..255 range. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}
625
/* Unfetch a byte from TOK, using the string buffer.  Only rewinds the
   pointer -- it never writes, since the buffer may be read-only; the
   assert verifies the pushed-back byte matches what is there. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}
633
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;   /* always succeeds */
}
642
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.  Returns a new
   reference, or NULL (with an exception set) on decode failure. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
#endif
658
659
660 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)661 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
662 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
663 char *buf, *current;
664 char c = '\0';
665 buf = PyMem_MALLOC(needed_length);
666 if (buf == NULL) {
667 tok->done = E_NOMEM;
668 return NULL;
669 }
670 for (current = buf; *s; s++, current++) {
671 c = *s;
672 if (skip_next_lf) {
673 skip_next_lf = 0;
674 if (c == '\n') {
675 c = *++s;
676 if (!c)
677 break;
678 }
679 }
680 if (c == '\r') {
681 skip_next_lf = 1;
682 c = '\n';
683 }
684 *current = c;
685 }
686 /* If this is exec input, add a newline to the end of the string if
687 there isn't one already. */
688 if (exec_input && c != '\n') {
689 *current = '\n';
690 current++;
691 }
692 *current = '\0';
693 final_length = current - buf + 1;
694 if (final_length < needed_length && final_length)
695 /* should never fail */
696 buf = PyMem_REALLOC(buf, final_length);
697 return buf;
698 }
699
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.  Returns the decoded (UTF-8) text or NULL after
   calling error_ret on failure.  When re-encoding happened, the
   returned pointer is owned by the PyBytes object stored in
   tok->decoding_buffer. */

static const char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* A BOM selected an encoding: recode the buffer to UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
#endif
    /* Locate the first two newlines: only the first two lines may
       carry a coding declaration. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* A coding spec was found: recode the buffer to UTF-8. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
#endif
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
763
764 #endif /* PGEN */
765
/* Set up tokenizer for string.  STR is a NUL-terminated byte string;
   EXEC_INPUT is forwarded to translate_newlines (via decode_str) so
   exec-style input gets a trailing newline.  Returns NULL on error. */

struct tok_state *
Ta27Tokenizer_FromString(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    str = (char *)decode_str(str, exec_input, tok);
    if (str == NULL) {
        Ta27Tokenizer_Free(tok);
        return NULL;
    }

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}
784
/* adapted from Python 3.5.1 */
/* Set up tokenizer for a string that is already UTF-8 encoded:
   only newline translation is performed (non-PGEN builds), and the
   encoding machinery is marked as already resolved. */
struct tok_state *
Ta27Tokenizer_FromUTF8(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
#ifndef PGEN
    tok->input = str = translate_newlines(str, exec_input, tok);
#endif
    if (str == NULL) {
        Ta27Tokenizer_Free(tok);
        return NULL;
    }
    tok->decoding_state = 1;        /* raw reads; no codec needed */
    tok->read_coding_spec = 1;      /* skip coding-spec detection */
    tok->enc = NULL;
    tok->str = str;

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}
808
809
810 /* Set up tokenizer for file */
811
812 struct tok_state *
Ta27Tokenizer_FromFile(FILE * fp,char * ps1,char * ps2)813 Ta27Tokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
814 {
815 struct tok_state *tok = tok_new();
816 if (tok == NULL)
817 return NULL;
818 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
819 Ta27Tokenizer_Free(tok);
820 return NULL;
821 }
822 tok->cur = tok->inp = tok->buf;
823 tok->end = tok->buf + BUFSIZ;
824 tok->fp = fp;
825 tok->prompt = ps1;
826 tok->nextprompt = ps2;
827 return tok;
828 }
829
830
/* Free a tok_state structure and everything it owns: the encoding
   name, the codec readline callable and pending decoded data
   (non-PGEN builds), the line buffer (owned only when reading from a
   file -- for string input tok->buf aliases the input), and the
   translated copy of string input. */

void
Ta27Tokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    if (tok->input)
        PyMem_FREE((char *)tok->input);
    PyMem_FREE(tok);
}
848
849 /* Get next char, updating state; error code goes into tok->done */
850 /* taken from Python 3.5.1 */
851 static int
tok_nextc(struct tok_state * tok)852 tok_nextc(struct tok_state *tok)
853 {
854 for (;;) {
855 if (tok->cur != tok->inp) {
856 return Py_CHARMASK(*tok->cur++); /* Fast path */
857 }
858 if (tok->done != E_OK)
859 return EOF;
860 if (tok->fp == NULL) {
861 char *end = strchr(tok->inp, '\n');
862 if (end != NULL)
863 end++;
864 else {
865 end = strchr(tok->inp, '\0');
866 if (end == tok->inp) {
867 tok->done = E_EOF;
868 return EOF;
869 }
870 }
871 if (tok->start == NULL)
872 tok->buf = tok->cur;
873 tok->line_start = tok->cur;
874 tok->lineno++;
875 tok->inp = end;
876 return Py_CHARMASK(*tok->cur++);
877 }
878 if (tok->prompt != NULL) {
879 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
880 #ifndef PGEN
881 if (newtok != NULL) {
882 char *translated = translate_newlines(newtok, 0, tok);
883 PyMem_FREE(newtok);
884 if (translated == NULL)
885 return EOF;
886 newtok = translated;
887 }
888 if (tok->encoding && newtok && *newtok) {
889 /* Recode to UTF-8 */
890 Py_ssize_t buflen;
891 const char* buf;
892 PyObject *u = translate_into_utf8(newtok, tok->encoding);
893 PyMem_FREE(newtok);
894 if (!u) {
895 tok->done = E_DECODE;
896 return EOF;
897 }
898 buflen = PyBytes_GET_SIZE(u);
899 buf = PyBytes_AS_STRING(u);
900 newtok = PyMem_MALLOC(buflen+1);
901 strcpy(newtok, buf);
902 Py_DECREF(u);
903 }
904 #endif
905 if (tok->nextprompt != NULL)
906 tok->prompt = tok->nextprompt;
907 if (newtok == NULL)
908 tok->done = E_INTR;
909 else if (*newtok == '\0') {
910 PyMem_FREE(newtok);
911 tok->done = E_EOF;
912 }
913 else if (tok->start != NULL) {
914 size_t start = tok->start - tok->buf;
915 size_t oldlen = tok->cur - tok->buf;
916 size_t newlen = oldlen + strlen(newtok);
917 char *buf = tok->buf;
918 buf = (char *)PyMem_REALLOC(buf, newlen+1);
919 tok->lineno++;
920 if (buf == NULL) {
921 PyMem_FREE(tok->buf);
922 tok->buf = NULL;
923 PyMem_FREE(newtok);
924 tok->done = E_NOMEM;
925 return EOF;
926 }
927 tok->buf = buf;
928 tok->cur = tok->buf + oldlen;
929 tok->line_start = tok->cur;
930 strcpy(tok->buf + oldlen, newtok);
931 PyMem_FREE(newtok);
932 tok->inp = tok->buf + newlen;
933 tok->end = tok->inp + 1;
934 tok->start = tok->buf + start;
935 }
936 else {
937 tok->lineno++;
938 if (tok->buf != NULL)
939 PyMem_FREE(tok->buf);
940 tok->buf = newtok;
941 tok->cur = tok->buf;
942 tok->line_start = tok->buf;
943 tok->inp = strchr(tok->buf, '\0');
944 tok->end = tok->inp + 1;
945 }
946 }
947 else {
948 int done = 0;
949 Py_ssize_t cur = 0;
950 char *pt;
951 if (tok->start == NULL) {
952 if (tok->buf == NULL) {
953 tok->buf = (char *)
954 PyMem_MALLOC(BUFSIZ);
955 if (tok->buf == NULL) {
956 tok->done = E_NOMEM;
957 return EOF;
958 }
959 tok->end = tok->buf + BUFSIZ;
960 }
961 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
962 tok) == NULL) {
963 if (!tok->decoding_erred)
964 tok->done = E_EOF;
965 done = 1;
966 }
967 else {
968 tok->done = E_OK;
969 tok->inp = strchr(tok->buf, '\0');
970 done = tok->inp[-1] == '\n';
971 }
972 }
973 else {
974 cur = tok->cur - tok->buf;
975 if (decoding_feof(tok)) {
976 tok->done = E_EOF;
977 done = 1;
978 }
979 else
980 tok->done = E_OK;
981 }
982 tok->lineno++;
983 /* Read until '\n' or EOF */
984 while (!done) {
985 Py_ssize_t curstart = tok->start == NULL ? -1 :
986 tok->start - tok->buf;
987 Py_ssize_t curvalid = tok->inp - tok->buf;
988 Py_ssize_t newsize = curvalid + BUFSIZ;
989 char *newbuf = tok->buf;
990 newbuf = (char *)PyMem_REALLOC(newbuf,
991 newsize);
992 if (newbuf == NULL) {
993 tok->done = E_NOMEM;
994 tok->cur = tok->inp;
995 return EOF;
996 }
997 tok->buf = newbuf;
998 tok->cur = tok->buf + cur;
999 tok->line_start = tok->cur;
1000 tok->inp = tok->buf + curvalid;
1001 tok->end = tok->buf + newsize;
1002 tok->start = curstart < 0 ? NULL :
1003 tok->buf + curstart;
1004 if (decoding_fgets(tok->inp,
1005 (int)(tok->end - tok->inp),
1006 tok) == NULL) {
1007 /* Break out early on decoding
1008 errors, as tok->buf will be NULL
1009 */
1010 if (tok->decoding_erred)
1011 return EOF;
1012 /* Last line does not end in \n,
1013 fake one */
1014 strcpy(tok->inp, "\n");
1015 }
1016 tok->inp = strchr(tok->inp, '\0');
1017 done = tok->inp[-1] == '\n';
1018 }
1019 if (tok->buf != NULL) {
1020 tok->cur = tok->buf + cur;
1021 tok->line_start = tok->cur;
1022 /* replace "\r\n" with "\n" */
1023 /* For Mac leave the \r, giving a syntax error */
1024 pt = tok->inp - 2;
1025 if (pt >= tok->buf && *pt == '\r') {
1026 *pt++ = '\n';
1027 *pt = '\0';
1028 tok->inp = pt;
1029 }
1030 }
1031 }
1032 if (tok->done != E_OK) {
1033 if (tok->prompt != NULL)
1034 PySys_WriteStderr("\n");
1035 tok->cur = tok->inp;
1036 return EOF;
1037 }
1038 }
1039 /*NOTREACHED*/
1040 }
1041
1042
/* Back-up one character: push C back so the next tok_nextc returns it
   again.  EOF is ignored.  Fatal error if this would move before the
   start of the buffer.  The store is guarded so it is skipped when the
   buffer already contains C -- tok->cur may point into a read-only
   segment (cf. buf_ungetc). */

static void
tok_backup(register struct tok_state *tok, register int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: beginning of buffer");
        if (*tok->cur != c)
            *tok->cur = c;
    }
}
1055
1056
1057 /* Return the token corresponding to a single character */
1058
1059 int
Ta27Token_OneChar(int c)1060 Ta27Token_OneChar(int c)
1061 {
1062 switch (c) {
1063 case '(': return LPAR;
1064 case ')': return RPAR;
1065 case '[': return LSQB;
1066 case ']': return RSQB;
1067 case ':': return COLON;
1068 case ',': return COMMA;
1069 case ';': return SEMI;
1070 case '+': return PLUS;
1071 case '-': return MINUS;
1072 case '*': return STAR;
1073 case '/': return SLASH;
1074 case '|': return VBAR;
1075 case '&': return AMPER;
1076 case '<': return LESS;
1077 case '>': return GREATER;
1078 case '=': return EQUAL;
1079 case '.': return DOT;
1080 case '%': return PERCENT;
1081 case '`': return BACKQUOTE;
1082 case '{': return LBRACE;
1083 case '}': return RBRACE;
1084 case '^': return CIRCUMFLEX;
1085 case '~': return TILDE;
1086 case '@': return AT;
1087 default: return OP;
1088 }
1089 }
1090
1091
1092 int
Ta27Token_TwoChars(int c1,int c2)1093 Ta27Token_TwoChars(int c1, int c2)
1094 {
1095 switch (c1) {
1096 case '=':
1097 switch (c2) {
1098 case '=': return EQEQUAL;
1099 }
1100 break;
1101 case '!':
1102 switch (c2) {
1103 case '=': return NOTEQUAL;
1104 }
1105 break;
1106 case '<':
1107 switch (c2) {
1108 case '>': return NOTEQUAL;
1109 case '=': return LESSEQUAL;
1110 case '<': return LEFTSHIFT;
1111 }
1112 break;
1113 case '>':
1114 switch (c2) {
1115 case '=': return GREATEREQUAL;
1116 case '>': return RIGHTSHIFT;
1117 }
1118 break;
1119 case '+':
1120 switch (c2) {
1121 case '=': return PLUSEQUAL;
1122 }
1123 break;
1124 case '-':
1125 switch (c2) {
1126 case '=': return MINEQUAL;
1127 case '>': return RARROW;
1128 }
1129 break;
1130 case '*':
1131 switch (c2) {
1132 case '*': return DOUBLESTAR;
1133 case '=': return STAREQUAL;
1134 }
1135 break;
1136 case '/':
1137 switch (c2) {
1138 case '/': return DOUBLESLASH;
1139 case '=': return SLASHEQUAL;
1140 }
1141 break;
1142 case '|':
1143 switch (c2) {
1144 case '=': return VBAREQUAL;
1145 }
1146 break;
1147 case '%':
1148 switch (c2) {
1149 case '=': return PERCENTEQUAL;
1150 }
1151 break;
1152 case '&':
1153 switch (c2) {
1154 case '=': return AMPEREQUAL;
1155 }
1156 break;
1157 case '^':
1158 switch (c2) {
1159 case '=': return CIRCUMFLEXEQUAL;
1160 }
1161 break;
1162 }
1163 return OP;
1164 }
1165
1166 int
Ta27Token_ThreeChars(int c1,int c2,int c3)1167 Ta27Token_ThreeChars(int c1, int c2, int c3)
1168 {
1169 switch (c1) {
1170 case '<':
1171 switch (c2) {
1172 case '<':
1173 switch (c3) {
1174 case '=':
1175 return LEFTSHIFTEQUAL;
1176 }
1177 break;
1178 }
1179 break;
1180 case '>':
1181 switch (c2) {
1182 case '>':
1183 switch (c3) {
1184 case '=':
1185 return RIGHTSHIFTEQUAL;
1186 }
1187 break;
1188 }
1189 break;
1190 case '*':
1191 switch (c2) {
1192 case '*':
1193 switch (c3) {
1194 case '=':
1195 return DOUBLESTAREQUAL;
1196 }
1197 break;
1198 }
1199 break;
1200 case '/':
1201 switch (c2) {
1202 case '/':
1203 switch (c3) {
1204 case '=':
1205 return DOUBLESLASHEQUAL;
1206 }
1207 break;
1208 }
1209 break;
1210 }
1211 return OP;
1212 }
1213
1214 static int
indenterror(struct tok_state * tok)1215 indenterror(struct tok_state *tok)
1216 {
1217 if (tok->alterror) {
1218 tok->done = E_TABSPACE;
1219 tok->cur = tok->inp;
1220 return 1;
1221 }
1222 if (tok->altwarning) {
1223 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1224 "in indentation\n", tok->filename);
1225 tok->altwarning = 0;
1226 }
1227 return 0;
1228 }
1229
1230 /* Get next token, after space stripping etc. */
1231
/* Core scanner: return the type of the next token and set *p_start /
   *p_end to the token's text within the tokenizer's buffer.  On error,
   sets tok->done to an E_* code and returns ERRORTOKEN.  Handles
   indentation (INDENT/DEDENT bookkeeping via tok->pendin), comments
   (including tab-setting magic and "# type:" comments), identifiers,
   string prefixes, numbers, strings, line continuations, and one/two/
   three-character operator tokens. */
static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        register int col = 0;
        /* altcol recomputes the column with the alternate tab size so
           that tab/space ambiguity can be detected (see indenterror). */
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is only significant outside parentheses
           (tok->level == 0) and on non-blank lines. */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents: one INDENT/DEDENT token is
       emitted per call until tok->pendin reaches zero. */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic and type comments */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",                   /* Emacs */
            ":tabstop=",                    /* vim, full form */
            ":ts=",                         /* vim, abbreviated form */
            "set tabsize=",                 /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;

        /* used for type comment checks */
        const char *prefix, *p, *type_start;

        /* Copy (at most sizeof(cbuf)-1 chars of) the comment so the
           editor tab-setting templates can be searched for below. */
        tp = cbuf;
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                        "Tab size set to %d\n",
                        newsize);
                }
            }
        }
        /* Consume the remainder of the comment line. */
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);

        /* check for type comment */

        /* Match type_comment_prefix ("# type:" with flexible internal
           whitespace: a ' ' in the prefix matches any run of blanks). */
        p = tok->start;
        prefix = type_comment_prefix;
        while (*prefix && p < tok->cur) {
            if (*prefix == ' ') {
                while (*p == ' ' || *p == '\t')
                    p++;
            } else if (*prefix == *p) {
                p++;
            } else {
                break;
            }

            prefix++;
        }

        /* This is a type comment if we matched all of type_comment_prefix. */
        if (!*prefix) {
            int is_type_ignore = 1;
            const char *ignore_end = p + 6;  /* p + strlen("ignore") */
            tok_backup(tok, c);  /* don't eat the newline or EOF */

            type_start = p;

            /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
             * or anything ASCII and non-alphanumeric. */
            is_type_ignore = (
                tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
                && !(tok->cur > ignore_end
                     && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

            if (is_type_ignore) {
                *p_start = (char *) ignore_end;
                *p_end = tok->cur;

                /* If this type ignore is the only thing on the line, consume the newline also. */
                if (blankline) {
                    tok_nextc(tok);
                    tok->atbol = 1;
                }
                return TYPE_IGNORE;
            } else {
                *p_start = (char *) type_start;  /* after type_comment_prefix */
                *p_end = tok->cur;
                return TYPE_COMMENT;
            }
        }

    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (Py_ISALPHA(c) || c == '_') {
        /* Process r"", u"" and ur"" -- string prefixes hand off to the
           string scanner via letter_quote. */
        switch (c) {
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* Newlines inside brackets or on blank lines are not tokens. */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {

                /* Hex */
                c = tok_nextc(tok);
                if (!isxdigit(c)) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                if (c < '0' || c >= '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                /* Digits 8/9 after a leading 0 are only legal if this
                   turns out to be a float or imaginary literal. */
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            /* Optional long-integer suffix (Python 2). */
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    } else if (!isdigit(c)) {
                        /* Not an exponent after all (e.g. "1if"):
                           push back both chars and end the number. */
                        tok_backup(tok, c);
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2: distance from token start to just past a second quote
           char -- used below to detect an empty string vs. the start of
           a triple-quoted string. */
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        int tripcount = 0;  /* consecutive closing-quote chars seen */
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    /* Second quote right after the first: either an
                       empty string or the opener of a triple quote. */
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                tripcount = 0;
                c = tok_nextc(tok);  /* skip the escaped character */
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = Ta27Token_TwoChars(c, c2);
#ifndef PGEN
        if (token == NOTEQUAL && c == '<') {
            if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
                                   "<> not supported in 3.x; use !=",
                                   tok->filename, tok->lineno,
                                   NULL, NULL)) {
                return ERRORTOKEN;
            }
        }
#endif
        if (token != OP) {
            /* A two-char operator may extend to three chars. */
            int c3 = tok_nextc(tok);
            int token3 = Ta27Token_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return Ta27Token_OneChar(c);
}
1748
1749 int
Ta27Tokenizer_Get(struct tok_state * tok,char ** p_start,char ** p_end)1750 Ta27Tokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1751 {
1752 int result = tok_get(tok, p_start, p_end);
1753 if (tok->decoding_erred) {
1754 result = ERRORTOKEN;
1755 tok->done = E_DECODE;
1756 }
1757 return result;
1758 }
1759
1760 /* This function is only called from parsetok. However, it cannot live
1761 there, as it must be empty for PGEN, and we can check for PGEN only
1762 in this file. */
1763
1764 #if defined(PGEN) || !defined(Py_USING_UNICODE)
char*
Ta27Tokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    /* Stub for PGEN / non-unicode builds: there is no source encoding
       to restore, so report "no translated text" with NULL. */
    return NULL;
}
1770 #else
1771 #ifdef Py_USING_UNICODE
1772 static PyObject *
dec_utf8(const char * enc,const char * text,size_t len)1773 dec_utf8(const char *enc, const char *text, size_t len) {
1774 PyObject *ret = NULL;
1775 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1776 if (unicode_text) {
1777 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1778 Py_DECREF(unicode_text);
1779 }
1780 if (!ret) {
1781 PyErr_Clear();
1782 }
1783 return ret;
1784 }
1785 char *
Ta27Tokenizer_RestoreEncoding(struct tok_state * tok,int len,int * offset)1786 Ta27Tokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1787 {
1788 char *text = NULL;
1789 if (tok->encoding) {
1790 /* convert source to original encondig */
1791 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1792 if (lineobj != NULL) {
1793 int linelen = PyBytes_Size(lineobj);
1794 const char *line = PyBytes_AsString(lineobj);
1795 text = PyObject_MALLOC(linelen + 1);
1796 if (text != NULL && line != NULL) {
1797 if (linelen)
1798 strncpy(text, line, linelen);
1799 text[linelen] = '\0';
1800 }
1801 Py_DECREF(lineobj);
1802
1803 /* adjust error offset */
1804 if (*offset > 1) {
1805 PyObject *offsetobj = dec_utf8(tok->encoding,
1806 tok->buf, *offset-1);
1807 if (offsetobj) {
1808 *offset = PyBytes_Size(offsetobj) + 1;
1809 Py_DECREF(offsetobj);
1810 }
1811 }
1812
1813 }
1814 }
1815 return text;
1816
1817 }
1818 #endif /* defined(Py_USING_UNICODE) */
1819 #endif
1820
1821
1822 #ifdef Py_DEBUG
1823
1824 void
tok_dump(int type,char * start,char * end)1825 tok_dump(int type, char *start, char *end)
1826 {
1827 printf("%s", _Ta27Parser_TokenNames[type]);
1828 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1829 printf("(%.*s)", (int)(end - start), start);
1830 }
1831
1832 #endif
1833