1 
2 /* Tokenizer implementation */
3 
4 #include "Python.h"
5 #include "../Include/pgenheaders.h"
6 
7 #include <ctype.h>
8 #include <assert.h>
9 
10 #include "tokenizer.h"
11 #include "../Include/errcode.h"
12 
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #endif /* PGEN */
20 
21 #if PY_MINOR_VERSION >= 4
22 PyAPI_FUNC(char *) PyOS_Readline(FILE *, FILE *, const char *);
23 #else
24 PyAPI_FUNC(char *) PyOS_Readline(FILE *, FILE *, char *);
25 #endif
26 /* Return malloc'ed string including trailing \n;
27    empty malloc'ed string for EOF;
28    NULL if interrupted */
29 
30 /* Don't ever change this -- it would break the portability of Python code */
31 #define TABSIZE 8
32 
33 /* Forward */
34 static struct tok_state *tok_new(void);
35 static int tok_nextc(struct tok_state *tok);
36 static void tok_backup(struct tok_state *tok, int c);
37 
/* Token names, indexed by token number.

   This table must match the #defines in token.h: entry N is the printable
   name of token N, and the last two entries are the error token and the
   token-count sentinel. */

char *_Ta27Parser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* Brackets and separators */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* Remaining unary/binary operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "RARROW",
    "TYPE_IGNORE",
    "TYPE_COMMENT",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
100 
101 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
102    tokenizing. */
103 static const char* type_comment_prefix = "# type: ";
104 
105 /* Create and initialize a new tok_state structure */
106 
107 static struct tok_state *
tok_new(void)108 tok_new(void)
109 {
110     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
111                                             sizeof(struct tok_state));
112     if (tok == NULL)
113         return NULL;
114     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
115     tok->done = E_OK;
116     tok->fp = NULL;
117     tok->input = NULL;
118     tok->tabsize = TABSIZE;
119     tok->indent = 0;
120     tok->indstack[0] = 0;
121     tok->atbol = 1;
122     tok->pendin = 0;
123     tok->prompt = tok->nextprompt = NULL;
124     tok->lineno = 0;
125     tok->level = 0;
126     tok->filename = NULL;
127     tok->altwarning = 0;
128     tok->alterror = 0;
129     tok->alttabsize = 1;
130     tok->altindstack[0] = 0;
131     tok->decoding_state = 0;
132     tok->decoding_erred = 0;
133     tok->read_coding_spec = 0;
134     tok->encoding = NULL;
135     tok->cont_line = 0;
136 #ifndef PGEN
137     tok->decoding_readline = NULL;
138     tok->decoding_buffer = NULL;
139 #endif
140     return tok;
141 }
142 
143 static char *
new_string(const char * s,Py_ssize_t len)144 new_string(const char *s, Py_ssize_t len)
145 {
146     char* result = (char *)PyMem_MALLOC(len + 1);
147     if (result != NULL) {
148         memcpy(result, s, len);
149         result[len] = '\0';
150     }
151     return result;
152 }
153 
154 #ifdef PGEN
155 
156 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)157 decoding_fgets(char *s, int size, struct tok_state *tok)
158 {
159     return fgets(s, size, tok->fp);
160 }
161 
162 static int
decoding_feof(struct tok_state * tok)163 decoding_feof(struct tok_state *tok)
164 {
165     return feof(tok->fp);
166 }
167 
/* PGEN build: no decoding takes place; return a newly allocated copy of STR
   (NULL if the allocation fails).  EXEC_INPUT and TOK are not used. */

static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length);
}
173 
174 #else /* PGEN */
175 
176 static char *
error_ret(struct tok_state * tok)177 error_ret(struct tok_state *tok) /* XXX */
178 {
179     tok->decoding_erred = 1;
180     if (tok->fp != NULL && tok->buf != NULL) /* see Ta27Tokenizer_Free */
181         PyMem_FREE(tok->buf);
182     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
183     tok->done = E_DECODE;
184     return NULL;                /* as if it were EOF */
185 }
186 
187 
/* Map the common spellings of the UTF-8 and Latin-1 codec names to one
   canonical name.

   Only the first 12 characters of S are examined; they are lower-cased and
   '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S names (a variant of) one of those codecs, and S
   itself otherwise.  Callers can therefore detect "not normalized" by
   comparing the result with S; the returned literals must not be modified
   or freed. */

static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a byte >= 0x80
           where plain char is signed) to tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
216 
217 /* Return the coding spec in S, or NULL if none is found.  */
218 
219 static char *
get_coding_spec(const char * s,Py_ssize_t size)220 get_coding_spec(const char *s, Py_ssize_t size)
221 {
222     Py_ssize_t i;
223     /* Coding spec must be in a comment, and that comment must be
224      * the only statement on the source code line. */
225     for (i = 0; i < size - 6; i++) {
226         if (s[i] == '#')
227             break;
228         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
229             return NULL;
230     }
231     for (; i < size - 6; i++) { /* XXX inefficient search */
232         const char* t = s + i;
233         if (strncmp(t, "coding", 6) == 0) {
234             const char* begin = NULL;
235             t += 6;
236             if (t[0] != ':' && t[0] != '=')
237                 continue;
238             do {
239                 t++;
240             } while (t[0] == '\x20' || t[0] == '\t');
241 
242             begin = t;
243             while (Py_ISALNUM(t[0]) ||
244                    t[0] == '-' || t[0] == '_' || t[0] == '.')
245                 t++;
246 
247             if (begin < t) {
248                 char* r = new_string(begin, t - begin);
249                 char* q;
250                 if (!r)
251                     return NULL;
252                 q = get_normal_name(r);
253                 if (r != q) {
254                     PyMem_FREE(r);
255                     r = new_string(q, strlen(q));
256                 }
257                 return r;
258             }
259         }
260     }
261     return NULL;
262 }
263 
264 /* Check whether the line contains a coding spec. If it does,
265    invoke the set_readline function for the new encoding.
266    This function receives the tok_state and the new encoding.
267    Return 1 on success, 0 on failure.  */
268 
269 static int
check_coding_spec(const char * line,Py_ssize_t size,struct tok_state * tok,int set_readline (struct tok_state *,const char *))270 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
271                   int set_readline(struct tok_state *, const char *))
272 {
273     char * cs;
274     int r = 1;
275 
276     if (tok->cont_line) {
277         /* It's a continuation line, so it can't be a coding spec. */
278         tok->read_coding_spec = 1;
279         return 1;
280     }
281     cs = get_coding_spec(line, size);
282     if (!cs) {
283         Py_ssize_t i;
284         for (i = 0; i < size; i++) {
285             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
286                 break;
287             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
288                 /* Stop checking coding spec after a line containing
289                  * anything except a comment. */
290                 tok->read_coding_spec = 1;
291                 break;
292             }
293         }
294     } else {
295         tok->read_coding_spec = 1;
296         if (tok->encoding == NULL) {
297             assert(tok->decoding_state == 1); /* raw */
298             if (strcmp(cs, "utf-8") == 0 ||
299                 strcmp(cs, "iso-8859-1") == 0) {
300                 tok->encoding = cs;
301             } else {
302 #ifdef Py_USING_UNICODE
303                 r = set_readline(tok, cs);
304                 if (r) {
305                     tok->encoding = cs;
306                     tok->decoding_state = -1;
307                 }
308                 else {
309                     PyErr_Format(PyExc_SyntaxError,
310                                  "encoding problem: %s", cs);
311                     PyMem_FREE(cs);
312                 }
313 #else
314                 /* Without Unicode support, we cannot
315                    process the coding spec. Since there
316                    won't be any Unicode literals, that
317                    won't matter. */
318                 PyMem_FREE(cs);
319 #endif
320             }
321         } else {                /* then, compare cs with BOM */
322             r = (strcmp(tok->encoding, cs) == 0);
323             if (!r)
324                 PyErr_Format(PyExc_SyntaxError,
325                              "encoding problem: %s with BOM", cs);
326             PyMem_FREE(cs);
327         }
328     }
329     return r;
330 }
331 
332 /* See whether the file starts with a BOM. If it does,
333    invoke the set_readline function with the new encoding.
334    Return 1 on success, 0 on failure.  */
335 
336 static int
check_bom(int get_char (struct tok_state *),void unget_char (int,struct tok_state *),int set_readline (struct tok_state *,const char *),struct tok_state * tok)337 check_bom(int get_char(struct tok_state *),
338           void unget_char(int, struct tok_state *),
339           int set_readline(struct tok_state *, const char *),
340           struct tok_state *tok)
341 {
342     int ch1, ch2, ch3;
343     ch1 = get_char(tok);
344     tok->decoding_state = 1;
345     if (ch1 == EOF) {
346         return 1;
347     } else if (ch1 == 0xEF) {
348         ch2 = get_char(tok);
349         if (ch2 != 0xBB) {
350             unget_char(ch2, tok);
351             unget_char(ch1, tok);
352             return 1;
353         }
354         ch3 = get_char(tok);
355         if (ch3 != 0xBF) {
356             unget_char(ch3, tok);
357             unget_char(ch2, tok);
358             unget_char(ch1, tok);
359             return 1;
360         }
361 #if 0
362     /* Disable support for UTF-16 BOMs until a decision
363        is made whether this needs to be supported.  */
364     } else if (ch1 == 0xFE) {
365         ch2 = get_char(tok);
366         if (ch2 != 0xFF) {
367             unget_char(ch2, tok);
368             unget_char(ch1, tok);
369             return 1;
370         }
371         if (!set_readline(tok, "utf-16-be"))
372             return 0;
373         tok->decoding_state = -1;
374     } else if (ch1 == 0xFF) {
375         ch2 = get_char(tok);
376         if (ch2 != 0xFE) {
377             unget_char(ch2, tok);
378             unget_char(ch1, tok);
379             return 1;
380         }
381         if (!set_readline(tok, "utf-16-le"))
382             return 0;
383         tok->decoding_state = -1;
384 #endif
385     } else {
386         unget_char(ch1, tok);
387         return 1;
388     }
389     if (tok->encoding != NULL)
390         PyMem_FREE(tok->encoding);
391     tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
392     return 1;
393 }
394 
/* Read a line of text from TOK into S (a buffer of SIZE bytes), using the
   decoding stream in TOK.  At most SIZE - 1 bytes of UTF-8 are stored and
   the result is always NUL-terminated.
   Return NULL on failure or at end of input (empty line), else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline
       and stored the result in tok->decoding_buffer
     3) PyBytesObject *: a previous call to fp_readl did not have enough
       room (in the s buffer) to copy the entire line read by
       tok->decoding_readline; tok->decoding_buffer holds the overflow.
       In this case, fp_readl is called in a loop (with an expanded buffer)
       until the buffer ends with a '\n' (or until the end of the file is
       reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;
    int capacity;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    capacity = size - 1;

    if (pending != NULL) {
        /* Take over the stashed object; leftover bytes are already UTF-8. */
        tok->decoding_buffer = NULL;
        if (PyBytes_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(pending)) {
            Py_DECREF(pending);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyBytes_AsString(encoded);
    nbytes = PyBytes_GET_SIZE(encoded);
    if (nbytes > capacity) {
        /* Does not fit: stash the tail for the next call. */
        tok->decoding_buffer = PyBytes_FromStringAndSize(bytes + capacity,
                                                         nbytes - capacity);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = capacity;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return nbytes == 0 ? NULL /* EOF */ : s;
#endif
}
466 
467 /* Set the readline function for TOK to a StreamReader's
468    readline function. The StreamReader is named ENC.
469 
470    This function is called from check_bom and check_coding_spec.
471 
472    ENC is usually identical to the future value of tok->encoding,
473    except for the (currently unsupported) case of UTF-16.
474 
475    Return 1 on success, 0 on failure. */
476 
477 /* taken from Python 3.5.1 */
478 
479 static int
fp_setreadl(struct tok_state * tok,const char * enc)480 fp_setreadl(struct tok_state *tok, const char* enc)
481 {
482     PyObject *readline = NULL, *stream = NULL, *io = NULL;
483     _Py_IDENTIFIER(open);
484     _Py_IDENTIFIER(readline);
485     int fd;
486     long pos;
487 
488     io = PyImport_ImportModuleNoBlock("io");
489     if (io == NULL)
490         goto cleanup;
491 
492     fd = fileno(tok->fp);
493     /* Due to buffering the file offset for fd can be different from the file
494      * position of tok->fp.  If tok->fp was opened in text mode on Windows,
495      * its file position counts CRLF as one char and can't be directly mapped
496      * to the file offset for fd.  Instead we step back one byte and read to
497      * the end of line.*/
498     pos = ftell(tok->fp);
499     if (pos == -1 ||
500         lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
501         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
502         goto cleanup;
503     }
504 
505     stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
506                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
507     if (stream == NULL)
508         goto cleanup;
509 
510     Py_XDECREF(tok->decoding_readline);
511     readline = _PyObject_GetAttrId(stream, &PyId_readline);
512     tok->decoding_readline = readline;
513     if (pos > 0) {
514         if (PyObject_CallObject(readline, NULL) == NULL) {
515             readline = NULL;
516             goto cleanup;
517         }
518     }
519 
520   cleanup:
521     Py_XDECREF(stream);
522     Py_XDECREF(io);
523     return readline != NULL;
524 }
525 
526 /* Fetch the next byte from TOK. */
527 
fp_getc(struct tok_state * tok)528 static int fp_getc(struct tok_state *tok) {
529     return getc(tok->fp);
530 }
531 
532 /* Unfetch the last byte back into TOK.  */
533 
fp_ungetc(int c,struct tok_state * tok)534 static void fp_ungetc(int c, struct tok_state *tok) {
535     ungetc(c, tok->fp);
536 }
537 
538 /* Read a line of input from TOK. Determine encoding
539    if necessary.  */
540 
541 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)542 decoding_fgets(char *s, int size, struct tok_state *tok)
543 {
544     char *line = NULL;
545     int badchar = 0;
546     for (;;) {
547         if (tok->decoding_state < 0) {
548             /* We already have a codec associated with
549                this input. */
550             line = fp_readl(s, size, tok);
551             break;
552         } else if (tok->decoding_state > 0) {
553             /* We want a 'raw' read. */
554             line = Py_UniversalNewlineFgets(s, size,
555                                             tok->fp, NULL);
556             break;
557         } else {
558             /* We have not yet determined the encoding.
559                If an encoding is found, use the file-pointer
560                reader functions from now on. */
561             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
562                 return error_ret(tok);
563             assert(tok->decoding_state != 0);
564         }
565     }
566     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
567         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
568             return error_ret(tok);
569         }
570     }
571 #ifndef PGEN
572     /* The default encoding is ASCII, so make sure we don't have any
573        non-ASCII bytes in it. */
574     if (line && !tok->encoding) {
575         unsigned char *c;
576         for (c = (unsigned char *)line; *c; c++)
577             if (*c > 127) {
578                 badchar = *c;
579                 break;
580             }
581     }
582     if (badchar) {
583         char buf[500];
584         /* Need to add 1 to the line number, since this line
585            has not been counted, yet.  */
586         sprintf(buf,
587             "Non-ASCII character '\\x%.2x' "
588             "in file %.200s on line %i, "
589             "but no encoding declared; "
590             "see http://python.org/dev/peps/pep-0263/ for details",
591             badchar, tok->filename, tok->lineno + 1);
592         PyErr_SetString(PyExc_SyntaxError, buf);
593         return error_ret(tok);
594     }
595 #endif
596     return line;
597 }
598 
599 static int
decoding_feof(struct tok_state * tok)600 decoding_feof(struct tok_state *tok)
601 {
602     if (tok->decoding_state >= 0) {
603         return feof(tok->fp);
604     } else {
605         PyObject* buf = tok->decoding_buffer;
606         if (buf == NULL) {
607             buf = PyObject_CallObject(tok->decoding_readline, NULL);
608             if (buf == NULL) {
609                 error_ret(tok);
610                 return 1;
611             } else {
612                 tok->decoding_buffer = buf;
613             }
614         }
615         return PyObject_Length(buf) == 0;
616     }
617 }
618 
619 /* Fetch a byte from TOK, using the string buffer. */
620 
621 static int
buf_getc(struct tok_state * tok)622 buf_getc(struct tok_state *tok) {
623     return Py_CHARMASK(*tok->str++);
624 }
625 
626 /* Unfetch a byte from TOK, using the string buffer. */
627 
628 static void
buf_ungetc(int c,struct tok_state * tok)629 buf_ungetc(int c, struct tok_state *tok) {
630     tok->str--;
631     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
632 }
633 
634 /* Set the readline function for TOK to ENC. For the string-based
635    tokenizer, this means to just record the encoding. */
636 
637 static int
buf_setreadl(struct tok_state * tok,const char * enc)638 buf_setreadl(struct tok_state *tok, const char* enc) {
639     tok->enc = enc;
640     return 1;
641 }
642 
/* Return a new bytes object holding the UTF-8 encoding of the
   NUL-terminated C byte string STR, which is encoded with ENC.
   Returns NULL (with the exception set by the failing call) if STR cannot
   be decoded or re-encoded. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *decoded;
    PyObject *encoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL)
        return NULL;
    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
658 
659 
660 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)661 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
662     int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
663     char *buf, *current;
664     char c = '\0';
665     buf = PyMem_MALLOC(needed_length);
666     if (buf == NULL) {
667         tok->done = E_NOMEM;
668         return NULL;
669     }
670     for (current = buf; *s; s++, current++) {
671         c = *s;
672         if (skip_next_lf) {
673             skip_next_lf = 0;
674             if (c == '\n') {
675                 c = *++s;
676                 if (!c)
677                     break;
678             }
679         }
680         if (c == '\r') {
681             skip_next_lf = 1;
682             c = '\n';
683         }
684         *current = c;
685     }
686     /* If this is exec input, add a newline to the end of the string if
687        there isn't one already. */
688     if (exec_input && c != '\n') {
689         *current = '\n';
690         current++;
691     }
692     *current = '\0';
693     final_length = current - buf + 1;
694     if (final_length < needed_length && final_length)
695         /* should never fail */
696         buf = PyMem_REALLOC(buf, final_length);
697     return buf;
698 }
699 
700 /* Decode a byte string STR for use as the buffer of TOK.
701    Look for encoding declarations inside STR, and record them
702    inside TOK.  */
703 
704 static const char *
decode_str(const char * input,int single,struct tok_state * tok)705 decode_str(const char *input, int single, struct tok_state *tok)
706 {
707     PyObject* utf8 = NULL;
708     const char *str;
709     const char *s;
710     const char *newl[2] = {NULL, NULL};
711     int lineno = 0;
712     tok->input = str = translate_newlines(input, single, tok);
713     if (str == NULL)
714         return NULL;
715     tok->enc = NULL;
716     tok->str = str;
717     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
718         return error_ret(tok);
719     str = tok->str;             /* string after BOM if any */
720     assert(str);
721 #ifdef Py_USING_UNICODE
722     if (tok->enc != NULL) {
723         utf8 = translate_into_utf8(str, tok->enc);
724         if (utf8 == NULL)
725             return error_ret(tok);
726         str = PyBytes_AsString(utf8);
727     }
728 #endif
729     for (s = str;; s++) {
730         if (*s == '\0') break;
731         else if (*s == '\n') {
732             assert(lineno < 2);
733             newl[lineno] = s;
734             lineno++;
735             if (lineno == 2) break;
736         }
737     }
738     tok->enc = NULL;
739     /* need to check line 1 and 2 separately since check_coding_spec
740        assumes a single line as input */
741     if (newl[0]) {
742         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
743             return error_ret(tok);
744         if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
745             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
746                                    tok, buf_setreadl))
747                 return error_ret(tok);
748         }
749     }
750 #ifdef Py_USING_UNICODE
751     if (tok->enc != NULL) {
752         assert(utf8 == NULL);
753         utf8 = translate_into_utf8(str, tok->enc);
754         if (utf8 == NULL)
755             return error_ret(tok);
756         str = PyBytes_AsString(utf8);
757     }
758 #endif
759     assert(tok->decoding_buffer == NULL);
760     tok->decoding_buffer = utf8; /* CAUTION */
761     return str;
762 }
763 
764 #endif /* PGEN */
765 
766 /* Set up tokenizer for string */
767 
768 struct tok_state *
Ta27Tokenizer_FromString(const char * str,int exec_input)769 Ta27Tokenizer_FromString(const char *str, int exec_input)
770 {
771     struct tok_state *tok = tok_new();
772     if (tok == NULL)
773         return NULL;
774     str = (char *)decode_str(str, exec_input, tok);
775     if (str == NULL) {
776         Ta27Tokenizer_Free(tok);
777         return NULL;
778     }
779 
780     /* XXX: constify members. */
781     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
782     return tok;
783 }
784 
785 /* adapted from Python 3.5.1 */
786 struct tok_state *
Ta27Tokenizer_FromUTF8(const char * str,int exec_input)787 Ta27Tokenizer_FromUTF8(const char *str, int exec_input)
788 {
789     struct tok_state *tok = tok_new();
790     if (tok == NULL)
791         return NULL;
792 #ifndef PGEN
793     tok->input = str = translate_newlines(str, exec_input, tok);
794 #endif
795     if (str == NULL) {
796         Ta27Tokenizer_Free(tok);
797         return NULL;
798     }
799     tok->decoding_state = 1;
800     tok->read_coding_spec = 1;
801     tok->enc = NULL;
802     tok->str = str;
803 
804     /* XXX: constify members. */
805     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
806     return tok;
807 }
808 
809 
810 /* Set up tokenizer for file */
811 
812 struct tok_state *
Ta27Tokenizer_FromFile(FILE * fp,char * ps1,char * ps2)813 Ta27Tokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
814 {
815     struct tok_state *tok = tok_new();
816     if (tok == NULL)
817         return NULL;
818     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
819         Ta27Tokenizer_Free(tok);
820         return NULL;
821     }
822     tok->cur = tok->inp = tok->buf;
823     tok->end = tok->buf + BUFSIZ;
824     tok->fp = fp;
825     tok->prompt = ps1;
826     tok->nextprompt = ps2;
827     return tok;
828 }
829 
830 
831 /* Free a tok_state structure */
832 
833 void
Ta27Tokenizer_Free(struct tok_state * tok)834 Ta27Tokenizer_Free(struct tok_state *tok)
835 {
836     if (tok->encoding != NULL)
837         PyMem_FREE(tok->encoding);
838 #ifndef PGEN
839     Py_XDECREF(tok->decoding_readline);
840     Py_XDECREF(tok->decoding_buffer);
841 #endif
842     if (tok->fp != NULL && tok->buf != NULL)
843         PyMem_FREE(tok->buf);
844     if (tok->input)
845         PyMem_FREE((char *)tok->input);
846     PyMem_FREE(tok);
847 }
848 
849 /* Get next char, updating state; error code goes into tok->done */
850 /* taken from Python 3.5.1 */
851 static int
tok_nextc(struct tok_state * tok)852 tok_nextc(struct tok_state *tok)
853 {
854     for (;;) {
855         if (tok->cur != tok->inp) {
856             return Py_CHARMASK(*tok->cur++); /* Fast path */
857         }
858         if (tok->done != E_OK)
859             return EOF;
860         if (tok->fp == NULL) {
861             char *end = strchr(tok->inp, '\n');
862             if (end != NULL)
863                 end++;
864             else {
865                 end = strchr(tok->inp, '\0');
866                 if (end == tok->inp) {
867                     tok->done = E_EOF;
868                     return EOF;
869                 }
870             }
871             if (tok->start == NULL)
872                 tok->buf = tok->cur;
873             tok->line_start = tok->cur;
874             tok->lineno++;
875             tok->inp = end;
876             return Py_CHARMASK(*tok->cur++);
877         }
878         if (tok->prompt != NULL) {
879             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
880 #ifndef PGEN
881             if (newtok != NULL) {
882                 char *translated = translate_newlines(newtok, 0, tok);
883                 PyMem_FREE(newtok);
884                 if (translated == NULL)
885                     return EOF;
886                 newtok = translated;
887             }
888             if (tok->encoding && newtok && *newtok) {
889                 /* Recode to UTF-8 */
890                 Py_ssize_t buflen;
891                 const char* buf;
892                 PyObject *u = translate_into_utf8(newtok, tok->encoding);
893                 PyMem_FREE(newtok);
894                 if (!u) {
895                     tok->done = E_DECODE;
896                     return EOF;
897                 }
898                 buflen = PyBytes_GET_SIZE(u);
899                 buf = PyBytes_AS_STRING(u);
900                 newtok = PyMem_MALLOC(buflen+1);
901                 strcpy(newtok, buf);
902                 Py_DECREF(u);
903             }
904 #endif
905             if (tok->nextprompt != NULL)
906                 tok->prompt = tok->nextprompt;
907             if (newtok == NULL)
908                 tok->done = E_INTR;
909             else if (*newtok == '\0') {
910                 PyMem_FREE(newtok);
911                 tok->done = E_EOF;
912             }
913             else if (tok->start != NULL) {
914                 size_t start = tok->start - tok->buf;
915                 size_t oldlen = tok->cur - tok->buf;
916                 size_t newlen = oldlen + strlen(newtok);
917                 char *buf = tok->buf;
918                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
919                 tok->lineno++;
920                 if (buf == NULL) {
921                     PyMem_FREE(tok->buf);
922                     tok->buf = NULL;
923                     PyMem_FREE(newtok);
924                     tok->done = E_NOMEM;
925                     return EOF;
926                 }
927                 tok->buf = buf;
928                 tok->cur = tok->buf + oldlen;
929                 tok->line_start = tok->cur;
930                 strcpy(tok->buf + oldlen, newtok);
931                 PyMem_FREE(newtok);
932                 tok->inp = tok->buf + newlen;
933                 tok->end = tok->inp + 1;
934                 tok->start = tok->buf + start;
935             }
936             else {
937                 tok->lineno++;
938                 if (tok->buf != NULL)
939                     PyMem_FREE(tok->buf);
940                 tok->buf = newtok;
941                 tok->cur = tok->buf;
942                 tok->line_start = tok->buf;
943                 tok->inp = strchr(tok->buf, '\0');
944                 tok->end = tok->inp + 1;
945             }
946         }
947         else {
948             int done = 0;
949             Py_ssize_t cur = 0;
950             char *pt;
951             if (tok->start == NULL) {
952                 if (tok->buf == NULL) {
953                     tok->buf = (char *)
954                         PyMem_MALLOC(BUFSIZ);
955                     if (tok->buf == NULL) {
956                         tok->done = E_NOMEM;
957                         return EOF;
958                     }
959                     tok->end = tok->buf + BUFSIZ;
960                 }
961                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
962                           tok) == NULL) {
963                     if (!tok->decoding_erred)
964                         tok->done = E_EOF;
965                     done = 1;
966                 }
967                 else {
968                     tok->done = E_OK;
969                     tok->inp = strchr(tok->buf, '\0');
970                     done = tok->inp[-1] == '\n';
971                 }
972             }
973             else {
974                 cur = tok->cur - tok->buf;
975                 if (decoding_feof(tok)) {
976                     tok->done = E_EOF;
977                     done = 1;
978                 }
979                 else
980                     tok->done = E_OK;
981             }
982             tok->lineno++;
983             /* Read until '\n' or EOF */
984             while (!done) {
985                 Py_ssize_t curstart = tok->start == NULL ? -1 :
986                           tok->start - tok->buf;
987                 Py_ssize_t curvalid = tok->inp - tok->buf;
988                 Py_ssize_t newsize = curvalid + BUFSIZ;
989                 char *newbuf = tok->buf;
990                 newbuf = (char *)PyMem_REALLOC(newbuf,
991                                                newsize);
992                 if (newbuf == NULL) {
993                     tok->done = E_NOMEM;
994                     tok->cur = tok->inp;
995                     return EOF;
996                 }
997                 tok->buf = newbuf;
998                 tok->cur = tok->buf + cur;
999                 tok->line_start = tok->cur;
1000                 tok->inp = tok->buf + curvalid;
1001                 tok->end = tok->buf + newsize;
1002                 tok->start = curstart < 0 ? NULL :
1003                          tok->buf + curstart;
1004                 if (decoding_fgets(tok->inp,
1005                                (int)(tok->end - tok->inp),
1006                                tok) == NULL) {
1007                     /* Break out early on decoding
1008                        errors, as tok->buf will be NULL
1009                      */
1010                     if (tok->decoding_erred)
1011                         return EOF;
1012                     /* Last line does not end in \n,
1013                        fake one */
1014                     strcpy(tok->inp, "\n");
1015                 }
1016                 tok->inp = strchr(tok->inp, '\0');
1017                 done = tok->inp[-1] == '\n';
1018             }
1019             if (tok->buf != NULL) {
1020                 tok->cur = tok->buf + cur;
1021                 tok->line_start = tok->cur;
1022                 /* replace "\r\n" with "\n" */
1023                 /* For Mac leave the \r, giving a syntax error */
1024                 pt = tok->inp - 2;
1025                 if (pt >= tok->buf && *pt == '\r') {
1026                     *pt++ = '\n';
1027                     *pt = '\0';
1028                     tok->inp = pt;
1029                 }
1030             }
1031         }
1032         if (tok->done != E_OK) {
1033             if (tok->prompt != NULL)
1034                 PySys_WriteStderr("\n");
1035             tok->cur = tok->inp;
1036             return EOF;
1037         }
1038     }
1039     /*NOTREACHED*/
1040 }
1041 
1042 
1043 /* Back-up one character */
1044 
1045 static void
tok_backup(register struct tok_state * tok,register int c)1046 tok_backup(register struct tok_state *tok, register int c)
1047 {
1048     if (c != EOF) {
1049         if (--tok->cur < tok->buf)
1050             Py_FatalError("tok_backup: beginning of buffer");
1051         if (*tok->cur != c)
1052             *tok->cur = c;
1053     }
1054 }
1055 
1056 
1057 /* Return the token corresponding to a single character */
1058 
1059 int
Ta27Token_OneChar(int c)1060 Ta27Token_OneChar(int c)
1061 {
1062     switch (c) {
1063     case '(':           return LPAR;
1064     case ')':           return RPAR;
1065     case '[':           return LSQB;
1066     case ']':           return RSQB;
1067     case ':':           return COLON;
1068     case ',':           return COMMA;
1069     case ';':           return SEMI;
1070     case '+':           return PLUS;
1071     case '-':           return MINUS;
1072     case '*':           return STAR;
1073     case '/':           return SLASH;
1074     case '|':           return VBAR;
1075     case '&':           return AMPER;
1076     case '<':           return LESS;
1077     case '>':           return GREATER;
1078     case '=':           return EQUAL;
1079     case '.':           return DOT;
1080     case '%':           return PERCENT;
1081     case '`':           return BACKQUOTE;
1082     case '{':           return LBRACE;
1083     case '}':           return RBRACE;
1084     case '^':           return CIRCUMFLEX;
1085     case '~':           return TILDE;
1086     case '@':       return AT;
1087     default:            return OP;
1088     }
1089 }
1090 
1091 
1092 int
Ta27Token_TwoChars(int c1,int c2)1093 Ta27Token_TwoChars(int c1, int c2)
1094 {
1095     switch (c1) {
1096     case '=':
1097         switch (c2) {
1098         case '=':               return EQEQUAL;
1099         }
1100         break;
1101     case '!':
1102         switch (c2) {
1103         case '=':               return NOTEQUAL;
1104         }
1105         break;
1106     case '<':
1107         switch (c2) {
1108         case '>':               return NOTEQUAL;
1109         case '=':               return LESSEQUAL;
1110         case '<':               return LEFTSHIFT;
1111         }
1112         break;
1113     case '>':
1114         switch (c2) {
1115         case '=':               return GREATEREQUAL;
1116         case '>':               return RIGHTSHIFT;
1117         }
1118         break;
1119     case '+':
1120         switch (c2) {
1121         case '=':               return PLUSEQUAL;
1122         }
1123         break;
1124     case '-':
1125         switch (c2) {
1126         case '=':               return MINEQUAL;
1127         case '>':               return RARROW;
1128         }
1129         break;
1130     case '*':
1131         switch (c2) {
1132         case '*':               return DOUBLESTAR;
1133         case '=':               return STAREQUAL;
1134         }
1135         break;
1136     case '/':
1137         switch (c2) {
1138         case '/':               return DOUBLESLASH;
1139         case '=':               return SLASHEQUAL;
1140         }
1141         break;
1142     case '|':
1143         switch (c2) {
1144         case '=':               return VBAREQUAL;
1145         }
1146         break;
1147     case '%':
1148         switch (c2) {
1149         case '=':               return PERCENTEQUAL;
1150         }
1151         break;
1152     case '&':
1153         switch (c2) {
1154         case '=':               return AMPEREQUAL;
1155         }
1156         break;
1157     case '^':
1158         switch (c2) {
1159         case '=':               return CIRCUMFLEXEQUAL;
1160         }
1161         break;
1162     }
1163     return OP;
1164 }
1165 
1166 int
Ta27Token_ThreeChars(int c1,int c2,int c3)1167 Ta27Token_ThreeChars(int c1, int c2, int c3)
1168 {
1169     switch (c1) {
1170     case '<':
1171         switch (c2) {
1172         case '<':
1173             switch (c3) {
1174             case '=':
1175                 return LEFTSHIFTEQUAL;
1176             }
1177             break;
1178         }
1179         break;
1180     case '>':
1181         switch (c2) {
1182         case '>':
1183             switch (c3) {
1184             case '=':
1185                 return RIGHTSHIFTEQUAL;
1186             }
1187             break;
1188         }
1189         break;
1190     case '*':
1191         switch (c2) {
1192         case '*':
1193             switch (c3) {
1194             case '=':
1195                 return DOUBLESTAREQUAL;
1196             }
1197             break;
1198         }
1199         break;
1200     case '/':
1201         switch (c2) {
1202         case '/':
1203             switch (c3) {
1204             case '=':
1205                 return DOUBLESLASHEQUAL;
1206             }
1207             break;
1208         }
1209         break;
1210     }
1211     return OP;
1212 }
1213 
1214 static int
indenterror(struct tok_state * tok)1215 indenterror(struct tok_state *tok)
1216 {
1217     if (tok->alterror) {
1218         tok->done = E_TABSPACE;
1219         tok->cur = tok->inp;
1220         return 1;
1221     }
1222     if (tok->altwarning) {
1223         PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1224                           "in indentation\n", tok->filename);
1225         tok->altwarning = 0;
1226     }
1227     return 0;
1228 }
1229 
/* Get next token, after space stripping etc.

   Returns the token type.  *p_start and *p_end are reset to NULL on
   entry; for tokens that carry text they are set to the first
   character of the token and to one past its last character.

   Processing order, as implemented below:
     1. At the beginning of a line, measure the indentation and queue
        pending INDENT/DEDENT tokens in tok->pendin (positive means
        INDENTs are owed, negative means DEDENTs are owed).
     2. Hand out one queued INDENT/DEDENT per call.
     3. Skip blanks, then comments (scanning them for tab-size
        directives and type comments).
     4. Recognise names, newlines, numbers, strings, backslash line
        continuations and operators, in that order.

   On errors tok->done is set to an E_* code and ERRORTOKEN is
   returned.  At end of input ENDMARKER is returned if tok->done is
   E_EOF, otherwise ERRORTOKEN. */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        /* col is measured with tok->tabsize, altcol with
           tok->alttabsize; the two are compared below to detect
           inconsistent tab/space usage. */
        register int col = 0;
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                /* Advance to the next tab stop */
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        /* Put back the first character that is not indentation */
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is only significant outside brackets */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                /* The new column must match an enclosing level exactly */
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic and type comments */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",                       /* Emacs */
            ":tabstop=",                        /* vim, full form */
            ":ts=",                             /* vim, abbreviated form */
            "set tabsize=",                     /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;

        /* used for type comment checks */
        const char *prefix, *p, *type_start;

        /* Copy the start of the comment (at most sizeof(cbuf)-1
           characters) into cbuf so it can be searched for the
           tab-size directives listed above. */
        tp = cbuf;
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                /* Only tab sizes from 1 to 40 are accepted */
                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                        "Tab size set to %d\n",
                        newsize);
                }
            }
        }
        /* Consume whatever is left of the comment */
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);

        /* check for type comment */

        /* Match the comment against type_comment_prefix.  A space in
           the prefix matches any run of spaces and tabs, including an
           empty one; every other prefix character must match
           literally. */
        p = tok->start;
        prefix = type_comment_prefix;
        while (*prefix && p < tok->cur) {
            if (*prefix == ' ') {
                while (*p == ' ' || *p == '\t')
                    p++;
            } else if (*prefix == *p) {
                p++;
            } else {
                break;
            }

            prefix++;
        }

        /* This is a type comment if we matched all of type_comment_prefix. */
        if (!*prefix) {
            int is_type_ignore = 1;
            const char *ignore_end = p + 6;
            tok_backup(tok, c);  /* don't eat the newline or EOF */

            type_start = p;

            /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
             * or anything ASCII and non-alphanumeric. */
            is_type_ignore = (
                tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
                && !(tok->cur > ignore_end
                     && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

            if (is_type_ignore) {
                /* The token text is whatever follows the word "ignore" */
                *p_start = (char *) ignore_end;
                *p_end = tok->cur;

                /* If this type ignore is the only thing on the line, consume the newline also. */
                if (blankline) {
                    tok_nextc(tok);
                    tok->atbol = 1;
                }
                return TYPE_IGNORE;
            } else {
                *p_start = (char *) type_start;  /* after type_comment_prefix */
                *p_end = tok->cur;
                return TYPE_COMMENT;
            }
        }

    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (Py_ISALPHA(c) || c == '_') {
        /* Process b"", br"", r"", u"" and ur"" string prefixes */
        switch (c) {
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        /* Not a string prefix after all: scan the rest of the name */
        while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* Blank lines and newlines inside brackets produce no token */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {

                /* Hex */
                c = tok_nextc(tok);
                /* At least one hex digit is required after the prefix */
                if (!isxdigit(c)) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                /* At least one octal digit is required after the prefix */
                if (c < '0' || c >= '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                /* At least one binary digit is required after the prefix */
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                /* found_decimal records that a digit 8 or 9 followed
                   the leading zero; that is only accepted when a
                   fraction, exponent or imaginary suffix follows. */
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            /* An optional trailing 'l'/'L' is consumed as part of the number */
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    /* Remember the exponent letter so it can be
                       pushed back if no exponent digits follow. */
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    } else if (!isdigit(c)) {
                        /* The 'e' is not part of this number: push
                           back both characters and end the number
                           just before it. */
                        tok_backup(tok, c);
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2 is the distance from the token start at which
           tok->cur sits right after a quote that immediately follows
           the opening quote; it is used to spot the second quote of
           an empty string or of a triple-quote opener. */
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        /* Number of consecutive quote characters seen so far */
        int tripcount = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    /* Second quote in a row: a third one opens a
                       triple-quoted string, otherwise this closes
                       an empty string. */
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                /* Skip the character following a backslash */
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        /* The backslash must be the last character on the line */
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = Ta27Token_TwoChars(c, c2);
#ifndef PGEN
        /* "<>" is tokenized as NOTEQUAL but triggers a deprecation
           warning; a non-zero result from the warning call is
           reported as an error token. */
        if (token == NOTEQUAL && c == '<') {
            if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
                                   "<> not supported in 3.x; use !=",
                                   tok->filename, tok->lineno,
                                   NULL, NULL)) {
                return ERRORTOKEN;
            }
        }
#endif
        if (token != OP) {
            /* Try to extend the match to a three-character token */
            int c3 = tok_nextc(tok);
            int token3 = Ta27Token_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return Ta27Token_OneChar(c);
}
1748 
1749 int
Ta27Tokenizer_Get(struct tok_state * tok,char ** p_start,char ** p_end)1750 Ta27Tokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1751 {
1752     int result = tok_get(tok, p_start, p_end);
1753     if (tok->decoding_erred) {
1754         result = ERRORTOKEN;
1755         tok->done = E_DECODE;
1756     }
1757     return result;
1758 }
1759 
1760 /* This function is only called from parsetok. However, it cannot live
1761    there, as it must be empty for PGEN, and we can check for PGEN only
1762    in this file. */
1763 
1764 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1765 char*
Ta27Tokenizer_RestoreEncoding(struct tok_state * tok,int len,int * offset)1766 Ta27Tokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1767 {
1768     return NULL;
1769 }
1770 #else
1771 #ifdef Py_USING_UNICODE
1772 static PyObject *
dec_utf8(const char * enc,const char * text,size_t len)1773 dec_utf8(const char *enc, const char *text, size_t len) {
1774     PyObject *ret = NULL;
1775     PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1776     if (unicode_text) {
1777         ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1778         Py_DECREF(unicode_text);
1779     }
1780     if (!ret) {
1781         PyErr_Clear();
1782     }
1783     return ret;
1784 }
1785 char *
Ta27Tokenizer_RestoreEncoding(struct tok_state * tok,int len,int * offset)1786 Ta27Tokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1787 {
1788     char *text = NULL;
1789     if (tok->encoding) {
1790         /* convert source to original encondig */
1791         PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1792         if (lineobj != NULL) {
1793             int linelen = PyBytes_Size(lineobj);
1794             const char *line = PyBytes_AsString(lineobj);
1795             text = PyObject_MALLOC(linelen + 1);
1796             if (text != NULL && line != NULL) {
1797                 if (linelen)
1798                     strncpy(text, line, linelen);
1799                 text[linelen] = '\0';
1800             }
1801             Py_DECREF(lineobj);
1802 
1803             /* adjust error offset */
1804             if (*offset > 1) {
1805                 PyObject *offsetobj = dec_utf8(tok->encoding,
1806                                                tok->buf, *offset-1);
1807                 if (offsetobj) {
1808                     *offset = PyBytes_Size(offsetobj) + 1;
1809                     Py_DECREF(offsetobj);
1810                 }
1811             }
1812 
1813         }
1814     }
1815     return text;
1816 
1817 }
1818 #endif /* defined(Py_USING_UNICODE) */
1819 #endif
1820 
1821 
1822 #ifdef Py_DEBUG
1823 
1824 void
tok_dump(int type,char * start,char * end)1825 tok_dump(int type, char *start, char *end)
1826 {
1827     printf("%s", _Ta27Parser_TokenNames[type]);
1828     if (type == NAME || type == NUMBER || type == STRING || type == OP)
1829         printf("(%.*s)", (int)(end - start), start);
1830 }
1831 
1832 #endif
1833