1 
2 /* Tokenizer implementation */
3 
4 #include "Python.h"
5 #include "pgenheaders.h"
6 
7 #include <ctype.h>
8 #include <assert.h>
9 
10 #include "tokenizer.h"
11 #include "errcode.h"
12 
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
21 
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24    empty malloc'ed string for EOF;
25    NULL if interrupted */
26 
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
29 
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
34 
/* Printable token names, indexed by token number.
   This table must match the #defines in token.h!
   The marker in front of each row is the index of the row's first entry. */

char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE",
    /*  5 */ "INDENT", "DEDENT", "LPAR", "RPAR", "LSQB",
    /* 10 */ "RSQB", "COLON", "COMMA", "SEMI", "PLUS",
    /* 15 */ "MINUS", "STAR", "SLASH", "VBAR", "AMPER",
    /* 20 */ "LESS", "GREATER", "EQUAL", "DOT", "PERCENT",
    /* 25 */ "BACKQUOTE", "LBRACE", "RBRACE", "EQEQUAL", "NOTEQUAL",
    /* 30 */ "LESSEQUAL", "GREATEREQUAL", "TILDE", "CIRCUMFLEX", "LEFTSHIFT",
    /* 35 */ "RIGHTSHIFT", "DOUBLESTAR", "PLUSEQUAL", "MINEQUAL", "STAREQUAL",
    /* 40 */ "SLASHEQUAL", "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL",
             "CIRCUMFLEXEQUAL",
    /* 45 */ "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
             "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 50 */ "AT", "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
94 
95 /* Create and initialize a new tok_state structure */
96 
97 static struct tok_state *
tok_new(void)98 tok_new(void)
99 {
100     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101                                             sizeof(struct tok_state));
102     if (tok == NULL)
103         return NULL;
104     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105     tok->done = E_OK;
106     tok->fp = NULL;
107     tok->input = NULL;
108     tok->tabsize = TABSIZE;
109     tok->indent = 0;
110     tok->indstack[0] = 0;
111     tok->atbol = 1;
112     tok->pendin = 0;
113     tok->prompt = tok->nextprompt = NULL;
114     tok->lineno = 0;
115     tok->level = 0;
116     tok->filename = NULL;
117     tok->altwarning = 0;
118     tok->alterror = 0;
119     tok->alttabsize = 1;
120     tok->altindstack[0] = 0;
121     tok->decoding_state = 0;
122     tok->decoding_erred = 0;
123     tok->read_coding_spec = 0;
124     tok->encoding = NULL;
125     tok->cont_line = 0;
126 #ifndef PGEN
127     tok->decoding_readline = NULL;
128     tok->decoding_buffer = NULL;
129 #endif
130     return tok;
131 }
132 
133 static char *
new_string(const char * s,Py_ssize_t len)134 new_string(const char *s, Py_ssize_t len)
135 {
136     char* result = (char *)PyMem_MALLOC(len + 1);
137     if (result != NULL) {
138         memcpy(result, s, len);
139         result[len] = '\0';
140     }
141     return result;
142 }
143 
144 #ifdef PGEN
145 
146 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)147 decoding_fgets(char *s, int size, struct tok_state *tok)
148 {
149     return fgets(s, size, tok->fp);
150 }
151 
152 static int
decoding_feof(struct tok_state * tok)153 decoding_feof(struct tok_state *tok)
154 {
155     return feof(tok->fp);
156 }
157 
158 static char *
decode_str(const char * str,int exec_input,struct tok_state * tok)159 decode_str(const char *str, int exec_input, struct tok_state *tok)
160 {
161     return new_string(str, strlen(str));
162 }
163 
164 #else /* PGEN */
165 
166 static char *
error_ret(struct tok_state * tok)167 error_ret(struct tok_state *tok) /* XXX */
168 {
169     tok->decoding_erred = 1;
170     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
171         PyMem_FREE(tok->buf);
172     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
173     tok->done = E_DECODE;
174     return NULL;                /* as if it were EOF */
175 }
176 
177 
/* Map the common spellings of the utf-8 and latin-1 encoding names to
   one canonical name.

   At most the first 12 characters of S are examined; they are
   lower-cased and '_' is treated as '-'.  Returns the string constant
   "utf-8" or "iso-8859-1" when S is (or starts with, followed by '-')
   one of the known spellings; otherwise returns S itself, unchanged.
   Callers can therefore detect normalization by comparing the result
   pointer with S. */

static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative plain char to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
206 
207 /* Return the coding spec in S, or NULL if none is found.  */
208 
209 static char *
get_coding_spec(const char * s,Py_ssize_t size)210 get_coding_spec(const char *s, Py_ssize_t size)
211 {
212     Py_ssize_t i;
213     /* Coding spec must be in a comment, and that comment must be
214      * the only statement on the source code line. */
215     for (i = 0; i < size - 6; i++) {
216         if (s[i] == '#')
217             break;
218         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219             return NULL;
220     }
221     for (; i < size - 6; i++) { /* XXX inefficient search */
222         const char* t = s + i;
223         if (strncmp(t, "coding", 6) == 0) {
224             const char* begin = NULL;
225             t += 6;
226             if (t[0] != ':' && t[0] != '=')
227                 continue;
228             do {
229                 t++;
230             } while (t[0] == '\x20' || t[0] == '\t');
231 
232             begin = t;
233             while (Py_ISALNUM(t[0]) ||
234                    t[0] == '-' || t[0] == '_' || t[0] == '.')
235                 t++;
236 
237             if (begin < t) {
238                 char* r = new_string(begin, t - begin);
239                 char* q;
240                 if (!r)
241                     return NULL;
242                 q = get_normal_name(r);
243                 if (r != q) {
244                     PyMem_FREE(r);
245                     r = new_string(q, strlen(q));
246                 }
247                 return r;
248             }
249         }
250     }
251     return NULL;
252 }
253 
254 /* Check whether the line contains a coding spec. If it does,
255    invoke the set_readline function for the new encoding.
256    This function receives the tok_state and the new encoding.
257    Return 1 on success, 0 on failure.  */
258 
259 static int
check_coding_spec(const char * line,Py_ssize_t size,struct tok_state * tok,int set_readline (struct tok_state *,const char *))260 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
261                   int set_readline(struct tok_state *, const char *))
262 {
263     char * cs;
264     int r = 1;
265 
266     if (tok->cont_line) {
267         /* It's a continuation line, so it can't be a coding spec. */
268         tok->read_coding_spec = 1;
269         return 1;
270     }
271     cs = get_coding_spec(line, size);
272     if (!cs) {
273         Py_ssize_t i;
274         for (i = 0; i < size; i++) {
275             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
276                 break;
277             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
278                 /* Stop checking coding spec after a line containing
279                  * anything except a comment. */
280                 tok->read_coding_spec = 1;
281                 break;
282             }
283         }
284     } else {
285         tok->read_coding_spec = 1;
286         if (tok->encoding == NULL) {
287             assert(tok->decoding_state == 1); /* raw */
288             if (strcmp(cs, "utf-8") == 0 ||
289                 strcmp(cs, "iso-8859-1") == 0) {
290                 tok->encoding = cs;
291             } else {
292 #ifdef Py_USING_UNICODE
293                 r = set_readline(tok, cs);
294                 if (r) {
295                     tok->encoding = cs;
296                     tok->decoding_state = -1;
297                 }
298                 else {
299                     PyErr_Format(PyExc_SyntaxError,
300                                  "encoding problem: %s", cs);
301                     PyMem_FREE(cs);
302                 }
303 #else
304                 /* Without Unicode support, we cannot
305                    process the coding spec. Since there
306                    won't be any Unicode literals, that
307                    won't matter. */
308                 PyMem_FREE(cs);
309 #endif
310             }
311         } else {                /* then, compare cs with BOM */
312             r = (strcmp(tok->encoding, cs) == 0);
313             if (!r)
314                 PyErr_Format(PyExc_SyntaxError,
315                              "encoding problem: %s with BOM", cs);
316             PyMem_FREE(cs);
317         }
318     }
319     return r;
320 }
321 
322 /* See whether the file starts with a BOM. If it does,
323    invoke the set_readline function with the new encoding.
324    Return 1 on success, 0 on failure.  */
325 
326 static int
check_bom(int get_char (struct tok_state *),void unget_char (int,struct tok_state *),int set_readline (struct tok_state *,const char *),struct tok_state * tok)327 check_bom(int get_char(struct tok_state *),
328           void unget_char(int, struct tok_state *),
329           int set_readline(struct tok_state *, const char *),
330           struct tok_state *tok)
331 {
332     int ch1, ch2, ch3;
333     ch1 = get_char(tok);
334     tok->decoding_state = 1;
335     if (ch1 == EOF) {
336         return 1;
337     } else if (ch1 == 0xEF) {
338         ch2 = get_char(tok);
339         if (ch2 != 0xBB) {
340             unget_char(ch2, tok);
341             unget_char(ch1, tok);
342             return 1;
343         }
344         ch3 = get_char(tok);
345         if (ch3 != 0xBF) {
346             unget_char(ch3, tok);
347             unget_char(ch2, tok);
348             unget_char(ch1, tok);
349             return 1;
350         }
351 #if 0
352     /* Disable support for UTF-16 BOMs until a decision
353        is made whether this needs to be supported.  */
354     } else if (ch1 == 0xFE) {
355         ch2 = get_char(tok);
356         if (ch2 != 0xFF) {
357             unget_char(ch2, tok);
358             unget_char(ch1, tok);
359             return 1;
360         }
361         if (!set_readline(tok, "utf-16-be"))
362             return 0;
363         tok->decoding_state = -1;
364     } else if (ch1 == 0xFF) {
365         ch2 = get_char(tok);
366         if (ch2 != 0xFE) {
367             unget_char(ch2, tok);
368             unget_char(ch1, tok);
369             return 1;
370         }
371         if (!set_readline(tok, "utf-16-le"))
372             return 0;
373         tok->decoding_state = -1;
374 #endif
375     } else {
376         unget_char(ch1, tok);
377         return 1;
378     }
379     if (tok->encoding != NULL)
380         PyMem_FREE(tok->encoding);
381     tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
382     return 1;
383 }
384 
/* Read a line of text from TOK into S (room for SIZE bytes including the
   terminating NUL), using the codec readline stored in TOK.
   Return S, or NULL on failure or at end of input (an empty line).

   On entry, tok->decoding_buffer is one of:
     1) NULL: tok->decoding_readline must be called to get a new line.
     2) A unicode object: decoding_feof has already called
        tok->decoding_readline and stored the result there.
     3) A string object: a previous call to fp_readl did not have enough
        room in S for the whole line returned by tok->decoding_readline,
        and tok->decoding_buffer holds the overflow.  In this case
        fp_readl is called in a loop (with an expanded buffer) until the
        buffer ends with a '\n' or the end of the file is reached: see
        tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;
    Py_ssize_t room;

    /* Keep one byte back so the result can be NUL-terminated. */
    assert(size > 0);
    room = size - 1;

    if (pending != NULL) {
        /* Take over the stored object; leftover bytes from an earlier
           call are already UTF-8 and need no further encoding. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(pending)) {
            Py_DECREF(pending);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > room) {
        /* Too long for S: stash the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + room, nbytes - room);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = room;
    }

    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return nbytes == 0 ? NULL /* EOF */ : s;
#endif
}
456 
457 /* Set the readline function for TOK to a StreamReader's
458    readline function. The StreamReader is named ENC.
459 
460    This function is called from check_bom and check_coding_spec.
461 
462    ENC is usually identical to the future value of tok->encoding,
463    except for the (currently unsupported) case of UTF-16.
464 
465    Return 1 on success, 0 on failure. */
466 
467 static int
fp_setreadl(struct tok_state * tok,const char * enc)468 fp_setreadl(struct tok_state *tok, const char* enc)
469 {
470     PyObject *reader, *stream, *readline;
471 
472     /* XXX: constify filename argument. */
473     stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
474     if (stream == NULL)
475         return 0;
476 
477     reader = PyCodec_StreamReader(enc, stream, NULL);
478     Py_DECREF(stream);
479     if (reader == NULL)
480         return 0;
481 
482     readline = PyObject_GetAttrString(reader, "readline");
483     Py_DECREF(reader);
484     if (readline == NULL)
485         return 0;
486 
487     tok->decoding_readline = readline;
488     return 1;
489 }
490 
491 /* Fetch the next byte from TOK. */
492 
fp_getc(struct tok_state * tok)493 static int fp_getc(struct tok_state *tok) {
494     return getc(tok->fp);
495 }
496 
497 /* Unfetch the last byte back into TOK.  */
498 
fp_ungetc(int c,struct tok_state * tok)499 static void fp_ungetc(int c, struct tok_state *tok) {
500     ungetc(c, tok->fp);
501 }
502 
503 /* Read a line of input from TOK. Determine encoding
504    if necessary.  */
505 
506 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)507 decoding_fgets(char *s, int size, struct tok_state *tok)
508 {
509     char *line = NULL;
510     int badchar = 0;
511     for (;;) {
512         if (tok->decoding_state < 0) {
513             /* We already have a codec associated with
514                this input. */
515             line = fp_readl(s, size, tok);
516             break;
517         } else if (tok->decoding_state > 0) {
518             /* We want a 'raw' read. */
519             line = Py_UniversalNewlineFgets(s, size,
520                                             tok->fp, NULL);
521             break;
522         } else {
523             /* We have not yet determined the encoding.
524                If an encoding is found, use the file-pointer
525                reader functions from now on. */
526             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
527                 return error_ret(tok);
528             assert(tok->decoding_state != 0);
529         }
530     }
531     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
532         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
533             return error_ret(tok);
534         }
535     }
536 #ifndef PGEN
537     /* The default encoding is ASCII, so make sure we don't have any
538        non-ASCII bytes in it. */
539     if (line && !tok->encoding) {
540         unsigned char *c;
541         for (c = (unsigned char *)line; *c; c++)
542             if (*c > 127) {
543                 badchar = *c;
544                 break;
545             }
546     }
547     if (badchar) {
548         char buf[500];
549         /* Need to add 1 to the line number, since this line
550            has not been counted, yet.  */
551         sprintf(buf,
552             "Non-ASCII character '\\x%.2x' "
553             "in file %.200s on line %i, "
554             "but no encoding declared; "
555             "see http://python.org/dev/peps/pep-0263/ for details",
556             badchar, tok->filename, tok->lineno + 1);
557         PyErr_SetString(PyExc_SyntaxError, buf);
558         return error_ret(tok);
559     }
560 #endif
561     return line;
562 }
563 
564 static int
decoding_feof(struct tok_state * tok)565 decoding_feof(struct tok_state *tok)
566 {
567     if (tok->decoding_state >= 0) {
568         return feof(tok->fp);
569     } else {
570         PyObject* buf = tok->decoding_buffer;
571         if (buf == NULL) {
572             buf = PyObject_CallObject(tok->decoding_readline, NULL);
573             if (buf == NULL) {
574                 error_ret(tok);
575                 return 1;
576             } else {
577                 tok->decoding_buffer = buf;
578             }
579         }
580         return PyObject_Length(buf) == 0;
581     }
582 }
583 
584 /* Fetch a byte from TOK, using the string buffer. */
585 
586 static int
buf_getc(struct tok_state * tok)587 buf_getc(struct tok_state *tok) {
588     return Py_CHARMASK(*tok->str++);
589 }
590 
591 /* Unfetch a byte from TOK, using the string buffer. */
592 
593 static void
buf_ungetc(int c,struct tok_state * tok)594 buf_ungetc(int c, struct tok_state *tok) {
595     tok->str--;
596     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
597 }
598 
599 /* Set the readline function for TOK to ENC. For the string-based
600    tokenizer, this means to just record the encoding. */
601 
602 static int
buf_setreadl(struct tok_state * tok,const char * enc)603 buf_setreadl(struct tok_state *tok, const char* enc) {
604     tok->enc = enc;
605     return 1;
606 }
607 
/* Decode the NUL-terminated C byte string STR, which is encoded with
   ENC, and return it re-encoded as a UTF-8 Python string object.
   Returns a new reference, or NULL if decoding or encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded = NULL;
    PyObject *unicode = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (unicode != NULL) {
        encoded = PyUnicode_AsUTF8String(unicode);
        Py_DECREF(unicode);
    }
    return encoded;
}
#endif
623 
624 
625 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)626 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
627     int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
628     char *buf, *current;
629     char c = '\0';
630     buf = PyMem_MALLOC(needed_length);
631     if (buf == NULL) {
632         tok->done = E_NOMEM;
633         return NULL;
634     }
635     for (current = buf; *s; s++, current++) {
636         c = *s;
637         if (skip_next_lf) {
638             skip_next_lf = 0;
639             if (c == '\n') {
640                 c = *++s;
641                 if (!c)
642                     break;
643             }
644         }
645         if (c == '\r') {
646             skip_next_lf = 1;
647             c = '\n';
648         }
649         *current = c;
650     }
651     /* If this is exec input, add a newline to the end of the string if
652        there isn't one already. */
653     if (exec_input && c != '\n') {
654         *current = '\n';
655         current++;
656     }
657     *current = '\0';
658     final_length = current - buf + 1;
659     if (final_length < needed_length && final_length) {
660         /* should never fail */
661         char* result = PyMem_REALLOC(buf, final_length);
662         if (result == NULL) {
663             PyMem_FREE(buf);
664         }
665         buf = result;
666     }
667     return buf;
668 }
669 
670 /* Decode a byte string STR for use as the buffer of TOK.
671    Look for encoding declarations inside STR, and record them
672    inside TOK.  */
673 
674 static const char *
decode_str(const char * input,int single,struct tok_state * tok)675 decode_str(const char *input, int single, struct tok_state *tok)
676 {
677     PyObject* utf8 = NULL;
678     const char *str;
679     const char *s;
680     const char *newl[2] = {NULL, NULL};
681     int lineno = 0;
682     tok->input = str = translate_newlines(input, single, tok);
683     if (str == NULL)
684         return NULL;
685     tok->enc = NULL;
686     tok->str = str;
687     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
688         return error_ret(tok);
689     str = tok->str;             /* string after BOM if any */
690     assert(str);
691 #ifdef Py_USING_UNICODE
692     if (tok->enc != NULL) {
693         utf8 = translate_into_utf8(str, tok->enc);
694         if (utf8 == NULL)
695             return error_ret(tok);
696         str = PyString_AsString(utf8);
697     }
698 #endif
699     for (s = str;; s++) {
700         if (*s == '\0') break;
701         else if (*s == '\n') {
702             assert(lineno < 2);
703             newl[lineno] = s;
704             lineno++;
705             if (lineno == 2) break;
706         }
707     }
708     tok->enc = NULL;
709     /* need to check line 1 and 2 separately since check_coding_spec
710        assumes a single line as input */
711     if (newl[0]) {
712         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
713             return error_ret(tok);
714         if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
715             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
716                                    tok, buf_setreadl))
717                 return error_ret(tok);
718         }
719     }
720 #ifdef Py_USING_UNICODE
721     if (tok->enc != NULL) {
722         assert(utf8 == NULL);
723         utf8 = translate_into_utf8(str, tok->enc);
724         if (utf8 == NULL)
725             return error_ret(tok);
726         str = PyString_AsString(utf8);
727     }
728 #endif
729     assert(tok->decoding_buffer == NULL);
730     tok->decoding_buffer = utf8; /* CAUTION */
731     return str;
732 }
733 
734 #endif /* PGEN */
735 
736 /* Set up tokenizer for string */
737 
738 struct tok_state *
PyTokenizer_FromString(const char * str,int exec_input)739 PyTokenizer_FromString(const char *str, int exec_input)
740 {
741     struct tok_state *tok = tok_new();
742     if (tok == NULL)
743         return NULL;
744     str = (char *)decode_str(str, exec_input, tok);
745     if (str == NULL) {
746         PyTokenizer_Free(tok);
747         return NULL;
748     }
749 
750     /* XXX: constify members. */
751     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
752     return tok;
753 }
754 
755 
756 /* Set up tokenizer for file */
757 
758 struct tok_state *
PyTokenizer_FromFile(FILE * fp,char * ps1,char * ps2)759 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
760 {
761     struct tok_state *tok = tok_new();
762     if (tok == NULL)
763         return NULL;
764     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
765         PyTokenizer_Free(tok);
766         return NULL;
767     }
768     tok->cur = tok->inp = tok->buf;
769     tok->end = tok->buf + BUFSIZ;
770     tok->fp = fp;
771     tok->prompt = ps1;
772     tok->nextprompt = ps2;
773     return tok;
774 }
775 
776 
777 /* Free a tok_state structure */
778 
779 void
PyTokenizer_Free(struct tok_state * tok)780 PyTokenizer_Free(struct tok_state *tok)
781 {
782     if (tok->encoding != NULL)
783         PyMem_FREE(tok->encoding);
784 #ifndef PGEN
785     Py_XDECREF(tok->decoding_readline);
786     Py_XDECREF(tok->decoding_buffer);
787 #endif
788     if (tok->fp != NULL && tok->buf != NULL)
789         PyMem_FREE(tok->buf);
790     if (tok->input)
791         PyMem_FREE((char *)tok->input);
792     PyMem_FREE(tok);
793 }
794 
795 #if !defined(PGEN) && defined(Py_USING_UNICODE)
796 static int
tok_stdin_decode(struct tok_state * tok,char ** inp)797 tok_stdin_decode(struct tok_state *tok, char **inp)
798 {
799     PyObject *enc, *sysstdin, *decoded, *utf8;
800     const char *encoding;
801     char *converted;
802 
803     if (PySys_GetFile((char *)"stdin", NULL) != stdin)
804         return 0;
805     sysstdin = PySys_GetObject("stdin");
806     if (sysstdin == NULL || !PyFile_Check(sysstdin))
807         return 0;
808 
809     enc = ((PyFileObject *)sysstdin)->f_encoding;
810     if (enc == NULL || !PyString_Check(enc))
811         return 0;
812     Py_INCREF(enc);
813 
814     encoding = PyString_AsString(enc);
815     decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
816     if (decoded == NULL)
817         goto error_clear;
818 
819     utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
820     Py_DECREF(decoded);
821     if (utf8 == NULL)
822         goto error_clear;
823 
824     assert(PyString_Check(utf8));
825     converted = new_string(PyString_AS_STRING(utf8),
826                            PyString_GET_SIZE(utf8));
827     Py_DECREF(utf8);
828     if (converted == NULL)
829         goto error_nomem;
830 
831     PyMem_FREE(*inp);
832     *inp = converted;
833     if (tok->encoding != NULL)
834         PyMem_FREE(tok->encoding);
835     tok->encoding = new_string(encoding, strlen(encoding));
836     if (tok->encoding == NULL)
837         goto error_nomem;
838 
839     Py_DECREF(enc);
840     return 0;
841 
842 error_nomem:
843     Py_DECREF(enc);
844     tok->done = E_NOMEM;
845     return -1;
846 
847 error_clear:
848     Py_DECREF(enc);
849     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
850         tok->done = E_ERROR;
851         return -1;
852     }
853     /* Fallback to iso-8859-1: for backward compatibility */
854     PyErr_Clear();
855     return 0;
856 }
857 #endif
858 
859 /* Get next char, updating state; error code goes into tok->done */
860 
861 static int
tok_nextc(register struct tok_state * tok)862 tok_nextc(register struct tok_state *tok)
863 {
864     for (;;) {
865         if (tok->cur != tok->inp) {
866             return Py_CHARMASK(*tok->cur++); /* Fast path */
867         }
868         if (tok->done != E_OK)
869             return EOF;
870         if (tok->fp == NULL) {
871             char *end = strchr(tok->inp, '\n');
872             if (end != NULL)
873                 end++;
874             else {
875                 end = strchr(tok->inp, '\0');
876                 if (end == tok->inp) {
877                     tok->done = E_EOF;
878                     return EOF;
879                 }
880             }
881             if (tok->start == NULL)
882                 tok->buf = tok->cur;
883             tok->line_start = tok->cur;
884             tok->lineno++;
885             tok->inp = end;
886             return Py_CHARMASK(*tok->cur++);
887         }
888         if (tok->prompt != NULL) {
889             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
890             if (tok->nextprompt != NULL)
891                 tok->prompt = tok->nextprompt;
892             if (newtok == NULL)
893                 tok->done = E_INTR;
894             else if (*newtok == '\0') {
895                 PyMem_FREE(newtok);
896                 tok->done = E_EOF;
897             }
898 #if !defined(PGEN) && defined(Py_USING_UNICODE)
899             else if (tok_stdin_decode(tok, &newtok) != 0)
900                 PyMem_FREE(newtok);
901 #endif
902             else if (tok->start != NULL) {
903                 size_t start = tok->start - tok->buf;
904                 size_t oldlen = tok->cur - tok->buf;
905                 size_t newlen = oldlen + strlen(newtok);
906                 char *buf = tok->buf;
907                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
908                 tok->lineno++;
909                 if (buf == NULL) {
910                     PyMem_FREE(tok->buf);
911                     tok->buf = NULL;
912                     PyMem_FREE(newtok);
913                     tok->done = E_NOMEM;
914                     return EOF;
915                 }
916                 tok->buf = buf;
917                 tok->cur = tok->buf + oldlen;
918                 tok->line_start = tok->cur;
919                 strcpy(tok->buf + oldlen, newtok);
920                 PyMem_FREE(newtok);
921                 tok->inp = tok->buf + newlen;
922                 tok->end = tok->inp + 1;
923                 tok->start = tok->buf + start;
924             }
925             else {
926                 tok->lineno++;
927                 if (tok->buf != NULL)
928                     PyMem_FREE(tok->buf);
929                 tok->buf = newtok;
930                 tok->cur = tok->buf;
931                 tok->line_start = tok->buf;
932                 tok->inp = strchr(tok->buf, '\0');
933                 tok->end = tok->inp + 1;
934             }
935         }
936         else {
937             int done = 0;
938             Py_ssize_t cur = 0;
939             char *pt;
940             if (tok->start == NULL) {
941                 if (tok->buf == NULL) {
942                     tok->buf = (char *)
943                         PyMem_MALLOC(BUFSIZ);
944                     if (tok->buf == NULL) {
945                         tok->done = E_NOMEM;
946                         return EOF;
947                     }
948                     tok->end = tok->buf + BUFSIZ;
949                 }
950                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
951                           tok) == NULL) {
952                     if (!tok->decoding_erred)
953                         tok->done = E_EOF;
954                     done = 1;
955                 }
956                 else {
957                     tok->done = E_OK;
958                     tok->inp = strchr(tok->buf, '\0');
959                     done = tok->inp == tok->buf || tok->inp[-1] == '\n';
960                 }
961             }
962             else {
963                 cur = tok->cur - tok->buf;
964                 if (decoding_feof(tok)) {
965                     tok->done = E_EOF;
966                     done = 1;
967                 }
968                 else
969                     tok->done = E_OK;
970             }
971             tok->lineno++;
972             /* Read until '\n' or EOF */
973             while (!done) {
974                 Py_ssize_t curstart = tok->start == NULL ? -1 :
975                           tok->start - tok->buf;
976                 Py_ssize_t curvalid = tok->inp - tok->buf;
977                 Py_ssize_t newsize = curvalid + BUFSIZ;
978                 char *newbuf = tok->buf;
979                 newbuf = (char *)PyMem_REALLOC(newbuf,
980                                                newsize);
981                 if (newbuf == NULL) {
982                     tok->done = E_NOMEM;
983                     tok->cur = tok->inp;
984                     return EOF;
985                 }
986                 tok->buf = newbuf;
987                 tok->cur = tok->buf + cur;
988                 tok->line_start = tok->cur;
989                 tok->inp = tok->buf + curvalid;
990                 tok->end = tok->buf + newsize;
991                 tok->start = curstart < 0 ? NULL :
992                          tok->buf + curstart;
993                 if (decoding_fgets(tok->inp,
994                                (int)(tok->end - tok->inp),
995                                tok) == NULL) {
996                     /* Break out early on decoding
997                        errors, as tok->buf will be NULL
998                      */
999                     if (tok->decoding_erred)
1000                         return EOF;
1001                     /* Last line does not end in \n,
1002                        fake one */
1003                     strcpy(tok->inp, "\n");
1004                 }
1005                 tok->inp = strchr(tok->inp, '\0');
1006                 done = tok->inp[-1] == '\n';
1007             }
1008             if (tok->buf != NULL) {
1009                 tok->cur = tok->buf + cur;
1010                 tok->line_start = tok->cur;
1011                 /* replace "\r\n" with "\n" */
1012                 /* For Mac leave the \r, giving a syntax error */
1013                 pt = tok->inp - 2;
1014                 if (pt >= tok->buf && *pt == '\r') {
1015                     *pt++ = '\n';
1016                     *pt = '\0';
1017                     tok->inp = pt;
1018                 }
1019             }
1020         }
1021         if (tok->done != E_OK) {
1022             if (tok->prompt != NULL)
1023                 PySys_WriteStderr("\n");
1024             tok->cur = tok->inp;
1025             return EOF;
1026         }
1027     }
1028     /*NOTREACHED*/
1029 }
1030 
1031 
1032 /* Back-up one character */
1033 
1034 static void
tok_backup(register struct tok_state * tok,register int c)1035 tok_backup(register struct tok_state *tok, register int c)
1036 {
1037     if (c != EOF) {
1038         if (--tok->cur < tok->buf)
1039             Py_FatalError("tok_backup: beginning of buffer");
1040         if (*tok->cur != c)
1041             *tok->cur = c;
1042     }
1043 }
1044 
1045 
1046 /* Return the token corresponding to a single character */
1047 
1048 int
PyToken_OneChar(int c)1049 PyToken_OneChar(int c)
1050 {
1051     switch (c) {
1052     case '(':           return LPAR;
1053     case ')':           return RPAR;
1054     case '[':           return LSQB;
1055     case ']':           return RSQB;
1056     case ':':           return COLON;
1057     case ',':           return COMMA;
1058     case ';':           return SEMI;
1059     case '+':           return PLUS;
1060     case '-':           return MINUS;
1061     case '*':           return STAR;
1062     case '/':           return SLASH;
1063     case '|':           return VBAR;
1064     case '&':           return AMPER;
1065     case '<':           return LESS;
1066     case '>':           return GREATER;
1067     case '=':           return EQUAL;
1068     case '.':           return DOT;
1069     case '%':           return PERCENT;
1070     case '`':           return BACKQUOTE;
1071     case '{':           return LBRACE;
1072     case '}':           return RBRACE;
1073     case '^':           return CIRCUMFLEX;
1074     case '~':           return TILDE;
1075     case '@':       return AT;
1076     default:            return OP;
1077     }
1078 }
1079 
1080 
1081 int
PyToken_TwoChars(int c1,int c2)1082 PyToken_TwoChars(int c1, int c2)
1083 {
1084     switch (c1) {
1085     case '=':
1086         switch (c2) {
1087         case '=':               return EQEQUAL;
1088         }
1089         break;
1090     case '!':
1091         switch (c2) {
1092         case '=':               return NOTEQUAL;
1093         }
1094         break;
1095     case '<':
1096         switch (c2) {
1097         case '>':               return NOTEQUAL;
1098         case '=':               return LESSEQUAL;
1099         case '<':               return LEFTSHIFT;
1100         }
1101         break;
1102     case '>':
1103         switch (c2) {
1104         case '=':               return GREATEREQUAL;
1105         case '>':               return RIGHTSHIFT;
1106         }
1107         break;
1108     case '+':
1109         switch (c2) {
1110         case '=':               return PLUSEQUAL;
1111         }
1112         break;
1113     case '-':
1114         switch (c2) {
1115         case '=':               return MINEQUAL;
1116         }
1117         break;
1118     case '*':
1119         switch (c2) {
1120         case '*':               return DOUBLESTAR;
1121         case '=':               return STAREQUAL;
1122         }
1123         break;
1124     case '/':
1125         switch (c2) {
1126         case '/':               return DOUBLESLASH;
1127         case '=':               return SLASHEQUAL;
1128         }
1129         break;
1130     case '|':
1131         switch (c2) {
1132         case '=':               return VBAREQUAL;
1133         }
1134         break;
1135     case '%':
1136         switch (c2) {
1137         case '=':               return PERCENTEQUAL;
1138         }
1139         break;
1140     case '&':
1141         switch (c2) {
1142         case '=':               return AMPEREQUAL;
1143         }
1144         break;
1145     case '^':
1146         switch (c2) {
1147         case '=':               return CIRCUMFLEXEQUAL;
1148         }
1149         break;
1150     }
1151     return OP;
1152 }
1153 
1154 int
PyToken_ThreeChars(int c1,int c2,int c3)1155 PyToken_ThreeChars(int c1, int c2, int c3)
1156 {
1157     switch (c1) {
1158     case '<':
1159         switch (c2) {
1160         case '<':
1161             switch (c3) {
1162             case '=':
1163                 return LEFTSHIFTEQUAL;
1164             }
1165             break;
1166         }
1167         break;
1168     case '>':
1169         switch (c2) {
1170         case '>':
1171             switch (c3) {
1172             case '=':
1173                 return RIGHTSHIFTEQUAL;
1174             }
1175             break;
1176         }
1177         break;
1178     case '*':
1179         switch (c2) {
1180         case '*':
1181             switch (c3) {
1182             case '=':
1183                 return DOUBLESTAREQUAL;
1184             }
1185             break;
1186         }
1187         break;
1188     case '/':
1189         switch (c2) {
1190         case '/':
1191             switch (c3) {
1192             case '=':
1193                 return DOUBLESLASHEQUAL;
1194             }
1195             break;
1196         }
1197         break;
1198     }
1199     return OP;
1200 }
1201 
1202 static int
indenterror(struct tok_state * tok)1203 indenterror(struct tok_state *tok)
1204 {
1205     if (tok->alterror) {
1206         tok->done = E_TABSPACE;
1207         tok->cur = tok->inp;
1208         return 1;
1209     }
1210     if (tok->altwarning) {
1211         PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1212                           "in indentation\n", tok->filename);
1213         tok->altwarning = 0;
1214     }
1215     return 0;
1216 }
1217 
1218 /* Get next token, after space stripping etc. */
1219 
1220 static int
tok_get(register struct tok_state * tok,char ** p_start,char ** p_end)1221 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1222 {
1223     register int c;
1224     int blankline;
1225 
1226     *p_start = *p_end = NULL;
1227   nextline:
1228     tok->start = NULL;
1229     blankline = 0;
1230 
1231     /* Get indentation level */
1232     if (tok->atbol) {
1233         register int col = 0;
1234         register int altcol = 0;
1235         tok->atbol = 0;
1236         for (;;) {
1237             c = tok_nextc(tok);
1238             if (c == ' ')
1239                 col++, altcol++;
1240             else if (c == '\t') {
1241                 col = (col/tok->tabsize + 1) * tok->tabsize;
1242                 altcol = (altcol/tok->alttabsize + 1)
1243                     * tok->alttabsize;
1244             }
1245             else if (c == '\014') /* Control-L (formfeed) */
1246                 col = altcol = 0; /* For Emacs users */
1247             else
1248                 break;
1249         }
1250         tok_backup(tok, c);
1251         if (c == '#' || c == '\n') {
1252             /* Lines with only whitespace and/or comments
1253                shouldn't affect the indentation and are
1254                not passed to the parser as NEWLINE tokens,
1255                except *totally* empty lines in interactive
1256                mode, which signal the end of a command group. */
1257             if (col == 0 && c == '\n' && tok->prompt != NULL)
1258                 blankline = 0; /* Let it through */
1259             else
1260                 blankline = 1; /* Ignore completely */
1261             /* We can't jump back right here since we still
1262                may need to skip to the end of a comment */
1263         }
1264         if (!blankline && tok->level == 0) {
1265             if (col == tok->indstack[tok->indent]) {
1266                 /* No change */
1267                 if (altcol != tok->altindstack[tok->indent]) {
1268                     if (indenterror(tok))
1269                         return ERRORTOKEN;
1270                 }
1271             }
1272             else if (col > tok->indstack[tok->indent]) {
1273                 /* Indent -- always one */
1274                 if (tok->indent+1 >= MAXINDENT) {
1275                     tok->done = E_TOODEEP;
1276                     tok->cur = tok->inp;
1277                     return ERRORTOKEN;
1278                 }
1279                 if (altcol <= tok->altindstack[tok->indent]) {
1280                     if (indenterror(tok))
1281                         return ERRORTOKEN;
1282                 }
1283                 tok->pendin++;
1284                 tok->indstack[++tok->indent] = col;
1285                 tok->altindstack[tok->indent] = altcol;
1286             }
1287             else /* col < tok->indstack[tok->indent] */ {
1288                 /* Dedent -- any number, must be consistent */
1289                 while (tok->indent > 0 &&
1290                     col < tok->indstack[tok->indent]) {
1291                     tok->pendin--;
1292                     tok->indent--;
1293                 }
1294                 if (col != tok->indstack[tok->indent]) {
1295                     tok->done = E_DEDENT;
1296                     tok->cur = tok->inp;
1297                     return ERRORTOKEN;
1298                 }
1299                 if (altcol != tok->altindstack[tok->indent]) {
1300                     if (indenterror(tok))
1301                         return ERRORTOKEN;
1302                 }
1303             }
1304         }
1305     }
1306 
1307     tok->start = tok->cur;
1308 
1309     /* Return pending indents/dedents */
1310     if (tok->pendin != 0) {
1311         if (tok->pendin < 0) {
1312             tok->pendin++;
1313             return DEDENT;
1314         }
1315         else {
1316             tok->pendin--;
1317             return INDENT;
1318         }
1319     }
1320 
1321  again:
1322     tok->start = NULL;
1323     /* Skip spaces */
1324     do {
1325         c = tok_nextc(tok);
1326     } while (c == ' ' || c == '\t' || c == '\014');
1327 
1328     /* Set start of current token */
1329     tok->start = tok->cur - 1;
1330 
1331     /* Skip comment, while looking for tab-setting magic */
1332     if (c == '#') {
1333         static char *tabforms[] = {
1334             "tab-width:",                       /* Emacs */
1335             ":tabstop=",                        /* vim, full form */
1336             ":ts=",                             /* vim, abbreviated form */
1337             "set tabsize=",                     /* will vi never die? */
1338         /* more templates can be added here to support other editors */
1339         };
1340         char cbuf[80];
1341         char *tp, **cp;
1342         tp = cbuf;
1343         do {
1344             *tp++ = c = tok_nextc(tok);
1345         } while (c != EOF && c != '\n' &&
1346                  (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1347         *tp = '\0';
1348         for (cp = tabforms;
1349              cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1350              cp++) {
1351             if ((tp = strstr(cbuf, *cp))) {
1352                 int newsize = atoi(tp + strlen(*cp));
1353 
1354                 if (newsize >= 1 && newsize <= 40) {
1355                     tok->tabsize = newsize;
1356                     if (Py_VerboseFlag)
1357                         PySys_WriteStderr(
1358                         "Tab size set to %d\n",
1359                         newsize);
1360                 }
1361             }
1362         }
1363         while (c != EOF && c != '\n')
1364             c = tok_nextc(tok);
1365     }
1366 
1367     /* Check for EOF and errors now */
1368     if (c == EOF) {
1369         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1370     }
1371 
1372     /* Identifier (most frequent token!) */
1373     if (Py_ISALPHA(c) || c == '_') {
1374         /* Process r"", u"" and ur"" */
1375         switch (c) {
1376         case 'b':
1377         case 'B':
1378             c = tok_nextc(tok);
1379             if (c == 'r' || c == 'R')
1380                 c = tok_nextc(tok);
1381             if (c == '"' || c == '\'')
1382                 goto letter_quote;
1383             break;
1384         case 'r':
1385         case 'R':
1386             c = tok_nextc(tok);
1387             if (c == '"' || c == '\'')
1388                 goto letter_quote;
1389             break;
1390         case 'u':
1391         case 'U':
1392             c = tok_nextc(tok);
1393             if (c == 'r' || c == 'R')
1394                 c = tok_nextc(tok);
1395             if (c == '"' || c == '\'')
1396                 goto letter_quote;
1397             break;
1398         }
1399         while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
1400             c = tok_nextc(tok);
1401         }
1402         tok_backup(tok, c);
1403         *p_start = tok->start;
1404         *p_end = tok->cur;
1405         return NAME;
1406     }
1407 
1408     /* Newline */
1409     if (c == '\n') {
1410         tok->atbol = 1;
1411         if (blankline || tok->level > 0)
1412             goto nextline;
1413         *p_start = tok->start;
1414         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1415         tok->cont_line = 0;
1416         return NEWLINE;
1417     }
1418 
1419     /* Period or number starting with period? */
1420     if (c == '.') {
1421         c = tok_nextc(tok);
1422         if (isdigit(c)) {
1423             goto fraction;
1424         }
1425         else {
1426             tok_backup(tok, c);
1427             *p_start = tok->start;
1428             *p_end = tok->cur;
1429             return DOT;
1430         }
1431     }
1432 
1433     /* Number */
1434     if (isdigit(c)) {
1435         if (c == '0') {
1436             /* Hex, octal or binary -- maybe. */
1437             c = tok_nextc(tok);
1438             if (c == '.')
1439                 goto fraction;
1440 #ifndef WITHOUT_COMPLEX
1441             if (c == 'j' || c == 'J')
1442                 goto imaginary;
1443 #endif
1444             if (c == 'x' || c == 'X') {
1445 
1446                 /* Hex */
1447                 c = tok_nextc(tok);
1448                 if (!isxdigit(c)) {
1449                     tok->done = E_TOKEN;
1450                     tok_backup(tok, c);
1451                     return ERRORTOKEN;
1452                 }
1453                 do {
1454                     c = tok_nextc(tok);
1455                 } while (isxdigit(c));
1456             }
1457             else if (c == 'o' || c == 'O') {
1458                 /* Octal */
1459                 c = tok_nextc(tok);
1460                 if (c < '0' || c >= '8') {
1461                     tok->done = E_TOKEN;
1462                     tok_backup(tok, c);
1463                     return ERRORTOKEN;
1464                 }
1465                 do {
1466                     c = tok_nextc(tok);
1467                 } while ('0' <= c && c < '8');
1468             }
1469             else if (c == 'b' || c == 'B') {
1470                 /* Binary */
1471                 c = tok_nextc(tok);
1472                 if (c != '0' && c != '1') {
1473                     tok->done = E_TOKEN;
1474                     tok_backup(tok, c);
1475                     return ERRORTOKEN;
1476                 }
1477                 do {
1478                     c = tok_nextc(tok);
1479                 } while (c == '0' || c == '1');
1480             }
1481             else {
1482                 int found_decimal = 0;
1483                 /* Octal; c is first char of it */
1484                 /* There's no 'isoctdigit' macro, sigh */
1485                 while ('0' <= c && c < '8') {
1486                     c = tok_nextc(tok);
1487                 }
1488                 if (isdigit(c)) {
1489                     found_decimal = 1;
1490                     do {
1491                         c = tok_nextc(tok);
1492                     } while (isdigit(c));
1493                 }
1494                 if (c == '.')
1495                     goto fraction;
1496                 else if (c == 'e' || c == 'E')
1497                     goto exponent;
1498 #ifndef WITHOUT_COMPLEX
1499                 else if (c == 'j' || c == 'J')
1500                     goto imaginary;
1501 #endif
1502                 else if (found_decimal) {
1503                     tok->done = E_TOKEN;
1504                     tok_backup(tok, c);
1505                     return ERRORTOKEN;
1506                 }
1507             }
1508             if (c == 'l' || c == 'L')
1509                 c = tok_nextc(tok);
1510         }
1511         else {
1512             /* Decimal */
1513             do {
1514                 c = tok_nextc(tok);
1515             } while (isdigit(c));
1516             if (c == 'l' || c == 'L')
1517                 c = tok_nextc(tok);
1518             else {
1519                 /* Accept floating point numbers. */
1520                 if (c == '.') {
1521         fraction:
1522                     /* Fraction */
1523                     do {
1524                         c = tok_nextc(tok);
1525                     } while (isdigit(c));
1526                 }
1527                 if (c == 'e' || c == 'E') {
1528                     int e;
1529                   exponent:
1530                     e = c;
1531                     /* Exponent part */
1532                     c = tok_nextc(tok);
1533                     if (c == '+' || c == '-') {
1534                         c = tok_nextc(tok);
1535                         if (!isdigit(c)) {
1536                             tok->done = E_TOKEN;
1537                             tok_backup(tok, c);
1538                             return ERRORTOKEN;
1539                         }
1540                     } else if (!isdigit(c)) {
1541                         tok_backup(tok, c);
1542                         tok_backup(tok, e);
1543                         *p_start = tok->start;
1544                         *p_end = tok->cur;
1545                         return NUMBER;
1546                     }
1547                     do {
1548                         c = tok_nextc(tok);
1549                     } while (isdigit(c));
1550                 }
1551 #ifndef WITHOUT_COMPLEX
1552                 if (c == 'j' || c == 'J')
1553                     /* Imaginary part */
1554         imaginary:
1555                     c = tok_nextc(tok);
1556 #endif
1557             }
1558         }
1559         tok_backup(tok, c);
1560         *p_start = tok->start;
1561         *p_end = tok->cur;
1562         return NUMBER;
1563     }
1564 
1565   letter_quote:
1566     /* String */
1567     if (c == '\'' || c == '"') {
1568         Py_ssize_t quote2 = tok->cur - tok->start + 1;
1569         int quote = c;
1570         int triple = 0;
1571         int tripcount = 0;
1572         for (;;) {
1573             c = tok_nextc(tok);
1574             if (c == '\n') {
1575                 if (!triple) {
1576                     tok->done = E_EOLS;
1577                     tok_backup(tok, c);
1578                     return ERRORTOKEN;
1579                 }
1580                 tripcount = 0;
1581                 tok->cont_line = 1; /* multiline string. */
1582             }
1583             else if (c == EOF) {
1584                 if (triple)
1585                     tok->done = E_EOFS;
1586                 else
1587                     tok->done = E_EOLS;
1588                 tok->cur = tok->inp;
1589                 return ERRORTOKEN;
1590             }
1591             else if (c == quote) {
1592                 tripcount++;
1593                 if (tok->cur - tok->start == quote2) {
1594                     c = tok_nextc(tok);
1595                     if (c == quote) {
1596                         triple = 1;
1597                         tripcount = 0;
1598                         continue;
1599                     }
1600                     tok_backup(tok, c);
1601                 }
1602                 if (!triple || tripcount == 3)
1603                     break;
1604             }
1605             else if (c == '\\') {
1606                 tripcount = 0;
1607                 c = tok_nextc(tok);
1608                 if (c == EOF) {
1609                     tok->done = E_EOLS;
1610                     tok->cur = tok->inp;
1611                     return ERRORTOKEN;
1612                 }
1613             }
1614             else
1615                 tripcount = 0;
1616         }
1617         *p_start = tok->start;
1618         *p_end = tok->cur;
1619         return STRING;
1620     }
1621 
1622     /* Line continuation */
1623     if (c == '\\') {
1624         c = tok_nextc(tok);
1625         if (c != '\n') {
1626             tok->done = E_LINECONT;
1627             tok->cur = tok->inp;
1628             return ERRORTOKEN;
1629         }
1630         tok->cont_line = 1;
1631         goto again; /* Read next line */
1632     }
1633 
1634     /* Check for two-character token */
1635     {
1636         int c2 = tok_nextc(tok);
1637         int token = PyToken_TwoChars(c, c2);
1638 #ifndef PGEN
1639         if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1640             if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1641                                    "<> not supported in 3.x; use !=",
1642                                    tok->filename, tok->lineno,
1643                                    NULL, NULL)) {
1644                 tok->done = E_ERROR;
1645                 tok->cur = tok->inp;
1646                 return ERRORTOKEN;
1647             }
1648         }
1649 #endif
1650         if (token != OP) {
1651             int c3 = tok_nextc(tok);
1652             int token3 = PyToken_ThreeChars(c, c2, c3);
1653             if (token3 != OP) {
1654                 token = token3;
1655             } else {
1656                 tok_backup(tok, c3);
1657             }
1658             *p_start = tok->start;
1659             *p_end = tok->cur;
1660             return token;
1661         }
1662         tok_backup(tok, c2);
1663     }
1664 
1665     /* Keep track of parentheses nesting level */
1666     switch (c) {
1667     case '(':
1668     case '[':
1669     case '{':
1670         tok->level++;
1671         break;
1672     case ')':
1673     case ']':
1674     case '}':
1675         tok->level--;
1676         break;
1677     }
1678 
1679     /* Punctuation character */
1680     *p_start = tok->start;
1681     *p_end = tok->cur;
1682     return PyToken_OneChar(c);
1683 }
1684 
1685 int
PyTokenizer_Get(struct tok_state * tok,char ** p_start,char ** p_end)1686 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1687 {
1688     int result = tok_get(tok, p_start, p_end);
1689     if (tok->fp && ferror(tok->fp)) {
1690         clearerr(tok->fp);
1691         result = ERRORTOKEN;
1692         tok->done = E_IO;
1693     }
1694     if (tok->decoding_erred) {
1695         result = ERRORTOKEN;
1696         tok->done = E_DECODE;
1697     }
1698     return result;
1699 }
1700 
/* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */

#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Variant for PGEN / non-Unicode builds: nothing to restore, so always
   return NULL. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    return NULL;
}
#else
/* Decode the first `len` bytes of `text` as UTF-8 and re-encode the
   result in encoding `enc`, using the "replace" error handler for both
   steps.  Returns the encoded string object, or NULL on failure; in the
   failure case the pending exception is cleared. */
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len)
{
    PyObject *ret = NULL;
    PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
    if (unicode_text) {
        ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
        Py_DECREF(unicode_text);
    }
    if (!ret) {
        PyErr_Clear();
    }
    return ret;
}

/* Convert the first `len` bytes of tok->buf from UTF-8 back to the
   encoding recorded in tok->encoding.

   Returns a NUL-terminated copy allocated with PyObject_MALLOC, or NULL
   when no encoding is recorded, the conversion fails or memory runs
   out.  When the conversion succeeds and *offset > 1, *offset is
   recomputed so that it refers to the re-encoded text. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    char *text = NULL;
    PyObject *lineobj;
    const char *line;
    Py_ssize_t linelen;

    if (tok->encoding == NULL)
        return NULL;

    /* convert source to original encoding */
    lineobj = dec_utf8(tok->encoding, tok->buf, len);
    if (lineobj == NULL)
        return NULL;

    line = PyString_AsString(lineobj);
    linelen = PyString_Size(lineobj);
    if (line == NULL || linelen < 0) {
        /* Never hand back an uninitialized buffer: leave text NULL and
           drop the exception, as dec_utf8() does on its failures. */
        PyErr_Clear();
    }
    else {
        text = PyObject_MALLOC(linelen + 1);
        if (text != NULL) {
            memcpy(text, line, linelen);
            text[linelen] = '\0';
        }
    }
    Py_DECREF(lineobj);

    /* adjust error offset */
    if (*offset > 1) {
        PyObject *offsetobj = dec_utf8(tok->encoding,
                                       tok->buf, *offset-1);
        if (offsetobj) {
            *offset = (int)PyString_Size(offsetobj) + 1;
            Py_DECREF(offsetobj);
        }
    }
    return text;
}
#endif
1761 
1762 
1763 #ifdef Py_DEBUG
1764 
1765 void
tok_dump(int type,char * start,char * end)1766 tok_dump(int type, char *start, char *end)
1767 {
1768     printf("%s", _PyParser_TokenNames[type]);
1769     if (type == NAME || type == NUMBER || type == STRING || type == OP)
1770         printf("(%.*s)", (int)(end - start), start);
1771 }
1772 
1773 #endif
1774