1 
2 /* Tokenizer implementation */
3 
4 #include "Python.h"
5 #include "pgenheaders.h"
6 
7 #include <ctype.h>
8 #include <assert.h>
9 
10 #include "tokenizer.h"
11 #include "errcode.h"
12 
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
21 
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24    empty malloc'ed string for EOF;
25    NULL if interrupted */
26 
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
29 
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
34 
/* Token names */

/* Printable name for each token type, one string per entry.  As the note
   near the end of the table says, the order has to track the #defines in
   token.h; presumably the array is indexed by those token numbers --
   confirm against token.h before adding or reordering entries.  The last
   two entries are placeholder names rather than real tokens. */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94 
95 /* Create and initialize a new tok_state structure */
96 
97 static struct tok_state *
tok_new(void)98 tok_new(void)
99 {
100     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101                                             sizeof(struct tok_state));
102     if (tok == NULL)
103         return NULL;
104     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105     tok->done = E_OK;
106     tok->fp = NULL;
107     tok->input = NULL;
108     tok->tabsize = TABSIZE;
109     tok->indent = 0;
110     tok->indstack[0] = 0;
111     tok->atbol = 1;
112     tok->pendin = 0;
113     tok->prompt = tok->nextprompt = NULL;
114     tok->lineno = 0;
115     tok->level = 0;
116     tok->filename = NULL;
117     tok->altwarning = 0;
118     tok->alterror = 0;
119     tok->alttabsize = 1;
120     tok->altindstack[0] = 0;
121     tok->decoding_state = 0;
122     tok->decoding_erred = 0;
123     tok->read_coding_spec = 0;
124     tok->encoding = NULL;
125     tok->cont_line = 0;
126 #ifndef PGEN
127     tok->decoding_readline = NULL;
128     tok->decoding_buffer = NULL;
129 #endif
130     return tok;
131 }
132 
133 static char *
new_string(const char * s,Py_ssize_t len)134 new_string(const char *s, Py_ssize_t len)
135 {
136     char* result = (char *)PyMem_MALLOC(len + 1);
137     if (result != NULL) {
138         memcpy(result, s, len);
139         result[len] = '\0';
140     }
141     return result;
142 }
143 
144 #ifdef PGEN
145 
146 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)147 decoding_fgets(char *s, int size, struct tok_state *tok)
148 {
149     return fgets(s, size, tok->fp);
150 }
151 
152 static int
decoding_feof(struct tok_state * tok)153 decoding_feof(struct tok_state *tok)
154 {
155     return feof(tok->fp);
156 }
157 
/* PGEN build: no decoding is performed; hand back a private copy of STR
   (NULL if the copy cannot be allocated).  EXEC_INPUT and TOK are unused
   in this variant. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t nbytes = strlen(str);

    return new_string(str, nbytes);
}
163 
164 #else /* PGEN */
165 
166 static char *
error_ret(struct tok_state * tok)167 error_ret(struct tok_state *tok) /* XXX */
168 {
169     tok->decoding_erred = 1;
170     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
171         PyMem_FREE(tok->buf);
172     tok->buf = NULL;
173     return NULL;                /* as if it were EOF */
174 }
175 
176 
/* Normalise the spelling of an encoding name, for utf-8 and latin-1 only.

   At most the first 12 characters of S are examined, case-insensitively
   and with '_' treated as '-'.  Returns the literal "utf-8" for "utf-8"
   and any "utf-8-*" variant, the literal "iso-8859-1" for "latin-1",
   "iso-8859-1", "iso-latin-1" and their "-*" variants, and S itself
   (same pointer, unmodified) for anything else.  Callers can therefore
   detect a rewrite by comparing the result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing a negative plain char to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
205 
206 /* Return the coding spec in S, or NULL if none is found.  */
207 
208 static char *
get_coding_spec(const char * s,Py_ssize_t size)209 get_coding_spec(const char *s, Py_ssize_t size)
210 {
211     Py_ssize_t i;
212     /* Coding spec must be in a comment, and that comment must be
213      * the only statement on the source code line. */
214     for (i = 0; i < size - 6; i++) {
215         if (s[i] == '#')
216             break;
217         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218             return NULL;
219     }
220     for (; i < size - 6; i++) { /* XXX inefficient search */
221         const char* t = s + i;
222         if (strncmp(t, "coding", 6) == 0) {
223             const char* begin = NULL;
224             t += 6;
225             if (t[0] != ':' && t[0] != '=')
226                 continue;
227             do {
228                 t++;
229             } while (t[0] == '\x20' || t[0] == '\t');
230 
231             begin = t;
232             while (Py_ISALNUM(t[0]) ||
233                    t[0] == '-' || t[0] == '_' || t[0] == '.')
234                 t++;
235 
236             if (begin < t) {
237                 char* r = new_string(begin, t - begin);
238                 char* q = get_normal_name(r);
239                 if (r != q) {
240                     PyMem_FREE(r);
241                     r = new_string(q, strlen(q));
242                 }
243                 return r;
244             }
245         }
246     }
247     return NULL;
248 }
249 
250 /* Check whether the line contains a coding spec. If it does,
251    invoke the set_readline function for the new encoding.
252    This function receives the tok_state and the new encoding.
253    Return 1 on success, 0 on failure.  */
254 
255 static int
check_coding_spec(const char * line,Py_ssize_t size,struct tok_state * tok,int set_readline (struct tok_state *,const char *))256 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
257                   int set_readline(struct tok_state *, const char *))
258 {
259     char * cs;
260     int r = 1;
261 
262     if (tok->cont_line)
263         /* It's a continuation line, so it can't be a coding spec. */
264         return 1;
265     cs = get_coding_spec(line, size);
266     if (cs != NULL) {
267         tok->read_coding_spec = 1;
268         if (tok->encoding == NULL) {
269             assert(tok->decoding_state == 1); /* raw */
270             if (strcmp(cs, "utf-8") == 0 ||
271                 strcmp(cs, "iso-8859-1") == 0) {
272                 tok->encoding = cs;
273             } else {
274 #ifdef Py_USING_UNICODE
275                 r = set_readline(tok, cs);
276                 if (r) {
277                     tok->encoding = cs;
278                     tok->decoding_state = -1;
279                 }
280                 else
281                     PyMem_FREE(cs);
282 #else
283                 /* Without Unicode support, we cannot
284                    process the coding spec. Since there
285                    won't be any Unicode literals, that
286                    won't matter. */
287                 PyMem_FREE(cs);
288 #endif
289             }
290         } else {                /* then, compare cs with BOM */
291             r = (strcmp(tok->encoding, cs) == 0);
292             PyMem_FREE(cs);
293         }
294     }
295     if (!r) {
296         cs = tok->encoding;
297         if (!cs)
298             cs = "with BOM";
299         PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
300     }
301     return r;
302 }
303 
304 /* See whether the file starts with a BOM. If it does,
305    invoke the set_readline function with the new encoding.
306    Return 1 on success, 0 on failure.  */
307 
308 static int
check_bom(int get_char (struct tok_state *),void unget_char (int,struct tok_state *),int set_readline (struct tok_state *,const char *),struct tok_state * tok)309 check_bom(int get_char(struct tok_state *),
310           void unget_char(int, struct tok_state *),
311           int set_readline(struct tok_state *, const char *),
312           struct tok_state *tok)
313 {
314     int ch1, ch2, ch3;
315     ch1 = get_char(tok);
316     tok->decoding_state = 1;
317     if (ch1 == EOF) {
318         return 1;
319     } else if (ch1 == 0xEF) {
320         ch2 = get_char(tok);
321         if (ch2 != 0xBB) {
322             unget_char(ch2, tok);
323             unget_char(ch1, tok);
324             return 1;
325         }
326         ch3 = get_char(tok);
327         if (ch3 != 0xBF) {
328             unget_char(ch3, tok);
329             unget_char(ch2, tok);
330             unget_char(ch1, tok);
331             return 1;
332         }
333 #if 0
334     /* Disable support for UTF-16 BOMs until a decision
335        is made whether this needs to be supported.  */
336     } else if (ch1 == 0xFE) {
337         ch2 = get_char(tok);
338         if (ch2 != 0xFF) {
339             unget_char(ch2, tok);
340             unget_char(ch1, tok);
341             return 1;
342         }
343         if (!set_readline(tok, "utf-16-be"))
344             return 0;
345         tok->decoding_state = -1;
346     } else if (ch1 == 0xFF) {
347         ch2 = get_char(tok);
348         if (ch2 != 0xFE) {
349             unget_char(ch2, tok);
350             unget_char(ch1, tok);
351             return 1;
352         }
353         if (!set_readline(tok, "utf-16-le"))
354             return 0;
355         tok->decoding_state = -1;
356 #endif
357     } else {
358         unget_char(ch1, tok);
359         return 1;
360     }
361     if (tok->encoding != NULL)
362         PyMem_FREE(tok->encoding);
363     tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
364     return 1;
365 }
366 
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure or at end of input, else S.

   At most SIZE-1 bytes are stored, followed by a NUL terminator.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: tok->decoding_readline has to be called to get a new line.
     2) A unicode object: decoding_feof already called
        tok->decoding_readline and parked the result in
        tok->decoding_buffer.
     3) A string object: a previous fp_readl call did not have room in S
        for the whole line returned by tok->decoding_readline, and
        tok->decoding_buffer holds the UTF-8 bytes that did not fit.
        In this case fp_readl is called in a loop (with an expanded
        buffer) until the buffer ends with a '\n' (or until the end of
        the file is reached): see tok_nextc and its calls to
        decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;
    Py_ssize_t room;

    /* Reserve one byte of S for the terminator. */
    assert(size > 0);
    room = size - 1;

    if (pending != NULL) {
        /* Take over the parked object; leftover bytes from an earlier
           call are already UTF-8 and need no conversion. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > room) {
        /* Park whatever does not fit for the next call. */
        PyObject *overflow = PyString_FromStringAndSize(bytes + room,
                                                        nbytes - room);
        tok->decoding_buffer = overflow;
        if (overflow == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = room;
    }

    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    /* An empty line from the reader means EOF. */
    return nbytes == 0 ? NULL : s;
#endif
}
432 
433 /* Set the readline function for TOK to a StreamReader's
434    readline function. The StreamReader is named ENC.
435 
436    This function is called from check_bom and check_coding_spec.
437 
438    ENC is usually identical to the future value of tok->encoding,
439    except for the (currently unsupported) case of UTF-16.
440 
441    Return 1 on success, 0 on failure. */
442 
443 static int
fp_setreadl(struct tok_state * tok,const char * enc)444 fp_setreadl(struct tok_state *tok, const char* enc)
445 {
446     PyObject *reader, *stream, *readline;
447 
448     /* XXX: constify filename argument. */
449     stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
450     if (stream == NULL)
451         return 0;
452 
453     reader = PyCodec_StreamReader(enc, stream, NULL);
454     Py_DECREF(stream);
455     if (reader == NULL)
456         return 0;
457 
458     readline = PyObject_GetAttrString(reader, "readline");
459     Py_DECREF(reader);
460     if (readline == NULL)
461         return 0;
462 
463     tok->decoding_readline = readline;
464     return 1;
465 }
466 
467 /* Fetch the next byte from TOK. */
468 
fp_getc(struct tok_state * tok)469 static int fp_getc(struct tok_state *tok) {
470     return getc(tok->fp);
471 }
472 
473 /* Unfetch the last byte back into TOK.  */
474 
fp_ungetc(int c,struct tok_state * tok)475 static void fp_ungetc(int c, struct tok_state *tok) {
476     ungetc(c, tok->fp);
477 }
478 
479 /* Read a line of input from TOK. Determine encoding
480    if necessary.  */
481 
482 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)483 decoding_fgets(char *s, int size, struct tok_state *tok)
484 {
485     char *line = NULL;
486     int badchar = 0;
487     for (;;) {
488         if (tok->decoding_state < 0) {
489             /* We already have a codec associated with
490                this input. */
491             line = fp_readl(s, size, tok);
492             break;
493         } else if (tok->decoding_state > 0) {
494             /* We want a 'raw' read. */
495             line = Py_UniversalNewlineFgets(s, size,
496                                             tok->fp, NULL);
497             break;
498         } else {
499             /* We have not yet determined the encoding.
500                If an encoding is found, use the file-pointer
501                reader functions from now on. */
502             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
503                 return error_ret(tok);
504             assert(tok->decoding_state != 0);
505         }
506     }
507     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
508         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
509             return error_ret(tok);
510         }
511     }
512 #ifndef PGEN
513     /* The default encoding is ASCII, so make sure we don't have any
514        non-ASCII bytes in it. */
515     if (line && !tok->encoding) {
516         unsigned char *c;
517         for (c = (unsigned char *)line; *c; c++)
518             if (*c > 127) {
519                 badchar = *c;
520                 break;
521             }
522     }
523     if (badchar) {
524         char buf[500];
525         /* Need to add 1 to the line number, since this line
526            has not been counted, yet.  */
527         sprintf(buf,
528             "Non-ASCII character '\\x%.2x' "
529             "in file %.200s on line %i, "
530             "but no encoding declared; "
531             "see http://www.python.org/peps/pep-0263.html for details",
532             badchar, tok->filename, tok->lineno + 1);
533         PyErr_SetString(PyExc_SyntaxError, buf);
534         return error_ret(tok);
535     }
536 #endif
537     return line;
538 }
539 
540 static int
decoding_feof(struct tok_state * tok)541 decoding_feof(struct tok_state *tok)
542 {
543     if (tok->decoding_state >= 0) {
544         return feof(tok->fp);
545     } else {
546         PyObject* buf = tok->decoding_buffer;
547         if (buf == NULL) {
548             buf = PyObject_CallObject(tok->decoding_readline, NULL);
549             if (buf == NULL) {
550                 error_ret(tok);
551                 return 1;
552             } else {
553                 tok->decoding_buffer = buf;
554             }
555         }
556         return PyObject_Length(buf) == 0;
557     }
558 }
559 
560 /* Fetch a byte from TOK, using the string buffer. */
561 
562 static int
buf_getc(struct tok_state * tok)563 buf_getc(struct tok_state *tok) {
564     return Py_CHARMASK(*tok->str++);
565 }
566 
567 /* Unfetch a byte from TOK, using the string buffer. */
568 
569 static void
buf_ungetc(int c,struct tok_state * tok)570 buf_ungetc(int c, struct tok_state *tok) {
571     tok->str--;
572     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
573 }
574 
575 /* Set the readline function for TOK to ENC. For the string-based
576    tokenizer, this means to just record the encoding. */
577 
578 static int
buf_setreadl(struct tok_state * tok,const char * enc)579 buf_setreadl(struct tok_state *tok, const char* enc) {
580     tok->enc = enc;
581     return 1;
582 }
583 
/* Return a new Python string object holding the UTF-8 encoding of the
   NUL-terminated C byte string STR, which is encoded with ENC.
   Returns NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *decoded;
    PyObject *encoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL)
        return NULL;
    /* The intermediate unicode object is only needed for this call. */
    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
599 
600 
601 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)602 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
603     int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
604     char *buf, *current;
605     char c = '\0';
606     buf = PyMem_MALLOC(needed_length);
607     if (buf == NULL) {
608         tok->done = E_NOMEM;
609         return NULL;
610     }
611     for (current = buf; *s; s++, current++) {
612         c = *s;
613         if (skip_next_lf) {
614             skip_next_lf = 0;
615             if (c == '\n') {
616                 c = *++s;
617                 if (!c)
618                     break;
619             }
620         }
621         if (c == '\r') {
622             skip_next_lf = 1;
623             c = '\n';
624         }
625         *current = c;
626     }
627     /* If this is exec input, add a newline to the end of the string if
628        there isn't one already. */
629     if (exec_input && c != '\n') {
630         *current = '\n';
631         current++;
632     }
633     *current = '\0';
634     final_length = current - buf + 1;
635     if (final_length < needed_length && final_length)
636         /* should never fail */
637         buf = PyMem_REALLOC(buf, final_length);
638     return buf;
639 }
640 
641 /* Decode a byte string STR for use as the buffer of TOK.
642    Look for encoding declarations inside STR, and record them
643    inside TOK.  */
644 
645 static const char *
decode_str(const char * input,int single,struct tok_state * tok)646 decode_str(const char *input, int single, struct tok_state *tok)
647 {
648     PyObject* utf8 = NULL;
649     const char *str;
650     const char *s;
651     const char *newl[2] = {NULL, NULL};
652     int lineno = 0;
653     tok->input = str = translate_newlines(input, single, tok);
654     if (str == NULL)
655         return NULL;
656     tok->enc = NULL;
657     tok->str = str;
658     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
659         return error_ret(tok);
660     str = tok->str;             /* string after BOM if any */
661     assert(str);
662 #ifdef Py_USING_UNICODE
663     if (tok->enc != NULL) {
664         utf8 = translate_into_utf8(str, tok->enc);
665         if (utf8 == NULL)
666             return error_ret(tok);
667         str = PyString_AsString(utf8);
668     }
669 #endif
670     for (s = str;; s++) {
671         if (*s == '\0') break;
672         else if (*s == '\n') {
673             assert(lineno < 2);
674             newl[lineno] = s;
675             lineno++;
676             if (lineno == 2) break;
677         }
678     }
679     tok->enc = NULL;
680     /* need to check line 1 and 2 separately since check_coding_spec
681        assumes a single line as input */
682     if (newl[0]) {
683         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
684             return error_ret(tok);
685         if (tok->enc == NULL && newl[1]) {
686             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
687                                    tok, buf_setreadl))
688                 return error_ret(tok);
689         }
690     }
691 #ifdef Py_USING_UNICODE
692     if (tok->enc != NULL) {
693         assert(utf8 == NULL);
694         utf8 = translate_into_utf8(str, tok->enc);
695         if (utf8 == NULL)
696             return error_ret(tok);
697         str = PyString_AsString(utf8);
698     }
699 #endif
700     assert(tok->decoding_buffer == NULL);
701     tok->decoding_buffer = utf8; /* CAUTION */
702     return str;
703 }
704 
705 #endif /* PGEN */
706 
707 /* Set up tokenizer for string */
708 
709 struct tok_state *
PyTokenizer_FromString(const char * str,int exec_input)710 PyTokenizer_FromString(const char *str, int exec_input)
711 {
712     struct tok_state *tok = tok_new();
713     if (tok == NULL)
714         return NULL;
715     str = (char *)decode_str(str, exec_input, tok);
716     if (str == NULL) {
717         PyTokenizer_Free(tok);
718         return NULL;
719     }
720 
721     /* XXX: constify members. */
722     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
723     return tok;
724 }
725 
726 
727 /* Set up tokenizer for file */
728 
729 struct tok_state *
PyTokenizer_FromFile(FILE * fp,char * ps1,char * ps2)730 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
731 {
732     struct tok_state *tok = tok_new();
733     if (tok == NULL)
734         return NULL;
735     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
736         PyTokenizer_Free(tok);
737         return NULL;
738     }
739     tok->cur = tok->inp = tok->buf;
740     tok->end = tok->buf + BUFSIZ;
741     tok->fp = fp;
742     tok->prompt = ps1;
743     tok->nextprompt = ps2;
744     return tok;
745 }
746 
747 
748 /* Free a tok_state structure */
749 
750 void
PyTokenizer_Free(struct tok_state * tok)751 PyTokenizer_Free(struct tok_state *tok)
752 {
753     if (tok->encoding != NULL)
754         PyMem_FREE(tok->encoding);
755 #ifndef PGEN
756     Py_XDECREF(tok->decoding_readline);
757     Py_XDECREF(tok->decoding_buffer);
758 #endif
759     if (tok->fp != NULL && tok->buf != NULL)
760         PyMem_FREE(tok->buf);
761     if (tok->input)
762         PyMem_FREE((char *)tok->input);
763     PyMem_FREE(tok);
764 }
765 
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactively read line from sys.stdin's encoding to
   UTF-8.

   *INP is a PyMem-allocated line.  If sys.stdin is the C stdin and
   carries a string f_encoding, the line is decoded with it and replaced
   by a newly allocated UTF-8 copy (the old buffer is freed), and
   tok->encoding is set to a copy of the encoding name.

   Returns 0 when the line was converted or left untouched (no usable
   encoding, or a UnicodeDecodeError, which is cleared).  Returns -1 with
   tok->done set to E_NOMEM or E_ERROR otherwise; *INP is still a valid
   allocation that the caller must free. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *stdin_obj, *enc_obj, *unicode_line, *utf8_line;
    const char *enc_name;
    char *recoded;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL)
        return 0;
    if (!PyFile_Check(stdin_obj))
        return 0;

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL)
        return 0;
    if (!PyString_Check(enc_obj))
        return 0;
    /* Hold a reference while enc_name points into the object. */
    Py_INCREF(enc_obj);
    enc_name = PyString_AsString(enc_obj);

    unicode_line = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (unicode_line == NULL)
        goto decode_failed;

    utf8_line = PyUnicode_AsEncodedString(unicode_line, "utf-8", NULL);
    Py_DECREF(unicode_line);
    if (utf8_line == NULL)
        goto decode_failed;

    assert(PyString_Check(utf8_line));
    recoded = new_string(PyString_AS_STRING(utf8_line),
                         PyString_GET_SIZE(utf8_line));
    Py_DECREF(utf8_line);
    if (recoded == NULL)
        goto out_of_memory;

    /* Swap the caller's line for the UTF-8 version. */
    PyMem_FREE(*inp);
    *inp = recoded;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL)
        goto out_of_memory;

    Py_DECREF(enc_obj);
    return 0;

out_of_memory:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

decode_failed:
    Py_DECREF(enc_obj);
    if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        /* Fallback to iso-8859-1: for backward compatibility */
        PyErr_Clear();
        return 0;
    }
    tok->done = E_ERROR;
    return -1;
}
#endif
829 
830 /* Get next char, updating state; error code goes into tok->done */
831 
832 static int
tok_nextc(register struct tok_state * tok)833 tok_nextc(register struct tok_state *tok)
834 {
835     for (;;) {
836         if (tok->cur != tok->inp) {
837             return Py_CHARMASK(*tok->cur++); /* Fast path */
838         }
839         if (tok->done != E_OK)
840             return EOF;
841         if (tok->fp == NULL) {
842             char *end = strchr(tok->inp, '\n');
843             if (end != NULL)
844                 end++;
845             else {
846                 end = strchr(tok->inp, '\0');
847                 if (end == tok->inp) {
848                     tok->done = E_EOF;
849                     return EOF;
850                 }
851             }
852             if (tok->start == NULL)
853                 tok->buf = tok->cur;
854             tok->line_start = tok->cur;
855             tok->lineno++;
856             tok->inp = end;
857             return Py_CHARMASK(*tok->cur++);
858         }
859         if (tok->prompt != NULL) {
860             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
861             if (tok->nextprompt != NULL)
862                 tok->prompt = tok->nextprompt;
863             if (newtok == NULL)
864                 tok->done = E_INTR;
865             else if (*newtok == '\0') {
866                 PyMem_FREE(newtok);
867                 tok->done = E_EOF;
868             }
869 #if !defined(PGEN) && defined(Py_USING_UNICODE)
870             else if (tok_stdin_decode(tok, &newtok) != 0)
871                 PyMem_FREE(newtok);
872 #endif
873             else if (tok->start != NULL) {
874                 size_t start = tok->start - tok->buf;
875                 size_t oldlen = tok->cur - tok->buf;
876                 size_t newlen = oldlen + strlen(newtok);
877                 char *buf = tok->buf;
878                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
879                 tok->lineno++;
880                 if (buf == NULL) {
881                     PyMem_FREE(tok->buf);
882                     tok->buf = NULL;
883                     PyMem_FREE(newtok);
884                     tok->done = E_NOMEM;
885                     return EOF;
886                 }
887                 tok->buf = buf;
888                 tok->cur = tok->buf + oldlen;
889                 tok->line_start = tok->cur;
890                 strcpy(tok->buf + oldlen, newtok);
891                 PyMem_FREE(newtok);
892                 tok->inp = tok->buf + newlen;
893                 tok->end = tok->inp + 1;
894                 tok->start = tok->buf + start;
895             }
896             else {
897                 tok->lineno++;
898                 if (tok->buf != NULL)
899                     PyMem_FREE(tok->buf);
900                 tok->buf = newtok;
901                 tok->line_start = tok->buf;
902                 tok->cur = tok->buf;
903                 tok->line_start = tok->buf;
904                 tok->inp = strchr(tok->buf, '\0');
905                 tok->end = tok->inp + 1;
906             }
907         }
908         else {
909             int done = 0;
910             Py_ssize_t cur = 0;
911             char *pt;
912             if (tok->start == NULL) {
913                 if (tok->buf == NULL) {
914                     tok->buf = (char *)
915                         PyMem_MALLOC(BUFSIZ);
916                     if (tok->buf == NULL) {
917                         tok->done = E_NOMEM;
918                         return EOF;
919                     }
920                     tok->end = tok->buf + BUFSIZ;
921                 }
922                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
923                           tok) == NULL) {
924                     tok->done = E_EOF;
925                     done = 1;
926                 }
927                 else {
928                     tok->done = E_OK;
929                     tok->inp = strchr(tok->buf, '\0');
930                     done = tok->inp[-1] == '\n';
931                 }
932             }
933             else {
934                 cur = tok->cur - tok->buf;
935                 if (decoding_feof(tok)) {
936                     tok->done = E_EOF;
937                     done = 1;
938                 }
939                 else
940                     tok->done = E_OK;
941             }
942             tok->lineno++;
943             /* Read until '\n' or EOF */
944             while (!done) {
945                 Py_ssize_t curstart = tok->start == NULL ? -1 :
946                           tok->start - tok->buf;
947                 Py_ssize_t curvalid = tok->inp - tok->buf;
948                 Py_ssize_t newsize = curvalid + BUFSIZ;
949                 char *newbuf = tok->buf;
950                 newbuf = (char *)PyMem_REALLOC(newbuf,
951                                                newsize);
952                 if (newbuf == NULL) {
953                     tok->done = E_NOMEM;
954                     tok->cur = tok->inp;
955                     return EOF;
956                 }
957                 tok->buf = newbuf;
958                 tok->inp = tok->buf + curvalid;
959                 tok->end = tok->buf + newsize;
960                 tok->start = curstart < 0 ? NULL :
961                          tok->buf + curstart;
962                 if (decoding_fgets(tok->inp,
963                                (int)(tok->end - tok->inp),
964                                tok) == NULL) {
965                     /* Break out early on decoding
966                        errors, as tok->buf will be NULL
967                      */
968                     if (tok->decoding_erred)
969                         return EOF;
970                     /* Last line does not end in \n,
971                        fake one */
972                     strcpy(tok->inp, "\n");
973                 }
974                 tok->inp = strchr(tok->inp, '\0');
975                 done = tok->inp[-1] == '\n';
976             }
977             if (tok->buf != NULL) {
978                 tok->cur = tok->buf + cur;
979                 tok->line_start = tok->cur;
980                 /* replace "\r\n" with "\n" */
981                 /* For Mac leave the \r, giving a syntax error */
982                 pt = tok->inp - 2;
983                 if (pt >= tok->buf && *pt == '\r') {
984                     *pt++ = '\n';
985                     *pt = '\0';
986                     tok->inp = pt;
987                 }
988             }
989         }
990         if (tok->done != E_OK) {
991             if (tok->prompt != NULL)
992                 PySys_WriteStderr("\n");
993             tok->cur = tok->inp;
994             return EOF;
995         }
996     }
997     /*NOTREACHED*/
998 }
999 
1000 
1001 /* Back-up one character */
1002 
1003 static void
tok_backup(register struct tok_state * tok,register int c)1004 tok_backup(register struct tok_state *tok, register int c)
1005 {
1006     if (c != EOF) {
1007         if (--tok->cur < tok->buf)
1008             Py_FatalError("tok_backup: beginning of buffer");
1009         if (*tok->cur != c)
1010             *tok->cur = c;
1011     }
1012 }
1013 
1014 
1015 /* Return the token corresponding to a single character */
1016 
1017 int
PyToken_OneChar(int c)1018 PyToken_OneChar(int c)
1019 {
1020     switch (c) {
1021     case '(':           return LPAR;
1022     case ')':           return RPAR;
1023     case '[':           return LSQB;
1024     case ']':           return RSQB;
1025     case ':':           return COLON;
1026     case ',':           return COMMA;
1027     case ';':           return SEMI;
1028     case '+':           return PLUS;
1029     case '-':           return MINUS;
1030     case '*':           return STAR;
1031     case '/':           return SLASH;
1032     case '|':           return VBAR;
1033     case '&':           return AMPER;
1034     case '<':           return LESS;
1035     case '>':           return GREATER;
1036     case '=':           return EQUAL;
1037     case '.':           return DOT;
1038     case '%':           return PERCENT;
1039     case '`':           return BACKQUOTE;
1040     case '{':           return LBRACE;
1041     case '}':           return RBRACE;
1042     case '^':           return CIRCUMFLEX;
1043     case '~':           return TILDE;
1044     case '@':       return AT;
1045     default:            return OP;
1046     }
1047 }
1048 
1049 
1050 int
PyToken_TwoChars(int c1,int c2)1051 PyToken_TwoChars(int c1, int c2)
1052 {
1053     switch (c1) {
1054     case '=':
1055         switch (c2) {
1056         case '=':               return EQEQUAL;
1057         }
1058         break;
1059     case '!':
1060         switch (c2) {
1061         case '=':               return NOTEQUAL;
1062         }
1063         break;
1064     case '<':
1065         switch (c2) {
1066         case '>':               return NOTEQUAL;
1067         case '=':               return LESSEQUAL;
1068         case '<':               return LEFTSHIFT;
1069         }
1070         break;
1071     case '>':
1072         switch (c2) {
1073         case '=':               return GREATEREQUAL;
1074         case '>':               return RIGHTSHIFT;
1075         }
1076         break;
1077     case '+':
1078         switch (c2) {
1079         case '=':               return PLUSEQUAL;
1080         }
1081         break;
1082     case '-':
1083         switch (c2) {
1084         case '=':               return MINEQUAL;
1085         }
1086         break;
1087     case '*':
1088         switch (c2) {
1089         case '*':               return DOUBLESTAR;
1090         case '=':               return STAREQUAL;
1091         }
1092         break;
1093     case '/':
1094         switch (c2) {
1095         case '/':               return DOUBLESLASH;
1096         case '=':               return SLASHEQUAL;
1097         }
1098         break;
1099     case '|':
1100         switch (c2) {
1101         case '=':               return VBAREQUAL;
1102         }
1103         break;
1104     case '%':
1105         switch (c2) {
1106         case '=':               return PERCENTEQUAL;
1107         }
1108         break;
1109     case '&':
1110         switch (c2) {
1111         case '=':               return AMPEREQUAL;
1112         }
1113         break;
1114     case '^':
1115         switch (c2) {
1116         case '=':               return CIRCUMFLEXEQUAL;
1117         }
1118         break;
1119     }
1120     return OP;
1121 }
1122 
1123 int
PyToken_ThreeChars(int c1,int c2,int c3)1124 PyToken_ThreeChars(int c1, int c2, int c3)
1125 {
1126     switch (c1) {
1127     case '<':
1128         switch (c2) {
1129         case '<':
1130             switch (c3) {
1131             case '=':
1132                 return LEFTSHIFTEQUAL;
1133             }
1134             break;
1135         }
1136         break;
1137     case '>':
1138         switch (c2) {
1139         case '>':
1140             switch (c3) {
1141             case '=':
1142                 return RIGHTSHIFTEQUAL;
1143             }
1144             break;
1145         }
1146         break;
1147     case '*':
1148         switch (c2) {
1149         case '*':
1150             switch (c3) {
1151             case '=':
1152                 return DOUBLESTAREQUAL;
1153             }
1154             break;
1155         }
1156         break;
1157     case '/':
1158         switch (c2) {
1159         case '/':
1160             switch (c3) {
1161             case '=':
1162                 return DOUBLESLASHEQUAL;
1163             }
1164             break;
1165         }
1166         break;
1167     }
1168     return OP;
1169 }
1170 
1171 static int
indenterror(struct tok_state * tok)1172 indenterror(struct tok_state *tok)
1173 {
1174     if (tok->alterror) {
1175         tok->done = E_TABSPACE;
1176         tok->cur = tok->inp;
1177         return 1;
1178     }
1179     if (tok->altwarning) {
1180         PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1181                           "in indentation\n", tok->filename);
1182         tok->altwarning = 0;
1183     }
1184     return 0;
1185 }
1186 
1187 /* Get next token, after space stripping etc. */
1188 
1189 static int
tok_get(register struct tok_state * tok,char ** p_start,char ** p_end)1190 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1191 {
1192     register int c;
1193     int blankline;
1194 
1195     *p_start = *p_end = NULL;
1196   nextline:
1197     tok->start = NULL;
1198     blankline = 0;
1199 
1200     /* Get indentation level */
1201     if (tok->atbol) {
1202         register int col = 0;
1203         register int altcol = 0;
1204         tok->atbol = 0;
1205         for (;;) {
1206             c = tok_nextc(tok);
1207             if (c == ' ')
1208                 col++, altcol++;
1209             else if (c == '\t') {
1210                 col = (col/tok->tabsize + 1) * tok->tabsize;
1211                 altcol = (altcol/tok->alttabsize + 1)
1212                     * tok->alttabsize;
1213             }
1214             else if (c == '\014') /* Control-L (formfeed) */
1215                 col = altcol = 0; /* For Emacs users */
1216             else
1217                 break;
1218         }
1219         tok_backup(tok, c);
1220         if (c == '#' || c == '\n') {
1221             /* Lines with only whitespace and/or comments
1222                shouldn't affect the indentation and are
1223                not passed to the parser as NEWLINE tokens,
1224                except *totally* empty lines in interactive
1225                mode, which signal the end of a command group. */
1226             if (col == 0 && c == '\n' && tok->prompt != NULL)
1227                 blankline = 0; /* Let it through */
1228             else
1229                 blankline = 1; /* Ignore completely */
1230             /* We can't jump back right here since we still
1231                may need to skip to the end of a comment */
1232         }
1233         if (!blankline && tok->level == 0) {
1234             if (col == tok->indstack[tok->indent]) {
1235                 /* No change */
1236                 if (altcol != tok->altindstack[tok->indent]) {
1237                     if (indenterror(tok))
1238                         return ERRORTOKEN;
1239                 }
1240             }
1241             else if (col > tok->indstack[tok->indent]) {
1242                 /* Indent -- always one */
1243                 if (tok->indent+1 >= MAXINDENT) {
1244                     tok->done = E_TOODEEP;
1245                     tok->cur = tok->inp;
1246                     return ERRORTOKEN;
1247                 }
1248                 if (altcol <= tok->altindstack[tok->indent]) {
1249                     if (indenterror(tok))
1250                         return ERRORTOKEN;
1251                 }
1252                 tok->pendin++;
1253                 tok->indstack[++tok->indent] = col;
1254                 tok->altindstack[tok->indent] = altcol;
1255             }
1256             else /* col < tok->indstack[tok->indent] */ {
1257                 /* Dedent -- any number, must be consistent */
1258                 while (tok->indent > 0 &&
1259                     col < tok->indstack[tok->indent]) {
1260                     tok->pendin--;
1261                     tok->indent--;
1262                 }
1263                 if (col != tok->indstack[tok->indent]) {
1264                     tok->done = E_DEDENT;
1265                     tok->cur = tok->inp;
1266                     return ERRORTOKEN;
1267                 }
1268                 if (altcol != tok->altindstack[tok->indent]) {
1269                     if (indenterror(tok))
1270                         return ERRORTOKEN;
1271                 }
1272             }
1273         }
1274     }
1275 
1276     tok->start = tok->cur;
1277 
1278     /* Return pending indents/dedents */
1279     if (tok->pendin != 0) {
1280         if (tok->pendin < 0) {
1281             tok->pendin++;
1282             return DEDENT;
1283         }
1284         else {
1285             tok->pendin--;
1286             return INDENT;
1287         }
1288     }
1289 
1290  again:
1291     tok->start = NULL;
1292     /* Skip spaces */
1293     do {
1294         c = tok_nextc(tok);
1295     } while (c == ' ' || c == '\t' || c == '\014');
1296 
1297     /* Set start of current token */
1298     tok->start = tok->cur - 1;
1299 
1300     /* Skip comment, while looking for tab-setting magic */
1301     if (c == '#') {
1302         static char *tabforms[] = {
1303             "tab-width:",                       /* Emacs */
1304             ":tabstop=",                        /* vim, full form */
1305             ":ts=",                             /* vim, abbreviated form */
1306             "set tabsize=",                     /* will vi never die? */
1307         /* more templates can be added here to support other editors */
1308         };
1309         char cbuf[80];
1310         char *tp, **cp;
1311         tp = cbuf;
1312         do {
1313             *tp++ = c = tok_nextc(tok);
1314         } while (c != EOF && c != '\n' &&
1315                  (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1316         *tp = '\0';
1317         for (cp = tabforms;
1318              cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1319              cp++) {
1320             if ((tp = strstr(cbuf, *cp))) {
1321                 int newsize = atoi(tp + strlen(*cp));
1322 
1323                 if (newsize >= 1 && newsize <= 40) {
1324                     tok->tabsize = newsize;
1325                     if (Py_VerboseFlag)
1326                         PySys_WriteStderr(
1327                         "Tab size set to %d\n",
1328                         newsize);
1329                 }
1330             }
1331         }
1332         while (c != EOF && c != '\n')
1333             c = tok_nextc(tok);
1334     }
1335 
1336     /* Check for EOF and errors now */
1337     if (c == EOF) {
1338         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1339     }
1340 
1341     /* Identifier (most frequent token!) */
1342     if (Py_ISALPHA(c) || c == '_') {
1343         /* Process r"", u"" and ur"" */
1344         switch (c) {
1345         case 'b':
1346         case 'B':
1347             c = tok_nextc(tok);
1348             if (c == 'r' || c == 'R')
1349                 c = tok_nextc(tok);
1350             if (c == '"' || c == '\'')
1351                 goto letter_quote;
1352             break;
1353         case 'r':
1354         case 'R':
1355             c = tok_nextc(tok);
1356             if (c == '"' || c == '\'')
1357                 goto letter_quote;
1358             break;
1359         case 'u':
1360         case 'U':
1361             c = tok_nextc(tok);
1362             if (c == 'r' || c == 'R')
1363                 c = tok_nextc(tok);
1364             if (c == '"' || c == '\'')
1365                 goto letter_quote;
1366             break;
1367         }
1368         while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
1369             c = tok_nextc(tok);
1370         }
1371         tok_backup(tok, c);
1372         *p_start = tok->start;
1373         *p_end = tok->cur;
1374         return NAME;
1375     }
1376 
1377     /* Newline */
1378     if (c == '\n') {
1379         tok->atbol = 1;
1380         if (blankline || tok->level > 0)
1381             goto nextline;
1382         *p_start = tok->start;
1383         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1384         tok->cont_line = 0;
1385         return NEWLINE;
1386     }
1387 
1388     /* Period or number starting with period? */
1389     if (c == '.') {
1390         c = tok_nextc(tok);
1391         if (isdigit(c)) {
1392             goto fraction;
1393         }
1394         else {
1395             tok_backup(tok, c);
1396             *p_start = tok->start;
1397             *p_end = tok->cur;
1398             return DOT;
1399         }
1400     }
1401 
1402     /* Number */
1403     if (isdigit(c)) {
1404         if (c == '0') {
1405             /* Hex, octal or binary -- maybe. */
1406             c = tok_nextc(tok);
1407             if (c == '.')
1408                 goto fraction;
1409 #ifndef WITHOUT_COMPLEX
1410             if (c == 'j' || c == 'J')
1411                 goto imaginary;
1412 #endif
1413             if (c == 'x' || c == 'X') {
1414 
1415                 /* Hex */
1416                 c = tok_nextc(tok);
1417                 if (!isxdigit(c)) {
1418                     tok->done = E_TOKEN;
1419                     tok_backup(tok, c);
1420                     return ERRORTOKEN;
1421                 }
1422                 do {
1423                     c = tok_nextc(tok);
1424                 } while (isxdigit(c));
1425             }
1426             else if (c == 'o' || c == 'O') {
1427                 /* Octal */
1428                 c = tok_nextc(tok);
1429                 if (c < '0' || c >= '8') {
1430                     tok->done = E_TOKEN;
1431                     tok_backup(tok, c);
1432                     return ERRORTOKEN;
1433                 }
1434                 do {
1435                     c = tok_nextc(tok);
1436                 } while ('0' <= c && c < '8');
1437             }
1438             else if (c == 'b' || c == 'B') {
1439                 /* Binary */
1440                 c = tok_nextc(tok);
1441                 if (c != '0' && c != '1') {
1442                     tok->done = E_TOKEN;
1443                     tok_backup(tok, c);
1444                     return ERRORTOKEN;
1445                 }
1446                 do {
1447                     c = tok_nextc(tok);
1448                 } while (c == '0' || c == '1');
1449             }
1450             else {
1451                 int found_decimal = 0;
1452                 /* Octal; c is first char of it */
1453                 /* There's no 'isoctdigit' macro, sigh */
1454                 while ('0' <= c && c < '8') {
1455                     c = tok_nextc(tok);
1456                 }
1457                 if (isdigit(c)) {
1458                     found_decimal = 1;
1459                     do {
1460                         c = tok_nextc(tok);
1461                     } while (isdigit(c));
1462                 }
1463                 if (c == '.')
1464                     goto fraction;
1465                 else if (c == 'e' || c == 'E')
1466                     goto exponent;
1467 #ifndef WITHOUT_COMPLEX
1468                 else if (c == 'j' || c == 'J')
1469                     goto imaginary;
1470 #endif
1471                 else if (found_decimal) {
1472                     tok->done = E_TOKEN;
1473                     tok_backup(tok, c);
1474                     return ERRORTOKEN;
1475                 }
1476             }
1477             if (c == 'l' || c == 'L')
1478                 c = tok_nextc(tok);
1479         }
1480         else {
1481             /* Decimal */
1482             do {
1483                 c = tok_nextc(tok);
1484             } while (isdigit(c));
1485             if (c == 'l' || c == 'L')
1486                 c = tok_nextc(tok);
1487             else {
1488                 /* Accept floating point numbers. */
1489                 if (c == '.') {
1490         fraction:
1491                     /* Fraction */
1492                     do {
1493                         c = tok_nextc(tok);
1494                     } while (isdigit(c));
1495                 }
1496                 if (c == 'e' || c == 'E') {
1497         exponent:
1498                     /* Exponent part */
1499                     c = tok_nextc(tok);
1500                     if (c == '+' || c == '-')
1501                         c = tok_nextc(tok);
1502                     if (!isdigit(c)) {
1503                         tok->done = E_TOKEN;
1504                         tok_backup(tok, c);
1505                         return ERRORTOKEN;
1506                     }
1507                     do {
1508                         c = tok_nextc(tok);
1509                     } while (isdigit(c));
1510                 }
1511 #ifndef WITHOUT_COMPLEX
1512                 if (c == 'j' || c == 'J')
1513                     /* Imaginary part */
1514         imaginary:
1515                     c = tok_nextc(tok);
1516 #endif
1517             }
1518         }
1519         tok_backup(tok, c);
1520         *p_start = tok->start;
1521         *p_end = tok->cur;
1522         return NUMBER;
1523     }
1524 
1525   letter_quote:
1526     /* String */
1527     if (c == '\'' || c == '"') {
1528         Py_ssize_t quote2 = tok->cur - tok->start + 1;
1529         int quote = c;
1530         int triple = 0;
1531         int tripcount = 0;
1532         for (;;) {
1533             c = tok_nextc(tok);
1534             if (c == '\n') {
1535                 if (!triple) {
1536                     tok->done = E_EOLS;
1537                     tok_backup(tok, c);
1538                     return ERRORTOKEN;
1539                 }
1540                 tripcount = 0;
1541                 tok->cont_line = 1; /* multiline string. */
1542             }
1543             else if (c == EOF) {
1544                 if (triple)
1545                     tok->done = E_EOFS;
1546                 else
1547                     tok->done = E_EOLS;
1548                 tok->cur = tok->inp;
1549                 return ERRORTOKEN;
1550             }
1551             else if (c == quote) {
1552                 tripcount++;
1553                 if (tok->cur - tok->start == quote2) {
1554                     c = tok_nextc(tok);
1555                     if (c == quote) {
1556                         triple = 1;
1557                         tripcount = 0;
1558                         continue;
1559                     }
1560                     tok_backup(tok, c);
1561                 }
1562                 if (!triple || tripcount == 3)
1563                     break;
1564             }
1565             else if (c == '\\') {
1566                 tripcount = 0;
1567                 c = tok_nextc(tok);
1568                 if (c == EOF) {
1569                     tok->done = E_EOLS;
1570                     tok->cur = tok->inp;
1571                     return ERRORTOKEN;
1572                 }
1573             }
1574             else
1575                 tripcount = 0;
1576         }
1577         *p_start = tok->start;
1578         *p_end = tok->cur;
1579         return STRING;
1580     }
1581 
1582     /* Line continuation */
1583     if (c == '\\') {
1584         c = tok_nextc(tok);
1585         if (c != '\n') {
1586             tok->done = E_LINECONT;
1587             tok->cur = tok->inp;
1588             return ERRORTOKEN;
1589         }
1590         tok->cont_line = 1;
1591         goto again; /* Read next line */
1592     }
1593 
1594     /* Check for two-character token */
1595     {
1596         int c2 = tok_nextc(tok);
1597         int token = PyToken_TwoChars(c, c2);
1598 #ifndef PGEN
1599         if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1600             if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1601                                    "<> not supported in 3.x; use !=",
1602                                    tok->filename, tok->lineno,
1603                                    NULL, NULL)) {
1604                 return ERRORTOKEN;
1605             }
1606         }
1607 #endif
1608         if (token != OP) {
1609             int c3 = tok_nextc(tok);
1610             int token3 = PyToken_ThreeChars(c, c2, c3);
1611             if (token3 != OP) {
1612                 token = token3;
1613             } else {
1614                 tok_backup(tok, c3);
1615             }
1616             *p_start = tok->start;
1617             *p_end = tok->cur;
1618             return token;
1619         }
1620         tok_backup(tok, c2);
1621     }
1622 
1623     /* Keep track of parentheses nesting level */
1624     switch (c) {
1625     case '(':
1626     case '[':
1627     case '{':
1628         tok->level++;
1629         break;
1630     case ')':
1631     case ']':
1632     case '}':
1633         tok->level--;
1634         break;
1635     }
1636 
1637     /* Punctuation character */
1638     *p_start = tok->start;
1639     *p_end = tok->cur;
1640     return PyToken_OneChar(c);
1641 }
1642 
1643 int
PyTokenizer_Get(struct tok_state * tok,char ** p_start,char ** p_end)1644 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1645 {
1646     int result = tok_get(tok, p_start, p_end);
1647     if (tok->decoding_erred) {
1648         result = ERRORTOKEN;
1649         tok->done = E_DECODE;
1650     }
1651     return result;
1652 }
1653 
1654 /* This function is only called from parsetok. However, it cannot live
1655    there, as it must be empty for PGEN, and we can check for PGEN only
1656    in this file. */
1657 
1658 #if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Variant compiled for PGEN or when Unicode support is disabled:
   performs no conversion, leaves *offset untouched and always
   returns NULL. */
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    /* All arguments are intentionally ignored. */
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1664 #else
1665 #ifdef Py_USING_UNICODE
1666 static PyObject *
dec_utf8(const char * enc,const char * text,size_t len)1667 dec_utf8(const char *enc, const char *text, size_t len) {
1668     PyObject *ret = NULL;
1669     PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1670     if (unicode_text) {
1671         ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1672         Py_DECREF(unicode_text);
1673     }
1674     if (!ret) {
1675         PyErr_Clear();
1676     }
1677     return ret;
1678 }
1679 char *
PyTokenizer_RestoreEncoding(struct tok_state * tok,int len,int * offset)1680 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1681 {
1682     char *text = NULL;
1683     if (tok->encoding) {
1684         /* convert source to original encondig */
1685         PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1686         if (lineobj != NULL) {
1687             int linelen = PyString_Size(lineobj);
1688             const char *line = PyString_AsString(lineobj);
1689             text = PyObject_MALLOC(linelen + 1);
1690             if (text != NULL && line != NULL) {
1691                 if (linelen)
1692                     strncpy(text, line, linelen);
1693                 text[linelen] = '\0';
1694             }
1695             Py_DECREF(lineobj);
1696 
1697             /* adjust error offset */
1698             if (*offset > 1) {
1699                 PyObject *offsetobj = dec_utf8(tok->encoding,
1700                                                tok->buf, *offset-1);
1701                 if (offsetobj) {
1702                     *offset = PyString_Size(offsetobj) + 1;
1703                     Py_DECREF(offsetobj);
1704                 }
1705             }
1706 
1707         }
1708     }
1709     return text;
1710 
1711 }
1712 #endif /* defined(Py_USING_UNICODE) */
1713 #endif
1714 
1715 
1716 #ifdef Py_DEBUG
1717 
1718 void
tok_dump(int type,char * start,char * end)1719 tok_dump(int type, char *start, char *end)
1720 {
1721     printf("%s", _PyParser_TokenNames[type]);
1722     if (type == NAME || type == NUMBER || type == STRING || type == OP)
1723         printf("(%.*s)", (int)(end - start), start);
1724 }
1725 
1726 #endif
1727