/* Tokenizer implementation */

#include "Python.h"
#include "pgenheaders.h"

#include <ctype.h>
#include <assert.h>

#include "tokenizer.h"
#include "errcode.h"

#ifndef PGEN
#include "unicodeobject.h"
#include "stringobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
#include "pydebug.h"
#endif /* PGEN */

extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);

/* Token names */

char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};

/* Create and initialize a new tok_state structure */

static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;
    tok->filename = NULL;
    tok->altwarning = 0;
    tok->alterror = 0;
    tok->alttabsize = 1;
    tok->altindstack[0] = 0;
    tok->decoding_state = 0;
    tok->decoding_erred = 0;
    tok->read_coding_spec = 0;
    tok->encoding = NULL;
    tok->cont_line = 0;
#ifndef PGEN
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
#endif
    return tok;
}

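/* Allocate a NUL-terminated copy of the first LEN bytes of S.  Returns
   NULL if the allocation fails; the caller owns the result and must
   release it with PyMem_FREE. */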
static char *
new_string(const char *s, Py_ssize_t len)
{
    char* result = (char *)PyMem_MALLOC(len + 1);
    if (result != NULL) {
        memcpy(result, s, len);
        result[len] = '\0';
    }
    return result;
}

#ifdef PGEN

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}

static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}

static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str));
}

#else /* PGEN */

static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = NULL;
    return NULL;                /* as if it were EOF */
}


static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
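
/* For example, get_normal_name("UTF_8") and get_normal_name("utf-8-sig")
   both map to "utf-8", while an unrecognized name such as "cp1252" is
   returned unchanged. */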

/* Return the coding spec in S, or NULL if none is found. */

static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
    Py_ssize_t i;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (strncmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            do {
                t++;
            } while (t[0] == '\x20' || t[0] == '\t');

            begin = t;
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin);
                char* q = get_normal_name(r);
                if (r != q) {
                    PyMem_FREE(r);
                    r = new_string(q, strlen(q));
                }
                return r;
            }
        }
    }
    return NULL;
}
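
/* For example, on the line "# -*- coding: latin-1 -*-\n" this returns the
   normalized spec "iso-8859-1"; for a line that begins with code rather
   than whitespace or '#', it returns NULL. */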

/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line)
        /* It's a continuation line, so it can't be a coding spec. */
        return 1;
    cs = get_coding_spec(line, size);
    if (cs != NULL) {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == 1); /* raw */
            if (strcmp(cs, "utf-8") == 0 ||
                strcmp(cs, "iso-8859-1") == 0) {
                tok->encoding = cs;
            } else {
#ifdef Py_USING_UNICODE
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;
                    tok->decoding_state = -1;
                }
                else
                    PyMem_FREE(cs);
#else
                /* Without Unicode support, we cannot
                   process the coding spec. Since there
                   won't be any Unicode literals, that
                   won't matter. */
                PyMem_FREE(cs);
#endif
            }
        } else {                /* then, compare cs with BOM */
            r = (strcmp(tok->encoding, cs) == 0);
            PyMem_FREE(cs);
        }
    }
    if (!r) {
        cs = tok->encoding;
        if (!cs)
            cs = "with BOM";
        PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
    }
    return r;
}

/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = 1;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported. */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = -1;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = -1;
#endif
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5);     /* the result is in utf-8 */
    return 1;
}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline. tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}

/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *reader, *stream, *readline;

    /* XXX: constify filename argument. */
    stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
    if (stream == NULL)
        return 0;

    reader = PyCodec_StreamReader(enc, stream, NULL);
    Py_DECREF(stream);
    if (reader == NULL)
        return 0;

    readline = PyObject_GetAttrString(reader, "readline");
    Py_DECREF(reader);
    if (readline == NULL)
        return 0;

    tok->decoding_readline = readline;
    return 1;
}

/* Fetch the next byte from TOK. */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}

/* Unfetch the last byte back into TOK. */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}

/* Read a line of input from TOK. Determine encoding
   if necessary. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state < 0) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state > 0) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != 0);
        }
    }
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is ASCII, so make sure we don't have any
       non-ASCII bytes in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        for (c = (unsigned char *)line; *c; c++)
            if (*c > 127) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        sprintf(buf,
                "Non-ASCII character '\\x%.2x' "
                "in file %.200s on line %i, "
                "but no encoding declared; "
                "see http://www.python.org/peps/pep-0263.html for details",
                badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}

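/* Return non-zero if TOK's input is exhausted: for raw reads, when the
   underlying FILE* reports end-of-file; for decoded reads, when the
   readline callable returns an empty line (or reading ahead fails). */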
static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state >= 0) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}

/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}

/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
#endif

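/* Copy S into a new buffer, translating "\r\n" pairs and bare "\r"
   line endings into "\n".  If EXEC_INPUT is true, a trailing "\n" is
   appended when the input does not already end with one.  On
   allocation failure, tok->done is set to E_NOMEM and NULL is
   returned. */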
static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
    int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
    char *buf, *current;
    char c = '\0';
    buf = PyMem_MALLOC(needed_length);
    if (buf == NULL) {
        tok->done = E_NOMEM;
        return NULL;
    }
    for (current = buf; *s; s++, current++) {
        c = *s;
        if (skip_next_lf) {
            skip_next_lf = 0;
            if (c == '\n') {
                c = *++s;
                if (!c)
                    break;
            }
        }
        if (c == '\r') {
            skip_next_lf = 1;
            c = '\n';
        }
        *current = c;
    }
    /* If this is exec input, add a newline to the end of the string if
       there isn't one already. */
    if (exec_input && c != '\n') {
        *current = '\n';
        current++;
    }
    *current = '\0';
    final_length = current - buf + 1;
    if (final_length < needed_length && final_length)
        /* should never fail */
        buf = PyMem_REALLOC(buf, final_length);
    return buf;
}

/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK. */

static const char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}

#endif /* PGEN */

/* Set up tokenizer for string */

struct tok_state *
PyTokenizer_FromString(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    str = (char *)decode_str(str, exec_input, tok);
    if (str == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}


/* Set up tokenizer for file */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    return tok;
}


/* Free a tok_state structure */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    if (tok->input)
        PyMem_FREE((char *)tok->input);
    PyMem_FREE(tok);
}
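
/* A minimal usage sketch (not part of this module), assuming a
   NUL-terminated source string `source` to be tokenized as exec input;
   PyTokenizer_Get is defined further below:

       struct tok_state *tok = PyTokenizer_FromString(source, 1);
       if (tok != NULL) {
           char *start, *end;
           int type;
           do {
               type = PyTokenizer_Get(tok, &start, &end);
               printf("%s\n", _PyParser_TokenNames[type]);
           } while (type != ENDMARKER && type != ERRORTOKEN);
           PyTokenizer_Free(tok);
       }
*/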

#if !defined(PGEN) && defined(Py_USING_UNICODE)
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    Py_DECREF(enc);
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        tok->done = E_ERROR;
        return -1;
    }
    /* Fallback to iso-8859-1: for backward compatibility */
    PyErr_Clear();
    return 0;
}
#endif

/* Get next char, updating state; error code goes into tok->done */

static int
tok_nextc(register struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (newtok == NULL)
                tok->done = E_INTR;
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
#if !defined(PGEN) && defined(Py_USING_UNICODE)
            else if (tok_stdin_decode(tok, &newtok) != 0)
                PyMem_FREE(newtok);
#endif
            else if (tok->start != NULL) {
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->line_start = tok->buf;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                                   tok) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp[-1] == '\n';
                }
            }
            else {
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                    tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                    tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                                   (int)(tok->end - tok->inp),
                                   tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                    */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}


/* Back-up one character */

static void
tok_backup(register struct tok_state *tok, register int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: beginning of buffer");
        if (*tok->cur != c)
            *tok->cur = c;
    }
}


/* Return the token corresponding to a single character */

int
PyToken_OneChar(int c)
{
    switch (c) {
    case '(': return LPAR;
    case ')': return RPAR;
    case '[': return LSQB;
    case ']': return RSQB;
    case ':': return COLON;
    case ',': return COMMA;
    case ';': return SEMI;
    case '+': return PLUS;
    case '-': return MINUS;
    case '*': return STAR;
    case '/': return SLASH;
    case '|': return VBAR;
    case '&': return AMPER;
    case '<': return LESS;
    case '>': return GREATER;
    case '=': return EQUAL;
    case '.': return DOT;
    case '%': return PERCENT;
    case '`': return BACKQUOTE;
    case '{': return LBRACE;
    case '}': return RBRACE;
    case '^': return CIRCUMFLEX;
    case '~': return TILDE;
    case '@': return AT;
    default: return OP;
    }
}

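/* Return the token corresponding to a two-character operator,
   or OP if the pair does not form one. */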
int
PyToken_TwoChars(int c1, int c2)
{
    switch (c1) {
    case '=':
        switch (c2) {
        case '=': return EQEQUAL;
        }
        break;
    case '!':
        switch (c2) {
        case '=': return NOTEQUAL;
        }
        break;
    case '<':
        switch (c2) {
        case '>': return NOTEQUAL;
        case '=': return LESSEQUAL;
        case '<': return LEFTSHIFT;
        }
        break;
    case '>':
        switch (c2) {
        case '=': return GREATEREQUAL;
        case '>': return RIGHTSHIFT;
        }
        break;
    case '+':
        switch (c2) {
        case '=': return PLUSEQUAL;
        }
        break;
    case '-':
        switch (c2) {
        case '=': return MINEQUAL;
        }
        break;
    case '*':
        switch (c2) {
        case '*': return DOUBLESTAR;
        case '=': return STAREQUAL;
        }
        break;
    case '/':
        switch (c2) {
        case '/': return DOUBLESLASH;
        case '=': return SLASHEQUAL;
        }
        break;
    case '|':
        switch (c2) {
        case '=': return VBAREQUAL;
        }
        break;
    case '%':
        switch (c2) {
        case '=': return PERCENTEQUAL;
        }
        break;
    case '&':
        switch (c2) {
        case '=': return AMPEREQUAL;
        }
        break;
    case '^':
        switch (c2) {
        case '=': return CIRCUMFLEXEQUAL;
        }
        break;
    }
    return OP;
}

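/* Return the token corresponding to a three-character operator,
   or OP if the triple does not form one. */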
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
    switch (c1) {
    case '<':
        switch (c2) {
        case '<':
            switch (c3) {
            case '=':
                return LEFTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '>':
        switch (c2) {
        case '>':
            switch (c3) {
            case '=':
                return RIGHTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '*':
        switch (c2) {
        case '*':
            switch (c3) {
            case '=':
                return DOUBLESTAREQUAL;
            }
            break;
        }
        break;
    case '/':
        switch (c2) {
        case '/':
            switch (c3) {
            case '=':
                return DOUBLESLASHEQUAL;
            }
            break;
        }
        break;
    }
    return OP;
}

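/* Report inconsistent use of tabs and spaces found by the alternate
   tab-size check.  Return 1 (after setting tok->done to E_TABSPACE)
   if this is treated as an error; otherwise print a one-time warning
   when enabled and return 0. */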
static int
indenterror(struct tok_state *tok)
{
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;
        return 1;
    }
    if (tok->altwarning) {
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
        tok->altwarning = 0;
    }
    return 0;
}

/* Get next token, after space stripping etc. */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        register int col = 0;
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                       col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",       /* Emacs */
            ":tabstop=",        /* vim, full form */
            ":ts=",             /* vim, abbreviated form */
            "set tabsize=",     /* will vi never die? */
            /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                            "Tab size set to %d\n",
                            newsize);
                }
            }
        }
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (Py_ISALPHA(c) || c == '_') {
        /* Process b"", r"", u"" and the combined prefixes br"" and ur"" */
        switch (c) {
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {

                /* Hex */
                c = tok_nextc(tok);
                if (!isxdigit(c)) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                if (c < '0' || c >= '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
          fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
          exponent:
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    if (!isdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
          imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        int tripcount = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
#ifndef PGEN
        if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
            if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
                                   "<> not supported in 3.x; use !=",
                                   tok->filename, tok->lineno,
                                   NULL, NULL)) {
                return ERRORTOKEN;
            }
        }
#endif
        if (token != OP) {
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}

int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
    int result = tok_get(tok, p_start, p_end);
    if (tok->decoding_erred) {
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}

/* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */

#if defined(PGEN) || !defined(Py_USING_UNICODE)
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    return NULL;
}
#else
#ifdef Py_USING_UNICODE
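/* Re-encode LEN bytes of UTF-8 TEXT into the encoding ENC, returning a
   new string object, or NULL (with the error cleared) on failure. */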
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
    PyObject *ret = NULL;
    PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
    if (unicode_text) {
        ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
        Py_DECREF(unicode_text);
    }
    if (!ret) {
        PyErr_Clear();
    }
    return ret;
}
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
    char *text = NULL;
    if (tok->encoding) {
        /* convert source to original encoding */
        PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
        if (lineobj != NULL) {
            int linelen = PyString_Size(lineobj);
            const char *line = PyString_AsString(lineobj);
            text = PyObject_MALLOC(linelen + 1);
            if (text != NULL && line != NULL) {
                if (linelen)
                    strncpy(text, line, linelen);
                text[linelen] = '\0';
            }
            Py_DECREF(lineobj);

            /* adjust error offset */
            if (*offset > 1) {
                PyObject *offsetobj = dec_utf8(tok->encoding,
                                               tok->buf, *offset-1);
                if (offsetobj) {
                    *offset = PyString_Size(offsetobj) + 1;
                    Py_DECREF(offsetobj);
                }
            }

        }
    }
    return text;
}
#endif /* defined(Py_USING_UNICODE) */
#endif


#ifdef Py_DEBUG

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif