1
2 /* Tokenizer implementation */
3
4 #include "Python.h"
5 #include "pgenheaders.h"
6
7 #include <ctype.h>
8 #include <assert.h>
9
10 #include "tokenizer.h"
11 #include "errcode.h"
12
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
21
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
29
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
34
35 /* Token names */
36
/* Human-readable token names, indexed by token type code.
   The order of entries is part of the parser's ABI: entry i names the
   token whose numeric code is i.  Do not reorder or insert entries
   without making the matching change in token.h. */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95 /* Create and initialize a new tok_state structure */
96
97 static struct tok_state *
tok_new(void)98 tok_new(void)
99 {
100 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101 sizeof(struct tok_state));
102 if (tok == NULL)
103 return NULL;
104 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105 tok->done = E_OK;
106 tok->fp = NULL;
107 tok->input = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
126 #ifndef PGEN
127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
129 #endif
130 return tok;
131 }
132
133 static char *
new_string(const char * s,Py_ssize_t len)134 new_string(const char *s, Py_ssize_t len)
135 {
136 char* result = (char *)PyMem_MALLOC(len + 1);
137 if (result != NULL) {
138 memcpy(result, s, len);
139 result[len] = '\0';
140 }
141 return result;
142 }
143
144 #ifdef PGEN
145
/* pgen build: read one raw line from the tokenizer's FILE* into S.
   No encoding detection is performed in the parser generator.
   Returns S, or NULL on EOF/error (fgets semantics). */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}
151
/* pgen build: end-of-file test is just feof() on the raw stream. */
static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}
157
/* pgen build: no decoding -- return a malloc'ed copy of STR as-is.
   The exec_input and tok arguments are intentionally unused here;
   they exist to keep the signature in sync with the non-PGEN build. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str));
}
163
164 #else /* PGEN */
165
/* Record that decoding the input failed.  Frees the line buffer only
   when the tokenizer owns it (file input -- the same ownership rule
   as PyTokenizer_Free), resets all buffer pointers, and sets
   tok->done = E_DECODE.  Always returns NULL so callers can write
   `return error_ret(tok);` wherever NULL means EOF/error. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_DECODE;
    return NULL; /* as if it were EOF */
}
176
177
/* Normalize an encoding name: lowercase it, map '_' to '-', and fold
   the common aliases of UTF-8 and Latin-1 onto the canonical names
   "utf-8" and "iso-8859-1".  Only the first 12 characters of S are
   examined (enough to distinguish the aliases).  If S is neither
   family, S itself is returned unchanged; otherwise a static string
   is returned -- the caller must not free the result blindly. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char lowered[13];
    int n = 0;

    while (n < 12 && s[n] != '\0') {
        lowered[n] = (s[n] == '_') ? '-' : tolower(s[n]);
        n++;
    }
    lowered[n] = '\0';

    if (strcmp(lowered, "utf-8") == 0 ||
        strncmp(lowered, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(lowered, "latin-1") == 0 ||
        strcmp(lowered, "iso-8859-1") == 0 ||
        strcmp(lowered, "iso-latin-1") == 0 ||
        strncmp(lowered, "latin-1-", 8) == 0 ||
        strncmp(lowered, "iso-8859-1-", 11) == 0 ||
        strncmp(lowered, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
206
207 /* Return the coding spec in S, or NULL if none is found. */
208
/* Return the coding spec in S, or NULL if none is found.

   S is a single source line of SIZE bytes.  Per PEP 263 the spec must
   be inside a comment that is the only thing on the line, e.g.
       # -*- coding: utf-8 -*-
   The returned string is malloc'ed (via new_string) and normalized
   through get_normal_name; the caller owns it and frees it with
   PyMem_FREE. */
static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
    Py_ssize_t i;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) { /* -6: "coding" needs 6 bytes of room */
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL; /* real code precedes the comment -> no spec */
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (strncmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            /* "coding" must be followed by ':' or '=' ... */
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* ... then optional spaces/tabs ... */
            do {
                t++;
            } while (t[0] == '\x20' || t[0] == '\t');

            /* ... then the encoding name itself: [A-Za-z0-9._-]+ */
            begin = t;
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin);
                char* q;
                if (!r)
                    return NULL;
                q = get_normal_name(r);
                if (r != q) {
                    /* Normalization yielded a different (static) name:
                       replace the malloc'ed copy with a copy of it. */
                    PyMem_FREE(r);
                    r = new_string(q, strlen(q));
                }
                return r;
            }
        }
    }
    return NULL;
}
253
254 /* Check whether the line contains a coding spec. If it does,
255 invoke the set_readline function for the new encoding.
256 This function receives the tok_state and the new encoding.
257 Return 1 on success, 0 on failure. */
258
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.

   Ownership: on success the malloc'ed spec CS may be stored into
   tok->encoding (then owned by the tokenizer); on every other path
   it is freed here. */
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->read_coding_spec = 1;
        return 1;
    }
    cs = get_coding_spec(line, size);
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->read_coding_spec = 1;
                break;
            }
        }
    } else {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == 1); /* raw */
            if (strcmp(cs, "utf-8") == 0 ||
                strcmp(cs, "iso-8859-1") == 0) {
                /* Input is already byte-compatible: no codec needed. */
                tok->encoding = cs;
            } else {
#ifdef Py_USING_UNICODE
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;
                    tok->decoding_state = -1; /* codec-driven reads now */
                }
                else {
                    PyErr_Format(PyExc_SyntaxError,
                                 "encoding problem: %s", cs);
                    PyMem_FREE(cs);
                }
#else
                /* Without Unicode support, we cannot
                   process the coding spec. Since there
                   won't be any Unicode literals, that
                   won't matter. */
                PyMem_FREE(cs);
#endif
            }
        } else {        /* then, compare cs with BOM */
            /* A BOM already fixed the encoding; the spec must agree. */
            r = (strcmp(tok->encoding, cs) == 0);
            if (!r)
                PyErr_Format(PyExc_SyntaxError,
                             "encoding problem: %s with BOM", cs);
            PyMem_FREE(cs);
        }
    }
    return r;
}
321
322 /* See whether the file starts with a BOM. If it does,
323 invoke the set_readline function with the new encoding.
324 Return 1 on success, 0 on failure. */
325
326 static int
check_bom(int get_char (struct tok_state *),void unget_char (int,struct tok_state *),int set_readline (struct tok_state *,const char *),struct tok_state * tok)327 check_bom(int get_char(struct tok_state *),
328 void unget_char(int, struct tok_state *),
329 int set_readline(struct tok_state *, const char *),
330 struct tok_state *tok)
331 {
332 int ch1, ch2, ch3;
333 ch1 = get_char(tok);
334 tok->decoding_state = 1;
335 if (ch1 == EOF) {
336 return 1;
337 } else if (ch1 == 0xEF) {
338 ch2 = get_char(tok);
339 if (ch2 != 0xBB) {
340 unget_char(ch2, tok);
341 unget_char(ch1, tok);
342 return 1;
343 }
344 ch3 = get_char(tok);
345 if (ch3 != 0xBF) {
346 unget_char(ch3, tok);
347 unget_char(ch2, tok);
348 unget_char(ch1, tok);
349 return 1;
350 }
351 #if 0
352 /* Disable support for UTF-16 BOMs until a decision
353 is made whether this needs to be supported. */
354 } else if (ch1 == 0xFE) {
355 ch2 = get_char(tok);
356 if (ch2 != 0xFF) {
357 unget_char(ch2, tok);
358 unget_char(ch1, tok);
359 return 1;
360 }
361 if (!set_readline(tok, "utf-16-be"))
362 return 0;
363 tok->decoding_state = -1;
364 } else if (ch1 == 0xFF) {
365 ch2 = get_char(tok);
366 if (ch2 != 0xFE) {
367 unget_char(ch2, tok);
368 unget_char(ch1, tok);
369 return 1;
370 }
371 if (!set_readline(tok, "utf-16-le"))
372 return 0;
373 tok->decoding_state = -1;
374 #endif
375 } else {
376 unget_char(ch1, tok);
377 return 1;
378 }
379 if (tok->encoding != NULL)
380 PyMem_FREE(tok->encoding);
381 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
382 return 1;
383 }
384
385 /* Read a line of text from TOK into S, using the stream in TOK.
386 Return NULL on failure, else S.
387
388 On entry, tok->decoding_buffer will be one of:
389 1) NULL: need to call tok->decoding_readline to get a new line
390 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
391 stored the result in tok->decoding_buffer
392 3) PyStringObject *: previous call to fp_readl did not have enough room
393 (in the s buffer) to copy entire contents of the line read
394 by tok->decoding_readline. tok->decoding_buffer has the overflow.
395 In this case, fp_readl is called in a loop (with an expanded buffer)
396 until the buffer ends with a '\n' (or until the end of the file is
397 reached): see tok_nextc and its calls to decoding_fgets.
398 */
399
/* Read one decoded line into S (at most SIZE-1 bytes plus a NUL),
   using tok->decoding_readline, and re-encode it to UTF-8.
   Returns S, NULL at EOF, or error_ret(tok) on failure.
   If the decoded line does not fit, the overflow is parked in
   tok->decoding_buffer as a str object and returned by the next call
   (see the state description in the comment above this function). */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        /* No pending data: pull a fresh line from the codec. */
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(buf)) {
            Py_DECREF(buf);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    } else {
        /* Consume the buffered object; if it is already a str it is
           leftover UTF-8 from a previous overflow (case 3 above). */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Line longer than the caller's buffer: stash the tail for
           the next call and hand back only what fits. */
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}
456
457 /* Set the readline function for TOK to a StreamReader's
458 readline function. The StreamReader is named ENC.
459
460 This function is called from check_bom and check_coding_spec.
461
462 ENC is usually identical to the future value of tok->encoding,
463 except for the (currently unsupported) case of UTF-16.
464
465 Return 1 on success, 0 on failure. */
466
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure (a Python error is then set).
   On success tok->decoding_readline owns a new reference to the
   bound readline method. */
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *reader, *stream, *readline;

    /* XXX: constify filename argument. */
    stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
    if (stream == NULL)
        return 0;

    reader = PyCodec_StreamReader(enc, stream, NULL);
    Py_DECREF(stream); /* the reader keeps its own reference */
    if (reader == NULL)
        return 0;

    readline = PyObject_GetAttrString(reader, "readline");
    Py_DECREF(reader); /* the bound method keeps the reader alive */
    if (readline == NULL)
        return 0;

    tok->decoding_readline = readline;
    return 1;
}
490
491 /* Fetch the next byte from TOK. */
492
/* Fetch the next raw byte from TOK's stream (getc semantics;
   returns EOF at end of file). */
static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}
496
497 /* Unfetch the last byte back into TOK. */
498
/* Push byte C back onto TOK's stream (ungetc semantics; only one
   byte of pushback is guaranteed by the C library). */
static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
502
503 /* Read a line of input from TOK. Determine encoding
504 if necessary. */
505
/* Read a line of input from TOK into S (at most SIZE bytes).
   On the first call(s) this also drives encoding detection:
   decoding_state 0 = undetermined (check for a BOM first),
   >0 = raw reads, <0 = reads go through the attached codec.
   Returns S, or NULL on EOF/error (error recorded via error_ret). */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state < 0) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state > 0) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != 0);
            /* loop again: decoding_state is now decided */
        }
    }
    /* PEP 263: only the first two lines may carry a coding spec. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is ASCII, so make sure we don't have any
       non-ASCII bytes in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        for (c = (unsigned char *)line; *c; c++)
            if (*c > 127) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        sprintf(buf,
                "Non-ASCII character '\\x%.2x' "
                "in file %.200s on line %i, "
                "but no encoding declared; "
                "see http://python.org/dev/peps/pep-0263/ for details",
                badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}
563
564 static int
decoding_feof(struct tok_state * tok)565 decoding_feof(struct tok_state *tok)
566 {
567 if (tok->decoding_state >= 0) {
568 return feof(tok->fp);
569 } else {
570 PyObject* buf = tok->decoding_buffer;
571 if (buf == NULL) {
572 buf = PyObject_CallObject(tok->decoding_readline, NULL);
573 if (buf == NULL) {
574 error_ret(tok);
575 return 1;
576 } else {
577 tok->decoding_buffer = buf;
578 }
579 }
580 return PyObject_Length(buf) == 0;
581 }
582 }
583
584 /* Fetch a byte from TOK, using the string buffer. */
585
/* Fetch the next byte from TOK's in-memory string buffer,
   advancing tok->str. */
static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}
590
591 /* Unfetch a byte from TOK, using the string buffer. */
592
/* Push byte C back onto TOK's string buffer by rewinding tok->str.
   The buffer may live in a read-only segment, so we only assert that
   the byte matches instead of writing it back. */
static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}
598
599 /* Set the readline function for TOK to ENC. For the string-based
600 tokenizer, this means to just record the encoding. */
601
/* String-based tokenizer's "set readline": just record the encoding
   name; decode_str performs the actual conversion later.  ENC must
   outlive the tokenizer (it is stored without copying).  Always 1. */
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
607
608 /* Return a UTF-8 encoding Python string object from the
609 C byte string STR, which is encoded with ENC. */
610
611 #ifdef Py_USING_UNICODE
/* Decode the NUL-terminated byte string STR using codec ENC and
   re-encode it as a UTF-8 str object.  Returns a new reference, or
   NULL with a Python error set on decode failure. */
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
622 #endif
623
624
625 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)626 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
627 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
628 char *buf, *current;
629 char c = '\0';
630 buf = PyMem_MALLOC(needed_length);
631 if (buf == NULL) {
632 tok->done = E_NOMEM;
633 return NULL;
634 }
635 for (current = buf; *s; s++, current++) {
636 c = *s;
637 if (skip_next_lf) {
638 skip_next_lf = 0;
639 if (c == '\n') {
640 c = *++s;
641 if (!c)
642 break;
643 }
644 }
645 if (c == '\r') {
646 skip_next_lf = 1;
647 c = '\n';
648 }
649 *current = c;
650 }
651 /* If this is exec input, add a newline to the end of the string if
652 there isn't one already. */
653 if (exec_input && c != '\n') {
654 *current = '\n';
655 current++;
656 }
657 *current = '\0';
658 final_length = current - buf + 1;
659 if (final_length < needed_length && final_length) {
660 /* should never fail */
661 char* result = PyMem_REALLOC(buf, final_length);
662 if (result == NULL) {
663 PyMem_FREE(buf);
664 }
665 buf = result;
666 }
667 return buf;
668 }
669
670 /* Decode a byte string STR for use as the buffer of TOK.
671 Look for encoding declarations inside STR, and record them
672 inside TOK. */
673
/* Decode a byte string INPUT for use as the buffer of TOK.
   Normalizes newlines, strips a UTF-8 BOM if present, looks for a
   PEP 263 coding declaration on the first two lines, and converts
   the text to UTF-8 when an encoding other than utf-8/latin-1 is
   declared.  Returns the (possibly converted) buffer, or NULL via
   error_ret on failure.  Note: when conversion happens, the returned
   pointer aliases the str object stashed in tok->decoding_buffer. */
static const char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL}; /* ends of the first two lines */
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* BOM present: re-encode the whole buffer as UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    /* Locate the ends of the first two lines (PEP 263 limit). */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* A coding spec was found: convert using that codec. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
733
734 #endif /* PGEN */
735
736 /* Set up tokenizer for string */
737
738 struct tok_state *
PyTokenizer_FromString(const char * str,int exec_input)739 PyTokenizer_FromString(const char *str, int exec_input)
740 {
741 struct tok_state *tok = tok_new();
742 if (tok == NULL)
743 return NULL;
744 str = (char *)decode_str(str, exec_input, tok);
745 if (str == NULL) {
746 PyTokenizer_Free(tok);
747 return NULL;
748 }
749
750 /* XXX: constify members. */
751 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
752 return tok;
753 }
754
755
756 /* Set up tokenizer for file */
757
758 struct tok_state *
PyTokenizer_FromFile(FILE * fp,char * ps1,char * ps2)759 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
760 {
761 struct tok_state *tok = tok_new();
762 if (tok == NULL)
763 return NULL;
764 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
765 PyTokenizer_Free(tok);
766 return NULL;
767 }
768 tok->cur = tok->inp = tok->buf;
769 tok->end = tok->buf + BUFSIZ;
770 tok->fp = fp;
771 tok->prompt = ps1;
772 tok->nextprompt = ps2;
773 return tok;
774 }
775
776
777 /* Free a tok_state structure */
778
/* Free a tok_state structure and everything it owns.
   Ownership rule: tok->buf is owned (and freed) only for file-based
   tokenizers; for string input tok->buf aliases tok->input, which is
   freed separately.  Mirrors the check in error_ret. */
void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    if (tok->input)
        PyMem_FREE((char *)tok->input);
    PyMem_FREE(tok);
}
794
795 #if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactive stdin line *INP from sys.stdin's declared
   encoding to UTF-8, replacing *INP in place (the old buffer is
   freed and ownership of the new one passes to the caller).  Also
   records the encoding in tok->encoding.
   Returns 0 on success or when no recoding applies (not a tty stdin,
   no declared encoding, or undecodable input falls back to the old
   behavior); returns -1 on hard failure with tok->done set. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    /* Only applies when the real stdin is the interactive stream. */
    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc); /* keep alive across the decode calls below */

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    /* Swap the caller's buffer for the UTF-8 version. */
    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    Py_DECREF(enc);
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        tok->done = E_ERROR;
        return -1;
    }
    /* Fallback to iso-8859-1: for backward compatibility */
    PyErr_Clear();
    return 0;
}
857 #endif
858
859 /* Get next char, updating state; error code goes into tok->done */
860
/* Get next char, updating state; error code goes into tok->done.

   Three input modes share this function:
     - string input (tok->fp == NULL): scan forward in the buffer,
       one logical line at a time;
     - interactive input (tok->prompt != NULL): fetch lines via
       PyOS_Readline, growing the buffer while a token spans lines;
     - file input: fill the buffer via decoding_fgets, growing it
       until a full line (ending in '\n') or EOF is read.
   Returns the next character masked with Py_CHARMASK, or EOF. */
static int
tok_nextc(register struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: advance inp to the end of the next line. */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            /* Interactive input: read one line from the prompt. */
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt; /* switch to "... " */
            if (newtok == NULL)
                tok->done = E_INTR;
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
#if !defined(PGEN) && defined(Py_USING_UNICODE)
            else if (tok_stdin_decode(tok, &newtok) != 0)
                PyMem_FREE(newtok);
#endif
            else if (tok->start != NULL) {
                /* A token is in progress: append the new line to the
                   existing buffer so the token stays contiguous. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                /* No token in progress: the new line replaces the
                   buffer outright. */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* File input. */
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                          tok) == NULL) {
                    if (!tok->decoding_erred)
                        tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp == tok->buf || tok->inp[-1] == '\n';
                }
            }
            else {
                /* Token in progress: keep existing data, remember the
                   resume offset before the buffer may move. */
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                          tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                /* Rebase all pointers onto the (possibly moved) buffer. */
                tok->buf = newbuf;
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                         tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                           (int)(tok->end - tok->inp),
                           tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                    */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}
1030
1031
1032 /* Back-up one character */
1033
1034 static void
tok_backup(register struct tok_state * tok,register int c)1035 tok_backup(register struct tok_state *tok, register int c)
1036 {
1037 if (c != EOF) {
1038 if (--tok->cur < tok->buf)
1039 Py_FatalError("tok_backup: beginning of buffer");
1040 if (*tok->cur != c)
1041 *tok->cur = c;
1042 }
1043 }
1044
1045
1046 /* Return the token corresponding to a single character */
1047
1048 int
PyToken_OneChar(int c)1049 PyToken_OneChar(int c)
1050 {
1051 switch (c) {
1052 case '(': return LPAR;
1053 case ')': return RPAR;
1054 case '[': return LSQB;
1055 case ']': return RSQB;
1056 case ':': return COLON;
1057 case ',': return COMMA;
1058 case ';': return SEMI;
1059 case '+': return PLUS;
1060 case '-': return MINUS;
1061 case '*': return STAR;
1062 case '/': return SLASH;
1063 case '|': return VBAR;
1064 case '&': return AMPER;
1065 case '<': return LESS;
1066 case '>': return GREATER;
1067 case '=': return EQUAL;
1068 case '.': return DOT;
1069 case '%': return PERCENT;
1070 case '`': return BACKQUOTE;
1071 case '{': return LBRACE;
1072 case '}': return RBRACE;
1073 case '^': return CIRCUMFLEX;
1074 case '~': return TILDE;
1075 case '@': return AT;
1076 default: return OP;
1077 }
1078 }
1079
1080
1081 int
PyToken_TwoChars(int c1,int c2)1082 PyToken_TwoChars(int c1, int c2)
1083 {
1084 switch (c1) {
1085 case '=':
1086 switch (c2) {
1087 case '=': return EQEQUAL;
1088 }
1089 break;
1090 case '!':
1091 switch (c2) {
1092 case '=': return NOTEQUAL;
1093 }
1094 break;
1095 case '<':
1096 switch (c2) {
1097 case '>': return NOTEQUAL;
1098 case '=': return LESSEQUAL;
1099 case '<': return LEFTSHIFT;
1100 }
1101 break;
1102 case '>':
1103 switch (c2) {
1104 case '=': return GREATEREQUAL;
1105 case '>': return RIGHTSHIFT;
1106 }
1107 break;
1108 case '+':
1109 switch (c2) {
1110 case '=': return PLUSEQUAL;
1111 }
1112 break;
1113 case '-':
1114 switch (c2) {
1115 case '=': return MINEQUAL;
1116 }
1117 break;
1118 case '*':
1119 switch (c2) {
1120 case '*': return DOUBLESTAR;
1121 case '=': return STAREQUAL;
1122 }
1123 break;
1124 case '/':
1125 switch (c2) {
1126 case '/': return DOUBLESLASH;
1127 case '=': return SLASHEQUAL;
1128 }
1129 break;
1130 case '|':
1131 switch (c2) {
1132 case '=': return VBAREQUAL;
1133 }
1134 break;
1135 case '%':
1136 switch (c2) {
1137 case '=': return PERCENTEQUAL;
1138 }
1139 break;
1140 case '&':
1141 switch (c2) {
1142 case '=': return AMPEREQUAL;
1143 }
1144 break;
1145 case '^':
1146 switch (c2) {
1147 case '=': return CIRCUMFLEXEQUAL;
1148 }
1149 break;
1150 }
1151 return OP;
1152 }
1153
1154 int
PyToken_ThreeChars(int c1,int c2,int c3)1155 PyToken_ThreeChars(int c1, int c2, int c3)
1156 {
1157 switch (c1) {
1158 case '<':
1159 switch (c2) {
1160 case '<':
1161 switch (c3) {
1162 case '=':
1163 return LEFTSHIFTEQUAL;
1164 }
1165 break;
1166 }
1167 break;
1168 case '>':
1169 switch (c2) {
1170 case '>':
1171 switch (c3) {
1172 case '=':
1173 return RIGHTSHIFTEQUAL;
1174 }
1175 break;
1176 }
1177 break;
1178 case '*':
1179 switch (c2) {
1180 case '*':
1181 switch (c3) {
1182 case '=':
1183 return DOUBLESTAREQUAL;
1184 }
1185 break;
1186 }
1187 break;
1188 case '/':
1189 switch (c2) {
1190 case '/':
1191 switch (c3) {
1192 case '=':
1193 return DOUBLESLASHEQUAL;
1194 }
1195 break;
1196 }
1197 break;
1198 }
1199 return OP;
1200 }
1201
/* Report inconsistent use of tabs and spaces in indentation.
   In strict mode (tok->alterror, set by -tt) this is a hard error:
   tok->done becomes E_TABSPACE and 1 is returned so the caller emits
   ERRORTOKEN.  Otherwise (-t) a warning is printed at most once per
   file and 0 is returned so tokenizing continues. */
static int
indenterror(struct tok_state *tok)
{
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;
        return 1;
    }
    if (tok->altwarning) {
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
        tok->altwarning = 0; /* warn only once */
    }
    return 0;
}
1217
1218 /* Get next token, after space stripping etc. */
1219
/* Fetch the next token.  On return, *p_start (inclusive) and *p_end
   (exclusive) delimit the token's text inside the tokenizer's buffer,
   and the token type is the return value.  On error ERRORTOKEN is
   returned and tok->done holds the E_* error code. */
static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;          /* nonzero: current line is empty or comment-only */

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        register int col = 0;       /* column assuming tabs = tok->tabsize */
        register int altcol = 0;    /* column assuming tabs = tok->alttabsize */
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is only significant outside of brackets
           (tok->level == 0) and on non-blank lines. */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    /* Same column only because tabs were counted
                       differently -- tab/space ambiguity. */
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                       col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",                   /* Emacs */
            ":tabstop=",                    /* vim, full form */
            ":ts=",                         /* vim, abbreviated form */
            "set tabsize=",                 /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        /* Copy (up to 79 chars of) the comment so the editor magic
           can be searched for with strstr below. */
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                            "Tab size set to %d\n",
                            newsize);
                }
            }
        }
        /* Consume the rest of the comment (if the 80-char buffer
           filled up before the end of line). */
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (Py_ISALPHA(c) || c == '_') {
        /* Process r"", u"" and ur"" */
        switch (c) {
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* Newlines inside brackets or on blank lines are not tokens. */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {

                /* Hex */
                c = tok_nextc(tok);
                if (!isxdigit(c)) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                if (c < '0' || c >= '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    /* Digits 8/9 appeared but no '.', exponent or 'j'
                       followed -- invalid octal literal. */
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
                    int e;
        exponent:
                    e = c;  /* remember the 'e'/'E' so it can be pushed back */
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    } else if (!isdigit(c)) {
                        /* e.g. "1e)": the 'e' is not an exponent;
                           push back both characters and end the
                           NUMBER before the 'e'. */
                        tok_backup(tok, c);
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2 is the offset (from tok->start) of the character just
           past a doubled opening quote: tok->cur - tok->start == quote2
           exactly when the second quote immediately follows the first,
           which signals either an empty string or a triple quote. */
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        int tripcount = 0;      /* consecutive closing-quote chars seen */
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    /* Doubled opening quote: peek for a third. */
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                /* Skip the escaped character (whatever it is). */
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
#ifndef PGEN
        /* With -3, "<>" raises a DeprecationWarning. */
        if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
            if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
                                   "<> not supported in 3.x; use !=",
                                   tok->filename, tok->lineno,
                                   NULL, NULL)) {
                tok->done = E_ERROR;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
        }
#endif
        if (token != OP) {
            /* A two-char token may extend to three chars (e.g. "<<="). */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
1684
1685 int
PyTokenizer_Get(struct tok_state * tok,char ** p_start,char ** p_end)1686 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1687 {
1688 int result = tok_get(tok, p_start, p_end);
1689 if (tok->fp && ferror(tok->fp)) {
1690 clearerr(tok->fp);
1691 result = ERRORTOKEN;
1692 tok->done = E_IO;
1693 }
1694 if (tok->decoding_erred) {
1695 result = ERRORTOKEN;
1696 tok->done = E_DECODE;
1697 }
1698 return result;
1699 }
1700
1701 /* This function is only called from parsetok. However, it cannot live
1702 there, as it must be empty for PGEN, and we can check for PGEN only
1703 in this file. */
1704
1705 #if defined(PGEN) || !defined(Py_USING_UNICODE)
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    /* Stub for PGEN (or non-Unicode) builds: encoding restoration is
       unavailable, so always report "no converted text". */
    return NULL;
}
1711 #else
1712 #ifdef Py_USING_UNICODE
1713 static PyObject *
dec_utf8(const char * enc,const char * text,size_t len)1714 dec_utf8(const char *enc, const char *text, size_t len) {
1715 PyObject *ret = NULL;
1716 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1717 if (unicode_text) {
1718 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1719 Py_DECREF(unicode_text);
1720 }
1721 if (!ret) {
1722 PyErr_Clear();
1723 }
1724 return ret;
1725 }
1726 char *
PyTokenizer_RestoreEncoding(struct tok_state * tok,int len,int * offset)1727 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1728 {
1729 char *text = NULL;
1730 if (tok->encoding) {
1731 /* convert source to original encondig */
1732 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1733 if (lineobj != NULL) {
1734 int linelen = PyString_Size(lineobj);
1735 const char *line = PyString_AsString(lineobj);
1736 text = PyObject_MALLOC(linelen + 1);
1737 if (text != NULL && line != NULL) {
1738 if (linelen)
1739 strncpy(text, line, linelen);
1740 text[linelen] = '\0';
1741 }
1742 Py_DECREF(lineobj);
1743
1744 /* adjust error offset */
1745 if (*offset > 1) {
1746 PyObject *offsetobj = dec_utf8(tok->encoding,
1747 tok->buf, *offset-1);
1748 if (offsetobj) {
1749 *offset = PyString_Size(offsetobj) + 1;
1750 Py_DECREF(offsetobj);
1751 }
1752 }
1753
1754 }
1755 }
1756 return text;
1757
1758 }
1759 #endif /* defined(Py_USING_UNICODE) */
1760 #endif
1761
1762
1763 #ifdef Py_DEBUG
1764
1765 void
tok_dump(int type,char * start,char * end)1766 tok_dump(int type, char *start, char *end)
1767 {
1768 printf("%s", _PyParser_TokenNames[type]);
1769 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1770 printf("(%.*s)", (int)(end - start), start);
1771 }
1772
1773 #endif
1774