1 /*
2  * This file is part of the MicroPython project, http://micropython.org/
3  *
4  * The MIT License (MIT)
5  *
6  * Copyright (c) 2013, 2014 Damien P. George
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  */
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include <assert.h>
30 
31 #include "py/reader.h"
32 #include "py/lexer.h"
33 #include "py/runtime.h"
34 
35 #if MICROPY_ENABLE_COMPILER
36 
37 #define TAB_SIZE (8)
38 
39 // TODO seems that CPython allows NULL byte in the input stream
40 // don't know if that's intentional or not, but we don't allow it
41 
42 #define MP_LEXER_EOF ((unichar)MP_READER_EOF)
43 #define CUR_CHAR(lex) ((lex)->chr0)
44 
is_end(mp_lexer_t * lex)45 STATIC bool is_end(mp_lexer_t *lex) {
46     return lex->chr0 == MP_LEXER_EOF;
47 }
48 
is_physical_newline(mp_lexer_t * lex)49 STATIC bool is_physical_newline(mp_lexer_t *lex) {
50     return lex->chr0 == '\n';
51 }
52 
is_char(mp_lexer_t * lex,byte c)53 STATIC bool is_char(mp_lexer_t *lex, byte c) {
54     return lex->chr0 == c;
55 }
56 
is_char_or(mp_lexer_t * lex,byte c1,byte c2)57 STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
58     return lex->chr0 == c1 || lex->chr0 == c2;
59 }
60 
is_char_or3(mp_lexer_t * lex,byte c1,byte c2,byte c3)61 STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
62     return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
63 }
64 
#if MICROPY_PY_FSTRINGS
// Report whether the current character equals any of c1, c2, c3 or c4.
// Only compiled in when f-string support is enabled.
STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
    const unichar cur = CUR_CHAR(lex);
    return (cur == c1 || cur == c2) || (cur == c3 || cur == c4);
}
#endif
70 
is_char_following(mp_lexer_t * lex,byte c)71 STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
72     return lex->chr1 == c;
73 }
74 
is_char_following_or(mp_lexer_t * lex,byte c1,byte c2)75 STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
76     return lex->chr1 == c1 || lex->chr1 == c2;
77 }
78 
is_char_following_following_or(mp_lexer_t * lex,byte c1,byte c2)79 STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
80     return lex->chr2 == c1 || lex->chr2 == c2;
81 }
82 
is_char_and(mp_lexer_t * lex,byte c1,byte c2)83 STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
84     return lex->chr0 == c1 && lex->chr1 == c2;
85 }
86 
is_whitespace(mp_lexer_t * lex)87 STATIC bool is_whitespace(mp_lexer_t *lex) {
88     return unichar_isspace(lex->chr0);
89 }
90 
is_letter(mp_lexer_t * lex)91 STATIC bool is_letter(mp_lexer_t *lex) {
92     return unichar_isalpha(lex->chr0);
93 }
94 
is_digit(mp_lexer_t * lex)95 STATIC bool is_digit(mp_lexer_t *lex) {
96     return unichar_isdigit(lex->chr0);
97 }
98 
is_following_digit(mp_lexer_t * lex)99 STATIC bool is_following_digit(mp_lexer_t *lex) {
100     return unichar_isdigit(lex->chr1);
101 }
102 
is_following_base_char(mp_lexer_t * lex)103 STATIC bool is_following_base_char(mp_lexer_t *lex) {
104     const unichar chr1 = lex->chr1 | 0x20;
105     return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
106 }
107 
is_following_odigit(mp_lexer_t * lex)108 STATIC bool is_following_odigit(mp_lexer_t *lex) {
109     return lex->chr1 >= '0' && lex->chr1 <= '7';
110 }
111 
is_string_or_bytes(mp_lexer_t * lex)112 STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
113     return is_char_or(lex, '\'', '\"')
114            #if MICROPY_PY_FSTRINGS
115            || (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"'))
116            || (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
117                && is_char_following_following_or(lex, '\'', '\"')))
118            #else
119            || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
120            #endif
121            || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
122                && is_char_following_following_or(lex, '\'', '\"'));
123 }
124 
125 // to easily parse utf-8 identifiers we allow any raw byte with high bit set
is_head_of_identifier(mp_lexer_t * lex)126 STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
127     return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
128 }
129 
is_tail_of_identifier(mp_lexer_t * lex)130 STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
131     return is_head_of_identifier(lex) || is_digit(lex);
132 }
133 
next_char(mp_lexer_t * lex)134 STATIC void next_char(mp_lexer_t *lex) {
135     if (lex->chr0 == '\n') {
136         // a new line
137         ++lex->line;
138         lex->column = 1;
139     } else if (lex->chr0 == '\t') {
140         // a tab
141         lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
142     } else {
143         // a character worth one column
144         ++lex->column;
145     }
146 
147     // shift the input queue forward
148     lex->chr0 = lex->chr1;
149     lex->chr1 = lex->chr2;
150 
151     // and add the next byte from either the fstring args or the reader
152     #if MICROPY_PY_FSTRINGS
153     if (lex->fstring_args_idx) {
154         // if there are saved chars, then we're currently injecting fstring args
155         if (lex->fstring_args_idx < lex->fstring_args.len) {
156             lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
157         } else {
158             // no more fstring arg bytes
159             lex->chr2 = '\0';
160         }
161 
162         if (lex->chr0 == '\0') {
163             // consumed all fstring data, restore saved input queue
164             lex->chr0 = lex->chr0_saved;
165             lex->chr1 = lex->chr1_saved;
166             lex->chr2 = lex->chr2_saved;
167             // stop consuming fstring arg data
168             vstr_reset(&lex->fstring_args);
169             lex->fstring_args_idx = 0;
170         }
171     } else
172     #endif
173     {
174         lex->chr2 = lex->reader.readbyte(lex->reader.data);
175     }
176 
177     if (lex->chr1 == '\r') {
178         // CR is a new line, converted to LF
179         lex->chr1 = '\n';
180         if (lex->chr2 == '\n') {
181             // CR LF is a single new line, throw out the extra LF
182             lex->chr2 = lex->reader.readbyte(lex->reader.data);
183         }
184     }
185 
186     // check if we need to insert a newline at end of file
187     if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
188         lex->chr2 = '\n';
189     }
190 }
191 
indent_push(mp_lexer_t * lex,size_t indent)192 STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
193     if (lex->num_indent_level >= lex->alloc_indent_level) {
194         lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
195         lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
196     }
197     lex->indent_level[lex->num_indent_level++] = indent;
198 }
199 
indent_top(mp_lexer_t * lex)200 STATIC size_t indent_top(mp_lexer_t *lex) {
201     return lex->indent_level[lex->num_indent_level - 1];
202 }
203 
indent_pop(mp_lexer_t * lex)204 STATIC void indent_pop(mp_lexer_t *lex) {
205     lex->num_indent_level -= 1;
206 }
207 
// Compact encoding of the delimiter/operator tokens, decoded by
// mp_lexer_to_next().
//
// some tricky operator encoding:
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
// this means if the start of two ops are the same then they are equal til the last char
//
// Each bare character and each e<op>/c<op> pair counts as one entry; the
// position of an entry in this string is the index used to look up its token
// kind in tok_enc_kind, so the two tables must be kept in the same order.
// The trailing '!' and '.' are matched by the same scan but are handled as
// special cases by the decoder and have no entry in tok_enc_kind.

STATIC const char *const tok_enc =
    "()[]{},;~"   // singles
    ":e="         // : :=
    "<e=c<e="     // < <= << <<=
    ">e=c>e="     // > >= >> >>=
    "*e=c*e="     // * *= ** **=
    "+e="         // + +=
    "-e=e>"       // - -= ->
    "&e="         // & &=
    "|e="         // | |=
    "/e=c/e="     // / /= // //=
    "%e="         // % %=
    "^e="         // ^ ^=
    "@e="         // @ @=
    "=e="         // = ==
    "!.";         // start of special cases: != . ...
230 
// Token kind for each entry of tok_enc, in the same order: element k is the
// kind produced when the k-th encoded operator/delimiter is the longest match.
// One row below corresponds to one line of the tok_enc string.
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
STATIC const uint8_t tok_enc_kind[] = {
    // singles: ( ) [ ] { } , ; ~
    MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
    MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
    MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
    MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_OP_TILDE,

    // multi-character families, one family per line
    MP_TOKEN_DEL_COLON, MP_TOKEN_OP_ASSIGN,
    MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
    MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
    MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
    MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
    MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
    MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
    MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
    MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
    MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
    MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
    MP_TOKEN_OP_AT, MP_TOKEN_DEL_AT_EQUAL,
    MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
};
252 
// Keyword spellings, looked up by mp_lexer_to_next() after a name is lexed.
// must have the same order as enum in lexer.h
//   (the token kind is computed as MP_TOKEN_KW_FALSE + index into this table)
// must be sorted according to strcmp
//   (the lookup stops as soon as the name compares less than an entry)
STATIC const char *const tok_kw[] = {
    "False",
    "None",
    "True",
    "__debug__",
    "and",
    "as",
    "assert",
    #if MICROPY_PY_ASYNC_AWAIT
    "async",
    "await",
    #endif
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
};
295 
296 // This is called with CUR_CHAR() before first hex digit, and should return with
297 // it pointing to last hex digit
298 // num_digits must be greater than zero
get_hex(mp_lexer_t * lex,size_t num_digits,mp_uint_t * result)299 STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
300     mp_uint_t num = 0;
301     while (num_digits-- != 0) {
302         next_char(lex);
303         unichar c = CUR_CHAR(lex);
304         if (!unichar_isxdigit(c)) {
305             return false;
306         }
307         num = (num << 4) + unichar_xdigit_value(c);
308     }
309     *result = num;
310     return true;
311 }
312 
parse_string_literal(mp_lexer_t * lex,bool is_raw,bool is_fstring)313 STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
314     // get first quoting character
315     char quote_char = '\'';
316     if (is_char(lex, '\"')) {
317         quote_char = '\"';
318     }
319     next_char(lex);
320 
321     // work out if it's a single or triple quoted literal
322     size_t num_quotes;
323     if (is_char_and(lex, quote_char, quote_char)) {
324         // triple quotes
325         next_char(lex);
326         next_char(lex);
327         num_quotes = 3;
328     } else {
329         // single quotes
330         num_quotes = 1;
331     }
332 
333     size_t n_closing = 0;
334     #if MICROPY_PY_FSTRINGS
335     if (is_fstring) {
336         // assume there's going to be interpolation, so prep the injection data
337         // fstring_args_idx==0 && len(fstring_args)>0 means we're extracting the args.
338         // only when fstring_args_idx>0 will we consume the arg data
339         // note: lex->fstring_args will be empty already (it's reset when finished)
340         vstr_add_str(&lex->fstring_args, ".format(");
341     }
342     #endif
343 
344     while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
345         if (is_char(lex, quote_char)) {
346             n_closing += 1;
347             vstr_add_char(&lex->vstr, CUR_CHAR(lex));
348         } else {
349             n_closing = 0;
350 
351             #if MICROPY_PY_FSTRINGS
352             while (is_fstring && is_char(lex, '{')) {
353                 next_char(lex);
354                 if (is_char(lex, '{')) {
355                     // "{{" is passed through unchanged to be handled by str.format
356                     vstr_add_byte(&lex->vstr, '{');
357                     next_char(lex);
358                 } else {
359                     // remember the start of this argument (if we need it for f'{a=}').
360                     size_t i = lex->fstring_args.len;
361                     // extract characters inside the { until we reach the
362                     // format specifier or closing }.
363                     // (MicroPython limitation) note: this is completely unaware of
364                     // Python syntax and will not handle any expression containing '}' or ':'.
365                     // e.g. f'{"}"}' or f'{foo({})}'.
366                     while (!is_end(lex) && !is_char_or(lex, ':', '}')) {
367                         // like the default case at the end of this function, stay 8-bit clean
368                         vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex));
369                         next_char(lex);
370                     }
371                     if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') {
372                         // if the last character of the arg was '=', then inject "arg=" before the '{'.
373                         // f'{a=}' --> 'a={}'.format(a)
374                         vstr_add_strn(&lex->vstr, lex->fstring_args.buf + i, lex->fstring_args.len - i);
375                         // remove the trailing '='
376                         lex->fstring_args.len--;
377                     }
378                     // comma-separate args
379                     vstr_add_byte(&lex->fstring_args, ',');
380                 }
381                 vstr_add_byte(&lex->vstr, '{');
382             }
383             #endif
384 
385             if (is_char(lex, '\\')) {
386                 next_char(lex);
387                 unichar c = CUR_CHAR(lex);
388                 if (is_raw) {
389                     // raw strings allow escaping of quotes, but the backslash is also emitted
390                     vstr_add_char(&lex->vstr, '\\');
391                 } else {
392                     switch (c) {
393                         // note: "c" can never be MP_LEXER_EOF because next_char
394                         // always inserts a newline at the end of the input stream
395                         case '\n':
396                             c = MP_LEXER_EOF;
397                             break;                          // backslash escape the newline, just ignore it
398                         case '\\':
399                             break;
400                         case '\'':
401                             break;
402                         case '"':
403                             break;
404                         case 'a':
405                             c = 0x07;
406                             break;
407                         case 'b':
408                             c = 0x08;
409                             break;
410                         case 't':
411                             c = 0x09;
412                             break;
413                         case 'n':
414                             c = 0x0a;
415                             break;
416                         case 'v':
417                             c = 0x0b;
418                             break;
419                         case 'f':
420                             c = 0x0c;
421                             break;
422                         case 'r':
423                             c = 0x0d;
424                             break;
425                         case 'u':
426                         case 'U':
427                             if (lex->tok_kind == MP_TOKEN_BYTES) {
428                                 // b'\u1234' == b'\\u1234'
429                                 vstr_add_char(&lex->vstr, '\\');
430                                 break;
431                             }
432                             // Otherwise fall through.
433                             MP_FALLTHROUGH
434                         case 'x': {
435                             mp_uint_t num = 0;
436                             if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
437                                 // not enough hex chars for escape sequence
438                                 lex->tok_kind = MP_TOKEN_INVALID;
439                             }
440                             c = num;
441                             break;
442                         }
443                         case 'N':
444                             // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
445                             // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
446                             // 3MB of text; even gzip-compressed and with minimal structure, it'll take
447                             // roughly half a meg of storage. This form of Unicode escape may be added
448                             // later on, but it's definitely not a priority right now. -- CJA 20140607
449                             mp_raise_NotImplementedError(MP_ERROR_TEXT("unicode name escapes"));
450                             break;
451                         default:
452                             if (c >= '0' && c <= '7') {
453                                 // Octal sequence, 1-3 chars
454                                 size_t digits = 3;
455                                 mp_uint_t num = c - '0';
456                                 while (is_following_odigit(lex) && --digits != 0) {
457                                     next_char(lex);
458                                     num = num * 8 + (CUR_CHAR(lex) - '0');
459                                 }
460                                 c = num;
461                             } else {
462                                 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
463                                 vstr_add_char(&lex->vstr, '\\');
464                             }
465                             break;
466                     }
467                 }
468                 if (c != MP_LEXER_EOF) {
469                     if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
470                         if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
471                             vstr_add_char(&lex->vstr, c);
472                         } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
473                             vstr_add_byte(&lex->vstr, c);
474                         } else {
475                             // unicode character out of range
476                             // this raises a generic SyntaxError; could provide more info
477                             lex->tok_kind = MP_TOKEN_INVALID;
478                         }
479                     } else {
480                         // without unicode everything is just added as an 8-bit byte
481                         if (c < 0x100) {
482                             vstr_add_byte(&lex->vstr, c);
483                         } else {
484                             // 8-bit character out of range
485                             // this raises a generic SyntaxError; could provide more info
486                             lex->tok_kind = MP_TOKEN_INVALID;
487                         }
488                     }
489                 }
490             } else {
491                 // Add the "character" as a byte so that we remain 8-bit clean.
492                 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
493                 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
494             }
495         }
496         next_char(lex);
497     }
498 
499     // check we got the required end quotes
500     if (n_closing < num_quotes) {
501         lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
502     }
503 
504     // cut off the end quotes from the token text
505     vstr_cut_tail_bytes(&lex->vstr, n_closing);
506 }
507 
skip_whitespace(mp_lexer_t * lex,bool stop_at_newline)508 STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
509     bool had_physical_newline = false;
510     while (!is_end(lex)) {
511         if (is_physical_newline(lex)) {
512             if (stop_at_newline && lex->nested_bracket_level == 0) {
513                 break;
514             }
515             had_physical_newline = true;
516             next_char(lex);
517         } else if (is_whitespace(lex)) {
518             next_char(lex);
519         } else if (is_char(lex, '#')) {
520             next_char(lex);
521             while (!is_end(lex) && !is_physical_newline(lex)) {
522                 next_char(lex);
523             }
524             // had_physical_newline will be set on next loop
525         } else if (is_char_and(lex, '\\', '\n')) {
526             // line-continuation, so don't set had_physical_newline
527             next_char(lex);
528             next_char(lex);
529         } else {
530             break;
531         }
532     }
533     return had_physical_newline;
534 }
535 
// Advance the lexer to the next token.
//
// On return lex->tok_kind holds the kind of the new token, lex->vstr holds
// its text (for names, numbers and string/bytes literals) and
// lex->tok_line / lex->tok_column hold the position recorded after leading
// whitespace and comments were skipped.
//
// The checks below are tried in order: pending DEDENT/INDENT tokens, NEWLINE
// (which also works out indentation changes), end of input, string/bytes
// literals, names and keywords, numbers, and finally operators/delimiters.
void mp_lexer_to_next(mp_lexer_t *lex) {
    #if MICROPY_PY_FSTRINGS
    if (lex->fstring_args.len && lex->fstring_args_idx == 0) {
        // moving onto the next token means the literal string is complete.
        // switch into injecting the format args.
        vstr_add_byte(&lex->fstring_args, ')');
        // keep the real look-ahead window so next_char() can restore it later
        lex->chr0_saved = lex->chr0;
        lex->chr1_saved = lex->chr1;
        lex->chr2_saved = lex->chr2;
        lex->chr0 = lex->fstring_args.buf[0];
        lex->chr1 = lex->fstring_args.buf[1];
        lex->chr2 = lex->fstring_args.buf[2];
        // we've already extracted 3 chars, but setting this non-zero also
        // means we'll start consuming the fstring data
        lex->fstring_args_idx = 3;
    }
    #endif

    // start new token text
    vstr_reset(&lex->vstr);

    // skip white space and comments
    bool had_physical_newline = skip_whitespace(lex, false);

    // set token source information
    lex->tok_line = lex->line;
    lex->tok_column = lex->column;

    if (lex->emit_dent < 0) {
        // negative count: DEDENT tokens are still owed from an earlier NEWLINE
        lex->tok_kind = MP_TOKEN_DEDENT;
        lex->emit_dent += 1;

    } else if (lex->emit_dent > 0) {
        // positive count: INDENT tokens are still owed from an earlier NEWLINE
        lex->tok_kind = MP_TOKEN_INDENT;
        lex->emit_dent -= 1;

    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
        // a logical line ended (newlines inside brackets do not count)
        lex->tok_kind = MP_TOKEN_NEWLINE;

        // compare the indentation of the new line with the indent stack and
        // schedule the INDENT/DEDENT tokens to emit on the following calls
        size_t num_spaces = lex->column - 1;
        if (num_spaces == indent_top(lex)) {
            // same indentation as before, nothing to schedule
        } else if (num_spaces > indent_top(lex)) {
            indent_push(lex, num_spaces);
            lex->emit_dent += 1;
        } else {
            // pop one level, and owe one DEDENT, per enclosing block closed
            while (num_spaces < indent_top(lex)) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
            // the new indentation must match one of the enclosing levels
            if (num_spaces != indent_top(lex)) {
                lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
            }
        }

    } else if (is_end(lex)) {
        lex->tok_kind = MP_TOKEN_END;

    } else if (is_string_or_bytes(lex)) {
        // a string or bytes literal

        // Python requires adjacent string/bytes literals to be automatically
        // concatenated.  We do it here in the tokeniser to make efficient use of RAM,
        // because then the lexer's vstr can be used to accumulate the string literal,
        // in contrast to creating a parse tree of strings and then joining them later
        // in the compiler.  It's also more compact in code size to do it here.

        // MP_TOKEN_END is used to indicate that this is the first string token
        lex->tok_kind = MP_TOKEN_END;

        // Loop to accumulate string/bytes literals
        do {
            // parse type codes
            bool is_raw = false;
            bool is_fstring = false;
            mp_token_kind_t kind = MP_TOKEN_STRING;
            // number of prefix letters in front of the opening quote
            int n_char = 0;
            if (is_char(lex, 'u')) {
                n_char = 1;
            } else if (is_char(lex, 'b')) {
                kind = MP_TOKEN_BYTES;
                n_char = 1;
                if (is_char_following(lex, 'r')) {
                    is_raw = true;
                    n_char = 2;
                }
            } else if (is_char(lex, 'r')) {
                is_raw = true;
                n_char = 1;
                if (is_char_following(lex, 'b')) {
                    kind = MP_TOKEN_BYTES;
                    n_char = 2;
                }
                #if MICROPY_PY_FSTRINGS
                if (is_char_following(lex, 'f')) {
                    // raw-f-strings unsupported, immediately return (invalid) token.
                    lex->tok_kind = MP_TOKEN_FSTRING_RAW;
                    break;
                }
                #endif
            }
            #if MICROPY_PY_FSTRINGS
            else if (is_char(lex, 'f')) {
                if (is_char_following(lex, 'r')) {
                    // raw-f-strings unsupported, immediately return (invalid) token.
                    lex->tok_kind = MP_TOKEN_FSTRING_RAW;
                    break;
                }
                n_char = 1;
                is_fstring = true;
            }
            #endif

            // Set or check token kind
            if (lex->tok_kind == MP_TOKEN_END) {
                lex->tok_kind = kind;
            } else if (lex->tok_kind != kind) {
                // Can't concatenate string with bytes
                // (the mismatching literal is left in the input for the next call)
                break;
            }

            // Skip any type code characters
            if (n_char != 0) {
                next_char(lex);
                if (n_char == 2) {
                    next_char(lex);
                }
            }

            // Parse the literal
            parse_string_literal(lex, is_raw, is_fstring);

            // Skip whitespace so we can check if there's another string following
            skip_whitespace(lex, true);

        } while (is_string_or_bytes(lex));

    } else if (is_head_of_identifier(lex)) {
        lex->tok_kind = MP_TOKEN_NAME;

        // get first char (add as byte to remain 8-bit clean and support utf-8)
        vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex) && is_tail_of_identifier(lex)) {
            vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        }

        // Check if the name is a keyword.
        // We also check for __debug__ here and convert it to its value.  This is
        // so the parser gives a syntax error on, eg, x.__debug__.  Otherwise, we
        // need to check for this special token in many places in the compiler.
        const char *s = vstr_null_terminated_str(&lex->vstr);
        for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
            int cmp = strcmp(s, tok_kw[i]);
            if (cmp == 0) {
                // keyword token kinds are laid out in the same order as tok_kw
                lex->tok_kind = MP_TOKEN_KW_FALSE + i;
                if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
                    lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
                }
                break;
            } else if (cmp < 0) {
                // Table is sorted and comparison was less-than, so stop searching
                break;
            }
        }

    } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        // a number; forced_integer is set for a 0b/0o/0x prefix so that an
        // 'e' or 'E' digit is not taken as a float exponent
        bool forced_integer = false;
        if (is_char(lex, '.')) {
            lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
        } else {
            lex->tok_kind = MP_TOKEN_INTEGER;
            if (is_char(lex, '0') && is_following_base_char(lex)) {
                forced_integer = true;
            }
        }

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex)) {
            if (!forced_integer && is_char_or(lex, 'e', 'E')) {
                // exponent marker: always stored as lower-case 'e', followed
                // by an optional sign
                lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
                vstr_add_char(&lex->vstr, 'e');
                next_char(lex);
                if (is_char(lex, '+') || is_char(lex, '-')) {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                    next_char(lex);
                }
            } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
                // a decimal point or imaginary suffix makes it a float/imag token
                if (is_char_or3(lex, '.', 'j', 'J')) {
                    lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
                }
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                next_char(lex);
            } else if (is_char(lex, '_')) {
                // underscores are consumed but not added to the token text
                next_char(lex);
            } else {
                break;
            }
        }

    } else {
        // search for encoded delimiter or operator

        // scan tok_enc for an entry whose first character is the current one,
        // counting entries on the way; an 'e'/'c' marker and the character
        // after it together form one entry and are skipped as a pair
        const char *t = tok_enc;
        size_t tok_enc_index = 0;
        for (; *t != 0 && !is_char(lex, *t); t += 1) {
            if (*t == 'e' || *t == 'c') {
                t += 1;
            }
            tok_enc_index += 1;
        }

        // consume the first character of the token (even when nothing matched)
        next_char(lex);

        if (*t == 0) {
            // didn't match any delimiter or operator characters
            lex->tok_kind = MP_TOKEN_INVALID;

        } else if (*t == '!') {
            // "!=" is a special case because "!" is not a valid operator
            if (is_char(lex, '=')) {
                next_char(lex);
                lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
            } else {
                lex->tok_kind = MP_TOKEN_INVALID;
            }

        } else if (*t == '.') {
            // "." and "..." are special cases because ".." is not a valid operator
            if (is_char_and(lex, '.', '.')) {
                next_char(lex);
                next_char(lex);
                lex->tok_kind = MP_TOKEN_ELLIPSIS;
            } else {
                lex->tok_kind = MP_TOKEN_DEL_PERIOD;
            }

        } else {
            // matched a delimiter or operator character

            // get the maximum characters for a valid token
            // t_index tracks the entry being examined; tok_enc_index is only
            // moved forward when that entry's character is actually consumed
            t += 1;
            size_t t_index = tok_enc_index;
            while (*t == 'c' || *t == 'e') {
                t_index += 1;
                if (is_char(lex, t[1])) {
                    next_char(lex);
                    tok_enc_index = t_index;
                    if (*t == 'e') {
                        // an "end" entry matched, the token is complete
                        break;
                    }
                } else if (*t == 'c') {
                    // a "continue" entry did not match, so none of the longer
                    // forms that follow it can match either
                    break;
                }
                t += 2;
            }

            // set token kind
            lex->tok_kind = tok_enc_kind[tok_enc_index];

            // compute bracket level for implicit line joining
            if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
                lex->nested_bracket_level += 1;
            } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
                lex->nested_bracket_level -= 1;
            }
        }
    }
}
811 
mp_lexer_new(qstr src_name,mp_reader_t reader)812 mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
813     mp_lexer_t *lex = m_new_obj(mp_lexer_t);
814 
815     lex->source_name = src_name;
816     lex->reader = reader;
817     lex->line = 1;
818     lex->column = (size_t)-2; // account for 3 dummy bytes
819     lex->emit_dent = 0;
820     lex->nested_bracket_level = 0;
821     lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
822     lex->num_indent_level = 1;
823     lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
824     vstr_init(&lex->vstr, 32);
825     #if MICROPY_PY_FSTRINGS
826     vstr_init(&lex->fstring_args, 0);
827     #endif
828 
829     // store sentinel for first indentation level
830     lex->indent_level[0] = 0;
831 
832     // load lexer with start of file, advancing lex->column to 1
833     // start with dummy bytes and use next_char() for proper EOL/EOF handling
834     lex->chr0 = lex->chr1 = lex->chr2 = 0;
835     next_char(lex);
836     next_char(lex);
837     next_char(lex);
838 
839     // preload first token
840     mp_lexer_to_next(lex);
841 
842     // Check that the first token is in the first column.  If it's not then we
843     // convert the token kind to INDENT so that the parser gives a syntax error.
844     if (lex->tok_column != 1) {
845         lex->tok_kind = MP_TOKEN_INDENT;
846     }
847 
848     return lex;
849 }
850 
mp_lexer_new_from_str_len(qstr src_name,const char * str,size_t len,size_t free_len)851 mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len) {
852     mp_reader_t reader;
853     mp_reader_new_mem(&reader, (const byte *)str, len, free_len);
854     return mp_lexer_new(src_name, reader);
855 }
856 
857 #if MICROPY_READER_POSIX || MICROPY_READER_VFS
858 
mp_lexer_new_from_file(const char * filename)859 mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
860     mp_reader_t reader;
861     mp_reader_new_file(&reader, filename);
862     return mp_lexer_new(qstr_from_str(filename), reader);
863 }
864 
865 #if MICROPY_HELPER_LEXER_UNIX
866 
mp_lexer_new_from_fd(qstr filename,int fd,bool close_fd)867 mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
868     mp_reader_t reader;
869     mp_reader_new_file_from_fd(&reader, fd, close_fd);
870     return mp_lexer_new(filename, reader);
871 }
872 
873 #endif
874 
875 #endif
876 
// Release a lexer: close its reader, clear its vstr buffers, and free the
// indentation array and the lexer object itself.  A null `lex` is ignored.
void mp_lexer_free(mp_lexer_t *lex) {
    if (!lex) {
        // nothing to release
        return;
    }

    // close the input source first
    lex->reader.close(lex->reader.data);

    // clear the token text buffers
    vstr_clear(&lex->vstr);
    #if MICROPY_PY_FSTRINGS
    vstr_clear(&lex->fstring_args);
    #endif

    // free the indentation array, then the lexer object
    m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
    m_del_obj(mp_lexer_t, lex);
}
888 
889 #if 0
890 // This function is used to print the current token and should only be
891 // needed to debug the lexer, so it's not available via a config option.
892 void mp_lexer_show_token(const mp_lexer_t *lex) {
893     printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
894     if (lex->vstr.len > 0) {
895         const byte *i = (const byte *)lex->vstr.buf;
896         const byte *j = (const byte *)i + lex->vstr.len;
897         printf(" ");
898         while (i < j) {
899             unichar c = utf8_get_char(i);
900             i = utf8_next_char(i);
901             if (unichar_isprint(c)) {
902                 printf("%c", (int)c);
903             } else {
904                 printf("?");
905             }
906         }
907     }
908     printf("\n");
909 }
910 #endif
911 
912 #endif // MICROPY_ENABLE_COMPILER
913