1 /*
2  * This file is part of the MicroPython project, http://micropython.org/
3  *
4  * The MIT License (MIT)
5  *
6  * Copyright (c) 2013, 2014 Damien P. George
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  */
26 #ifndef MICROPY_INCLUDED_PY_LEXER_H
27 #define MICROPY_INCLUDED_PY_LEXER_H
28 
29 #include <stdint.h>
30 
31 #include "py/mpconfig.h"
32 #include "py/qstr.h"
33 #include "py/reader.h"
34 
/* lexer.h -- simple tokeniser for MicroPython
 *
 * Source text is handled with an explicit (byte) length instead of relying
 * on null termination.
 * Token text is handled the same way: UTF-8 with an explicit (byte) length.
 */
40 
// Kinds of token; this is the type of mp_lexer_t.tok_kind.
//
// No enumerator has an explicit value, so numeric values follow declaration
// order.  The entries guarded by MICROPY_PY_FSTRINGS and
// MICROPY_PY_ASYNC_AWAIT only exist when that option is enabled, which means
// the numeric value of every later enumerator depends on the build
// configuration: always compare against the names, never against literals.
typedef enum _mp_token_kind_t {
    MP_TOKEN_END,

    MP_TOKEN_INVALID,
    MP_TOKEN_DEDENT_MISMATCH,
    MP_TOKEN_LONELY_STRING_OPEN,
    #if MICROPY_PY_FSTRINGS
    MP_TOKEN_MALFORMED_FSTRING,
    MP_TOKEN_FSTRING_RAW,
    #endif

    MP_TOKEN_NEWLINE,
    MP_TOKEN_INDENT,
    MP_TOKEN_DEDENT,

    MP_TOKEN_NAME,
    MP_TOKEN_INTEGER,
    MP_TOKEN_FLOAT_OR_IMAG,
    MP_TOKEN_STRING,
    MP_TOKEN_BYTES,

    MP_TOKEN_ELLIPSIS,

    MP_TOKEN_KW_FALSE,
    MP_TOKEN_KW_NONE,
    MP_TOKEN_KW_TRUE,
    MP_TOKEN_KW___DEBUG__,
    MP_TOKEN_KW_AND,
    MP_TOKEN_KW_AS,
    MP_TOKEN_KW_ASSERT,
    #if MICROPY_PY_ASYNC_AWAIT
    MP_TOKEN_KW_ASYNC,
    MP_TOKEN_KW_AWAIT,
    #endif
    MP_TOKEN_KW_BREAK,
    MP_TOKEN_KW_CLASS,
    MP_TOKEN_KW_CONTINUE,
    MP_TOKEN_KW_DEF,
    MP_TOKEN_KW_DEL,
    MP_TOKEN_KW_ELIF,
    MP_TOKEN_KW_ELSE,
    MP_TOKEN_KW_EXCEPT,
    MP_TOKEN_KW_FINALLY,
    MP_TOKEN_KW_FOR,
    MP_TOKEN_KW_FROM,
    MP_TOKEN_KW_GLOBAL,
    MP_TOKEN_KW_IF,
    MP_TOKEN_KW_IMPORT,
    MP_TOKEN_KW_IN,
    MP_TOKEN_KW_IS,
    MP_TOKEN_KW_LAMBDA,
    MP_TOKEN_KW_NONLOCAL,
    MP_TOKEN_KW_NOT,
    MP_TOKEN_KW_OR,
    MP_TOKEN_KW_PASS,
    MP_TOKEN_KW_RAISE,
    MP_TOKEN_KW_RETURN,
    MP_TOKEN_KW_TRY,
    MP_TOKEN_KW_WHILE,
    MP_TOKEN_KW_WITH,
    MP_TOKEN_KW_YIELD,

    MP_TOKEN_OP_ASSIGN,
    MP_TOKEN_OP_TILDE,

    // The order of these 6 matches the corresponding mp_binary_op_t operators;
    // keep the group contiguous and in this order.
    MP_TOKEN_OP_LESS,
    MP_TOKEN_OP_MORE,
    MP_TOKEN_OP_DBL_EQUAL,
    MP_TOKEN_OP_LESS_EQUAL,
    MP_TOKEN_OP_MORE_EQUAL,
    MP_TOKEN_OP_NOT_EQUAL,

    // The order of these 13 matches the corresponding mp_binary_op_t operators;
    // keep the group contiguous and in this order.
    MP_TOKEN_OP_PIPE,
    MP_TOKEN_OP_CARET,
    MP_TOKEN_OP_AMPERSAND,
    MP_TOKEN_OP_DBL_LESS,
    MP_TOKEN_OP_DBL_MORE,
    MP_TOKEN_OP_PLUS,
    MP_TOKEN_OP_MINUS,
    MP_TOKEN_OP_STAR,
    MP_TOKEN_OP_AT,
    MP_TOKEN_OP_DBL_SLASH,
    MP_TOKEN_OP_SLASH,
    MP_TOKEN_OP_PERCENT,
    MP_TOKEN_OP_DBL_STAR,

    // The order of these 13 matches the corresponding mp_binary_op_t operators.
    // They are the "<op>=" forms of the 13 entries above, listed in the same
    // operator order and placed directly after them.
    MP_TOKEN_DEL_PIPE_EQUAL,
    MP_TOKEN_DEL_CARET_EQUAL,
    MP_TOKEN_DEL_AMPERSAND_EQUAL,
    MP_TOKEN_DEL_DBL_LESS_EQUAL,
    MP_TOKEN_DEL_DBL_MORE_EQUAL,
    MP_TOKEN_DEL_PLUS_EQUAL,
    MP_TOKEN_DEL_MINUS_EQUAL,
    MP_TOKEN_DEL_STAR_EQUAL,
    MP_TOKEN_DEL_AT_EQUAL,
    MP_TOKEN_DEL_DBL_SLASH_EQUAL,
    MP_TOKEN_DEL_SLASH_EQUAL,
    MP_TOKEN_DEL_PERCENT_EQUAL,
    MP_TOKEN_DEL_DBL_STAR_EQUAL,

    MP_TOKEN_DEL_PAREN_OPEN,
    MP_TOKEN_DEL_PAREN_CLOSE,
    MP_TOKEN_DEL_BRACKET_OPEN,
    MP_TOKEN_DEL_BRACKET_CLOSE,
    MP_TOKEN_DEL_BRACE_OPEN,
    MP_TOKEN_DEL_BRACE_CLOSE,
    MP_TOKEN_DEL_COMMA,
    MP_TOKEN_DEL_COLON,
    MP_TOKEN_DEL_PERIOD,
    MP_TOKEN_DEL_SEMICOLON,
    MP_TOKEN_DEL_EQUAL,
    MP_TOKEN_DEL_MINUS_MORE,
} mp_token_kind_t;
157 
158 // this data structure is exposed for efficiency
159 // public members are: source_name, tok_line, tok_column, tok_kind, vstr
160 typedef struct _mp_lexer_t {
161     qstr source_name;           // name of source
162     mp_reader_t reader;         // stream source
163 
164     unichar chr0, chr1, chr2;   // current cached characters from source
165     #if MICROPY_PY_FSTRINGS
166     unichar chr0_saved, chr1_saved, chr2_saved; // current cached characters from alt source
167     #endif
168 
169     size_t line;                // current source line
170     size_t column;              // current source column
171 
172     mp_int_t emit_dent;             // non-zero when there are INDENT/DEDENT tokens to emit
173     mp_int_t nested_bracket_level;  // >0 when there are nested brackets over multiple lines
174 
175     size_t alloc_indent_level;
176     size_t num_indent_level;
177     uint16_t *indent_level;
178 
179     size_t tok_line;            // token source line
180     size_t tok_column;          // token source column
181     mp_token_kind_t tok_kind;   // token kind
182     vstr_t vstr;                // token data
183     #if MICROPY_PY_FSTRINGS
184     vstr_t fstring_args;        // extracted arguments to pass to .format()
185     size_t fstring_args_idx;    // how many bytes of fstring_args have been read
186     #endif
187 } mp_lexer_t;
188 
189 mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);
190 mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len);
191 
192 void mp_lexer_free(mp_lexer_t *lex);
193 void mp_lexer_to_next(mp_lexer_t *lex);
194 
/******************************************************************/
// Platform-specific import functions; each port must provide its own implementation.
// TODO: tidy up, rename, or move elsewhere.
198 
// Result type of mp_import_stat().
// Going by the enumerator names: nothing exists at the path, the path is a
// directory, or the path is a file.  Values are implicit (0, 1, 2 in order).
typedef enum {
    MP_IMPORT_STAT_NO_EXIST,
    MP_IMPORT_STAT_DIR,
    MP_IMPORT_STAT_FILE,
} mp_import_stat_t;
204 
205 mp_import_stat_t mp_import_stat(const char *path);
206 mp_lexer_t *mp_lexer_new_from_file(const char *filename);
207 
208 #if MICROPY_HELPER_LEXER_UNIX
209 mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd);
210 #endif
211 
212 #endif // MICROPY_INCLUDED_PY_LEXER_H
213