1 #include <tree_sitter/parser.h>
2 #include <vector>
3 #include <cwctype>
4 #include <cstring>
5 #include <cassert>
6 #include <stdio.h>
7 namespace {
8
9 using std::vector;
10 using std::iswspace;
11 using std::memcpy;
12
// External token kinds produced by this scanner. The numeric values are used
// as indices into the `valid_symbols` array passed to Scanner::scan and are
// written to `lexer->result_symbol`.
// NOTE(review): presumably the order must match the grammar's list of
// external tokens -- confirm against the grammar before reordering.
enum TokenType {
  NEWLINE = 0,         // end of a logical line
  INDENT = 1,          // indentation deeper than the current level
  DEDENT = 2,          // indentation shallower than the current level
  STRING_START = 3,    // optional prefix letters plus the opening quote(s)
  STRING_CONTENT = 4,  // a run of characters inside a string literal
  STRING_END = 5,      // the closing quote(s) of a string literal
};
21
// Describes one open string literal as a set of bit flags packed into a single
// byte: which quote character closes it, and which modifiers (raw, format,
// triple-quoted, bytes) apply.
struct Delimiter {
  // Bit masks stored in `flags`.
  enum {
    SingleQuote = 0x01,
    DoubleQuote = 0x02,
    BackQuote = 0x04,
    Raw = 0x08,
    Format = 0x10,
    Triple = 0x20,
    Bytes = 0x40,
  };

  // Starts with no flags set, i.e. no closing character and no modifiers.
  Delimiter() : flags(0) {}

  // True when every queried bit pattern overlaps the stored flags.
  bool has(int mask) const {
    return (flags & mask) != 0;
  }

  // Turns on the given bit(s) without touching the others.
  void add(int mask) {
    flags = static_cast<char>(flags | mask);
  }

  bool is_format() const {
    return has(Format);
  }

  bool is_raw() const {
    return has(Raw);
  }

  bool is_triple() const {
    return has(Triple);
  }

  bool is_bytes() const {
    return has(Bytes);
  }

  // Returns the quote character that closes this literal, or 0 when no quote
  // flag has been set. If several quote flags are set, the single quote wins
  // over the double quote, which wins over the back quote.
  int32_t end_character() const {
    if (has(SingleQuote)) {
      return '\'';
    }
    if (has(DoubleQuote)) {
      return '"';
    }
    if (has(BackQuote)) {
      return '`';
    }
    return 0;
  }

  void set_format() {
    add(Format);
  }

  void set_raw() {
    add(Raw);
  }

  void set_triple() {
    add(Triple);
  }

  void set_bytes() {
    add(Bytes);
  }

  // Records the closing quote character. Only '\'', '"' and '`' are accepted;
  // anything else trips an assertion (and is ignored when asserts are off).
  void set_end_character(int32_t character) {
    if (character == '\'') {
      add(SingleQuote);
    } else if (character == '"') {
      add(DoubleQuote);
    } else if (character == '`') {
      add(BackQuote);
    } else {
      assert(false);
    }
  }

  // The packed flag bits; the only data member, so the struct is one byte.
  char flags;
};
92
93 struct Scanner {
Scanner__anond3dc72110111::Scanner94 Scanner() {
95 assert(sizeof(Delimiter) == sizeof(char));
96 deserialize(NULL, 0);
97 }
98
serialize__anond3dc72110111::Scanner99 unsigned serialize(char *buffer) {
100 size_t i = 0;
101
102 size_t delimiter_count = delimiter_stack.size();
103 if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX;
104 buffer[i++] = delimiter_count;
105
106 if (delimiter_count > 0) {
107 memcpy(&buffer[i], delimiter_stack.data(), delimiter_count);
108 }
109 i += delimiter_count;
110
111 vector<uint16_t>::iterator
112 iter = indent_length_stack.begin() + 1,
113 end = indent_length_stack.end();
114
115 for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
116 buffer[i++] = *iter;
117 }
118
119 return i;
120 }
121
deserialize__anond3dc72110111::Scanner122 void deserialize(const char *buffer, unsigned length) {
123 delimiter_stack.clear();
124 indent_length_stack.clear();
125 indent_length_stack.push_back(0);
126
127 if (length > 0) {
128 size_t i = 0;
129
130 size_t delimiter_count = (uint8_t)buffer[i++];
131 delimiter_stack.resize(delimiter_count);
132 if (delimiter_count > 0) {
133 memcpy(delimiter_stack.data(), &buffer[i], delimiter_count);
134 }
135 i += delimiter_count;
136
137 for (; i < length; i++) {
138 indent_length_stack.push_back(buffer[i]);
139 }
140 }
141 }
142
advance__anond3dc72110111::Scanner143 void advance(TSLexer *lexer) {
144 lexer->advance(lexer, false);
145 }
146
skip__anond3dc72110111::Scanner147 void skip(TSLexer *lexer) {
148 lexer->advance(lexer, true);
149 }
150
scan__anond3dc72110111::Scanner151 bool scan(TSLexer *lexer, const bool *valid_symbols) {
152 if (valid_symbols[STRING_CONTENT] && !valid_symbols[INDENT] && !delimiter_stack.empty()) {
153 Delimiter delimiter = delimiter_stack.back();
154 int32_t end_character = delimiter.end_character();
155 bool has_content = false;
156 while (lexer->lookahead) {
157 if (lexer->lookahead == '{' && delimiter.is_format()) {
158 lexer->mark_end(lexer);
159 lexer->advance(lexer, false);
160 if (lexer->lookahead == '{') {
161 lexer->advance(lexer, false);
162 } else {
163 lexer->result_symbol = STRING_CONTENT;
164 return has_content;
165 }
166 } else if (lexer->lookahead == '\\') {
167 if (delimiter.is_raw()) {
168 lexer->advance(lexer, false);
169 } else if (delimiter.is_bytes()) {
170 lexer->mark_end(lexer);
171 lexer->advance(lexer, false);
172 if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || lexer->lookahead == 'U') {
173 // In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are not escape sequences
174 // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
175 lexer->advance(lexer, false);
176 } else {
177 lexer->result_symbol = STRING_CONTENT;
178 return has_content;
179 }
180 } else {
181 lexer->mark_end(lexer);
182 lexer->result_symbol = STRING_CONTENT;
183 return has_content;
184 }
185 } else if (lexer->lookahead == end_character) {
186 if (delimiter.is_triple()) {
187 lexer->mark_end(lexer);
188 lexer->advance(lexer, false);
189 if (lexer->lookahead == end_character) {
190 lexer->advance(lexer, false);
191 if (lexer->lookahead == end_character) {
192 if (has_content) {
193 lexer->result_symbol = STRING_CONTENT;
194 } else {
195 lexer->advance(lexer, false);
196 lexer->mark_end(lexer);
197 delimiter_stack.pop_back();
198 lexer->result_symbol = STRING_END;
199 }
200 return true;
201 }
202 }
203 } else {
204 if (has_content) {
205 lexer->result_symbol = STRING_CONTENT;
206 } else {
207 lexer->advance(lexer, false);
208 delimiter_stack.pop_back();
209 lexer->result_symbol = STRING_END;
210 }
211 lexer->mark_end(lexer);
212 return true;
213 }
214 } else if (lexer->lookahead == '\n' && has_content && !delimiter.is_triple()) {
215 return false;
216 }
217 advance(lexer);
218 has_content = true;
219 }
220 }
221
222 lexer->mark_end(lexer);
223
224 bool found_end_of_line = false;
225 uint32_t indent_length = 0;
226 int32_t first_comment_indent_length = -1;
227 for (;;) {
228 if (lexer->lookahead == '\n') {
229 found_end_of_line = true;
230 indent_length = 0;
231 skip(lexer);
232 } else if (lexer->lookahead == ' ') {
233 indent_length++;
234 skip(lexer);
235 } else if (lexer->lookahead == '\r') {
236 indent_length = 0;
237 skip(lexer);
238 } else if (lexer->lookahead == '\t') {
239 indent_length += 8;
240 skip(lexer);
241 } else if (lexer->lookahead == '#') {
242 if (first_comment_indent_length == -1) {
243 first_comment_indent_length = (int32_t)indent_length;
244 }
245 while (lexer->lookahead && lexer->lookahead != '\n') {
246 skip(lexer);
247 }
248 skip(lexer);
249 indent_length = 0;
250 } else if (lexer->lookahead == '\\') {
251 skip(lexer);
252 if (iswspace(lexer->lookahead)) {
253 skip(lexer);
254 } else {
255 return false;
256 }
257 } else if (lexer->lookahead == '\f') {
258 indent_length = 0;
259 skip(lexer);
260 } else if (lexer->lookahead == 0) {
261 indent_length = 0;
262 found_end_of_line = true;
263 break;
264 } else {
265 break;
266 }
267 }
268
269 if (found_end_of_line) {
270 if (!indent_length_stack.empty()) {
271 uint16_t current_indent_length = indent_length_stack.back();
272
273 if (
274 valid_symbols[INDENT] &&
275 indent_length > current_indent_length
276 ) {
277 indent_length_stack.push_back(indent_length);
278 lexer->result_symbol = INDENT;
279 return true;
280 }
281
282 if (
283 valid_symbols[DEDENT] &&
284 indent_length < current_indent_length &&
285
286 // Wait to create a dedent token until we've consumed any comments
287 // whose indentation matches the current block.
288 first_comment_indent_length < (int32_t)current_indent_length
289 ) {
290 indent_length_stack.pop_back();
291 lexer->result_symbol = DEDENT;
292 return true;
293 }
294 }
295
296 if (valid_symbols[NEWLINE]) {
297 lexer->result_symbol = NEWLINE;
298 return true;
299 }
300 }
301
302 if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) {
303 Delimiter delimiter;
304
305 bool has_flags = false;
306 while (lexer->lookahead) {
307 if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
308 delimiter.set_format();
309 } else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
310 delimiter.set_raw();
311 } else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
312 delimiter.set_bytes();
313 } else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') {
314 break;
315 }
316 has_flags = true;
317 advance(lexer);
318 }
319
320 if (lexer->lookahead == '`') {
321 delimiter.set_end_character('`');
322 advance(lexer);
323 lexer->mark_end(lexer);
324 } else if (lexer->lookahead == '\'') {
325 delimiter.set_end_character('\'');
326 advance(lexer);
327 lexer->mark_end(lexer);
328 if (lexer->lookahead == '\'') {
329 advance(lexer);
330 if (lexer->lookahead == '\'') {
331 advance(lexer);
332 lexer->mark_end(lexer);
333 delimiter.set_triple();
334 }
335 }
336 } else if (lexer->lookahead == '"') {
337 delimiter.set_end_character('"');
338 advance(lexer);
339 lexer->mark_end(lexer);
340 if (lexer->lookahead == '"') {
341 advance(lexer);
342 if (lexer->lookahead == '"') {
343 advance(lexer);
344 lexer->mark_end(lexer);
345 delimiter.set_triple();
346 }
347 }
348 }
349
350 if (delimiter.end_character()) {
351 delimiter_stack.push_back(delimiter);
352 lexer->result_symbol = STRING_START;
353 return true;
354 } else if (has_flags) {
355 return false;
356 }
357 }
358
359 return false;
360 }
361
362 vector<uint16_t> indent_length_stack;
363 vector<Delimiter> delimiter_stack;
364 };
365
366 }
367
368 extern "C" {
369
tree_sitter_python_external_scanner_create()370 void *tree_sitter_python_external_scanner_create() {
371 return new Scanner();
372 }
373
tree_sitter_python_external_scanner_scan(void * payload,TSLexer * lexer,const bool * valid_symbols)374 bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
375 const bool *valid_symbols) {
376 Scanner *scanner = static_cast<Scanner *>(payload);
377 return scanner->scan(lexer, valid_symbols);
378 }
379
tree_sitter_python_external_scanner_serialize(void * payload,char * buffer)380 unsigned tree_sitter_python_external_scanner_serialize(void *payload, char *buffer) {
381 Scanner *scanner = static_cast<Scanner *>(payload);
382 return scanner->serialize(buffer);
383 }
384
tree_sitter_python_external_scanner_deserialize(void * payload,const char * buffer,unsigned length)385 void tree_sitter_python_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
386 Scanner *scanner = static_cast<Scanner *>(payload);
387 scanner->deserialize(buffer, length);
388 }
389
tree_sitter_python_external_scanner_destroy(void * payload)390 void tree_sitter_python_external_scanner_destroy(void *payload) {
391 Scanner *scanner = static_cast<Scanner *>(payload);
392 delete scanner;
393 }
394
395 }
396