1 #include <tree_sitter/parser.h>
2 #include <vector>
3 #include <cwctype>
4 #include <cstring>
5 #include <cassert>
6 #include <stdio.h>
7 namespace {
8 
9 using std::vector;
10 using std::iswspace;
11 using std::memcpy;
12 
// Tokens this external scanner can emit. Scanner::scan uses each enumerator
// both as an index into `valid_symbols` and as the value it stores in
// `lexer->result_symbol`, so the numeric values matter.
// NOTE(review): presumably the order must match the grammar's `externals`
// list — confirm against the grammar before reordering.
enum TokenType {
  NEWLINE        = 0,  // end of a logical line
  INDENT         = 1,  // indentation increased
  DEDENT         = 2,  // indentation decreased
  STRING_START   = 3,  // opening quote(s), including any prefix letters
  STRING_CONTENT = 4,  // run of literal text inside a string
  STRING_END     = 5,  // closing quote(s)
};
21 
// The opening delimiter of a string literal, packed into a single byte of
// bit flags: which quote character closes the string, whether the quote is
// tripled, and which prefix modifiers (raw / format / bytes) were seen.
struct Delimiter {
  // Bit masks stored in `flags`.
  enum {
    SingleQuote = 1 << 0,
    DoubleQuote = 1 << 1,
    BackQuote = 1 << 2,
    Raw = 1 << 3,
    Format = 1 << 4,
    Triple = 1 << 5,
    Bytes = 1 << 6,
  };

  // A fresh delimiter has no quote character and no modifiers.
  Delimiter() : flags(0) {}

  // True when the Format bit is set.
  bool is_format() const { return (flags & Format) != 0; }

  // True when the Raw bit is set.
  bool is_raw() const { return (flags & Raw) != 0; }

  // True when the Triple bit is set.
  bool is_triple() const { return (flags & Triple) != 0; }

  // True when the Bytes bit is set.
  bool is_bytes() const { return (flags & Bytes) != 0; }

  // Returns the quote character that terminates the string, or 0 when no
  // quote bit has been recorded. If several quote bits are set, the first
  // match in the order single, double, back quote wins.
  int32_t end_character() const {
    return (flags & SingleQuote) ? '\''
         : (flags & DoubleQuote) ? '"'
         : (flags & BackQuote)   ? '`'
         : 0;
  }

  // Records the format modifier.
  void set_format() { flags |= Format; }

  // Records the raw modifier.
  void set_raw() { flags |= Raw; }

  // Records that the quote is tripled.
  void set_triple() { flags |= Triple; }

  // Records the bytes modifier.
  void set_bytes() { flags |= Bytes; }

  // Records which quote character closes the string. Only ', " and ` are
  // accepted; anything else trips the assertion (and is ignored when
  // assertions are compiled out).
  void set_end_character(int32_t character) {
    if (character == '\'') {
      flags |= SingleQuote;
    } else if (character == '"') {
      flags |= DoubleQuote;
    } else if (character == '`') {
      flags |= BackQuote;
    } else {
      assert(false);
    }
  }

  // All state lives in this one byte; Scanner asserts that
  // sizeof(Delimiter) == sizeof(char) before copying delimiters as raw bytes.
  char flags;
};
92 
93 struct Scanner {
Scanner__anond3dc72110111::Scanner94   Scanner() {
95     assert(sizeof(Delimiter) == sizeof(char));
96     deserialize(NULL, 0);
97   }
98 
serialize__anond3dc72110111::Scanner99   unsigned serialize(char *buffer) {
100     size_t i = 0;
101 
102     size_t delimiter_count = delimiter_stack.size();
103     if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX;
104     buffer[i++] = delimiter_count;
105 
106     if (delimiter_count > 0) {
107       memcpy(&buffer[i], delimiter_stack.data(), delimiter_count);
108     }
109     i += delimiter_count;
110 
111     vector<uint16_t>::iterator
112       iter = indent_length_stack.begin() + 1,
113       end = indent_length_stack.end();
114 
115     for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
116       buffer[i++] = *iter;
117     }
118 
119     return i;
120   }
121 
deserialize__anond3dc72110111::Scanner122   void deserialize(const char *buffer, unsigned length) {
123     delimiter_stack.clear();
124     indent_length_stack.clear();
125     indent_length_stack.push_back(0);
126 
127     if (length > 0) {
128       size_t i = 0;
129 
130       size_t delimiter_count = (uint8_t)buffer[i++];
131       delimiter_stack.resize(delimiter_count);
132       if (delimiter_count > 0) {
133         memcpy(delimiter_stack.data(), &buffer[i], delimiter_count);
134       }
135       i += delimiter_count;
136 
137       for (; i < length; i++) {
138         indent_length_stack.push_back(buffer[i]);
139       }
140     }
141   }
142 
advance__anond3dc72110111::Scanner143   void advance(TSLexer *lexer) {
144     lexer->advance(lexer, false);
145   }
146 
skip__anond3dc72110111::Scanner147   void skip(TSLexer *lexer) {
148     lexer->advance(lexer, true);
149   }
150 
scan__anond3dc72110111::Scanner151   bool scan(TSLexer *lexer, const bool *valid_symbols) {
152     if (valid_symbols[STRING_CONTENT] && !valid_symbols[INDENT] && !delimiter_stack.empty()) {
153       Delimiter delimiter = delimiter_stack.back();
154       int32_t end_character = delimiter.end_character();
155       bool has_content = false;
156       while (lexer->lookahead) {
157         if (lexer->lookahead == '{' && delimiter.is_format()) {
158           lexer->mark_end(lexer);
159           lexer->advance(lexer, false);
160           if (lexer->lookahead == '{') {
161             lexer->advance(lexer, false);
162           } else {
163             lexer->result_symbol = STRING_CONTENT;
164             return has_content;
165           }
166         } else if (lexer->lookahead == '\\') {
167           if (delimiter.is_raw()) {
168             lexer->advance(lexer, false);
169           } else if (delimiter.is_bytes()) {
170               lexer->mark_end(lexer);
171               lexer->advance(lexer, false);
172               if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || lexer->lookahead == 'U') {
173                 // In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are not escape sequences
174                 // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
175                 lexer->advance(lexer, false);
176               } else {
177                   lexer->result_symbol = STRING_CONTENT;
178                   return has_content;
179               }
180           } else {
181             lexer->mark_end(lexer);
182             lexer->result_symbol = STRING_CONTENT;
183             return has_content;
184           }
185         } else if (lexer->lookahead == end_character) {
186           if (delimiter.is_triple()) {
187             lexer->mark_end(lexer);
188             lexer->advance(lexer, false);
189             if (lexer->lookahead == end_character) {
190               lexer->advance(lexer, false);
191               if (lexer->lookahead == end_character) {
192                 if (has_content) {
193                   lexer->result_symbol = STRING_CONTENT;
194                 } else {
195                   lexer->advance(lexer, false);
196                   lexer->mark_end(lexer);
197                   delimiter_stack.pop_back();
198                   lexer->result_symbol = STRING_END;
199                 }
200                 return true;
201               }
202             }
203           } else {
204             if (has_content) {
205               lexer->result_symbol = STRING_CONTENT;
206             } else {
207               lexer->advance(lexer, false);
208               delimiter_stack.pop_back();
209               lexer->result_symbol = STRING_END;
210             }
211             lexer->mark_end(lexer);
212             return true;
213           }
214         } else if (lexer->lookahead == '\n' && has_content && !delimiter.is_triple()) {
215           return false;
216         }
217         advance(lexer);
218         has_content = true;
219       }
220     }
221 
222     lexer->mark_end(lexer);
223 
224     bool found_end_of_line = false;
225     uint32_t indent_length = 0;
226     int32_t first_comment_indent_length = -1;
227     for (;;) {
228       if (lexer->lookahead == '\n') {
229         found_end_of_line = true;
230         indent_length = 0;
231         skip(lexer);
232       } else if (lexer->lookahead == ' ') {
233         indent_length++;
234         skip(lexer);
235       } else if (lexer->lookahead == '\r') {
236         indent_length = 0;
237         skip(lexer);
238       } else if (lexer->lookahead == '\t') {
239         indent_length += 8;
240         skip(lexer);
241       } else if (lexer->lookahead == '#') {
242         if (first_comment_indent_length == -1) {
243           first_comment_indent_length = (int32_t)indent_length;
244         }
245         while (lexer->lookahead && lexer->lookahead != '\n') {
246           skip(lexer);
247         }
248         skip(lexer);
249         indent_length = 0;
250       } else if (lexer->lookahead == '\\') {
251         skip(lexer);
252         if (iswspace(lexer->lookahead)) {
253           skip(lexer);
254         } else {
255           return false;
256         }
257       } else if (lexer->lookahead == '\f') {
258         indent_length = 0;
259         skip(lexer);
260       } else if (lexer->lookahead == 0) {
261         indent_length = 0;
262         found_end_of_line = true;
263         break;
264       } else {
265         break;
266       }
267     }
268 
269     if (found_end_of_line) {
270       if (!indent_length_stack.empty()) {
271         uint16_t current_indent_length = indent_length_stack.back();
272 
273         if (
274           valid_symbols[INDENT] &&
275           indent_length > current_indent_length
276         ) {
277           indent_length_stack.push_back(indent_length);
278           lexer->result_symbol = INDENT;
279           return true;
280         }
281 
282         if (
283           valid_symbols[DEDENT] &&
284           indent_length < current_indent_length &&
285 
286           // Wait to create a dedent token until we've consumed any comments
287           // whose indentation matches the current block.
288           first_comment_indent_length < (int32_t)current_indent_length
289         ) {
290           indent_length_stack.pop_back();
291           lexer->result_symbol = DEDENT;
292           return true;
293         }
294       }
295 
296       if (valid_symbols[NEWLINE]) {
297         lexer->result_symbol = NEWLINE;
298         return true;
299       }
300     }
301 
302     if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) {
303       Delimiter delimiter;
304 
305       bool has_flags = false;
306       while (lexer->lookahead) {
307         if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
308           delimiter.set_format();
309         } else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
310           delimiter.set_raw();
311         } else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
312           delimiter.set_bytes();
313         } else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') {
314           break;
315         }
316         has_flags = true;
317         advance(lexer);
318       }
319 
320       if (lexer->lookahead == '`') {
321         delimiter.set_end_character('`');
322         advance(lexer);
323         lexer->mark_end(lexer);
324       } else if (lexer->lookahead == '\'') {
325         delimiter.set_end_character('\'');
326         advance(lexer);
327         lexer->mark_end(lexer);
328         if (lexer->lookahead == '\'') {
329           advance(lexer);
330           if (lexer->lookahead == '\'') {
331             advance(lexer);
332             lexer->mark_end(lexer);
333             delimiter.set_triple();
334           }
335         }
336       } else if (lexer->lookahead == '"') {
337         delimiter.set_end_character('"');
338         advance(lexer);
339         lexer->mark_end(lexer);
340         if (lexer->lookahead == '"') {
341           advance(lexer);
342           if (lexer->lookahead == '"') {
343             advance(lexer);
344             lexer->mark_end(lexer);
345             delimiter.set_triple();
346           }
347         }
348       }
349 
350       if (delimiter.end_character()) {
351         delimiter_stack.push_back(delimiter);
352         lexer->result_symbol = STRING_START;
353         return true;
354       } else if (has_flags) {
355         return false;
356       }
357     }
358 
359     return false;
360   }
361 
362   vector<uint16_t> indent_length_stack;
363   vector<Delimiter> delimiter_stack;
364 };
365 
366 }
367 
368 extern "C" {
369 
tree_sitter_python_external_scanner_create()370 void *tree_sitter_python_external_scanner_create() {
371   return new Scanner();
372 }
373 
tree_sitter_python_external_scanner_scan(void * payload,TSLexer * lexer,const bool * valid_symbols)374 bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
375                                             const bool *valid_symbols) {
376   Scanner *scanner = static_cast<Scanner *>(payload);
377   return scanner->scan(lexer, valid_symbols);
378 }
379 
tree_sitter_python_external_scanner_serialize(void * payload,char * buffer)380 unsigned tree_sitter_python_external_scanner_serialize(void *payload, char *buffer) {
381   Scanner *scanner = static_cast<Scanner *>(payload);
382   return scanner->serialize(buffer);
383 }
384 
tree_sitter_python_external_scanner_deserialize(void * payload,const char * buffer,unsigned length)385 void tree_sitter_python_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
386   Scanner *scanner = static_cast<Scanner *>(payload);
387   scanner->deserialize(buffer, length);
388 }
389 
tree_sitter_python_external_scanner_destroy(void * payload)390 void tree_sitter_python_external_scanner_destroy(void *payload) {
391   Scanner *scanner = static_cast<Scanner *>(payload);
392   delete scanner;
393 }
394 
395 }
396