1 #include <tree_sitter/parser.h>
2 #include <algorithm>
3 #include <vector>
4 #include <string>
5 #include <cwctype>
6 #include <cstring>
7 #include "tag.h"
8 
9 namespace {
10 
11 using std::vector;
12 using std::string;
13 
// Token kinds the scanner reports through `lexer->result_symbol`; the same
// values index the `valid_symbols` array handed to `Scanner::scan`.
// NOTE(review): presumably the order must mirror the grammar's `externals`
// list -- confirm against the grammar before reordering.
enum TokenType {
  START_TAG_NAME = 0,              // start tag name other than script/style
  SCRIPT_START_TAG_NAME = 1,       // start tag name whose Tag type is SCRIPT
  STYLE_START_TAG_NAME = 2,        // start tag name whose Tag type is STYLE
  END_TAG_NAME = 3,                // end tag name matching the innermost open tag
  ERRONEOUS_END_TAG_NAME = 4,      // end tag name that does not match it
  SELF_CLOSING_TAG_DELIMITER = 5,  // the "/>" sequence
  IMPLICIT_END_TAG = 6,            // zero-width token closing the innermost open tag
  RAW_TEXT = 7,                    // text up to "</SCRIPT" or "</STYLE"
  COMMENT = 8                      // "<!--" ... "-->"
};
25 
26 struct Scanner {
Scanner__anon0c42c5c40111::Scanner27   Scanner() {}
28 
serialize__anon0c42c5c40111::Scanner29   unsigned serialize(char *buffer) {
30     uint16_t tag_count = tags.size() > UINT16_MAX ? UINT16_MAX : tags.size();
31     uint16_t serialized_tag_count = 0;
32 
33     unsigned i = sizeof(tag_count);
34     std::memcpy(&buffer[i], &tag_count, sizeof(tag_count));
35     i += sizeof(tag_count);
36 
37     for (; serialized_tag_count < tag_count; serialized_tag_count++) {
38       Tag &tag = tags[serialized_tag_count];
39       if (tag.type == CUSTOM) {
40         unsigned name_length = tag.custom_tag_name.size();
41         if (name_length > UINT8_MAX) name_length = UINT8_MAX;
42         if (i + 2 + name_length >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) break;
43         buffer[i++] = static_cast<char>(tag.type);
44         buffer[i++] = name_length;
45         tag.custom_tag_name.copy(&buffer[i], name_length);
46         i += name_length;
47       } else {
48         if (i + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) break;
49         buffer[i++] = static_cast<char>(tag.type);
50       }
51     }
52 
53     std::memcpy(&buffer[0], &serialized_tag_count, sizeof(serialized_tag_count));
54     return i;
55   }
56 
deserialize__anon0c42c5c40111::Scanner57   void deserialize(const char *buffer, unsigned length) {
58     tags.clear();
59     if (length > 0) {
60       unsigned i = 0;
61       uint16_t tag_count, serialized_tag_count;
62 
63       std::memcpy(&serialized_tag_count, &buffer[i], sizeof(serialized_tag_count));
64       i += sizeof(serialized_tag_count);
65 
66       std::memcpy(&tag_count, &buffer[i], sizeof(tag_count));
67       i += sizeof(tag_count);
68 
69       tags.resize(tag_count);
70       for (unsigned j = 0; j < serialized_tag_count; j++) {
71         Tag &tag = tags[j];
72         tag.type = static_cast<TagType>(buffer[i++]);
73         if (tag.type == CUSTOM) {
74           uint16_t name_length = static_cast<uint8_t>(buffer[i++]);
75           tag.custom_tag_name.assign(&buffer[i], &buffer[i + name_length]);
76           i += name_length;
77         }
78       }
79     }
80   }
81 
scan_tag_name__anon0c42c5c40111::Scanner82   string scan_tag_name(TSLexer *lexer) {
83     string tag_name;
84     while (iswalnum(lexer->lookahead) ||
85            lexer->lookahead == '-' ||
86            lexer->lookahead == ':') {
87       tag_name += towupper(lexer->lookahead);
88       lexer->advance(lexer, false);
89     }
90     return tag_name;
91   }
92 
scan_comment__anon0c42c5c40111::Scanner93   bool scan_comment(TSLexer *lexer) {
94     if (lexer->lookahead != '-') return false;
95     lexer->advance(lexer, false);
96     if (lexer->lookahead != '-') return false;
97     lexer->advance(lexer, false);
98 
99     unsigned dashes = 0;
100     while (lexer->lookahead) {
101       switch (lexer->lookahead) {
102         case '-':
103           ++dashes;
104           break;
105         case '>':
106           if (dashes >= 2) {
107             lexer->result_symbol = COMMENT;
108             lexer->advance(lexer, false);
109             lexer->mark_end(lexer);
110             return true;
111           }
112         default:
113           dashes = 0;
114       }
115       lexer->advance(lexer, false);
116     }
117     return false;
118   }
119 
scan_raw_text__anon0c42c5c40111::Scanner120   bool scan_raw_text(TSLexer *lexer) {
121     if (!tags.size()) return false;
122 
123     lexer->mark_end(lexer);
124 
125     const string &end_delimiter = tags.back().type == SCRIPT
126       ? "</SCRIPT"
127       : "</STYLE";
128 
129     unsigned delimiter_index = 0;
130     while (lexer->lookahead) {
131       if (towupper(lexer->lookahead) == end_delimiter[delimiter_index]) {
132         delimiter_index++;
133         if (delimiter_index == end_delimiter.size()) break;
134         lexer->advance(lexer, false);
135       } else {
136         delimiter_index = 0;
137         lexer->advance(lexer, false);
138         lexer->mark_end(lexer);
139       }
140     }
141 
142     lexer->result_symbol = RAW_TEXT;
143     return true;
144   }
145 
scan_implicit_end_tag__anon0c42c5c40111::Scanner146   bool scan_implicit_end_tag(TSLexer *lexer) {
147     Tag *parent = tags.empty() ? NULL : &tags.back();
148 
149     bool is_closing_tag = false;
150     if (lexer->lookahead == '/') {
151       is_closing_tag = true;
152       lexer->advance(lexer, false);
153     } else {
154       if (parent && parent->is_void()) {
155         tags.pop_back();
156         lexer->result_symbol = IMPLICIT_END_TAG;
157         return true;
158       }
159     }
160 
161     string tag_name = scan_tag_name(lexer);
162     if (tag_name.empty()) return false;
163 
164     Tag next_tag = Tag::for_name(tag_name);
165 
166     if (is_closing_tag) {
167       // The tag correctly closes the topmost element on the stack
168       if (!tags.empty() && tags.back() == next_tag) return false;
169 
170       // Otherwise, dig deeper and queue implicit end tags (to be nice in
171       // the case of malformed HTML)
172       if (std::find(tags.begin(), tags.end(), next_tag) != tags.end()) {
173         tags.pop_back();
174         lexer->result_symbol = IMPLICIT_END_TAG;
175         return true;
176       }
177     } else if (parent && !parent->can_contain(next_tag)) {
178       tags.pop_back();
179       lexer->result_symbol = IMPLICIT_END_TAG;
180       return true;
181     }
182 
183     return false;
184   }
185 
scan_start_tag_name__anon0c42c5c40111::Scanner186   bool scan_start_tag_name(TSLexer *lexer) {
187     string tag_name = scan_tag_name(lexer);
188     if (tag_name.empty()) return false;
189     Tag tag = Tag::for_name(tag_name);
190     tags.push_back(tag);
191     switch (tag.type) {
192       case SCRIPT:
193         lexer->result_symbol = SCRIPT_START_TAG_NAME;
194         break;
195       case STYLE:
196         lexer->result_symbol = STYLE_START_TAG_NAME;
197         break;
198       default:
199         lexer->result_symbol = START_TAG_NAME;
200         break;
201     }
202     return true;
203   }
204 
scan_end_tag_name__anon0c42c5c40111::Scanner205   bool scan_end_tag_name(TSLexer *lexer) {
206     string tag_name = scan_tag_name(lexer);
207     if (tag_name.empty()) return false;
208     Tag tag = Tag::for_name(tag_name);
209     if (!tags.empty() && tags.back() == tag) {
210       tags.pop_back();
211       lexer->result_symbol = END_TAG_NAME;
212     } else {
213       lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
214     }
215     return true;
216   }
217 
scan_self_closing_tag_delimiter__anon0c42c5c40111::Scanner218   bool scan_self_closing_tag_delimiter(TSLexer *lexer) {
219     lexer->advance(lexer, false);
220     if (lexer->lookahead == '>') {
221       lexer->advance(lexer, false);
222       if (!tags.empty()) {
223         tags.pop_back();
224         lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
225       }
226       return true;
227     }
228     return false;
229   }
230 
scan__anon0c42c5c40111::Scanner231   bool scan(TSLexer *lexer, const bool *valid_symbols) {
232     while (iswspace(lexer->lookahead)) {
233       lexer->advance(lexer, true);
234     }
235 
236     if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
237       return scan_raw_text(lexer);
238     }
239 
240     switch (lexer->lookahead) {
241       case '<':
242         lexer->mark_end(lexer);
243         lexer->advance(lexer, false);
244 
245         if (lexer->lookahead == '!') {
246           lexer->advance(lexer, false);
247           return scan_comment(lexer);
248         }
249 
250         if (valid_symbols[IMPLICIT_END_TAG]) {
251           return scan_implicit_end_tag(lexer);
252         }
253         break;
254 
255       case '\0':
256         if (valid_symbols[IMPLICIT_END_TAG]) {
257           return scan_implicit_end_tag(lexer);
258         }
259         break;
260 
261       case '/':
262         if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
263           return scan_self_closing_tag_delimiter(lexer);
264         }
265         break;
266 
267       default:
268         if ((valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME]) && !valid_symbols[RAW_TEXT]) {
269           return valid_symbols[START_TAG_NAME]
270             ? scan_start_tag_name(lexer)
271             : scan_end_tag_name(lexer);
272         }
273     }
274 
275     return false;
276   }
277 
278   vector<Tag> tags;
279 };
280 
281 }
282 
283 extern "C" {
284 
tree_sitter_html_external_scanner_create()285 void *tree_sitter_html_external_scanner_create() {
286   return new Scanner();
287 }
288 
tree_sitter_html_external_scanner_scan(void * payload,TSLexer * lexer,const bool * valid_symbols)289 bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer,
290                                             const bool *valid_symbols) {
291   Scanner *scanner = static_cast<Scanner *>(payload);
292   return scanner->scan(lexer, valid_symbols);
293 }
294 
tree_sitter_html_external_scanner_serialize(void * payload,char * buffer)295 unsigned tree_sitter_html_external_scanner_serialize(void *payload, char *buffer) {
296   Scanner *scanner = static_cast<Scanner *>(payload);
297   return scanner->serialize(buffer);
298 }
299 
tree_sitter_html_external_scanner_deserialize(void * payload,const char * buffer,unsigned length)300 void tree_sitter_html_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
301   Scanner *scanner = static_cast<Scanner *>(payload);
302   scanner->deserialize(buffer, length);
303 }
304 
tree_sitter_html_external_scanner_destroy(void * payload)305 void tree_sitter_html_external_scanner_destroy(void *payload) {
306   Scanner *scanner = static_cast<Scanner *>(payload);
307   delete scanner;
308 }
309 
310 }
311