1 #include <tree_sitter/parser.h>
2 #include <algorithm>
3 #include <vector>
4 #include <string>
5 #include <cwctype>
6 #include <cstring>
7 #include "tag.h"
8
9 namespace {
10
11 using std::vector;
12 using std::string;
13
// Kinds of token this scanner can emit. The values are written to
// `lexer->result_symbol` and are also used as indices into the
// `valid_symbols` array handed to `Scanner::scan`.
// NOTE(review): the order presumably has to match the `externals` list of the
// grammar this scanner belongs to — confirm before reordering.
enum TokenType {
  START_TAG_NAME,              // name of an ordinary start tag
  SCRIPT_START_TAG_NAME,       // start tag name whose Tag type is SCRIPT
  STYLE_START_TAG_NAME,        // start tag name whose Tag type is STYLE
  END_TAG_NAME,                // end tag name matching the innermost open tag
  ERRONEOUS_END_TAG_NAME,      // end tag name that does not match the innermost open tag
  SELF_CLOSING_TAG_DELIMITER,  // the two characters "/>"
  IMPLICIT_END_TAG,            // zero-width token closing the innermost open tag
  RAW_TEXT,                    // text up to (not including) "</SCRIPT" or "</STYLE"
  COMMENT                      // "<!--" ... "-->"
};
25
26 struct Scanner {
Scanner__anon0c42c5c40111::Scanner27 Scanner() {}
28
serialize__anon0c42c5c40111::Scanner29 unsigned serialize(char *buffer) {
30 uint16_t tag_count = tags.size() > UINT16_MAX ? UINT16_MAX : tags.size();
31 uint16_t serialized_tag_count = 0;
32
33 unsigned i = sizeof(tag_count);
34 std::memcpy(&buffer[i], &tag_count, sizeof(tag_count));
35 i += sizeof(tag_count);
36
37 for (; serialized_tag_count < tag_count; serialized_tag_count++) {
38 Tag &tag = tags[serialized_tag_count];
39 if (tag.type == CUSTOM) {
40 unsigned name_length = tag.custom_tag_name.size();
41 if (name_length > UINT8_MAX) name_length = UINT8_MAX;
42 if (i + 2 + name_length >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) break;
43 buffer[i++] = static_cast<char>(tag.type);
44 buffer[i++] = name_length;
45 tag.custom_tag_name.copy(&buffer[i], name_length);
46 i += name_length;
47 } else {
48 if (i + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) break;
49 buffer[i++] = static_cast<char>(tag.type);
50 }
51 }
52
53 std::memcpy(&buffer[0], &serialized_tag_count, sizeof(serialized_tag_count));
54 return i;
55 }
56
deserialize__anon0c42c5c40111::Scanner57 void deserialize(const char *buffer, unsigned length) {
58 tags.clear();
59 if (length > 0) {
60 unsigned i = 0;
61 uint16_t tag_count, serialized_tag_count;
62
63 std::memcpy(&serialized_tag_count, &buffer[i], sizeof(serialized_tag_count));
64 i += sizeof(serialized_tag_count);
65
66 std::memcpy(&tag_count, &buffer[i], sizeof(tag_count));
67 i += sizeof(tag_count);
68
69 tags.resize(tag_count);
70 for (unsigned j = 0; j < serialized_tag_count; j++) {
71 Tag &tag = tags[j];
72 tag.type = static_cast<TagType>(buffer[i++]);
73 if (tag.type == CUSTOM) {
74 uint16_t name_length = static_cast<uint8_t>(buffer[i++]);
75 tag.custom_tag_name.assign(&buffer[i], &buffer[i + name_length]);
76 i += name_length;
77 }
78 }
79 }
80 }
81
scan_tag_name__anon0c42c5c40111::Scanner82 string scan_tag_name(TSLexer *lexer) {
83 string tag_name;
84 while (iswalnum(lexer->lookahead) ||
85 lexer->lookahead == '-' ||
86 lexer->lookahead == ':') {
87 tag_name += towupper(lexer->lookahead);
88 lexer->advance(lexer, false);
89 }
90 return tag_name;
91 }
92
scan_comment__anon0c42c5c40111::Scanner93 bool scan_comment(TSLexer *lexer) {
94 if (lexer->lookahead != '-') return false;
95 lexer->advance(lexer, false);
96 if (lexer->lookahead != '-') return false;
97 lexer->advance(lexer, false);
98
99 unsigned dashes = 0;
100 while (lexer->lookahead) {
101 switch (lexer->lookahead) {
102 case '-':
103 ++dashes;
104 break;
105 case '>':
106 if (dashes >= 2) {
107 lexer->result_symbol = COMMENT;
108 lexer->advance(lexer, false);
109 lexer->mark_end(lexer);
110 return true;
111 }
112 default:
113 dashes = 0;
114 }
115 lexer->advance(lexer, false);
116 }
117 return false;
118 }
119
scan_raw_text__anon0c42c5c40111::Scanner120 bool scan_raw_text(TSLexer *lexer) {
121 if (!tags.size()) return false;
122
123 lexer->mark_end(lexer);
124
125 const string &end_delimiter = tags.back().type == SCRIPT
126 ? "</SCRIPT"
127 : "</STYLE";
128
129 unsigned delimiter_index = 0;
130 while (lexer->lookahead) {
131 if (towupper(lexer->lookahead) == end_delimiter[delimiter_index]) {
132 delimiter_index++;
133 if (delimiter_index == end_delimiter.size()) break;
134 lexer->advance(lexer, false);
135 } else {
136 delimiter_index = 0;
137 lexer->advance(lexer, false);
138 lexer->mark_end(lexer);
139 }
140 }
141
142 lexer->result_symbol = RAW_TEXT;
143 return true;
144 }
145
scan_implicit_end_tag__anon0c42c5c40111::Scanner146 bool scan_implicit_end_tag(TSLexer *lexer) {
147 Tag *parent = tags.empty() ? NULL : &tags.back();
148
149 bool is_closing_tag = false;
150 if (lexer->lookahead == '/') {
151 is_closing_tag = true;
152 lexer->advance(lexer, false);
153 } else {
154 if (parent && parent->is_void()) {
155 tags.pop_back();
156 lexer->result_symbol = IMPLICIT_END_TAG;
157 return true;
158 }
159 }
160
161 string tag_name = scan_tag_name(lexer);
162 if (tag_name.empty()) return false;
163
164 Tag next_tag = Tag::for_name(tag_name);
165
166 if (is_closing_tag) {
167 // The tag correctly closes the topmost element on the stack
168 if (!tags.empty() && tags.back() == next_tag) return false;
169
170 // Otherwise, dig deeper and queue implicit end tags (to be nice in
171 // the case of malformed HTML)
172 if (std::find(tags.begin(), tags.end(), next_tag) != tags.end()) {
173 tags.pop_back();
174 lexer->result_symbol = IMPLICIT_END_TAG;
175 return true;
176 }
177 } else if (parent && !parent->can_contain(next_tag)) {
178 tags.pop_back();
179 lexer->result_symbol = IMPLICIT_END_TAG;
180 return true;
181 }
182
183 return false;
184 }
185
scan_start_tag_name__anon0c42c5c40111::Scanner186 bool scan_start_tag_name(TSLexer *lexer) {
187 string tag_name = scan_tag_name(lexer);
188 if (tag_name.empty()) return false;
189 Tag tag = Tag::for_name(tag_name);
190 tags.push_back(tag);
191 switch (tag.type) {
192 case SCRIPT:
193 lexer->result_symbol = SCRIPT_START_TAG_NAME;
194 break;
195 case STYLE:
196 lexer->result_symbol = STYLE_START_TAG_NAME;
197 break;
198 default:
199 lexer->result_symbol = START_TAG_NAME;
200 break;
201 }
202 return true;
203 }
204
scan_end_tag_name__anon0c42c5c40111::Scanner205 bool scan_end_tag_name(TSLexer *lexer) {
206 string tag_name = scan_tag_name(lexer);
207 if (tag_name.empty()) return false;
208 Tag tag = Tag::for_name(tag_name);
209 if (!tags.empty() && tags.back() == tag) {
210 tags.pop_back();
211 lexer->result_symbol = END_TAG_NAME;
212 } else {
213 lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
214 }
215 return true;
216 }
217
scan_self_closing_tag_delimiter__anon0c42c5c40111::Scanner218 bool scan_self_closing_tag_delimiter(TSLexer *lexer) {
219 lexer->advance(lexer, false);
220 if (lexer->lookahead == '>') {
221 lexer->advance(lexer, false);
222 if (!tags.empty()) {
223 tags.pop_back();
224 lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
225 }
226 return true;
227 }
228 return false;
229 }
230
scan__anon0c42c5c40111::Scanner231 bool scan(TSLexer *lexer, const bool *valid_symbols) {
232 while (iswspace(lexer->lookahead)) {
233 lexer->advance(lexer, true);
234 }
235
236 if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
237 return scan_raw_text(lexer);
238 }
239
240 switch (lexer->lookahead) {
241 case '<':
242 lexer->mark_end(lexer);
243 lexer->advance(lexer, false);
244
245 if (lexer->lookahead == '!') {
246 lexer->advance(lexer, false);
247 return scan_comment(lexer);
248 }
249
250 if (valid_symbols[IMPLICIT_END_TAG]) {
251 return scan_implicit_end_tag(lexer);
252 }
253 break;
254
255 case '\0':
256 if (valid_symbols[IMPLICIT_END_TAG]) {
257 return scan_implicit_end_tag(lexer);
258 }
259 break;
260
261 case '/':
262 if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
263 return scan_self_closing_tag_delimiter(lexer);
264 }
265 break;
266
267 default:
268 if ((valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME]) && !valid_symbols[RAW_TEXT]) {
269 return valid_symbols[START_TAG_NAME]
270 ? scan_start_tag_name(lexer)
271 : scan_end_tag_name(lexer);
272 }
273 }
274
275 return false;
276 }
277
// Stack of currently open tags; `back()` is the innermost one. Start tag
// names push onto it, while end tags, self-closing delimiters and implicit
// end tags pop from it.
vector<Tag> tags;
279 };
280
281 }
282
283 extern "C" {
284
tree_sitter_html_external_scanner_create()285 void *tree_sitter_html_external_scanner_create() {
286 return new Scanner();
287 }
288
tree_sitter_html_external_scanner_scan(void * payload,TSLexer * lexer,const bool * valid_symbols)289 bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer,
290 const bool *valid_symbols) {
291 Scanner *scanner = static_cast<Scanner *>(payload);
292 return scanner->scan(lexer, valid_symbols);
293 }
294
tree_sitter_html_external_scanner_serialize(void * payload,char * buffer)295 unsigned tree_sitter_html_external_scanner_serialize(void *payload, char *buffer) {
296 Scanner *scanner = static_cast<Scanner *>(payload);
297 return scanner->serialize(buffer);
298 }
299
tree_sitter_html_external_scanner_deserialize(void * payload,const char * buffer,unsigned length)300 void tree_sitter_html_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
301 Scanner *scanner = static_cast<Scanner *>(payload);
302 scanner->deserialize(buffer, length);
303 }
304
tree_sitter_html_external_scanner_destroy(void * payload)305 void tree_sitter_html_external_scanner_destroy(void *payload) {
306 Scanner *scanner = static_cast<Scanner *>(payload);
307 delete scanner;
308 }
309
310 }
311