1 /*
2 Copyright (C) 2015-2017 Alexander Borisov
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 2.1 of the License, or (at your option) any later version.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
18 Author: lex.borisov@gmail.com (Alexander Borisov)
19 */
20
21 #include "myhtml/parser.h"
22
/*
 * Queue callback that feeds one queue node to the tree rules dispatcher.
 *
 * thread_id  identifier of the calling thread; not used here.
 * ctx        pointer to a mythread_queue_node_t. Its `context` member is
 *            read as the myhtml_tree_t and its `args` member is handed to
 *            the dispatcher.
 *
 * Nothing is dispatched when the tree's parse_flags contain
 * MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE.
 */
void myhtml_parser_stream(mythread_id_t thread_id, void* ctx)
{
    mythread_queue_node_t *qnode = (mythread_queue_node_t*)ctx;
    myhtml_tree_t *tree = (myhtml_tree_t*)(qnode->context);

    if(tree->parse_flags & MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE)
        return;

    /* Keep re-dispatching for as long as the dispatcher returns non-zero. */
    while(myhtml_rules_tree_dispatcher(qnode->context, qnode->args)) {
    }
}
31
/*
 * Append `length` bytes of raw input, starting at absolute position `begin`
 * in the tree's incoming buffer chain, to `str` using the lowercase append
 * helpers.
 *
 * tree        supplies the buffer chain (incoming_buf_first) and the encoding
 *             used to choose between the UTF-8 and the converting helper.
 * str         destination string.
 * proc_entry  supplies emit_null_char, and for the converting helper also
 *             `res` and `encoding`.
 * begin       absolute start position of the data.
 * length      number of bytes to take.
 *
 * Returns str->length after appending. If no buffer is found for `begin`,
 * nothing is appended and the current str->length is returned.
 *
 * NOTE(review): the branch is selected on tree->encoding while the converter
 * is given proc_entry->encoding; the caller visible in this file sets them to
 * the same value - confirm this holds for all callers.
 */
size_t myhtml_parser_token_data_to_string_lowercase(myhtml_tree_t *tree, mycore_string_t* str, myhtml_data_process_entry_t* proc_entry, size_t begin, size_t length)
{
    mycore_incoming_buffer_t *buffer = mycore_incoming_buffer_find_by_position(tree->incoming_buf_first, begin);

    /*
     * The lookup returns a pointer that the loop below already treats as
     * possibly NULL; guard the first dereference as well instead of crashing.
     */
    if(buffer == NULL)
        return str->length;

    /* Position of `begin` relative to the start of the buffer that holds it. */
    size_t relative_begin = begin - buffer->offset;

    /* Fast path: the whole range lies inside this one buffer. */
    if((relative_begin + length) <= buffer->size) {
        if(tree->encoding == MyENCODING_UTF_8) {
            myhtml_string_append_lowercase_with_preprocessing(str, &buffer->data[relative_begin], length, proc_entry->emit_null_char);
        }
        else {
            myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                          &buffer->data[relative_begin], length,
                                                                                          proc_entry->encoding, proc_entry->emit_null_char);
        }

        return str->length;
    }

    /* Value returned by the previous chunk's append; fed to the next before-append call. */
    size_t save_position = 0;

    /* The range is spread across several buffers: append it chunk by chunk. */
    while(buffer) {
        if((relative_begin + length) > buffer->size)
        {
            /* Bytes available in this buffer from relative_begin to its end. */
            size_t relative_end = (buffer->size - relative_begin);
            length -= relative_end;

            /* The returned offset is the number of leading bytes of this chunk to skip. */
            size_t tmp_offset = myhtml_string_before_append_any_preprocessing(str, &buffer->data[relative_begin], relative_end, save_position);

            if(relative_end > 0) {
                if(tree->encoding == MyENCODING_UTF_8) {
                    save_position = myhtml_string_append_lowercase_with_preprocessing(str, &buffer->data[(relative_begin + tmp_offset)], (relative_end - tmp_offset), proc_entry->emit_null_char);
                }
                else {
                    save_position = myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                                                  &buffer->data[(relative_begin + tmp_offset)], (relative_end - tmp_offset),
                                                                                                                  proc_entry->encoding, proc_entry->emit_null_char);
                }
            }

            /* Every following buffer is consumed from its first byte. */
            relative_begin = 0;
            buffer = buffer->next;
        }
        else {
            /* Last chunk: the remaining bytes fit in the current buffer. */
            size_t tmp_offset = myhtml_string_before_append_any_preprocessing(str, &buffer->data[relative_begin], length, save_position);

            if(length > 0) {
                if(tree->encoding == MyENCODING_UTF_8) {
                    myhtml_string_append_lowercase_with_preprocessing(str, &buffer->data[(relative_begin + tmp_offset)], (length - tmp_offset), proc_entry->emit_null_char);
                }
                else {
                    myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                                  &buffer->data[(relative_begin + tmp_offset)], (length - tmp_offset),
                                                                                                  proc_entry->encoding, proc_entry->emit_null_char);
                }
            }

            break;
        }
    }

    return str->length;
}
89
/*
 * Append `length` bytes of raw input, starting at absolute position `begin`
 * in the tree's incoming buffer chain, to `str` using the plain
 * (case-preserving) append helpers.
 *
 * tree        supplies the buffer chain (incoming_buf_first) and the encoding
 *             used to choose between the UTF-8 and the converting helper.
 * str         destination string.
 * proc_entry  supplies emit_null_char, and for the converting helper also
 *             `res` and `encoding`.
 * begin       absolute start position of the data.
 * length      number of bytes to take.
 *
 * Returns str->length after appending. If no buffer is found for `begin`,
 * nothing is appended and the current str->length is returned.
 *
 * NOTE(review): the branch is selected on tree->encoding while the converter
 * is given proc_entry->encoding; the caller visible in this file sets them to
 * the same value - confirm this holds for all callers.
 */
size_t myhtml_parser_token_data_to_string(myhtml_tree_t *tree, mycore_string_t* str, myhtml_data_process_entry_t* proc_entry, size_t begin, size_t length)
{
    mycore_incoming_buffer_t *buffer = mycore_incoming_buffer_find_by_position(tree->incoming_buf_first, begin);

    /*
     * The lookup returns a pointer that the loop below already treats as
     * possibly NULL; guard the first dereference as well instead of crashing.
     */
    if(buffer == NULL)
        return str->length;

    /* Position of `begin` relative to the start of the buffer that holds it. */
    size_t relative_begin = begin - buffer->offset;

    /* Fast path: the whole range lies inside this one buffer. */
    if((relative_begin + length) <= buffer->size) {
        if(tree->encoding == MyENCODING_UTF_8) {
            myhtml_string_append_with_preprocessing(str, &buffer->data[relative_begin], length, proc_entry->emit_null_char);
        }
        else {
            myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                &buffer->data[relative_begin], length,
                                                                                proc_entry->encoding, proc_entry->emit_null_char);
        }

        return str->length;
    }

    /* Value returned by the previous chunk's append; fed to the next before-append call. */
    size_t save_position = 0;

    /* The range is spread across several buffers: append it chunk by chunk. */
    while(buffer) {
        if((relative_begin + length) > buffer->size)
        {
            /* Bytes available in this buffer from relative_begin to its end. */
            size_t relative_end = (buffer->size - relative_begin);
            length -= relative_end;

            /* The returned offset is the number of leading bytes of this chunk to skip. */
            size_t tmp_offset = myhtml_string_before_append_any_preprocessing(str, &buffer->data[relative_begin], relative_end, save_position);

            if(relative_end > 0) {
                if(tree->encoding == MyENCODING_UTF_8) {
                    save_position = myhtml_string_append_with_preprocessing(str, &buffer->data[(relative_begin + tmp_offset)], (relative_end - tmp_offset), proc_entry->emit_null_char);
                }
                else {
                    save_position = myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                                        &buffer->data[(relative_begin + tmp_offset)],
                                                                                                        (relative_end - tmp_offset),
                                                                                                        proc_entry->encoding, proc_entry->emit_null_char);
                }
            }

            /* Every following buffer is consumed from its first byte. */
            relative_begin = 0;
            buffer = buffer->next;
        }
        else {
            /* Last chunk: the remaining bytes fit in the current buffer. */
            size_t tmp_offset = myhtml_string_before_append_any_preprocessing(str, &buffer->data[relative_begin], length, save_position);

            if(length > 0) {
                if(tree->encoding == MyENCODING_UTF_8) {
                    myhtml_string_append_with_preprocessing(str, &buffer->data[(relative_begin + tmp_offset)], (length - tmp_offset), proc_entry->emit_null_char);
                }
                else {
                    myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                        &buffer->data[(relative_begin + tmp_offset)], (length - tmp_offset),
                                                                                        proc_entry->encoding, proc_entry->emit_null_char);
                }
            }

            break;
        }
    }

    return str->length;
}
148
/*
 * Run `length` bytes of raw input, starting at absolute position `begin` in
 * the tree's incoming buffer chain, through myhtml_data_process() into `str`,
 * then finish with myhtml_data_process_end().
 *
 * tree        supplies the buffer chain (incoming_buf_first).
 * str         destination string.
 * proc_entry  processing state passed to every myhtml_data_process() call
 *             and to myhtml_data_process_end().
 * begin       absolute start position of the data.
 * length      number of bytes to take.
 *
 * Returns str->length after processing. If no buffer is found for `begin`,
 * no data is processed but myhtml_data_process_end() is still called, the
 * same as when the buffer chain ends before `length` bytes were consumed.
 */
size_t myhtml_parser_token_data_to_string_charef(myhtml_tree_t *tree, mycore_string_t* str, myhtml_data_process_entry_t* proc_entry, size_t begin, size_t length)
{
    mycore_incoming_buffer_t *buffer = mycore_incoming_buffer_find_by_position(tree->incoming_buf_first, begin);

    /*
     * The lookup returns a pointer that the loop below already treats as
     * possibly NULL; guard the first dereference as well instead of crashing.
     */
    if(buffer == NULL) {
        myhtml_data_process_end(proc_entry, str);
        return str->length;
    }

    /* Position of `begin` relative to the start of the buffer that holds it. */
    size_t relative_begin = begin - buffer->offset;

    /* Fast path: the whole range lies inside this one buffer. */
    if((relative_begin + length) <= buffer->size) {
        myhtml_data_process(proc_entry, str, &buffer->data[relative_begin], length);
        myhtml_data_process_end(proc_entry, str);

        return str->length;
    }

    /* The range is spread across several buffers: process it chunk by chunk. */
    while(buffer) {
        if((relative_begin + length) > buffer->size)
        {
            /* Bytes available in this buffer from relative_begin to its end. */
            size_t relative_end = (buffer->size - relative_begin);
            length -= relative_end;

            myhtml_data_process(proc_entry, str, &buffer->data[relative_begin], relative_end);

            /* Every following buffer is consumed from its first byte. */
            relative_begin = 0;
            buffer = buffer->next;
        }
        else {
            /* Last chunk: the remaining bytes fit in the current buffer. */
            myhtml_data_process(proc_entry, str, &buffer->data[relative_begin], length);
            break;
        }
    }

    myhtml_data_process_end(proc_entry, str);

    return str->length;
}
184
/*
 * Queue callback that turns the raw input ranges of one token into strings.
 *
 * thread_id  identifier of the calling thread; used to index tree->async_args
 *            when picking the mchar node id passed to mycore_string_init().
 * ctx        pointer to a mythread_queue_node_t. Its `context` member is the
 *            myhtml_tree_t and its `args` member is the myhtml_token_node_t.
 *
 * In every path the optional callback_before_token runs first, the token gets
 * MyHTML_TOKEN_TYPE_DONE, and the optional callback_after_token runs last.
 * Each callback's return value is stored back into its own *_ctx field.
 */
void myhtml_parser_worker(mythread_id_t thread_id, void* ctx)
{
    mythread_queue_node_t *node = (mythread_queue_node_t*)ctx;

    myhtml_tree_t* tree = node->context;
    myhtml_token_node_t* token = node->args;

    /*
     * A tree cannot be built without processed tokens. According to the
     * original note here:
     *
     *   MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN == 3
     *   MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE    == 1
     *
     * so WITHOUT_PROCESS_TOKEN includes the WITHOUT_BUILD_TREE bit. Testing
     * only against WITHOUT_PROCESS_TOKEN would also succeed when just
     * WITHOUT_BUILD_TREE is set, hence the additional explicit test of bit 2.
     */
    if((tree->parse_flags & MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN) &&
       (tree->parse_flags & 2))
    {
        /* Token processing is disabled: run the callbacks and mark the token done. */
        if(tree->callback_before_token) {
            tree->callback_before_token_ctx = tree->callback_before_token(tree, token, tree->callback_before_token_ctx);
        }

        token->type |= MyHTML_TOKEN_TYPE_DONE;

        if(tree->callback_after_token) {
            tree->callback_after_token_ctx = tree->callback_after_token(tree, token, tree->callback_after_token_ctx);
        }

        return;
    }

    /* Pick the mchar node id from this thread's entry in async_args. */
    size_t node_id;
#ifndef MyCORE_BUILD_WITHOUT_THREADS
    if(tree->myhtml->thread_batch) {
        /* With a thread batch the index is shifted by the batch's id_increase. */
        node_id = tree->async_args[(thread_id + tree->myhtml->thread_batch->id_increase)].mchar_node_id;
    }
    else
#endif
    {
        node_id = tree->async_args[thread_id].mchar_node_id;
    }

    if(tree->callback_before_token) {
        tree->callback_before_token_ctx = tree->callback_before_token(tree, token, tree->callback_before_token_ctx);
    }

    if(token->tag_id == MyHTML_TAG__TEXT ||
       token->tag_id == MyHTML_TAG__COMMENT)
    {
        /* Text and comment tokens: build token->str from the raw range. */
        mycore_string_init(tree->mchar, node_id, &token->str, (token->raw_length + 1));

        token->attr_first = NULL;
        token->attr_last  = NULL;

        myhtml_data_process_entry_t entry;
        myhtml_data_process_entry_clean(&entry);

        entry.encoding = tree->encoding;

        /* Only DATA tokens switch emit_null_char on. */
        if(token->type & MyHTML_TOKEN_TYPE_DATA) {
            entry.emit_null_char = true;
        }

        /* DATA, RCDATA and CDATA go through the charef path; the rest is appended as is. */
        if(token->type & (MyHTML_TOKEN_TYPE_DATA | MyHTML_TOKEN_TYPE_RCDATA | MyHTML_TOKEN_TYPE_CDATA)) {
            myhtml_parser_token_data_to_string_charef(tree, &token->str, &entry, token->raw_begin, token->raw_length);
        }
        else {
            myhtml_parser_token_data_to_string(tree, &token->str, &entry, token->raw_begin, token->raw_length);
        }
    }
    else if(token->attr_first)
    {
        /* Other tokens with attributes: build key/value strings for each attribute. */
        mycore_string_clean_all(&token->str);

        myhtml_data_process_entry_t entry;
        myhtml_token_attr_t* attr;

        for(attr = token->attr_first; attr != NULL; attr = attr->next)
        {
            if(attr->raw_key_length == 0) {
                mycore_string_clean_all(&attr->key);
            }
            else {
                myhtml_data_process_entry_clean(&entry);
                entry.encoding = tree->encoding;

                /* Keys are produced by the lowercase variant. */
                mycore_string_init(tree->mchar, node_id, &attr->key, (attr->raw_key_length + 1));
                myhtml_parser_token_data_to_string_lowercase(tree, &attr->key, &entry, attr->raw_key_begin, attr->raw_key_length);
            }

            if(attr->raw_value_length == 0) {
                mycore_string_clean_all(&attr->value);
            }
            else {
                myhtml_data_process_entry_clean(&entry);
                entry.encoding = tree->encoding;
                entry.is_attributes = true;

                /* Values go through the charef path with is_attributes set. */
                mycore_string_init(tree->mchar, node_id, &attr->value, (attr->raw_value_length + 1));
                myhtml_parser_token_data_to_string_charef(tree, &attr->value, &entry, attr->raw_value_begin, attr->raw_value_length);
            }
        }
    }
    else {
        /* No text and no attributes: reset the attribute list and the string. */
        token->attr_first = NULL;
        token->attr_last  = NULL;

        mycore_string_clean_all(&token->str);
    }

    token->type |= MyHTML_TOKEN_TYPE_DONE;

    if(tree->callback_after_token) {
        tree->callback_after_token_ctx = tree->callback_after_token(tree, token, tree->callback_after_token_ctx);
    }
}
298
/*
 * Combined queue callback: runs myhtml_parser_worker() and then
 * myhtml_parser_stream() on the same queue node.
 *
 * thread_id  identifier of the calling thread, forwarded to both stages.
 * ctx        pointer to a mythread_queue_node_t, forwarded to both stages.
 */
void myhtml_parser_worker_stream(mythread_id_t thread_id, void* ctx)
{
    /* Both stages accept the node as void*, so it is forwarded unchanged. */
    myhtml_parser_worker(thread_id, ctx);
    myhtml_parser_stream(thread_id, ctx);
}
306
307
308