1 /*
2  Copyright (C) 2015-2017 Alexander Borisov
3 
4  This library is free software; you can redistribute it and/or
5  modify it under the terms of the GNU Lesser General Public
6  License as published by the Free Software Foundation; either
7  version 2.1 of the License, or (at your option) any later version.
8 
9  This library is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  Lesser General Public License for more details.
13 
14  You should have received a copy of the GNU Lesser General Public
15  License along with this library; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 
18  Author: lex.borisov@gmail.com (Alexander Borisov)
19 */
20 
21 #include "myhtml/parser.h"
22 
/*
 * Handler with the (thread_id, ctx) callback signature that pushes one queued
 * item through the tree dispatcher.
 *
 * ctx is interpreted as a mythread_queue_node_t: its `context` is used as the
 * tree and its `args` is handed to myhtml_rules_tree_dispatcher() unchanged.
 * Nothing happens when the tree has MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE
 * set. thread_id is not used here.
 */
void myhtml_parser_stream(mythread_id_t thread_id, void* ctx)
{
    mythread_queue_node_t *node = (mythread_queue_node_t*)ctx;
    myhtml_tree_t *tree = (myhtml_tree_t*)(node->context);

    if(tree->parse_flags & MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE)
        return;

    /* call the dispatcher again for as long as it returns non-zero */
    while(myhtml_rules_tree_dispatcher(tree, node->args)) {
    }
}
31 
/*
 * Append the raw input range [begin, begin + length) to `str` through the
 * "lowercase" append helpers.
 *
 * The range is located in the tree's chain of incoming buffers. When it fits
 * inside one buffer it is appended with a single call; otherwise it is
 * appended chunk by chunk while walking buffer->next.
 *
 * tree        supplies incoming_buf_first and the encoding selector
 * str         destination string
 * proc_entry  supplies emit_null_char and, for non UTF-8 input, the encoding
 *             and the conversion state (res)
 * begin       absolute position of the first byte
 * length      number of bytes to append
 *
 * Returns str->length after appending. If no incoming buffer is found for
 * `begin`, `str` is left untouched and its current length is returned.
 */
size_t myhtml_parser_token_data_to_string_lowercase(myhtml_tree_t *tree, mycore_string_t* str, myhtml_data_process_entry_t* proc_entry, size_t begin, size_t length)
{
    mycore_incoming_buffer_t *buffer = mycore_incoming_buffer_find_by_position(tree->incoming_buf_first, begin);

    /* the loop below already treats `buffer` as nullable; do not dereference
     * a failed lookup */
    if(buffer == NULL)
        return str->length;

    size_t relative_begin = begin - buffer->offset;

    /* whole range lives in one buffer: append it in a single call */
    if((relative_begin + length) <= buffer->size) {
        if(tree->encoding == MyENCODING_UTF_8)
            myhtml_string_append_lowercase_with_preprocessing(str, &buffer->data[relative_begin], length, proc_entry->emit_null_char);
        else
            myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                          &buffer->data[relative_begin], length,
                                                                                          proc_entry->encoding, proc_entry->emit_null_char);

        return str->length;
    }

    /* value returned by the previous chunk append, passed to the next
     * myhtml_string_before_append_any_preprocessing() call */
    size_t save_position = 0;

    /* the range spans several buffers: join the pieces */
    while(buffer) {
        if((relative_begin + length) <= buffer->size) {
            /* last piece: the remainder fits in this buffer */
            size_t tmp_offset = myhtml_string_before_append_any_preprocessing(str, &buffer->data[relative_begin], length, save_position);

            if(length > 0) {
                if(tree->encoding == MyENCODING_UTF_8)
                    myhtml_string_append_lowercase_with_preprocessing(str, &buffer->data[(relative_begin + tmp_offset)], (length - tmp_offset), proc_entry->emit_null_char);
                else
                    myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                                  &buffer->data[(relative_begin + tmp_offset)], (length - tmp_offset),
                                                                                                  proc_entry->encoding, proc_entry->emit_null_char);
            }

            break;
        }

        /* take everything from relative_begin to the end of this buffer */
        size_t relative_end = (buffer->size - relative_begin);
        length -= relative_end;

        size_t tmp_offset = myhtml_string_before_append_any_preprocessing(str, &buffer->data[relative_begin], relative_end, save_position);

        if(relative_end > 0) {
            if(tree->encoding == MyENCODING_UTF_8)
                save_position = myhtml_string_append_lowercase_with_preprocessing(str, &buffer->data[(relative_begin + tmp_offset)], (relative_end - tmp_offset), proc_entry->emit_null_char);
            else
                save_position = myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                                              &buffer->data[(relative_begin + tmp_offset)], (relative_end - tmp_offset),
                                                                                                              proc_entry->encoding, proc_entry->emit_null_char);
        }

        /* following buffers are consumed from their first byte */
        relative_begin = 0;
        buffer         = buffer->next;
    }

    return str->length;
}
89 
/*
 * Append the raw input range [begin, begin + length) to `str` through the
 * preprocessing append helpers.
 *
 * The range is located in the tree's chain of incoming buffers. When it fits
 * inside one buffer it is appended with a single call; otherwise it is
 * appended chunk by chunk while walking buffer->next.
 *
 * tree        supplies incoming_buf_first and the encoding selector
 * str         destination string
 * proc_entry  supplies emit_null_char and, for non UTF-8 input, the encoding
 *             and the conversion state (res)
 * begin       absolute position of the first byte
 * length      number of bytes to append
 *
 * Returns str->length after appending. If no incoming buffer is found for
 * `begin`, `str` is left untouched and its current length is returned.
 */
size_t myhtml_parser_token_data_to_string(myhtml_tree_t *tree, mycore_string_t* str, myhtml_data_process_entry_t* proc_entry, size_t begin, size_t length)
{
    mycore_incoming_buffer_t *buffer = mycore_incoming_buffer_find_by_position(tree->incoming_buf_first, begin);

    /* the loop below already treats `buffer` as nullable; do not dereference
     * a failed lookup */
    if(buffer == NULL)
        return str->length;

    size_t relative_begin = begin - buffer->offset;

    /* whole range lives in one buffer: append it in a single call */
    if((relative_begin + length) <= buffer->size) {
        if(tree->encoding == MyENCODING_UTF_8)
            myhtml_string_append_with_preprocessing(str, &buffer->data[relative_begin], length, proc_entry->emit_null_char);
        else
            myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                &buffer->data[relative_begin], length,
                                                                                proc_entry->encoding, proc_entry->emit_null_char);

        return str->length;
    }

    /* value returned by the previous chunk append, passed to the next
     * myhtml_string_before_append_any_preprocessing() call */
    size_t save_position = 0;

    /* the range spans several buffers: join the pieces */
    while(buffer) {
        if((relative_begin + length) <= buffer->size) {
            /* last piece: the remainder fits in this buffer */
            size_t tmp_offset = myhtml_string_before_append_any_preprocessing(str, &buffer->data[relative_begin], length, save_position);

            if(length > 0) {
                if(tree->encoding == MyENCODING_UTF_8)
                    myhtml_string_append_with_preprocessing(str, &buffer->data[(relative_begin + tmp_offset)], (length - tmp_offset), proc_entry->emit_null_char);
                else
                    myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                        &buffer->data[(relative_begin + tmp_offset)], (length - tmp_offset),
                                                                                        proc_entry->encoding, proc_entry->emit_null_char);
            }

            break;
        }

        /* take everything from relative_begin to the end of this buffer */
        size_t relative_end = (buffer->size - relative_begin);
        length -= relative_end;

        size_t tmp_offset = myhtml_string_before_append_any_preprocessing(str, &buffer->data[relative_begin], relative_end, save_position);

        if(relative_end > 0) {
            if(tree->encoding == MyENCODING_UTF_8)
                save_position = myhtml_string_append_with_preprocessing(str, &buffer->data[(relative_begin + tmp_offset)], (relative_end - tmp_offset), proc_entry->emit_null_char);
            else
                save_position = myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                                    &buffer->data[(relative_begin + tmp_offset)],
                                                                                                    (relative_end - tmp_offset),
                                                                                                    proc_entry->encoding, proc_entry->emit_null_char);
        }

        /* following buffers are consumed from their first byte */
        relative_begin = 0;
        buffer         = buffer->next;
    }

    return str->length;
}
148 
/*
 * Feed the raw input range [begin, begin + length) through
 * myhtml_data_process() into `str`, then finish with
 * myhtml_data_process_end().
 *
 * The range is located in the tree's chain of incoming buffers and handed to
 * myhtml_data_process() one buffer-sized piece at a time; a range that fits
 * in a single buffer results in exactly one call.
 *
 * tree        supplies incoming_buf_first
 * str         destination string
 * proc_entry  processing state passed to every myhtml_data_process*() call
 * begin       absolute position of the first byte
 * length      number of bytes to process
 *
 * Returns str->length afterwards. If no incoming buffer is found for `begin`,
 * nothing is processed and the current length of `str` is returned.
 */
size_t myhtml_parser_token_data_to_string_charef(myhtml_tree_t *tree, mycore_string_t* str, myhtml_data_process_entry_t* proc_entry, size_t begin, size_t length)
{
    mycore_incoming_buffer_t *buffer = mycore_incoming_buffer_find_by_position(tree->incoming_buf_first, begin);

    /* the loop below already treats `buffer` as nullable; do not dereference
     * a failed lookup */
    if(buffer == NULL)
        return str->length;

    size_t relative_begin = begin - buffer->offset;

    while(buffer) {
        if((relative_begin + length) <= buffer->size) {
            /* the remainder (or the whole range) fits in this buffer */
            myhtml_data_process(proc_entry, str, &buffer->data[relative_begin], length);
            break;
        }

        /* take everything from relative_begin to the end of this buffer */
        size_t relative_end = (buffer->size - relative_begin);
        length -= relative_end;

        myhtml_data_process(proc_entry, str, &buffer->data[relative_begin], relative_end);

        /* following buffers are consumed from their first byte */
        relative_begin = 0;
        buffer         = buffer->next;
    }

    myhtml_data_process_end(proc_entry, str);

    return str->length;
}
184 
/*
 * Handler with the (thread_id, ctx) callback signature that turns the raw
 * ranges recorded in a token into strings.
 *
 * ctx is interpreted as a mythread_queue_node_t whose `context` is the tree
 * and whose `args` is the token. For every token:
 *   - tree->callback_before_token is invoked, if set;
 *   - unless token processing is disabled by the parse flags, the token text
 *     (for text/comment tokens) or each attribute key and value is built;
 *   - MyHTML_TOKEN_TYPE_DONE is set on the token;
 *   - tree->callback_after_token is invoked, if set.
 * Each callback's return value replaces the matching *_ctx field on the tree.
 *
 * thread_id selects the entry of tree->async_args whose mchar_node_id is used
 * for string allocation.
 */
void myhtml_parser_worker(mythread_id_t thread_id, void* ctx)
{
    mythread_queue_node_t *qnode = (mythread_queue_node_t*)ctx;

    myhtml_tree_t* tree = qnode->context;
    myhtml_token_node_t* token = qnode->args;

    /*
     * Tree can not be built without tokens
     *
     * MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN == 3
     * MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE    == 1
     *
     * MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN includes
     * MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE, so a plain bitwise AND would
     * also be true when only WITHOUT_BUILD_TREE is set. Token processing is
     * skipped only when every bit of WITHOUT_PROCESS_TOKEN is present.
     */
    if((tree->parse_flags & MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN) == MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN)
    {
        if(tree->callback_before_token)
            tree->callback_before_token_ctx = tree->callback_before_token(tree, token, tree->callback_before_token_ctx);

        token->type |= MyHTML_TOKEN_TYPE_DONE;

        if(tree->callback_after_token)
            tree->callback_after_token_ctx = tree->callback_after_token(tree, token, tree->callback_after_token_ctx);

        return;
    }

    /* pick the mchar node for this thread; with a thread batch the index is
     * shifted by the batch's id_increase */
    size_t mchar_node_id;
#ifndef MyCORE_BUILD_WITHOUT_THREADS
    if(tree->myhtml->thread_batch)
        mchar_node_id = tree->async_args[(thread_id + tree->myhtml->thread_batch->id_increase)].mchar_node_id;
    else
#endif
        mchar_node_id = tree->async_args[thread_id].mchar_node_id;

    if(tree->callback_before_token)
        tree->callback_before_token_ctx = tree->callback_before_token(tree, token, tree->callback_before_token_ctx);

    if(token->tag_id == MyHTML_TAG__TEXT ||
       token->tag_id == MyHTML_TAG__COMMENT)
    {
        /* text and comment tokens: build token->str from the raw range */
        mycore_string_init(tree->mchar, mchar_node_id, &token->str, (token->raw_length + 1));

        token->attr_first = NULL;
        token->attr_last  = NULL;

        myhtml_data_process_entry_t proc_entry;
        myhtml_data_process_entry_clean(&proc_entry);

        proc_entry.encoding = tree->encoding;

        if(token->type & (MyHTML_TOKEN_TYPE_DATA | MyHTML_TOKEN_TYPE_RCDATA | MyHTML_TOKEN_TYPE_CDATA)) {
            /* only DATA tokens turn emit_null_char on; RCDATA/CDATA keep the
             * value left by myhtml_data_process_entry_clean() */
            if(token->type & MyHTML_TOKEN_TYPE_DATA)
                proc_entry.emit_null_char = true;

            myhtml_parser_token_data_to_string_charef(tree, &token->str, &proc_entry, token->raw_begin, token->raw_length);
        }
        else {
            myhtml_parser_token_data_to_string(tree, &token->str, &proc_entry, token->raw_begin, token->raw_length);
        }
    }
    else if(token->attr_first)
    {
        /* tag token with attributes: the token itself carries no text */
        mycore_string_clean_all(&token->str);

        myhtml_token_attr_t* attr = token->attr_first;
        myhtml_data_process_entry_t proc_entry;

        while(attr)
        {
            /* key goes through the lowercase conversion */
            if(attr->raw_key_length) {
                myhtml_data_process_entry_clean(&proc_entry);
                proc_entry.encoding = tree->encoding;

                mycore_string_init(tree->mchar, mchar_node_id, &attr->key, (attr->raw_key_length + 1));
                myhtml_parser_token_data_to_string_lowercase(tree, &attr->key, &proc_entry, attr->raw_key_begin, attr->raw_key_length);
            }
            else {
                mycore_string_clean_all(&attr->key);
            }

            /* value goes through the data processor with is_attributes set */
            if(attr->raw_value_length) {
                myhtml_data_process_entry_clean(&proc_entry);
                proc_entry.encoding = tree->encoding;
                proc_entry.is_attributes = true;

                mycore_string_init(tree->mchar, mchar_node_id, &attr->value, (attr->raw_value_length + 1));
                myhtml_parser_token_data_to_string_charef(tree, &attr->value, &proc_entry, attr->raw_value_begin, attr->raw_value_length);
            }
            else {
                mycore_string_clean_all(&attr->value);
            }

            attr = attr->next;
        }
    }
    else {
        /* tag token without attributes: nothing to convert */
        token->attr_first = NULL;
        token->attr_last  = NULL;

        mycore_string_clean_all(&token->str);
    }

    token->type |= MyHTML_TOKEN_TYPE_DONE;

    if(tree->callback_after_token)
        tree->callback_after_token_ctx = tree->callback_after_token(tree, token, tree->callback_after_token_ctx);
}
298 
/*
 * Runs both stages for one queue node on the calling thread: first
 * myhtml_parser_worker(), then myhtml_parser_stream(), each with the same
 * thread_id and ctx.
 */
void myhtml_parser_worker_stream(mythread_id_t thread_id, void* ctx)
{
    /* both stages take the queue node as an opaque pointer, so ctx is
     * forwarded as-is */
    myhtml_parser_worker(thread_id, ctx);
    myhtml_parser_stream(thread_id, ctx);
}
306 
307 
308