1 /*
2  Copyright (C) 2015-2017 Alexander Borisov
3 
4  This library is free software; you can redistribute it and/or
5  modify it under the terms of the GNU Lesser General Public
6  License as published by the Free Software Foundation; either
7  version 2.1 of the License, or (at your option) any later version.
8 
9  This library is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  Lesser General Public License for more details.
13 
14  You should have received a copy of the GNU Lesser General Public
15  License along with this library; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 
18  Author: lex.borisov@gmail.com (Alexander Borisov)
19 */
20 
21 #include "myhtml/data_process.h"
22 #include "mycore/utils/resources.h"
23 
/*
 * Appends the pending input range data[tmp_offset .. offset) to `str`.
 *
 * The macro takes no arguments; it uses these names from the scope where it
 * is expanded: proc_entry, str, data, offset and tmp_offset (tmp_offset is
 * modified).
 *
 * Steps:
 *   1. tmp_offset is advanced by whatever
 *      myhtml_string_before_append_any_preprocessing() returns.
 *   2. If bytes remain, they are appended either directly (UTF-8 input) or
 *      through the chunked encoding converter (any other encoding), and the
 *      returned value is stored in proc_entry->tmp_str_pos_proc.
 *
 * The expansion is one brace-enclosed statement, so it stays a single unit
 * even as the body of an unbraced if/else. Call sites invoke it WITHOUT a
 * trailing semicolon, which is why do { } while (0) is not used here.
 */
#define MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING() \
{ \
    tmp_offset += myhtml_string_before_append_any_preprocessing(str, &data[tmp_offset], (offset - tmp_offset), \
                                                                proc_entry->tmp_str_pos_proc); \
    if(offset != tmp_offset) { \
        if(proc_entry->encoding == MyENCODING_UTF_8) { \
            proc_entry->tmp_str_pos_proc = myhtml_string_append_with_preprocessing(str, &data[tmp_offset], (offset - tmp_offset), \
                                                                                   proc_entry->emit_null_char); \
        } \
        else { \
            proc_entry->tmp_str_pos_proc = \
            myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res, \
                                                                                &data[tmp_offset], (offset - tmp_offset), \
                                                                                proc_entry->encoding, proc_entry->emit_null_char); \
        } \
    } \
}
38 
/*
 * Resets a data-process entry to its initial condition.
 *
 * Every byte of *proc_entry is zeroed, after which the state handler is set
 * to myhtml_data_process_state_data.
 */
void myhtml_data_process_entry_clean(myhtml_data_process_entry_t* proc_entry)
{
    memset(proc_entry, 0, sizeof(*proc_entry));

    proc_entry->state = myhtml_data_process_state_data;
}
44 
/*
 * Appends the single byte `sm` to `str` and re-terminates the buffer.
 *
 * MyCORE_STRING_REALLOC_IF_NEED is invoked first with a need of 2 bytes
 * (the new byte plus the terminating '\0'); presumably it grows the buffer
 * when required -- its definition is outside this file.
 */
void myhtml_data_process_string_append_char(mycore_string_t* str, const char sm)
{
    MyCORE_STRING_REALLOC_IF_NEED(str, 2, 0);

    /* Store the byte, bump the length, then write the terminator after it. */
    str->data[str->length++] = sm;
    str->data[str->length] = '\0';
}
54 
/*
 * State handler for plain data: scans data[offset .. size) for '&'.
 *
 * - If no '&' is found, the scanned range is appended to `str` and `size`
 *   (the end of the chunk) is returned; the state is left unchanged.
 * - If '&' is found, the text before it is appended, the position of the
 *   '&' inside `str` is saved in proc_entry->tmp_str_pos, the '&' itself is
 *   appended, the state switches to myhtml_data_process_state_ampersand and
 *   the offset just past the '&' is returned.
 *
 * Returns the offset at which the next state handler must continue.
 */
size_t myhtml_data_process_state_data(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    size_t tmp_offset = offset;

    /* Move forward to the first '&' or to the end of the chunk. */
    while(offset < size && data[offset] != '&')
        offset++;

    if(offset >= size) {
        /* No reference starts in this chunk: flush what was scanned. */
        MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

        return offset;
    }

    /* Flush the text that precedes the '&'. */
    tmp_offset += myhtml_string_before_append_any_preprocessing(str, &data[tmp_offset], (offset - tmp_offset),
                                                                proc_entry->tmp_str_pos_proc);

    if(offset != tmp_offset) {
        const size_t pending = offset - tmp_offset;

        if(proc_entry->encoding == MyENCODING_UTF_8) {
            proc_entry->tmp_str_pos_proc = myhtml_string_append_with_preprocessing(str, &data[tmp_offset], pending,
                                                                                   proc_entry->emit_null_char);
        }
        else {
            proc_entry->tmp_str_pos_proc =
            myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                &data[tmp_offset], pending,
                                                                                proc_entry->encoding, proc_entry->emit_null_char);

            /* Unlike the end-of-chunk flush, the converter result is reset here. */
            myencoding_result_clean(&proc_entry->res);
        }
    }

    /* Remember where the reference starts in the output string. */
    proc_entry->tmp_str_pos = str->length;
    proc_entry->state = myhtml_data_process_state_ampersand;

    myhtml_data_process_string_append_char(str, data[offset]);

    return offset + 1;
}
95 
/*
 * State handler for the byte that directly follows '&'.
 *
 * - '#': the byte is appended, proc_entry->tmp_num is reset to 0 and the
 *   next state is chosen: ampersand_hash when the chunk ends right after
 *   '#', ampersand_hash_x_data when an 'x'/'X' follows (that byte is appended
 *   and consumed too), ampersand_hash_data otherwise.
 * - anything else: the first named-reference entry for this byte is looked
 *   up. If that entry's `ch` is '\0' the state goes back to plain data and
 *   the byte is NOT consumed; otherwise the byte is appended, consumed, and
 *   the state becomes ampersand_data.
 *
 * Returns the offset at which the next state handler must continue.
 */
size_t myhtml_data_process_state_ampersand(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    if(data[offset] != '#')
    {
        /* Candidate named character reference. */
        proc_entry->charef_res.last_entry = NULL;
        proc_entry->charef_res.curr_entry = myhtml_charef_get_first_position(data[offset]);

        if(proc_entry->charef_res.curr_entry->ch == '\0') {
            proc_entry->state = myhtml_data_process_state_data;
            return offset;
        }

        proc_entry->state = myhtml_data_process_state_ampersand_data;
        myhtml_data_process_string_append_char(str, data[offset]);

        return offset + 1;
    }

    /* Numeric character reference: keep the '#' in the output for now. */
    myhtml_data_process_string_append_char(str, data[offset]);
    offset++;

    proc_entry->tmp_num = 0;

    if(offset >= size) {
        /* Chunk ended after '#': the x/X decision is made on the next chunk. */
        proc_entry->state = myhtml_data_process_state_ampersand_hash;
        return offset;
    }

    if(data[offset] != 'x' && data[offset] != 'X') {
        proc_entry->state = myhtml_data_process_state_ampersand_hash_data;
        return offset;
    }

    myhtml_data_process_string_append_char(str, data[offset]);
    proc_entry->state = myhtml_data_process_state_ampersand_hash_x_data;

    return offset + 1;
}
135 
/*
 * State handler for the body of a named character reference.
 *
 * myhtml_charef_find_by_pos() continues the lookup from the current entry;
 * it receives `offset` by pointer and may change it, and it reports
 * completion through proc_entry->charef_res.is_done.
 *
 * - Lookup not done: the bytes consumed in this call are appended to `str`
 *   and the state is kept.
 * - Lookup done: the state returns to plain data. A following ';' is
 *   consumed. Without ';', when processing an attribute value and the next
 *   byte is '=' or alphanumeric, the consumed bytes are appended as literal
 *   text. Otherwise, if the entry has code points, they are written as UTF-8
 *   over the reference text starting at proc_entry->tmp_str_pos; if it has
 *   none, the consumed bytes are appended as literal text.
 *
 * Returns the offset at which the next state handler must continue.
 */
size_t myhtml_data_process_state_ampersand_data(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    size_t tmp_offset = offset;

    const charef_entry_t *current_entry = myhtml_charef_find_by_pos(proc_entry->charef_res.curr_entry->next, data, &offset, size, &proc_entry->charef_res);

    if(!proc_entry->charef_res.is_done) {
        /* Still inside the name: keep the raw bytes and wait for more input. */
        MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

        return offset;
    }

    proc_entry->state = myhtml_data_process_state_data;

    if(data[offset] == ';') {
        offset++;
    }
    else if(proc_entry->is_attributes &&
            (data[offset] == '=' || mycore_string_alphanumeric_character[ (unsigned char)data[offset] ] != 0xff))
    {
        /* if current charef is attribute */
        MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

        return offset;
    }

    if(current_entry->codepoints_len == 0) {
        /* Nothing to substitute: keep the text as it was written. */
        MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()
    }
    else {
        /* Overwrite the "&name" text with the UTF-8 form of each code point. */
        for (size_t idx = 0; idx < current_entry->codepoints_len; idx++) {
            MyCORE_STRING_REALLOC_IF_NEED(str, 5, 0);

            proc_entry->tmp_str_pos += myencoding_codepoint_to_ascii_utf_8(current_entry->codepoints[idx], &str->data[proc_entry->tmp_str_pos]);
        }

        str->length = proc_entry->tmp_str_pos;
        str->data[str->length] = '\0';
    }

    proc_entry->charef_res.last_entry = NULL;

    return offset;
}
180 
/*
 * State handler for the byte after "&#" when the '#' was the last byte of
 * the previous chunk.
 *
 * An 'x' or 'X' is appended to `str`, consumed, and selects the hexadecimal
 * state; any other byte is left unconsumed and selects the decimal state.
 *
 * Returns the offset at which the next state handler must continue.
 */
size_t myhtml_data_process_state_ampersand_hash(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    const char marker = data[offset];

    if(marker != 'x' && marker != 'X') {
        proc_entry->state = myhtml_data_process_state_ampersand_hash_data;
        return offset;
    }

    myhtml_data_process_string_append_char(str, marker);
    proc_entry->state = myhtml_data_process_state_ampersand_hash_x_data;

    return offset + 1;
}
194 
/*
 * State handler for the decimal digits of a numeric character reference
 * ("&#" followed by digits).
 *
 * Digits are accumulated into proc_entry->tmp_num; accumulation stops once
 * the value has exceeded 0x10FFFF so the number cannot grow without bound.
 *
 * - First non-digit byte: the state returns to plain data. If the reference
 *   has no digits at all, nothing is converted and the byte is left for the
 *   data state. Otherwise an optional ';' is consumed and
 *   myhtml_data_process_state_end() replaces the reference text with the
 *   character.
 * - End of chunk: the digits seen in this call are appended to `str` and
 *   the state is kept, so the number can continue in the next chunk.
 *
 * Returns the offset at which the next state handler must continue.
 */
size_t myhtml_data_process_state_ampersand_hash_data(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    const unsigned char *u_data = (const unsigned char*)data;
    size_t tmp_offset = offset;

    while(offset < size)
    {
        if(mycore_string_chars_num_map[ u_data[offset] ] == 0xff)
        {
            proc_entry->state = myhtml_data_process_state_data;

            /*
             * The reference is empty only when no digit was consumed in this
             * call AND no digit was appended by an earlier call. Earlier
             * digits are detected the same way myhtml_data_process_end() does
             * it: `str` is longer than the 2-byte "&#" prefix that starts at
             * tmp_str_pos. Without the second test, a chunk boundary between
             * the last digit and the terminator ("&#123" | ";") would leave
             * the reference unconverted.
             */
            if(offset == tmp_offset && str->length == (proc_entry->tmp_str_pos + 2)) {
                MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

                return offset;
            }

            if(data[offset] == ';')
                offset++;

            myhtml_data_process_state_end(proc_entry, str);
            return offset;
        }

        /* Stop accumulating once the value is already out of range. */
        if(proc_entry->tmp_num <= 0x10FFFF) {
            proc_entry->tmp_num = mycore_string_chars_num_map[ u_data[offset] ] + proc_entry->tmp_num * 10;
        }

        offset++;
    }

    /* Chunk exhausted inside the number: keep the raw digits in `str`. */
    MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

    return offset;
}
230 
/*
 * State handler for the hexadecimal digits of a numeric character reference
 * ("&#x" or "&#X" followed by hex digits).
 *
 * Digits are accumulated into proc_entry->tmp_num, four bits at a time;
 * accumulation stops once the value has exceeded 0x10FFFF so the number
 * cannot grow without bound.
 *
 * - First non-hex byte: the state returns to plain data. If the reference
 *   has no digits at all, nothing is converted and the byte is left for the
 *   data state. Otherwise an optional ';' is consumed and
 *   myhtml_data_process_state_end() replaces the reference text with the
 *   character.
 * - End of chunk: the digits seen in this call are appended to `str` and
 *   the state is kept, so the number can continue in the next chunk.
 *
 * Returns the offset at which the next state handler must continue.
 */
size_t myhtml_data_process_state_ampersand_hash_x_data(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    unsigned const char *u_data = (unsigned const char*)data;
    size_t tmp_offset = offset;

    while(offset < size)
    {
        if(mycore_string_chars_hex_map[ u_data[offset] ] == 0xff)
        {
            proc_entry->state = myhtml_data_process_state_data;

            /*
             * The reference is empty only when no digit was consumed in this
             * call AND no digit was appended by an earlier call. Earlier
             * digits are detected the same way myhtml_data_process_end() does
             * it: `str` is longer than the 3-byte "&#x" prefix that starts at
             * tmp_str_pos. Without the second test, a chunk boundary between
             * the last digit and the terminator ("&#x41" | ";") would leave
             * the reference unconverted.
             */
            if(offset == tmp_offset && str->length == (proc_entry->tmp_str_pos + 3)) {
                MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

                return offset;
            }

            if(data[offset] == ';')
                offset++;

            myhtml_data_process_state_end(proc_entry, str);
            return offset;
        }

        /* Stop accumulating once the value is already out of range. */
        if(proc_entry->tmp_num <= 0x10FFFF) {
            proc_entry->tmp_num <<= 4;
            proc_entry->tmp_num |= mycore_string_chars_hex_map[ u_data[offset] ];
        }

        offset++;
    }

    /* Chunk exhausted inside the number: keep the raw digits in `str`. */
    MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

    return offset;
}
267 
/*
 * Finishes a numeric character reference: turns proc_entry->tmp_num into
 * UTF-8 and writes it into `str` at proc_entry->tmp_str_pos (where the '&'
 * was stored), replacing the raw reference text.
 *
 * Before encoding, the number is remapped:
 *   - values up to 0x9F are looked up in the replacement_character table;
 *   - surrogates (0xD800..0xDFFF) and values above 0x10FFFF become
 *     replacement_character[0].
 *
 * On return str->length points just past the written bytes and the buffer
 * is '\0'-terminated.
 */
void myhtml_data_process_state_end(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str)
{
    /* 4 is max utf8 byte + \0 */
    MyCORE_STRING_REALLOC_IF_NEED(str, 5, 0);

    if(proc_entry->tmp_num <= 0x9F) {
        proc_entry->tmp_num = replacement_character[proc_entry->tmp_num];
    }
    else if((proc_entry->tmp_num >= 0xD800 && proc_entry->tmp_num <= 0xDFFF) ||
            proc_entry->tmp_num > 0x10FFFF)
    {
        proc_entry->tmp_num = replacement_character[0];
    }

    /* Rewind to the start of the reference, then advance by the encoded size. */
    str->length = proc_entry->tmp_str_pos;
    str->length += myencoding_codepoint_to_ascii_utf_8(proc_entry->tmp_num, &str->data[str->length]);

    str->data[str->length] = '\0';
}
285 
/*
 * Feeds one chunk of input (`data`, `size` bytes) through the state machine.
 *
 * The handler stored in proc_entry->state is called repeatedly; each call
 * returns the offset to continue from and may install a different handler.
 * Processing stops when the returned offset reaches `size`.
 */
void myhtml_data_process(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t size)
{
    for (size_t offset = 0; offset < size; ) {
        offset = proc_entry->state(proc_entry, str, data, offset, size);
    }
}
294 
/*
 * Finalizes a character reference that was still open when the input ended.
 *
 * - In the named-reference state with a remembered match
 *   (charef_res.last_entry), that match's code points are written as UTF-8
 *   starting at proc_entry->tmp_str_pos and `str` is truncated/terminated
 *   right after them.
 * - In the decimal or hexadecimal numeric state, the reference is converted
 *   through myhtml_data_process_state_end() if `str` holds anything beyond
 *   the prefix that starts at tmp_str_pos ("&#" is 2 bytes, "&#x" is 3).
 * - In every other situation nothing is done.
 */
void myhtml_data_process_end(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str)
{
    if(proc_entry->state == myhtml_data_process_state_ampersand_data && proc_entry->charef_res.last_entry)
    {
        const charef_entry_t *matched = proc_entry->charef_res.last_entry;
        size_t idx = 0;

        while(idx < matched->codepoints_len) {
            MyCORE_STRING_REALLOC_IF_NEED(str, 5, 0);

            proc_entry->tmp_str_pos += myencoding_codepoint_to_ascii_utf_8(matched->codepoints[idx], &str->data[proc_entry->tmp_str_pos]);
            idx++;
        }

        str->length = proc_entry->tmp_str_pos;
        str->data[str->length] = '\0';

        return;
    }

    if(proc_entry->state == myhtml_data_process_state_ampersand_hash_data) {
        /* Anything after "&#" means digits were collected. */
        if(str->length != (proc_entry->tmp_str_pos + 2))
            myhtml_data_process_state_end(proc_entry, str);

        return;
    }

    if(proc_entry->state == myhtml_data_process_state_ampersand_hash_x_data) {
        /* Anything after "&#x" means digits were collected. */
        if(str->length != (proc_entry->tmp_str_pos + 3))
            myhtml_data_process_state_end(proc_entry, str);
    }
}
319 
320 
321