1 /*
2 Copyright (C) 2015-2017 Alexander Borisov
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 2.1 of the License, or (at your option) any later version.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
18 Author: lex.borisov@gmail.com (Alexander Borisov)
19 */
20
21 #include "myhtml/data_process.h"
22 #include "mycore/utils/resources.h"
23
/*
 * Flush the pending input range data[tmp_offset .. offset) into `str`.
 *
 * This macro is NOT hygienic: it expands in place and uses the caller's
 * local names `str`, `data`, `offset`, `tmp_offset` and `proc_entry`.
 * It expands to a bare statement sequence (no do { } while (0) wrapper)
 * and is invoked in this file without a trailing semicolon.
 *
 * Steps:
 *  1. tmp_offset is advanced by whatever
 *     myhtml_string_before_append_any_preprocessing() returns for the
 *     range, given the position saved in proc_entry->tmp_str_pos_proc.
 *  2. If bytes remain, they are appended with preprocessing: directly
 *     when proc_entry->encoding is MyENCODING_UTF_8, otherwise through
 *     the converting variant that carries its state in proc_entry->res.
 *     The value returned by the append call is stored back into
 *     proc_entry->tmp_str_pos_proc.
 *
 * Unlike the inline flush in myhtml_data_process_state_data(), the
 * conversion state in proc_entry->res is not cleaned here.
 */
#define MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING() \
tmp_offset += myhtml_string_before_append_any_preprocessing(str, &data[tmp_offset], (offset - tmp_offset), \
proc_entry->tmp_str_pos_proc); \
if(offset != tmp_offset) { \
if(proc_entry->encoding == MyENCODING_UTF_8) \
proc_entry->tmp_str_pos_proc = myhtml_string_append_with_preprocessing(str, &data[tmp_offset], (offset - tmp_offset), \
proc_entry->emit_null_char); \
else { \
proc_entry->tmp_str_pos_proc = \
myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res, \
&data[tmp_offset], (offset - tmp_offset), \
proc_entry->encoding, proc_entry->emit_null_char); \
} \
}
38
/*
 * Reset a data-process entry to its initial condition.
 *
 * Every byte of *proc_entry is zeroed, then the state handler is pointed
 * at myhtml_data_process_state_data so processing starts in plain-data mode.
 */
void myhtml_data_process_entry_clean(myhtml_data_process_entry_t* proc_entry)
{
    memset(proc_entry, 0, sizeof(*proc_entry));

    proc_entry->state = myhtml_data_process_state_data;
}
44
/*
 * Append the single byte `sm` to `str` and re-terminate the buffer.
 *
 * str->length grows by one; the byte after the new end is set to '\0'.
 */
void myhtml_data_process_string_append_char(mycore_string_t* str, const char sm)
{
    /* Request room for the new byte plus the terminator (macro is defined elsewhere). */
    MyCORE_STRING_REALLOC_IF_NEED(str, 2, 0);

    str->data[str->length++] = sm;
    str->data[str->length] = '\0';
}
54
/*
 * State handler: plain character data.
 *
 * Scans data[offset .. size) for the next '&'.
 *
 *  - No '&' found: the scanned range is flushed into `str` and `size`
 *    (the final offset) is returned; the state is left unchanged.
 *  - '&' found: the text in front of it is flushed, the position where
 *    the reference starts in `str` is remembered in
 *    proc_entry->tmp_str_pos, the '&' itself is appended, the state
 *    switches to myhtml_data_process_state_ampersand, and the offset just
 *    past the '&' is returned.
 */
size_t myhtml_data_process_state_data(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    size_t tmp_offset = offset;

    /* Advance to the next '&' or to the end of this chunk. */
    while(offset < size && data[offset] != '&') {
        offset++;
    }

    if(offset >= size) {
        /* Chunk exhausted without meeting a reference: flush what was scanned. */
        MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

        return offset;
    }

    /* Flush the text that precedes the '&'. */
    tmp_offset += myhtml_string_before_append_any_preprocessing(str, &data[tmp_offset], (offset - tmp_offset),
                                                                proc_entry->tmp_str_pos_proc);

    if(tmp_offset != offset) {
        if(proc_entry->encoding != MyENCODING_UTF_8) {
            proc_entry->tmp_str_pos_proc =
                myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &proc_entry->res,
                                                                                    &data[tmp_offset], (offset - tmp_offset),
                                                                                    proc_entry->encoding, proc_entry->emit_null_char);

            /* Unlike the end-of-chunk flush, the conversion state is reset here. */
            myencoding_result_clean(&proc_entry->res);
        }
        else {
            proc_entry->tmp_str_pos_proc = myhtml_string_append_with_preprocessing(str, &data[tmp_offset],
                                                                                   (offset - tmp_offset),
                                                                                   proc_entry->emit_null_char);
        }
    }

    /* Remember where the reference begins so later states can overwrite it. */
    proc_entry->tmp_str_pos = str->length;
    proc_entry->state = myhtml_data_process_state_ampersand;

    myhtml_data_process_string_append_char(str, data[offset]);

    return offset + 1;
}
95
/*
 * State handler: the byte right after '&'.
 *
 *  - '#': a numeric reference. The '#' is appended, tmp_num is reset, and
 *    the next byte decides between the decimal and hexadecimal states. If
 *    the chunk ends right after '#', the decision is deferred by switching
 *    to myhtml_data_process_state_ampersand_hash.
 *  - otherwise: a candidate named reference. The lookup starts from
 *    myhtml_charef_get_first_position(); when the returned entry has
 *    ch == '\0' the handler falls back to the data state without consuming
 *    the byte, else the byte is appended and the state becomes
 *    myhtml_data_process_state_ampersand_data.
 *
 * Returns the offset of the first byte not consumed.
 */
size_t myhtml_data_process_state_ampersand(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    if(data[offset] != '#') {
        proc_entry->charef_res.last_entry = NULL;
        proc_entry->charef_res.curr_entry = myhtml_charef_get_first_position(data[offset]);

        if(proc_entry->charef_res.curr_entry->ch == '\0') {
            /* This byte cannot begin a named reference; reprocess it as data. */
            proc_entry->state = myhtml_data_process_state_data;
            return offset;
        }

        proc_entry->state = myhtml_data_process_state_ampersand_data;
        myhtml_data_process_string_append_char(str, data[offset]);

        return offset + 1;
    }

    /* Numeric reference: keep the '#' in the output for now. */
    myhtml_data_process_string_append_char(str, data[offset]);
    offset++;

    proc_entry->tmp_num = 0;

    if(offset >= size) {
        /* The 'x' / digit decision has to wait for the next chunk. */
        proc_entry->state = myhtml_data_process_state_ampersand_hash;
        return offset;
    }

    /* Same x/X test that the deferred state performs. */
    return myhtml_data_process_state_ampersand_hash(proc_entry, str, data, offset, size);
}
135
/*
 * State handler: inside a named character reference.
 *
 * Continues the lookup with myhtml_charef_find_by_pos(), which may move
 * `offset`. While the lookup is not finished (charef_res.is_done unset)
 * the bytes scanned in this chunk are appended to `str` as-is and the
 * state is kept.
 *
 * Once the lookup is finished the state returns to data and:
 *  - a terminating ';' is consumed;
 *  - without ';', in attribute mode (proc_entry->is_attributes), a
 *    following '=' or alphanumeric byte means the text is kept literally;
 *  - if the resulting entry has code points they are written as UTF-8
 *    starting at proc_entry->tmp_str_pos, and `str` is cut right after them;
 *  - otherwise the scanned bytes are appended literally.
 *
 * Returns the offset of the first byte not consumed.
 */
size_t myhtml_data_process_state_ampersand_data(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    size_t tmp_offset = offset;

    const charef_entry_t *entry = myhtml_charef_find_by_pos(proc_entry->charef_res.curr_entry->next, data, &offset, size, &proc_entry->charef_res);

    if(!proc_entry->charef_res.is_done) {
        /* Lookup still open: keep the raw text and wait for more input. */
        MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

        return offset;
    }

    proc_entry->state = myhtml_data_process_state_data;

    if(data[offset] == ';') {
        offset++;
    }
    else if(proc_entry->is_attributes &&
            (data[offset] == '=' || mycore_string_alphanumeric_character[ (unsigned char)data[offset] ] != 0xff))
    {
        /* Inside an attribute value this is not treated as a reference. */
        MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

        return offset;
    }

    if(entry->codepoints_len == 0) {
        /* Nothing to substitute: keep the scanned text. */
        MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()
    }
    else {
        size_t idx = 0;

        while(idx < entry->codepoints_len) {
            MyCORE_STRING_REALLOC_IF_NEED(str, 5, 0);

            proc_entry->tmp_str_pos += myencoding_codepoint_to_ascii_utf_8(entry->codepoints[idx], &str->data[proc_entry->tmp_str_pos]);
            idx++;
        }

        /* Cut the string right after the decoded code points. */
        str->length = proc_entry->tmp_str_pos;
        str->data[str->length] = '\0';
    }

    proc_entry->charef_res.last_entry = NULL;

    return offset;
}
180
/*
 * State handler: the byte right after "&#".
 *
 * An 'x' or 'X' is appended to `str`, consumed, and selects the
 * hexadecimal state. Any other byte is left unconsumed and selects the
 * decimal state.
 *
 * Returns the offset of the first byte not consumed.
 */
size_t myhtml_data_process_state_ampersand_hash(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    if(data[offset] != 'x' && data[offset] != 'X') {
        proc_entry->state = myhtml_data_process_state_ampersand_hash_data;
        return offset;
    }

    myhtml_data_process_string_append_char(str, data[offset]);
    proc_entry->state = myhtml_data_process_state_ampersand_hash_x_data;

    return offset + 1;
}
194
/*
 * State handler: digits of a decimal numeric reference ("&#NNN").
 *
 * Digits are accumulated into proc_entry->tmp_num. On the first non-digit
 * byte the state returns to data and:
 *  - if no digit was seen at all, the text is kept literally and the byte
 *    is left for the data state;
 *  - otherwise an optional ';' is consumed and
 *    myhtml_data_process_state_end() replaces the raw "&#NNN" text in
 *    `str` with the decoded character.
 *
 * If the chunk ends while still reading digits, the digits are appended
 * to `str` as-is and the state is kept for the next chunk.
 *
 * Returns the offset of the first byte not consumed.
 */
size_t myhtml_data_process_state_ampersand_hash_data(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    const unsigned char *u_data = (const unsigned char*)data;
    size_t tmp_offset = offset;

    while(offset < size)
    {
        if(mycore_string_chars_num_map[ u_data[offset] ] == 0xff)
        {
            proc_entry->state = myhtml_data_process_state_data;

            /*
             * "No digits" must consider earlier chunks too: digits that ended
             * a previous chunk were already appended to str behind the
             * two-byte "&#" prefix that starts at tmp_str_pos. Looking only
             * at (offset - tmp_offset) would leave "&#12" | ";" undecoded.
             * This is the same length test myhtml_data_process_end() uses.
             */
            if(offset == tmp_offset && str->length == (proc_entry->tmp_str_pos + 2)) {
                MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

                return offset;
            }

            if(data[offset] == ';')
                offset++;

            myhtml_data_process_state_end(proc_entry, str);
            return offset;
        }

        /*
         * Stop accumulating once the value is past the last valid code
         * point, so the number cannot grow without bound on long digit runs.
         */
        if(proc_entry->tmp_num <= 0x10FFFF) {
            proc_entry->tmp_num = mycore_string_chars_num_map[ u_data[offset] ] + proc_entry->tmp_num * 10;
        }

        offset++;
    }

    /* Chunk ended inside the digit run: keep the raw digits in str for now. */
    MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

    return offset;
}
230
/*
 * State handler: digits of a hexadecimal numeric reference ("&#xHHH").
 *
 * Hex digits are accumulated into proc_entry->tmp_num. On the first
 * non-hex byte the state returns to data and:
 *  - if no hex digit was seen at all, the text is kept literally and the
 *    byte is left for the data state;
 *  - otherwise an optional ';' is consumed and
 *    myhtml_data_process_state_end() replaces the raw "&#xHHH" text in
 *    `str` with the decoded character.
 *
 * If the chunk ends while still reading digits, the digits are appended
 * to `str` as-is and the state is kept for the next chunk.
 *
 * Returns the offset of the first byte not consumed.
 */
size_t myhtml_data_process_state_ampersand_hash_x_data(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t offset, size_t size)
{
    unsigned const char *u_data = (unsigned const char*)data;
    size_t tmp_offset = offset;

    while(offset < size)
    {
        if(mycore_string_chars_hex_map[ u_data[offset] ] == 0xff)
        {
            proc_entry->state = myhtml_data_process_state_data;

            /*
             * "No digits" must consider earlier chunks too: digits that ended
             * a previous chunk were already appended to str behind the
             * three-byte "&#x" prefix that starts at tmp_str_pos. Looking
             * only at (offset - tmp_offset) would leave "&#x41" | ";"
             * undecoded. This is the same length test
             * myhtml_data_process_end() uses.
             */
            if(offset == tmp_offset && str->length == (proc_entry->tmp_str_pos + 3)) {
                MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

                return offset;
            }

            if(data[offset] == ';')
                offset++;

            myhtml_data_process_state_end(proc_entry, str);
            return offset;
        }

        /*
         * Stop accumulating once the value is past the last valid code
         * point, so the shift cannot push bits out on long digit runs.
         */
        if(proc_entry->tmp_num <= 0x10FFFF) {
            proc_entry->tmp_num <<= 4;
            proc_entry->tmp_num |= mycore_string_chars_hex_map[ u_data[offset] ];
        }

        offset++;
    }

    /* Chunk ended inside the digit run: keep the raw digits in str for now. */
    MyHTML_DATA_PROCESS_APPEND_WITH_PREPROCESSING()

    return offset;
}
267
/*
 * Finish a numeric character reference.
 *
 * The accumulated number in proc_entry->tmp_num is normalised:
 *  - values up to 0x9F are mapped through the replacement_character table;
 *  - surrogates (0xD800..0xDFFF) and values above 0x10FFFF become
 *    replacement_character[0].
 * The result is written as UTF-8 at proc_entry->tmp_str_pos, which
 * overwrites the raw reference text, and `str` is cut and NUL-terminated
 * right after it.
 */
void myhtml_data_process_state_end(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str)
{
    /* up to 4 UTF-8 bytes plus the terminating \0 */
    MyCORE_STRING_REALLOC_IF_NEED(str, 5, 0);

    if(proc_entry->tmp_num <= 0x9F) {
        proc_entry->tmp_num = replacement_character[proc_entry->tmp_num];
    }
    else if((proc_entry->tmp_num >= 0xD800 && proc_entry->tmp_num <= 0xDFFF) ||
            proc_entry->tmp_num > 0x10FFFF)
    {
        proc_entry->tmp_num = replacement_character[0];
    }

    str->length = proc_entry->tmp_str_pos +
                  myencoding_codepoint_to_ascii_utf_8(proc_entry->tmp_num, &str->data[proc_entry->tmp_str_pos]);

    str->data[str->length] = '\0';
}
285
/*
 * Feed one chunk of input through the state machine.
 *
 * The current handler in proc_entry->state is called repeatedly; each
 * call returns the offset at which the next call resumes, until the whole
 * chunk of `size` bytes has been consumed. State carried in *proc_entry
 * lets a following chunk continue where this one stopped.
 */
void myhtml_data_process(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str, const char* data, size_t size)
{
    for(size_t offset = 0; offset < size; ) {
        offset = proc_entry->state(proc_entry, str, data, offset, size);
    }
}
294
/*
 * Finalise processing after the last chunk.
 *
 * Resolves a reference that was still open when the input ended:
 *  - decimal / hexadecimal numeric reference: decoded through
 *    myhtml_data_process_state_end(), but only if `str` holds something
 *    past the "&#" (2 bytes) / "&#x" (3 bytes) prefix at tmp_str_pos;
 *  - named reference with a remembered match (charef_res.last_entry): its
 *    code points are written as UTF-8 at tmp_str_pos and `str` is cut
 *    right after them.
 * In every other state nothing is done.
 */
void myhtml_data_process_end(myhtml_data_process_entry_t* proc_entry, mycore_string_t* str)
{
    if(proc_entry->state == myhtml_data_process_state_ampersand_hash_data) {
        if(str->length != (proc_entry->tmp_str_pos + 2)) {
            myhtml_data_process_state_end(proc_entry, str);
        }

        return;
    }

    if(proc_entry->state == myhtml_data_process_state_ampersand_hash_x_data) {
        if(str->length != (proc_entry->tmp_str_pos + 3)) {
            myhtml_data_process_state_end(proc_entry, str);
        }

        return;
    }

    if(proc_entry->state != myhtml_data_process_state_ampersand_data ||
       proc_entry->charef_res.last_entry == NULL)
    {
        return;
    }

    const charef_entry_t *matched = proc_entry->charef_res.last_entry;
    size_t idx = 0;

    while(idx < matched->codepoints_len) {
        MyCORE_STRING_REALLOC_IF_NEED(str, 5, 0);

        proc_entry->tmp_str_pos += myencoding_codepoint_to_ascii_utf_8(matched->codepoints[idx], &str->data[proc_entry->tmp_str_pos]);
        idx++;
    }

    str->length = proc_entry->tmp_str_pos;
    str->data[str->length] = '\0';
}
319
320
321