1 /*
2 Copyright (C) 2015-2017 Alexander Borisov
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 2.1 of the License, or (at your option) any later version.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
18 Author: lex.borisov@gmail.com (Alexander Borisov)
19 */
20
21 #include "myhtml/mystring.h"
22 #include "mycore/utils/resources.h"
23
/*
 * Appends `length` bytes from `buff` to `str`, rewriting some bytes on the way:
 *   - CR (0x0D) is stored as LF (0x0A); when the byte right after the CR in
 *     this buffer is LF (0x0A), that LF is consumed too, so CR LF yields a
 *     single LF;
 *   - NUL (0x00) is stored as the three bytes 0xEF 0xBF 0xBD (UTF-8 for
 *     U+FFFD) unless `emit_null_chars` is true, in which case it is copied;
 *   - any other byte is copied unchanged.
 * After the loop str->data is terminated with '\0' at index str->length.
 *
 * Returns the updated str->length when the final byte of `buff` is a CR
 * (no following byte was available to test for LF), otherwise 0.
 *
 * If the string cannot be grown the function stops early and returns 0;
 * when the very first reservation fails nothing is written at all.
 *
 * NOTE(review): failure detection compares str->size with the requested size,
 * i.e. it assumes mycore_string_realloc() updates str->size on success -- confirm.
 */
size_t myhtml_string_append_with_preprocessing(mycore_string_t* str, const char* buff, size_t length, bool emit_null_chars)
{
    /* One output byte per input byte plus the terminator. NUL replacement
       needs more than that and is reserved for lazily inside the loop. */
    const size_t required = str->length + length + 1;

    if(required >= str->size) {
        mycore_string_realloc(str, required);

        if(str->size < required)
            return 0;
    }

    unsigned char *out = (unsigned char*)str->data;
    const unsigned char *in = (const unsigned char*)buff;
    bool ends_with_cr = false;

    for(size_t i = 0; i < length; i++)
    {
        const unsigned char c = in[i];

        if(c == 0x0D) {
            out[str->length] = 0x0A;
            str->length++;

            if((i + 1) == length) {
                /* CR is the last byte: the LF check cannot be done here */
                ends_with_cr = true;
                break;
            }

            if(in[i + 1] == 0x0A)
                i++;
        }
        else if(c == 0x00 && emit_null_chars == false) {
            /* bytes not yet consumed, including the current one */
            const size_t remaining = length - i;

            /* 3 replacement bytes + the rest of the input + terminator */
            if((str->length + remaining + 3) > str->size) {
                /* Each NUL costs two bytes more than it consumes. Count the
                   NULs still ahead so that a single growth covers them all. */
                size_t nul_count = 0;

                for(size_t j = i; j < length; j++) {
                    if(in[j] == 0x00)
                        nul_count++;
                }

                const size_t wanted = str->length + remaining + (nul_count * 2) + 1;
                mycore_string_realloc(str, wanted);

                if(str->size < wanted) {
                    /* growth failed: keep what has been written so far */
                    str->data[str->length] = '\0';
                    return 0;
                }

                /* the buffer may have moved */
                out = (unsigned char*)str->data;
            }

            /* Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD) */
            out[str->length] = 0xEF; str->length++;
            out[str->length] = 0xBF; str->length++;
            out[str->length] = 0xBD; str->length++;
        }
        else {
            out[str->length] = c;
            str->length++;
        }
    }

    str->data[str->length] = '\0';

    return ends_with_cr ? str->length : 0;
}
72
/*
 * Appends `length` bytes from `buff` to `str`, rewriting bytes on the way:
 *   - CR (0x0D) is stored as LF (0x0A); when the byte right after the CR in
 *     this buffer is LF (0x0A), that LF is consumed too, so CR LF yields a
 *     single LF;
 *   - NUL (0x00) is stored as the three bytes 0xEF 0xBF 0xBD (UTF-8 for
 *     U+FFFD) unless `emit_null_chars` is true;
 *   - every other byte (including NUL when `emit_null_chars` is true) is
 *     translated through mycore_string_chars_lowercase_map.
 * After the loop str->data is terminated with '\0' at index str->length.
 *
 * Returns the updated str->length when the final byte of `buff` is a CR
 * (no following byte was available to test for LF), otherwise 0.
 *
 * If the string cannot be grown the function stops early and returns 0;
 * when the very first reservation fails nothing is written at all.
 *
 * NOTE(review): failure detection compares str->size with the requested size,
 * i.e. it assumes mycore_string_realloc() updates str->size on success -- confirm.
 */
size_t myhtml_string_append_lowercase_with_preprocessing(mycore_string_t* str, const char* buff, size_t length, bool emit_null_chars)
{
    /* One output byte per input byte plus the terminator. NUL replacement
       needs more than that and is reserved for lazily inside the loop. */
    const size_t required = str->length + length + 1;

    if(required >= str->size) {
        mycore_string_realloc(str, required);

        if(str->size < required)
            return 0;
    }

    unsigned char *out = (unsigned char*)str->data;
    const unsigned char *in = (const unsigned char*)buff;
    bool ends_with_cr = false;

    for(size_t i = 0; i < length; i++)
    {
        const unsigned char c = in[i];

        if(c == 0x0D) {
            out[str->length] = 0x0A;
            str->length++;

            if((i + 1) == length) {
                /* CR is the last byte: the LF check cannot be done here */
                ends_with_cr = true;
                break;
            }

            if(in[i + 1] == 0x0A)
                i++;
        }
        else if(c == 0x00 && emit_null_chars == false) {
            /* bytes not yet consumed, including the current one */
            const size_t remaining = length - i;

            /* 3 replacement bytes + the rest of the input + terminator */
            if((str->length + remaining + 3) > str->size) {
                /* Each NUL costs two bytes more than it consumes. Count the
                   NULs still ahead so that a single growth covers them all. */
                size_t nul_count = 0;

                for(size_t j = i; j < length; j++) {
                    if(in[j] == 0x00)
                        nul_count++;
                }

                const size_t wanted = str->length + remaining + (nul_count * 2) + 1;
                mycore_string_realloc(str, wanted);

                if(str->size < wanted) {
                    /* growth failed: keep what has been written so far */
                    str->data[str->length] = '\0';
                    return 0;
                }

                /* the buffer may have moved */
                out = (unsigned char*)str->data;
            }

            /* Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD) */
            out[str->length] = 0xEF; str->length++;
            out[str->length] = 0xBF; str->length++;
            out[str->length] = 0xBD; str->length++;
        }
        else {
            out[str->length] = mycore_string_chars_lowercase_map[c];
            str->length++;
        }
    }

    str->data[str->length] = '\0';

    return ends_with_cr ? str->length : 0;
}
119
/*
 * Reports how many leading bytes of `buff` should be skipped before appending.
 *
 * Returns 1 only when `last_position` is non-zero, `length` is non-zero and
 * the first byte of `buff` is '\n'; returns 0 in every other case. `buff` is
 * not read when `last_position` or `length` is zero. `str` is not consulted.
 *
 * NOTE(review): presumably `last_position` is the value returned by the
 * previous append call (non-zero when that buffer ended with CR), so that the
 * LF of a CR LF pair split across two buffers is dropped -- confirm with callers.
 */
size_t myhtml_string_before_append_any_preprocessing(mycore_string_t* str, const char* buff, size_t length, size_t last_position)
{
    (void)str;

    if(last_position != 0 && length != 0 && *buff == '\n') {
        return 1;
    }

    return 0;
}
130
131 /////////////////////////////////////////////////////////
132 //// Append With Convert Encoding and Preprocessing API
133 ////
134 /////////////////////////////////////////////////////////
/*
 * One-shot variant of the chunked converting append: sets up a fresh decoder
 * result with myencoding_result_clean() and forwards all arguments to
 * myhtml_string_append_chunk_with_convert_encoding_with_preprocessing().
 *
 * The decoder result is local to this call and is discarded on return.
 * Returns whatever the chunked function returns.
 */
size_t myhtml_string_append_with_convert_encoding_with_preprocessing(mycore_string_t* str, const char* buff, size_t length, myencoding_t encoding, bool emit_null_chars)
{
    myencoding_result_t decoder_state;
    myencoding_result_clean(&decoder_state);

    const size_t pending = myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &decoder_state, buff, length,
                                                                                               encoding, emit_null_chars);
    return pending;
}
142
/*
 * Decodes `buff` byte by byte with the decoder returned by
 * myencoding_get_function_by_id(encoding) and appends the result to `str`.
 *
 * Every time the decoder reports MyENCODING_STATUS_OK, the code point in
 * res->result is written at the end of `str` by
 * myencoding_codepoint_to_ascii_utf_8(). When that produced exactly one byte:
 *   - '\r' is replaced by '\n'; if the next raw input byte is '\n' it is
 *     skipped, so CR LF yields a single LF;
 *   - 0x00 is replaced by 0xEF 0xBF 0xBD (UTF-8 for U+FFFD) unless
 *     `emit_null_chars` is true.
 * Bytes for which the decoder does not report MyENCODING_STATUS_OK append nothing.
 *
 * `res` is handed to the decoder on every byte and is owned by the caller.
 * NOTE(review): presumably it carries partial-sequence state between
 * consecutive chunks -- confirm.
 *
 * Returns the updated str->length when a CR was produced on the last byte of
 * `buff`, otherwise 0. `str` is '\0'-terminated at str->length in both cases.
 */
size_t myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(mycore_string_t* str, myencoding_result_t* res, const char* buff, size_t length, myencoding_t encoding, bool emit_null_chars)
{
    MyCORE_STRING_REALLOC_IF_NEED(str, (length + 1), 0);

    const unsigned char *u_buff = (const unsigned char*)buff;
    const myencoding_custom_f func = myencoding_get_function_by_id(encoding);

    for(size_t i = 0; i < length; i++)
    {
        /* nothing to emit until the decoder has a complete code point */
        if(func(u_buff[i], res) != MyENCODING_STATUS_OK)
            continue;

        /* reserve room before the encoder writes at the end of the string */
        MyCORE_STRING_REALLOC_IF_NEED(str, 5, 1);

        size_t len = myencoding_codepoint_to_ascii_utf_8(res->result, &str->data[str->length]);

        if(len == 1) {
            if(str->data[str->length] == '\r') {
                str->data[str->length] = '\n';

                if((i + 1) < length) {
                    /* NOTE(review): this inspects the next raw input byte, not
                       the next decoded code point -- confirm this is intended
                       for encodings that use more than one byte per LF. */
                    if(buff[(i + 1)] == '\n')
                        i++;
                }
                else {
                    /* CR on the last input byte: finish here and report it */
                    str->length++;

                    MyCORE_STRING_REALLOC_IF_NEED(str, 0, 2);
                    str->data[str->length] = '\0';

                    return str->length;
                }
            }
            else if(str->data[str->length] == 0x00 && emit_null_chars == false)
            {
                /* Grow only when the three replacement bytes would not fit,
                   instead of reallocating on every NUL. */
                if((str->length + 3) >= str->size) {
                    mycore_string_realloc(str, (str->length + 4));
                }

                /* Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD);
                   the third byte is accounted for by `len` below. */
                str->data[str->length] = (char)0xEF; str->length++;
                str->data[str->length] = (char)0xBF; str->length++;
                str->data[str->length] = (char)0xBD;
            }
        }

        str->length += len;
    }

    MyCORE_STRING_APPEND_BYTE_WITHOUT_INCREMENT('\0', str, 1);

    return 0;
}
193
/*
 * One-shot variant of the chunked lowercase converting append: sets up a
 * fresh decoder result with myencoding_result_clean() and forwards all
 * arguments to
 * myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing().
 *
 * The decoder result is local to this call and is discarded on return.
 * Returns whatever the chunked function returns.
 */
size_t myhtml_string_append_lowercase_with_convert_encoding_with_preprocessing(mycore_string_t* str, const char* buff, size_t length, myencoding_t encoding, bool emit_null_chars)
{
    myencoding_result_t decoder_state;
    myencoding_result_clean(&decoder_state);

    const size_t pending = myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(str, &decoder_state, buff, length,
                                                                                                         encoding, emit_null_chars);
    return pending;
}
201
/*
 * Decodes `buff` byte by byte with the decoder returned by
 * myencoding_get_function_by_id(encoding) and appends the result to `str`.
 *
 * Every time the decoder reports MyENCODING_STATUS_OK, the code point in
 * res->result is written at the end of `str` by
 * myencoding_codepoint_to_lowercase_ascii_utf_8(). When that produced exactly
 * one byte:
 *   - '\r' is replaced by '\n'; if the next raw input byte is '\n' it is
 *     skipped, so CR LF yields a single LF;
 *   - 0x00 is replaced by 0xEF 0xBF 0xBD (UTF-8 for U+FFFD) unless
 *     `emit_null_chars` is true.
 * Bytes for which the decoder does not report MyENCODING_STATUS_OK append nothing.
 *
 * `res` is handed to the decoder on every byte and is owned by the caller.
 * NOTE(review): presumably it carries partial-sequence state between
 * consecutive chunks -- confirm.
 *
 * Returns the updated str->length when a CR was produced on the last byte of
 * `buff`, otherwise 0. `str` is '\0'-terminated at str->length in both cases.
 */
size_t myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(mycore_string_t* str, myencoding_result_t* res, const char* buff, size_t length, myencoding_t encoding, bool emit_null_chars)
{
    MyCORE_STRING_REALLOC_IF_NEED(str, (length + 1), 0);

    const unsigned char *u_buff = (const unsigned char*)buff;
    const myencoding_custom_f func = myencoding_get_function_by_id(encoding);

    for(size_t i = 0; i < length; i++)
    {
        /* nothing to emit until the decoder has a complete code point */
        if(func(u_buff[i], res) != MyENCODING_STATUS_OK)
            continue;

        /* reserve room before the encoder writes at the end of the string */
        MyCORE_STRING_REALLOC_IF_NEED(str, 5, 1);

        size_t len = myencoding_codepoint_to_lowercase_ascii_utf_8(res->result, &str->data[str->length]);

        if(len == 1) {
            if(str->data[str->length] == '\r') {
                str->data[str->length] = '\n';

                if((i + 1) < length) {
                    /* NOTE(review): this inspects the next raw input byte, not
                       the next decoded code point -- confirm this is intended
                       for encodings that use more than one byte per LF. */
                    if(buff[(i + 1)] == '\n')
                        i++;
                }
                else {
                    /* CR on the last input byte: finish here and report it */
                    str->length++;

                    MyCORE_STRING_REALLOC_IF_NEED(str, 0, 2);
                    str->data[str->length] = '\0';

                    return str->length;
                }
            }
            else if(str->data[str->length] == 0x00 && emit_null_chars == false)
            {
                /* Grow only when the three replacement bytes would not fit,
                   instead of reallocating on every NUL. */
                if((str->length + 3) >= str->size) {
                    mycore_string_realloc(str, (str->length + 4));
                }

                /* Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD);
                   the third byte is accounted for by `len` below. */
                str->data[str->length] = (char)0xEF; str->length++;
                str->data[str->length] = (char)0xBF; str->length++;
                str->data[str->length] = (char)0xBD;
            }
        }

        str->length += len;
    }

    MyCORE_STRING_APPEND_BYTE_WITHOUT_INCREMENT('\0', str, 1);

    return 0;
}
252