1 /*
2  Copyright (C) 2015-2017 Alexander Borisov
3 
4  This library is free software; you can redistribute it and/or
5  modify it under the terms of the GNU Lesser General Public
6  License as published by the Free Software Foundation; either
7  version 2.1 of the License, or (at your option) any later version.
8 
9  This library is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  Lesser General Public License for more details.
13 
14  You should have received a copy of the GNU Lesser General Public
15  License along with this library; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 
18  Author: lex.borisov@gmail.com (Alexander Borisov)
19 */
20 
21 #include "myhtml/mystring.h"
22 #include "mycore/utils/resources.h"
23 
/*
 * Append `length` bytes from `buff` to `str`, rewriting line endings and,
 * optionally, NUL bytes on the way.
 *
 *  - CR (0x0D) is stored as LF (0x0A); an LF that directly follows the CR
 *    inside this buffer is consumed together with it (CR LF -> LF).
 *  - NUL (0x00) is stored as the three UTF-8 bytes of U+FFFD (EF BF BD)
 *    when emit_null_chars is false; otherwise it is copied unchanged.
 *  - Every other byte is copied as is.
 *
 * The appended data is always terminated with '\0' (not counted in
 * str->length).
 *
 * Returns the new str->length when the last byte of `buff` is a CR, and 0
 * in every other case.
 * NOTE(review): the non-zero result presumably lets the caller detect an LF
 * at the start of the next chunk -- confirm against the callers.
 * NOTE(review): the result of mycore_string_realloc() is not checked here.
 */
size_t myhtml_string_append_with_preprocessing(mycore_string_t* str, const char* buff, size_t length, bool emit_null_chars)
{
    /* Room for every input byte plus the terminator. */
    MyCORE_STRING_REALLOC_IF_NEED(str, (length + 1), 0);

    const unsigned char *src = (const unsigned char*)buff;
    unsigned char *dst = (unsigned char*)str->data;

    size_t pos = 0;

    while(pos < length)
    {
        const unsigned char ch = src[pos];

        if(ch == 0x0D) {
            dst[str->length] = 0x0A;

            if((pos + 1) >= length) {
                /* CR is the final byte: terminate and report the position. */
                str->length++;

                MyCORE_STRING_REALLOC_IF_NEED(str, 0, 2);
                str->data[str->length] = '\0';

                return str->length;
            }

            /* Swallow the LF of a CR LF pair. */
            if(src[pos + 1] == 0x0A) {
                pos++;
            }
        }
        else if(ch == 0x00 && emit_null_chars == false) {
            /* The replacement needs two bytes more than the input byte did;
               the buffer may move, so refresh the write pointer. */
            mycore_string_realloc(str, (str->size + 5));
            dst = (unsigned char*)str->data;

            /* U+FFFD REPLACEMENT CHARACTER in UTF-8 */
            const size_t at = str->length;

            dst[at]     = 0xEF;
            dst[at + 1] = 0xBF;
            dst[at + 2] = 0xBD;

            /* The third byte is accounted for by the shared increment below. */
            str->length = at + 2;
        }
        else {
            dst[str->length] = ch;
        }

        str->length++;
        pos++;
    }

    str->data[str->length] = '\0';

    return 0;
}
72 
/*
 * Same as myhtml_string_append_with_preprocessing(), except that ordinary
 * bytes are passed through mycore_string_chars_lowercase_map before being
 * stored.
 *
 *  - CR (0x0D) is stored as LF (0x0A); an LF that directly follows the CR
 *    inside this buffer is consumed together with it.
 *  - NUL (0x00) is stored as the UTF-8 bytes of U+FFFD (EF BF BD) when
 *    emit_null_chars is false; otherwise it goes through the lowercase map
 *    like any other byte.
 *
 * The appended data is always terminated with '\0' (not counted in
 * str->length).
 *
 * Returns the new str->length when the last byte of `buff` is a CR, and 0
 * in every other case.
 * NOTE(review): the result of mycore_string_realloc() is not checked here.
 */
size_t myhtml_string_append_lowercase_with_preprocessing(mycore_string_t* str, const char* buff, size_t length, bool emit_null_chars)
{
    /* Room for every input byte plus the terminator. */
    MyCORE_STRING_REALLOC_IF_NEED(str, (length + 1), 0);

    const unsigned char *input = (const unsigned char*)buff;
    unsigned char *out = (unsigned char*)str->data;

    size_t idx = 0;

    while(idx < length)
    {
        const unsigned char byte = input[idx];

        if(byte == 0x0D) {
            out[str->length] = 0x0A;

            if((idx + 1) >= length) {
                /* Trailing CR: finish the string and hand back its length. */
                str->length++;

                MyCORE_STRING_REALLOC_IF_NEED(str, 0, 2);
                str->data[str->length] = '\0';

                return str->length;
            }

            /* CR LF collapses into the single LF already written. */
            if(input[idx + 1] == 0x0A) {
                idx++;
            }
        }
        else if(byte == 0x00 && emit_null_chars == false) {
            /* Grow for the extra replacement bytes; the buffer may move. */
            mycore_string_realloc(str, (str->size + 5));
            out = (unsigned char*)str->data;

            /* U+FFFD REPLACEMENT CHARACTER in UTF-8 */
            const size_t at = str->length;

            out[at]     = 0xEF;
            out[at + 1] = 0xBF;
            out[at + 2] = 0xBD;

            /* The last byte is counted by the shared increment below. */
            str->length = at + 2;
        }
        else {
            out[str->length] = mycore_string_chars_lowercase_map[byte];
        }

        str->length++;
        idx++;
    }

    str->data[str->length] = '\0';

    return 0;
}
119 
/*
 * Tell the caller how many leading bytes of `buff` to skip before appending.
 *
 * Returns 1 when last_position is non-zero, the buffer is not empty and its
 * first byte is '\n'; returns 0 otherwise. `str` is not used.
 * NOTE(review): last_position is presumably the non-zero value returned by
 * an append call whose input ended in CR -- confirm against the callers.
 */
size_t myhtml_string_before_append_any_preprocessing(mycore_string_t* str, const char* buff, size_t length, size_t last_position)
{
    (void)str;

    if(last_position != 0 && length != 0 && *buff == '\n') {
        return 1;
    }

    return 0;
}
130 
131 /////////////////////////////////////////////////////////
132 //// Append With Convert Encoding and Preprocessing API
133 ////
134 /////////////////////////////////////////////////////////
/*
 * One-shot variant of
 * myhtml_string_append_chunk_with_convert_encoding_with_preprocessing():
 * runs the chunk routine with a local decoder state that has just been
 * passed through myencoding_result_clean().
 *
 * Returns whatever the chunk routine returns.
 */
size_t myhtml_string_append_with_convert_encoding_with_preprocessing(mycore_string_t* str, const char* buff, size_t length, myencoding_t encoding, bool emit_null_chars)
{
    myencoding_result_t decoder_state;

    myencoding_result_clean(&decoder_state);

    return myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(
        str, &decoder_state, buff, length, encoding, emit_null_chars);
}
142 
/*
 * Decode `length` bytes of `buff` with the decoder selected by `encoding`,
 * append each resulting code point to `str` through
 * myencoding_codepoint_to_ascii_utf_8(), and preprocess single-byte results:
 *
 *  - '\r' is stored as '\n'; if the next raw input byte is '\n' it is
 *    skipped without being passed to the decoder.
 *  - 0x00 is stored as the UTF-8 bytes of U+FFFD (EF BF BD) when
 *    emit_null_chars is false.
 *
 * `res` carries the decoder state between calls. Input bytes for which the
 * decoder does not report MyENCODING_STATUS_OK produce no output.
 *
 * Returns the new str->length when a '\r' was produced by the last input
 * byte, and 0 in every other case. The string is '\0'-terminated either way.
 * NOTE(review): the LF look-ahead inspects the raw byte stream, not decoded
 * code points -- confirm this is intended for multi-byte encodings.
 * NOTE(review): the result of mycore_string_realloc() is not checked here.
 */
size_t myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(mycore_string_t* str, myencoding_result_t* res, const char* buff, size_t length, myencoding_t encoding, bool emit_null_chars)
{
    MyCORE_STRING_REALLOC_IF_NEED(str, (length + 1), 0);

    const unsigned char *src = (const unsigned char*)buff;
    const myencoding_custom_f decode = myencoding_get_function_by_id(encoding);

    for(size_t pos = 0; pos < length; pos++)
    {
        /* The decoder has not produced a code point for this byte. */
        if(decode(src[pos], res) != MyENCODING_STATUS_OK) {
            continue;
        }

        MyCORE_STRING_REALLOC_IF_NEED(str, 5, 1);

        size_t written = myencoding_codepoint_to_ascii_utf_8(res->result, &str->data[str->length]);

        if(written == 1) {
            const char produced = str->data[str->length];

            if(produced == '\r') {
                str->data[str->length] = '\n';

                if((pos + 1) >= length) {
                    /* Input ends on CR: terminate and report the position. */
                    str->length++;

                    MyCORE_STRING_REALLOC_IF_NEED(str, 0, 2);
                    str->data[str->length] = '\0';

                    return str->length;
                }

                /* Drop the LF of a CR LF pair. */
                if(src[pos + 1] == '\n') {
                    pos++;
                }
            }
            else if(produced == 0x00 && emit_null_chars == false) {
                mycore_string_realloc(str, (str->size + 5));

                /* U+FFFD REPLACEMENT CHARACTER in UTF-8 */
                const size_t at = str->length;

                str->data[at]     = (char)0xEF;
                str->data[at + 1] = (char)0xBF;
                str->data[at + 2] = (char)0xBD;

                /* The third byte is counted by `written` below. */
                str->length = at + 2;
            }
        }

        str->length += written;
    }

    MyCORE_STRING_APPEND_BYTE_WITHOUT_INCREMENT('\0', str, 1);

    return 0;
}
193 
/*
 * One-shot variant of
 * myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing():
 * runs the chunk routine with a local decoder state that has just been
 * passed through myencoding_result_clean().
 *
 * Returns whatever the chunk routine returns.
 */
size_t myhtml_string_append_lowercase_with_convert_encoding_with_preprocessing(mycore_string_t* str, const char* buff, size_t length, myencoding_t encoding, bool emit_null_chars)
{
    myencoding_result_t decoder_state;

    myencoding_result_clean(&decoder_state);

    return myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(
        str, &decoder_state, buff, length, encoding, emit_null_chars);
}
201 
/*
 * Decode `length` bytes of `buff` with the decoder selected by `encoding`,
 * append each resulting code point to `str` through
 * myencoding_codepoint_to_lowercase_ascii_utf_8(), and preprocess
 * single-byte results:
 *
 *  - '\r' is stored as '\n'; if the next raw input byte is '\n' it is
 *    skipped without being passed to the decoder.
 *  - 0x00 is stored as the UTF-8 bytes of U+FFFD (EF BF BD) when
 *    emit_null_chars is false.
 *
 * `res` carries the decoder state between calls. Input bytes for which the
 * decoder does not report MyENCODING_STATUS_OK produce no output.
 *
 * Returns the new str->length when a '\r' was produced by the last input
 * byte, and 0 in every other case. The string is '\0'-terminated either way.
 * NOTE(review): the LF look-ahead inspects the raw byte stream, not decoded
 * code points -- confirm this is intended for multi-byte encodings.
 * NOTE(review): the result of mycore_string_realloc() is not checked here.
 */
size_t myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(mycore_string_t* str, myencoding_result_t* res, const char* buff, size_t length, myencoding_t encoding, bool emit_null_chars)
{
    MyCORE_STRING_REALLOC_IF_NEED(str, (length + 1), 0);

    const unsigned char *input = (const unsigned char*)buff;
    const myencoding_custom_f decode = myencoding_get_function_by_id(encoding);

    for(size_t idx = 0; idx < length; idx++)
    {
        /* Nothing to emit until the decoder reports a complete code point. */
        if(decode(input[idx], res) != MyENCODING_STATUS_OK) {
            continue;
        }

        MyCORE_STRING_REALLOC_IF_NEED(str, 5, 1);

        size_t emitted = myencoding_codepoint_to_lowercase_ascii_utf_8(res->result, &str->data[str->length]);

        if(emitted == 1) {
            const char first = str->data[str->length];

            if(first == '\r') {
                str->data[str->length] = '\n';

                if((idx + 1) >= length) {
                    /* Trailing CR: finish the string and hand back its length. */
                    str->length++;

                    MyCORE_STRING_REALLOC_IF_NEED(str, 0, 2);
                    str->data[str->length] = '\0';

                    return str->length;
                }

                /* CR LF collapses into the single LF already written. */
                if(input[idx + 1] == '\n') {
                    idx++;
                }
            }
            else if(first == 0x00 && emit_null_chars == false) {
                mycore_string_realloc(str, (str->size + 5));

                /* U+FFFD REPLACEMENT CHARACTER in UTF-8 */
                const size_t at = str->length;

                str->data[at]     = (char)0xEF;
                str->data[at + 1] = (char)0xBF;
                str->data[at + 2] = (char)0xBD;

                /* The last byte is counted by `emitted` below. */
                str->length = at + 2;
            }
        }

        str->length += emitted;
    }

    MyCORE_STRING_APPEND_BYTE_WITHOUT_INCREMENT('\0', str, 1);

    return 0;
}
252