1 /*
2 * libwbxml, the WBXML Library.
3 * Copyright (C) 2002-2008 Aymerick Jehanne <aymerick@jehanne.org>
4 * Copyright (C) 2011 Michael Bell <michael.bell@opensync.org>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * LGPL v2.1: http://www.gnu.org/copyleft/lesser.txt
21 *
22 * Contact: aymerick@jehanne.org
23 * Home: http://libwbxml.aymerick.com
24 */
25
26 /**
27 * @file wbxml_charset.c
28 * @ingroup wbxml_charset
29 *
30 * @author Aymerick Jehanne <aymerick@jehanne.org>
31 * @date 04/03/24
32 *
33 * @brief Charset Functions
34 */
35
36 #include "wbxml_charset.h"
37 #include "wbxml_internals.h"
38
39 /* Structures */
40
41 /** WBXML Charset */
42 typedef struct WBXMLCharsetEntry_s {
43 const WB_TINY *name; /**< Charset Name */
44 WBXMLCharsetMIBEnum mib_enum; /**< Charset MIBEnum Value */
45 } WBXMLCharsetEntry;
46
47
48 /* Globals */
49
50 /**
51 * @brief Charset table
52 * @note From http://www.iana.org/assignments/character-sets
53 */
54 static const WBXMLCharsetEntry wbxml_charset_entries[] =
55 {
56 { "US-ASCII", WBXML_CHARSET_US_ASCII },
57 { "ISO-8859-1", WBXML_CHARSET_ISO_8859_1 },
58 { "ISO-8859-2", WBXML_CHARSET_ISO_8859_2 },
59 { "ISO-8859-3", WBXML_CHARSET_ISO_8859_3 },
60 { "ISO-8859-4", WBXML_CHARSET_ISO_8859_4 },
61 { "ISO-8859-5", WBXML_CHARSET_ISO_8859_5 },
62 { "ISO-8859-6", WBXML_CHARSET_ISO_8859_6 },
63 { "ISO-8859-7", WBXML_CHARSET_ISO_8859_7 },
64 { "ISO-8859-8", WBXML_CHARSET_ISO_8859_8 },
65 { "ISO-8859-9", WBXML_CHARSET_ISO_8859_9 },
66 { "Shift_JIS", WBXML_CHARSET_SHIFT_JIS },
67 { "UTF-8", WBXML_CHARSET_UTF_8 },
68 { "ISO-10646-UCS-2", WBXML_CHARSET_ISO_10646_UCS_2 },
69 { "UTF-16", WBXML_CHARSET_UTF_16 },
70 { "Big5", WBXML_CHARSET_BIG5 }
71 };
72
73
74 /* Private Functions Prototypes */
75 static WB_BOOL search_null_block(const WB_TINY *in_buf,
76 WB_ULONG in_buf_len,
77 WB_ULONG block_len,
78 WB_ULONG *out_pos);
79
80
81 /***************************************************
82 * Public Functions
83 */
84
wbxml_charset_get_mib(const WB_TINY * name,WBXMLCharsetMIBEnum * mib_enum)85 WBXML_DECLARE(WB_BOOL) wbxml_charset_get_mib(const WB_TINY *name,
86 WBXMLCharsetMIBEnum *mib_enum)
87 {
88 WB_ULONG i = 0;
89
90 for (i = 0; i < WBXML_TABLE_SIZE(wbxml_charset_entries); i++) {
91 if (WBXML_STRCASECMP(name, wbxml_charset_entries[i].name) == 0) {
92 if (mib_enum != NULL) {
93 *mib_enum = wbxml_charset_entries[i].mib_enum;
94 }
95
96 return TRUE;
97 }
98 }
99
100 return FALSE;
101 }
102
103
wbxml_charset_get_name(WBXMLCharsetMIBEnum mib_enum,const WB_TINY ** name)104 WBXML_DECLARE(WB_BOOL) wbxml_charset_get_name(WBXMLCharsetMIBEnum mib_enum,
105 const WB_TINY **name)
106 {
107 WB_ULONG i = 0;
108
109 for (i = 0; i < WBXML_TABLE_SIZE(wbxml_charset_entries); i++) {
110 if (mib_enum == wbxml_charset_entries[i].mib_enum) {
111 if (name != NULL) {
112 *name = wbxml_charset_entries[i].name;
113 }
114
115 return TRUE;
116 }
117 }
118
119 return FALSE;
120 }
121
122
wbxml_charset_conv(const WB_TINY * in_buf,WB_ULONG * io_bytes,WBXMLCharsetMIBEnum in_charset,WBXMLBuffer ** out_buf,WBXMLCharsetMIBEnum out_charset)123 WBXML_DECLARE(WBXMLError) wbxml_charset_conv(const WB_TINY *in_buf,
124 WB_ULONG *io_bytes,
125 WBXMLCharsetMIBEnum in_charset,
126 WBXMLBuffer **out_buf,
127 WBXMLCharsetMIBEnum out_charset)
128 {
129 /**************************************************
130 * First, check for simple US-ASCII / UTF-8 cases
131 */
132
133 /* Are we dealing with US-ASCII or UTF-8 ? */
134 if (((in_charset == WBXML_CHARSET_US_ASCII) || (in_charset == WBXML_CHARSET_UTF_8)) &&
135 ((out_charset == WBXML_CHARSET_US_ASCII) || (out_charset == WBXML_CHARSET_UTF_8)))
136 {
137 /* Create a static buffer */
138 if ((*out_buf = wbxml_buffer_sta_create_from_cstr(in_buf)) == NULL) {
139 return WBXML_ERROR_NOT_ENOUGH_MEMORY;
140 }
141
142 /* US-ASCII and UTF-8 are NULL terminated */
143 *io_bytes -= WBXML_STRLEN(in_buf) + 1;
144
145 return WBXML_OK;
146 }
147
148 /**************************************
149 * Ok guys, we really have to convert
150 */
151
152 #if defined( HAVE_ICONV )
153
154 {
155 /**********************
156 * The iconv way
157 */
158
159 const WB_TINY * inbuf_pos = NULL;
160 WB_TINY **__restrict__ inbuf_ref = NULL;
161 const WB_TINY * charset_to = NULL;
162 const WB_TINY * charset_from = NULL;
163 WB_TINY * tmp_buf = NULL;
164 WB_TINY * tmp_ptr = NULL;
165 WB_ULONG tmp_buf_len = 0;
166 WB_ULONG tmp_len_left = 0;
167 WBXMLError ret = WBXML_OK;
168 iconv_t cd = 0;
169 WB_UTINY last_char = 0;
170
171 /* Get Charsets names */
172 if (!wbxml_charset_get_name(in_charset, &charset_from)) {
173 return WBXML_ERROR_CHARSET_UNKNOWN;
174 }
175
176 if (!wbxml_charset_get_name(out_charset, &charset_to)) {
177 return WBXML_ERROR_CHARSET_UNKNOWN;
178 }
179
180 /* Init iconv */
181 if ((cd = iconv_open(charset_to, charset_from)) == (iconv_t)(-1))
182 {
183 /* Init failed */
184 return WBXML_ERROR_CHARSET_CONV_INIT;
185 }
186
187 /* Allocate maximum result buffer (4 bytes unicode) */
188 tmp_len_left = tmp_buf_len = 4 * (sizeof(WB_TINY) * (*io_bytes));
189
190 if ((tmp_buf = wbxml_malloc(tmp_buf_len)) == NULL) {
191 iconv_close(cd);
192 return WBXML_ERROR_NOT_ENOUGH_MEMORY;
193 }
194
195 tmp_ptr = tmp_buf;
196
197 /* The input buffer is const but not the pointer itself.
198 The original const *inbuf should not be modified for a potential later usage.
199 */
200 inbuf_pos = in_buf;
201 inbuf_ref = (WB_TINY **__restrict__) &inbuf_pos;
202
203 /* Convert ! */
204 (void) iconv(cd,
205 inbuf_ref,
206 (size_t*)io_bytes,
207 &tmp_buf,
208 (size_t*)&tmp_len_left);
209
210 /** @todo Check errno (but it doesn't seems to work on windows) */
211
212 if (tmp_buf_len > tmp_len_left) {
213 /* Create result buffer */
214 if ((*out_buf = wbxml_buffer_create(tmp_ptr,
215 tmp_buf_len - tmp_len_left,
216 tmp_buf_len - tmp_len_left)) == NULL)
217 {
218 /* Not enough memory */
219 ret = WBXML_ERROR_NOT_ENOUGH_MEMORY;
220 }
221
222 /* Remove trailing NULL char */
223 wbxml_buffer_remove_trailing_zeros(*out_buf);
224 }
225 else
226 {
227 /* Not converted */
228 ret = WBXML_ERROR_CHARSET_CONV;
229 }
230
231 /* Shutdown iconv */
232 iconv_close(cd);
233
234 /* Clean-up */
235 wbxml_free(tmp_ptr);
236
237 return ret;
238 }
239
240 #else
241
242 {
243 /***************************************************
244 * Add your own charset conversion function here !
245 */
246
247 return WBXML_ERROR_NO_CHARSET_CONV;
248 }
249
250 #endif /* HAVE_ICONV */
251 }
252
253
wbxml_charset_conv_term(const WB_TINY * in_buf,WB_ULONG * io_bytes,WBXMLCharsetMIBEnum in_charset,WBXMLBuffer ** out_buf,WBXMLCharsetMIBEnum out_charset)254 WBXML_DECLARE(WBXMLError) wbxml_charset_conv_term(const WB_TINY *in_buf,
255 WB_ULONG *io_bytes,
256 WBXMLCharsetMIBEnum in_charset,
257 WBXMLBuffer **out_buf,
258 WBXMLCharsetMIBEnum out_charset)
259 {
260 WB_ULONG buf_len = 0;
261 WB_ULONG new_len = 0;
262 WB_ULONG term_len = 0;
263 WBXMLError ret = WBXML_OK;
264
265 /* Find length of input buffer */
266 switch (in_charset)
267 {
268 case WBXML_CHARSET_ISO_10646_UCS_2 :
269 case WBXML_CHARSET_UTF_16 :
270 /* Terminated by two NULL char ("\0\0") */
271 term_len = 2;
272
273 if (!search_null_block(in_buf, *io_bytes, 2, &buf_len)) {
274 return WBXML_ERROR_CHARSET_STR_LEN;
275 }
276
277 /* Add termination bytes length */
278 buf_len += term_len;
279 break;
280
281 default :
282 /* Terminated by a simple NULL char ('\0') */
283 term_len = 1;
284
285 buf_len = WBXML_STRLEN(in_buf) + term_len;
286 break;
287 }
288
289 /* Check length found */
290 if (buf_len > *io_bytes) {
291 return WBXML_ERROR_CHARSET_STR_LEN;
292 }
293
294 /* Use a temporary length var (because it is decreased) */
295 new_len = buf_len;
296
297 /* Convert ! */
298 ret = wbxml_charset_conv(in_buf,
299 &new_len,
300 in_charset,
301 out_buf,
302 out_charset);
303
304 /* Set input buffer length */
305 *io_bytes = buf_len;
306
307 return ret;
308 }
309
310
311 /***************************************************
312 * Private Functions
313 */
314
315 /**
316 * Binary search of a sequence of NULL bytes in a buffer
317 *
318 * @param in_buf Buffer to search in
319 * @param in_buf_len Length of input buffer
320 * @param block_len Length of the NULL sequence
321 * @param out_pos Index of Sequence into Buffer
322 * @return TRUE if found, FALSE otherwise
323 */
search_null_block(const WB_TINY * in_buf,WB_ULONG in_buf_len,WB_ULONG block_len,WB_ULONG * out_pos)324 static WB_BOOL search_null_block(const WB_TINY *in_buf,
325 WB_ULONG in_buf_len,
326 WB_ULONG block_len,
327 WB_ULONG *out_pos)
328 {
329 WB_ULONG pos = 0;
330 WB_ULONG i = 0;
331
332 for (pos = 0; pos + block_len <= in_buf_len; pos += block_len) {
333 for (i = 0; i < block_len; i++) {
334 if (memcmp(in_buf + pos + i, "\0", 1)) {
335 i = block_len;
336 } else {
337 if (i == block_len -1) {
338 *out_pos = pos;
339 return TRUE;
340 }
341 }
342 }
343 }
344
345 return FALSE;
346 }
347