1 /*
2  * libwbxml, the WBXML Library.
3  * Copyright (C) 2002-2008 Aymerick Jehanne <aymerick@jehanne.org>
4  * Copyright (C) 2011 Michael Bell <michael.bell@opensync.org>
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19  *
20  * LGPL v2.1: http://www.gnu.org/copyleft/lesser.txt
21  *
22  * Contact: aymerick@jehanne.org
23  * Home: http://libwbxml.aymerick.com
24  */
25 
26 /**
27  * @file wbxml_charset.c
28  * @ingroup wbxml_charset
29  *
30  * @author Aymerick Jehanne <aymerick@jehanne.org>
31  * @date 04/03/24
32  *
33  * @brief Charset Functions
34  */
35 
#include "wbxml_charset.h"
#include "wbxml_internals.h"

#include <string.h>
38 
39 /* Structures */
40 
41 /** WBXML Charset */
42 typedef struct WBXMLCharsetEntry_s {
43     const WB_TINY       *name;     /**< Charset Name */
44     WBXMLCharsetMIBEnum  mib_enum; /**< Charset MIBEnum Value */
45 } WBXMLCharsetEntry;
46 
47 
48 /* Globals */
49 
50 /**
51  * @brief Charset table
52  * @note  From http://www.iana.org/assignments/character-sets
53  */
54 static const WBXMLCharsetEntry wbxml_charset_entries[] =
55 {
56     { "US-ASCII",        WBXML_CHARSET_US_ASCII        },
57     { "ISO-8859-1",      WBXML_CHARSET_ISO_8859_1      },
58     { "ISO-8859-2",      WBXML_CHARSET_ISO_8859_2      },
59     { "ISO-8859-3",      WBXML_CHARSET_ISO_8859_3      },
60     { "ISO-8859-4",      WBXML_CHARSET_ISO_8859_4      },
61     { "ISO-8859-5",      WBXML_CHARSET_ISO_8859_5      },
62     { "ISO-8859-6",      WBXML_CHARSET_ISO_8859_6      },
63     { "ISO-8859-7",      WBXML_CHARSET_ISO_8859_7      },
64     { "ISO-8859-8",      WBXML_CHARSET_ISO_8859_8      },
65     { "ISO-8859-9",      WBXML_CHARSET_ISO_8859_9      },
66     { "Shift_JIS",       WBXML_CHARSET_SHIFT_JIS       },
67     { "UTF-8",           WBXML_CHARSET_UTF_8           },
68     { "ISO-10646-UCS-2", WBXML_CHARSET_ISO_10646_UCS_2 },
69     { "UTF-16",          WBXML_CHARSET_UTF_16          },
70     { "Big5",            WBXML_CHARSET_BIG5            }
71 };
72 
73 
74 /* Private Functions Prototypes */
75 static WB_BOOL search_null_block(const WB_TINY *in_buf,
76                                  WB_ULONG       in_buf_len,
77                                  WB_ULONG       block_len,
78                                  WB_ULONG      *out_pos);
79 
80 
81 /***************************************************
82  *    Public Functions
83  */
84 
wbxml_charset_get_mib(const WB_TINY * name,WBXMLCharsetMIBEnum * mib_enum)85 WBXML_DECLARE(WB_BOOL) wbxml_charset_get_mib(const WB_TINY       *name,
86                                              WBXMLCharsetMIBEnum *mib_enum)
87 {
88     WB_ULONG i = 0;
89 
90     for (i = 0; i < WBXML_TABLE_SIZE(wbxml_charset_entries); i++) {
91         if (WBXML_STRCASECMP(name, wbxml_charset_entries[i].name) == 0) {
92             if (mib_enum != NULL) {
93                 *mib_enum = wbxml_charset_entries[i].mib_enum;
94             }
95 
96             return TRUE;
97         }
98     }
99 
100     return FALSE;
101 }
102 
103 
wbxml_charset_get_name(WBXMLCharsetMIBEnum mib_enum,const WB_TINY ** name)104 WBXML_DECLARE(WB_BOOL) wbxml_charset_get_name(WBXMLCharsetMIBEnum   mib_enum,
105                                               const WB_TINY       **name)
106 {
107     WB_ULONG i = 0;
108 
109     for (i = 0; i < WBXML_TABLE_SIZE(wbxml_charset_entries); i++) {
110         if (mib_enum == wbxml_charset_entries[i].mib_enum) {
111             if (name != NULL) {
112                 *name = wbxml_charset_entries[i].name;
113             }
114 
115             return TRUE;
116         }
117     }
118 
119     return FALSE;
120 }
121 
122 
wbxml_charset_conv(const WB_TINY * in_buf,WB_ULONG * io_bytes,WBXMLCharsetMIBEnum in_charset,WBXMLBuffer ** out_buf,WBXMLCharsetMIBEnum out_charset)123 WBXML_DECLARE(WBXMLError) wbxml_charset_conv(const WB_TINY        *in_buf,
124                                              WB_ULONG             *io_bytes,
125                                              WBXMLCharsetMIBEnum   in_charset,
126                                              WBXMLBuffer         **out_buf,
127                                              WBXMLCharsetMIBEnum   out_charset)
128 {
129     /**************************************************
130      * First, check for simple US-ASCII / UTF-8 cases
131      */
132 
133     /* Are we dealing with US-ASCII or UTF-8 ? */
134     if (((in_charset  == WBXML_CHARSET_US_ASCII) || (in_charset  == WBXML_CHARSET_UTF_8)) &&
135         ((out_charset == WBXML_CHARSET_US_ASCII) || (out_charset == WBXML_CHARSET_UTF_8)))
136     {
137         /* Create a static buffer */
138         if ((*out_buf = wbxml_buffer_sta_create_from_cstr(in_buf)) == NULL) {
139             return WBXML_ERROR_NOT_ENOUGH_MEMORY;
140         }
141 
142         /* US-ASCII and UTF-8 are NULL terminated */
143         *io_bytes -= WBXML_STRLEN(in_buf) + 1;
144 
145         return WBXML_OK;
146     }
147 
148     /**************************************
149      * Ok guys, we really have to convert
150      */
151 
152 #if defined( HAVE_ICONV )
153 
154     {
155         /**********************
156          * The iconv way
157          */
158 
159         const WB_TINY * inbuf_pos    = NULL;
160         WB_TINY      **__restrict__ inbuf_ref = NULL;
161         const WB_TINY * charset_to   = NULL;
162         const WB_TINY * charset_from = NULL;
163         WB_TINY       * tmp_buf      = NULL;
164         WB_TINY       * tmp_ptr      = NULL;
165         WB_ULONG        tmp_buf_len  = 0;
166         WB_ULONG        tmp_len_left = 0;
167         WBXMLError      ret          = WBXML_OK;
168         iconv_t         cd           = 0;
169         WB_UTINY        last_char    = 0;
170 
171         /* Get Charsets names */
172         if (!wbxml_charset_get_name(in_charset, &charset_from)) {
173             return WBXML_ERROR_CHARSET_UNKNOWN;
174         }
175 
176         if (!wbxml_charset_get_name(out_charset, &charset_to)) {
177             return WBXML_ERROR_CHARSET_UNKNOWN;
178         }
179 
180         /* Init iconv */
181         if ((cd = iconv_open(charset_to, charset_from)) == (iconv_t)(-1))
182         {
183             /* Init failed */
184             return WBXML_ERROR_CHARSET_CONV_INIT;
185         }
186 
187         /* Allocate maximum result buffer (4 bytes unicode) */
188         tmp_len_left = tmp_buf_len = 4 * (sizeof(WB_TINY) * (*io_bytes));
189 
190         if ((tmp_buf = wbxml_malloc(tmp_buf_len)) == NULL) {
191             iconv_close(cd);
192             return WBXML_ERROR_NOT_ENOUGH_MEMORY;
193         }
194 
195         tmp_ptr = tmp_buf;
196 
197         /* The input buffer is const but not the pointer itself.
198            The original const *inbuf should not be modified for a potential later usage.
199          */
200         inbuf_pos = in_buf;
201         inbuf_ref = (WB_TINY **__restrict__) &inbuf_pos;
202 
203         /* Convert ! */
204         (void) iconv(cd,
205                      inbuf_ref,
206                      (size_t*)io_bytes,
207                      &tmp_buf,
208                      (size_t*)&tmp_len_left);
209 
210         /** @todo Check errno (but it doesn't seems to work on windows) */
211 
212         if (tmp_buf_len > tmp_len_left) {
213             /* Create result buffer */
214             if ((*out_buf = wbxml_buffer_create(tmp_ptr,
215                                                 tmp_buf_len - tmp_len_left,
216                                                 tmp_buf_len - tmp_len_left)) == NULL)
217             {
218                 /* Not enough memory */
219                 ret = WBXML_ERROR_NOT_ENOUGH_MEMORY;
220             }
221 
222             /* Remove trailing NULL char */
223             wbxml_buffer_remove_trailing_zeros(*out_buf);
224         }
225         else
226         {
227             /* Not converted */
228             ret = WBXML_ERROR_CHARSET_CONV;
229         }
230 
231         /* Shutdown iconv */
232         iconv_close(cd);
233 
234         /* Clean-up */
235         wbxml_free(tmp_ptr);
236 
237         return ret;
238     }
239 
240 #else
241 
242     {
243         /***************************************************
244          * Add your own charset conversion function here !
245          */
246 
247         return WBXML_ERROR_NO_CHARSET_CONV;
248     }
249 
250 #endif /* HAVE_ICONV */
251 }
252 
253 
wbxml_charset_conv_term(const WB_TINY * in_buf,WB_ULONG * io_bytes,WBXMLCharsetMIBEnum in_charset,WBXMLBuffer ** out_buf,WBXMLCharsetMIBEnum out_charset)254 WBXML_DECLARE(WBXMLError) wbxml_charset_conv_term(const WB_TINY        *in_buf,
255                                                   WB_ULONG             *io_bytes,
256                                                   WBXMLCharsetMIBEnum   in_charset,
257                                                   WBXMLBuffer         **out_buf,
258                                                   WBXMLCharsetMIBEnum   out_charset)
259 {
260     WB_ULONG   buf_len  = 0;
261     WB_ULONG   new_len  = 0;
262     WB_ULONG   term_len = 0;
263     WBXMLError ret      = WBXML_OK;
264 
265     /* Find length of input buffer */
266     switch (in_charset)
267     {
268     case WBXML_CHARSET_ISO_10646_UCS_2 :
269     case WBXML_CHARSET_UTF_16 :
270         /* Terminated by two NULL char ("\0\0") */
271         term_len = 2;
272 
273         if (!search_null_block(in_buf, *io_bytes, 2, &buf_len)) {
274             return WBXML_ERROR_CHARSET_STR_LEN;
275         }
276 
277         /* Add termination bytes length */
278         buf_len += term_len;
279         break;
280 
281     default :
282         /* Terminated by a simple NULL char ('\0') */
283         term_len = 1;
284 
285         buf_len = WBXML_STRLEN(in_buf) + term_len;
286         break;
287     }
288 
289     /* Check length found */
290     if (buf_len > *io_bytes) {
291         return WBXML_ERROR_CHARSET_STR_LEN;
292     }
293 
294     /* Use a temporary length var (because it is decreased) */
295     new_len = buf_len;
296 
297     /* Convert ! */
298     ret = wbxml_charset_conv(in_buf,
299                              &new_len,
300                              in_charset,
301                              out_buf,
302                              out_charset);
303 
304     /* Set input buffer length */
305     *io_bytes = buf_len;
306 
307     return ret;
308 }
309 
310 
311 /***************************************************
312  *    Private Functions
313  */
314 
315 /**
316  * Binary search of a sequence of NULL bytes in a buffer
317  *
318  * @param in_buf     Buffer to search in
319  * @param in_buf_len Length of input buffer
320  * @param block_len  Length of the NULL sequence
321  * @param out_pos    Index of Sequence into Buffer
322  * @return TRUE if found, FALSE otherwise
323  */
search_null_block(const WB_TINY * in_buf,WB_ULONG in_buf_len,WB_ULONG block_len,WB_ULONG * out_pos)324 static WB_BOOL search_null_block(const WB_TINY *in_buf,
325                                  WB_ULONG       in_buf_len,
326                                  WB_ULONG       block_len,
327                                  WB_ULONG      *out_pos)
328 {
329     WB_ULONG pos = 0;
330     WB_ULONG i = 0;
331 
332     for (pos = 0; pos + block_len <= in_buf_len; pos += block_len) {
333         for (i = 0; i < block_len; i++) {
334             if (memcmp(in_buf + pos + i, "\0", 1)) {
335                 i = block_len;
336             } else {
337                 if (i == block_len -1) {
338                     *out_pos = pos;
339                     return TRUE;
340                 }
341             }
342         }
343     }
344 
345     return FALSE;
346 }
347