1 /* -*- mode: C++; tab-width: 4; c-basic-offset: 4; -*- */
2
3 /* AbiSource Program Utilities
4 * Copyright (C) 2001
5 *
6 * This file is the work of:
7 * Dom Lachowicz <dominicl@seas.upenn.edu>
8 * Mike Nordell <tamlin@alognet.se>
9 *
10 * The UT_convert method was completed by Dom and Mike and was
11 * based upon work done by various members of the GLib team
12 * (http://www.gtk.org)
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 * 02110-1301 USA.
28 */
29
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33
34 #include <string.h>
35 #include <stdlib.h>
36
37 #include "ut_iconv.h"
38 #include "ut_assert.h"
39 #include "ut_debugmsg.h"
40
41 #include <glib.h>
42
43 #include "xap_EncodingManager.h"
44
45 /************************************************************************/
46 /************************************************************************/
47 /*
48 * This file represents my own personal assault on iconv, the most horrid
49 * utility ever, which is yet somehow still essential.
50 *
51 * Issues -
52 * 1) freebsd: requires extern "C" around iconv.h
53 * 2) invalid iconv handles (== iconv_t -1 (usually))
54 * 3) iconv resetting (vlad's i18n issues)
55 * 4) ICONV_CONST passed to iconv()
56 * 5) UCS2 internally to AbiWord
57 * 6) byte-order problems
58 * 7) good C/C++ linkage
59 *
60 * Provides solutions to all of the above plus -
61 * 1) 1-shot conversions (UT_convert, UT_convert_cd)
62 * 2) wrapper class around an iconv_t handle
63 */
64
65 /*!
66 * This class is a nice wrapper around an iconv_t type
67 */
auto_iconv(UT_iconv_t iconv)68 auto_iconv::auto_iconv(UT_iconv_t iconv)
69 : m_h(iconv)
70 {
71 }
72
73 /*!
74 * Convert characters from in_charset to out_charset
75 */
auto_iconv(const char * in_charset,const char * out_charset)76 auto_iconv::auto_iconv(const char * in_charset, const char *out_charset) throw(UT_iconv_t)
77 {
78 m_h = UT_ICONV_INVALID;
79
80 UT_iconv_t cd = UT_iconv_open (out_charset, in_charset);
81
82 if (!UT_iconv_isValid(cd))
83 throw cd;
84
85 m_h = cd;
86 }
87
88 /*!
89 * Public destructor
90 */
~auto_iconv()91 auto_iconv::~auto_iconv()
92 {
93 if (UT_iconv_isValid(m_h))
94 {
95 UT_iconv_close(m_h);
96 }
97 }
98
99 /*!
100 * Returns the internal iconv_t handle
101 */
operator UT_iconv_t()102 auto_iconv::operator UT_iconv_t()
103 {
104 return m_h;
105 }
106
getHandle()107 UT_iconv_t auto_iconv::getHandle ()
108 {
109 return m_h;
110 }
111
112 /************************************************************************/
113 /************************************************************************/
114
115 //
116 // everything below this line is extern "C"
117 //
118
// Cached names of the encodings selected by s_internal_init(); NULL until
// that function has run.
static const char * s_ucs2_internal = NULL;
static const char * s_ucs4_internal = NULL;

// Candidate names for the native UCS-2 encoding, tried in this order by
// s_internal_init(). The list is NULL-terminated, and its first entry
// doubles as the fallback when no candidate passes the probe.
static const char * s_ucs2_list[] = {
	"UCS-2-INTERNAL",
	"UCS-2-LE",
	"UCS-2-BE",
	"UCS-2LE",
	"UCS-2BE",
	"UTF16-LE",
	"UTF16-BE",
	"UCS2",
	"UCS-2",
	"UTF-16",
	NULL
};

// Candidate names for the native UCS-4 encoding; same conventions as
// s_ucs2_list.
static const char * s_ucs4_list[] = {
	"UCS-4-INTERNAL",
	"UCS-4-LE",
	"UCS-4-BE",
	"UCS-4LE",
	"UCS-4BE",
	"UTF-32LE",
	"UTF-32BE",
	"UCS4",
	"UCS-4",
	"UTF32",
	"UTF-32",
	NULL
};
150
s_internal_init()151 static void s_internal_init ()
152 {
153 static const char * latin = "ISO-8859-1";
154
155 UT_iconv_t handle = UT_ICONV_INVALID;
156
157 s_ucs2_internal = 0;
158 s_ucs4_internal = 0;
159
160 const char ** pszEnc = s_ucs2_list;
161 while (*pszEnc)
162 {
163 if (!UT_iconv_isValid(handle = UT_iconv_open (*pszEnc, latin)))
164 {
165 pszEnc++;
166 continue;
167 }
168 const char ibuf = 0x20;
169 const char * iptr = &ibuf;
170 size_t ilen = 1;
171 UT_UCS2Char obuf[2];
172 char * optr = reinterpret_cast<char *>(obuf);
173 size_t olen = 2;
174
175 bool success = ((size_t)(-1) != UT_iconv (handle, &iptr, &ilen, &optr, &olen));
176
177 UT_iconv_close (handle);
178 handle = UT_ICONV_INVALID;
179
180 if (success) success = (olen == 0);
181 if (success) success = (obuf[0] == 0x20);
182 if (success)
183 {
184 s_ucs2_internal = *pszEnc;
185 break;
186 }
187 pszEnc++;
188 }
189 UT_ASSERT(s_ucs2_internal);
190 if (s_ucs2_internal == 0)
191 {
192 s_ucs2_internal = s_ucs2_list[0];
193 UT_DEBUGMSG(("WARNING! this test failed to determine correct UCS-2 setting!\n"));
194 }
195 UT_DEBUGMSG(("using '%s' for UCS-2 internal\n", s_ucs2_internal));
196
197 pszEnc = s_ucs4_list;
198 while (*pszEnc)
199 {
200 if (!UT_iconv_isValid(handle = UT_iconv_open (*pszEnc, latin)))
201 {
202 pszEnc++;
203 continue;
204 }
205 const char ibuf = 0x20;
206 const char * iptr = &ibuf;
207 size_t ilen = 1;
208 UT_UCS4Char obuf[4];
209 char * optr = reinterpret_cast<char *>(&obuf);
210 size_t olen = 4;
211
212 bool success = ((size_t)(-1) != UT_iconv (handle, &iptr, &ilen, &optr, &olen));
213
214 UT_iconv_close (handle);
215 handle = UT_ICONV_INVALID;
216
217 if (success) success = (olen == 0);
218 if (success) success = (obuf[0] == 0x20);
219 if (success)
220 {
221 s_ucs4_internal = *pszEnc;
222 break;
223 }
224 pszEnc++;
225 }
226 UT_ASSERT(s_ucs4_internal);
227 if (s_ucs4_internal == 0)
228 {
229 s_ucs4_internal = s_ucs4_list[0];
230 UT_DEBUGMSG(("WARNING! this test failed to determine correct UCS-4 setting!\n"));
231 }
232 UT_DEBUGMSG(("using '%s' for UCS-4 internal\n", s_ucs4_internal));
233 }
234
235 /*!
236 * \return the internal iconv UCS-2 charset name
237 */
ucs2Internal()238 const char * ucs2Internal ()
239 {
240 #if defined(TOOLKIT_WIN)
241 // we special-case the win32 build, otherwise spelling and other stuff
242 // just doesn't work
243 return "UCS-2LE";
244 #elif defined(_LIBICONV_H)
245 // libiconv seems to prefer UCS-2-INTERNAL to UCS-2BE and UCS-2LE
246 return "UCS-2-INTERNAL";
247 #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
248 // we special case the BSDs since spelling just doesn't work
249 return "UCS2";
250 #else
251 // general case, found by hub and dom
252 if (s_ucs2_internal == 0)
253 s_internal_init ();
254 return s_ucs2_internal;
255 #endif
256 }
257
258 /*!
259 * \return the internal iconv UCS-4 charset name
260 */
ucs4Internal()261 const char * ucs4Internal ()
262 {
263 #if defined(TOOLKIT_WIN)
264 // we special-case the win32 build, otherwise spelling and other stuff
265 // just doesn't work
266 return "UCS-4LE";
267 #elif defined(_LIBICONV_H) || defined(__FreeBSD__)
268 // libiconv seems to prefer UCS-4-INTERNAL to UCS-4BE and UCS-4LE
269 return "UCS-4-INTERNAL";
270 #elif defined(__OpenBSD__) || defined(__NetBSD__)
271 // we special case the BSDs since spelling just doesn't work
272 return "UCS4";
273 #else
274 // general case, found by hub and dom
275 if (s_ucs4_internal == 0)
276 s_internal_init ();
277 return s_ucs4_internal;
278 #endif
279 }
280
281 /************************************************************************/
282 /************************************************************************/
283
284 /*!
285 * Returns true if the internal handle is valid, false if not
286 */
UT_iconv_isValid(UT_iconv_t cd)287 int UT_iconv_isValid ( UT_iconv_t cd )
288 {
289 return (cd != UT_ICONV_INVALID);
290 }
291
UT_iconv_open(const char * to,const char * from)292 UT_iconv_t UT_iconv_open( const char* to, const char* from )
293 {
294 if ( to && from )
295 return (UT_iconv_t)g_iconv_open(to, from);
296
297 return UT_ICONV_INVALID;
298 }
299
UT_iconv(UT_iconv_t cd,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)300 size_t UT_iconv( UT_iconv_t cd, const char **inbuf,
301 size_t *inbytesleft, char **outbuf, size_t *outbytesleft )
302 {
303 // this should take care of iconv problems with different compilers
304 // known issues:
305 // 1) gcc3.0 doesn't like const_cast<const pointer>()
306 // 2) some iconv implementations don't use a const char ** inbuf
307 // while some (newer, conformant ones) do
308
309 if ( !UT_iconv_isValid ( cd ) )
310 return (size_t)-1;
311
312 return g_iconv((GIConv)cd, (char **)inbuf, inbytesleft, outbuf, outbytesleft);
313 }
314
UT_iconv_close(UT_iconv_t cd)315 int UT_iconv_close( UT_iconv_t cd )
316 {
317 if ( UT_iconv_isValid ( cd ) )
318 return g_iconv_close( (GIConv) cd );
319
320 return -1;
321 }
322
UT_iconv_reset(UT_iconv_t cd)323 void UT_iconv_reset(UT_iconv_t cd)
324 {
325 // this insane code is needed by iconv brokenness. see
326 // http://www.abisource.com/mailinglists/abiword-dev/01/April/0135.html
327 if (XAP_EncodingManager::get_instance()->cjk_locale())
328 UT_iconv(cd, NULL, NULL, NULL, NULL);
329 }
330
331 /*!
332 * Borrowed from GLib 2.0 and (heavily) modified
333 *
334 * \param str Pointer to the input string.
335 * \param len Length of the input string to convert.
336 * \param from_codeset The "codeset" of the string pointed to by 'str'.
337 * \param to_codeset The "codeset" we want for the output.
338 * \param bytes_read optional, supply NULL if you don't want this.
339 * \param bytes_written optional, supply NULL if you don't want this.
340 *
341 * \return Returns a freshly allocated output string, which is terminated by
342 * a zero byte. Note that if the output codeset's terminator is not
343 * a zero byte (e.g., UCS-2, where it is two zero bytes), you can
344 * get correct termination by including the input string's terminator
345 * in the length passed as 'len'. E.g., if 'str' is null-terminated
346 * US-ASCII "foo", given 'len' as 4.
347 *
348 * \todo Check for out-of-memory allocations etc.
349 */
UT_convert(const char * str,UT_sint32 len,const char * from_codeset,const char * to_codeset,UT_uint32 * bytes_read_arg,UT_uint32 * bytes_written_arg)350 char * UT_convert(const char* str,
351 UT_sint32 len,
352 const char* from_codeset,
353 const char* to_codeset,
354 UT_uint32* bytes_read_arg,
355 UT_uint32* bytes_written_arg)
356 {
357 gsize _bytes_read = 0, _bytes_written = 0;
358 char* result = g_convert(str, len, to_codeset, from_codeset, &_bytes_read, &_bytes_written, NULL);
359
360 if (bytes_read_arg) *bytes_read_arg = _bytes_read;
361 if (bytes_written_arg) *bytes_written_arg = _bytes_written;
362
363 return result;
364 }
365
366 /*! This function is almost the same as the other UT_convert function,
367 * only that it takes an UT_iconv_t instead of a from and to codeset.
368 * This is useful if you need to do a conversion multiple times
369 */
UT_convert_cd(const char * str,UT_sint32 len,UT_iconv_t cd,UT_uint32 * bytes_read_arg,UT_uint32 * bytes_written_arg)370 char * UT_convert_cd(const char *str,
371 UT_sint32 len,
372 UT_iconv_t cd,
373 UT_uint32 *bytes_read_arg,
374 UT_uint32 *bytes_written_arg)
375 {
376 gsize _bytes_read = 0, _bytes_written = 0;
377 char* result = g_convert_with_iconv(str, len, (GIConv)cd, &_bytes_read, &_bytes_written, NULL);
378
379 if (bytes_read_arg) *bytes_read_arg = _bytes_read;
380 if (bytes_written_arg) *bytes_written_arg = _bytes_written;
381
382 return result;
383 }
384