1 /* -*- mode: C++; tab-width: 4; c-basic-offset: 4; -*- */
2 
3 /* AbiSource Program Utilities
4  * Copyright (C) 2001
5  *
6  * This file is the work of:
7  *    Dom Lachowicz <dominicl@seas.upenn.edu>
8  *    Mike Nordell  <tamlin@alognet.se>
9  *
10  *    The UT_convert method was completed by Dom and Mike and was
11  *    based upon work done by various members of the GLib team
12  *    (http://www.gtk.org)
13  *
14  * This program is free software; you can redistribute it and/or
15  * modify it under the terms of the GNU General Public License
16  * as published by the Free Software Foundation; either version 2
17  * of the License, or (at your option) any later version.
18  *
19  * This program is distributed in the hope that it will be useful,
20  * but WITHOUT ANY WARRANTY; without even the implied warranty of
21  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22  * GNU General Public License for more details.
23  *
24  * You should have received a copy of the GNU General Public License
25  * along with this program; if not, write to the Free Software
26  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27  * 02110-1301 USA.
28  */
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #include <string.h>
35 #include <stdlib.h>
36 
37 #include "ut_iconv.h"
38 #include "ut_assert.h"
39 #include "ut_debugmsg.h"
40 
41 #include <glib.h>
42 
43 #include "xap_EncodingManager.h"
44 
45 /************************************************************************/
46 /************************************************************************/
47 /*
48  * This file represents my own personal assault on iconv, the most horrid
49  * utility ever, which is yet somehow still essential.
50  *
51  * Issues -
52  * 1) freebsd: requires extern "C" around iconv.h
53  * 2) invalid iconv handles (== iconv_t -1 (usually))
54  * 3) iconv resetting (vlad's i18n issues)
55  * 4) ICONV_CONST passed to iconv()
56  * 5) UCS2 internally to AbiWord
57  * 6) byte-order problems
58  * 7) good C/C++ linkage
59  *
60  * Provides solutions to all of the above plus -
61  * 1) 1-shot conversions (UT_convert, UT_convert_cd)
62  * 2) wrapper class around an iconv_t handle
63  */
64 
65 /*!
66  * This class is a nice wrapper around an iconv_t type
67  */
auto_iconv(UT_iconv_t iconv)68 auto_iconv::auto_iconv(UT_iconv_t iconv)
69   : m_h(iconv)
70 {
71 }
72 
73 /*!
74  * Convert characters from in_charset to out_charset
75  */
auto_iconv(const char * in_charset,const char * out_charset)76 auto_iconv::auto_iconv(const char * in_charset, const char *out_charset) throw(UT_iconv_t)
77 {
78 	m_h = UT_ICONV_INVALID;
79 
80 	UT_iconv_t cd = UT_iconv_open (out_charset, in_charset);
81 
82 	if (!UT_iconv_isValid(cd))
83 		throw cd;
84 
85 	m_h = cd;
86 }
87 
88 /*!
89  * Public destructor
90  */
~auto_iconv()91 auto_iconv::~auto_iconv()
92 {
93   if (UT_iconv_isValid(m_h))
94     {
95       UT_iconv_close(m_h);
96     }
97 }
98 
99 /*!
100  * Returns the internal iconv_t handle
101  */
operator UT_iconv_t()102 auto_iconv::operator UT_iconv_t()
103 {
104   return m_h;
105 }
106 
getHandle()107 UT_iconv_t auto_iconv::getHandle ()
108 {
109   return m_h;
110 }
111 
112 /************************************************************************/
113 /************************************************************************/
114 
115 //
116 // everything below this line is extern "C"
117 //
118 
// Encoding names chosen by s_internal_init(); NULL until it has run.
static const char * s_ucs2_internal = NULL;
static const char * s_ucs4_internal = NULL;

// Candidate names for the internal UCS-2 encoding. s_internal_init() tries
// them in this order and keeps the first one that passes its probe, so the
// order is significant. The list is NULL-terminated.
static const char * s_ucs2_list[] = {
	"UCS-2-INTERNAL",
	"UCS-2-LE",
	"UCS-2-BE",
	"UCS-2LE",
	"UCS-2BE",
	"UTF16-LE",
	"UTF16-BE",
	"UCS2",
	"UCS-2",
	"UTF-16",
	NULL
};

// Candidate names for the internal UCS-4 encoding; same rules as above.
static const char * s_ucs4_list[] = {
	"UCS-4-INTERNAL",
	"UCS-4-LE",
	"UCS-4-BE",
	"UCS-4LE",
	"UCS-4BE",
	"UTF-32LE",
	"UTF-32BE",
	"UCS4",
	"UCS-4",
	"UTF32",
	"UTF-32",
	NULL
};
150 
s_internal_init()151 static void s_internal_init ()
152 {
153 	static const char * latin = "ISO-8859-1";
154 
155 	UT_iconv_t handle = UT_ICONV_INVALID;
156 
157 	s_ucs2_internal = 0;
158 	s_ucs4_internal = 0;
159 
160 	const char ** pszEnc = s_ucs2_list;
161 	while (*pszEnc)
162 		{
163 			if (!UT_iconv_isValid(handle = UT_iconv_open (*pszEnc, latin)))
164 				{
165 					pszEnc++;
166 					continue;
167 				}
168 			const char ibuf = 0x20;
169 			const char * iptr = &ibuf;
170 			size_t ilen = 1;
171 			UT_UCS2Char obuf[2];
172 			char * optr = reinterpret_cast<char *>(obuf);
173 			size_t olen = 2;
174 
175 			bool success = ((size_t)(-1) != UT_iconv (handle, &iptr, &ilen, &optr, &olen));
176 
177 			UT_iconv_close (handle);
178 			handle = UT_ICONV_INVALID;
179 
180 			if (success) success = (olen == 0);
181 			if (success) success = (obuf[0] == 0x20);
182 			if (success)
183 				{
184 					s_ucs2_internal = *pszEnc;
185 					break;
186 				}
187 			pszEnc++;
188 		}
189 	UT_ASSERT(s_ucs2_internal);
190 	if (s_ucs2_internal == 0)
191 		{
192 			s_ucs2_internal = s_ucs2_list[0];
193 			UT_DEBUGMSG(("WARNING! this test failed to determine correct UCS-2 setting!\n"));
194 		}
195 	UT_DEBUGMSG(("using '%s' for UCS-2 internal\n", s_ucs2_internal));
196 
197 	pszEnc = s_ucs4_list;
198 	while (*pszEnc)
199 		{
200 			if (!UT_iconv_isValid(handle = UT_iconv_open (*pszEnc, latin)))
201 				{
202 					pszEnc++;
203 					continue;
204 				}
205 			const char ibuf = 0x20;
206 			const char * iptr = &ibuf;
207 			size_t ilen = 1;
208 			UT_UCS4Char obuf[4];
209 			char * optr = reinterpret_cast<char *>(&obuf);
210 			size_t olen = 4;
211 
212 			bool success = ((size_t)(-1) != UT_iconv (handle, &iptr, &ilen, &optr, &olen));
213 
214 			UT_iconv_close (handle);
215 			handle = UT_ICONV_INVALID;
216 
217 			if (success) success = (olen == 0);
218 			if (success) success = (obuf[0] == 0x20);
219 			if (success)
220 				{
221 					s_ucs4_internal = *pszEnc;
222 					break;
223 				}
224 			pszEnc++;
225 		}
226 	UT_ASSERT(s_ucs4_internal);
227 	if (s_ucs4_internal == 0)
228 		{
229 			s_ucs4_internal = s_ucs4_list[0];
230 			UT_DEBUGMSG(("WARNING! this test failed to determine correct UCS-4 setting!\n"));
231 		}
232 	UT_DEBUGMSG(("using '%s' for UCS-4 internal\n", s_ucs4_internal));
233 }
234 
235 /*!
236  * \return the internal iconv UCS-2 charset name
237  */
ucs2Internal()238 const char * ucs2Internal ()
239 {
240 #if defined(TOOLKIT_WIN)
241   // we special-case the win32 build, otherwise spelling and other stuff
242   // just doesn't work
243   return "UCS-2LE";
244 #elif defined(_LIBICONV_H)
245   // libiconv seems to prefer UCS-2-INTERNAL to UCS-2BE and UCS-2LE
246   return "UCS-2-INTERNAL";
247 #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
248   // we special case the BSDs since spelling just doesn't work
249   return "UCS2";
250 #else
251   // general case, found by hub and dom
252 	if (s_ucs2_internal == 0)
253 		s_internal_init ();
254 	return s_ucs2_internal;
255 #endif
256 }
257 
258 /*!
259  * \return the internal iconv UCS-4 charset name
260  */
ucs4Internal()261 const char * ucs4Internal ()
262 {
263 #if defined(TOOLKIT_WIN)
264   // we special-case the win32 build, otherwise spelling and other stuff
265   // just doesn't work
266   return "UCS-4LE";
267 #elif defined(_LIBICONV_H) || defined(__FreeBSD__)
268   // libiconv seems to prefer UCS-4-INTERNAL to UCS-4BE and UCS-4LE
269   return "UCS-4-INTERNAL";
270 #elif defined(__OpenBSD__) || defined(__NetBSD__)
271   // we special case the BSDs since spelling just doesn't work
272   return "UCS4";
273 #else
274   // general case, found by hub and dom
275 	if (s_ucs4_internal == 0)
276 		s_internal_init ();
277 	return s_ucs4_internal;
278 #endif
279 }
280 
281 /************************************************************************/
282 /************************************************************************/
283 
284 /*!
285  * Returns true if the internal handle is valid, false if not
286  */
UT_iconv_isValid(UT_iconv_t cd)287 int UT_iconv_isValid ( UT_iconv_t cd )
288 {
289   return (cd != UT_ICONV_INVALID);
290 }
291 
UT_iconv_open(const char * to,const char * from)292 UT_iconv_t  UT_iconv_open( const char* to, const char* from )
293 {
294   if ( to && from )
295 	  return (UT_iconv_t)g_iconv_open(to, from);
296 
297   return UT_ICONV_INVALID;
298 }
299 
UT_iconv(UT_iconv_t cd,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)300 size_t UT_iconv( UT_iconv_t cd, const char **inbuf,
301 				 size_t *inbytesleft, char **outbuf, size_t *outbytesleft )
302 {
303   // this should take care of iconv problems with different compilers
304   // known issues:
305   // 1) gcc3.0 doesn't like const_cast<const pointer>()
306   // 2) some iconv implementations don't use a const char ** inbuf
307   //    while some (newer, conformant ones) do
308 
309   if ( !UT_iconv_isValid ( cd ) )
310     return (size_t)-1;
311 
312   return g_iconv((GIConv)cd, (char **)inbuf, inbytesleft, outbuf, outbytesleft);
313 }
314 
UT_iconv_close(UT_iconv_t cd)315 int  UT_iconv_close( UT_iconv_t cd )
316 {
317   if ( UT_iconv_isValid ( cd ) )
318     return g_iconv_close( (GIConv) cd );
319 
320   return -1;
321 }
322 
UT_iconv_reset(UT_iconv_t cd)323 void UT_iconv_reset(UT_iconv_t cd)
324 {
325     // this insane code is needed by iconv brokenness.  see
326     // http://www.abisource.com/mailinglists/abiword-dev/01/April/0135.html
327     if (XAP_EncodingManager::get_instance()->cjk_locale())
328 		UT_iconv(cd, NULL, NULL, NULL, NULL);
329 }
330 
331 /*!
332  * Borrowed from GLib 2.0 and (heavily) modified
333  *
334  * \param str Pointer to the input string.
335  * \param len Length of the input string to convert.
336  * \param from_codeset The "codeset" of the string pointed to by 'str'.
337  * \param to_codeset The "codeset" we want for the output.
338  * \param bytes_read optional, supply NULL if you don't want this.
339  * \param bytes_written optional, supply NULL if you don't want this.
340  *
341  * \return Returns a freshly allocated output string, which is terminated by
342  * a zero byte. Note that if the output codeset's terminator is not
343  * a zero byte (e.g., UCS-2, where it is two zero bytes), you can
344  * get correct termination by including the input string's terminator
345  * in the length passed as 'len'. E.g., if 'str' is null-terminated
346  * US-ASCII "foo", given 'len' as 4.
347  *
348  * \todo Check for out-of-memory allocations etc.
349  */
UT_convert(const char * str,UT_sint32 len,const char * from_codeset,const char * to_codeset,UT_uint32 * bytes_read_arg,UT_uint32 * bytes_written_arg)350 char * UT_convert(const char*	str,
351 		  UT_sint32	len,
352 		  const char*	from_codeset,
353 		  const char*	to_codeset,
354 		  UT_uint32*	bytes_read_arg,
355 		  UT_uint32*	bytes_written_arg)
356 {
357 	gsize _bytes_read = 0, _bytes_written = 0;
358 	char* result = g_convert(str, len, to_codeset, from_codeset, &_bytes_read, &_bytes_written, NULL);
359 
360 	if (bytes_read_arg) *bytes_read_arg = _bytes_read;
361 	if (bytes_written_arg) *bytes_written_arg = _bytes_written;
362 
363 	return result;
364 }
365 
366 /*! This function is almost the same as the other UT_convert function,
367  * only that it takes an UT_iconv_t instead of a from and to codeset.
368  * This is useful if you need to do a conversion multiple times
369  */
UT_convert_cd(const char * str,UT_sint32 len,UT_iconv_t cd,UT_uint32 * bytes_read_arg,UT_uint32 * bytes_written_arg)370 char * UT_convert_cd(const char *str,
371 		     UT_sint32 len,
372 		     UT_iconv_t cd,
373 		     UT_uint32 *bytes_read_arg,
374 		     UT_uint32 *bytes_written_arg)
375 {
376 	gsize _bytes_read = 0, _bytes_written = 0;
377 	char* result = g_convert_with_iconv(str, len, (GIConv)cd, &_bytes_read, &_bytes_written, NULL);
378 
379 	if (bytes_read_arg) *bytes_read_arg = _bytes_read;
380 	if (bytes_written_arg) *bytes_written_arg = _bytes_written;
381 
382 	return result;
383 }
384