1 /* AbiSource Program Utilities
2  * Copyright (C) 2001 AbiSource, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17  * 02110-1301 USA.
18  */
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config.h"
22 #endif
23 
24 #include <stdlib.h>
25 
26 #include "ut_iconv.h"
27 #include "ut_Encoding.h"
28 #include "ut_string.h"
29 #include "ut_debugmsg.h"
30 
31 #include "xap_App.h"
32 #include "xap_Strings.h"
33 
34 // Please keep the list below alphabetised by the encoding; even though
35 // this is not required for it to work, it will make it easier to maintain.
36 
37 // to add a new encoding:
38 // (1) check on this list it is not already there;
39 // (2) add it to the list in xap_String_Id.h
40 // (3) add it here, using the ID corresponding to the one
41 //     from xap_String_Id.h
42 
43 // This list is based on libiconv.
44 // Since iconv implementations differ and iconv has no enumeration API,
45 // this code attempts to enumerate the supported encodings.
46 //
47 // There is an array of possible names for each encoding
48 // in order of standardization or popularity.  We attempt to
49 // open each before deciding the encoding is not supported.
50 //
51 // Another approach is to do these tests in an external program which
52 // outputs the C++ code for the following table.
53 //
54 // TODO Note that certain operations in AbiWord currently try to open or
55 // TODO compare certain encodings via hard-coded names.  This should be
56 // TODO discouraged and replaced with names derived as in these tables.
57 //
58 // TODO This code should probably move into the Encoding Manager.
59 //
60 // TODO Some platforms use specific names not covered by this alias table
// TODO such as the solaris iso-8859-n problem just fixed.  Root out any
62 // TODO more instances of this to easily avoid problems that can cause a
63 // TODO huge PITA when misdiagnosed.
64 
65 static const gchar * enc_armscii[]	= {"ARMSCII-8",0};
66 static const gchar * enc_big5[]	= {"BIG5","BIG-5","BIG-FIVE","BIGFIVE","CN-BIG5",0};
67 static const gchar * enc_big5hkscs[]	= {"BIG5-HKSCS","BIG5HKSCS",0};
68 #ifdef WIN32 /* DOS/Win32 console encodings, peer iconv supports, others may not */
69 static const gchar * enc_cp437[]	= {"C437","IBM437","437",0};
70 static const gchar * enc_cp850[]	= {"C850","IBM850","850",0};
71 #endif
72 static const gchar * enc_cp874[]	= {"CP874",0};
73 static const gchar * enc_cp932[]	= {"CP932",0};
74 static const gchar * enc_cp936[]	= {"CP936","GBK",0};
75 static const gchar * enc_cp949[]	= {"CP949","UHC",0};
76 static const gchar * enc_cp950[]	= {"CP950",0};
77 static const gchar * enc_cp1250[]	= {"CP1250","WINDOWS-1250","MS-EE",0};
78 static const gchar * enc_cp1251[]	= {"CP1251","WINDOWS-1251","MS-CYRL",0};
79 static const gchar * enc_cp1252[]	= {"CP1252","WINDOWS-1252","MS-ANSI",0};
80 static const gchar * enc_cp1253[]	= {"CP1253","WINDOWS-1253","MS-GREEK",0};
81 static const gchar * enc_cp1254[]	= {"CP1254","WINDOWS-1254","MS-TURK",0};
82 static const gchar * enc_cp1255[]	= {"CP1255","WINDOWS-1255","MS-HEBR",0};
83 static const gchar * enc_cp1256[]	= {"CP1256","WINDOWS-1256","MS-ARAB",0};
84 static const gchar * enc_cp1257[]	= {"CP1257","WINDOWS-1257","WINBALTRIM",0};
85 static const gchar * enc_cp1258[]	= {"CP1258","WINDOWS-1258",0};
86 static const gchar * enc_euc_cn[]	= {"EUC-CN","EUCCN","GB2312","CN-GB",0};	// Cf. GB_2312-80
87 static const gchar * enc_euc_jp[]	= {"EUC-JP","EUCJP",0};
88 static const gchar * enc_euc_kr[]	= {"EUC-KR","EUCKR",0};
89 static const gchar * enc_euc_tw[]	= {"EUC-TW","EUCTW",0};
90 static const gchar * enc_gb2312[]	= {"GB_2312-80","ISO-IR-58","CHINESE",0};	// Cf. EUC-CN
91 static const gchar * enc_georga[]	= {"GEORGIAN-ACADEMY",0};
92 static const gchar * enc_georgps[]	= {"GEORGIAN-PS",0};
93 static const gchar * enc_hp[]		= {"HP-ROMAN8","ROMAN8","R8",0};
94 static const gchar * enc_hz[]		= {"HZ","HZ-GB-2312",0};
95 static const gchar * enc_8859_1[]	= {"ISO-8859-1","ISO_8859-1","8859-1","LATIN1","L1",0};
96 static const gchar * enc_8859_2[]	= {"ISO-8859-2","ISO_8859-2","8859-2","LATIN2","L2",0};
97 //static const gchar * enc_8859_3[]  = {"ISO-8859-3","ISO_8859-3","8859-3","LATIN3","L3",0};
98 static const gchar * enc_8859_4[]	= {"ISO-8859-4","ISO_8859-4","8859-4","LATIN4","L4",0};
99 static const gchar * enc_8859_5[]	= {"ISO-8859-5","ISO_8859-5","8859-5","CYRILLIC",0};
100 static const gchar * enc_8859_6[]	= {"ISO-8859-6","ISO_8859-6","8859-6","ECMA-114","ASMO-708","ARABIC",0};
101 static const gchar * enc_8859_7[]	= {"ISO-8859-7","ISO_8859-7","8859-7","ECMA-118","ELOT_928","GREEK8","GREEK",0};
102 static const gchar * enc_8859_8[]	= {"ISO-8859-8","ISO_8859-8","8859-8","HEBREW",0};
103 static const gchar * enc_8859_9[]	= {"ISO-8859-9","ISO_8859-9","8859-9","LATIN5","L5",0};
104 //static const gchar * enc_8859_10[]	= {"ISO-8859-10","ISO_8859-10","8859-10","LATIN6","L6",0};
105 //static const gchar * enc_8859_13[]	= {"ISO-8859-13","ISO_8859-13","8859-13","LATIN7","L7",0};
106 //static const gchar * enc_8859_14[]	= {"ISO-8859-14","ISO_8859-14","8859-14","LATIN8","L8",0};
107 //static const gchar * enc_8859_15[]	= {"ISO-8859-15","ISO_8859-15","8859-15",0};
108 //static const gchar * enc_8859_16[]	= {"ISO-8859-16","ISO_8859-16","8859-16",0};
109 static const gchar * enc_2022_jp[]	= {"ISO-2022-JP",0};
110 // There are 4 JIS encodings which are not Shift-JIS...
111 static const gchar * enc_johab[]	= {"JOHAB","CP1361",0};
112 static const gchar * enc_koi8r[]	= {"KOI8-R",0};
113 static const gchar * enc_koi8u[]	= {"KOI8-U",0};
114 static const gchar * enc_ksc5601[]	= {"KSC_5601","KS_C_5601-1987","KS_C_5601-1989","KOREAN",0};
115 static const gchar * enc_macarab[]	= {"MacArabic",0};
116 static const gchar * enc_macceur[] = {"MacCentralEurope",0};
117 static const gchar * enc_maccroat[]	= {"MacCroatian",0};
118 static const gchar * enc_maccyr[]	= {"MacCyrillic",0};
119 static const gchar * enc_macgrk[]	= {"MacGreek",0};
120 static const gchar * enc_macheb[]	= {"MacHebrew",0};
121 static const gchar * enc_macice[]	= {"MacIceLand",0};
122 static const gchar * enc_macrom[]	= {"MacRoman","MACINTOSH","MAC",0};
123 static const gchar * enc_macrman[]	= {"MacRomania",0};
124 static const gchar * enc_macthai[]	= {"MacThai",0};
125 static const gchar * enc_macturk[]	= {"MacTurkish",0};
126 static const gchar * enc_macukr[]	= {"MacUkraine",0};
127 //static const gchar * enc_mulao[]	= {"MULELAO-1",0};
128 static const gchar * enc_next[]	= {"NEXTSTEP",0};
129 static const gchar * enc_sjis[]	= {"SJIS","SHIFT_JIS","SHIFT-JIS","MS_KANJI",0};
130 static const gchar * enc_tcvn[]	= {"TCVN","TCVN-5712","TCVN5712-1",0};
131 static const gchar * enc_tis620[]	= {"TIS-620","TIS620","TIS620-0",0};
132 static const gchar * enc_ucs2be[]	= {"UCS-2BE","UCS-2-BE","UNICODEBIG","UNICODE-1-1",0};
133 static const gchar * enc_ucs2le[]	= {"UCS-2LE","UCS-2-LE","UNICODELITTLE",0};
134 static const gchar * enc_ucs4be[]	= {"UCS-4BE","UCS-4-BE",0};
135 static const gchar * enc_ucs4le[]	= {"UCS-4LE","UCS-4-LE",0};
136 // US-ASCII has more aliases if we need them
137 static const gchar * enc_usascii[]	= {"US-ASCII","ASCII","US",0};
138 static const gchar * enc_utf7[]	= {"UTF-7","UNICODE-1-1-UTF-7",0};
139 static const gchar * enc_utf8[]	= {"UTF-8",0};
140 static const gchar * enc_utf16be[]	= {"UTF-16BE","UTF-16-BE",0};
141 static const gchar * enc_utf16le[]	= {"UTF-16LE","UTF-16-LE",0};
142 static const gchar * enc_utf32be[]	= {"UTF-32BE","UTF-32-BE",0};
143 static const gchar * enc_utf32le[]	= {"UTF-32LE","UTF-32-LE",0};
144 static const gchar * enc_viscii[]	= {"VISCII",0};
145 
146 static enc_entry s_Table[] =
147 {
148 	//the property value, the localised translation, the numerical id
149 	{enc_armscii,			NULL, XAP_STRING_ID_ENC_ARME_ARMSCII},
150 	{enc_big5,				NULL, XAP_STRING_ID_ENC_CHTR_BIG5},
151 	{enc_big5hkscs,				NULL, XAP_STRING_ID_ENC_CHTR_BIG5HKSCS},
152 #ifdef TOOLKIT_WIN
153 	{enc_cp437,				NULL, XAP_STRING_ID_ENC_US_DOS},
154 	{enc_cp850,				NULL, XAP_STRING_ID_ENC_MLNG_DOS},
155 #endif
156 	{enc_cp874,				NULL, XAP_STRING_ID_ENC_THAI_WIN},
157 	{enc_cp932,				NULL, XAP_STRING_ID_ENC_JAPN_WIN},
158 	{enc_cp936,				NULL, XAP_STRING_ID_ENC_CHSI_WIN},
159 	{enc_cp949,				NULL, XAP_STRING_ID_ENC_KORE_WIN},
160 	{enc_cp950,				NULL, XAP_STRING_ID_ENC_CHTR_WIN},
161 	{enc_cp1250,			NULL, XAP_STRING_ID_ENC_CENT_WIN},
162 	{enc_cp1251,			NULL, XAP_STRING_ID_ENC_CYRL_WIN},
163 	{enc_cp1252,			NULL, XAP_STRING_ID_ENC_WEST_WIN},
164 	{enc_cp1253,			NULL, XAP_STRING_ID_ENC_GREE_WIN},
165 	{enc_cp1254,			NULL, XAP_STRING_ID_ENC_TURK_WIN},
166 	{enc_cp1255,			NULL, XAP_STRING_ID_ENC_HEBR_WIN},
167 	{enc_cp1256,			NULL, XAP_STRING_ID_ENC_ARAB_WIN},
168 	{enc_cp1257,			NULL, XAP_STRING_ID_ENC_BALT_WIN},
169 	{enc_cp1258,			NULL, XAP_STRING_ID_ENC_VIET_WIN},
170 	{enc_euc_cn,			NULL, XAP_STRING_ID_ENC_CHSI_EUC},
171 	{enc_euc_jp,			NULL, XAP_STRING_ID_ENC_JAPN_EUC},
172 	{enc_euc_kr,			NULL, XAP_STRING_ID_ENC_KORE_EUC},
173 	{enc_euc_tw,			NULL, XAP_STRING_ID_ENC_CHTR_EUC},
174 	{enc_gb2312,			NULL, XAP_STRING_ID_ENC_CHSI_GB},
175 	{enc_georga,			NULL, XAP_STRING_ID_ENC_GEOR_ACADEMY},
176 	{enc_georgps,			NULL, XAP_STRING_ID_ENC_GEOR_PS},
177 	{enc_hp,				NULL, XAP_STRING_ID_ENC_WEST_HP},
178 	{enc_hz,				NULL, XAP_STRING_ID_ENC_CHSI_HZ},
179 	{enc_8859_1,			NULL, XAP_STRING_ID_ENC_WEST_ISO},
180 	{enc_8859_2,			NULL, XAP_STRING_ID_ENC_CENT_ISO},
181 	// 8859-3
182 	{enc_8859_4,			NULL, XAP_STRING_ID_ENC_BALT_ISO},
183 	{enc_8859_5,			NULL, XAP_STRING_ID_ENC_CYRL_ISO},
184 	{enc_8859_6,			NULL, XAP_STRING_ID_ENC_ARAB_ISO},
185 	{enc_8859_7,			NULL, XAP_STRING_ID_ENC_GREE_ISO},
186 	{enc_8859_8,			NULL, XAP_STRING_ID_ENC_HEBR_ISO},
187 	{enc_8859_9,			NULL, XAP_STRING_ID_ENC_TURK_ISO},
188 	// 8859-10, 8859-13-16
189 	{enc_2022_jp,			NULL, XAP_STRING_ID_ENC_JAPN_ISO},
190 	{enc_johab,				NULL, XAP_STRING_ID_ENC_KORE_JOHAB},
191 	{enc_koi8r,				NULL, XAP_STRING_ID_ENC_CYRL_KOI},
192 	{enc_koi8u,				NULL, XAP_STRING_ID_ENC_UKRA_KOI},
193 	{enc_ksc5601,			NULL, XAP_STRING_ID_ENC_KORE_KSC},	// ISO
194 	{enc_macarab,			NULL, XAP_STRING_ID_ENC_ARAB_MAC},
195 	{enc_macceur,			NULL, XAP_STRING_ID_ENC_CENT_MAC},
196 	{enc_maccroat,			NULL, XAP_STRING_ID_ENC_CROA_MAC},
197 	{enc_maccyr,			NULL, XAP_STRING_ID_ENC_CYRL_MAC},
198 	{enc_macgrk,			NULL, XAP_STRING_ID_ENC_GREE_MAC},
199 	{enc_macheb,			NULL, XAP_STRING_ID_ENC_HEBR_MAC},
200 	{enc_macice,			NULL, XAP_STRING_ID_ENC_ICEL_MAC},
201 	{enc_macrman,			NULL, XAP_STRING_ID_ENC_ROMA_MAC},
202 	{enc_macrom,			NULL, XAP_STRING_ID_ENC_WEST_MAC},
203 	{enc_macthai,			NULL, XAP_STRING_ID_ENC_THAI_MAC},
204 	{enc_macturk,			NULL, XAP_STRING_ID_ENC_TURK_MAC},
205 	{enc_macukr,			NULL, XAP_STRING_ID_ENC_UKRA_MAC},
206 	// other mac encodings
207 	{enc_next,				NULL, XAP_STRING_ID_ENC_WEST_NXT},
208 	{enc_sjis,				NULL, XAP_STRING_ID_ENC_JAPN_SJIS},
209 	{enc_tcvn,				NULL, XAP_STRING_ID_ENC_VIET_TCVN},
210 	{enc_tis620,			NULL, XAP_STRING_ID_ENC_THAI_TIS},
211 //	{enc_ucs2,				NULL, XAP_STRING_ID_ENC_UNIC_UCS_2},
212 	{enc_ucs2be,			NULL, XAP_STRING_ID_ENC_UNIC_UCS_2BE},
213 	{enc_ucs2le,			NULL, XAP_STRING_ID_ENC_UNIC_UCS_2LE},
214 //	{enc_ucs4,				NULL, XAP_STRING_ID_ENC_UNIC_UCS_4},
215 	{enc_ucs4be,			NULL, XAP_STRING_ID_ENC_UNIC_UCS_4BE},
216 	{enc_ucs4le,			NULL, XAP_STRING_ID_ENC_UNIC_UCS_4LE},
217     {enc_usascii,    		NULL, XAP_STRING_ID_ENC_WEST_ASCII},
218 	{enc_utf7,				NULL, XAP_STRING_ID_ENC_UNIC_UTF_7},
219 	{enc_utf8,				NULL, XAP_STRING_ID_ENC_UNIC_UTF_8},
220 //	{enc_utf16,				NULL, XAP_STRING_ID_ENC_UNIC_UTF_16},
221 	{enc_utf16be,			NULL, XAP_STRING_ID_ENC_UNIC_UTF_16BE},
222 	{enc_utf16le,			NULL, XAP_STRING_ID_ENC_UNIC_UTF_16LE},
223 //	{enc_utf32,				NULL, XAP_STRING_ID_ENC_UNIC_UTF_32},
224 	{enc_utf32be,			NULL, XAP_STRING_ID_ENC_UNIC_UTF_32BE},
225 	{enc_utf32le,			NULL, XAP_STRING_ID_ENC_UNIC_UTF_32LE},
226 	{enc_viscii,			NULL, XAP_STRING_ID_ENC_VIET_VISCII},
227 };
228 
s_compareQ(const void * a,const void * b)229 static int s_compareQ(const void * a, const void *b)
230 {
231 	const enc_entry * A = static_cast<const enc_entry *>(a);
232 	const enc_entry * B = static_cast<const enc_entry *>(b);
233 
234 	if (A->id < B->id)
235 	{
236 		return -1;
237 	}
238 	else if (A->id > B->id)
239 	{
240 		return 1;
241 	}
242 	return 0;
243 }
244 
s_compareB(const void * l,const void * e)245 static int s_compareB(const void * l, const void *e)
246 {
247 	const gchar * L   = static_cast<const gchar *>(l);
248 	const enc_entry * E = static_cast<const enc_entry *>(e);
249 	return strcmp(L, E->encs[0]);
250 }
251 
// True until the first UT_Encoding constructor has probed iconv and rebuilt
// s_Table; the constructor clears it so the work is done only once.
bool UT_Encoding::s_Init = true;
// Number of usable entries at the front of s_Table; assigned by the constructor.
UT_uint32 UT_Encoding::s_iCount = 0;
254 
255 
256 /*!
257   Construct encoding class
258 
259  Find out which encodings the iconv on this system supports.
260  We try several possible names for each encoding.
261  If any name is successfully opened it becomes the only name for this encoding.
262  If no name is successfully opened the encoding is removed from the table.
263  */
UT_Encoding()264 UT_Encoding::UT_Encoding()
265 {
266 	if (s_Init) //only do this once
267 	{
268 		const XAP_StringSet * pSS = XAP_App::getApp()->getStringSet();
269 
270 		// Test all the encodings in our master table
271 		// Build a list of only those supported by the current iconv
272 		UT_uint32 iCheckIndex = 0;
273 		UT_uint32 iOkayIndex = 0;
274 
275 		while (iCheckIndex < G_N_ELEMENTS(s_Table))
276 		{
277 			const gchar * szName = pSS->getValue(s_Table[iCheckIndex].id);
278 			const gchar * szEnc;
279 			UT_uint32 iAltIndex;
280 			bool bFound = false;
281 
282 			UT_DEBUGMSG(("Encoding '%s' = ",s_Table[iCheckIndex].encs[0]));
283 			for (iAltIndex = 0; (szEnc = s_Table[iCheckIndex].encs[iAltIndex]); ++iAltIndex)
284 			{
285 				UT_iconv_t iconv_handle = UT_iconv_open(szEnc,szEnc);
286 				if (UT_iconv_isValid(iconv_handle))
287 				{
288 					bFound = true;
289 					UT_iconv_close(iconv_handle);
290 					s_Table[iOkayIndex].encs[0] = szEnc;
291 					s_Table[iOkayIndex].encs[1] = 0;
292 					s_Table[iOkayIndex].desc = szName;
293 					s_Table[iOkayIndex].id = s_Table[iCheckIndex].id;
294 					UT_DEBUGMSG(("'%s' (alias %d)\n",szEnc,iAltIndex+1));
295 					++iOkayIndex;
296 					break;
297 				}
298 			}
299 			if (bFound == false)
300 			{
301 				UT_DEBUGMSG(("** Not supported **\n"));
302 			}
303 			++iCheckIndex;
304 		}
305 		s_iCount = iOkayIndex;
306 
307 		qsort(s_Table, s_iCount, sizeof(enc_entry), s_compareQ);
308 
309 		s_Init = false;
310 	}
311 }
312 
getCount()313 UT_uint32 UT_Encoding::getCount()
314 {
315 	UT_ASSERT (s_Init == false);
316 	return s_iCount;
317 }
318 
getNthEncoding(UT_uint32 n)319 const gchar * UT_Encoding::getNthEncoding(UT_uint32 n)
320 {
321 	UT_ASSERT (s_Init == false);
322 	return (s_Table[n].encs[0]);
323 }
324 
getNthDescription(UT_uint32 n)325 const gchar * UT_Encoding::getNthDescription(UT_uint32 n)
326 {
327 	UT_ASSERT (s_Init == false);
328 	return (s_Table[n].desc);
329 }
330 
331 
/*!
  Look up an encoding name by its localised description.

  \param desc description to search for (exact, case-sensitive match)
  \return the encoding name of the first supported entry whose description
          equals desc, or NULL when desc is NULL or nothing matches
 */
const gchar * UT_Encoding::getEncodingFromDescription(const gchar * desc)
{
	UT_ASSERT (s_Init == false);
	if (!desc)
	{
		return NULL;
	}
	for (UT_uint32 i = 0; i < s_iCount; i++)
	{
		// An entry may lack a description if the string set returned none
		// for its id; strcmp() must not be handed a NULL pointer.
		if (s_Table[i].desc && !strcmp(desc, s_Table[i].desc))
		{
			return s_Table[i].encs[0];
		}
	}
	return NULL;
}
344 
getIndxFromEncoding(const gchar * enc)345 UT_uint32 UT_Encoding::getIndxFromEncoding(const gchar * enc)
346 {
347 	UT_ASSERT(s_Init == false);
348 	for (UT_uint32 i = 0; i < s_iCount; i++)
349 	{
350 		if (!strcmp(enc, s_Table[i].encs[0]))
351 		{
352 			return i;
353 		}
354 	}
355 	return 0;
356 }
357 
getIdFromEncoding(const gchar * enc)358 UT_uint32 UT_Encoding::getIdFromEncoding(const gchar * enc)
359 {
360 	UT_ASSERT (s_Init == false);
361 	enc_entry * e = static_cast<enc_entry *>(bsearch(enc, s_Table, s_iCount, sizeof(enc_entry), s_compareB));
362 	if (e)
363 	{
364 		return e->id;
365 	}
366 	else
367 	{
368 		return 0;
369 	}
370 }
371 
372