1 /* AbiSource Program Utilities
2 * Copyright (C) 2001 AbiSource, Inc.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA.
18 */
19
20 #ifdef HAVE_CONFIG_H
21 #include "config.h"
22 #endif
23
24 #include <stdlib.h>
25
26 #include "ut_iconv.h"
27 #include "ut_Encoding.h"
28 #include "ut_string.h"
29 #include "ut_debugmsg.h"
30
31 #include "xap_App.h"
32 #include "xap_Strings.h"
33
34 // Please keep the list below alphabetised by the encoding; even though
35 // this is not required for it to work, it will make it easier to maintain.
36
// To add a new encoding:
// (1) check that it is not already on this list;
// (2) add its string ID to the list in xap_String_Id.h;
// (3) add it here, using the ID that corresponds to the one
// from xap_String_Id.h.
42
43 // This list is based on libiconv.
44 // Since iconv implementations differ and iconv has no enumeration API,
45 // this code attempts to enumerate the supported encodings.
46 //
47 // There is an array of possible names for each encoding
48 // in order of standardization or popularity. We attempt to
49 // open each before deciding the encoding is not supported.
50 //
51 // Another approach is to do these tests in an external program which
52 // outputs the C++ code for the following table.
53 //
54 // TODO Note that certain operations in AbiWord currently try to open or
55 // TODO compare certain encodings via hard-coded names. This should be
56 // TODO discouraged and replaced with names derived as in these tables.
57 //
58 // TODO This code should probably move into the Encoding Manager.
59 //
// TODO Some platforms use specific names not covered by this alias table,
// TODO such as the Solaris iso-8859-n problem just fixed. Root out any
// TODO more instances of this to easily avoid problems that can cause a
// TODO huge PITA when misdiagnosed.
64
65 static const gchar * enc_armscii[] = {"ARMSCII-8",0};
66 static const gchar * enc_big5[] = {"BIG5","BIG-5","BIG-FIVE","BIGFIVE","CN-BIG5",0};
67 static const gchar * enc_big5hkscs[] = {"BIG5-HKSCS","BIG5HKSCS",0};
68 #ifdef WIN32 /* DOS/Win32 console encodings, peer iconv supports, others may not */
69 static const gchar * enc_cp437[] = {"C437","IBM437","437",0};
70 static const gchar * enc_cp850[] = {"C850","IBM850","850",0};
71 #endif
72 static const gchar * enc_cp874[] = {"CP874",0};
73 static const gchar * enc_cp932[] = {"CP932",0};
74 static const gchar * enc_cp936[] = {"CP936","GBK",0};
75 static const gchar * enc_cp949[] = {"CP949","UHC",0};
76 static const gchar * enc_cp950[] = {"CP950",0};
77 static const gchar * enc_cp1250[] = {"CP1250","WINDOWS-1250","MS-EE",0};
78 static const gchar * enc_cp1251[] = {"CP1251","WINDOWS-1251","MS-CYRL",0};
79 static const gchar * enc_cp1252[] = {"CP1252","WINDOWS-1252","MS-ANSI",0};
80 static const gchar * enc_cp1253[] = {"CP1253","WINDOWS-1253","MS-GREEK",0};
81 static const gchar * enc_cp1254[] = {"CP1254","WINDOWS-1254","MS-TURK",0};
82 static const gchar * enc_cp1255[] = {"CP1255","WINDOWS-1255","MS-HEBR",0};
83 static const gchar * enc_cp1256[] = {"CP1256","WINDOWS-1256","MS-ARAB",0};
84 static const gchar * enc_cp1257[] = {"CP1257","WINDOWS-1257","WINBALTRIM",0};
85 static const gchar * enc_cp1258[] = {"CP1258","WINDOWS-1258",0};
86 static const gchar * enc_euc_cn[] = {"EUC-CN","EUCCN","GB2312","CN-GB",0}; // Cf. GB_2312-80
87 static const gchar * enc_euc_jp[] = {"EUC-JP","EUCJP",0};
88 static const gchar * enc_euc_kr[] = {"EUC-KR","EUCKR",0};
89 static const gchar * enc_euc_tw[] = {"EUC-TW","EUCTW",0};
90 static const gchar * enc_gb2312[] = {"GB_2312-80","ISO-IR-58","CHINESE",0}; // Cf. EUC-CN
91 static const gchar * enc_georga[] = {"GEORGIAN-ACADEMY",0};
92 static const gchar * enc_georgps[] = {"GEORGIAN-PS",0};
93 static const gchar * enc_hp[] = {"HP-ROMAN8","ROMAN8","R8",0};
94 static const gchar * enc_hz[] = {"HZ","HZ-GB-2312",0};
95 static const gchar * enc_8859_1[] = {"ISO-8859-1","ISO_8859-1","8859-1","LATIN1","L1",0};
96 static const gchar * enc_8859_2[] = {"ISO-8859-2","ISO_8859-2","8859-2","LATIN2","L2",0};
97 //static const gchar * enc_8859_3[] = {"ISO-8859-3","ISO_8859-3","8859-3","LATIN3","L3",0};
98 static const gchar * enc_8859_4[] = {"ISO-8859-4","ISO_8859-4","8859-4","LATIN4","L4",0};
99 static const gchar * enc_8859_5[] = {"ISO-8859-5","ISO_8859-5","8859-5","CYRILLIC",0};
100 static const gchar * enc_8859_6[] = {"ISO-8859-6","ISO_8859-6","8859-6","ECMA-114","ASMO-708","ARABIC",0};
101 static const gchar * enc_8859_7[] = {"ISO-8859-7","ISO_8859-7","8859-7","ECMA-118","ELOT_928","GREEK8","GREEK",0};
102 static const gchar * enc_8859_8[] = {"ISO-8859-8","ISO_8859-8","8859-8","HEBREW",0};
103 static const gchar * enc_8859_9[] = {"ISO-8859-9","ISO_8859-9","8859-9","LATIN5","L5",0};
104 //static const gchar * enc_8859_10[] = {"ISO-8859-10","ISO_8859-10","8859-10","LATIN6","L6",0};
105 //static const gchar * enc_8859_13[] = {"ISO-8859-13","ISO_8859-13","8859-13","LATIN7","L7",0};
106 //static const gchar * enc_8859_14[] = {"ISO-8859-14","ISO_8859-14","8859-14","LATIN8","L8",0};
107 //static const gchar * enc_8859_15[] = {"ISO-8859-15","ISO_8859-15","8859-15",0};
108 //static const gchar * enc_8859_16[] = {"ISO-8859-16","ISO_8859-16","8859-16",0};
109 static const gchar * enc_2022_jp[] = {"ISO-2022-JP",0};
110 // There are 4 JIS encodings which are not Shift-JIS...
111 static const gchar * enc_johab[] = {"JOHAB","CP1361",0};
112 static const gchar * enc_koi8r[] = {"KOI8-R",0};
113 static const gchar * enc_koi8u[] = {"KOI8-U",0};
114 static const gchar * enc_ksc5601[] = {"KSC_5601","KS_C_5601-1987","KS_C_5601-1989","KOREAN",0};
115 static const gchar * enc_macarab[] = {"MacArabic",0};
116 static const gchar * enc_macceur[] = {"MacCentralEurope",0};
117 static const gchar * enc_maccroat[] = {"MacCroatian",0};
118 static const gchar * enc_maccyr[] = {"MacCyrillic",0};
119 static const gchar * enc_macgrk[] = {"MacGreek",0};
120 static const gchar * enc_macheb[] = {"MacHebrew",0};
121 static const gchar * enc_macice[] = {"MacIceLand",0};
122 static const gchar * enc_macrom[] = {"MacRoman","MACINTOSH","MAC",0};
123 static const gchar * enc_macrman[] = {"MacRomania",0};
124 static const gchar * enc_macthai[] = {"MacThai",0};
125 static const gchar * enc_macturk[] = {"MacTurkish",0};
126 static const gchar * enc_macukr[] = {"MacUkraine",0};
127 //static const gchar * enc_mulao[] = {"MULELAO-1",0};
128 static const gchar * enc_next[] = {"NEXTSTEP",0};
129 static const gchar * enc_sjis[] = {"SJIS","SHIFT_JIS","SHIFT-JIS","MS_KANJI",0};
130 static const gchar * enc_tcvn[] = {"TCVN","TCVN-5712","TCVN5712-1",0};
131 static const gchar * enc_tis620[] = {"TIS-620","TIS620","TIS620-0",0};
132 static const gchar * enc_ucs2be[] = {"UCS-2BE","UCS-2-BE","UNICODEBIG","UNICODE-1-1",0};
133 static const gchar * enc_ucs2le[] = {"UCS-2LE","UCS-2-LE","UNICODELITTLE",0};
134 static const gchar * enc_ucs4be[] = {"UCS-4BE","UCS-4-BE",0};
135 static const gchar * enc_ucs4le[] = {"UCS-4LE","UCS-4-LE",0};
136 // US-ASCII has more aliases if we need them
137 static const gchar * enc_usascii[] = {"US-ASCII","ASCII","US",0};
138 static const gchar * enc_utf7[] = {"UTF-7","UNICODE-1-1-UTF-7",0};
139 static const gchar * enc_utf8[] = {"UTF-8",0};
140 static const gchar * enc_utf16be[] = {"UTF-16BE","UTF-16-BE",0};
141 static const gchar * enc_utf16le[] = {"UTF-16LE","UTF-16-LE",0};
142 static const gchar * enc_utf32be[] = {"UTF-32BE","UTF-32-BE",0};
143 static const gchar * enc_utf32le[] = {"UTF-32LE","UTF-32-LE",0};
144 static const gchar * enc_viscii[] = {"VISCII",0};
145
// Master table of every encoding AbiWord knows about.  The table is not
// const: the UT_Encoding constructor rewrites it in place, moving the rows
// iconv supports to the front, filling in their descriptions and then sorting
// those rows by string id.  After that only the first s_iCount rows are
// meaningful.
static enc_entry s_Table[] =
{
	// Per row: the alias list (its first cell ends up holding the property
	// value), the localised description (NULL here; set by the constructor
	// from the string set) and the numerical string id.
	{enc_armscii, NULL, XAP_STRING_ID_ENC_ARME_ARMSCII},
	{enc_big5, NULL, XAP_STRING_ID_ENC_CHTR_BIG5},
	{enc_big5hkscs, NULL, XAP_STRING_ID_ENC_CHTR_BIG5HKSCS},
#ifdef TOOLKIT_WIN
	// DOS code pages, only listed on the Windows toolkit.
	{enc_cp437, NULL, XAP_STRING_ID_ENC_US_DOS},
	{enc_cp850, NULL, XAP_STRING_ID_ENC_MLNG_DOS},
#endif
	{enc_cp874, NULL, XAP_STRING_ID_ENC_THAI_WIN},
	{enc_cp932, NULL, XAP_STRING_ID_ENC_JAPN_WIN},
	{enc_cp936, NULL, XAP_STRING_ID_ENC_CHSI_WIN},
	{enc_cp949, NULL, XAP_STRING_ID_ENC_KORE_WIN},
	{enc_cp950, NULL, XAP_STRING_ID_ENC_CHTR_WIN},
	{enc_cp1250, NULL, XAP_STRING_ID_ENC_CENT_WIN},
	{enc_cp1251, NULL, XAP_STRING_ID_ENC_CYRL_WIN},
	{enc_cp1252, NULL, XAP_STRING_ID_ENC_WEST_WIN},
	{enc_cp1253, NULL, XAP_STRING_ID_ENC_GREE_WIN},
	{enc_cp1254, NULL, XAP_STRING_ID_ENC_TURK_WIN},
	{enc_cp1255, NULL, XAP_STRING_ID_ENC_HEBR_WIN},
	{enc_cp1256, NULL, XAP_STRING_ID_ENC_ARAB_WIN},
	{enc_cp1257, NULL, XAP_STRING_ID_ENC_BALT_WIN},
	{enc_cp1258, NULL, XAP_STRING_ID_ENC_VIET_WIN},
	{enc_euc_cn, NULL, XAP_STRING_ID_ENC_CHSI_EUC},
	{enc_euc_jp, NULL, XAP_STRING_ID_ENC_JAPN_EUC},
	{enc_euc_kr, NULL, XAP_STRING_ID_ENC_KORE_EUC},
	{enc_euc_tw, NULL, XAP_STRING_ID_ENC_CHTR_EUC},
	{enc_gb2312, NULL, XAP_STRING_ID_ENC_CHSI_GB},
	{enc_georga, NULL, XAP_STRING_ID_ENC_GEOR_ACADEMY},
	{enc_georgps, NULL, XAP_STRING_ID_ENC_GEOR_PS},
	{enc_hp, NULL, XAP_STRING_ID_ENC_WEST_HP},
	{enc_hz, NULL, XAP_STRING_ID_ENC_CHSI_HZ},
	{enc_8859_1, NULL, XAP_STRING_ID_ENC_WEST_ISO},
	{enc_8859_2, NULL, XAP_STRING_ID_ENC_CENT_ISO},
	// 8859-3 (alias list is commented out, so there is no row for it)
	{enc_8859_4, NULL, XAP_STRING_ID_ENC_BALT_ISO},
	{enc_8859_5, NULL, XAP_STRING_ID_ENC_CYRL_ISO},
	{enc_8859_6, NULL, XAP_STRING_ID_ENC_ARAB_ISO},
	{enc_8859_7, NULL, XAP_STRING_ID_ENC_GREE_ISO},
	{enc_8859_8, NULL, XAP_STRING_ID_ENC_HEBR_ISO},
	{enc_8859_9, NULL, XAP_STRING_ID_ENC_TURK_ISO},
	// 8859-10, 8859-13-16 (alias lists are commented out as well)
	{enc_2022_jp, NULL, XAP_STRING_ID_ENC_JAPN_ISO},
	{enc_johab, NULL, XAP_STRING_ID_ENC_KORE_JOHAB},
	{enc_koi8r, NULL, XAP_STRING_ID_ENC_CYRL_KOI},
	{enc_koi8u, NULL, XAP_STRING_ID_ENC_UKRA_KOI},
	{enc_ksc5601, NULL, XAP_STRING_ID_ENC_KORE_KSC}, // ISO
	{enc_macarab, NULL, XAP_STRING_ID_ENC_ARAB_MAC},
	{enc_macceur, NULL, XAP_STRING_ID_ENC_CENT_MAC},
	{enc_maccroat, NULL, XAP_STRING_ID_ENC_CROA_MAC},
	{enc_maccyr, NULL, XAP_STRING_ID_ENC_CYRL_MAC},
	{enc_macgrk, NULL, XAP_STRING_ID_ENC_GREE_MAC},
	{enc_macheb, NULL, XAP_STRING_ID_ENC_HEBR_MAC},
	{enc_macice, NULL, XAP_STRING_ID_ENC_ICEL_MAC},
	{enc_macrman, NULL, XAP_STRING_ID_ENC_ROMA_MAC},
	{enc_macrom, NULL, XAP_STRING_ID_ENC_WEST_MAC},
	{enc_macthai, NULL, XAP_STRING_ID_ENC_THAI_MAC},
	{enc_macturk, NULL, XAP_STRING_ID_ENC_TURK_MAC},
	{enc_macukr, NULL, XAP_STRING_ID_ENC_UKRA_MAC},
	// other mac encodings
	{enc_next, NULL, XAP_STRING_ID_ENC_WEST_NXT},
	{enc_sjis, NULL, XAP_STRING_ID_ENC_JAPN_SJIS},
	{enc_tcvn, NULL, XAP_STRING_ID_ENC_VIET_TCVN},
	{enc_tis620, NULL, XAP_STRING_ID_ENC_THAI_TIS},
	// {enc_ucs2, NULL, XAP_STRING_ID_ENC_UNIC_UCS_2},
	{enc_ucs2be, NULL, XAP_STRING_ID_ENC_UNIC_UCS_2BE},
	{enc_ucs2le, NULL, XAP_STRING_ID_ENC_UNIC_UCS_2LE},
	// {enc_ucs4, NULL, XAP_STRING_ID_ENC_UNIC_UCS_4},
	{enc_ucs4be, NULL, XAP_STRING_ID_ENC_UNIC_UCS_4BE},
	{enc_ucs4le, NULL, XAP_STRING_ID_ENC_UNIC_UCS_4LE},
	{enc_usascii, NULL, XAP_STRING_ID_ENC_WEST_ASCII},
	{enc_utf7, NULL, XAP_STRING_ID_ENC_UNIC_UTF_7},
	{enc_utf8, NULL, XAP_STRING_ID_ENC_UNIC_UTF_8},
	// {enc_utf16, NULL, XAP_STRING_ID_ENC_UNIC_UTF_16},
	{enc_utf16be, NULL, XAP_STRING_ID_ENC_UNIC_UTF_16BE},
	{enc_utf16le, NULL, XAP_STRING_ID_ENC_UNIC_UTF_16LE},
	// {enc_utf32, NULL, XAP_STRING_ID_ENC_UNIC_UTF_32},
	{enc_utf32be, NULL, XAP_STRING_ID_ENC_UNIC_UTF_32BE},
	{enc_utf32le, NULL, XAP_STRING_ID_ENC_UNIC_UTF_32LE},
	{enc_viscii, NULL, XAP_STRING_ID_ENC_VIET_VISCII},
};
228
s_compareQ(const void * a,const void * b)229 static int s_compareQ(const void * a, const void *b)
230 {
231 const enc_entry * A = static_cast<const enc_entry *>(a);
232 const enc_entry * B = static_cast<const enc_entry *>(b);
233
234 if (A->id < B->id)
235 {
236 return -1;
237 }
238 else if (A->id > B->id)
239 {
240 return 1;
241 }
242 return 0;
243 }
244
s_compareB(const void * l,const void * e)245 static int s_compareB(const void * l, const void *e)
246 {
247 const gchar * L = static_cast<const gchar *>(l);
248 const enc_entry * E = static_cast<const enc_entry *>(e);
249 return strcmp(L, E->encs[0]);
250 }
251
// True while the encoding table still has to be initialised: the constructor
// does its work only while this is set and clears it when done, and the
// accessors assert that it is false.
bool UT_Encoding::s_Init = true;
// Number of supported encodings, i.e. of valid rows at the front of s_Table;
// assigned by the constructor.
UT_uint32 UT_Encoding::s_iCount = 0;
254
255
256 /*!
257 Construct encoding class
258
259 Find out which encodings the iconv on this system supports.
260 We try several possible names for each encoding.
261 If any name is successfully opened it becomes the only name for this encoding.
262 If no name is successfully opened the encoding is removed from the table.
263 */
UT_Encoding()264 UT_Encoding::UT_Encoding()
265 {
266 if (s_Init) //only do this once
267 {
268 const XAP_StringSet * pSS = XAP_App::getApp()->getStringSet();
269
270 // Test all the encodings in our master table
271 // Build a list of only those supported by the current iconv
272 UT_uint32 iCheckIndex = 0;
273 UT_uint32 iOkayIndex = 0;
274
275 while (iCheckIndex < G_N_ELEMENTS(s_Table))
276 {
277 const gchar * szName = pSS->getValue(s_Table[iCheckIndex].id);
278 const gchar * szEnc;
279 UT_uint32 iAltIndex;
280 bool bFound = false;
281
282 UT_DEBUGMSG(("Encoding '%s' = ",s_Table[iCheckIndex].encs[0]));
283 for (iAltIndex = 0; (szEnc = s_Table[iCheckIndex].encs[iAltIndex]); ++iAltIndex)
284 {
285 UT_iconv_t iconv_handle = UT_iconv_open(szEnc,szEnc);
286 if (UT_iconv_isValid(iconv_handle))
287 {
288 bFound = true;
289 UT_iconv_close(iconv_handle);
290 s_Table[iOkayIndex].encs[0] = szEnc;
291 s_Table[iOkayIndex].encs[1] = 0;
292 s_Table[iOkayIndex].desc = szName;
293 s_Table[iOkayIndex].id = s_Table[iCheckIndex].id;
294 UT_DEBUGMSG(("'%s' (alias %d)\n",szEnc,iAltIndex+1));
295 ++iOkayIndex;
296 break;
297 }
298 }
299 if (bFound == false)
300 {
301 UT_DEBUGMSG(("** Not supported **\n"));
302 }
303 ++iCheckIndex;
304 }
305 s_iCount = iOkayIndex;
306
307 qsort(s_Table, s_iCount, sizeof(enc_entry), s_compareQ);
308
309 s_Init = false;
310 }
311 }
312
getCount()313 UT_uint32 UT_Encoding::getCount()
314 {
315 UT_ASSERT (s_Init == false);
316 return s_iCount;
317 }
318
getNthEncoding(UT_uint32 n)319 const gchar * UT_Encoding::getNthEncoding(UT_uint32 n)
320 {
321 UT_ASSERT (s_Init == false);
322 return (s_Table[n].encs[0]);
323 }
324
getNthDescription(UT_uint32 n)325 const gchar * UT_Encoding::getNthDescription(UT_uint32 n)
326 {
327 UT_ASSERT (s_Init == false);
328 return (s_Table[n].desc);
329 }
330
331
/*!
 Map a description back to its encoding name.
 \param desc description to look for; compared with strcmp(), so the match
        is exact and case-sensitive
 \return the encoding name of the first supported row whose description
         equals desc, or NULL when no row matches
 */
const gchar * UT_Encoding::getEncodingFromDescription(const gchar * desc)
{
	UT_ASSERT (s_Init == false);

	const enc_entry * const pEnd = s_Table + s_iCount;
	for (const enc_entry * pRow = s_Table; pRow != pEnd; ++pRow)
	{
		if (strcmp(desc, pRow->desc) == 0)
		{
			return pRow->encs[0];
		}
	}
	return NULL;
}
344
getIndxFromEncoding(const gchar * enc)345 UT_uint32 UT_Encoding::getIndxFromEncoding(const gchar * enc)
346 {
347 UT_ASSERT(s_Init == false);
348 for (UT_uint32 i = 0; i < s_iCount; i++)
349 {
350 if (!strcmp(enc, s_Table[i].encs[0]))
351 {
352 return i;
353 }
354 }
355 return 0;
356 }
357
getIdFromEncoding(const gchar * enc)358 UT_uint32 UT_Encoding::getIdFromEncoding(const gchar * enc)
359 {
360 UT_ASSERT (s_Init == false);
361 enc_entry * e = static_cast<enc_entry *>(bsearch(enc, s_Table, s_iCount, sizeof(enc_entry), s_compareB));
362 if (e)
363 {
364 return e->id;
365 }
366 else
367 {
368 return 0;
369 }
370 }
371
372