1 /*
2  * local.c - translate our internal character set codes to and from
3  * our own set of plausibly legible character-set names. Also
4  * provides a canonical name for each encoding (useful for software
5  * announcing what character set it will be using), and a set of
6  * enumeration functions which return a list of supported
7  * encodings one by one.
8  *
9  * Also in this table are other ways people might plausibly refer
10  * to a charset (for example, Win1252 as well as CP1252). Where
11  * more than one string is given for a particular character set,
12  * the first one is the canonical one returned by
13  * charset_to_localenc.
14  *
15  * charset_from_localenc will attempt all other text translations
16  * as well as this table, to maximise the number of different ways
17  * you can select a supported charset.
18  */
19 
20 #include <ctype.h>
21 #include "charset.h"
22 #include "internal.h"
23 
24 static const struct {
25     const char *name;
26     int charset;
27     int return_in_enum;   /* enumeration misses some charsets */
28 } localencs[] = {
29     { "<UNKNOWN>", CS_NONE, 0 },
30     { "ASCII", CS_ASCII, 1 },
31     { "BS 4730", CS_BS4730, 1 },
32     { "BS-4730", CS_BS4730, 0 },
33     { "BS4730", CS_BS4730, 0 },
34     { "ISO-8859-1", CS_ISO8859_1, 1 },
35     { "ISO-8859-1 with X11 line drawing", CS_ISO8859_1_X11, 0 },
36     { "ISO-8859-1-X11", CS_ISO8859_1_X11, 0 },
37     { "ISO8859-1-X11", CS_ISO8859_1_X11, 0 },
38     { "ISO-8859-2", CS_ISO8859_2, 1 },
39     { "ISO-8859-3", CS_ISO8859_3, 1 },
40     { "ISO-8859-4", CS_ISO8859_4, 1 },
41     { "ISO-8859-5", CS_ISO8859_5, 1 },
42     { "ISO-8859-6", CS_ISO8859_6, 1 },
43     { "ISO-8859-7", CS_ISO8859_7, 1 },
44     { "ISO-8859-8", CS_ISO8859_8, 1 },
45     { "ISO-8859-9", CS_ISO8859_9, 1 },
46     { "ISO-8859-10", CS_ISO8859_10, 1 },
47     { "ISO-8859-11", CS_ISO8859_11, 1 },
48     { "ISO-8859-13", CS_ISO8859_13, 1 },
49     { "ISO-8859-14", CS_ISO8859_14, 1 },
50     { "ISO-8859-15", CS_ISO8859_15, 1 },
51     { "ISO-8859-16", CS_ISO8859_16, 1 },
52     { "CP437", CS_CP437, 1 },
53     { "CP850", CS_CP850, 1 },
54     { "CP852", CS_CP852, 1 },
55     { "CP866", CS_CP866, 1 },
56     { "CP874", CS_CP874, 1 },
57     { "Win874", CS_CP874, 0 },
58     { "Win-874", CS_CP874, 0 },
59     { "CP1250", CS_CP1250, 1 },
60     { "Win1250", CS_CP1250, 0 },
61     { "CP1251", CS_CP1251, 1 },
62     { "Win1251", CS_CP1251, 0 },
63     { "CP1252", CS_CP1252, 1 },
64     { "Win1252", CS_CP1252, 0 },
65     { "CP1253", CS_CP1253, 1 },
66     { "Win1253", CS_CP1253, 0 },
67     { "CP1254", CS_CP1254, 1 },
68     { "Win1254", CS_CP1254, 0 },
69     { "CP1255", CS_CP1255, 1 },
70     { "Win1255", CS_CP1255, 0 },
71     { "CP1256", CS_CP1256, 1 },
72     { "Win1256", CS_CP1256, 0 },
73     { "CP1257", CS_CP1257, 1 },
74     { "Win1257", CS_CP1257, 0 },
75     { "CP1258", CS_CP1258, 1 },
76     { "Win1258", CS_CP1258, 0 },
77     { "KOI8-R", CS_KOI8_R, 1 },
78     { "KOI8-U", CS_KOI8_U, 1 },
79     { "KOI8-RU", CS_KOI8_RU, 1 },
80     { "JIS X 0201", CS_JISX0201, 1 },
81     { "JIS-X-0201", CS_JISX0201, 0 },
82     { "JIS_X_0201", CS_JISX0201, 0 },
83     { "JISX0201", CS_JISX0201, 0 },
84     { "Mac Roman", CS_MAC_ROMAN, 1 },
85     { "Mac-Roman", CS_MAC_ROMAN, 0 },
86     { "MacRoman", CS_MAC_ROMAN, 0 },
87     { "Mac Turkish", CS_MAC_TURKISH, 1 },
88     { "Mac-Turkish", CS_MAC_TURKISH, 0 },
89     { "MacTurkish", CS_MAC_TURKISH, 0 },
90     { "Mac Croatian", CS_MAC_CROATIAN, 1 },
91     { "Mac-Croatian", CS_MAC_CROATIAN, 0 },
92     { "MacCroatian", CS_MAC_CROATIAN, 0 },
93     { "Mac Iceland", CS_MAC_ICELAND, 1 },
94     { "Mac-Iceland", CS_MAC_ICELAND, 0 },
95     { "MacIceland", CS_MAC_ICELAND, 0 },
96     { "Mac Romanian", CS_MAC_ROMANIAN, 1 },
97     { "Mac-Romanian", CS_MAC_ROMANIAN, 0 },
98     { "MacRomanian", CS_MAC_ROMANIAN, 0 },
99     { "Mac Greek", CS_MAC_GREEK, 1 },
100     { "Mac-Greek", CS_MAC_GREEK, 0 },
101     { "MacGreek", CS_MAC_GREEK, 0 },
102     { "Mac Cyrillic", CS_MAC_CYRILLIC, 1 },
103     { "Mac-Cyrillic", CS_MAC_CYRILLIC, 0 },
104     { "MacCyrillic", CS_MAC_CYRILLIC, 0 },
105     { "Mac Thai", CS_MAC_THAI, 1 },
106     { "Mac-Thai", CS_MAC_THAI, 0 },
107     { "MacThai", CS_MAC_THAI, 0 },
108     { "Mac Centeuro", CS_MAC_CENTEURO, 1 },
109     { "Mac-Centeuro", CS_MAC_CENTEURO, 0 },
110     { "MacCenteuro", CS_MAC_CENTEURO, 0 },
111     { "Mac Symbol", CS_MAC_SYMBOL, 1 },
112     { "Mac-Symbol", CS_MAC_SYMBOL, 0 },
113     { "MacSymbol", CS_MAC_SYMBOL, 0 },
114     { "Mac Dingbats", CS_MAC_DINGBATS, 1 },
115     { "Mac-Dingbats", CS_MAC_DINGBATS, 0 },
116     { "MacDingbats", CS_MAC_DINGBATS, 0 },
117     { "Mac Roman (old)", CS_MAC_ROMAN_OLD, 0 },
118     { "Mac-Roman-old", CS_MAC_ROMAN_OLD, 0 },
119     { "MacRoman-old", CS_MAC_ROMAN_OLD, 0 },
120     { "Mac Croatian (old)", CS_MAC_CROATIAN_OLD, 0 },
121     { "Mac-Croatian-old", CS_MAC_CROATIAN_OLD, 0 },
122     { "MacCroatian-old", CS_MAC_CROATIAN_OLD, 0 },
123     { "Mac Iceland (old)", CS_MAC_ICELAND_OLD, 0 },
124     { "Mac-Iceland-old", CS_MAC_ICELAND_OLD, 0 },
125     { "MacIceland-old", CS_MAC_ICELAND_OLD, 0 },
126     { "Mac Romanian (old)", CS_MAC_ROMANIAN_OLD, 0 },
127     { "Mac-Romanian-old", CS_MAC_ROMANIAN_OLD, 0 },
128     { "MacRomanian-old", CS_MAC_ROMANIAN_OLD, 0 },
129     { "Mac Greek (old)", CS_MAC_GREEK_OLD, 0 },
130     { "Mac-Greek-old", CS_MAC_GREEK_OLD, 0 },
131     { "MacGreek-old", CS_MAC_GREEK_OLD, 0 },
132     { "Mac Cyrillic (old)", CS_MAC_CYRILLIC_OLD, 0 },
133     { "Mac-Cyrillic-old", CS_MAC_CYRILLIC_OLD, 0 },
134     { "MacCyrillic-old", CS_MAC_CYRILLIC_OLD, 0 },
135     { "Mac Ukraine", CS_MAC_UKRAINE, 1 },
136     { "Mac-Ukraine", CS_MAC_UKRAINE, 0 },
137     { "MacUkraine", CS_MAC_UKRAINE, 0 },
138     { "Mac VT100", CS_MAC_VT100, 1 },
139     { "Mac-VT100", CS_MAC_VT100, 0 },
140     { "MacVT100", CS_MAC_VT100, 0 },
141     { "Mac VT100 (old)", CS_MAC_VT100_OLD, 0 },
142     { "Mac-VT100-old", CS_MAC_VT100_OLD, 0 },
143     { "MacVT100-old", CS_MAC_VT100_OLD, 0 },
144     { "Mac Roman (Pirard encoding)", CS_MAC_PIRARD, 0 },
145     { "Mac Pirard", CS_MAC_PIRARD, 0 },
146     { "Mac-Pirard", CS_MAC_PIRARD, 0 },
147     { "MacPirard", CS_MAC_PIRARD, 0 },
148     { "VISCII", CS_VISCII, 1 },
149     { "HP ROMAN8", CS_HP_ROMAN8, 1 },
150     { "HP-ROMAN8", CS_HP_ROMAN8, 0 },
151     { "DEC MCS", CS_DEC_MCS, 1 },
152     { "DEC-MCS", CS_DEC_MCS, 1 },
153     { "DEC graphics", CS_DEC_GRAPHICS, 1 },
154     { "DEC-graphics", CS_DEC_GRAPHICS, 0 },
155     { "DECgraphics", CS_DEC_GRAPHICS, 0 },
156     { "UTF-8", CS_UTF8, 1 },
157     { "UTF-7", CS_UTF7, 1 },
158     { "UTF-7-conservative", CS_UTF7_CONSERVATIVE, 0 },
159     { "EUC-CN", CS_EUC_CN, 1 },
160     { "EUC-KR", CS_EUC_KR, 1 },
161     { "EUC-JP", CS_EUC_JP, 1 },
162     { "EUC-TW", CS_EUC_TW, 1 },
163     { "ISO-2022-JP", CS_ISO2022_JP, 1 },
164     { "ISO-2022-KR", CS_ISO2022_KR, 1 },
165     { "Big5", CS_BIG5, 1 },
166     { "Shift-JIS", CS_SHIFT_JIS, 1 },
167     { "HZ", CS_HZ, 1 },
168     { "UTF-16BE", CS_UTF16BE, 1 },
169     { "UTF-16LE", CS_UTF16LE, 1 },
170     { "UTF-16", CS_UTF16, 1 },
171     { "CP949", CS_CP949, 1 },
172     { "PDFDocEncoding", CS_PDF, 1 },
173     { "StandardEncoding", CS_PSSTD, 1 },
174     { "COMPOUND_TEXT", CS_CTEXT, 1 },
175     { "COMPOUND-TEXT", CS_CTEXT, 0 },
176     { "COMPOUND TEXT", CS_CTEXT, 0 },
177     { "COMPOUNDTEXT", CS_CTEXT, 0 },
178     { "CTEXT", CS_CTEXT, 0 },
179     { "ISO-2022", CS_ISO2022, 1 },
180     { "ISO2022", CS_ISO2022, 0 },
181 };
182 
charset_to_localenc(int charset)183 const char *charset_to_localenc(int charset)
184 {
185     int i;
186 
187     for (i = 0; i < (int)lenof(localencs); i++)
188 	if (charset == localencs[i].charset)
189 	    return localencs[i].name;
190 
191     return NULL;		       /* not found */
192 }
193 
charset_from_localenc(const char * name)194 int charset_from_localenc(const char *name)
195 {
196     int i;
197 
198     if ( (i = charset_from_mimeenc(name)) != CS_NONE)
199 	return i;
200     if ( (i = charset_from_xenc(name)) != CS_NONE)
201 	return i;
202     if ( (i = charset_from_emacsenc(name)) != CS_NONE)
203 	return i;
204 
205     for (i = 0; i < (int)lenof(localencs); i++) {
206 	const char *p, *q;
207 	p = name;
208 	q = localencs[i].name;
209 	while (*p || *q) {
210 		if (tolower((unsigned char)*p) != tolower((unsigned char)*q))
211 		break;
212 	    p++; q++;
213 	}
214 	if (!*p && !*q)
215 	    return localencs[i].charset;
216     }
217 
218     return CS_NONE;		       /* not found */
219 }
220 
charset_localenc_nth(int n)221 int charset_localenc_nth(int n)
222 {
223     int i;
224 
225     for (i = 0; i < (int)lenof(localencs); i++)
226 	if (localencs[i].return_in_enum && !n--)
227 	    return localencs[i].charset;
228 
229     return CS_NONE;		       /* end of list */
230 }
231