1 /*
2  * charset.h - header file for general character set conversion
3  * routines.
4  */
5 
6 #ifndef charset_charset_h
7 #define charset_charset_h
8 
9 #include <stddef.h>
10 
/*
 * charset_t: one identifier for each character set, multibyte or
 * single-byte, that this library knows about.
 *
 * No enumerator is given an explicit value, so they are numbered
 * consecutively from zero in the order written below. CS_LIMIT is
 * not a character set: it is a dummy final entry whose value
 * indicates the extent of the enumeration.
 */
typedef enum {
    /* used for reporting errors, etc */
    CS_NONE,

    /* ordinary US-ASCII is worth having! */
    CS_ASCII,

    /* ISO 8859 family */
    CS_ISO8859_1,
    CS_ISO8859_1_X11,   /* X font encoding with VT100 glyphs */
    CS_ISO8859_2,
    CS_ISO8859_3,
    CS_ISO8859_4,
    CS_ISO8859_5,
    CS_ISO8859_6,
    CS_ISO8859_7,
    CS_ISO8859_8,
    CS_ISO8859_9,
    CS_ISO8859_10,
    CS_ISO8859_11,
    CS_ISO8859_13,
    CS_ISO8859_14,
    CS_ISO8859_15,
    CS_ISO8859_16,

    /* CPnnn code pages */
    CS_CP437,
    CS_CP850,
    CS_CP852,
    CS_CP866,
    CS_CP874,
    CS_CP1250,
    CS_CP1251,
    CS_CP1252,
    CS_CP1253,
    CS_CP1254,
    CS_CP1255,
    CS_CP1256,
    CS_CP1257,
    CS_CP1258,

    /* KOI8 family */
    CS_KOI8_R,
    CS_KOI8_U,
    CS_KOI8_RU,

    CS_JISX0201,

    /* CS_MAC_* family */
    CS_MAC_ROMAN,
    CS_MAC_TURKISH,
    CS_MAC_CROATIAN,
    CS_MAC_ICELAND,
    CS_MAC_ROMANIAN,
    CS_MAC_GREEK,
    CS_MAC_CYRILLIC,
    CS_MAC_THAI,
    CS_MAC_CENTEURO,
    CS_MAC_SYMBOL,
    CS_MAC_DINGBATS,
    CS_MAC_ROMAN_OLD,
    CS_MAC_CROATIAN_OLD,
    CS_MAC_ICELAND_OLD,
    CS_MAC_ROMANIAN_OLD,
    CS_MAC_GREEK_OLD,
    CS_MAC_CYRILLIC_OLD,
    CS_MAC_UKRAINE,
    CS_MAC_VT100,
    CS_MAC_VT100_OLD,
    CS_MAC_PIRARD,

    CS_VISCII,
    CS_HP_ROMAN8,
    CS_DEC_MCS,

    /* UTF-n encodings */
    CS_UTF8,
    CS_UTF7,
    CS_UTF7_CONSERVATIVE,
    CS_UTF16,
    CS_UTF16BE,
    CS_UTF16LE,

    CS_EUC_JP,
    CS_EUC_CN,
    CS_EUC_KR,
    CS_ISO2022_JP,
    CS_ISO2022_KR,
    CS_BIG5,
    CS_SHIFT_JIS,
    CS_HZ,
    CS_CP949,
    CS_PDF,
    CS_PSSTD,
    CS_CTEXT,
    CS_ISO2022,
    CS_BS4730,
    CS_DEC_GRAPHICS,
    CS_EUC_TW,

    /* dummy value indicating extent of enum */
    CS_LIMIT
} charset_t;
100 
/*
 * State variable for the conversion routines declared in this
 * header, which take a pointer to one of these and update it.
 *
 * NOTE(review): the meaning of the two words is not visible in
 * this header; presumably it is private to the individual charset
 * converters - confirm before relying on their contents.
 */
typedef struct {
    unsigned long s0;
    unsigned long s1;
} charset_state;

/*
 * Initialiser for a charset_state, for use in a declaration:
 *
 *   charset_state mystate = CHARSET_INIT_STATE;
 */
#define CHARSET_INIT_STATE { 0L, 0L }

/*
 * External variable containing the same data as the macro above.
 * Because it is an object rather than a brace initialiser, it can
 * be used for structure-copy assignment to an existing state:
 *
 *   mystate = charset_init_state;
 */
extern const charset_state charset_init_state;
119 
120 /*
121  * Routine to convert a MB/SB character set to Unicode.
122  *
123  * This routine accepts some number of bytes, updates a state
124  * variable, and outputs some number of Unicode characters. There
125  * are no guarantees. You can't even guarantee that at most one
126  * Unicode character will be output per byte you feed in; for
127  * example, suppose you're reading UTF-8, you've seen E1 80, and
128  * then you suddenly see FE. Now you need to output _two_ error
129  * characters - one for the incomplete sequence E1 80, and one for
130  * the completely invalid UTF-8 byte FE.
131  *
132  * Returns the number of wide characters output; will never output
133  * more than the size of the buffer (as specified on input).
134  * Advances the `input' pointer and decrements `inlen', to indicate
135  * how far along the input string it got.
136  *
137  * The sequence of `errlen' wide characters pointed to by `errstr'
138  * will be used to indicate a conversion error. If `errstr' is
139  * NULL, `errlen' will be ignored, and the library will choose
140  * something sensible to do on its own. For Unicode, this will be
141  * U+FFFD (REPLACEMENT CHARACTER).
142  *
143  * `output' may be NULL, in which case the entire translation will
144  * be performed in theory (e.g. a dry run to work out how much
145  * space needs to be allocated for the real thing). `outlen' may
146  * also be negative, indicating an unlimited buffer length
147  * (although this is almost certainly unwise if `output' is _not_
148  * NULL).
149  */
150 
151 int charset_to_unicode(const char **input, int *inlen,
152 		       wchar_t *output, int outlen,
153 		       int charset, charset_state *state,
154 		       const wchar_t *errstr, int errlen);
155 
156 /*
157  * Routine to convert Unicode to an MB/SB character set.
158  *
159  * This routine accepts some number of Unicode characters, updates
160  * a state variable, and outputs some number of bytes.
161  *
162  * Returns the number of bytes output; will never output more than
163  * the size of the buffer (as specified on input), and will never
164  * output a partial MB character. Advances the `input' pointer and
165  * decrements `inlen', to indicate how far along the input string
166  * it got.
167  *
168  * If `error' is non-NULL and a character is found which cannot be
169  * expressed in the output charset, conversion will terminate at
170  * that character (so `input' points to the offending character)
171  * and `*error' will be set to TRUE; if `error' is non-NULL and no
172  * difficult characters are encountered, `*error' will be set to
173  * FALSE. If `error' is NULL, difficult characters will simply be
174  * ignored.
175  *
176  * If `input' is NULL, this routine will output the necessary bytes
177  * to reset the encoding state in any way which might be required
178  * at the end of an output piece of text.
179  *
180  * `output' may be NULL, in which case the entire translation will
181  * be performed in theory (e.g. a dry run to work out how much
182  * space needs to be allocated for the real thing). `outlen' may
183  * also be negative, indicating an unlimited buffer length
184  * (although this is almost certainly unwise if `output' is _not_
185  * NULL).
186  */
187 
188 int charset_from_unicode(const wchar_t **input, int *inlen,
189 			 char *output, int outlen,
190 			 int charset, charset_state *state, int *error);
191 
/*
 * Translation between our charset identifiers and X11 encoding
 * names: charset_to_xenc() maps an identifier to a name, and
 * charset_from_xenc() maps a name back to an identifier.
 *
 * NOTE(review): the result for an identifier or name with no
 * counterpart is not stated in this header (presumably NULL and
 * CS_NONE respectively) - confirm against the implementation.
 */
const char *charset_to_xenc(int charset);
int charset_from_xenc(const char *name);
197 
/*
 * Translation between our charset identifiers and MIME encoding
 * names: charset_to_mimeenc() maps an identifier to a name, and
 * charset_from_mimeenc() maps a name back to an identifier.
 *
 * NOTE(review): the result for an identifier or name with no
 * counterpart is not stated in this header (presumably NULL and
 * CS_NONE respectively) - confirm against the implementation.
 */
const char *charset_to_mimeenc(int charset);
int charset_from_mimeenc(const char *name);
203 
/*
 * Translation between our charset identifiers and our own encoding
 * names: charset_to_localenc() maps an identifier to a name, and
 * charset_from_localenc() maps a name back to an identifier.
 *
 * NOTE(review): charset_localenc_nth() is not described in this
 * header. Presumably it returns the charset identifier of the n-th
 * encoding that has a name of our own, so that callers can
 * enumerate them - confirm against the implementation, including
 * what it returns once `n' runs off the end.
 */
const char *charset_to_localenc(int charset);
int charset_from_localenc(const char *name);
int charset_localenc_nth(int n);
211 
/*
 * Map a Mac OS script/region/font combination to one of our
 * charset identifiers.
 *
 * NOTE(review): beyond their names, the meaning of the parameters
 * (`script', `region', `sysvers' and `fontname') is not described
 * in this header - consult the implementation for the expected
 * values and for whether `fontname' may be NULL.
 */
int charset_from_macenc(int script,
                        int region,
                        int sysvers,
                        const char *fontname);
217 
/*
 * Translation between our charset identifiers and GNU Emacs coding
 * system symbols: charset_to_emacsenc() maps an identifier to a
 * symbol name, and charset_from_emacsenc() maps a symbol name back
 * to an identifier.
 *
 * NOTE(review): the result for an identifier or name with no
 * counterpart is not stated in this header (presumably NULL and
 * CS_NONE respectively) - confirm against the implementation.
 */
const char *charset_to_emacsenc(int charset);
int charset_from_emacsenc(const char *name);
224 
/*
 * Replace a charset identifier with that of a superset charset
 * which is often confused with it.
 *
 * The motivating example is mail: people whose MUAs report their
 * mail as ASCII or ISO8859-1 often turn out in practice to be
 * using CP1252 quote characters. When parsing incoming mail it is
 * therefore prudent to treat ASCII and ISO8859-1 as aliases for
 * CP1252. Since CP1252 is a superset of both, doing so will cause
 * no genuinely correct mail to be parsed wrongly.
 */
int charset_upgrade(int charset);
235 
/*
 * Report whether the input charset is a vaguely sensible superset
 * of ASCII.
 *
 * Returns TRUE if it is, and FALSE otherwise; in particular, FALSE
 * is returned for 7-bit encoding formats such as HZ and UTF-7.
 */
int charset_contains_ascii(int charset);
242 
/*
 * Report whether the input charset is single-byte: returns TRUE if
 * it is.
 */
int charset_is_single_byte(int charset);
247 
/*
 * Try to deduce the CS_* identifier of the charset used in the
 * current C locale.
 *
 * If the charset cannot be figured out at all, the function falls
 * back to CS_ASCII, so the value returned is always a valid
 * charset.
 *
 * (For this function to be guaranteed to do the right thing, you
 * should already have called setlocale(LC_CTYPE, "").)
 */
int charset_from_locale(void);
258 
/*
 * Try to infer a charset identifier from a prefix of an HTML file,
 * by looking for tags of the form <meta charset='foo'> or
 * <meta http-equiv='content-type' content='text/html; charset=foo'>.
 *
 * Parameters:
 *
 *  - `data' and `len' give the prefix of the HTML file to examine.
 *
 *  - `namepos' and `namelen' are outputs. When a charset is
 *    identified, they receive the starting position and length of
 *    the substring of the input that identifies that charset.
 *    (This permits a caller to translate an HTML document into a
 *    different charset and also know how to rewrite the <meta> tag
 *    so that it doesn't still claim the old charset.)
 *
 * Returns CS_NONE if no identifiable charset was found; otherwise
 * returns the charset identifier that was decided on.
 */
int charset_from_html_prefix(const char *data,
                             size_t len,
                             size_t *namepos,
                             size_t *namelen);
274 
/*
 * Report whether a charset identifier corresponds to an actually
 * usable charset.
 *
 * Not every value in the charset_t enumeration does: CS_NONE is
 * one that does not, and occasionally other slots in the enum are
 * reserved before they actually go into service.
 *
 * With this function a client can iterate over _all_ supported
 * charsets, by means of a loop such as
 *
 *     for (cs = 0; cs < CS_LIMIT; cs++)
 *         if (charset_exists(cs))
 *             do_stuff_with(cs);
 */
int charset_exists(int charset);
289 
290 #endif /* charset_charset_h */
291