/*
 * charset.h - header file for general character set conversion
 * routines.
 */

#ifndef charset_charset_h
#define charset_charset_h

#include <stddef.h>   /* wchar_t, size_t */

/*
 * Give every declaration below C linkage when this header is pulled
 * into a C++ translation unit. This is a no-op for C compilers.
 */
#ifdef __cplusplus
extern "C" {
#endif

/*
 * Enumeration that lists all the multibyte or single-byte
 * character sets known to this library.
 *
 * The enumerators are numbered consecutively from zero, so CS_LIMIT
 * is one greater than the largest identifier. New entries must not
 * be inserted in a way that the rest of the library does not expect.
 */
typedef enum {
    CS_NONE,                   /* used for reporting errors, etc */
    CS_ASCII,                  /* ordinary US-ASCII is worth having! */
    CS_ISO8859_1,
    CS_ISO8859_1_X11,          /* X font encoding with VT100 glyphs */
    CS_ISO8859_2,
    CS_ISO8859_3,
    CS_ISO8859_4,
    CS_ISO8859_5,
    CS_ISO8859_6,
    CS_ISO8859_7,
    CS_ISO8859_8,
    CS_ISO8859_9,
    CS_ISO8859_10,
    CS_ISO8859_11,
    CS_ISO8859_13,
    CS_ISO8859_14,
    CS_ISO8859_15,
    CS_ISO8859_16,
    CS_CP437,
    CS_CP850,
    CS_CP852,
    CS_CP866,
    CS_CP874,
    CS_CP1250,
    CS_CP1251,
    CS_CP1252,
    CS_CP1253,
    CS_CP1254,
    CS_CP1255,
    CS_CP1256,
    CS_CP1257,
    CS_CP1258,
    CS_KOI8_R,
    CS_KOI8_U,
    CS_KOI8_RU,
    CS_JISX0201,
    CS_MAC_ROMAN,
    CS_MAC_TURKISH,
    CS_MAC_CROATIAN,
    CS_MAC_ICELAND,
    CS_MAC_ROMANIAN,
    CS_MAC_GREEK,
    CS_MAC_CYRILLIC,
    CS_MAC_THAI,
    CS_MAC_CENTEURO,
    CS_MAC_SYMBOL,
    CS_MAC_DINGBATS,
    CS_MAC_ROMAN_OLD,
    CS_MAC_CROATIAN_OLD,
    CS_MAC_ICELAND_OLD,
    CS_MAC_ROMANIAN_OLD,
    CS_MAC_GREEK_OLD,
    CS_MAC_CYRILLIC_OLD,
    CS_MAC_UKRAINE,
    CS_MAC_VT100,
    CS_MAC_VT100_OLD,
    CS_MAC_PIRARD,
    CS_VISCII,
    CS_HP_ROMAN8,
    CS_DEC_MCS,
    CS_UTF8,
    CS_UTF7,
    CS_UTF7_CONSERVATIVE,
    CS_UTF16,
    CS_UTF16BE,
    CS_UTF16LE,
    CS_EUC_JP,
    CS_EUC_CN,
    CS_EUC_KR,
    CS_ISO2022_JP,
    CS_ISO2022_KR,
    CS_BIG5,
    CS_SHIFT_JIS,
    CS_HZ,
    CS_CP949,
    CS_PDF,
    CS_PSSTD,
    CS_CTEXT,
    CS_ISO2022,
    CS_BS4730,
    CS_DEC_GRAPHICS,
    CS_EUC_TW,
    CS_LIMIT                   /* dummy value indicating extent of enum */
} charset_t;

/*
 * Conversion state passed (by pointer) to charset_to_unicode() and
 * charset_from_unicode() and updated by them. Initialise it with
 * CHARSET_INIT_STATE or charset_init_state before first use; the
 * meaning of the two fields is not specified by this header.
 */
typedef struct {
    unsigned long s0, s1;
} charset_state;

/*
 * This macro is used to initialise a charset_state structure:
 *
 *   charset_state mystate = CHARSET_INIT_STATE;
 */
#define CHARSET_INIT_STATE { 0L, 0L }  /* a suitable initialiser */

/*
 * This external variable contains the same data, but is provided
 * for easy structure-copy assignment:
 *
 *   mystate = charset_init_state;
 */
extern const charset_state charset_init_state;

/*
 * Routine to convert a MB/SB character set to Unicode.
 *
 * This routine accepts some number of bytes, updates a state
 * variable, and outputs some number of Unicode characters. There
 * are no guarantees. You can't even guarantee that at most one
 * Unicode character will be output per byte you feed in; for
 * example, suppose you're reading UTF-8, you've seen E1 80, and
 * then you suddenly see FE. Now you need to output _two_ error
 * characters - one for the incomplete sequence E1 80, and one for
 * the completely invalid UTF-8 byte FE.
 *
 * Returns the number of wide characters output; will never output
 * more than the size of the buffer (as specified on input).
 * Advances the `input' pointer and decrements `inlen', to indicate
 * how far along the input string it got.
 *
 * The sequence of `errlen' wide characters pointed to by `errstr'
 * will be used to indicate a conversion error. If `errstr' is
 * NULL, `errlen' will be ignored, and the library will choose
 * something sensible to do on its own. For Unicode, this will be
 * U+FFFD (REPLACEMENT CHARACTER).
 *
 * `output' may be NULL, in which case the entire translation will
 * be performed in theory (e.g. a dry run to work out how much
 * space needs to be allocated for the real thing). `outlen' may
 * also be negative, indicating an unlimited buffer length
 * (although this is almost certainly unwise if `output' is _not_
 * NULL).
 */

int charset_to_unicode(const char **input, int *inlen,
                       wchar_t *output, int outlen,
                       int charset, charset_state *state,
                       const wchar_t *errstr, int errlen);

/*
 * Routine to convert Unicode to an MB/SB character set.
 *
 * This routine accepts some number of Unicode characters, updates
 * a state variable, and outputs some number of bytes.
 *
 * Returns the number of bytes output; will never output more than
 * the size of the buffer (as specified on input), and will never
 * output a partial MB character. Advances the `input' pointer and
 * decrements `inlen', to indicate how far along the input string
 * it got.
 *
 * If `error' is non-NULL and a character is found which cannot be
 * expressed in the output charset, conversion will terminate at
 * that character (so `input' points to the offending character)
 * and `*error' will be set to TRUE; if `error' is non-NULL and no
 * difficult characters are encountered, `*error' will be set to
 * FALSE. If `error' is NULL, difficult characters will simply be
 * ignored.
 *
 * If `input' is NULL, this routine will output the necessary bytes
 * to reset the encoding state in any way which might be required
 * at the end of an output piece of text.
 *
 * `output' may be NULL, in which case the entire translation will
 * be performed in theory (e.g. a dry run to work out how much
 * space needs to be allocated for the real thing). `outlen' may
 * also be negative, indicating an unlimited buffer length
 * (although this is almost certainly unwise if `output' is _not_
 * NULL).
 */

int charset_from_unicode(const wchar_t **input, int *inlen,
                         char *output, int outlen,
                         int charset, charset_state *state, int *error);

/*
 * Convert X11 encoding names to and from our charset identifiers.
 */
const char *charset_to_xenc(int charset);
int charset_from_xenc(const char *name);

/*
 * Convert MIME encoding names to and from our charset identifiers.
 */
const char *charset_to_mimeenc(int charset);
int charset_from_mimeenc(const char *name);

/*
 * Convert our own encoding names to and from our charset
 * identifiers.
 *
 * NOTE(review): charset_localenc_nth() is not described here;
 * presumably it enumerates the charsets that have a local name by
 * index `n' - confirm against the implementation before relying on
 * its out-of-range behaviour.
 */
const char *charset_to_localenc(int charset);
int charset_from_localenc(const char *name);
int charset_localenc_nth(int n);

/*
 * Convert Mac OS script/region/font to our charset identifiers.
 */
int charset_from_macenc(int script, int region, int sysvers,
                        const char *fontname);

/*
 * Convert GNU Emacs coding system symbol to and from our charset
 * identifiers.
 */
const char *charset_to_emacsenc(int charset);
int charset_from_emacsenc(const char *name);

/*
 * Upgrade a charset identifier to a superset charset which is
 * often confused with it. For example, people whose MUAs report
 * their mail as ASCII or ISO8859-1 often in practice turn out to
 * be using CP1252 quote characters, so when parsing incoming mail
 * it is prudent to treat ASCII and ISO8859-1 as aliases for CP1252
 * - and since it's a superset of both, this will cause no
 * genuinely correct mail to be parsed wrongly.
 */
int charset_upgrade(int charset);

/*
 * This function returns TRUE if the input charset is a vaguely
 * sensible superset of ASCII. That is, it returns FALSE for 7-bit
 * encoding formats such as HZ and UTF-7.
 */
int charset_contains_ascii(int charset);

/*
 * This function returns TRUE if the input charset is single-byte.
 */
int charset_is_single_byte(int charset);

/*
 * This function tries to deduce the CS_* identifier of the charset
 * used in the current C locale. It falls back to CS_ASCII if it
 * can't figure it out at all, so it will always return a valid
 * charset.
 *
 * (Note that you should have already called setlocale(LC_CTYPE,
 * "") to guarantee that this function will do the right thing.)
 */
int charset_from_locale(void);

/*
 * This function tries to infer a charset identifier from a prefix of
 * an HTML file, by looking for tags of the form <meta charset='foo'>
 * or <meta http-equiv='content-type' content='text/html; charset=foo'>.
 *
 * If it returns CS_NONE, no identifiable charset was found.
 * Otherwise, it returns the charset identifier it decided on, and
 * also returns in namepos and namelen the starting position and
 * length of the substring of the input that identifies that charset.
 * (This permits a caller to translate an HTML document into a
 * different charset and also know how to rewrite the <meta> tag so
 * that it doesn't still claim the old charset.)
 */
int charset_from_html_prefix(const char *data, size_t len,
                             size_t *namepos, size_t *namelen);

/*
 * This function simply reports whether a charset identifier
 * corresponds to an actually usable charset. Not everything in the
 * above enum does: CS_NONE, for a start, and occasionally other slots
 * in the enum are reserved before they actually go into service.
 *
 * This function permits clients to iterate over _all_ supported
 * charsets by means of a loop such as
 *
 *   for (cs = 0; cs < CS_LIMIT; cs++)
 *       if (charset_exists(cs))
 *           do_stuff_with(cs);
 */
int charset_exists(int charset);

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* charset_charset_h */