/*
 * charset.h - header file for general character set conversion
 * routines.
 *
 * Declares the list of supported character sets, the opaque
 * conversion state, the two conversion entry points (to and from
 * Unicode), and the helpers that map charset identifiers to and
 * from X11, MIME, local and Mac OS encoding names.
 */

#ifndef charset_charset_h
#define charset_charset_h

#include <stddef.h>   /* wchar_t */

/*
 * Enumeration that lists all the multibyte or single-byte
 * character sets known to this library.
 *
 * The enumerators are implicitly numbered from zero in the order
 * given, so inserting or reordering entries changes the numeric
 * value of every identifier that follows.
 */
typedef enum {
    CS_NONE,                /* used for reporting errors, etc */
    CS_ISO8859_1,
    CS_ISO8859_1_X11,       /* X font encoding with VT100 glyphs */
    CS_ISO8859_2,
    CS_ISO8859_3,
    CS_ISO8859_4,
    CS_ISO8859_5,
    CS_ISO8859_6,
    CS_ISO8859_7,
    CS_ISO8859_8,
    CS_ISO8859_9,
    CS_ISO8859_10,
    CS_ISO8859_11,
    CS_ISO8859_13,
    CS_ISO8859_14,
    CS_ISO8859_15,
    CS_ISO8859_16,
    CS_CP437,
    CS_CP850,
    CS_CP852,
    CS_CP866,
    CS_CP1250,
    CS_CP1251,
    CS_CP1252,
    CS_CP1253,
    CS_CP1254,
    CS_CP1255,
    CS_CP1256,
    CS_CP1257,
    CS_CP1258,
    CS_KOI8_R,
    CS_KOI8_U,
    CS_MAC_ROMAN,
    CS_MAC_TURKISH,
    CS_MAC_CROATIAN,
    CS_MAC_ICELAND,
    CS_MAC_ROMANIAN,
    CS_MAC_GREEK,
    CS_MAC_CYRILLIC,
    CS_MAC_THAI,
    CS_MAC_CENTEURO,
    CS_MAC_SYMBOL,
    CS_MAC_DINGBATS,
    CS_MAC_ROMAN_OLD,
    CS_MAC_CROATIAN_OLD,
    CS_MAC_ICELAND_OLD,
    CS_MAC_ROMANIAN_OLD,
    CS_MAC_GREEK_OLD,
    CS_MAC_CYRILLIC_OLD,
    CS_MAC_UKRAINE,
    CS_MAC_VT100,
    CS_MAC_VT100_OLD,
    CS_VISCII,
    CS_HP_ROMAN8,
    CS_DEC_MCS,
    CS_UTF8
} charset_t;

/*
 * Conversion state that the conversion routines below update from
 * one call to the next (for example, to remember a partially read
 * multibyte sequence).
 *
 * NOTE(review): the meaning of `s0' and the required initial value
 * are not specified in this header; presumably a zeroed structure
 * is the initial state - confirm against the implementation.
 */
typedef struct {
    unsigned long s0;
} charset_state;

/*
 * Routine to convert a MB/SB character set to Unicode.
 *
 * This routine accepts some number of bytes, updates a state
 * variable, and outputs some number of Unicode characters. There
 * are no guarantees. You can't even guarantee that at most one
 * Unicode character will be output per byte you feed in; for
 * example, suppose you're reading UTF-8, you've seen E1 80, and
 * then you suddenly see FE. Now you need to output _two_ error
 * characters - one for the incomplete sequence E1 80, and one for
 * the completely invalid UTF-8 byte FE.
 *
 * Parameters:
 *   input, inlen   - pointer to, and length in bytes of, the data
 *                    to convert. `*input' is advanced and `*inlen'
 *                    decremented to indicate how far along the
 *                    input string the routine got.
 *   output, outlen - buffer for the converted text and its
 *                    capacity in wide characters.
 *   charset        - the character set of the input. Declared as
 *                    `int'; presumably takes a charset_t value.
 *   state          - conversion state, updated by the call.
 *   errstr, errlen - sequence of `errlen' wide characters used to
 *                    indicate a conversion error. If `errstr' is
 *                    NULL, `errlen' will be ignored, and the
 *                    library will choose something sensible to do
 *                    on its own. For Unicode, this will be U+FFFD
 *                    (REPLACEMENT CHARACTER).
 *
 * Returns the number of wide characters output; will never output
 * more than the size of the buffer (as specified on input).
 */
int charset_to_unicode(const char **input, int *inlen,
                       wchar_t *output, int outlen,
                       int charset, charset_state *state,
                       const wchar_t *errstr, int errlen);

/*
 * Routine to convert Unicode to an MB/SB character set.
 *
 * This routine accepts some number of Unicode characters, updates
 * a state variable, and outputs some number of bytes.
 *
 * Parameters:
 *   input, inlen   - pointer to, and length in wide characters of,
 *                    the text to convert. `*input' is advanced and
 *                    `*inlen' decremented to indicate how far
 *                    along the input string the routine got.
 *   output, outlen - buffer for the converted bytes and its
 *                    capacity in bytes.
 *   charset        - the character set to produce. Declared as
 *                    `int'; presumably takes a charset_t value.
 *   state          - conversion state, updated by the call.
 *   errstr, errlen - sequence of `errlen' characters used to
 *                    indicate a conversion error. If `errstr' is
 *                    NULL, `errlen' will be ignored, and the
 *                    library will choose something sensible to do
 *                    on its own (which will vary depending on the
 *                    output charset).
 *
 * Returns the number of bytes output; will never output more than
 * the size of the buffer (as specified on input), and will never
 * output a partial MB character.
 */
int charset_from_unicode(const wchar_t **input, int *inlen,
                         char *output, int outlen,
                         int charset, charset_state *state,
                         const char *errstr, int errlen);

/*
 * Convert X11 encoding names to and from our charset identifiers.
 *
 * NOTE(review): the result for an unknown charset or name is not
 * stated in this header (presumably NULL / CS_NONE respectively) -
 * confirm against the implementation.
 */
const char *charset_to_xenc(int charset);
int charset_from_xenc(const char *name);

/*
 * Convert MIME encoding names to and from our charset identifiers.
 *
 * NOTE(review): the result for an unknown charset or name is not
 * stated in this header (presumably NULL / CS_NONE respectively) -
 * confirm against the implementation.
 */
const char *charset_to_mimeenc(int charset);
int charset_from_mimeenc(const char *name);

/*
 * Convert our own encoding names to and from our charset
 * identifiers.
 *
 * NOTE(review): charset_localenc_nth() is undocumented here;
 * presumably it returns the n-th charset identifier that has a
 * local name, so callers can enumerate them - confirm the
 * indexing base and the end-of-list value against the
 * implementation.
 */
const char *charset_to_localenc(int charset);
int charset_from_localenc(const char *name);
int charset_localenc_nth(int n);

/*
 * Convert Mac OS script/region/font to our charset identifiers.
 *
 * NOTE(review): the units and valid ranges of `script', `region'
 * and `sysvers', and whether `fontname' may be NULL, are not
 * specified in this header - confirm against the implementation.
 */
int charset_from_macenc(int script, int region, int sysvers,
                        const char *fontname);

#endif /* charset_charset_h */