/*
 * charset.h - header file for general character set conversion
 * routines.
 */

#ifndef charset_charset_h
#define charset_charset_h

#include <stddef.h>                    /* wchar_t */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Enumeration that lists all the multibyte or single-byte
 * character sets known to this library.
 *
 * The functions below take and return these identifiers as plain
 * `int'. The enumerators carry no explicit values, so their order
 * here determines their numeric values; appending is safe,
 * reordering or inserting changes existing values.
 */
typedef enum {
    CS_NONE,                           /* used for reporting errors, etc */
    CS_ISO8859_1,
    CS_ISO8859_1_X11,                  /* X font encoding with VT100 glyphs */
    CS_ISO8859_2,
    CS_ISO8859_3,
    CS_ISO8859_4,
    CS_ISO8859_5,
    CS_ISO8859_6,
    CS_ISO8859_7,
    CS_ISO8859_8,
    CS_ISO8859_9,
    CS_ISO8859_10,
    CS_ISO8859_11,
    CS_ISO8859_13,
    CS_ISO8859_14,
    CS_ISO8859_15,
    CS_ISO8859_16,
    CS_CP437,
    CS_CP850,
    CS_CP866,
    CS_CP1250,
    CS_CP1251,
    CS_CP1252,
    CS_CP1253,
    CS_CP1254,
    CS_CP1255,
    CS_CP1256,
    CS_CP1257,
    CS_CP1258,
    CS_KOI8_R,
    CS_KOI8_U,
    CS_MAC_ROMAN,
    CS_MAC_TURKISH,
    CS_MAC_CROATIAN,
    CS_MAC_ICELAND,
    CS_MAC_ROMANIAN,
    CS_MAC_GREEK,
    CS_MAC_CYRILLIC,
    CS_MAC_THAI,
    CS_MAC_CENTEURO,
    CS_MAC_SYMBOL,
    CS_MAC_DINGBATS,
    CS_MAC_ROMAN_OLD,
    CS_MAC_CROATIAN_OLD,
    CS_MAC_ICELAND_OLD,
    CS_MAC_ROMANIAN_OLD,
    CS_MAC_GREEK_OLD,
    CS_MAC_CYRILLIC_OLD,
    CS_MAC_UKRAINE,
    CS_MAC_VT100,
    CS_MAC_VT100_OLD,
    CS_VISCII,
    CS_HP_ROMAN8,
    CS_DEC_MCS,
    CS_UTF8
} charset_t;

/*
 * Conversion state carried between successive calls to the
 * conversion routines below, which update it as they go.
 *
 * NOTE(review): the required initial value of `s0' is not stated
 * in this header (presumably zero) - confirm against the
 * implementation.
 */
typedef struct {
    unsigned long s0;
} charset_state;

/*
 * Routine to convert a MB/SB character set to Unicode.
 *
 * This routine accepts some number of bytes, updates a state
 * variable, and outputs some number of Unicode characters. There
 * are no guarantees. You can't even guarantee that at most one
 * Unicode character will be output per byte you feed in; for
 * example, suppose you're reading UTF-8, you've seen E1 80, and
 * then you suddenly see FE. Now you need to output _two_ error
 * characters - one for the incomplete sequence E1 80, and one for
 * the completely invalid UTF-8 byte FE.
 *
 * Returns the number of wide characters output; will never output
 * more than the size of the buffer (as specified on input).
 * Advances the `input' pointer and decrements `inlen', to indicate
 * how far along the input string it got.
 *
 * The sequence of `errlen' wide characters pointed to by `errstr'
 * will be used to indicate a conversion error. If `errstr' is
 * NULL, `errlen' will be ignored, and the library will choose
 * something sensible to do on its own. For Unicode, this will be
 * U+FFFD (REPLACEMENT CHARACTER).
 *
 * Parameters:
 *   input, inlen    in/out: position in, and remaining length of,
 *                   the source bytes.
 *   output, outlen  destination buffer and its size in wide
 *                   characters.
 *   charset         charset identifier of the source bytes.
 *   state           conversion state, updated by the call.
 *   errstr, errlen  optional replacement sequence for errors.
 */

int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
                       int charset, charset_state *state,
                       const wchar_t *errstr, int errlen);

/*
 * Routine to convert Unicode to an MB/SB character set.
 *
 * This routine accepts some number of Unicode characters, updates
 * a state variable, and outputs some number of bytes.
 *
 * Returns the number of bytes output; will never output more than
 * the size of the buffer (as specified on input), and will never
 * output a partial MB character. Advances the `input' pointer and
 * decrements `inlen', to indicate how far along the input string
 * it got.
 *
 * The sequence of `errlen' characters pointed to by `errstr' will
 * be used to indicate a conversion error. If `errstr' is NULL,
 * `errlen' will be ignored, and the library will choose something
 * sensible to do on its own (which will vary depending on the
 * output charset).
 *
 * Parameters:
 *   input, inlen    in/out: position in, and remaining length of,
 *                   the source wide characters.
 *   output, outlen  destination buffer and its size in bytes.
 *   charset         charset identifier of the output encoding.
 *   state           conversion state, updated by the call.
 *   errstr, errlen  optional replacement sequence for errors.
 */

int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
                         int charset, charset_state *state,
                         const char *errstr, int errlen);

/*
 * Convert X11 encoding names to and from our charset identifiers.
 *
 * NOTE(review): the results for an unknown name or identifier are
 * not stated in this header (CS_NONE is documented above as "used
 * for reporting errors") - confirm against the implementation.
 */
const char *charset_to_xenc(int charset);
int charset_from_xenc(const char *name);

/*
 * Convert MIME encoding names to and from our charset identifiers.
 *
 * NOTE(review): failure results are not stated in this header -
 * confirm against the implementation.
 */
const char *charset_to_mimeenc(int charset);
int charset_from_mimeenc(const char *name);

/*
 * Convert our own encoding names to and from our charset
 * identifiers.
 *
 * NOTE(review): charset_localenc_nth() presumably returns the
 * identifier of the n-th charset that has a local name, for
 * enumeration purposes; its indexing base and end-of-list result
 * are not stated in this header - confirm against the
 * implementation.
 */
const char *charset_to_localenc(int charset);
int charset_from_localenc(const char *name);
int charset_localenc_nth(int n);

/*
 * Convert Mac OS script/region/font to our charset identifiers.
 *
 * NOTE(review): `sysvers' is presumably the Mac OS system version
 * (the enumeration above has separate *_OLD Mac charsets); the
 * accepted values of all four arguments are not stated in this
 * header - confirm against the implementation.
 */
int charset_from_macenc(int script, int region, int sysvers,
                        const char *fontname);

#ifdef __cplusplus
}
#endif

#endif /* charset_charset_h */