1 /* -*- c-basic-offset:2; tab-width:2; indent-tabs-mode:nil -*- */ 2 3 #ifndef __VT_CHAR_ENCODING_H__ 4 #define __VT_CHAR_ENCODING_H__ 5 6 #include <pobl/bl_types.h> /* u_char */ 7 #include <mef/ef_parser.h> 8 #include <mef/ef_conv.h> 9 10 /* 11 * Supported encodings are those which are not conflicted with US_ASCII. 12 * So , UCS-2, UCS-4 etc encodings are not supported. 13 */ 14 typedef enum vt_char_encoding { 15 VT_UNKNOWN_ENCODING = -1, 16 17 VT_ISO8859_1 = 0, 18 VT_ISO8859_2, 19 VT_ISO8859_3, 20 VT_ISO8859_4, 21 VT_ISO8859_5, 22 VT_ISO8859_6, 23 VT_ISO8859_7, 24 VT_ISO8859_8, 25 VT_ISO8859_9, 26 VT_ISO8859_10, 27 VT_TIS620, 28 VT_ISO8859_13, 29 VT_ISO8859_14, 30 VT_ISO8859_15, 31 VT_ISO8859_16, 32 VT_TCVN5712, 33 34 VT_ISCII_ASSAMESE, 35 VT_ISCII_BENGALI, 36 VT_ISCII_GUJARATI, 37 VT_ISCII_HINDI, 38 VT_ISCII_KANNADA, 39 VT_ISCII_MALAYALAM, 40 VT_ISCII_ORIYA, 41 VT_ISCII_PUNJABI, 42 VT_ISCII_TELUGU, 43 VT_VISCII, 44 VT_KOI8_R, 45 VT_KOI8_U, 46 VT_KOI8_T, 47 VT_GEORGIAN_PS, 48 VT_CP1250, 49 VT_CP1251, 50 VT_CP1252, 51 VT_CP1253, 52 VT_CP1254, 53 VT_CP1255, 54 VT_CP1256, 55 VT_CP1257, 56 VT_CP1258, 57 VT_CP874, 58 59 VT_UTF8, 60 61 VT_EUCJP, 62 VT_EUCJISX0213, 63 VT_ISO2022JP, 64 VT_ISO2022JP2, 65 VT_ISO2022JP3, 66 VT_SJIS, 67 VT_SJISX0213, 68 69 VT_EUCKR, 70 VT_UHC, 71 VT_JOHAB, 72 VT_ISO2022KR, 73 74 VT_BIG5, 75 VT_EUCTW, 76 77 VT_BIG5HKSCS, 78 79 VT_EUCCN, 80 VT_GBK, 81 VT_GB18030, 82 VT_HZ, 83 84 VT_ISO2022CN, 85 86 MAX_CHAR_ENCODINGS 87 88 } vt_char_encoding_t; 89 90 /* VT_ISO8859_1 <= (encoding) is always true if encoding is u_int16_t (vt_parser.h) */ 91 #define IS_ISO8859_VARIANT(encoding) (VT_ISO8859_1 <= (encoding) && (encoding) <= VT_TCVN5712) 92 93 #define IS_8BIT_ENCODING(encoding) (VT_ISO8859_1 <= (encoding) && (encoding) <= VT_CP874) 94 95 #define IS_ENCODING_BASED_ON_ISO2022(encoding) \ 96 (IS_ISO8859_VARIANT(encoding) || (VT_EUCJP <= (encoding) && (encoding) <= VT_ISO2022JP3) || \ 97 VT_EUCKR == (encoding) || VT_ISO2022KR == (encoding) || VT_EUCTW == (encoding) || \ 98 VT_ISO2022CN == (encoding) || VT_EUCCN == (encoding)) 99 100 /* ISO2022KR is subset and EUC-TW is not subset */ 101 #define IS_UCS_SUBSET_ENCODING(encoding) \ 102 ((encoding) != VT_ISO2022JP && (encoding) != VT_ISO2022JP2 && (encoding) != VT_ISO2022JP3 && \ 103 (encoding) != VT_ISO2022CN && (encoding) != VT_EUCTW) 104 105 /* 0x0 - 0x7f is not necessarily US-ASCII */ 106 #define IS_STATEFUL_ENCODING(encoding) \ 107 ((encoding) == VT_ISO2022JP || (encoding) == VT_ISO2022JP2 || (encoding) == VT_ISO2022JP3 || \ 108 (encoding) == VT_ISO2022KR || (encoding) == VT_ISO2022CN || (encoding) == VT_HZ) 109 110 #define IS_ISCII_ENCODING(encoding) \ 111 (VT_ISCII_ASSAMESE <= (encoding) && (encoding) <= VT_ISCII_TELUGU) 112 113 char *vt_get_char_encoding_name(vt_char_encoding_t encoding); 114 115 vt_char_encoding_t vt_get_char_encoding(const char *name); 116 117 ef_parser_t *vt_char_encoding_parser_new(vt_char_encoding_t encoding); 118 119 ef_conv_t *vt_char_encoding_conv_new(vt_char_encoding_t encoding); 120 121 int vt_is_msb_set(ef_charset_t cs); 122 123 size_t vt_char_encoding_convert(u_char *dst, size_t dst_len, vt_char_encoding_t dst_encoding, 124 u_char *src, size_t src_len, vt_char_encoding_t src_encoding); 125 126 size_t vt_char_encoding_convert_with_parser(u_char *dst, size_t dst_len, 127 vt_char_encoding_t dst_encoding, ef_parser_t *parser); 128 129 int vt_parse_unicode_area(const char *str, u_int *min, u_int *max); 130 131 u_char vt_convert_ucs_to_decsp(u_int16_t ucs); 132 133 u_int16_t vt_convert_decsp_to_ucs(u_char decsp); 134 135 u_int16_t vt_convert_dectech_to_ucs(u_char decsp); 136 137 void vt_char_encoding_conv_set_use_loose_rule(ef_conv_t *conv, vt_char_encoding_t encoding, 138 int flag); 139 140 #endif 141