1 /** \file crtxtenc.h 2 \brief character encoding utils 3 4 CoolReader Engine 5 6 (c) Vadim Lopatin, 2000-2006 7 This source code is distributed under the terms of 8 GNU General Public License. 9 See LICENSE file for details. 10 11 */ 12 13 #ifndef __CRTXTENC_H_INCLUDED__ 14 #define __CRTXTENC_H_INCLUDED__ 15 16 #include "lvtypes.h" 17 #include <stdio.h> 18 #include "lvstring.h" 19 20 #ifndef GBK_ENCODING_SUPPORT 21 #define GBK_ENCODING_SUPPORT 1 22 #endif 23 #ifndef JIS_ENCODING_SUPPORT 24 #define JIS_ENCODING_SUPPORT 1 25 #endif 26 #ifndef BIG5_ENCODING_SUPPORT 27 #define BIG5_ENCODING_SUPPORT 1 28 #endif 29 #ifndef EUC_KR_ENCODING_SUPPORT 30 #define EUC_KR_ENCODING_SUPPORT 1 31 #endif 32 33 enum char_encoding_type { 34 ce_unknown = 0, 35 ce_utf8 = 1, 36 ce_utf16_be = 2, 37 ce_utf16_le = 3, 38 ce_utf32_be = 4, 39 ce_utf32_le = 5, 40 ce_8bit_cp = 6 41 #if GBK_ENCODING_SUPPORT == 1 42 ,ce_gbk = 7 43 #endif 44 #if JIS_ENCODING_SUPPORT == 1 45 ,ce_euc_jis = 8 46 ,ce_shift_jis = 9 47 #endif 48 #if JIS_ENCODING_SUPPORT == 1 49 ,ce_big5 = 10 50 #endif 51 #if EUC_KR_ENCODING_SUPPORT == 1 52 ,ce_euc_kr = 11 53 #endif 54 }; 55 56 #define CRENC_ID_UNKNOWN ce_unknown 57 #define CRENC_ID_UTF8 ce_utf8 58 #define CRENC_ID_UTF16_LE ce_utf16_le 59 #define CRENC_ID_UTF16_BE ce_utf16_be 60 #define CRENC_ID_UTF32_LE ce_utf32_le 61 #define CRENC_ID_UTF32_BE ce_utf32_be 62 #define CRENC_ID_8BIT_START ce_8bit_cp 63 64 int CREncodingNameToId( const lChar32 * name ); 65 const char * CREncodingIdToName( int id ); 66 67 /** 68 \brief Searches for 8-bit encoding to unicode conversion table by encoding name. 69 70 Conversion table is table of 128 unicode characters corresponding to 8-bit 71 encoding characters 128..255. enc_table[0] is unicode value for character 72 128 in 8-bit encoding. 73 74 \param encoding_name is name of encoding, i.e. "utf-8", "windows-1251" 75 76 \return pointer to conversion table if found, NULL otherwise 77 */ 78 const lChar32 * GetCharsetByte2UnicodeTable( const lChar32 * encoding_name ); 79 const lChar32 * GetCharsetByte2UnicodeTableById( int id ); 80 const lChar8 ** GetCharsetUnicode2ByteTable( const lChar32 * encoding_name ); 81 /// get conversion table for upper 128 characters of codepage, by codepage number 82 const lChar32 * GetCharsetByte2UnicodeTable( int codepage ); 83 /// returns "cp1251" for 1251, etc. for supported codepages 84 const lChar32 * GetCharsetName( int codepage ); 85 /// convert language id to codepage number (MS) 86 int langToCodepage( int lang ); 87 const char* langToLanguage( int lang ); 88 89 /** 90 \brief Autodetects encoding of text data in buffer. 91 92 \param buf is buffer with text data to autodetect 93 \param buf_size is size of data in buffer, bytes 94 \param cp_name is buffer to store autodetected name of encoding, i.e. "utf-8", "windows-1251" 95 \param lang_name is buffer to store autodetected name of language, i.e. "en", "ru" 96 97 \return non-zero on success 98 */ 99 int AutodetectCodePage(const unsigned char * buf, int buf_size, char * cp_name, char * lang_name, bool skipHtml); 100 /** 101 \brief Autodetects encoding of text data in buffer, only using ByteOrderMark or Utf-8 validity detection. 102 103 \param buf is buffer with text data to autodetect 104 \param buf_size is size of data in buffer, bytes 105 \param cp_name is buffer to store autodetected name of encoding, i.e. "utf-8", "windows-1251" 106 \param lang_name is buffer to store autodetected name of language, i.e. "en", "ru" 107 \param skipHtml if true, skip HTML/XML tags 108 109 \return non-zero on success 110 */ 111 int AutodetectCodePageUtf( const unsigned char * buf, int buf_size, char * cp_name, char * lang_name ); 112 113 bool hasXmlTags(const lUInt8 * buf, int size); 114 115 /** 116 \brief checks whether data buffer is valid utf-8 stream 117 118 \param buf is buffer with text data to autodetect 119 \param buf_size is size of data in buffer, bytes 120 121 \return true if buffer has valid utf-8 data 122 */ 123 bool isValidUtf8Data( const unsigned char * buf, int buf_size ); 124 125 void MakeStatsForFile( const char * fname, const char * cp_name, const char * lang_name, int index, FILE * f, lString8 & list ); 126 127 128 #endif 129