1 /** \file crtxtenc.h
2     \brief character encoding utils
3 
4     CoolReader Engine
5 
6     (c) Vadim Lopatin, 2000-2006
7     This source code is distributed under the terms of
8     GNU General Public License.
9     See LICENSE file for details.
10 
11 */
12 
13 #ifndef __CRTXTENC_H_INCLUDED__
14 #define __CRTXTENC_H_INCLUDED__
15 
16 #include "lvtypes.h"
17 #include <stdio.h>
18 #include "lvstring.h"
19 
// Build-time switches for the optional multi-byte encodings.
// Each switch falls back to 1 unless the build has already defined it.
#if !defined(GBK_ENCODING_SUPPORT)
#  define GBK_ENCODING_SUPPORT 1
#endif

#if !defined(JIS_ENCODING_SUPPORT)
#  define JIS_ENCODING_SUPPORT 1
#endif

#if !defined(BIG5_ENCODING_SUPPORT)
#  define BIG5_ENCODING_SUPPORT 1
#endif

#if !defined(EUC_KR_ENCODING_SUPPORT)
#  define EUC_KR_ENCODING_SUPPORT 1
#endif
32 
/// Character encoding kinds.
///
/// Every enumerator has an explicit value, so the numeric ids do not shift
/// when one of the optional encodings is compiled out. An optional
/// enumerator exists only when its *_ENCODING_SUPPORT switch equals 1.
enum char_encoding_type {
    ce_unknown = 0,
    ce_utf8 = 1,
    ce_utf16_be = 2,
    ce_utf16_le = 3,
    ce_utf32_be = 4,
    ce_utf32_le = 5,
    ce_8bit_cp = 6
#if GBK_ENCODING_SUPPORT == 1
    ,ce_gbk = 7
#endif
#if JIS_ENCODING_SUPPORT == 1
    ,ce_euc_jis = 8
    ,ce_shift_jis = 9
#endif
    // Big5 has its own switch; it must not depend on the JIS one.
#if BIG5_ENCODING_SUPPORT == 1
    ,ce_big5 = 10
#endif
#if EUC_KR_ENCODING_SUPPORT == 1
    ,ce_euc_kr = 11
#endif
};
55 
// CRENC_ID_* aliases for the char_encoding_type enumerators,
// listed in the same order as the enumerators themselves.
#define CRENC_ID_UNKNOWN ce_unknown
#define CRENC_ID_UTF8 ce_utf8
#define CRENC_ID_UTF16_BE ce_utf16_be
#define CRENC_ID_UTF16_LE ce_utf16_le
#define CRENC_ID_UTF32_BE ce_utf32_be
#define CRENC_ID_UTF32_LE ce_utf32_le
// NOTE(review): the name suggests ids of 8-bit code pages start here — confirm in the implementation.
#define CRENC_ID_8BIT_START ce_8bit_cp
63 
/// Converts an encoding name to a numeric encoding id.
/// NOTE(review): purpose inferred from name and signature. Presumably the result
/// is one of the CRENC_ID_* values (CRENC_ID_UNKNOWN for an unrecognized name) —
/// confirm against the implementation.
int CREncodingNameToId( const lChar32 * name );
/// Converts a numeric encoding id back to an encoding name.
/// NOTE(review): purpose inferred from name and signature. The result for an
/// unrecognized id is not visible in this header — confirm before dereferencing.
const char * CREncodingIdToName( int id );
66 
/**
    \brief Searches for the 8-bit encoding to Unicode conversion table by encoding name.

    A conversion table is a table of 128 Unicode characters corresponding to
    the 8-bit encoding characters 128..255: enc_table[0] is the Unicode value
    for character 128 of the 8-bit encoding.

    \param encoding_name name of the encoding, e.g. "utf-8", "windows-1251"

    \return pointer to the conversion table if found, NULL otherwise
*/
const lChar32 * GetCharsetByte2UnicodeTable( const lChar32 * encoding_name );
/// Byte-to-Unicode conversion table lookup keyed by a numeric id.
/// NOTE(review): presumably `id` is an encoding id such as the one returned by
/// CREncodingNameToId(), and the result is NULL when no table exists — confirm.
const lChar32 * GetCharsetByte2UnicodeTableById( int id );
/// NOTE(review): judging by the name, returns the reverse (Unicode to 8-bit)
/// mapping for the named encoding. The layout of the returned table and the
/// result for an unknown name are not visible in this header — confirm.
const lChar8 ** GetCharsetUnicode2ByteTable( const lChar32 * encoding_name );
/// Gets the conversion table for the upper 128 characters of a codepage, by codepage number.
/// Overload of the by-name lookup that takes a numeric codepage instead.
/// NOTE(review): the result for an unsupported codepage is presumably NULL — confirm.
const lChar32 * GetCharsetByte2UnicodeTable( int codepage );
/// Returns the charset name for a supported codepage number, e.g. "cp1251" for 1251.
/// NOTE(review): the result for an unsupported codepage is not visible here — confirm.
const lChar32 * GetCharsetName( int codepage );
/// Converts a language id to a codepage number (MS).
int langToCodepage( int lang );
/// NOTE(review): judging by the name, converts the same kind of language id to a
/// language name string; the exact format of the result is not visible here — confirm.
const char* langToLanguage( int lang );
88 
/**
    \brief Autodetects the encoding of text data in a buffer.

    \param buf is buffer with text data to autodetect
    \param buf_size is size of data in buffer, bytes
    \param cp_name is buffer to store autodetected name of encoding, e.g. "utf-8", "windows-1251"
    \param lang_name is buffer to store autodetected name of language, e.g. "en", "ru"
    \param skipHtml if true, skip HTML/XML tags

    NOTE(review): the required capacity of cp_name and lang_name is not stated
    in this header — confirm against the implementation before sizing them.

    \return non-zero on success
*/
int AutodetectCodePage(const unsigned char * buf, int buf_size, char * cp_name, char * lang_name, bool skipHtml);
/**
    \brief Autodetects the encoding of text data in a buffer, using only Byte Order Mark or UTF-8 validity detection.

    \param buf is buffer with text data to autodetect
    \param buf_size is size of data in buffer, bytes
    \param cp_name is buffer to store autodetected name of encoding, e.g. "utf-8", "windows-1251"
    \param lang_name is buffer to store autodetected name of language, e.g. "en", "ru"

    NOTE(review): the required capacity of cp_name and lang_name is not stated
    in this header — confirm against the implementation before sizing them.

    \return non-zero on success
*/
int AutodetectCodePageUtf( const unsigned char * buf, int buf_size, char * cp_name, char * lang_name );
112 
/// NOTE(review): judging by the name, reports whether the first `size` bytes of
/// `buf` contain XML tags; the exact detection criteria are not visible here — confirm.
bool hasXmlTags(const lUInt8 * buf, int size);
114 
/**
    \brief Checks whether a data buffer is a valid UTF-8 stream.

    \param buf is buffer with text data to check
    \param buf_size is size of data in buffer, bytes

    \return true if buffer has valid UTF-8 data
*/
bool isValidUtf8Data( const unsigned char * buf, int buf_size );
124 
/// NOTE(review): judging by the name and parameters, gathers statistics for file
/// `fname` tagged with the given encoding name (cp_name) and language name
/// (lang_name). The roles of `index`, the FILE stream `f` and the `list` string
/// are not visible in this header — confirm against the implementation.
void MakeStatsForFile( const char * fname, const char * cp_name, const char * lang_name, int index, FILE * f, lString8 & list );
126 
127 
128 #endif
129