1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include <sys/types.h>
19 
20 #ifndef UDM_CHARSET_H
21 #define UDM_CHARSET_H
22 
23 #define UDM_RECODE_TEXT                 0
24 #define UDM_RECODE_HTML_NONASCII	1 /* Put unconvertable characters using &#123; notation */
25 #define UDM_RECODE_HTML_OUT             1 /* Synonym for HTML_NONASCII */
26 #define UDM_RECODE_HTML_SPECIAL		2 /* recognize &#123; and &#x123; in input */
27 #define UDM_RECODE_HTML_IN              2 /* Synonym for HTML_SPECIAL */
28 #define UDM_RECODE_HTML                 3
29 #define UDM_RECODE_HTML_NONASCII_HEX    4 /* Put unconvertable characters using &#x123; notation */
30 #define UDM_RECODE_STRIP_ACCENTS        8
31 #define UDM_RECODE_HTML_OUT_SPECIAL    16 /* Convert &<> using &amp; &lt; &gt; */
32 
33 #define UDM_CHARSET_UNKNOWN             0
34 #define UDM_CHARSET_ARABIC              1
35 #define UDM_CHARSET_ARMENIAN            2
36 #define UDM_CHARSET_BALTIC              3
37 #define UDM_CHARSET_CELTIC              4
38 #define UDM_CHARSET_CENTRAL             5
39 #define UDM_CHARSET_CHINESE_SIMPLIFIED  6
40 #define UDM_CHARSET_CHINESE_TRADITIONAL 7
41 #define UDM_CHARSET_CYRILLIC            8
42 #define UDM_CHARSET_GREEK               9
43 #define UDM_CHARSET_HEBREW             10
44 #define UDM_CHARSET_ICELANDIC          11
45 #define UDM_CHARSET_JAPANESE           12
46 #define UDM_CHARSET_KOREAN             13
47 #define UDM_CHARSET_NORDIC             14
48 #define UDM_CHARSET_SOUTHERN           15
49 #define UDM_CHARSET_THAI               16
50 #define UDM_CHARSET_TURKISH            17
51 #define UDM_CHARSET_UNICODE            18
52 #define UDM_CHARSET_VIETNAMESE         19
53 #define UDM_CHARSET_WESTERN            20
54 #define UDM_CHARSET_INDIAN             21
55 #define UDM_CHARSET_GEORGIAN           22
56 
57 typedef struct
58 {
59   int id;
60   const char * name;
61 } UDM_CHARSETGROUP;
62 
63 #define UDM_CHARSET_8859_1      0
64 #define UDM_CHARSET_8859_10     1
65 #define UDM_CHARSET_8859_11     2
66 #define UDM_CHARSET_8859_13     3
67 #define UDM_CHARSET_8859_14     4
68 #define UDM_CHARSET_8859_15     5
69 #define UDM_CHARSET_8859_16     6
70 #define UDM_CHARSET_8859_2      7
71 #define UDM_CHARSET_8859_3      8
72 #define UDM_CHARSET_8859_4      9
73 #define UDM_CHARSET_8859_5      10
74 #define UDM_CHARSET_8859_6      11
75 #define UDM_CHARSET_8859_7      12
76 #define UDM_CHARSET_8859_8      13
77 #define UDM_CHARSET_8859_9      14
78 #define UDM_CHARSET_ARMSCII_8   15
79 #define UDM_CHARSET_CP1250      16
80 #define UDM_CHARSET_CP1251      17
81 #define UDM_CHARSET_CP1252      18
82 #define UDM_CHARSET_CP1253      19
83 #define UDM_CHARSET_CP1254      20
84 #define UDM_CHARSET_CP1255      21
85 #define UDM_CHARSET_CP1256      22
86 #define UDM_CHARSET_CP1257      23
87 #define UDM_CHARSET_CP1258      24
88 #define UDM_CHARSET_CP437       25
89 #define UDM_CHARSET_CP850       26
90 #define UDM_CHARSET_CP852       27
91 #define UDM_CHARSET_CP855       28
92 #define UDM_CHARSET_CP857       29
93 #define UDM_CHARSET_CP860       30
94 #define UDM_CHARSET_CP861       31
95 #define UDM_CHARSET_CP862       32
96 #define UDM_CHARSET_CP863       33
97 #define UDM_CHARSET_CP864       34
98 #define UDM_CHARSET_CP865       35
99 #define UDM_CHARSET_CP866       36
100 #define UDM_CHARSET_CP869       37
101 #define UDM_CHARSET_CP874       38
102 #define UDM_CHARSET_KOI8_R      39
103 #define UDM_CHARSET_KOI8_U      40
104 #define UDM_CHARSET_MACARABIC   41
105 #define UDM_CHARSET_MACCE       42
106 #define UDM_CHARSET_MACCROATIAN 43
107 #define UDM_CHARSET_MACCYRILLIC 44
108 #define UDM_CHARSET_MACGREEK    45
109 #define UDM_CHARSET_MACHEBREW   46
110 #define UDM_CHARSET_MACICELAND  47
111 #define UDM_CHARSET_MACROMAN    48
112 #define UDM_CHARSET_MACROMANIA  49
113 #define UDM_CHARSET_MACTHAI     50
114 #define UDM_CHARSET_MACTURKISH  51
115 #define UDM_CHARSET_US_ASCII    52
116 #define UDM_CHARSET_VISCII      53
117 #define UDM_CHARSET_UTF8        54
118 #define UDM_CHARSET_GB2312      55
119 #define UDM_CHARSET_BIG5        56
120 #define UDM_CHARSET_SJIS        57
121 #define UDM_CHARSET_EUC_KR      58
122 #define UDM_CHARSET_EUC_JP      60
123 #define UDM_CHARSET_GBK         61
124 #define UDM_CHARSET_GUJARATI    62
125 #define UDM_CHARSET_TSCII       63
126 #define UDM_CHARSET_ISO2022JP   64
127 #define UDM_CHARSET_GEOSTD8     65
128 #define UDM_CHARSET_SYS_INT     255
129 
130 typedef struct
131 {
132   unsigned short from;
133   unsigned short to;
134   unsigned char  *tab;
135 } UDM_UNI_IDX;
136 
137 struct udm_conv_st;
138 
139 struct udm_unidata_st;
140 
141 struct udm_cset_st;
142 
143 typedef unsigned int udm_wc_t;
144 
145 typedef int udm_mbstate_t;
146 
147 typedef struct udm_word_scanner_st
148 {
149   const char *str;
150   const char *end;
151   const struct udm_unidata_st *unidata;
152   const struct udm_cset_st *cs;
153   int flags;
154 } UDM_WORD_SCANNER;
155 
156 typedef struct udm_word_scanner_token_st
157 {
158   udmcrc32_t crc;
159   const char *str;
160 } UDM_WORD_SCANNER_TOKEN;
161 
162 void UdmWordScannerInit(UDM_WORD_SCANNER *scanner,
163                         const struct udm_unidata_st *unidata,
164                         const struct udm_cset_st *cs);
165 
166 void UdmWordScannerSetSource(UDM_WORD_SCANNER *scanner,
167                              int flags, const char *str, size_t length);
168 
169 typedef struct udm_cset_handler_st
170 {
171   int (*mb_wc)(udm_mbstate_t *mbstate,
172                const struct udm_cset_st *cs, udm_wc_t *wc,
173                const unsigned char *s,const unsigned char *e, int flags);
174   int (*wc_mb)(udm_mbstate_t *mbstate,
175                const struct udm_cset_st *cs, udm_wc_t wc,
176                unsigned char *s,unsigned char *e, int flags);
177   void (*lcase)(const struct udm_unidata_st *, const struct udm_cset_st *cs,
178                 char *str, size_t bytelen);
179   const char *(*septoken)(const struct udm_unidata_st *,
180                           const struct udm_cset_st *cs,
181                           const char *str, const char *strend,
182                           const char **last, int *ctype0, int flags);
183   int (*getword)(UDM_WORD_SCANNER *scanner, UDM_WORD_SCANNER_TOKEN *str);
184   udmcrc32_t (*crc32lcase)(const struct udm_unidata_st *,
185                            const struct udm_cset_st *cs,
186                            const char *src, size_t srclen, int flags);
187   size_t (*well_formed_length)(const struct udm_cset_st *cs,
188                                const char *src, size_t srclen,
189                                int flags);
190 } UDM_CHARSET_HANDLER;
191 
192 extern UDM_CHARSET_HANDLER udm_charset_handler_8bit;
193 extern UDM_CHARSET_HANDLER udm_charset_handler_usascii;
194 extern UDM_CHARSET_HANDLER udm_charset_handler_latin1;
195 
196 
197 struct udm_cset_st
198 {
199   int id;
200   UDM_CHARSET_HANDLER *cset;
201   const char *name;
202   const char *mysql_name;
203   const char *pgsql_name;
204   int family;
205   unsigned char  *ctype;
206   unsigned short *tab_to_uni;
207   UDM_UNI_IDX    *tab_from_uni;
208 };
209 
210 typedef const struct udm_cset_st UDM_CHARSET;
211 
212 typedef struct udm_conv_st
213 {
214   UDM_CHARSET  *from;
215   UDM_CHARSET  *to;
216 } UDM_CONV;
217 
218 
219 /* Unicode, system dependent integer */
220 extern UDM_CHARSET udm_charset_sys_int;
221 
222 
223 extern UDM_CHARSET udm_charset_usascii;
224 extern UDM_CHARSET udm_charset_latin1;
225 extern UDM_CHARSET udm_charset_utf8;
226 
227 #ifdef HAVE_CHARSET_big5
228 extern UDM_CHARSET udm_charset_big5;
229 #endif
230 
231 #ifdef HAVE_CHARSET_japanese
232 extern UDM_CHARSET udm_charset_sjis;
233 extern UDM_CHARSET udm_charset_eucjp;
234 extern UDM_CHARSET udm_charset_gbk;
235 extern UDM_CHARSET udm_charset_iso2022jp;
236 #endif
237 
238 #ifdef HAVE_CHARSET_euc_kr
239 extern UDM_CHARSET udm_charset_euckr;
240 #endif
241 
242 #ifdef HAVE_CHARSET_gb2312
243 extern UDM_CHARSET udm_charset_gb2312;
244 #endif
245 
246 #ifdef HAVE_CHARSET_gujarati
247 extern UDM_CHARSET udm_charset_gujarati;
248 #endif
249 
250 #ifdef HAVE_CHARSET_tscii
251 extern UDM_CHARSET udm_charset_tscii;
252 #endif
253 
254 
255 /************** Shared functions between handlers *********/
256 const char * UdmStrGetSepTokenMB(const struct udm_unidata_st *,
257                                  UDM_CHARSET *cs,
258                                  const char *str, const char *strend,
259                                  const char **last, int *ctype0, int flags);
260 
261 
262 size_t UdmWellFormedLengthGeneric(UDM_CHARSET *cs,
263                                   const char *src, size_t srclen, int flags);
264 
265 /************** Language and charset guesser *************/
266 
267 
268 #define UDM_LM_MAXGRAM		6
269 #define UDM_LM_HASHMASK		0x0FFF
270 #define UDM_LM_TOPCNT           200
271 
272 typedef struct
273 {
274   size_t count;
275   size_t index;
276   char   str[UDM_LM_MAXGRAM+1];
277 } UDM_LANGITEM;
278 
279 typedef struct
280 {
281   float        expectation;		/**< Average value   */
282   int          needsave;
283   char         *lang;			/**< Map Language    */
284   char         *charset;		/**< Map charset     */
285   char         *filename;		/**< Filename to write updates, if need */
286   UDM_LANGITEM memb[UDM_LM_HASHMASK+1];	/**< Items list      */
287 } UDM_LANGMAP;
288 
289 typedef struct
290 {
291   size_t      nmaps;
292   UDM_LANGMAP *Map;
293 } UDM_LANGMAPLIST;
294 
295 /********************************************/
296 
297 /* Input string in xxx2uni                  */
298 /* convertion  has bad multi-byte sequence  */
299 #define UDM_CHARSET_ILSEQ     0
300 #define UDM_CHARSET_ILSEQ2   -1
301 #define UDM_CHARSET_ILSEQ3   -2
302 #define UDM_CHARSET_ILSEQ4   -3
303 #define UDM_CHARSET_ILSEQ5   -4
304 #define UDM_CHARSET_ILSEQ6   -5
305 
306 /* Input buffer in xxx2uni was terminated   */
307 /* in the middle of multi-byte sequence     */
308 #define UDM_CHARSET_TOOFEW(n) (-6-(n))
309 
310 /* Can't convert unicode into given charset */
311 #define UDM_CHARSET_ILUNI     0
312 
313 /* Output buffer in uni2xxx is too small    */
314 #define UDM_CHARSET_TOOSMALL  -1
315 
316 /*
317   "Unicode value was returned from cache"
318   For character sets like tscii, when one
319   native code corresponds to several Unicode characters:
320   8C -> U+0BE8 U+0BB7 U+0B82
321 */
322 #define UDM_CHARSET_CACHEDUNI -100
323 
324 /* Character types */
325 #define UDM_UNI_SEPAR    0
326 #define UDM_UNI_LETTER   1
327 #define UDM_UNI_DIGIT    2
328 #define UDM_UNI_CJK 3
329 
330 UDM_API(const char *) UdmCsGroup(UDM_CHARSET *cs);
331 UDM_API(UDM_CHARSET *) UdmGetCharSet(const char *name);
332 UDM_API(UDM_CHARSET *) UdmGetCharSetByID(int id);
333 const char *UdmCharsetCanonicalName(const char * aslias);
334 UDM_API(void) UdmConvInit(UDM_CONV *c, UDM_CHARSET *from,UDM_CHARSET *to);
335 UDM_API(int) UdmConv(UDM_CONV *c,
336                      char *d, size_t dlen,
337                      const char *s, size_t slen, int flags);
338 size_t UdmConvLCase(const struct udm_unidata_st *unidata,
339                     UDM_CONV *cnv, int cnvflags,
340                     char *dst, size_t dstlen,
341                     const char *src, size_t srclen);
342 size_t UdmConvSizeNeeded(const UDM_CONV *cnv, size_t srclen, int flags);
343 #define UdmConvHTML(cnv, dst, dstlen, src, srclen) \
344 UdmConv(cnv, (dst), (dstlen), (src), (srclen), UDM_RECODE_HTML)
345 
346 #define UdmConvHTMLNonASCII(cnv, dst, dstlen, src, srclen) \
347 UdmConv(cnv, (dst), (dstlen), (src), (srclen), UDM_RECODE_HTML_NONASCII)
348 
349 void UdmConvFree(UDM_CONV *c);
350 
351 #endif
352