1 /*
2 * Copyright (C) 2008, 2009
3 * Free Software Foundation, Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
8 * any later version.
9 *
10 * \author Yang Jianyu <xiaoyjy@hotmail.com>
11 */
12
13 #include <stdio.h>
14 #include <string.h>
15 #include <stdlib.h>
16 #include <signal.h>
17 #include <time.h>
18 #include <errno.h>
19
20 #include "cconv.h"
21 #include "cconv_table.h"
22 #include "unicode.h"
23
24 #ifdef HAVE_CONFIG_H
25 #include "config.h"
26 #endif
27
28 typedef struct cconv_struct
29 {
30 cconv_type cconv_cd;
31 iconv_t iconv_cd;
32 iconv_t gb_utf8;
33 iconv_t bg_utf8;
34 iconv_t utf8_gb;
35 iconv_t utf8_bg;
36 int size_factor;
37 char options[16];
38 }
39 cconv_struct;
40
41 static size_t cconv_utf8(
42 const char** inbuf,
43 size_t* inleft ,
44 char** outbuf ,
45 size_t* outleft ,
46 const language_zh_map *m,
47 int map_size
48 );
49
50 static int find_keyword(
51 const char* inbytes ,
52 size_t* length ,
53 const language_zh_map *m ,
54 int begin ,
55 int end ,
56 const int whence
57 );
58
59 static int binary_find(
60 const char* inbytes ,
61 size_t* length ,
62 const language_zh_map *m ,
63 int begin ,
64 int end
65 );
66
67 static int match_cond(
68 const factor_zh_map* cond ,
69 const char* str ,
70 int klen ,
71 const int whence
72 );
73
74 static int match_real_cond(
75 const char* mc ,
76 const char* str ,
77 int head ,
78 const int whence
79 );
80
81 /* {{{ cconv_t cconv_open(const char* tocode, const char* fromcode) */
82 /**
83 * Open a cconv handle.
84 *
85 * @param tocode Convert to-code.
86 * @param fromcode Convert from-code.
87 * @retval t_handle Cconv handle,(-1: error).
88 */
cconv_open(const char * tocode,const char * fromcode)89 cconv_t cconv_open(const char* tocode, const char* fromcode)
90 {
91 char code[8] = {0, };
92 char *ptr;
93 cconv_struct* cd = (cconv_struct*)malloc(sizeof(cconv_struct));
94 cd->cconv_cd = CCONV_NULL;
95 cd->iconv_cd = NULL;
96 cd->gb_utf8 = NULL;
97 cd->bg_utf8 = NULL;
98 cd->utf8_gb = NULL;
99 cd->utf8_bg = NULL;
100 cd->size_factor = 4;
101
102 /* //IGNORE //TRANSPORT etc. */
103 if((ptr = strstr(fromcode, "//")) != NULL)
104 {
105 strncpy(cd->options, ptr , 16);
106 strncpy(code , fromcode, ptr - fromcode);
107 fromcode = code;
108 }
109
110 if(0 == strcasecmp(CCONV_CODE_GBL, fromcode))
111 {
112 cd->gb_utf8 = iconv_open(CCONV_CODE_UTF, CCONV_CODE_GBL);
113 if(0 == strcasecmp(CCONV_CODE_UHT, tocode) || 0 == strcasecmp(CCONV_CODE_UHK, tocode)
114 ||0 == strcasecmp(CCONV_CODE_UTW, tocode))
115 {
116 cd->cconv_cd = CCONV_GBL_TO_UHT;
117 }
118 else if(0 == strcasecmp(CCONV_CODE_UHS, tocode) || 0 == strcasecmp(CCONV_CODE_UCN, tocode))
119 cd->cconv_cd = CCONV_GBL_TO_UHS;
120 else if(0 == strcasecmp(CCONV_CODE_BIG, tocode))
121 {
122 cd->cconv_cd = CCONV_GBL_TO_BIG;
123 cd->utf8_bg = iconv_open(CCONV_CODE_BIG, CCONV_CODE_UTF);
124 }
125 else if(0 == strcasecmp(CCONV_CODE_GHS, tocode))
126 {
127 cd->cconv_cd = CCONV_GBL_TO_GHS;
128 cd->utf8_gb = iconv_open(CCONV_CODE_GBL, CCONV_CODE_UTF);
129 }
130 else if(0 == strcasecmp(CCONV_CODE_GHT, tocode))
131 {
132 cd->cconv_cd = CCONV_GBL_TO_GHT;
133 cd->utf8_gb = iconv_open(CCONV_CODE_GBL, CCONV_CODE_UTF);
134 }
135 }
136 else
137 if(0 == strcasecmp(CCONV_CODE_UTF, fromcode)
138 ||0 == strcasecmp(CCONV_CODE_UHS, fromcode)
139 ||0 == strcasecmp(CCONV_CODE_UHT, fromcode)
140 ||0 == strcasecmp(CCONV_CODE_UCN, fromcode)
141 ||0 == strcasecmp(CCONV_CODE_UHK, fromcode)
142 ||0 == strcasecmp(CCONV_CODE_UTW, fromcode)
143 ) {
144 if(0 == strcasecmp(CCONV_CODE_UHS, tocode) || 0 == strcasecmp(CCONV_CODE_UCN, tocode))
145 cd->cconv_cd = CCONV_UTF_TO_UHS;
146 else if(0 == strcasecmp(CCONV_CODE_UHT, tocode) || 0 == strcasecmp(CCONV_CODE_UHK, tocode)
147 || 0 == strcasecmp(CCONV_CODE_UTW, tocode))
148 cd->cconv_cd = CCONV_UTF_TO_UHT;
149 else if(0 == strcasecmp(CCONV_CODE_GBL, tocode))
150 {
151 cd->cconv_cd = CCONV_UTF_TO_GBL;
152 cd->utf8_gb = iconv_open(CCONV_CODE_GBL, CCONV_CODE_UTF);
153 }
154 else if(0 == strcasecmp(CCONV_CODE_BIG, tocode))
155 {
156 cd->cconv_cd = CCONV_UTF_TO_BIG;
157 cd->utf8_bg = iconv_open(CCONV_CODE_BIG, CCONV_CODE_UTF);
158 }
159
160 cd->size_factor = 1;
161 }
162 else
163 if(0 == strcasecmp(CCONV_CODE_BIG, fromcode))
164 {
165 if(0 == strcasecmp(CCONV_CODE_GBL, tocode))
166 {
167 cd->cconv_cd = CCONV_BIG_TO_GBL;
168 cd->bg_utf8 = iconv_open(CCONV_CODE_UTF, CCONV_CODE_BIG);
169 cd->utf8_gb = iconv_open(CCONV_CODE_GBL, CCONV_CODE_UTF);
170 }
171 else if(0 == strcasecmp(CCONV_CODE_UHS, tocode) || 0 == strcasecmp(CCONV_CODE_UCN, tocode))
172 {
173 cd->cconv_cd = CCONV_BIG_TO_UHS;
174 cd->bg_utf8 = iconv_open(CCONV_CODE_UTF, CCONV_CODE_BIG);
175 }
176
177 /* just use iconv to do others. */
178 }
179
180 if(cd->cconv_cd == CCONV_NULL)
181 cd->iconv_cd = iconv_open(tocode, fromcode);
182
183 if( cd->iconv_cd == (iconv_t)(-1) || cd->gb_utf8 == (iconv_t)(-1)
184 || cd->bg_utf8 == (iconv_t)(-1) || cd->utf8_gb == (iconv_t)(-1)
185 || cd->utf8_bg == (iconv_t)(-1)) {
186 cconv_close(cd);
187 return (cconv_t)(CCONV_ERROR);
188 }
189
190 return cd;
191 }
192 /* }}} */
193
194 #define cconv_iconv_first(cd) \
195 ps_outbuf = ps_midbuf = (char*)malloc(o_proc); \
196 if(iconv(cd, inbuf, inbytesleft, &ps_outbuf, &o_proc) == -1) { \
197 free(ps_midbuf); return (size_t)(-1); \
198 } \
199 *ps_outbuf = '\0'; \
200
201 #define cconv_cconv_second(n, o) \
202 cd_struct->cconv_cd = n; \
203 ps_inbuf = ps_midbuf; \
204 o_proc = strlen(ps_midbuf); \
205 if((i_proc = cconv(cd, &ps_inbuf, &o_proc, outbuf, outbytesleft)) == -1) { \
206 free(ps_midbuf); return (size_t)(-1); \
207 } \
208 free(ps_midbuf); \
209 cd_struct->cconv_cd = o; \
210 return i_proc;
211
212 #define cconv_cconv_first(n, o) \
213 ps_outbuf = ps_midbuf = (char*)malloc(o_proc); \
214 cd_struct->cconv_cd = n; \
215 if((i_proc = cconv(cd, inbuf, inbytesleft, &ps_outbuf, &o_proc)) == -1) { \
216 free(ps_midbuf); return (size_t)(-1); \
217 } \
218 cd_struct->cconv_cd = o; \
219
220 #define cconv_iconv_second(c) \
221 ps_outbuf = *outbuf; \
222 ps_inbuf = ps_midbuf; \
223 if(iconv(c, &ps_inbuf, &i_proc, outbuf, outbytesleft) == -1) { \
224 free(ps_midbuf); return (size_t)(-1); \
225 } \
226 free(ps_midbuf); \
227 return *outbuf - ps_outbuf;
228
229 #define const_bin_c_str(x) (const unsigned char*)(x)
230
231 #define EMPTY_END_SIZE 8
232
233 /* {{{ size_t cconv() */
234 /**
235 * Convert character code.
236 *
237 * @param in_charset Cconv input charset.
238 * @param out_charset Cconv output charset.
239 * @param inbuf Input buffer.
240 * @param inbytesleft Input buffer left.
241 * @retval t_handle Cconv handle,(-1: error).
242 */
cconv(cconv_t cd,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)243 size_t cconv(cconv_t cd,
244 #ifdef FreeBSD
245 const char** inbuf,
246 #else
247 char** inbuf,
248 #endif
249 size_t* inbytesleft,
250 char** outbuf,
251 size_t* outbytesleft)
252 {
253 size_t i_proc = 0, o_proc = 0;
254 #ifdef FreeBSD
255 const char *ps_inbuf = NULL;
256 #else
257 char *ps_inbuf = NULL;
258 #endif
259 char *ps_midbuf, *ps_outbuf = NULL;
260 language_zh_map *m;
261 int map_size;
262
263 if(NULL == inbuf || NULL == *inbuf || NULL == inbytesleft || NULL == outbuf || NULL == *outbuf || NULL == outbytesleft)
264 return(size_t)(-1);
265
266 cconv_struct *cd_struct = cd;
267 ps_inbuf = *inbuf;
268 ps_outbuf = *outbuf;
269 o_proc = cd_struct->size_factor * (*inbytesleft) + EMPTY_END_SIZE;
270
271 if((cconv_t)(CCONV_ERROR) == cd)
272 return(size_t)(-1);
273
274 switch(cd_struct->cconv_cd)
275 {
276 case CCONV_UTF_TO_UHT:
277 case CCONV_UTF_TO_UHS:
278 m = zh_map (cd_struct->cconv_cd);
279 map_size = zh_map_size(cd_struct->cconv_cd);
280 return cconv_utf8((const char**)inbuf, inbytesleft, outbuf, outbytesleft, m, map_size);
281
282 case CCONV_UTF_TO_GBL:
283 cconv_cconv_first(CCONV_UTF_TO_UHS, CCONV_UTF_TO_GBL);
284 cconv_iconv_second(cd_struct->utf8_gb);
285
286 case CCONV_UTF_TO_BIG:
287 cconv_cconv_first(CCONV_UTF_TO_UHT, CCONV_UTF_TO_BIG);
288 cconv_iconv_second(cd_struct->utf8_bg);
289
290 case CCONV_GBL_TO_UHT:
291 cconv_iconv_first(cd_struct->gb_utf8);
292 cconv_cconv_second(CCONV_UTF_TO_UHT, CCONV_GBL_TO_UHT);
293
294 case CCONV_GBL_TO_UHS:
295 cconv_iconv_first(cd_struct->gb_utf8);
296 cconv_cconv_second(CCONV_UTF_TO_UHS, CCONV_GBL_TO_UHS);
297
298 case CCONV_GBL_TO_BIG:
299 cconv_cconv_first(CCONV_GBL_TO_UHT, CCONV_GBL_TO_BIG);
300 cconv_iconv_second(cd_struct->utf8_bg);
301
302 case CCONV_GBL_TO_GHS:
303 cconv_cconv_first(CCONV_GBL_TO_UHS, CCONV_GBL_TO_GHS);
304 cconv_iconv_second(cd_struct->utf8_gb);
305
306 case CCONV_GBL_TO_GHT:
307 cconv_cconv_first(CCONV_GBL_TO_UHT, CCONV_GBL_TO_GHT);
308 cconv_iconv_second(cd_struct->utf8_gb);
309
310 case CCONV_BIG_TO_UHS:
311 cconv_iconv_first(cd_struct->bg_utf8);
312 cconv_cconv_second(CCONV_UTF_TO_UHS, CCONV_BIG_TO_UHS);
313
314 case CCONV_BIG_TO_GBL:
315 cconv_cconv_first(CCONV_BIG_TO_UHS, CCONV_BIG_TO_GBL);
316 cconv_iconv_second(cd_struct->utf8_gb);
317
318 case CCONV_NULL:
319 default:
320 break;
321 } // switch
322
323 ps_outbuf = *outbuf;
324 if(iconv(cd_struct->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft) == -1)
325 return (size_t)(-1);
326
327 return *outbuf - ps_outbuf;
328 }
329 /* }}} */
330
331 /* {{{ int cconv_close( cconv_t cd ) */
332 /**
333 * Close a cconv handle.
334 *
335 * @param cd Cconv handle.
336 * @return 0: succ, -1: fail.
337 */
cconv_close(cconv_t cd)338 int cconv_close(cconv_t cd)
339 {
340 cconv_struct *c = cd;
341 if(c->iconv_cd && (iconv_t)(-1) != c->iconv_cd) iconv_close(c->iconv_cd);
342 if(c->gb_utf8 && (iconv_t)(-1) != c->gb_utf8 ) iconv_close(c->gb_utf8 );
343 if(c->bg_utf8 && (iconv_t)(-1) != c->bg_utf8 ) iconv_close(c->bg_utf8 );
344 if(c->utf8_gb && (iconv_t)(-1) != c->utf8_gb ) iconv_close(c->utf8_gb );
345 if(c->utf8_bg && (iconv_t)(-1) != c->utf8_bg ) iconv_close(c->utf8_bg );
346 free(c);
347 return 0;
348 }
349 /* }}} */
350
/**
 * Rewrite UTF-8 text through a mapping table.
 *
 * Walks the input one UTF-8 character at a time. Multi-byte characters are
 * looked up with find_keyword(); on a hit the table value is written instead
 * of the matched input, otherwise the character is copied unchanged.
 *
 * Conversion stops early, without an error return, when the input ends in
 * the middle of a character, when a character reports zero width, or when
 * the next piece does not fit in the output (errno is set to E2BIG in that
 * last case). Callers can detect a partial conversion from *inleft > 0.
 *
 * @param   inbuf     In/out: input cursor, advanced past the consumed bytes.
 * @param   inleft    In/out: input bytes remaining.
 * @param   outbuf    In/out: output cursor, advanced past the written bytes.
 * @param   outleft   In/out: output bytes remaining.
 * @param   m         Mapping table.
 * @param   map_size  Number of entries in m.
 * @return  Number of bytes written. The output is NUL-terminated only when
 *          at least one byte of room remains (the terminator is not counted).
 */
static size_t cconv_utf8(const char** inbuf, size_t* inleft, char** outbuf, size_t* outleft, const language_zh_map *m, int map_size)
{
	const char *src = *inbuf;
	char       *dst = *outbuf;
	size_t      consumed = 0;
	size_t      written;

	while(*inleft > 0 && *outleft > 0)
	{
		const size_t char_width = utf8_char_width(const_bin_c_str(src));
		size_t       in_width   = char_width;
		const char  *piece      = src;
		size_t       piece_len;
		int          index      = -1;

		/* Incomplete trailing character (this also catches a (size_t)-1 width). */
		if(char_width > *inleft)
			break;

		/* A zero width would never advance the cursors. */
		if(char_width == 0)
			break;

		if(char_width > 1)
			index = find_keyword(src, &in_width, m, 0, map_size - 1, (int)consumed);

		/* find_keyword() does not know where the input ends; ignore a match
		 * that claims more bytes than the caller gave us. */
		if(index != -1 && in_width > *inleft)
			index = -1;

		if(index != -1)
		{
			piece     = map_val(m, index);
			piece_len = strlen(piece);
		}
		else
		{
			in_width = char_width;

			/* NOTE(review): not reachable in practice, the `> *inleft` test
			 * above fires first; kept so the return contract is unchanged. */
			if(in_width == (size_t)(-1))
			{
				errno = EINVAL;
				return (size_t)(-2);
			}

			piece_len = in_width;
		}

		/* Never write past the space the caller advertised. */
		if(piece_len > *outleft)
		{
			errno = E2BIG;
			break;
		}

		memcpy(dst, piece, piece_len);
		src      += in_width;
		dst      += piece_len;
		*inleft  -= in_width;
		*outleft -= piece_len;
		consumed += in_width;
	}

	written = (size_t)(dst - *outbuf);
	if(*outleft > 0)
		*dst = '\0';

	*inbuf  = src;
	*outbuf = dst;
	return written;
}
398
/**
 * Find the table entry to use for the text starting at `inbytes`.
 *
 * Starts from the entry binary_find() returns for the first *length bytes,
 * then keeps extending the candidate by one UTF-8 character for as long as
 * binary_find() still succeeds, remembering the last hit. The remembered
 * entry is accepted only if its context conditions hold (match_cond()).
 *
 * @param   inbytes  Text to look up.
 * @param   length   In: byte length of the first character. Out: byte length
 *                   recorded for the accepted hit, or the width of the first
 *                   character when the conditions reject the hit.
 * @param   m        Mapping table.
 * @param   begin    First table index to search.
 * @param   end      Last table index to search.
 * @param   whence   Number of bytes already consumed before `inbytes`.
 * @return  Table index of the accepted entry, or -1 when nothing matches.
 */
static int find_keyword(const char* inbytes, size_t* length, const language_zh_map *m, int begin, int end, const int whence)
{
	int    hit, best;
	size_t span, step;

	hit = binary_find(inbytes, length, m, begin, end);
	if(hit == -1)
		return -1;

	/* match the most accurate value */
	span = *length;
	for(;;)
	{
		best    = hit;
		*length = span;

		step  = utf8_char_width(const_bin_c_str(inbytes + span));
		span += step;
		if(step == 0)
			break;

		hit = binary_find(inbytes, &span, m, hit, end);
		if(hit == -1)
			break;
	}

	/* extention word fix */
	if(match_cond(cond_ptr(m, best), inbytes, strlen(map_key(m, best)), whence))
		return best;

	*length = utf8_char_width(const_bin_c_str(inbytes));
	return -1;
}
426
427 /* {{{ int binary_find(cconv_t cd, const char* inbytes, int length, int begin, int end) */
binary_find(const char * inbytes,size_t * length,const language_zh_map * m,int begin,int end)428 int binary_find(const char* inbytes, size_t* length, const language_zh_map *m, int begin, int end)
429 {
430 int middle, last, next_fix = 0;
431 int ret, offset = -1;
432 size_t width, wwidth, nwidth;
433
434 middle = (begin + end) >> 1;
435 width = *length;
436 last = end;
437 while(1)
438 {
439 ret = memcmp(m[middle].key, inbytes, width);
440 if(ret == 0)
441 {
442 if(width == strlen(m[middle].key))
443 return middle;
444
445 /* word key */
446 if(next_fix == 0)
447 {
448 nwidth = utf8_char_width(const_bin_c_str(inbytes+width));
449 wwidth = width + nwidth;
450 if(nwidth != 0 && memcmp(m[middle].key, inbytes, wwidth) <= 0)
451 {
452 while (nwidth != 0
453 && (offset = binary_find(inbytes, &wwidth, m, offset, end)) != -1)
454 {
455 if(wwidth == strlen(m[offset].key))
456 return offset;
457
458 nwidth = utf8_char_width(const_bin_c_str(inbytes+width));
459 wwidth += nwidth;
460 }
461
462 next_fix = 1;
463 }
464 }
465 ret = 1;
466 }
467
468 if(ret > 0)
469 {
470 end = middle - 1;
471 middle = (begin + end) >> 1;
472 }
473 else if(0 > ret)
474 {
475 begin = middle + 1;
476 middle = (begin + end) >> 1;
477 }
478
479 if(end < begin) return -1;
480 }
481
482 return -1;
483 }
484 /* }}} */
485
match_cond(const factor_zh_map * cond,const char * str,int klen,const int whence)486 int match_cond(const factor_zh_map *cond, const char* str, int klen, const int whence)
487 {
488 int y_ma, y_mb;
489 const char *cond_str = NULL;
490 const char *y_a_null, *y_b_null;
491
492 cond_str = cond_c_str(cond, n_ma);
493 if(cond_str && match_real_cond(cond_str , str + klen, 0, whence))
494 return 0;
495
496 cond_str = cond_c_str(cond, n_mb);
497 if(cond_str && match_real_cond(cond_str , str, 1, whence))
498 return 0;
499
500 y_b_null = cond_str = cond_c_str(cond, y_mb);
501 y_ma = cond_str && match_real_cond(cond_str, str, 1, whence);
502
503 y_a_null = cond_str = cond_c_str(cond, y_ma);
504 y_mb = cond_str && match_real_cond(cond_str, str + klen, 0, whence);
505 return (!y_b_null&&!y_a_null) | y_ma | y_mb;
506 }
507
/**
 * Test a comma-separated list of strings against the text around `str`.
 *
 * The list is scanned in place: no scratch copy, no strtok() state, and no
 * scan of the whole remaining input per item. Empty items are skipped.
 *
 * @param   mc      Comma-separated list of items, e.g. "ab,cd".
 * @param   str     Position in the text being converted.
 * @param   head    1: an item matches when the bytes immediately before
 *                  `str` equal it; 0: an item matches when the text at
 *                  `str` starts with it. Any other value never matches.
 * @param   whence  Number of valid bytes before `str`; items longer than
 *                  this cannot match when head is 1.
 * @return  1 when any item matches, 0 otherwise (including NULL arguments).
 */
static int match_real_cond(const char* mc, const char* str, int head, const int whence)
{
	const char *item;

	if(mc == NULL || str == NULL)
		return 0;

	for(item = mc; *item != '\0'; )
	{
		const size_t len = strcspn(item, ",");

		if(len > 0)
		{
			/* A negative whence means no text precedes str at all. */
			if(head == 1 && whence >= 0 && (size_t)whence >= len
			&& memcmp(str - len, item, len) == 0)
				return 1;

			/* strncmp stops at the end of str, so a short tail cannot match. */
			if(head == 0 && strncmp(str, item, len) == 0)
				return 1;
		}

		item += len;
		if(*item == ',')
			item++;
	}

	return 0;
}
536
537