1 /*
2  * Copyright (C) 2008, 2009
3  * Free Software Foundation, Inc.
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2, or (at your option)
8  * any later version.
9  *
10  * \author Yang Jianyu <xiaoyjy@hotmail.com>
11  */
12 
13 #include <stdio.h>
14 #include <string.h>
15 #include <stdlib.h>
16 #include <signal.h>
17 #include <time.h>
18 #include <errno.h>
19 
20 #include "cconv.h"
21 #include "cconv_table.h"
22 #include "unicode.h"
23 
24 #ifdef HAVE_CONFIG_H
25 	#include "config.h"
26 #endif
27 
28 typedef struct cconv_struct
29 {
30 	cconv_type cconv_cd;
31 	iconv_t    iconv_cd;
32 	iconv_t    gb_utf8;
33 	iconv_t    bg_utf8;
34 	iconv_t    utf8_gb;
35 	iconv_t    utf8_bg;
36 	int        size_factor;
37 	char       options[16];
38 }
39 cconv_struct;
40 
41 static size_t cconv_utf8(
42 	const char** inbuf,
43 	size_t* inleft    ,
44 	char**  outbuf    ,
45 	size_t* outleft   ,
46 	const language_zh_map *m,
47 	int map_size
48 );
49 
50 static int find_keyword(
51 	const char* inbytes  ,
52 	size_t*     length   ,
53 	const language_zh_map *m   ,
54 	int         begin    ,
55 	int         end      ,
56 	const int   whence
57 );
58 
59 static int binary_find(
60 	const char* inbytes  ,
61 	size_t*     length   ,
62 	const language_zh_map *m   ,
63 	int         begin    ,
64 	int         end
65 );
66 
67 static int match_cond(
68 	const factor_zh_map* cond   ,
69 	const char*          str    ,
70 	int                  klen   ,
71 	const int            whence
72 );
73 
74 static int match_real_cond(
75 	const char* mc   ,
76 	const char* str  ,
77 	int         head ,
78 	const int   whence
79 );
80 
81 /* {{{ cconv_t cconv_open(const char* tocode, const char* fromcode) */
82 /**
83  * Open a cconv handle.
84  *
85  * @param   tocode	Convert to-code.
86  * @param   fromcode	Convert from-code.
87  * @retval  t_handle	Cconv handle,(-1: error).
88  */
cconv_open(const char * tocode,const char * fromcode)89 cconv_t cconv_open(const char* tocode, const char* fromcode)
90 {
91 	char code[8] = {0, };
92 	char *ptr;
93 	cconv_struct* cd = (cconv_struct*)malloc(sizeof(cconv_struct));
94 	cd->cconv_cd = CCONV_NULL;
95 	cd->iconv_cd = NULL;
96 	cd->gb_utf8  = NULL;
97 	cd->bg_utf8  = NULL;
98 	cd->utf8_gb  = NULL;
99 	cd->utf8_bg  = NULL;
100 	cd->size_factor = 4;
101 
102 	/* //IGNORE //TRANSPORT etc. */
103 	if((ptr = strstr(fromcode, "//")) != NULL)
104 	{
105 		strncpy(cd->options, ptr     , 16);
106 		strncpy(code       , fromcode, ptr - fromcode);
107 		fromcode = code;
108 	}
109 
110 	if(0 == strcasecmp(CCONV_CODE_GBL, fromcode))
111 	{
112 		cd->gb_utf8 = iconv_open(CCONV_CODE_UTF, CCONV_CODE_GBL);
113 		if(0 == strcasecmp(CCONV_CODE_UHT, tocode) || 0 == strcasecmp(CCONV_CODE_UHK, tocode)
114 				||0 == strcasecmp(CCONV_CODE_UTW, tocode))
115 		{
116 			cd->cconv_cd = CCONV_GBL_TO_UHT;
117 		}
118 		else if(0 == strcasecmp(CCONV_CODE_UHS, tocode) || 0 == strcasecmp(CCONV_CODE_UCN, tocode))
119 			cd->cconv_cd = CCONV_GBL_TO_UHS;
120 		else if(0 == strcasecmp(CCONV_CODE_BIG, tocode))
121 		{
122 			cd->cconv_cd = CCONV_GBL_TO_BIG;
123 			cd->utf8_bg  = iconv_open(CCONV_CODE_BIG, CCONV_CODE_UTF);
124 		}
125 		else if(0 == strcasecmp(CCONV_CODE_GHS, tocode))
126 		{
127 			cd->cconv_cd = CCONV_GBL_TO_GHS;
128 			cd->utf8_gb  = iconv_open(CCONV_CODE_GBL, CCONV_CODE_UTF);
129 		}
130 		else if(0 == strcasecmp(CCONV_CODE_GHT, tocode))
131 		{
132 			cd->cconv_cd = CCONV_GBL_TO_GHT;
133 			cd->utf8_gb  = iconv_open(CCONV_CODE_GBL, CCONV_CODE_UTF);
134 		}
135 	}
136 	else
137 	if(0 == strcasecmp(CCONV_CODE_UTF, fromcode)
138 	 ||0 == strcasecmp(CCONV_CODE_UHS, fromcode)
139 	 ||0 == strcasecmp(CCONV_CODE_UHT, fromcode)
140 	 ||0 == strcasecmp(CCONV_CODE_UCN, fromcode)
141 	 ||0 == strcasecmp(CCONV_CODE_UHK, fromcode)
142 	 ||0 == strcasecmp(CCONV_CODE_UTW, fromcode)
143 	) {
144 		if(0 == strcasecmp(CCONV_CODE_UHS, tocode) || 0 == strcasecmp(CCONV_CODE_UCN, tocode))
145 			cd->cconv_cd = CCONV_UTF_TO_UHS;
146 		else if(0 == strcasecmp(CCONV_CODE_UHT, tocode) || 0 == strcasecmp(CCONV_CODE_UHK, tocode)
147 		     || 0 == strcasecmp(CCONV_CODE_UTW, tocode))
148 			cd->cconv_cd = CCONV_UTF_TO_UHT;
149 		else if(0 == strcasecmp(CCONV_CODE_GBL, tocode))
150 		{
151 			cd->cconv_cd = CCONV_UTF_TO_GBL;
152 			cd->utf8_gb  = iconv_open(CCONV_CODE_GBL, CCONV_CODE_UTF);
153 		}
154 		else if(0 == strcasecmp(CCONV_CODE_BIG, tocode))
155 		{
156 			cd->cconv_cd = CCONV_UTF_TO_BIG;
157 			cd->utf8_bg  = iconv_open(CCONV_CODE_BIG, CCONV_CODE_UTF);
158 		}
159 
160 		cd->size_factor = 1;
161 	}
162 	else
163 	if(0 == strcasecmp(CCONV_CODE_BIG, fromcode))
164 	{
165 		if(0 == strcasecmp(CCONV_CODE_GBL, tocode))
166 		{
167 			cd->cconv_cd = CCONV_BIG_TO_GBL;
168 			cd->bg_utf8  = iconv_open(CCONV_CODE_UTF, CCONV_CODE_BIG);
169 			cd->utf8_gb  = iconv_open(CCONV_CODE_GBL, CCONV_CODE_UTF);
170 		}
171 		else if(0 == strcasecmp(CCONV_CODE_UHS, tocode) || 0 == strcasecmp(CCONV_CODE_UCN, tocode))
172 		{
173 			cd->cconv_cd = CCONV_BIG_TO_UHS;
174 			cd->bg_utf8  = iconv_open(CCONV_CODE_UTF, CCONV_CODE_BIG);
175 		}
176 
177 		/* just use iconv to do others. */
178 	}
179 
180 	if(cd->cconv_cd == CCONV_NULL)
181 		cd->iconv_cd = iconv_open(tocode, fromcode);
182 
183 	if( cd->iconv_cd == (iconv_t)(-1) || cd->gb_utf8  == (iconv_t)(-1)
184 	 || cd->bg_utf8  == (iconv_t)(-1) || cd->utf8_gb  == (iconv_t)(-1)
185 	 || cd->utf8_bg  == (iconv_t)(-1)) {
186 		cconv_close(cd);
187 		return (cconv_t)(CCONV_ERROR);
188 	}
189 
190 	return cd;
191 }
192 /* }}} */
193 
/*
 * Two-stage conversion helpers, usable only inside cconv(): they read and
 * write its locals (cd, cd_struct, inbuf, inbytesleft, outbuf, outbytesleft,
 * ps_inbuf, ps_midbuf, ps_outbuf, i_proc, o_proc) and may return from it.
 * A "first" macro fills a freshly allocated intermediate buffer (ps_midbuf,
 * o_proc bytes); the matching "second" macro consumes it, frees it and
 * returns the final result.  Every failure path frees the buffer and
 * returns (size_t)(-1).
 */

/* Stage 1 via iconv: convert the caller's input into ps_midbuf and NUL-terminate it. */
#define cconv_iconv_first(icd) \
	do { \
		ps_outbuf = ps_midbuf = (char*)malloc(o_proc); \
		if(ps_midbuf == NULL) \
			return (size_t)(-1); \
		if(iconv(icd, inbuf, inbytesleft, &ps_outbuf, &o_proc) == (size_t)(-1)) { \
			free(ps_midbuf); \
			return (size_t)(-1); \
		} \
		*ps_outbuf = '\0'; \
	} while(0)

/*
 * Stage 2 via cconv: run the NUL-terminated ps_midbuf through mode n into the
 * caller's output.  The handle is switched back to mode o before returning,
 * on failure as well as on success.
 */
#define cconv_cconv_second(n, o) \
	do { \
		cd_struct->cconv_cd = (n); \
		ps_inbuf = ps_midbuf; \
		o_proc   = strlen(ps_midbuf); \
		i_proc   = cconv(cd, &ps_inbuf, &o_proc, outbuf, outbytesleft); \
		cd_struct->cconv_cd = (o); \
		free(ps_midbuf); \
		return i_proc; \
	} while(0)

/*
 * Stage 1 via cconv: run the caller's input through mode n into ps_midbuf and
 * leave the nested result in i_proc.  The handle is switched back to mode o
 * whether or not the nested call succeeded.
 */
#define cconv_cconv_first(n, o) \
	do { \
		ps_outbuf = ps_midbuf = (char*)malloc(o_proc); \
		if(ps_midbuf == NULL) \
			return (size_t)(-1); \
		cd_struct->cconv_cd = (n); \
		i_proc = cconv(cd, inbuf, inbytesleft, &ps_outbuf, &o_proc); \
		cd_struct->cconv_cd = (o); \
		if(i_proc == (size_t)(-1)) { \
			free(ps_midbuf); \
			return (size_t)(-1); \
		} \
	} while(0)

/* Stage 2 via iconv: convert i_proc bytes of ps_midbuf and return the bytes written. */
#define cconv_iconv_second(c) \
	do { \
		ps_outbuf = *outbuf; \
		ps_inbuf  = ps_midbuf; \
		if(iconv(c, &ps_inbuf, &i_proc, outbuf, outbytesleft) == (size_t)(-1)) { \
			free(ps_midbuf); \
			return (size_t)(-1); \
		} \
		free(ps_midbuf); \
		return *outbuf - ps_outbuf; \
	} while(0)

/* View a char buffer as unsigned bytes, as utf8_char_width() is called with. */
#define const_bin_c_str(x) (const unsigned char*)(x)

/* Extra bytes added to every intermediate buffer size computed in cconv(). */
#define EMPTY_END_SIZE 8
232 
233 /* {{{ size_t cconv() */
234 /**
235  * Convert character code.
236  *
237  * @param   in_charset	Cconv input charset.
238  * @param   out_charset	Cconv output charset.
239  * @param   inbuf	   Input buffer.
240  * @param   inbytesleft Input buffer left.
241  * @retval  t_handle	Cconv handle,(-1: error).
242  */
cconv(cconv_t cd,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)243 size_t cconv(cconv_t cd,
244 #ifdef FreeBSD
245 		const char** inbuf,
246 #else
247 		char** inbuf,
248 #endif
249 		size_t* inbytesleft,
250 		char**  outbuf,
251 		size_t* outbytesleft)
252 {
253 	size_t  i_proc = 0, o_proc = 0;
254 #ifdef FreeBSD
255 	const char *ps_inbuf  = NULL;
256 #else
257 	char *ps_inbuf = NULL;
258 #endif
259 	char *ps_midbuf, *ps_outbuf = NULL;
260 	language_zh_map *m;
261 	int map_size;
262 
263 	if(NULL == inbuf  || NULL == *inbuf  || NULL == inbytesleft || NULL == outbuf || NULL == *outbuf || NULL == outbytesleft)
264 		return(size_t)(-1);
265 
266 	cconv_struct *cd_struct = cd;
267 	ps_inbuf  = *inbuf;
268 	ps_outbuf = *outbuf;
269 	o_proc    = cd_struct->size_factor * (*inbytesleft) + EMPTY_END_SIZE;
270 
271 	if((cconv_t)(CCONV_ERROR) == cd)
272 		return(size_t)(-1);
273 
274 	switch(cd_struct->cconv_cd)
275 	{
276 	case CCONV_UTF_TO_UHT:
277 	case CCONV_UTF_TO_UHS:
278 		m        = zh_map     (cd_struct->cconv_cd);
279 		map_size = zh_map_size(cd_struct->cconv_cd);
280 		return cconv_utf8((const char**)inbuf, inbytesleft, outbuf, outbytesleft, m, map_size);
281 
282 	case CCONV_UTF_TO_GBL:
283 		cconv_cconv_first(CCONV_UTF_TO_UHS, CCONV_UTF_TO_GBL);
284 		cconv_iconv_second(cd_struct->utf8_gb);
285 
286 	case CCONV_UTF_TO_BIG:
287 		cconv_cconv_first(CCONV_UTF_TO_UHT, CCONV_UTF_TO_BIG);
288 		cconv_iconv_second(cd_struct->utf8_bg);
289 
290 	case CCONV_GBL_TO_UHT:
291 		cconv_iconv_first(cd_struct->gb_utf8);
292 		cconv_cconv_second(CCONV_UTF_TO_UHT, CCONV_GBL_TO_UHT);
293 
294 	case CCONV_GBL_TO_UHS:
295 		cconv_iconv_first(cd_struct->gb_utf8);
296 		cconv_cconv_second(CCONV_UTF_TO_UHS, CCONV_GBL_TO_UHS);
297 
298 	case CCONV_GBL_TO_BIG:
299 		cconv_cconv_first(CCONV_GBL_TO_UHT, CCONV_GBL_TO_BIG);
300 		cconv_iconv_second(cd_struct->utf8_bg);
301 
302 	case CCONV_GBL_TO_GHS:
303 		cconv_cconv_first(CCONV_GBL_TO_UHS, CCONV_GBL_TO_GHS);
304 		cconv_iconv_second(cd_struct->utf8_gb);
305 
306 	case CCONV_GBL_TO_GHT:
307 		cconv_cconv_first(CCONV_GBL_TO_UHT, CCONV_GBL_TO_GHT);
308 		cconv_iconv_second(cd_struct->utf8_gb);
309 
310 	case CCONV_BIG_TO_UHS:
311 		cconv_iconv_first(cd_struct->bg_utf8);
312 		cconv_cconv_second(CCONV_UTF_TO_UHS, CCONV_BIG_TO_UHS);
313 
314 	case CCONV_BIG_TO_GBL:
315 		cconv_cconv_first(CCONV_BIG_TO_UHS, CCONV_BIG_TO_GBL);
316 		cconv_iconv_second(cd_struct->utf8_gb);
317 
318 	case CCONV_NULL:
319 	default:
320 		break;
321 	} // switch
322 
323 	ps_outbuf = *outbuf;
324 	if(iconv(cd_struct->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft) == -1)
325 		return (size_t)(-1);
326 
327 	return *outbuf - ps_outbuf;
328 }
329 /* }}} */
330 
331 /* {{{ int cconv_close( cconv_t cd ) */
332 /**
333  * Close a cconv handle.
334  *
335  * @param   cd          Cconv handle.
336  * @return              0: succ, -1: fail.
337  */
cconv_close(cconv_t cd)338 int cconv_close(cconv_t cd)
339 {
340 	cconv_struct *c = cd;
341 	if(c->iconv_cd && (iconv_t)(-1) != c->iconv_cd) iconv_close(c->iconv_cd);
342 	if(c->gb_utf8  && (iconv_t)(-1) != c->gb_utf8 ) iconv_close(c->gb_utf8 );
343 	if(c->bg_utf8  && (iconv_t)(-1) != c->bg_utf8 ) iconv_close(c->bg_utf8 );
344 	if(c->utf8_gb  && (iconv_t)(-1) != c->utf8_gb ) iconv_close(c->utf8_gb );
345 	if(c->utf8_bg  && (iconv_t)(-1) != c->utf8_bg ) iconv_close(c->utf8_bg );
346 	free(c);
347 	return 0;
348 }
349 /* }}} */
350 
/**
 * Rewrite UTF-8 text through a mapping table.
 *
 * Walks the input one character at a time.  A multi-byte character is
 * looked up with find_keyword(); on a hit the mapped value replaces the
 * matched bytes, otherwise the character is copied unchanged.  Conversion
 * stops early, without an error, when the next piece would not fit in the
 * remaining input or output space; the caller can detect this from
 * *inleft still being non-zero.
 *
 * @param   inbuf     In/out: input cursor, advanced past the consumed bytes.
 * @param   inleft    In/out: remaining input bytes.
 * @param   outbuf    In/out: output cursor, advanced past the written bytes.
 * @param   outleft   In/out: remaining output bytes.
 * @param   m         Mapping table.
 * @param   map_size  Number of entries in m.
 * @return  Number of bytes written.  A NUL terminator (not counted) is
 *          appended only when at least one output byte remains.
 */
size_t cconv_utf8(const char** inbuf, size_t* inleft, char**  outbuf, size_t* outleft, const language_zh_map *m, int map_size)
{
	const char *ps_inbuf;
	char *ps_outbuf;
	int index;
	size_t c_width, i_proc, o_proc, i_conv = 0, o_conv;

	ps_inbuf  = *inbuf ;
	ps_outbuf = *outbuf;
	while(*inleft > 0 && *outleft > 0)
	{
		/* Width of the character at the cursor; stop on a truncated tail. */
		c_width = utf8_char_width(const_bin_c_str(ps_inbuf));
		if(c_width > *inleft)
			break;

		/* NOTE(review): presumably a width of 0 denotes a NUL byte - confirm
		 * against utf8_char_width(). Stop here, since nothing would advance. */
		if(c_width == 0)
			break;

		i_proc = c_width;
		if(i_proc > 1 &&
		  (index = find_keyword(ps_inbuf, &i_proc, m, 0, map_size - 1, i_conv)) != -1 &&
		  i_proc <= *inleft)
		{
			o_proc = strlen(map_val(m, index));
			if(o_proc > *outleft)
				break; /* replacement does not fit: leave it unconverted */

			memcpy(ps_outbuf, map_val(m, index), o_proc);
			ps_inbuf  += i_proc;
			ps_outbuf += o_proc;
			*inleft   -= i_proc;
			*outleft  -= o_proc;
			i_conv    += i_proc;
			continue;
		}

		/* No usable match (or it ran past the declared input): copy one character. */
		i_proc = c_width;

		/* NOTE(review): a width of (size_t)(-1) is already caught by the
		 * "c_width > *inleft" test above, so this branch looks unreachable
		 * unless *inleft is SIZE_MAX; kept to preserve existing behaviour. */
		if(i_proc == (size_t)(-1))
		{
			errno  = EINVAL;
			return (size_t)(-2);
		}

		if(i_proc > *outleft)
			break; /* character does not fit in the remaining output */

		memcpy(ps_outbuf, ps_inbuf, i_proc);
		ps_inbuf  += i_proc;
		ps_outbuf += i_proc;
		*inleft   -= i_proc;
		*outleft  -= i_proc;
		i_conv    += i_proc;
	}

	o_conv = ps_outbuf - *outbuf;
	/* Terminate only inside the space the caller declared. */
	if(*outleft > 0)
		*ps_outbuf = '\0';
	*inbuf  = ps_inbuf;
	*outbuf = ps_outbuf;
	return o_conv;
}
398 
/*
 * Look up the text at inbytes in m[begin..end], preferring the longest key.
 *
 * On entry *length is the byte width to try first.  After an initial hit the
 * candidate is widened one character at a time for as long as binary_find()
 * keeps succeeding; the last successful index and width are kept.  The
 * entry's context conditions are then checked with match_cond().
 *
 * Returns the table index with *length set to the last width that matched,
 * or -1 with *length reset to the width of the first character when the
 * conditions reject the entry.  When nothing matches at all, -1 is returned
 * and *length is left as passed in.
 */
int find_keyword(const char* inbytes, size_t* length, const language_zh_map *m, int begin, int end, const int whence)
{
	int best, cand;
	size_t span, step;

	cand = binary_find(inbytes, length, m, begin, end);
	if(cand == -1)
		return -1;

	/* match the most accurate value */
	span = *length;
	for(;;)
	{
		best    = cand;
		*length = span;

		/* widen the candidate by the next character, if there is one */
		step  = utf8_char_width(const_bin_c_str(inbytes + span));
		span += step;
		if(step == 0)
			break;

		cand = binary_find(inbytes, &span, m, cand, end);
		if(cand == -1)
			break;
	}

	/* extension word fix */
	if(match_cond(cond_ptr(m, best), inbytes, strlen(map_key(m, best)), whence))
		return best;

	*length = utf8_char_width(const_bin_c_str(inbytes));
	return -1;
}
426 
/* {{{ int binary_find(const char* inbytes, size_t* length, const language_zh_map *m, int begin, int end) */
/**
 * Binary search of m[begin..end] for a key matching the start of inbytes.
 *
 * The first *length bytes of inbytes are compared with each probed key.
 * When they are equal and the key is exactly *length bytes long, that index
 * is returned.  When they are equal but the key has a different length, one
 * attempt is made (guarded by next_fix) to find a key for a wider prefix by
 * recursing; afterwards the probe is treated as "too big" and the search
 * moves to the lower half.
 *
 * *length is only read here, never written.
 *
 * @param   inbytes  Text to look up.
 * @param   length   Address of the prefix width, in bytes, to match.
 * @param   m        Table to search (presumably sorted by key bytes - TODO confirm).
 * @param   begin    First index of the search range.
 * @param   end      Last index of the search range.
 * @return  Index of a matching entry, or -1 when the range is exhausted.
 */
int binary_find(const char* inbytes, size_t* length, const language_zh_map *m, int begin, int end)
{
	int middle, last, next_fix = 0;
	int ret, offset = -1;
	size_t width, wwidth, nwidth;

	middle = (begin + end) >> 1;
	width  = *length;
	last   = end; /* assigned but not used again in this function */
	while(1)
	{
		/* NOTE(review): compares width bytes even if the key is shorter than
		 * width - confirm this cannot read past the key's terminator. */
		ret = memcmp(m[middle].key, inbytes, width);
		if(ret == 0)
		{
			/* exact hit: the key is the requested prefix, nothing more */
			if(width == strlen(m[middle].key))
				return middle;

			/* word key: the probed key differs in length from the prefix;
			 * try once to match a wider prefix of inbytes */
			if(next_fix == 0)
			{
				nwidth = utf8_char_width(const_bin_c_str(inbytes+width));
				wwidth = width + nwidth;
				if(nwidth != 0 && memcmp(m[middle].key, inbytes, wwidth) <= 0)
				{
					/* NOTE(review): offset is still -1 on the first pass, so the
					 * nested search starts at index -1 rather than at middle;
					 * with a small end this can make the nested middle
					 * negative - confirm intent. */
					while (nwidth != 0
						&& (offset = binary_find(inbytes, &wwidth, m, offset, end)) != -1)
					{
						if(wwidth == strlen(m[offset].key))
							return offset;

						/* NOTE(review): measures the character at inbytes+width,
						 * not inbytes+wwidth - looks unintended, confirm. */
						nwidth = utf8_char_width(const_bin_c_str(inbytes+width));
						wwidth += nwidth;
					}

					next_fix = 1;
				}
			}
			/* no exact key here: continue in the lower half */
			ret = 1;
		}

		if(ret > 0)
		{
			end = middle - 1;
			middle = (begin + end) >> 1;
		}
		else if(0 > ret)
		{
			begin = middle + 1;
			middle = (begin + end) >> 1;
		}

		if(end < begin) return -1;
	}

	return -1;
}
484 /* }}} */
485 
match_cond(const factor_zh_map * cond,const char * str,int klen,const int whence)486 int match_cond(const factor_zh_map *cond, const char* str, int klen, const int whence)
487 {
488 	int y_ma, y_mb;
489 	const char *cond_str = NULL;
490 	const char *y_a_null, *y_b_null;
491 
492 	cond_str = cond_c_str(cond, n_ma);
493 	if(cond_str && match_real_cond(cond_str , str + klen, 0, whence))
494 		return 0;
495 
496 	cond_str = cond_c_str(cond, n_mb);
497 	if(cond_str && match_real_cond(cond_str , str, 1, whence))
498 		return 0;
499 
500 	y_b_null = cond_str = cond_c_str(cond, y_mb);
501 	y_ma = cond_str && match_real_cond(cond_str, str, 1, whence);
502 
503 	y_a_null = cond_str = cond_c_str(cond, y_ma);
504 	y_mb = cond_str && match_real_cond(cond_str, str + klen, 0, whence);
505 	return (!y_b_null&&!y_a_null) | y_ma | y_mb;
506 }
507 
/*
 * Test a comma-separated list of alternatives against the text around str.
 *
 * The list is scanned in place: no copy is allocated and strtok() is not
 * used, so the caller's tokenizer state is left untouched.  Empty items
 * (leading, trailing or doubled commas) are skipped.
 *
 * mc      Comma-separated alternatives.
 * str     Position in the text being converted.
 * head    1: an alternative must equal the bytes immediately before str;
 *         0: an alternative must equal the bytes starting at str.
 *         Any other value never matches.
 * whence  Number of bytes available before str; only used when head is 1.
 *         A negative value never matches.
 *
 * Returns 1 when any alternative matches, otherwise 0.
 */
int match_real_cond(const char* mc, const char* str, int head, const int whence)
{
	const char *item = mc;

	while(*item != '\0')
	{
		size_t item_len = strcspn(item, ",");

		if(item_len > 0)
		{
			if(head == 1)
			{
				/* look backwards only as far as the text is known to extend */
				if(whence >= 0 && (size_t)whence >= item_len &&
					memcmp(str - item_len, item, item_len) == 0)
					return 1;
			}
			else if(head == 0)
			{
				/* strncmp stops at the end of str, so a short str cannot match */
				if(strncmp(str, item, item_len) == 0)
					return 1;
			}
		}

		item += item_len;
		if(*item == ',')
			item++;
	}

	return 0;
}
536 
537