1 /*-------------------------------------------------------------------------
2  *
3  *	  EUC_TW, BIG5 and MULE_INTERNAL
4  *
5  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
6  * Portions Copyright (c) 1994, Regents of the University of California
7  *
8  * IDENTIFICATION
9  *	  src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 
14 #include "postgres.h"
15 #include "fmgr.h"
16 #include "mb/pg_wchar.h"
17 
18 #define ENCODING_GROWTH_RATE 4
19 
20 PG_MODULE_MAGIC;
21 
22 PG_FUNCTION_INFO_V1(euc_tw_to_big5);
23 PG_FUNCTION_INFO_V1(big5_to_euc_tw);
24 PG_FUNCTION_INFO_V1(euc_tw_to_mic);
25 PG_FUNCTION_INFO_V1(mic_to_euc_tw);
26 PG_FUNCTION_INFO_V1(big5_to_mic);
27 PG_FUNCTION_INFO_V1(mic_to_big5);
28 
29 /* ----------
30  * conv_proc(
31  *		INTEGER,	-- source encoding id
32  *		INTEGER,	-- destination encoding id
33  *		CSTRING,	-- source string (null terminated C string)
34  *		CSTRING,	-- destination string (null terminated C string)
35  *		INTEGER,	-- source string length
36  *		BOOL		-- if true, don't throw an error if conversion fails
37  * ) returns INTEGER;
38  *
39  * Returns the number of bytes successfully converted.
40  * ----------
41  */
42 
/*
 * Workhorse routines used by the SQL-callable functions below.  Each one
 * converts "len" bytes starting at its first argument, writes the
 * NUL-terminated result to "p", and returns the number of source bytes it
 * consumed.  With "noError" set, a bad input character stops the conversion
 * instead of raising an error.
 */
static int	euc_tw2big5(const unsigned char *euc, unsigned char *p, int len, bool noError);
static int	big52euc_tw(const unsigned char *big5, unsigned char *p, int len, bool noError);
static int	big52mic(const unsigned char *big5, unsigned char *p, int len, bool noError);
static int	mic2big5(const unsigned char *mic, unsigned char *p, int len, bool noError);
static int	euc_tw2mic(const unsigned char *euc, unsigned char *p, int len, bool noError);
static int	mic2euc_tw(const unsigned char *mic, unsigned char *p, int len, bool noError);
49 
50 Datum
euc_tw_to_big5(PG_FUNCTION_ARGS)51 euc_tw_to_big5(PG_FUNCTION_ARGS)
52 {
53 	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
54 	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
55 	int			len = PG_GETARG_INT32(4);
56 	bool		noError = PG_GETARG_BOOL(5);
57 	int			converted;
58 
59 	CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_BIG5);
60 
61 	converted = euc_tw2big5(src, dest, len, noError);
62 
63 	PG_RETURN_INT32(converted);
64 }
65 
66 Datum
big5_to_euc_tw(PG_FUNCTION_ARGS)67 big5_to_euc_tw(PG_FUNCTION_ARGS)
68 {
69 	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
70 	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
71 	int			len = PG_GETARG_INT32(4);
72 	bool		noError = PG_GETARG_BOOL(5);
73 	int			converted;
74 
75 	CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_EUC_TW);
76 
77 	converted = big52euc_tw(src, dest, len, noError);
78 
79 	PG_RETURN_INT32(converted);
80 }
81 
82 Datum
euc_tw_to_mic(PG_FUNCTION_ARGS)83 euc_tw_to_mic(PG_FUNCTION_ARGS)
84 {
85 	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
86 	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
87 	int			len = PG_GETARG_INT32(4);
88 	bool		noError = PG_GETARG_BOOL(5);
89 	int			converted;
90 
91 	CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_MULE_INTERNAL);
92 
93 	converted = euc_tw2mic(src, dest, len, noError);
94 
95 	PG_RETURN_INT32(converted);
96 }
97 
98 Datum
mic_to_euc_tw(PG_FUNCTION_ARGS)99 mic_to_euc_tw(PG_FUNCTION_ARGS)
100 {
101 	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
102 	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
103 	int			len = PG_GETARG_INT32(4);
104 	bool		noError = PG_GETARG_BOOL(5);
105 	int			converted;
106 
107 	CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_EUC_TW);
108 
109 	converted = mic2euc_tw(src, dest, len, noError);
110 
111 	PG_RETURN_INT32(converted);
112 }
113 
114 Datum
big5_to_mic(PG_FUNCTION_ARGS)115 big5_to_mic(PG_FUNCTION_ARGS)
116 {
117 	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
118 	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
119 	int			len = PG_GETARG_INT32(4);
120 	bool		noError = PG_GETARG_BOOL(5);
121 	int			converted;
122 
123 	CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_MULE_INTERNAL);
124 
125 	converted = big52mic(src, dest, len, noError);
126 
127 	PG_RETURN_INT32(converted);
128 }
129 
130 Datum
mic_to_big5(PG_FUNCTION_ARGS)131 mic_to_big5(PG_FUNCTION_ARGS)
132 {
133 	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
134 	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
135 	int			len = PG_GETARG_INT32(4);
136 	bool		noError = PG_GETARG_BOOL(5);
137 	int			converted;
138 
139 	CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_BIG5);
140 
141 	converted = mic2big5(src, dest, len, noError);
142 
143 	PG_RETURN_INT32(converted);
144 }
145 
146 
147 /*
148  * EUC_TW ---> Big5
149  */
150 static int
euc_tw2big5(const unsigned char * euc,unsigned char * p,int len,bool noError)151 euc_tw2big5(const unsigned char *euc, unsigned char *p, int len, bool noError)
152 {
153 	const unsigned char *start = euc;
154 	unsigned char c1;
155 	unsigned short big5buf,
156 				cnsBuf;
157 	unsigned char lc;
158 	int			l;
159 
160 	while (len > 0)
161 	{
162 		c1 = *euc;
163 		if (IS_HIGHBIT_SET(c1))
164 		{
165 			/* Verify and decode the next EUC_TW input character */
166 			l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len);
167 			if (l < 0)
168 			{
169 				if (noError)
170 					break;
171 				report_invalid_encoding(PG_EUC_TW,
172 										(const char *) euc, len);
173 			}
174 			if (c1 == SS2)
175 			{
176 				c1 = euc[1];	/* plane No. */
177 				if (c1 == 0xa1)
178 					lc = LC_CNS11643_1;
179 				else if (c1 == 0xa2)
180 					lc = LC_CNS11643_2;
181 				else
182 					lc = c1 - 0xa3 + LC_CNS11643_3;
183 				cnsBuf = (euc[2] << 8) | euc[3];
184 			}
185 			else
186 			{					/* CNS11643-1 */
187 				lc = LC_CNS11643_1;
188 				cnsBuf = (c1 << 8) | euc[1];
189 			}
190 
191 			/* Write it out in Big5 */
192 			big5buf = CNStoBIG5(cnsBuf, lc);
193 			if (big5buf == 0)
194 			{
195 				if (noError)
196 					break;
197 				report_untranslatable_char(PG_EUC_TW, PG_BIG5,
198 										   (const char *) euc, len);
199 			}
200 			*p++ = (big5buf >> 8) & 0x00ff;
201 			*p++ = big5buf & 0x00ff;
202 
203 			euc += l;
204 			len -= l;
205 		}
206 		else
207 		{						/* should be ASCII */
208 			if (c1 == 0)
209 			{
210 				if (noError)
211 					break;
212 				report_invalid_encoding(PG_EUC_TW,
213 										(const char *) euc, len);
214 			}
215 			*p++ = c1;
216 			euc++;
217 			len--;
218 		}
219 	}
220 	*p = '\0';
221 
222 	return euc - start;
223 }
224 
225 /*
226  * Big5 ---> EUC_TW
227  */
228 static int
big52euc_tw(const unsigned char * big5,unsigned char * p,int len,bool noError)229 big52euc_tw(const unsigned char *big5, unsigned char *p, int len, bool noError)
230 {
231 	const unsigned char *start = big5;
232 	unsigned short c1;
233 	unsigned short big5buf,
234 				cnsBuf;
235 	unsigned char lc;
236 	int			l;
237 
238 	while (len > 0)
239 	{
240 		/* Verify and decode the next Big5 input character */
241 		c1 = *big5;
242 		if (IS_HIGHBIT_SET(c1))
243 		{
244 			l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len);
245 			if (l < 0)
246 			{
247 				if (noError)
248 					break;
249 				report_invalid_encoding(PG_BIG5,
250 										(const char *) big5, len);
251 			}
252 			big5buf = (c1 << 8) | big5[1];
253 			cnsBuf = BIG5toCNS(big5buf, &lc);
254 
255 			if (lc == LC_CNS11643_1)
256 			{
257 				*p++ = (cnsBuf >> 8) & 0x00ff;
258 				*p++ = cnsBuf & 0x00ff;
259 			}
260 			else if (lc == LC_CNS11643_2)
261 			{
262 				*p++ = SS2;
263 				*p++ = 0xa2;
264 				*p++ = (cnsBuf >> 8) & 0x00ff;
265 				*p++ = cnsBuf & 0x00ff;
266 			}
267 			else if (lc >= LC_CNS11643_3 && lc <= LC_CNS11643_7)
268 			{
269 				*p++ = SS2;
270 				*p++ = lc - LC_CNS11643_3 + 0xa3;
271 				*p++ = (cnsBuf >> 8) & 0x00ff;
272 				*p++ = cnsBuf & 0x00ff;
273 			}
274 			else
275 			{
276 				if (noError)
277 					break;
278 				report_untranslatable_char(PG_BIG5, PG_EUC_TW,
279 										   (const char *) big5, len);
280 			}
281 
282 			big5 += l;
283 			len -= l;
284 		}
285 		else
286 		{
287 			/* ASCII */
288 			if (c1 == 0)
289 			{
290 				if (noError)
291 					break;
292 				report_invalid_encoding(PG_BIG5,
293 										(const char *) big5, len);
294 			}
295 			*p++ = c1;
296 			big5++;
297 			len--;
298 			continue;
299 		}
300 	}
301 	*p = '\0';
302 
303 	return big5 - start;
304 }
305 
306 /*
307  * EUC_TW ---> MIC
308  */
309 static int
euc_tw2mic(const unsigned char * euc,unsigned char * p,int len,bool noError)310 euc_tw2mic(const unsigned char *euc, unsigned char *p, int len, bool noError)
311 {
312 	const unsigned char *start = euc;
313 	int			c1;
314 	int			l;
315 
316 	while (len > 0)
317 	{
318 		c1 = *euc;
319 		if (IS_HIGHBIT_SET(c1))
320 		{
321 			l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len);
322 			if (l < 0)
323 			{
324 				if (noError)
325 					break;
326 				report_invalid_encoding(PG_EUC_TW,
327 										(const char *) euc, len);
328 			}
329 			if (c1 == SS2)
330 			{
331 				c1 = euc[1];	/* plane No. */
332 				if (c1 == 0xa1)
333 					*p++ = LC_CNS11643_1;
334 				else if (c1 == 0xa2)
335 					*p++ = LC_CNS11643_2;
336 				else
337 				{
338 					/* other planes are MULE private charsets */
339 					*p++ = LCPRV2_B;
340 					*p++ = c1 - 0xa3 + LC_CNS11643_3;
341 				}
342 				*p++ = euc[2];
343 				*p++ = euc[3];
344 			}
345 			else
346 			{					/* CNS11643-1 */
347 				*p++ = LC_CNS11643_1;
348 				*p++ = c1;
349 				*p++ = euc[1];
350 			}
351 			euc += l;
352 			len -= l;
353 		}
354 		else
355 		{						/* should be ASCII */
356 			if (c1 == 0)
357 			{
358 				if (noError)
359 					break;
360 				report_invalid_encoding(PG_EUC_TW,
361 										(const char *) euc, len);
362 			}
363 			*p++ = c1;
364 			euc++;
365 			len--;
366 		}
367 	}
368 	*p = '\0';
369 
370 	return euc - start;
371 }
372 
373 /*
374  * MIC ---> EUC_TW
375  */
376 static int
mic2euc_tw(const unsigned char * mic,unsigned char * p,int len,bool noError)377 mic2euc_tw(const unsigned char *mic, unsigned char *p, int len, bool noError)
378 {
379 	const unsigned char *start = mic;
380 	int			c1;
381 	int			l;
382 
383 	while (len > 0)
384 	{
385 		c1 = *mic;
386 		if (!IS_HIGHBIT_SET(c1))
387 		{
388 			/* ASCII */
389 			if (c1 == 0)
390 			{
391 				if (noError)
392 					break;
393 				report_invalid_encoding(PG_MULE_INTERNAL,
394 										(const char *) mic, len);
395 			}
396 			*p++ = c1;
397 			mic++;
398 			len--;
399 			continue;
400 		}
401 		l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
402 		if (l < 0)
403 		{
404 			if (noError)
405 				break;
406 			report_invalid_encoding(PG_MULE_INTERNAL,
407 									(const char *) mic, len);
408 		}
409 		if (c1 == LC_CNS11643_1)
410 		{
411 			*p++ = mic[1];
412 			*p++ = mic[2];
413 		}
414 		else if (c1 == LC_CNS11643_2)
415 		{
416 			*p++ = SS2;
417 			*p++ = 0xa2;
418 			*p++ = mic[1];
419 			*p++ = mic[2];
420 		}
421 		else if (c1 == LCPRV2_B &&
422 				 mic[1] >= LC_CNS11643_3 && mic[1] <= LC_CNS11643_7)
423 		{
424 			*p++ = SS2;
425 			*p++ = mic[1] - LC_CNS11643_3 + 0xa3;
426 			*p++ = mic[2];
427 			*p++ = mic[3];
428 		}
429 		else
430 		{
431 			if (noError)
432 				break;
433 			report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_TW,
434 									   (const char *) mic, len);
435 		}
436 		mic += l;
437 		len -= l;
438 	}
439 	*p = '\0';
440 
441 	return mic - start;
442 }
443 
444 /*
445  * Big5 ---> MIC
446  */
447 static int
big52mic(const unsigned char * big5,unsigned char * p,int len,bool noError)448 big52mic(const unsigned char *big5, unsigned char *p, int len, bool noError)
449 {
450 	const unsigned char *start = big5;
451 	unsigned short c1;
452 	unsigned short big5buf,
453 				cnsBuf;
454 	unsigned char lc;
455 	int			l;
456 
457 	while (len > 0)
458 	{
459 		c1 = *big5;
460 		if (!IS_HIGHBIT_SET(c1))
461 		{
462 			/* ASCII */
463 			if (c1 == 0)
464 			{
465 				if (noError)
466 					break;
467 				report_invalid_encoding(PG_BIG5,
468 										(const char *) big5, len);
469 			}
470 			*p++ = c1;
471 			big5++;
472 			len--;
473 			continue;
474 		}
475 		l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len);
476 		if (l < 0)
477 		{
478 			if (noError)
479 				break;
480 			report_invalid_encoding(PG_BIG5,
481 									(const char *) big5, len);
482 		}
483 		big5buf = (c1 << 8) | big5[1];
484 		cnsBuf = BIG5toCNS(big5buf, &lc);
485 		if (lc != 0)
486 		{
487 			/* Planes 3 and 4 are MULE private charsets */
488 			if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4)
489 				*p++ = LCPRV2_B;
490 			*p++ = lc;			/* Plane No. */
491 			*p++ = (cnsBuf >> 8) & 0x00ff;
492 			*p++ = cnsBuf & 0x00ff;
493 		}
494 		else
495 		{
496 			if (noError)
497 				break;
498 			report_untranslatable_char(PG_BIG5, PG_MULE_INTERNAL,
499 									   (const char *) big5, len);
500 		}
501 		big5 += l;
502 		len -= l;
503 	}
504 	*p = '\0';
505 
506 	return big5 - start;
507 }
508 
509 /*
510  * MIC ---> Big5
511  */
512 static int
mic2big5(const unsigned char * mic,unsigned char * p,int len,bool noError)513 mic2big5(const unsigned char *mic, unsigned char *p, int len, bool noError)
514 {
515 	const unsigned char *start = mic;
516 	unsigned short c1;
517 	unsigned short big5buf,
518 				cnsBuf;
519 	int			l;
520 
521 	while (len > 0)
522 	{
523 		c1 = *mic;
524 		if (!IS_HIGHBIT_SET(c1))
525 		{
526 			/* ASCII */
527 			if (c1 == 0)
528 			{
529 				if (noError)
530 					break;
531 				report_invalid_encoding(PG_MULE_INTERNAL,
532 										(const char *) mic, len);
533 			}
534 			*p++ = c1;
535 			mic++;
536 			len--;
537 			continue;
538 		}
539 		l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
540 		if (l < 0)
541 		{
542 			if (noError)
543 				break;
544 			report_invalid_encoding(PG_MULE_INTERNAL,
545 									(const char *) mic, len);
546 		}
547 		if (c1 == LC_CNS11643_1 || c1 == LC_CNS11643_2 || c1 == LCPRV2_B)
548 		{
549 			if (c1 == LCPRV2_B)
550 			{
551 				c1 = mic[1];	/* get plane no. */
552 				cnsBuf = (mic[2] << 8) | mic[3];
553 			}
554 			else
555 			{
556 				cnsBuf = (mic[1] << 8) | mic[2];
557 			}
558 			big5buf = CNStoBIG5(cnsBuf, c1);
559 			if (big5buf == 0)
560 			{
561 				if (noError)
562 					break;
563 				report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5,
564 										   (const char *) mic, len);
565 			}
566 			*p++ = (big5buf >> 8) & 0x00ff;
567 			*p++ = big5buf & 0x00ff;
568 		}
569 		else
570 		{
571 			if (noError)
572 				break;
573 			report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5,
574 									   (const char *) mic, len);
575 		}
576 		mic += l;
577 		len -= l;
578 	}
579 	*p = '\0';
580 
581 	return mic - start;
582 }
583