1 /*-------------------------------------------------------------------------
2 *
3 * EUC_TW, BIG5 and MULE_INTERNAL
4 *
5 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
7 *
8 * IDENTIFICATION
9 * src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c
10 *
11 *-------------------------------------------------------------------------
12 */
13
14 #include "postgres.h"
15 #include "fmgr.h"
16 #include "mb/pg_wchar.h"
17
18 #define ENCODING_GROWTH_RATE 4
19
20 PG_MODULE_MAGIC;
21
22 PG_FUNCTION_INFO_V1(euc_tw_to_big5);
23 PG_FUNCTION_INFO_V1(big5_to_euc_tw);
24 PG_FUNCTION_INFO_V1(euc_tw_to_mic);
25 PG_FUNCTION_INFO_V1(mic_to_euc_tw);
26 PG_FUNCTION_INFO_V1(big5_to_mic);
27 PG_FUNCTION_INFO_V1(mic_to_big5);
28
29 /* ----------
30 * conv_proc(
31 * INTEGER, -- source encoding id
32 * INTEGER, -- destination encoding id
33 * CSTRING, -- source string (null terminated C string)
34 * CSTRING, -- destination string (null terminated C string)
35 * INTEGER, -- source string length
36 * BOOL -- if true, don't throw an error if conversion fails
37 * ) returns INTEGER;
38 *
39 * Returns the number of bytes successfully converted.
40 * ----------
41 */
42
/*
 * Forward declarations of the file-local conversion workers.
 *
 * All of them share one shape: source bytes, destination buffer, source
 * length in bytes, and the noError flag.  Each returns an int, which the
 * definitions later in this file compute as the number of source bytes
 * consumed.
 */
static int	euc_tw2big5(const unsigned char *euc, unsigned char *p, int len, bool noError);
static int	big52euc_tw(const unsigned char *big5, unsigned char *p, int len, bool noError);
static int	big52mic(const unsigned char *big5, unsigned char *p, int len, bool noError);
static int	mic2big5(const unsigned char *mic, unsigned char *p, int len, bool noError);
static int	euc_tw2mic(const unsigned char *euc, unsigned char *p, int len, bool noError);
static int	mic2euc_tw(const unsigned char *mic, unsigned char *p, int len, bool noError);
49
50 Datum
euc_tw_to_big5(PG_FUNCTION_ARGS)51 euc_tw_to_big5(PG_FUNCTION_ARGS)
52 {
53 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
54 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
55 int len = PG_GETARG_INT32(4);
56 bool noError = PG_GETARG_BOOL(5);
57 int converted;
58
59 CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_BIG5);
60
61 converted = euc_tw2big5(src, dest, len, noError);
62
63 PG_RETURN_INT32(converted);
64 }
65
66 Datum
big5_to_euc_tw(PG_FUNCTION_ARGS)67 big5_to_euc_tw(PG_FUNCTION_ARGS)
68 {
69 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
70 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
71 int len = PG_GETARG_INT32(4);
72 bool noError = PG_GETARG_BOOL(5);
73 int converted;
74
75 CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_EUC_TW);
76
77 converted = big52euc_tw(src, dest, len, noError);
78
79 PG_RETURN_INT32(converted);
80 }
81
82 Datum
euc_tw_to_mic(PG_FUNCTION_ARGS)83 euc_tw_to_mic(PG_FUNCTION_ARGS)
84 {
85 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
86 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
87 int len = PG_GETARG_INT32(4);
88 bool noError = PG_GETARG_BOOL(5);
89 int converted;
90
91 CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_MULE_INTERNAL);
92
93 converted = euc_tw2mic(src, dest, len, noError);
94
95 PG_RETURN_INT32(converted);
96 }
97
98 Datum
mic_to_euc_tw(PG_FUNCTION_ARGS)99 mic_to_euc_tw(PG_FUNCTION_ARGS)
100 {
101 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
102 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
103 int len = PG_GETARG_INT32(4);
104 bool noError = PG_GETARG_BOOL(5);
105 int converted;
106
107 CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_EUC_TW);
108
109 converted = mic2euc_tw(src, dest, len, noError);
110
111 PG_RETURN_INT32(converted);
112 }
113
114 Datum
big5_to_mic(PG_FUNCTION_ARGS)115 big5_to_mic(PG_FUNCTION_ARGS)
116 {
117 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
118 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
119 int len = PG_GETARG_INT32(4);
120 bool noError = PG_GETARG_BOOL(5);
121 int converted;
122
123 CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_MULE_INTERNAL);
124
125 converted = big52mic(src, dest, len, noError);
126
127 PG_RETURN_INT32(converted);
128 }
129
130 Datum
mic_to_big5(PG_FUNCTION_ARGS)131 mic_to_big5(PG_FUNCTION_ARGS)
132 {
133 unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
134 unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
135 int len = PG_GETARG_INT32(4);
136 bool noError = PG_GETARG_BOOL(5);
137 int converted;
138
139 CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_BIG5);
140
141 converted = mic2big5(src, dest, len, noError);
142
143 PG_RETURN_INT32(converted);
144 }
145
146
147 /*
148 * EUC_TW ---> Big5
149 */
150 static int
euc_tw2big5(const unsigned char * euc,unsigned char * p,int len,bool noError)151 euc_tw2big5(const unsigned char *euc, unsigned char *p, int len, bool noError)
152 {
153 const unsigned char *start = euc;
154 unsigned char c1;
155 unsigned short big5buf,
156 cnsBuf;
157 unsigned char lc;
158 int l;
159
160 while (len > 0)
161 {
162 c1 = *euc;
163 if (IS_HIGHBIT_SET(c1))
164 {
165 /* Verify and decode the next EUC_TW input character */
166 l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len);
167 if (l < 0)
168 {
169 if (noError)
170 break;
171 report_invalid_encoding(PG_EUC_TW,
172 (const char *) euc, len);
173 }
174 if (c1 == SS2)
175 {
176 c1 = euc[1]; /* plane No. */
177 if (c1 == 0xa1)
178 lc = LC_CNS11643_1;
179 else if (c1 == 0xa2)
180 lc = LC_CNS11643_2;
181 else
182 lc = c1 - 0xa3 + LC_CNS11643_3;
183 cnsBuf = (euc[2] << 8) | euc[3];
184 }
185 else
186 { /* CNS11643-1 */
187 lc = LC_CNS11643_1;
188 cnsBuf = (c1 << 8) | euc[1];
189 }
190
191 /* Write it out in Big5 */
192 big5buf = CNStoBIG5(cnsBuf, lc);
193 if (big5buf == 0)
194 {
195 if (noError)
196 break;
197 report_untranslatable_char(PG_EUC_TW, PG_BIG5,
198 (const char *) euc, len);
199 }
200 *p++ = (big5buf >> 8) & 0x00ff;
201 *p++ = big5buf & 0x00ff;
202
203 euc += l;
204 len -= l;
205 }
206 else
207 { /* should be ASCII */
208 if (c1 == 0)
209 {
210 if (noError)
211 break;
212 report_invalid_encoding(PG_EUC_TW,
213 (const char *) euc, len);
214 }
215 *p++ = c1;
216 euc++;
217 len--;
218 }
219 }
220 *p = '\0';
221
222 return euc - start;
223 }
224
225 /*
226 * Big5 ---> EUC_TW
227 */
228 static int
big52euc_tw(const unsigned char * big5,unsigned char * p,int len,bool noError)229 big52euc_tw(const unsigned char *big5, unsigned char *p, int len, bool noError)
230 {
231 const unsigned char *start = big5;
232 unsigned short c1;
233 unsigned short big5buf,
234 cnsBuf;
235 unsigned char lc;
236 int l;
237
238 while (len > 0)
239 {
240 /* Verify and decode the next Big5 input character */
241 c1 = *big5;
242 if (IS_HIGHBIT_SET(c1))
243 {
244 l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len);
245 if (l < 0)
246 {
247 if (noError)
248 break;
249 report_invalid_encoding(PG_BIG5,
250 (const char *) big5, len);
251 }
252 big5buf = (c1 << 8) | big5[1];
253 cnsBuf = BIG5toCNS(big5buf, &lc);
254
255 if (lc == LC_CNS11643_1)
256 {
257 *p++ = (cnsBuf >> 8) & 0x00ff;
258 *p++ = cnsBuf & 0x00ff;
259 }
260 else if (lc == LC_CNS11643_2)
261 {
262 *p++ = SS2;
263 *p++ = 0xa2;
264 *p++ = (cnsBuf >> 8) & 0x00ff;
265 *p++ = cnsBuf & 0x00ff;
266 }
267 else if (lc >= LC_CNS11643_3 && lc <= LC_CNS11643_7)
268 {
269 *p++ = SS2;
270 *p++ = lc - LC_CNS11643_3 + 0xa3;
271 *p++ = (cnsBuf >> 8) & 0x00ff;
272 *p++ = cnsBuf & 0x00ff;
273 }
274 else
275 {
276 if (noError)
277 break;
278 report_untranslatable_char(PG_BIG5, PG_EUC_TW,
279 (const char *) big5, len);
280 }
281
282 big5 += l;
283 len -= l;
284 }
285 else
286 {
287 /* ASCII */
288 if (c1 == 0)
289 {
290 if (noError)
291 break;
292 report_invalid_encoding(PG_BIG5,
293 (const char *) big5, len);
294 }
295 *p++ = c1;
296 big5++;
297 len--;
298 continue;
299 }
300 }
301 *p = '\0';
302
303 return big5 - start;
304 }
305
306 /*
307 * EUC_TW ---> MIC
308 */
309 static int
euc_tw2mic(const unsigned char * euc,unsigned char * p,int len,bool noError)310 euc_tw2mic(const unsigned char *euc, unsigned char *p, int len, bool noError)
311 {
312 const unsigned char *start = euc;
313 int c1;
314 int l;
315
316 while (len > 0)
317 {
318 c1 = *euc;
319 if (IS_HIGHBIT_SET(c1))
320 {
321 l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len);
322 if (l < 0)
323 {
324 if (noError)
325 break;
326 report_invalid_encoding(PG_EUC_TW,
327 (const char *) euc, len);
328 }
329 if (c1 == SS2)
330 {
331 c1 = euc[1]; /* plane No. */
332 if (c1 == 0xa1)
333 *p++ = LC_CNS11643_1;
334 else if (c1 == 0xa2)
335 *p++ = LC_CNS11643_2;
336 else
337 {
338 /* other planes are MULE private charsets */
339 *p++ = LCPRV2_B;
340 *p++ = c1 - 0xa3 + LC_CNS11643_3;
341 }
342 *p++ = euc[2];
343 *p++ = euc[3];
344 }
345 else
346 { /* CNS11643-1 */
347 *p++ = LC_CNS11643_1;
348 *p++ = c1;
349 *p++ = euc[1];
350 }
351 euc += l;
352 len -= l;
353 }
354 else
355 { /* should be ASCII */
356 if (c1 == 0)
357 {
358 if (noError)
359 break;
360 report_invalid_encoding(PG_EUC_TW,
361 (const char *) euc, len);
362 }
363 *p++ = c1;
364 euc++;
365 len--;
366 }
367 }
368 *p = '\0';
369
370 return euc - start;
371 }
372
373 /*
374 * MIC ---> EUC_TW
375 */
376 static int
mic2euc_tw(const unsigned char * mic,unsigned char * p,int len,bool noError)377 mic2euc_tw(const unsigned char *mic, unsigned char *p, int len, bool noError)
378 {
379 const unsigned char *start = mic;
380 int c1;
381 int l;
382
383 while (len > 0)
384 {
385 c1 = *mic;
386 if (!IS_HIGHBIT_SET(c1))
387 {
388 /* ASCII */
389 if (c1 == 0)
390 {
391 if (noError)
392 break;
393 report_invalid_encoding(PG_MULE_INTERNAL,
394 (const char *) mic, len);
395 }
396 *p++ = c1;
397 mic++;
398 len--;
399 continue;
400 }
401 l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
402 if (l < 0)
403 {
404 if (noError)
405 break;
406 report_invalid_encoding(PG_MULE_INTERNAL,
407 (const char *) mic, len);
408 }
409 if (c1 == LC_CNS11643_1)
410 {
411 *p++ = mic[1];
412 *p++ = mic[2];
413 }
414 else if (c1 == LC_CNS11643_2)
415 {
416 *p++ = SS2;
417 *p++ = 0xa2;
418 *p++ = mic[1];
419 *p++ = mic[2];
420 }
421 else if (c1 == LCPRV2_B &&
422 mic[1] >= LC_CNS11643_3 && mic[1] <= LC_CNS11643_7)
423 {
424 *p++ = SS2;
425 *p++ = mic[1] - LC_CNS11643_3 + 0xa3;
426 *p++ = mic[2];
427 *p++ = mic[3];
428 }
429 else
430 {
431 if (noError)
432 break;
433 report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_TW,
434 (const char *) mic, len);
435 }
436 mic += l;
437 len -= l;
438 }
439 *p = '\0';
440
441 return mic - start;
442 }
443
444 /*
445 * Big5 ---> MIC
446 */
447 static int
big52mic(const unsigned char * big5,unsigned char * p,int len,bool noError)448 big52mic(const unsigned char *big5, unsigned char *p, int len, bool noError)
449 {
450 const unsigned char *start = big5;
451 unsigned short c1;
452 unsigned short big5buf,
453 cnsBuf;
454 unsigned char lc;
455 int l;
456
457 while (len > 0)
458 {
459 c1 = *big5;
460 if (!IS_HIGHBIT_SET(c1))
461 {
462 /* ASCII */
463 if (c1 == 0)
464 {
465 if (noError)
466 break;
467 report_invalid_encoding(PG_BIG5,
468 (const char *) big5, len);
469 }
470 *p++ = c1;
471 big5++;
472 len--;
473 continue;
474 }
475 l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len);
476 if (l < 0)
477 {
478 if (noError)
479 break;
480 report_invalid_encoding(PG_BIG5,
481 (const char *) big5, len);
482 }
483 big5buf = (c1 << 8) | big5[1];
484 cnsBuf = BIG5toCNS(big5buf, &lc);
485 if (lc != 0)
486 {
487 /* Planes 3 and 4 are MULE private charsets */
488 if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4)
489 *p++ = LCPRV2_B;
490 *p++ = lc; /* Plane No. */
491 *p++ = (cnsBuf >> 8) & 0x00ff;
492 *p++ = cnsBuf & 0x00ff;
493 }
494 else
495 {
496 if (noError)
497 break;
498 report_untranslatable_char(PG_BIG5, PG_MULE_INTERNAL,
499 (const char *) big5, len);
500 }
501 big5 += l;
502 len -= l;
503 }
504 *p = '\0';
505
506 return big5 - start;
507 }
508
509 /*
510 * MIC ---> Big5
511 */
512 static int
mic2big5(const unsigned char * mic,unsigned char * p,int len,bool noError)513 mic2big5(const unsigned char *mic, unsigned char *p, int len, bool noError)
514 {
515 const unsigned char *start = mic;
516 unsigned short c1;
517 unsigned short big5buf,
518 cnsBuf;
519 int l;
520
521 while (len > 0)
522 {
523 c1 = *mic;
524 if (!IS_HIGHBIT_SET(c1))
525 {
526 /* ASCII */
527 if (c1 == 0)
528 {
529 if (noError)
530 break;
531 report_invalid_encoding(PG_MULE_INTERNAL,
532 (const char *) mic, len);
533 }
534 *p++ = c1;
535 mic++;
536 len--;
537 continue;
538 }
539 l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
540 if (l < 0)
541 {
542 if (noError)
543 break;
544 report_invalid_encoding(PG_MULE_INTERNAL,
545 (const char *) mic, len);
546 }
547 if (c1 == LC_CNS11643_1 || c1 == LC_CNS11643_2 || c1 == LCPRV2_B)
548 {
549 if (c1 == LCPRV2_B)
550 {
551 c1 = mic[1]; /* get plane no. */
552 cnsBuf = (mic[2] << 8) | mic[3];
553 }
554 else
555 {
556 cnsBuf = (mic[1] << 8) | mic[2];
557 }
558 big5buf = CNStoBIG5(cnsBuf, c1);
559 if (big5buf == 0)
560 {
561 if (noError)
562 break;
563 report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5,
564 (const char *) mic, len);
565 }
566 *p++ = (big5buf >> 8) & 0x00ff;
567 *p++ = big5buf & 0x00ff;
568 }
569 else
570 {
571 if (noError)
572 break;
573 report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5,
574 (const char *) mic, len);
575 }
576 mic += l;
577 len -= l;
578 }
579 *p = '\0';
580
581 return mic - start;
582 }
583