1 /*-------------------------------------------------------------------------
2 *
3 * Utility functions for conversion procs.
4 *
5 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
7 *
8 * IDENTIFICATION
9 * src/backend/utils/mb/conv.c
10 *
11 *-------------------------------------------------------------------------
12 */
13 #include "postgres.h"
14 #include "mb/pg_wchar.h"
15
16
17 /*
18 * local2local: a generic single byte charset encoding
19 * conversion between two ASCII-superset encodings.
20 *
21 * l points to the source string of length len
22 * p is the output area (must be large enough!)
23 * src_encoding is the PG identifier for the source encoding
24 * dest_encoding is the PG identifier for the target encoding
25 * tab holds conversion entries for the source charset
26 * starting from 128 (0x80). each entry in the table holds the corresponding
27 * code point for the target charset, or 0 if there is no equivalent code.
28 */
29 void
local2local(const unsigned char * l,unsigned char * p,int len,int src_encoding,int dest_encoding,const unsigned char * tab)30 local2local(const unsigned char *l,
31 unsigned char *p,
32 int len,
33 int src_encoding,
34 int dest_encoding,
35 const unsigned char *tab)
36 {
37 unsigned char c1,
38 c2;
39
40 while (len > 0)
41 {
42 c1 = *l;
43 if (c1 == 0)
44 report_invalid_encoding(src_encoding, (const char *) l, len);
45 if (!IS_HIGHBIT_SET(c1))
46 *p++ = c1;
47 else
48 {
49 c2 = tab[c1 - HIGHBIT];
50 if (c2)
51 *p++ = c2;
52 else
53 report_untranslatable_char(src_encoding, dest_encoding,
54 (const char *) l, len);
55 }
56 l++;
57 len--;
58 }
59 *p = '\0';
60 }
61
/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!); it is NUL-terminated on exit
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 *
 * ASCII bytes are copied as-is; every high-bit byte is emitted as the
 * two-byte sequence (lc, byte).  A zero byte is reported as invalid encoding.
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* non-ASCII bytes get the charset id prepended */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}
	*p = '\0';
}
89
90 /*
91 * MIC ---> LATINn when the charset's local codes map directly to MIC
92 *
93 * mic points to the source string of length len
94 * p is the output area (must be large enough!)
95 * lc is the mule character set id for the local encoding
96 * encoding is the PG identifier for the local encoding
97 */
98 void
mic2latin(const unsigned char * mic,unsigned char * p,int len,int lc,int encoding)99 mic2latin(const unsigned char *mic, unsigned char *p, int len,
100 int lc, int encoding)
101 {
102 int c1;
103
104 while (len > 0)
105 {
106 c1 = *mic;
107 if (c1 == 0)
108 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
109 if (!IS_HIGHBIT_SET(c1))
110 {
111 /* easy for ASCII */
112 *p++ = c1;
113 mic++;
114 len--;
115 }
116 else
117 {
118 int l = pg_mule_mblen(mic);
119
120 if (len < l)
121 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
122 len);
123 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
124 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
125 (const char *) mic, len);
126 *p++ = mic[1];
127 mic += 2;
128 len -= 2;
129 }
130 }
131 *p = '\0';
132 }
133
134
135 /*
136 * latin2mic_with_table: a generic single byte charset encoding
137 * conversion from a local charset to the mule internal code.
138 *
139 * l points to the source string of length len
140 * p is the output area (must be large enough!)
141 * lc is the mule character set id for the local encoding
142 * encoding is the PG identifier for the local encoding
143 * tab holds conversion entries for the local charset
144 * starting from 128 (0x80). each entry in the table holds the corresponding
145 * code point for the mule encoding, or 0 if there is no equivalent code.
146 */
147 void
latin2mic_with_table(const unsigned char * l,unsigned char * p,int len,int lc,int encoding,const unsigned char * tab)148 latin2mic_with_table(const unsigned char *l,
149 unsigned char *p,
150 int len,
151 int lc,
152 int encoding,
153 const unsigned char *tab)
154 {
155 unsigned char c1,
156 c2;
157
158 while (len > 0)
159 {
160 c1 = *l;
161 if (c1 == 0)
162 report_invalid_encoding(encoding, (const char *) l, len);
163 if (!IS_HIGHBIT_SET(c1))
164 *p++ = c1;
165 else
166 {
167 c2 = tab[c1 - HIGHBIT];
168 if (c2)
169 {
170 *p++ = lc;
171 *p++ = c2;
172 }
173 else
174 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
175 (const char *) l, len);
176 }
177 l++;
178 len--;
179 }
180 *p = '\0';
181 }
182
183 /*
184 * mic2latin_with_table: a generic single byte charset encoding
185 * conversion from the mule internal code to a local charset.
186 *
187 * mic points to the source string of length len
188 * p is the output area (must be large enough!)
189 * lc is the mule character set id for the local encoding
190 * encoding is the PG identifier for the local encoding
191 * tab holds conversion entries for the mule internal code's second byte,
192 * starting from 128 (0x80). each entry in the table holds the corresponding
193 * code point for the local charset, or 0 if there is no equivalent code.
194 */
195 void
mic2latin_with_table(const unsigned char * mic,unsigned char * p,int len,int lc,int encoding,const unsigned char * tab)196 mic2latin_with_table(const unsigned char *mic,
197 unsigned char *p,
198 int len,
199 int lc,
200 int encoding,
201 const unsigned char *tab)
202 {
203 unsigned char c1,
204 c2;
205
206 while (len > 0)
207 {
208 c1 = *mic;
209 if (c1 == 0)
210 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
211 if (!IS_HIGHBIT_SET(c1))
212 {
213 /* easy for ASCII */
214 *p++ = c1;
215 mic++;
216 len--;
217 }
218 else
219 {
220 int l = pg_mule_mblen(mic);
221
222 if (len < l)
223 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
224 len);
225 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
226 (c2 = tab[mic[1] - HIGHBIT]) == 0)
227 {
228 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
229 (const char *) mic, len);
230 break; /* keep compiler quiet */
231 }
232 *p++ = c2;
233 mic += 2;
234 len -= 2;
235 }
236 }
237 *p = '\0';
238 }
239
240 /*
241 * comparison routine for bsearch()
242 * this routine is intended for combined UTF8 -> local code
243 */
244 static int
compare3(const void * p1,const void * p2)245 compare3(const void *p1, const void *p2)
246 {
247 uint32 s1,
248 s2,
249 d1,
250 d2;
251
252 s1 = *(const uint32 *) p1;
253 s2 = *((const uint32 *) p1 + 1);
254 d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
255 d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
256 return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
257 }
258
259 /*
260 * comparison routine for bsearch()
261 * this routine is intended for local code -> combined UTF8
262 */
263 static int
compare4(const void * p1,const void * p2)264 compare4(const void *p1, const void *p2)
265 {
266 uint32 v1,
267 v2;
268
269 v1 = *(const uint32 *) p1;
270 v2 = ((const pg_local_to_utf_combined *) p2)->code;
271 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
272 }
273
274 /*
275 * store 32bit character representation into multibyte stream
276 */
277 static inline unsigned char *
store_coded_char(unsigned char * dest,uint32 code)278 store_coded_char(unsigned char *dest, uint32 code)
279 {
280 if (code & 0xff000000)
281 *dest++ = code >> 24;
282 if (code & 0x00ff0000)
283 *dest++ = code >> 16;
284 if (code & 0x0000ff00)
285 *dest++ = code >> 8;
286 if (code & 0x000000ff)
287 *dest++ = code;
288 return dest;
289 }
290
291 /*
292 * Convert a character using a conversion radix tree.
293 *
294 * 'l' is the length of the input character in bytes, and b1-b4 are
295 * the input character's bytes.
296 */
297 static inline uint32
pg_mb_radix_conv(const pg_mb_radix_tree * rt,int l,unsigned char b1,unsigned char b2,unsigned char b3,unsigned char b4)298 pg_mb_radix_conv(const pg_mb_radix_tree *rt,
299 int l,
300 unsigned char b1,
301 unsigned char b2,
302 unsigned char b3,
303 unsigned char b4)
304 {
305 if (l == 4)
306 {
307 /* 4-byte code */
308
309 /* check code validity */
310 if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
311 b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
312 b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
313 b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
314 return 0;
315
316 /* perform lookup */
317 if (rt->chars32)
318 {
319 uint32 idx = rt->b4root;
320
321 idx = rt->chars32[b1 + idx - rt->b4_1_lower];
322 idx = rt->chars32[b2 + idx - rt->b4_2_lower];
323 idx = rt->chars32[b3 + idx - rt->b4_3_lower];
324 return rt->chars32[b4 + idx - rt->b4_4_lower];
325 }
326 else
327 {
328 uint16 idx = rt->b4root;
329
330 idx = rt->chars16[b1 + idx - rt->b4_1_lower];
331 idx = rt->chars16[b2 + idx - rt->b4_2_lower];
332 idx = rt->chars16[b3 + idx - rt->b4_3_lower];
333 return rt->chars16[b4 + idx - rt->b4_4_lower];
334 }
335 }
336 else if (l == 3)
337 {
338 /* 3-byte code */
339
340 /* check code validity */
341 if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
342 b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
343 b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
344 return 0;
345
346 /* perform lookup */
347 if (rt->chars32)
348 {
349 uint32 idx = rt->b3root;
350
351 idx = rt->chars32[b2 + idx - rt->b3_1_lower];
352 idx = rt->chars32[b3 + idx - rt->b3_2_lower];
353 return rt->chars32[b4 + idx - rt->b3_3_lower];
354 }
355 else
356 {
357 uint16 idx = rt->b3root;
358
359 idx = rt->chars16[b2 + idx - rt->b3_1_lower];
360 idx = rt->chars16[b3 + idx - rt->b3_2_lower];
361 return rt->chars16[b4 + idx - rt->b3_3_lower];
362 }
363 }
364 else if (l == 2)
365 {
366 /* 2-byte code */
367
368 /* check code validity - first byte */
369 if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
370 b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
371 return 0;
372
373 /* perform lookup */
374 if (rt->chars32)
375 {
376 uint32 idx = rt->b2root;
377
378 idx = rt->chars32[b3 + idx - rt->b2_1_lower];
379 return rt->chars32[b4 + idx - rt->b2_2_lower];
380 }
381 else
382 {
383 uint16 idx = rt->b2root;
384
385 idx = rt->chars16[b3 + idx - rt->b2_1_lower];
386 return rt->chars16[b4 + idx - rt->b2_2_lower];
387 }
388 }
389 else if (l == 1)
390 {
391 /* 1-byte code */
392
393 /* check code validity - first byte */
394 if (b4 < rt->b1_lower || b4 > rt->b1_upper)
395 return 0;
396
397 /* perform lookup */
398 if (rt->chars32)
399 return rt->chars32[b4 + rt->b1root - rt->b1_lower];
400 else
401 return rt->chars16[b4 + rt->b1root - rt->b1_lower];
402 }
403 return 0; /* shouldn't happen */
404 }
405
406 /*
407 * UTF8 ---> local code
408 *
409 * utf: input string in UTF8 encoding (need not be null-terminated)
410 * len: length of input string (in bytes)
411 * iso: pointer to the output area (must be large enough!)
412 (output string will be null-terminated)
413 * map: conversion map for single characters
414 * cmap: conversion map for combined characters
415 * (optional, pass NULL if none)
416 * cmapsize: number of entries in the conversion map for combined characters
417 * (optional, pass 0 if none)
418 * conv_func: algorithmic encoding conversion function
419 * (optional, pass NULL if none)
420 * encoding: PG identifier for the local encoding
421 *
422 * For each character, the cmap (if provided) is consulted first; if no match,
423 * the map is consulted next; if still no match, the conv_func (if provided)
424 * is applied. An error is raised if no match is found.
425 *
426 * See pg_wchar.h for more details about the data structures used here.
427 */
428 void
UtfToLocal(const unsigned char * utf,int len,unsigned char * iso,const pg_mb_radix_tree * map,const pg_utf_to_local_combined * cmap,int cmapsize,utf_local_conversion_func conv_func,int encoding)429 UtfToLocal(const unsigned char *utf, int len,
430 unsigned char *iso,
431 const pg_mb_radix_tree *map,
432 const pg_utf_to_local_combined *cmap, int cmapsize,
433 utf_local_conversion_func conv_func,
434 int encoding)
435 {
436 uint32 iutf;
437 int l;
438 const pg_utf_to_local_combined *cp;
439
440 if (!PG_VALID_ENCODING(encoding))
441 ereport(ERROR,
442 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
443 errmsg("invalid encoding number: %d", encoding)));
444
445 for (; len > 0; len -= l)
446 {
447 unsigned char b1 = 0;
448 unsigned char b2 = 0;
449 unsigned char b3 = 0;
450 unsigned char b4 = 0;
451
452 /* "break" cases all represent errors */
453 if (*utf == '\0')
454 break;
455
456 l = pg_utf_mblen(utf);
457 if (len < l)
458 break;
459
460 if (!pg_utf8_islegal(utf, l))
461 break;
462
463 if (l == 1)
464 {
465 /* ASCII case is easy, assume it's one-to-one conversion */
466 *iso++ = *utf++;
467 continue;
468 }
469
470 /* collect coded char of length l */
471 if (l == 2)
472 {
473 b3 = *utf++;
474 b4 = *utf++;
475 }
476 else if (l == 3)
477 {
478 b2 = *utf++;
479 b3 = *utf++;
480 b4 = *utf++;
481 }
482 else if (l == 4)
483 {
484 b1 = *utf++;
485 b2 = *utf++;
486 b3 = *utf++;
487 b4 = *utf++;
488 }
489 else
490 {
491 elog(ERROR, "unsupported character length %d", l);
492 iutf = 0; /* keep compiler quiet */
493 }
494 iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
495
496 /* First, try with combined map if possible */
497 if (cmap && len > l)
498 {
499 const unsigned char *utf_save = utf;
500 int len_save = len;
501 int l_save = l;
502
503 /* collect next character, same as above */
504 len -= l;
505
506 l = pg_utf_mblen(utf);
507 if (len < l)
508 break;
509
510 if (!pg_utf8_islegal(utf, l))
511 break;
512
513 /* We assume ASCII character cannot be in combined map */
514 if (l > 1)
515 {
516 uint32 iutf2;
517 uint32 cutf[2];
518
519 if (l == 2)
520 {
521 iutf2 = *utf++ << 8;
522 iutf2 |= *utf++;
523 }
524 else if (l == 3)
525 {
526 iutf2 = *utf++ << 16;
527 iutf2 |= *utf++ << 8;
528 iutf2 |= *utf++;
529 }
530 else if (l == 4)
531 {
532 iutf2 = *utf++ << 24;
533 iutf2 |= *utf++ << 16;
534 iutf2 |= *utf++ << 8;
535 iutf2 |= *utf++;
536 }
537 else
538 {
539 elog(ERROR, "unsupported character length %d", l);
540 iutf2 = 0; /* keep compiler quiet */
541 }
542
543 cutf[0] = iutf;
544 cutf[1] = iutf2;
545
546 cp = bsearch(cutf, cmap, cmapsize,
547 sizeof(pg_utf_to_local_combined), compare3);
548
549 if (cp)
550 {
551 iso = store_coded_char(iso, cp->code);
552 continue;
553 }
554 }
555
556 /* fail, so back up to reprocess second character next time */
557 utf = utf_save;
558 len = len_save;
559 l = l_save;
560 }
561
562 /* Now check ordinary map */
563 if (map)
564 {
565 uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
566
567 if (converted)
568 {
569 iso = store_coded_char(iso, converted);
570 continue;
571 }
572 }
573
574 /* if there's a conversion function, try that */
575 if (conv_func)
576 {
577 uint32 converted = (*conv_func) (iutf);
578
579 if (converted)
580 {
581 iso = store_coded_char(iso, converted);
582 continue;
583 }
584 }
585
586 /* failed to translate this character */
587 report_untranslatable_char(PG_UTF8, encoding,
588 (const char *) (utf - l), len);
589 }
590
591 /* if we broke out of loop early, must be invalid input */
592 if (len > 0)
593 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
594
595 *iso = '\0';
596 }
597
598 /*
599 * local code ---> UTF8
600 *
601 * iso: input string in local encoding (need not be null-terminated)
602 * len: length of input string (in bytes)
603 * utf: pointer to the output area (must be large enough!)
604 (output string will be null-terminated)
605 * map: conversion map for single characters
606 * cmap: conversion map for combined characters
607 * (optional, pass NULL if none)
608 * cmapsize: number of entries in the conversion map for combined characters
609 * (optional, pass 0 if none)
610 * conv_func: algorithmic encoding conversion function
611 * (optional, pass NULL if none)
612 * encoding: PG identifier for the local encoding
613 *
614 * For each character, the map is consulted first; if no match, the cmap
615 * (if provided) is consulted next; if still no match, the conv_func
616 * (if provided) is applied. An error is raised if no match is found.
617 *
618 * See pg_wchar.h for more details about the data structures used here.
619 */
620 void
LocalToUtf(const unsigned char * iso,int len,unsigned char * utf,const pg_mb_radix_tree * map,const pg_local_to_utf_combined * cmap,int cmapsize,utf_local_conversion_func conv_func,int encoding)621 LocalToUtf(const unsigned char *iso, int len,
622 unsigned char *utf,
623 const pg_mb_radix_tree *map,
624 const pg_local_to_utf_combined *cmap, int cmapsize,
625 utf_local_conversion_func conv_func,
626 int encoding)
627 {
628 uint32 iiso;
629 int l;
630 const pg_local_to_utf_combined *cp;
631
632 if (!PG_VALID_ENCODING(encoding))
633 ereport(ERROR,
634 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
635 errmsg("invalid encoding number: %d", encoding)));
636
637 for (; len > 0; len -= l)
638 {
639 unsigned char b1 = 0;
640 unsigned char b2 = 0;
641 unsigned char b3 = 0;
642 unsigned char b4 = 0;
643
644 /* "break" cases all represent errors */
645 if (*iso == '\0')
646 break;
647
648 if (!IS_HIGHBIT_SET(*iso))
649 {
650 /* ASCII case is easy, assume it's one-to-one conversion */
651 *utf++ = *iso++;
652 l = 1;
653 continue;
654 }
655
656 l = pg_encoding_verifymb(encoding, (const char *) iso, len);
657 if (l < 0)
658 break;
659
660 /* collect coded char of length l */
661 if (l == 1)
662 b4 = *iso++;
663 else if (l == 2)
664 {
665 b3 = *iso++;
666 b4 = *iso++;
667 }
668 else if (l == 3)
669 {
670 b2 = *iso++;
671 b3 = *iso++;
672 b4 = *iso++;
673 }
674 else if (l == 4)
675 {
676 b1 = *iso++;
677 b2 = *iso++;
678 b3 = *iso++;
679 b4 = *iso++;
680 }
681 else
682 {
683 elog(ERROR, "unsupported character length %d", l);
684 iiso = 0; /* keep compiler quiet */
685 }
686 iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
687
688 if (map)
689 {
690 uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
691
692 if (converted)
693 {
694 utf = store_coded_char(utf, converted);
695 continue;
696 }
697
698 /* If there's a combined character map, try that */
699 if (cmap)
700 {
701 cp = bsearch(&iiso, cmap, cmapsize,
702 sizeof(pg_local_to_utf_combined), compare4);
703
704 if (cp)
705 {
706 utf = store_coded_char(utf, cp->utf1);
707 utf = store_coded_char(utf, cp->utf2);
708 continue;
709 }
710 }
711 }
712
713 /* if there's a conversion function, try that */
714 if (conv_func)
715 {
716 uint32 converted = (*conv_func) (iiso);
717
718 if (converted)
719 {
720 utf = store_coded_char(utf, converted);
721 continue;
722 }
723 }
724
725 /* failed to translate this character */
726 report_untranslatable_char(encoding, PG_UTF8,
727 (const char *) (iso - l), len);
728 }
729
730 /* if we broke out of loop early, must be invalid input */
731 if (len > 0)
732 report_invalid_encoding(encoding, (const char *) iso, len);
733
734 *utf = '\0';
735 }
736