1 /*-------------------------------------------------------------------------
2  *
3  *	  Utility functions for conversion procs.
4  *
5  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
6  * Portions Copyright (c) 1994, Regents of the University of California
7  *
8  * IDENTIFICATION
9  *	  src/backend/utils/mb/conv.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 #include "mb/pg_wchar.h"
15 
16 
17 /*
18  * local2local: a generic single byte charset encoding
19  * conversion between two ASCII-superset encodings.
20  *
21  * l points to the source string of length len
22  * p is the output area (must be large enough!)
23  * src_encoding is the PG identifier for the source encoding
24  * dest_encoding is the PG identifier for the target encoding
25  * tab holds conversion entries for the source charset
26  * starting from 128 (0x80). each entry in the table holds the corresponding
27  * code point for the target charset, or 0 if there is no equivalent code.
28  */
29 void
local2local(const unsigned char * l,unsigned char * p,int len,int src_encoding,int dest_encoding,const unsigned char * tab)30 local2local(const unsigned char *l,
31 			unsigned char *p,
32 			int len,
33 			int src_encoding,
34 			int dest_encoding,
35 			const unsigned char *tab)
36 {
37 	unsigned char c1,
38 				c2;
39 
40 	while (len > 0)
41 	{
42 		c1 = *l;
43 		if (c1 == 0)
44 			report_invalid_encoding(src_encoding, (const char *) l, len);
45 		if (!IS_HIGHBIT_SET(c1))
46 			*p++ = c1;
47 		else
48 		{
49 			c2 = tab[c1 - HIGHBIT];
50 			if (c2)
51 				*p++ = c2;
52 			else
53 				report_untranslatable_char(src_encoding, dest_encoding,
54 										   (const char *) l, len);
55 		}
56 		l++;
57 		len--;
58 	}
59 	*p = '\0';
60 }
61 
/*
 * LATINn ---> MIC, for charsets whose local codes are used as-is in MIC
 *
 * l is the source string, len bytes long
 * p is the output buffer (the caller must make it large enough; a non-ASCII
 * input byte produces two output bytes)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 *
 * Each high-bit byte is emitted preceded by lc; ASCII bytes are copied
 * unchanged.  An embedded zero byte is reported as invalid input.
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* non-ASCII bytes get the charset id in front of them */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}
	*p = '\0';
}
89 
90 /*
91  * MIC ---> LATINn when the charset's local codes map directly to MIC
92  *
93  * mic points to the source string of length len
94  * p is the output area (must be large enough!)
95  * lc is the mule character set id for the local encoding
96  * encoding is the PG identifier for the local encoding
97  */
98 void
mic2latin(const unsigned char * mic,unsigned char * p,int len,int lc,int encoding)99 mic2latin(const unsigned char *mic, unsigned char *p, int len,
100 		  int lc, int encoding)
101 {
102 	int			c1;
103 
104 	while (len > 0)
105 	{
106 		c1 = *mic;
107 		if (c1 == 0)
108 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
109 		if (!IS_HIGHBIT_SET(c1))
110 		{
111 			/* easy for ASCII */
112 			*p++ = c1;
113 			mic++;
114 			len--;
115 		}
116 		else
117 		{
118 			int			l = pg_mule_mblen(mic);
119 
120 			if (len < l)
121 				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
122 										len);
123 			if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
124 				report_untranslatable_char(PG_MULE_INTERNAL, encoding,
125 										   (const char *) mic, len);
126 			*p++ = mic[1];
127 			mic += 2;
128 			len -= 2;
129 		}
130 	}
131 	*p = '\0';
132 }
133 
134 
135 /*
136  * latin2mic_with_table: a generic single byte charset encoding
137  * conversion from a local charset to the mule internal code.
138  *
139  * l points to the source string of length len
140  * p is the output area (must be large enough!)
141  * lc is the mule character set id for the local encoding
142  * encoding is the PG identifier for the local encoding
143  * tab holds conversion entries for the local charset
144  * starting from 128 (0x80). each entry in the table holds the corresponding
145  * code point for the mule encoding, or 0 if there is no equivalent code.
146  */
147 void
latin2mic_with_table(const unsigned char * l,unsigned char * p,int len,int lc,int encoding,const unsigned char * tab)148 latin2mic_with_table(const unsigned char *l,
149 					 unsigned char *p,
150 					 int len,
151 					 int lc,
152 					 int encoding,
153 					 const unsigned char *tab)
154 {
155 	unsigned char c1,
156 				c2;
157 
158 	while (len > 0)
159 	{
160 		c1 = *l;
161 		if (c1 == 0)
162 			report_invalid_encoding(encoding, (const char *) l, len);
163 		if (!IS_HIGHBIT_SET(c1))
164 			*p++ = c1;
165 		else
166 		{
167 			c2 = tab[c1 - HIGHBIT];
168 			if (c2)
169 			{
170 				*p++ = lc;
171 				*p++ = c2;
172 			}
173 			else
174 				report_untranslatable_char(encoding, PG_MULE_INTERNAL,
175 										   (const char *) l, len);
176 		}
177 		l++;
178 		len--;
179 	}
180 	*p = '\0';
181 }
182 
183 /*
184  * mic2latin_with_table: a generic single byte charset encoding
185  * conversion from the mule internal code to a local charset.
186  *
187  * mic points to the source string of length len
188  * p is the output area (must be large enough!)
189  * lc is the mule character set id for the local encoding
190  * encoding is the PG identifier for the local encoding
191  * tab holds conversion entries for the mule internal code's second byte,
192  * starting from 128 (0x80). each entry in the table holds the corresponding
193  * code point for the local charset, or 0 if there is no equivalent code.
194  */
195 void
mic2latin_with_table(const unsigned char * mic,unsigned char * p,int len,int lc,int encoding,const unsigned char * tab)196 mic2latin_with_table(const unsigned char *mic,
197 					 unsigned char *p,
198 					 int len,
199 					 int lc,
200 					 int encoding,
201 					 const unsigned char *tab)
202 {
203 	unsigned char c1,
204 				c2;
205 
206 	while (len > 0)
207 	{
208 		c1 = *mic;
209 		if (c1 == 0)
210 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
211 		if (!IS_HIGHBIT_SET(c1))
212 		{
213 			/* easy for ASCII */
214 			*p++ = c1;
215 			mic++;
216 			len--;
217 		}
218 		else
219 		{
220 			int			l = pg_mule_mblen(mic);
221 
222 			if (len < l)
223 				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
224 										len);
225 			if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
226 				(c2 = tab[mic[1] - HIGHBIT]) == 0)
227 			{
228 				report_untranslatable_char(PG_MULE_INTERNAL, encoding,
229 										   (const char *) mic, len);
230 				break;			/* keep compiler quiet */
231 			}
232 			*p++ = c2;
233 			mic += 2;
234 			len -= 2;
235 		}
236 	}
237 	*p = '\0';
238 }
239 
240 /*
241  * comparison routine for bsearch()
242  * this routine is intended for combined UTF8 -> local code
243  */
244 static int
compare3(const void * p1,const void * p2)245 compare3(const void *p1, const void *p2)
246 {
247 	uint32		s1,
248 				s2,
249 				d1,
250 				d2;
251 
252 	s1 = *(const uint32 *) p1;
253 	s2 = *((const uint32 *) p1 + 1);
254 	d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
255 	d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
256 	return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
257 }
258 
259 /*
260  * comparison routine for bsearch()
261  * this routine is intended for local code -> combined UTF8
262  */
263 static int
compare4(const void * p1,const void * p2)264 compare4(const void *p1, const void *p2)
265 {
266 	uint32		v1,
267 				v2;
268 
269 	v1 = *(const uint32 *) p1;
270 	v2 = ((const pg_local_to_utf_combined *) p2)->code;
271 	return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
272 }
273 
274 /*
275  * store 32bit character representation into multibyte stream
276  */
277 static inline unsigned char *
store_coded_char(unsigned char * dest,uint32 code)278 store_coded_char(unsigned char *dest, uint32 code)
279 {
280 	if (code & 0xff000000)
281 		*dest++ = code >> 24;
282 	if (code & 0x00ff0000)
283 		*dest++ = code >> 16;
284 	if (code & 0x0000ff00)
285 		*dest++ = code >> 8;
286 	if (code & 0x000000ff)
287 		*dest++ = code;
288 	return dest;
289 }
290 
291 /*
292  * Convert a character using a conversion radix tree.
293  *
294  * 'l' is the length of the input character in bytes, and b1-b4 are
295  * the input character's bytes.
296  */
297 static inline uint32
pg_mb_radix_conv(const pg_mb_radix_tree * rt,int l,unsigned char b1,unsigned char b2,unsigned char b3,unsigned char b4)298 pg_mb_radix_conv(const pg_mb_radix_tree *rt,
299 				 int l,
300 				 unsigned char b1,
301 				 unsigned char b2,
302 				 unsigned char b3,
303 				 unsigned char b4)
304 {
305 	if (l == 4)
306 	{
307 		/* 4-byte code */
308 
309 		/* check code validity */
310 		if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
311 			b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
312 			b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
313 			b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
314 			return 0;
315 
316 		/* perform lookup */
317 		if (rt->chars32)
318 		{
319 			uint32		idx = rt->b4root;
320 
321 			idx = rt->chars32[b1 + idx - rt->b4_1_lower];
322 			idx = rt->chars32[b2 + idx - rt->b4_2_lower];
323 			idx = rt->chars32[b3 + idx - rt->b4_3_lower];
324 			return rt->chars32[b4 + idx - rt->b4_4_lower];
325 		}
326 		else
327 		{
328 			uint16		idx = rt->b4root;
329 
330 			idx = rt->chars16[b1 + idx - rt->b4_1_lower];
331 			idx = rt->chars16[b2 + idx - rt->b4_2_lower];
332 			idx = rt->chars16[b3 + idx - rt->b4_3_lower];
333 			return rt->chars16[b4 + idx - rt->b4_4_lower];
334 		}
335 	}
336 	else if (l == 3)
337 	{
338 		/* 3-byte code */
339 
340 		/* check code validity */
341 		if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
342 			b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
343 			b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
344 			return 0;
345 
346 		/* perform lookup */
347 		if (rt->chars32)
348 		{
349 			uint32		idx = rt->b3root;
350 
351 			idx = rt->chars32[b2 + idx - rt->b3_1_lower];
352 			idx = rt->chars32[b3 + idx - rt->b3_2_lower];
353 			return rt->chars32[b4 + idx - rt->b3_3_lower];
354 		}
355 		else
356 		{
357 			uint16		idx = rt->b3root;
358 
359 			idx = rt->chars16[b2 + idx - rt->b3_1_lower];
360 			idx = rt->chars16[b3 + idx - rt->b3_2_lower];
361 			return rt->chars16[b4 + idx - rt->b3_3_lower];
362 		}
363 	}
364 	else if (l == 2)
365 	{
366 		/* 2-byte code */
367 
368 		/* check code validity - first byte */
369 		if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
370 			b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
371 			return 0;
372 
373 		/* perform lookup */
374 		if (rt->chars32)
375 		{
376 			uint32		idx = rt->b2root;
377 
378 			idx = rt->chars32[b3 + idx - rt->b2_1_lower];
379 			return rt->chars32[b4 + idx - rt->b2_2_lower];
380 		}
381 		else
382 		{
383 			uint16		idx = rt->b2root;
384 
385 			idx = rt->chars16[b3 + idx - rt->b2_1_lower];
386 			return rt->chars16[b4 + idx - rt->b2_2_lower];
387 		}
388 	}
389 	else if (l == 1)
390 	{
391 		/* 1-byte code */
392 
393 		/* check code validity - first byte */
394 		if (b4 < rt->b1_lower || b4 > rt->b1_upper)
395 			return 0;
396 
397 		/* perform lookup */
398 		if (rt->chars32)
399 			return rt->chars32[b4 + rt->b1root - rt->b1_lower];
400 		else
401 			return rt->chars16[b4 + rt->b1root - rt->b1_lower];
402 	}
403 	return 0;					/* shouldn't happen */
404 }
405 
406 /*
407  * UTF8 ---> local code
408  *
409  * utf: input string in UTF8 encoding (need not be null-terminated)
410  * len: length of input string (in bytes)
411  * iso: pointer to the output area (must be large enough!)
412 		  (output string will be null-terminated)
413  * map: conversion map for single characters
414  * cmap: conversion map for combined characters
415  *		  (optional, pass NULL if none)
416  * cmapsize: number of entries in the conversion map for combined characters
417  *		  (optional, pass 0 if none)
418  * conv_func: algorithmic encoding conversion function
419  *		  (optional, pass NULL if none)
420  * encoding: PG identifier for the local encoding
421  *
422  * For each character, the cmap (if provided) is consulted first; if no match,
423  * the map is consulted next; if still no match, the conv_func (if provided)
424  * is applied.  An error is raised if no match is found.
425  *
426  * See pg_wchar.h for more details about the data structures used here.
427  */
428 void
UtfToLocal(const unsigned char * utf,int len,unsigned char * iso,const pg_mb_radix_tree * map,const pg_utf_to_local_combined * cmap,int cmapsize,utf_local_conversion_func conv_func,int encoding)429 UtfToLocal(const unsigned char *utf, int len,
430 		   unsigned char *iso,
431 		   const pg_mb_radix_tree *map,
432 		   const pg_utf_to_local_combined *cmap, int cmapsize,
433 		   utf_local_conversion_func conv_func,
434 		   int encoding)
435 {
436 	uint32		iutf;
437 	int			l;
438 	const pg_utf_to_local_combined *cp;
439 
440 	if (!PG_VALID_ENCODING(encoding))
441 		ereport(ERROR,
442 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
443 				 errmsg("invalid encoding number: %d", encoding)));
444 
445 	for (; len > 0; len -= l)
446 	{
447 		unsigned char b1 = 0;
448 		unsigned char b2 = 0;
449 		unsigned char b3 = 0;
450 		unsigned char b4 = 0;
451 
452 		/* "break" cases all represent errors */
453 		if (*utf == '\0')
454 			break;
455 
456 		l = pg_utf_mblen(utf);
457 		if (len < l)
458 			break;
459 
460 		if (!pg_utf8_islegal(utf, l))
461 			break;
462 
463 		if (l == 1)
464 		{
465 			/* ASCII case is easy, assume it's one-to-one conversion */
466 			*iso++ = *utf++;
467 			continue;
468 		}
469 
470 		/* collect coded char of length l */
471 		if (l == 2)
472 		{
473 			b3 = *utf++;
474 			b4 = *utf++;
475 		}
476 		else if (l == 3)
477 		{
478 			b2 = *utf++;
479 			b3 = *utf++;
480 			b4 = *utf++;
481 		}
482 		else if (l == 4)
483 		{
484 			b1 = *utf++;
485 			b2 = *utf++;
486 			b3 = *utf++;
487 			b4 = *utf++;
488 		}
489 		else
490 		{
491 			elog(ERROR, "unsupported character length %d", l);
492 			iutf = 0;			/* keep compiler quiet */
493 		}
494 		iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
495 
496 		/* First, try with combined map if possible */
497 		if (cmap && len > l)
498 		{
499 			const unsigned char *utf_save = utf;
500 			int			len_save = len;
501 			int			l_save = l;
502 
503 			/* collect next character, same as above */
504 			len -= l;
505 
506 			l = pg_utf_mblen(utf);
507 			if (len < l)
508 				break;
509 
510 			if (!pg_utf8_islegal(utf, l))
511 				break;
512 
513 			/* We assume ASCII character cannot be in combined map */
514 			if (l > 1)
515 			{
516 				uint32		iutf2;
517 				uint32		cutf[2];
518 
519 				if (l == 2)
520 				{
521 					iutf2 = *utf++ << 8;
522 					iutf2 |= *utf++;
523 				}
524 				else if (l == 3)
525 				{
526 					iutf2 = *utf++ << 16;
527 					iutf2 |= *utf++ << 8;
528 					iutf2 |= *utf++;
529 				}
530 				else if (l == 4)
531 				{
532 					iutf2 = *utf++ << 24;
533 					iutf2 |= *utf++ << 16;
534 					iutf2 |= *utf++ << 8;
535 					iutf2 |= *utf++;
536 				}
537 				else
538 				{
539 					elog(ERROR, "unsupported character length %d", l);
540 					iutf2 = 0;	/* keep compiler quiet */
541 				}
542 
543 				cutf[0] = iutf;
544 				cutf[1] = iutf2;
545 
546 				cp = bsearch(cutf, cmap, cmapsize,
547 							 sizeof(pg_utf_to_local_combined), compare3);
548 
549 				if (cp)
550 				{
551 					iso = store_coded_char(iso, cp->code);
552 					continue;
553 				}
554 			}
555 
556 			/* fail, so back up to reprocess second character next time */
557 			utf = utf_save;
558 			len = len_save;
559 			l = l_save;
560 		}
561 
562 		/* Now check ordinary map */
563 		if (map)
564 		{
565 			uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
566 
567 			if (converted)
568 			{
569 				iso = store_coded_char(iso, converted);
570 				continue;
571 			}
572 		}
573 
574 		/* if there's a conversion function, try that */
575 		if (conv_func)
576 		{
577 			uint32		converted = (*conv_func) (iutf);
578 
579 			if (converted)
580 			{
581 				iso = store_coded_char(iso, converted);
582 				continue;
583 			}
584 		}
585 
586 		/* failed to translate this character */
587 		report_untranslatable_char(PG_UTF8, encoding,
588 								   (const char *) (utf - l), len);
589 	}
590 
591 	/* if we broke out of loop early, must be invalid input */
592 	if (len > 0)
593 		report_invalid_encoding(PG_UTF8, (const char *) utf, len);
594 
595 	*iso = '\0';
596 }
597 
598 /*
599  * local code ---> UTF8
600  *
601  * iso: input string in local encoding (need not be null-terminated)
602  * len: length of input string (in bytes)
603  * utf: pointer to the output area (must be large enough!)
604 		  (output string will be null-terminated)
605  * map: conversion map for single characters
606  * cmap: conversion map for combined characters
607  *		  (optional, pass NULL if none)
608  * cmapsize: number of entries in the conversion map for combined characters
609  *		  (optional, pass 0 if none)
610  * conv_func: algorithmic encoding conversion function
611  *		  (optional, pass NULL if none)
612  * encoding: PG identifier for the local encoding
613  *
614  * For each character, the map is consulted first; if no match, the cmap
615  * (if provided) is consulted next; if still no match, the conv_func
616  * (if provided) is applied.  An error is raised if no match is found.
617  *
618  * See pg_wchar.h for more details about the data structures used here.
619  */
620 void
LocalToUtf(const unsigned char * iso,int len,unsigned char * utf,const pg_mb_radix_tree * map,const pg_local_to_utf_combined * cmap,int cmapsize,utf_local_conversion_func conv_func,int encoding)621 LocalToUtf(const unsigned char *iso, int len,
622 		   unsigned char *utf,
623 		   const pg_mb_radix_tree *map,
624 		   const pg_local_to_utf_combined *cmap, int cmapsize,
625 		   utf_local_conversion_func conv_func,
626 		   int encoding)
627 {
628 	uint32		iiso;
629 	int			l;
630 	const pg_local_to_utf_combined *cp;
631 
632 	if (!PG_VALID_ENCODING(encoding))
633 		ereport(ERROR,
634 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
635 				 errmsg("invalid encoding number: %d", encoding)));
636 
637 	for (; len > 0; len -= l)
638 	{
639 		unsigned char b1 = 0;
640 		unsigned char b2 = 0;
641 		unsigned char b3 = 0;
642 		unsigned char b4 = 0;
643 
644 		/* "break" cases all represent errors */
645 		if (*iso == '\0')
646 			break;
647 
648 		if (!IS_HIGHBIT_SET(*iso))
649 		{
650 			/* ASCII case is easy, assume it's one-to-one conversion */
651 			*utf++ = *iso++;
652 			l = 1;
653 			continue;
654 		}
655 
656 		l = pg_encoding_verifymb(encoding, (const char *) iso, len);
657 		if (l < 0)
658 			break;
659 
660 		/* collect coded char of length l */
661 		if (l == 1)
662 			b4 = *iso++;
663 		else if (l == 2)
664 		{
665 			b3 = *iso++;
666 			b4 = *iso++;
667 		}
668 		else if (l == 3)
669 		{
670 			b2 = *iso++;
671 			b3 = *iso++;
672 			b4 = *iso++;
673 		}
674 		else if (l == 4)
675 		{
676 			b1 = *iso++;
677 			b2 = *iso++;
678 			b3 = *iso++;
679 			b4 = *iso++;
680 		}
681 		else
682 		{
683 			elog(ERROR, "unsupported character length %d", l);
684 			iiso = 0;			/* keep compiler quiet */
685 		}
686 		iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
687 
688 		if (map)
689 		{
690 			uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
691 
692 			if (converted)
693 			{
694 				utf = store_coded_char(utf, converted);
695 				continue;
696 			}
697 
698 			/* If there's a combined character map, try that */
699 			if (cmap)
700 			{
701 				cp = bsearch(&iiso, cmap, cmapsize,
702 							 sizeof(pg_local_to_utf_combined), compare4);
703 
704 				if (cp)
705 				{
706 					utf = store_coded_char(utf, cp->utf1);
707 					utf = store_coded_char(utf, cp->utf2);
708 					continue;
709 				}
710 			}
711 		}
712 
713 		/* if there's a conversion function, try that */
714 		if (conv_func)
715 		{
716 			uint32		converted = (*conv_func) (iiso);
717 
718 			if (converted)
719 			{
720 				utf = store_coded_char(utf, converted);
721 				continue;
722 			}
723 		}
724 
725 		/* failed to translate this character */
726 		report_untranslatable_char(encoding, PG_UTF8,
727 								   (const char *) (iso - l), len);
728 	}
729 
730 	/* if we broke out of loop early, must be invalid input */
731 	if (len > 0)
732 		report_invalid_encoding(encoding, (const char *) iso, len);
733 
734 	*utf = '\0';
735 }
736