1 /*-------------------------------------------------------------------------
2  *
3  *	  Utility functions for conversion procs.
4  *
5  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
6  * Portions Copyright (c) 1994, Regents of the University of California
7  *
8  * IDENTIFICATION
9  *	  src/backend/utils/mb/conv.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 #include "mb/pg_wchar.h"
15 
16 
17 /*
18  * local2local: a generic single byte charset encoding
19  * conversion between two ASCII-superset encodings.
20  *
21  * l points to the source string of length len
22  * p is the output area (must be large enough!)
23  * src_encoding is the PG identifier for the source encoding
24  * dest_encoding is the PG identifier for the target encoding
25  * tab holds conversion entries for the source charset
26  * starting from 128 (0x80). each entry in the table holds the corresponding
27  * code point for the target charset, or 0 if there is no equivalent code.
28  */
29 void
local2local(const unsigned char * l,unsigned char * p,int len,int src_encoding,int dest_encoding,const unsigned char * tab)30 local2local(const unsigned char *l,
31 			unsigned char *p,
32 			int len,
33 			int src_encoding,
34 			int dest_encoding,
35 			const unsigned char *tab)
36 {
37 	unsigned char c1,
38 				c2;
39 
40 	while (len > 0)
41 	{
42 		c1 = *l;
43 		if (c1 == 0)
44 			report_invalid_encoding(src_encoding, (const char *) l, len);
45 		if (!IS_HIGHBIT_SET(c1))
46 			*p++ = c1;
47 		else
48 		{
49 			c2 = tab[c1 - HIGHBIT];
50 			if (c2)
51 				*p++ = c2;
52 			else
53 				report_untranslatable_char(src_encoding, dest_encoding,
54 										   (const char *) l, len);
55 		}
56 		l++;
57 		len--;
58 	}
59 	*p = '\0';
60 }
61 
/*
 * LATINn ---> MIC, for charsets whose local codes are used unchanged as the
 * second byte of the MIC representation.
 *
 * l		source bytes; len of them are processed
 * p		output area (caller must make it large enough; each high-bit
 *			source byte produces two output bytes); NUL-terminated on exit
 * len		number of source bytes
 * lc		mule character set id, emitted before every high-bit byte
 * encoding PG identifier of the local encoding (for error reports)
 *
 * A zero byte in the input is handed to report_invalid_encoding().
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* non-ASCII bytes get the charset id as a leading byte */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;

		*p++ = ch;
	}

	*p = '\0';
}
89 
90 /*
91  * MIC ---> LATINn when the charset's local codes map directly to MIC
92  *
93  * mic points to the source string of length len
94  * p is the output area (must be large enough!)
95  * lc is the mule character set id for the local encoding
96  * encoding is the PG identifier for the local encoding
97  */
98 void
mic2latin(const unsigned char * mic,unsigned char * p,int len,int lc,int encoding)99 mic2latin(const unsigned char *mic, unsigned char *p, int len,
100 		  int lc, int encoding)
101 {
102 	int			c1;
103 
104 	while (len > 0)
105 	{
106 		c1 = *mic;
107 		if (c1 == 0)
108 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
109 		if (!IS_HIGHBIT_SET(c1))
110 		{
111 			/* easy for ASCII */
112 			*p++ = c1;
113 			mic++;
114 			len--;
115 		}
116 		else
117 		{
118 			int			l = pg_mic_mblen(mic);
119 
120 			if (len < l)
121 				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
122 										len);
123 			if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
124 				report_untranslatable_char(PG_MULE_INTERNAL, encoding,
125 										   (const char *) mic, len);
126 			*p++ = mic[1];
127 			mic += 2;
128 			len -= 2;
129 		}
130 	}
131 	*p = '\0';
132 }
133 
134 
135 /*
136  * ASCII ---> MIC
137  *
138  * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
139  * characters, here we must take a hard line because we don't know
140  * the appropriate MIC equivalent.
141  */
142 void
pg_ascii2mic(const unsigned char * l,unsigned char * p,int len)143 pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
144 {
145 	int			c1;
146 
147 	while (len > 0)
148 	{
149 		c1 = *l;
150 		if (c1 == 0 || IS_HIGHBIT_SET(c1))
151 			report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
152 		*p++ = c1;
153 		l++;
154 		len--;
155 	}
156 	*p = '\0';
157 }
158 
159 /*
160  * MIC ---> ASCII
161  */
162 void
pg_mic2ascii(const unsigned char * mic,unsigned char * p,int len)163 pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
164 {
165 	int			c1;
166 
167 	while (len > 0)
168 	{
169 		c1 = *mic;
170 		if (c1 == 0 || IS_HIGHBIT_SET(c1))
171 			report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
172 									   (const char *) mic, len);
173 		*p++ = c1;
174 		mic++;
175 		len--;
176 	}
177 	*p = '\0';
178 }
179 
180 /*
181  * latin2mic_with_table: a generic single byte charset encoding
182  * conversion from a local charset to the mule internal code.
183  *
184  * l points to the source string of length len
185  * p is the output area (must be large enough!)
186  * lc is the mule character set id for the local encoding
187  * encoding is the PG identifier for the local encoding
188  * tab holds conversion entries for the local charset
189  * starting from 128 (0x80). each entry in the table holds the corresponding
190  * code point for the mule encoding, or 0 if there is no equivalent code.
191  */
192 void
latin2mic_with_table(const unsigned char * l,unsigned char * p,int len,int lc,int encoding,const unsigned char * tab)193 latin2mic_with_table(const unsigned char *l,
194 					 unsigned char *p,
195 					 int len,
196 					 int lc,
197 					 int encoding,
198 					 const unsigned char *tab)
199 {
200 	unsigned char c1,
201 				c2;
202 
203 	while (len > 0)
204 	{
205 		c1 = *l;
206 		if (c1 == 0)
207 			report_invalid_encoding(encoding, (const char *) l, len);
208 		if (!IS_HIGHBIT_SET(c1))
209 			*p++ = c1;
210 		else
211 		{
212 			c2 = tab[c1 - HIGHBIT];
213 			if (c2)
214 			{
215 				*p++ = lc;
216 				*p++ = c2;
217 			}
218 			else
219 				report_untranslatable_char(encoding, PG_MULE_INTERNAL,
220 										   (const char *) l, len);
221 		}
222 		l++;
223 		len--;
224 	}
225 	*p = '\0';
226 }
227 
228 /*
229  * mic2latin_with_table: a generic single byte charset encoding
230  * conversion from the mule internal code to a local charset.
231  *
232  * mic points to the source string of length len
233  * p is the output area (must be large enough!)
234  * lc is the mule character set id for the local encoding
235  * encoding is the PG identifier for the local encoding
236  * tab holds conversion entries for the mule internal code's second byte,
237  * starting from 128 (0x80). each entry in the table holds the corresponding
238  * code point for the local charset, or 0 if there is no equivalent code.
239  */
240 void
mic2latin_with_table(const unsigned char * mic,unsigned char * p,int len,int lc,int encoding,const unsigned char * tab)241 mic2latin_with_table(const unsigned char *mic,
242 					 unsigned char *p,
243 					 int len,
244 					 int lc,
245 					 int encoding,
246 					 const unsigned char *tab)
247 {
248 	unsigned char c1,
249 				c2;
250 
251 	while (len > 0)
252 	{
253 		c1 = *mic;
254 		if (c1 == 0)
255 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
256 		if (!IS_HIGHBIT_SET(c1))
257 		{
258 			/* easy for ASCII */
259 			*p++ = c1;
260 			mic++;
261 			len--;
262 		}
263 		else
264 		{
265 			int			l = pg_mic_mblen(mic);
266 
267 			if (len < l)
268 				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
269 										len);
270 			if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
271 				(c2 = tab[mic[1] - HIGHBIT]) == 0)
272 			{
273 				report_untranslatable_char(PG_MULE_INTERNAL, encoding,
274 										   (const char *) mic, len);
275 				break;			/* keep compiler quiet */
276 			}
277 			*p++ = c2;
278 			mic += 2;
279 			len -= 2;
280 		}
281 	}
282 	*p = '\0';
283 }
284 
285 /*
286  * comparison routine for bsearch()
287  * this routine is intended for UTF8 -> local code
288  */
289 static int
compare1(const void * p1,const void * p2)290 compare1(const void *p1, const void *p2)
291 {
292 	uint32		v1,
293 				v2;
294 
295 	v1 = *(const uint32 *) p1;
296 	v2 = ((const pg_utf_to_local *) p2)->utf;
297 	return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
298 }
299 
300 /*
301  * comparison routine for bsearch()
302  * this routine is intended for local code -> UTF8
303  */
304 static int
compare2(const void * p1,const void * p2)305 compare2(const void *p1, const void *p2)
306 {
307 	uint32		v1,
308 				v2;
309 
310 	v1 = *(const uint32 *) p1;
311 	v2 = ((const pg_local_to_utf *) p2)->code;
312 	return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
313 }
314 
315 /*
316  * comparison routine for bsearch()
317  * this routine is intended for combined UTF8 -> local code
318  */
319 static int
compare3(const void * p1,const void * p2)320 compare3(const void *p1, const void *p2)
321 {
322 	uint32		s1,
323 				s2,
324 				d1,
325 				d2;
326 
327 	s1 = *(const uint32 *) p1;
328 	s2 = *((const uint32 *) p1 + 1);
329 	d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330 	d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331 	return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
332 }
333 
334 /*
335  * comparison routine for bsearch()
336  * this routine is intended for local code -> combined UTF8
337  */
338 static int
compare4(const void * p1,const void * p2)339 compare4(const void *p1, const void *p2)
340 {
341 	uint32		v1,
342 				v2;
343 
344 	v1 = *(const uint32 *) p1;
345 	v2 = ((const pg_local_to_utf_combined *) p2)->code;
346 	return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
347 }
348 
349 /*
350  * store 32bit character representation into multibyte stream
351  */
352 static inline unsigned char *
store_coded_char(unsigned char * dest,uint32 code)353 store_coded_char(unsigned char *dest, uint32 code)
354 {
355 	if (code & 0xff000000)
356 		*dest++ = code >> 24;
357 	if (code & 0x00ff0000)
358 		*dest++ = code >> 16;
359 	if (code & 0x0000ff00)
360 		*dest++ = code >> 8;
361 	if (code & 0x000000ff)
362 		*dest++ = code;
363 	return dest;
364 }
365 
366 /*
367  * UTF8 ---> local code
368  *
369  * utf: input string in UTF8 encoding (need not be null-terminated)
370  * len: length of input string (in bytes)
371  * iso: pointer to the output area (must be large enough!)
372 		  (output string will be null-terminated)
373  * map: conversion map for single characters
374  * mapsize: number of entries in the conversion map
375  * cmap: conversion map for combined characters
376  *		  (optional, pass NULL if none)
377  * cmapsize: number of entries in the conversion map for combined characters
378  *		  (optional, pass 0 if none)
379  * conv_func: algorithmic encoding conversion function
380  *		  (optional, pass NULL if none)
381  * encoding: PG identifier for the local encoding
382  *
383  * For each character, the cmap (if provided) is consulted first; if no match,
384  * the map is consulted next; if still no match, the conv_func (if provided)
385  * is applied.  An error is raised if no match is found.
386  *
387  * See pg_wchar.h for more details about the data structures used here.
388  */
389 void
UtfToLocal(const unsigned char * utf,int len,unsigned char * iso,const pg_utf_to_local * map,int mapsize,const pg_utf_to_local_combined * cmap,int cmapsize,utf_local_conversion_func conv_func,int encoding)390 UtfToLocal(const unsigned char *utf, int len,
391 		   unsigned char *iso,
392 		   const pg_utf_to_local *map, int mapsize,
393 		   const pg_utf_to_local_combined *cmap, int cmapsize,
394 		   utf_local_conversion_func conv_func,
395 		   int encoding)
396 {
397 	uint32		iutf;
398 	int			l;
399 	const pg_utf_to_local *p;
400 	const pg_utf_to_local_combined *cp;
401 
402 	if (!PG_VALID_ENCODING(encoding))
403 		ereport(ERROR,
404 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
405 				 errmsg("invalid encoding number: %d", encoding)));
406 
407 	for (; len > 0; len -= l)
408 	{
409 		/* "break" cases all represent errors */
410 		if (*utf == '\0')
411 			break;
412 
413 		l = pg_utf_mblen(utf);
414 		if (len < l)
415 			break;
416 
417 		if (!pg_utf8_islegal(utf, l))
418 			break;
419 
420 		if (l == 1)
421 		{
422 			/* ASCII case is easy, assume it's one-to-one conversion */
423 			*iso++ = *utf++;
424 			continue;
425 		}
426 
427 		/* collect coded char of length l */
428 		if (l == 2)
429 		{
430 			iutf = *utf++ << 8;
431 			iutf |= *utf++;
432 		}
433 		else if (l == 3)
434 		{
435 			iutf = *utf++ << 16;
436 			iutf |= *utf++ << 8;
437 			iutf |= *utf++;
438 		}
439 		else if (l == 4)
440 		{
441 			iutf = *utf++ << 24;
442 			iutf |= *utf++ << 16;
443 			iutf |= *utf++ << 8;
444 			iutf |= *utf++;
445 		}
446 		else
447 		{
448 			elog(ERROR, "unsupported character length %d", l);
449 			iutf = 0;			/* keep compiler quiet */
450 		}
451 
452 		/* First, try with combined map if possible */
453 		if (cmap && len > l)
454 		{
455 			const unsigned char *utf_save = utf;
456 			int			len_save = len;
457 			int			l_save = l;
458 
459 			/* collect next character, same as above */
460 			len -= l;
461 
462 			l = pg_utf_mblen(utf);
463 			if (len < l)
464 				break;
465 
466 			if (!pg_utf8_islegal(utf, l))
467 				break;
468 
469 			/* We assume ASCII character cannot be in combined map */
470 			if (l > 1)
471 			{
472 				uint32		iutf2;
473 				uint32		cutf[2];
474 
475 				if (l == 2)
476 				{
477 					iutf2 = *utf++ << 8;
478 					iutf2 |= *utf++;
479 				}
480 				else if (l == 3)
481 				{
482 					iutf2 = *utf++ << 16;
483 					iutf2 |= *utf++ << 8;
484 					iutf2 |= *utf++;
485 				}
486 				else if (l == 4)
487 				{
488 					iutf2 = *utf++ << 24;
489 					iutf2 |= *utf++ << 16;
490 					iutf2 |= *utf++ << 8;
491 					iutf2 |= *utf++;
492 				}
493 				else
494 				{
495 					elog(ERROR, "unsupported character length %d", l);
496 					iutf2 = 0;	/* keep compiler quiet */
497 				}
498 
499 				cutf[0] = iutf;
500 				cutf[1] = iutf2;
501 
502 				cp = bsearch(cutf, cmap, cmapsize,
503 							 sizeof(pg_utf_to_local_combined), compare3);
504 
505 				if (cp)
506 				{
507 					iso = store_coded_char(iso, cp->code);
508 					continue;
509 				}
510 			}
511 
512 			/* fail, so back up to reprocess second character next time */
513 			utf = utf_save;
514 			len = len_save;
515 			l = l_save;
516 		}
517 
518 		/* Now check ordinary map */
519 		p = bsearch(&iutf, map, mapsize,
520 					sizeof(pg_utf_to_local), compare1);
521 
522 		if (p)
523 		{
524 			iso = store_coded_char(iso, p->code);
525 			continue;
526 		}
527 
528 		/* if there's a conversion function, try that */
529 		if (conv_func)
530 		{
531 			uint32		converted = (*conv_func) (iutf);
532 
533 			if (converted)
534 			{
535 				iso = store_coded_char(iso, converted);
536 				continue;
537 			}
538 		}
539 
540 		/* failed to translate this character */
541 		report_untranslatable_char(PG_UTF8, encoding,
542 								   (const char *) (utf - l), len);
543 	}
544 
545 	/* if we broke out of loop early, must be invalid input */
546 	if (len > 0)
547 		report_invalid_encoding(PG_UTF8, (const char *) utf, len);
548 
549 	*iso = '\0';
550 }
551 
552 /*
553  * local code ---> UTF8
554  *
555  * iso: input string in local encoding (need not be null-terminated)
556  * len: length of input string (in bytes)
557  * utf: pointer to the output area (must be large enough!)
558 		  (output string will be null-terminated)
559  * map: conversion map for single characters
560  * mapsize: number of entries in the conversion map
561  * cmap: conversion map for combined characters
562  *		  (optional, pass NULL if none)
563  * cmapsize: number of entries in the conversion map for combined characters
564  *		  (optional, pass 0 if none)
565  * conv_func: algorithmic encoding conversion function
566  *		  (optional, pass NULL if none)
567  * encoding: PG identifier for the local encoding
568  *
569  * For each character, the map is consulted first; if no match, the cmap
570  * (if provided) is consulted next; if still no match, the conv_func
571  * (if provided) is applied.  An error is raised if no match is found.
572  *
573  * See pg_wchar.h for more details about the data structures used here.
574  */
575 void
LocalToUtf(const unsigned char * iso,int len,unsigned char * utf,const pg_local_to_utf * map,int mapsize,const pg_local_to_utf_combined * cmap,int cmapsize,utf_local_conversion_func conv_func,int encoding)576 LocalToUtf(const unsigned char *iso, int len,
577 		   unsigned char *utf,
578 		   const pg_local_to_utf *map, int mapsize,
579 		   const pg_local_to_utf_combined *cmap, int cmapsize,
580 		   utf_local_conversion_func conv_func,
581 		   int encoding)
582 {
583 	uint32		iiso;
584 	int			l;
585 	const pg_local_to_utf *p;
586 	const pg_local_to_utf_combined *cp;
587 
588 	if (!PG_VALID_ENCODING(encoding))
589 		ereport(ERROR,
590 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
591 				 errmsg("invalid encoding number: %d", encoding)));
592 
593 	for (; len > 0; len -= l)
594 	{
595 		/* "break" cases all represent errors */
596 		if (*iso == '\0')
597 			break;
598 
599 		if (!IS_HIGHBIT_SET(*iso))
600 		{
601 			/* ASCII case is easy, assume it's one-to-one conversion */
602 			*utf++ = *iso++;
603 			l = 1;
604 			continue;
605 		}
606 
607 		l = pg_encoding_verifymb(encoding, (const char *) iso, len);
608 		if (l < 0)
609 			break;
610 
611 		/* collect coded char of length l */
612 		if (l == 1)
613 			iiso = *iso++;
614 		else if (l == 2)
615 		{
616 			iiso = *iso++ << 8;
617 			iiso |= *iso++;
618 		}
619 		else if (l == 3)
620 		{
621 			iiso = *iso++ << 16;
622 			iiso |= *iso++ << 8;
623 			iiso |= *iso++;
624 		}
625 		else if (l == 4)
626 		{
627 			iiso = *iso++ << 24;
628 			iiso |= *iso++ << 16;
629 			iiso |= *iso++ << 8;
630 			iiso |= *iso++;
631 		}
632 		else
633 		{
634 			elog(ERROR, "unsupported character length %d", l);
635 			iiso = 0;			/* keep compiler quiet */
636 		}
637 
638 		/* First check ordinary map */
639 		p = bsearch(&iiso, map, mapsize,
640 					sizeof(pg_local_to_utf), compare2);
641 
642 		if (p)
643 		{
644 			utf = store_coded_char(utf, p->utf);
645 			continue;
646 		}
647 
648 		/* If there's a combined character map, try that */
649 		if (cmap)
650 		{
651 			cp = bsearch(&iiso, cmap, cmapsize,
652 						 sizeof(pg_local_to_utf_combined), compare4);
653 
654 			if (cp)
655 			{
656 				utf = store_coded_char(utf, cp->utf1);
657 				utf = store_coded_char(utf, cp->utf2);
658 				continue;
659 			}
660 		}
661 
662 		/* if there's a conversion function, try that */
663 		if (conv_func)
664 		{
665 			uint32		converted = (*conv_func) (iiso);
666 
667 			if (converted)
668 			{
669 				utf = store_coded_char(utf, converted);
670 				continue;
671 			}
672 		}
673 
674 		/* failed to translate this character */
675 		report_untranslatable_char(encoding, PG_UTF8,
676 								   (const char *) (iso - l), len);
677 	}
678 
679 	/* if we broke out of loop early, must be invalid input */
680 	if (len > 0)
681 		report_invalid_encoding(encoding, (const char *) iso, len);
682 
683 	*utf = '\0';
684 }
685