1 /*-------------------------------------------------------------------------
2  *
3  *	  Utility functions for conversion procs.
4  *
5  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
6  * Portions Copyright (c) 1994, Regents of the University of California
7  *
8  * IDENTIFICATION
9  *	  src/backend/utils/mb/conv.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 #include "mb/pg_wchar.h"
15 
16 
17 /*
18  * local2local: a generic single byte charset encoding
19  * conversion between two ASCII-superset encodings.
20  *
21  * l points to the source string of length len
22  * p is the output area (must be large enough!)
23  * src_encoding is the PG identifier for the source encoding
24  * dest_encoding is the PG identifier for the target encoding
25  * tab holds conversion entries for the source charset
26  * starting from 128 (0x80). each entry in the table holds the corresponding
27  * code point for the target charset, or 0 if there is no equivalent code.
28  *
29  * Returns the number of input bytes consumed.  If noError is true, this can
30  * be less than 'len'.
31  */
32 int
local2local(const unsigned char * l,unsigned char * p,int len,int src_encoding,int dest_encoding,const unsigned char * tab,bool noError)33 local2local(const unsigned char *l,
34 			unsigned char *p,
35 			int len,
36 			int src_encoding,
37 			int dest_encoding,
38 			const unsigned char *tab,
39 			bool noError)
40 {
41 	const unsigned char *start = l;
42 	unsigned char c1,
43 				c2;
44 
45 	while (len > 0)
46 	{
47 		c1 = *l;
48 		if (c1 == 0)
49 		{
50 			if (noError)
51 				break;
52 			report_invalid_encoding(src_encoding, (const char *) l, len);
53 		}
54 		if (!IS_HIGHBIT_SET(c1))
55 			*p++ = c1;
56 		else
57 		{
58 			c2 = tab[c1 - HIGHBIT];
59 			if (c2)
60 				*p++ = c2;
61 			else
62 			{
63 				if (noError)
64 					break;
65 				report_untranslatable_char(src_encoding, dest_encoding,
66 										   (const char *) l, len);
67 			}
68 		}
69 		l++;
70 		len--;
71 	}
72 	*p = '\0';
73 
74 	return l - start;
75 }
76 
/*
 * LATINn ---> MIC, for charsets whose local codes map directly to MIC
 *
 * l: source string, 'len' bytes long
 * p: output area (caller must make it large enough); the result is
 *	  NUL-terminated
 * lc: mule character set id for the local encoding
 * encoding: PG identifier for the local encoding
 * noError: if true, stop quietly at invalid input instead of reporting it
 *
 * Each byte with the high bit set is emitted as the pair (lc, byte); other
 * bytes are copied unchanged.  A zero byte in the input is treated as
 * invalid.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding, bool noError)
{
	const unsigned char *const start = l;

	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
		{
			/* embedded zero byte is not acceptable input */
			if (noError)
				break;
			report_invalid_encoding(encoding, (const char *) l, len);
		}

		/* high-bit bytes get the charset id as a prefix byte */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}
	*p = '\0';

	return l - start;
}
114 
115 /*
116  * MIC ---> LATINn when the charset's local codes map directly to MIC
117  *
118  * mic points to the source string of length len
119  * p is the output area (must be large enough!)
120  * lc is the mule character set id for the local encoding
121  * encoding is the PG identifier for the local encoding
122  *
123  * Returns the number of input bytes consumed.  If noError is true, this can
124  * be less than 'len'.
125  */
126 int
mic2latin(const unsigned char * mic,unsigned char * p,int len,int lc,int encoding,bool noError)127 mic2latin(const unsigned char *mic, unsigned char *p, int len,
128 		  int lc, int encoding, bool noError)
129 {
130 	const unsigned char *start = mic;
131 	int			c1;
132 
133 	while (len > 0)
134 	{
135 		c1 = *mic;
136 		if (c1 == 0)
137 		{
138 			if (noError)
139 				break;
140 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
141 		}
142 		if (!IS_HIGHBIT_SET(c1))
143 		{
144 			/* easy for ASCII */
145 			*p++ = c1;
146 			mic++;
147 			len--;
148 		}
149 		else
150 		{
151 			int			l = pg_mule_mblen(mic);
152 
153 			if (len < l)
154 			{
155 				if (noError)
156 					break;
157 				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
158 										len);
159 			}
160 			if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
161 			{
162 				if (noError)
163 					break;
164 				report_untranslatable_char(PG_MULE_INTERNAL, encoding,
165 										   (const char *) mic, len);
166 			}
167 			*p++ = mic[1];
168 			mic += 2;
169 			len -= 2;
170 		}
171 	}
172 	*p = '\0';
173 
174 	return mic - start;
175 }
176 
177 
178 /*
179  * latin2mic_with_table: a generic single byte charset encoding
180  * conversion from a local charset to the mule internal code.
181  *
182  * l points to the source string of length len
183  * p is the output area (must be large enough!)
184  * lc is the mule character set id for the local encoding
185  * encoding is the PG identifier for the local encoding
186  * tab holds conversion entries for the local charset
187  * starting from 128 (0x80). each entry in the table holds the corresponding
188  * code point for the mule encoding, or 0 if there is no equivalent code.
189  *
190  * Returns the number of input bytes consumed.  If noError is true, this can
191  * be less than 'len'.
192  */
193 int
latin2mic_with_table(const unsigned char * l,unsigned char * p,int len,int lc,int encoding,const unsigned char * tab,bool noError)194 latin2mic_with_table(const unsigned char *l,
195 					 unsigned char *p,
196 					 int len,
197 					 int lc,
198 					 int encoding,
199 					 const unsigned char *tab,
200 					 bool noError)
201 {
202 	const unsigned char *start = l;
203 	unsigned char c1,
204 				c2;
205 
206 	while (len > 0)
207 	{
208 		c1 = *l;
209 		if (c1 == 0)
210 		{
211 			if (noError)
212 				break;
213 			report_invalid_encoding(encoding, (const char *) l, len);
214 		}
215 		if (!IS_HIGHBIT_SET(c1))
216 			*p++ = c1;
217 		else
218 		{
219 			c2 = tab[c1 - HIGHBIT];
220 			if (c2)
221 			{
222 				*p++ = lc;
223 				*p++ = c2;
224 			}
225 			else
226 			{
227 				if (noError)
228 					break;
229 				report_untranslatable_char(encoding, PG_MULE_INTERNAL,
230 										   (const char *) l, len);
231 			}
232 		}
233 		l++;
234 		len--;
235 	}
236 	*p = '\0';
237 
238 	return l - start;
239 }
240 
241 /*
242  * mic2latin_with_table: a generic single byte charset encoding
243  * conversion from the mule internal code to a local charset.
244  *
245  * mic points to the source string of length len
246  * p is the output area (must be large enough!)
247  * lc is the mule character set id for the local encoding
248  * encoding is the PG identifier for the local encoding
249  * tab holds conversion entries for the mule internal code's second byte,
250  * starting from 128 (0x80). each entry in the table holds the corresponding
251  * code point for the local charset, or 0 if there is no equivalent code.
252  *
253  * Returns the number of input bytes consumed.  If noError is true, this can
254  * be less than 'len'.
255  */
256 int
mic2latin_with_table(const unsigned char * mic,unsigned char * p,int len,int lc,int encoding,const unsigned char * tab,bool noError)257 mic2latin_with_table(const unsigned char *mic,
258 					 unsigned char *p,
259 					 int len,
260 					 int lc,
261 					 int encoding,
262 					 const unsigned char *tab,
263 					 bool noError)
264 {
265 	const unsigned char *start = mic;
266 	unsigned char c1,
267 				c2;
268 
269 	while (len > 0)
270 	{
271 		c1 = *mic;
272 		if (c1 == 0)
273 		{
274 			if (noError)
275 				break;
276 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
277 		}
278 		if (!IS_HIGHBIT_SET(c1))
279 		{
280 			/* easy for ASCII */
281 			*p++ = c1;
282 			mic++;
283 			len--;
284 		}
285 		else
286 		{
287 			int			l = pg_mule_mblen(mic);
288 
289 			if (len < l)
290 			{
291 				if (noError)
292 					break;
293 				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
294 										len);
295 			}
296 			if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
297 				(c2 = tab[mic[1] - HIGHBIT]) == 0)
298 			{
299 				if (noError)
300 					break;
301 				report_untranslatable_char(PG_MULE_INTERNAL, encoding,
302 										   (const char *) mic, len);
303 				break;			/* keep compiler quiet */
304 			}
305 			*p++ = c2;
306 			mic += 2;
307 			len -= 2;
308 		}
309 	}
310 	*p = '\0';
311 
312 	return mic - start;
313 }
314 
315 /*
316  * comparison routine for bsearch()
317  * this routine is intended for combined UTF8 -> local code
318  */
319 static int
compare3(const void * p1,const void * p2)320 compare3(const void *p1, const void *p2)
321 {
322 	uint32		s1,
323 				s2,
324 				d1,
325 				d2;
326 
327 	s1 = *(const uint32 *) p1;
328 	s2 = *((const uint32 *) p1 + 1);
329 	d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330 	d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331 	return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
332 }
333 
334 /*
335  * comparison routine for bsearch()
336  * this routine is intended for local code -> combined UTF8
337  */
338 static int
compare4(const void * p1,const void * p2)339 compare4(const void *p1, const void *p2)
340 {
341 	uint32		v1,
342 				v2;
343 
344 	v1 = *(const uint32 *) p1;
345 	v2 = ((const pg_local_to_utf_combined *) p2)->code;
346 	return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
347 }
348 
349 /*
350  * store 32bit character representation into multibyte stream
351  */
352 static inline unsigned char *
store_coded_char(unsigned char * dest,uint32 code)353 store_coded_char(unsigned char *dest, uint32 code)
354 {
355 	if (code & 0xff000000)
356 		*dest++ = code >> 24;
357 	if (code & 0x00ff0000)
358 		*dest++ = code >> 16;
359 	if (code & 0x0000ff00)
360 		*dest++ = code >> 8;
361 	if (code & 0x000000ff)
362 		*dest++ = code;
363 	return dest;
364 }
365 
366 /*
367  * Convert a character using a conversion radix tree.
368  *
369  * 'l' is the length of the input character in bytes, and b1-b4 are
370  * the input character's bytes.
371  */
372 static inline uint32
pg_mb_radix_conv(const pg_mb_radix_tree * rt,int l,unsigned char b1,unsigned char b2,unsigned char b3,unsigned char b4)373 pg_mb_radix_conv(const pg_mb_radix_tree *rt,
374 				 int l,
375 				 unsigned char b1,
376 				 unsigned char b2,
377 				 unsigned char b3,
378 				 unsigned char b4)
379 {
380 	if (l == 4)
381 	{
382 		/* 4-byte code */
383 
384 		/* check code validity */
385 		if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
386 			b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
387 			b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
388 			b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
389 			return 0;
390 
391 		/* perform lookup */
392 		if (rt->chars32)
393 		{
394 			uint32		idx = rt->b4root;
395 
396 			idx = rt->chars32[b1 + idx - rt->b4_1_lower];
397 			idx = rt->chars32[b2 + idx - rt->b4_2_lower];
398 			idx = rt->chars32[b3 + idx - rt->b4_3_lower];
399 			return rt->chars32[b4 + idx - rt->b4_4_lower];
400 		}
401 		else
402 		{
403 			uint16		idx = rt->b4root;
404 
405 			idx = rt->chars16[b1 + idx - rt->b4_1_lower];
406 			idx = rt->chars16[b2 + idx - rt->b4_2_lower];
407 			idx = rt->chars16[b3 + idx - rt->b4_3_lower];
408 			return rt->chars16[b4 + idx - rt->b4_4_lower];
409 		}
410 	}
411 	else if (l == 3)
412 	{
413 		/* 3-byte code */
414 
415 		/* check code validity */
416 		if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
417 			b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
418 			b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
419 			return 0;
420 
421 		/* perform lookup */
422 		if (rt->chars32)
423 		{
424 			uint32		idx = rt->b3root;
425 
426 			idx = rt->chars32[b2 + idx - rt->b3_1_lower];
427 			idx = rt->chars32[b3 + idx - rt->b3_2_lower];
428 			return rt->chars32[b4 + idx - rt->b3_3_lower];
429 		}
430 		else
431 		{
432 			uint16		idx = rt->b3root;
433 
434 			idx = rt->chars16[b2 + idx - rt->b3_1_lower];
435 			idx = rt->chars16[b3 + idx - rt->b3_2_lower];
436 			return rt->chars16[b4 + idx - rt->b3_3_lower];
437 		}
438 	}
439 	else if (l == 2)
440 	{
441 		/* 2-byte code */
442 
443 		/* check code validity - first byte */
444 		if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
445 			b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
446 			return 0;
447 
448 		/* perform lookup */
449 		if (rt->chars32)
450 		{
451 			uint32		idx = rt->b2root;
452 
453 			idx = rt->chars32[b3 + idx - rt->b2_1_lower];
454 			return rt->chars32[b4 + idx - rt->b2_2_lower];
455 		}
456 		else
457 		{
458 			uint16		idx = rt->b2root;
459 
460 			idx = rt->chars16[b3 + idx - rt->b2_1_lower];
461 			return rt->chars16[b4 + idx - rt->b2_2_lower];
462 		}
463 	}
464 	else if (l == 1)
465 	{
466 		/* 1-byte code */
467 
468 		/* check code validity - first byte */
469 		if (b4 < rt->b1_lower || b4 > rt->b1_upper)
470 			return 0;
471 
472 		/* perform lookup */
473 		if (rt->chars32)
474 			return rt->chars32[b4 + rt->b1root - rt->b1_lower];
475 		else
476 			return rt->chars16[b4 + rt->b1root - rt->b1_lower];
477 	}
478 	return 0;					/* shouldn't happen */
479 }
480 
481 /*
482  * UTF8 ---> local code
483  *
484  * utf: input string in UTF8 encoding (need not be null-terminated)
485  * len: length of input string (in bytes)
486  * iso: pointer to the output area (must be large enough!)
487 		  (output string will be null-terminated)
488  * map: conversion map for single characters
489  * cmap: conversion map for combined characters
490  *		  (optional, pass NULL if none)
491  * cmapsize: number of entries in the conversion map for combined characters
492  *		  (optional, pass 0 if none)
493  * conv_func: algorithmic encoding conversion function
494  *		  (optional, pass NULL if none)
495  * encoding: PG identifier for the local encoding
496  *
497  * For each character, the cmap (if provided) is consulted first; if no match,
498  * the map is consulted next; if still no match, the conv_func (if provided)
499  * is applied.  An error is raised if no match is found.
500  *
501  * See pg_wchar.h for more details about the data structures used here.
502  *
503  * Returns the number of input bytes consumed.  If noError is true, this can
504  * be less than 'len'.
505  */
506 int
UtfToLocal(const unsigned char * utf,int len,unsigned char * iso,const pg_mb_radix_tree * map,const pg_utf_to_local_combined * cmap,int cmapsize,utf_local_conversion_func conv_func,int encoding,bool noError)507 UtfToLocal(const unsigned char *utf, int len,
508 		   unsigned char *iso,
509 		   const pg_mb_radix_tree *map,
510 		   const pg_utf_to_local_combined *cmap, int cmapsize,
511 		   utf_local_conversion_func conv_func,
512 		   int encoding, bool noError)
513 {
514 	uint32		iutf;
515 	int			l;
516 	const pg_utf_to_local_combined *cp;
517 	const unsigned char *start = utf;
518 
519 	if (!PG_VALID_ENCODING(encoding))
520 		ereport(ERROR,
521 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
522 				 errmsg("invalid encoding number: %d", encoding)));
523 
524 	for (; len > 0; len -= l)
525 	{
526 		unsigned char b1 = 0;
527 		unsigned char b2 = 0;
528 		unsigned char b3 = 0;
529 		unsigned char b4 = 0;
530 
531 		/* "break" cases all represent errors */
532 		if (*utf == '\0')
533 			break;
534 
535 		l = pg_utf_mblen(utf);
536 		if (len < l)
537 			break;
538 
539 		if (!pg_utf8_islegal(utf, l))
540 			break;
541 
542 		if (l == 1)
543 		{
544 			/* ASCII case is easy, assume it's one-to-one conversion */
545 			*iso++ = *utf++;
546 			continue;
547 		}
548 
549 		/* collect coded char of length l */
550 		if (l == 2)
551 		{
552 			b3 = *utf++;
553 			b4 = *utf++;
554 		}
555 		else if (l == 3)
556 		{
557 			b2 = *utf++;
558 			b3 = *utf++;
559 			b4 = *utf++;
560 		}
561 		else if (l == 4)
562 		{
563 			b1 = *utf++;
564 			b2 = *utf++;
565 			b3 = *utf++;
566 			b4 = *utf++;
567 		}
568 		else
569 		{
570 			elog(ERROR, "unsupported character length %d", l);
571 			iutf = 0;			/* keep compiler quiet */
572 		}
573 		iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
574 
575 		/* First, try with combined map if possible */
576 		if (cmap && len > l)
577 		{
578 			const unsigned char *utf_save = utf;
579 			int			len_save = len;
580 			int			l_save = l;
581 
582 			/* collect next character, same as above */
583 			len -= l;
584 
585 			l = pg_utf_mblen(utf);
586 			if (len < l)
587 			{
588 				/* need more data to decide if this is a combined char */
589 				utf -= l_save;
590 				break;
591 			}
592 
593 			if (!pg_utf8_islegal(utf, l))
594 			{
595 				if (!noError)
596 					report_invalid_encoding(PG_UTF8, (const char *) utf, len);
597 				utf -= l_save;
598 				break;
599 			}
600 
601 			/* We assume ASCII character cannot be in combined map */
602 			if (l > 1)
603 			{
604 				uint32		iutf2;
605 				uint32		cutf[2];
606 
607 				if (l == 2)
608 				{
609 					iutf2 = *utf++ << 8;
610 					iutf2 |= *utf++;
611 				}
612 				else if (l == 3)
613 				{
614 					iutf2 = *utf++ << 16;
615 					iutf2 |= *utf++ << 8;
616 					iutf2 |= *utf++;
617 				}
618 				else if (l == 4)
619 				{
620 					iutf2 = *utf++ << 24;
621 					iutf2 |= *utf++ << 16;
622 					iutf2 |= *utf++ << 8;
623 					iutf2 |= *utf++;
624 				}
625 				else
626 				{
627 					elog(ERROR, "unsupported character length %d", l);
628 					iutf2 = 0;	/* keep compiler quiet */
629 				}
630 
631 				cutf[0] = iutf;
632 				cutf[1] = iutf2;
633 
634 				cp = bsearch(cutf, cmap, cmapsize,
635 							 sizeof(pg_utf_to_local_combined), compare3);
636 
637 				if (cp)
638 				{
639 					iso = store_coded_char(iso, cp->code);
640 					continue;
641 				}
642 			}
643 
644 			/* fail, so back up to reprocess second character next time */
645 			utf = utf_save;
646 			len = len_save;
647 			l = l_save;
648 		}
649 
650 		/* Now check ordinary map */
651 		if (map)
652 		{
653 			uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
654 
655 			if (converted)
656 			{
657 				iso = store_coded_char(iso, converted);
658 				continue;
659 			}
660 		}
661 
662 		/* if there's a conversion function, try that */
663 		if (conv_func)
664 		{
665 			uint32		converted = (*conv_func) (iutf);
666 
667 			if (converted)
668 			{
669 				iso = store_coded_char(iso, converted);
670 				continue;
671 			}
672 		}
673 
674 		/* failed to translate this character */
675 		utf -= l;
676 		if (noError)
677 			break;
678 		report_untranslatable_char(PG_UTF8, encoding,
679 								   (const char *) utf, len);
680 	}
681 
682 	/* if we broke out of loop early, must be invalid input */
683 	if (len > 0 && !noError)
684 		report_invalid_encoding(PG_UTF8, (const char *) utf, len);
685 
686 	*iso = '\0';
687 
688 	return utf - start;
689 }
690 
691 /*
692  * local code ---> UTF8
693  *
694  * iso: input string in local encoding (need not be null-terminated)
695  * len: length of input string (in bytes)
696  * utf: pointer to the output area (must be large enough!)
697 		  (output string will be null-terminated)
698  * map: conversion map for single characters
699  * cmap: conversion map for combined characters
700  *		  (optional, pass NULL if none)
701  * cmapsize: number of entries in the conversion map for combined characters
702  *		  (optional, pass 0 if none)
703  * conv_func: algorithmic encoding conversion function
704  *		  (optional, pass NULL if none)
705  * encoding: PG identifier for the local encoding
706  *
707  * For each character, the map is consulted first; if no match, the cmap
708  * (if provided) is consulted next; if still no match, the conv_func
709  * (if provided) is applied.  An error is raised if no match is found.
710  *
711  * See pg_wchar.h for more details about the data structures used here.
712  *
713  * Returns the number of input bytes consumed.  If noError is true, this can
714  * be less than 'len'.
715  */
716 int
LocalToUtf(const unsigned char * iso,int len,unsigned char * utf,const pg_mb_radix_tree * map,const pg_local_to_utf_combined * cmap,int cmapsize,utf_local_conversion_func conv_func,int encoding,bool noError)717 LocalToUtf(const unsigned char *iso, int len,
718 		   unsigned char *utf,
719 		   const pg_mb_radix_tree *map,
720 		   const pg_local_to_utf_combined *cmap, int cmapsize,
721 		   utf_local_conversion_func conv_func,
722 		   int encoding,
723 		   bool noError)
724 {
725 	uint32		iiso;
726 	int			l;
727 	const pg_local_to_utf_combined *cp;
728 	const unsigned char *start = iso;
729 
730 	if (!PG_VALID_ENCODING(encoding))
731 		ereport(ERROR,
732 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
733 				 errmsg("invalid encoding number: %d", encoding)));
734 
735 	for (; len > 0; len -= l)
736 	{
737 		unsigned char b1 = 0;
738 		unsigned char b2 = 0;
739 		unsigned char b3 = 0;
740 		unsigned char b4 = 0;
741 
742 		/* "break" cases all represent errors */
743 		if (*iso == '\0')
744 			break;
745 
746 		if (!IS_HIGHBIT_SET(*iso))
747 		{
748 			/* ASCII case is easy, assume it's one-to-one conversion */
749 			*utf++ = *iso++;
750 			l = 1;
751 			continue;
752 		}
753 
754 		l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
755 		if (l < 0)
756 			break;
757 
758 		/* collect coded char of length l */
759 		if (l == 1)
760 			b4 = *iso++;
761 		else if (l == 2)
762 		{
763 			b3 = *iso++;
764 			b4 = *iso++;
765 		}
766 		else if (l == 3)
767 		{
768 			b2 = *iso++;
769 			b3 = *iso++;
770 			b4 = *iso++;
771 		}
772 		else if (l == 4)
773 		{
774 			b1 = *iso++;
775 			b2 = *iso++;
776 			b3 = *iso++;
777 			b4 = *iso++;
778 		}
779 		else
780 		{
781 			elog(ERROR, "unsupported character length %d", l);
782 			iiso = 0;			/* keep compiler quiet */
783 		}
784 		iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
785 
786 		if (map)
787 		{
788 			uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
789 
790 			if (converted)
791 			{
792 				utf = store_coded_char(utf, converted);
793 				continue;
794 			}
795 
796 			/* If there's a combined character map, try that */
797 			if (cmap)
798 			{
799 				cp = bsearch(&iiso, cmap, cmapsize,
800 							 sizeof(pg_local_to_utf_combined), compare4);
801 
802 				if (cp)
803 				{
804 					utf = store_coded_char(utf, cp->utf1);
805 					utf = store_coded_char(utf, cp->utf2);
806 					continue;
807 				}
808 			}
809 		}
810 
811 		/* if there's a conversion function, try that */
812 		if (conv_func)
813 		{
814 			uint32		converted = (*conv_func) (iiso);
815 
816 			if (converted)
817 			{
818 				utf = store_coded_char(utf, converted);
819 				continue;
820 			}
821 		}
822 
823 		/* failed to translate this character */
824 		iso -= l;
825 		if (noError)
826 			break;
827 		report_untranslatable_char(encoding, PG_UTF8,
828 								   (const char *) iso, len);
829 	}
830 
831 	/* if we broke out of loop early, must be invalid input */
832 	if (len > 0 && !noError)
833 		report_invalid_encoding(encoding, (const char *) iso, len);
834 
835 	*utf = '\0';
836 
837 	return iso - start;
838 }
839