/*-------------------------------------------------------------------------
 *
 * wchar.c
 *	  Functions for working with multibyte characters in various encodings.
 *
 * Portions Copyright (c) 1998-2020, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/wchar.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"

#include "mb/pg_wchar.h"
16 
17 
/*
 * Operations on multi-byte encodings are driven by a table of helper
 * functions.
 *
 * To add support for an encoding, define mblen(), dsplen() and verifier()
 * for the encoding.  For server encodings, also define mb2wchar() and
 * wchar2mb() conversion functions.
 *
 * These functions generally assume that their input is validly formed.
 * The "verifier" functions, further down in the file, have to be more
 * paranoid.
 *
 * We expect that mblen() does not need to examine more than the first byte
 * of the character to discover the correct length.  GB18030 is an exception
 * to that rule, though, as it also looks at the second byte.  But even that
 * behaves in a predictable way if you only pass the first byte: it will
 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
 * good enough for all current uses.
 *
 * Note: for the display output of psql to work properly, the return values
 * of the dsplen functions must conform to the Unicode standard.  In
 * particular, the NUL character is zero width and control characters are
 * generally width -1.  It is recommended that non-ASCII encodings refer
 * their ASCII subset to the ASCII routines to ensure consistency.
 */
43 
44 /*
45  * SQL/ASCII
46  */
47 static int
pg_ascii2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)48 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
49 {
50 	int			cnt = 0;
51 
52 	while (len > 0 && *from)
53 	{
54 		*to++ = *from++;
55 		len--;
56 		cnt++;
57 	}
58 	*to = 0;
59 	return cnt;
60 }
61 
/*
 * Byte length of an SQL_ASCII character: always 1; "s" is not examined.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	return 1;
}
67 
/*
 * Display width of the single byte at *s.
 *
 * Returns 0 for NUL, -1 for control characters (bytes below 0x20 and DEL,
 * 0x7f), and 1 for every other byte value, including bytes with the high
 * bit set.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	if (*s == '\0')
		return 0;
	if (*s < 0x20 || *s == 0x7f)
		return -1;

	return 1;
}
78 
79 /*
80  * EUC
81  */
82 static int
pg_euc2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)83 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
84 {
85 	int			cnt = 0;
86 
87 	while (len > 0 && *from)
88 	{
89 		if (*from == SS2 && len >= 2)	/* JIS X 0201 (so called "1 byte
90 										 * KANA") */
91 		{
92 			from++;
93 			*to = (SS2 << 8) | *from++;
94 			len -= 2;
95 		}
96 		else if (*from == SS3 && len >= 3)	/* JIS X 0212 KANJI */
97 		{
98 			from++;
99 			*to = (SS3 << 16) | (*from++ << 8);
100 			*to |= *from++;
101 			len -= 3;
102 		}
103 		else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
104 		{
105 			*to = *from++ << 8;
106 			*to |= *from++;
107 			len -= 2;
108 		}
109 		else					/* must be ASCII */
110 		{
111 			*to = *from++;
112 			len--;
113 		}
114 		to++;
115 		cnt++;
116 	}
117 	*to = 0;
118 	return cnt;
119 }
120 
121 static inline int
pg_euc_mblen(const unsigned char * s)122 pg_euc_mblen(const unsigned char *s)
123 {
124 	int			len;
125 
126 	if (*s == SS2)
127 		len = 2;
128 	else if (*s == SS3)
129 		len = 3;
130 	else if (IS_HIGHBIT_SET(*s))
131 		len = 2;
132 	else
133 		len = 1;
134 	return len;
135 }
136 
137 static inline int
pg_euc_dsplen(const unsigned char * s)138 pg_euc_dsplen(const unsigned char *s)
139 {
140 	int			len;
141 
142 	if (*s == SS2)
143 		len = 2;
144 	else if (*s == SS3)
145 		len = 2;
146 	else if (IS_HIGHBIT_SET(*s))
147 		len = 2;
148 	else
149 		len = pg_ascii_dsplen(s);
150 	return len;
151 }
152 
153 /*
154  * EUC_JP
155  */
156 static int
pg_eucjp2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)157 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
158 {
159 	return pg_euc2wchar_with_len(from, to, len);
160 }
161 
/*
 * Byte length of an EUC_JP character; same rule as pg_euc_mblen().
 */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
167 
168 static int
pg_eucjp_dsplen(const unsigned char * s)169 pg_eucjp_dsplen(const unsigned char *s)
170 {
171 	int			len;
172 
173 	if (*s == SS2)
174 		len = 1;
175 	else if (*s == SS3)
176 		len = 2;
177 	else if (IS_HIGHBIT_SET(*s))
178 		len = 2;
179 	else
180 		len = pg_ascii_dsplen(s);
181 	return len;
182 }
183 
184 /*
185  * EUC_KR
186  */
187 static int
pg_euckr2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)188 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
189 {
190 	return pg_euc2wchar_with_len(from, to, len);
191 }
192 
/*
 * Byte length of an EUC_KR character; same rule as pg_euc_mblen().
 */
static int
pg_euckr_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
198 
/*
 * Display width of an EUC_KR character; same rule as pg_euc_dsplen().
 */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	return pg_euc_dsplen(s);
}
204 
205 /*
206  * EUC_CN
207  *
208  */
209 static int
pg_euccn2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)210 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
211 {
212 	int			cnt = 0;
213 
214 	while (len > 0 && *from)
215 	{
216 		if (*from == SS2 && len >= 3)	/* code set 2 (unused?) */
217 		{
218 			from++;
219 			*to = (SS2 << 16) | (*from++ << 8);
220 			*to |= *from++;
221 			len -= 3;
222 		}
223 		else if (*from == SS3 && len >= 3)	/* code set 3 (unused ?) */
224 		{
225 			from++;
226 			*to = (SS3 << 16) | (*from++ << 8);
227 			*to |= *from++;
228 			len -= 3;
229 		}
230 		else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
231 		{
232 			*to = *from++ << 8;
233 			*to |= *from++;
234 			len -= 2;
235 		}
236 		else
237 		{
238 			*to = *from++;
239 			len--;
240 		}
241 		to++;
242 		cnt++;
243 	}
244 	*to = 0;
245 	return cnt;
246 }
247 
/*
 * Byte length of an EUC_CN character: 2 if the first byte has its high bit
 * set, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
259 
/*
 * Display width of an EUC_CN character: 2 columns if the first byte has its
 * high bit set, otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;
	return pg_ascii_dsplen(s);
}
271 
272 /*
273  * EUC_TW
274  *
275  */
276 static int
pg_euctw2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)277 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
278 {
279 	int			cnt = 0;
280 
281 	while (len > 0 && *from)
282 	{
283 		if (*from == SS2 && len >= 4)	/* code set 2 */
284 		{
285 			from++;
286 			*to = (((uint32) SS2) << 24) | (*from++ << 16);
287 			*to |= *from++ << 8;
288 			*to |= *from++;
289 			len -= 4;
290 		}
291 		else if (*from == SS3 && len >= 3)	/* code set 3 (unused?) */
292 		{
293 			from++;
294 			*to = (SS3 << 16) | (*from++ << 8);
295 			*to |= *from++;
296 			len -= 3;
297 		}
298 		else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
299 		{
300 			*to = *from++ << 8;
301 			*to |= *from++;
302 			len -= 2;
303 		}
304 		else
305 		{
306 			*to = *from++;
307 			len--;
308 		}
309 		to++;
310 		cnt++;
311 	}
312 	*to = 0;
313 	return cnt;
314 }
315 
316 static int
pg_euctw_mblen(const unsigned char * s)317 pg_euctw_mblen(const unsigned char *s)
318 {
319 	int			len;
320 
321 	if (*s == SS2)
322 		len = 4;
323 	else if (*s == SS3)
324 		len = 3;
325 	else if (IS_HIGHBIT_SET(*s))
326 		len = 2;
327 	else
328 		len = 1;
329 	return len;
330 }
331 
332 static int
pg_euctw_dsplen(const unsigned char * s)333 pg_euctw_dsplen(const unsigned char *s)
334 {
335 	int			len;
336 
337 	if (*s == SS2)
338 		len = 2;
339 	else if (*s == SS3)
340 		len = 2;
341 	else if (IS_HIGHBIT_SET(*s))
342 		len = 2;
343 	else
344 		len = pg_ascii_dsplen(s);
345 	return len;
346 }
347 
348 /*
349  * Convert pg_wchar to EUC_* encoding.
350  * caller must allocate enough space for "to", including a trailing zero!
351  * len: length of from.
352  * "from" not necessarily null terminated.
353  */
354 static int
pg_wchar2euc_with_len(const pg_wchar * from,unsigned char * to,int len)355 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
356 {
357 	int			cnt = 0;
358 
359 	while (len > 0 && *from)
360 	{
361 		unsigned char c;
362 
363 		if ((c = (*from >> 24)))
364 		{
365 			*to++ = c;
366 			*to++ = (*from >> 16) & 0xff;
367 			*to++ = (*from >> 8) & 0xff;
368 			*to++ = *from & 0xff;
369 			cnt += 4;
370 		}
371 		else if ((c = (*from >> 16)))
372 		{
373 			*to++ = c;
374 			*to++ = (*from >> 8) & 0xff;
375 			*to++ = *from & 0xff;
376 			cnt += 3;
377 		}
378 		else if ((c = (*from >> 8)))
379 		{
380 			*to++ = c;
381 			*to++ = *from & 0xff;
382 			cnt += 2;
383 		}
384 		else
385 		{
386 			*to++ = *from;
387 			cnt++;
388 		}
389 		from++;
390 		len--;
391 	}
392 	*to = 0;
393 	return cnt;
394 }
395 
396 
/*
 * JOHAB
 */

/*
 * Byte length of a JOHAB character; this simply reuses the generic EUC rule
 * of pg_euc_mblen().
 */
static int
pg_johab_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
405 
/*
 * Display width of a JOHAB character; this simply reuses the generic EUC
 * rule of pg_euc_dsplen().
 */
static int
pg_johab_dsplen(const unsigned char *s)
{
	return pg_euc_dsplen(s);
}
411 
412 /*
413  * convert UTF8 string to pg_wchar (UCS-4)
414  * caller must allocate enough space for "to", including a trailing zero!
415  * len: length of from.
416  * "from" not necessarily null terminated.
417  */
418 static int
pg_utf2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)419 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
420 {
421 	int			cnt = 0;
422 	uint32		c1,
423 				c2,
424 				c3,
425 				c4;
426 
427 	while (len > 0 && *from)
428 	{
429 		if ((*from & 0x80) == 0)
430 		{
431 			*to = *from++;
432 			len--;
433 		}
434 		else if ((*from & 0xe0) == 0xc0)
435 		{
436 			if (len < 2)
437 				break;			/* drop trailing incomplete char */
438 			c1 = *from++ & 0x1f;
439 			c2 = *from++ & 0x3f;
440 			*to = (c1 << 6) | c2;
441 			len -= 2;
442 		}
443 		else if ((*from & 0xf0) == 0xe0)
444 		{
445 			if (len < 3)
446 				break;			/* drop trailing incomplete char */
447 			c1 = *from++ & 0x0f;
448 			c2 = *from++ & 0x3f;
449 			c3 = *from++ & 0x3f;
450 			*to = (c1 << 12) | (c2 << 6) | c3;
451 			len -= 3;
452 		}
453 		else if ((*from & 0xf8) == 0xf0)
454 		{
455 			if (len < 4)
456 				break;			/* drop trailing incomplete char */
457 			c1 = *from++ & 0x07;
458 			c2 = *from++ & 0x3f;
459 			c3 = *from++ & 0x3f;
460 			c4 = *from++ & 0x3f;
461 			*to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
462 			len -= 4;
463 		}
464 		else
465 		{
466 			/* treat a bogus char as length 1; not ours to raise error */
467 			*to = *from++;
468 			len--;
469 		}
470 		to++;
471 		cnt++;
472 	}
473 	*to = 0;
474 	return cnt;
475 }
476 
477 
478 /*
479  * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
480  * space allocated.
481  */
482 unsigned char *
unicode_to_utf8(pg_wchar c,unsigned char * utf8string)483 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
484 {
485 	if (c <= 0x7F)
486 	{
487 		utf8string[0] = c;
488 	}
489 	else if (c <= 0x7FF)
490 	{
491 		utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
492 		utf8string[1] = 0x80 | (c & 0x3F);
493 	}
494 	else if (c <= 0xFFFF)
495 	{
496 		utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
497 		utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
498 		utf8string[2] = 0x80 | (c & 0x3F);
499 	}
500 	else
501 	{
502 		utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
503 		utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
504 		utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
505 		utf8string[3] = 0x80 | (c & 0x3F);
506 	}
507 
508 	return utf8string;
509 }
510 
511 /*
512  * Trivial conversion from pg_wchar to UTF-8.
513  * caller should allocate enough space for "to"
514  * len: length of from.
515  * "from" not necessarily null terminated.
516  */
517 static int
pg_wchar2utf_with_len(const pg_wchar * from,unsigned char * to,int len)518 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
519 {
520 	int			cnt = 0;
521 
522 	while (len > 0 && *from)
523 	{
524 		int			char_len;
525 
526 		unicode_to_utf8(*from, to);
527 		char_len = pg_utf_mblen(to);
528 		cnt += char_len;
529 		to += char_len;
530 		from++;
531 		len--;
532 	}
533 	*to = 0;
534 	return cnt;
535 }
536 
/*
 * Return the byte length of a UTF8 character pointed to by s.
 *
 * Only the first byte is examined.
 *
 * Note: in the current implementation we do not support UTF8 sequences
 * of more than 4 bytes; hence do NOT return a value larger than 4.
 * We return "1" for any leading byte that is either flat-out illegal or
 * indicates a length larger than we support.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	int			len;

	if ((*s & 0x80) == 0)
		len = 1;
	else if ((*s & 0xe0) == 0xc0)
		len = 2;
	else if ((*s & 0xf0) == 0xe0)
		len = 3;
	else if ((*s & 0xf8) == 0xf0)
		len = 4;
#ifdef NOT_USED
	else if ((*s & 0xfc) == 0xf8)
		len = 5;
	else if ((*s & 0xfe) == 0xfc)
		len = 6;
#endif
	else
		len = 1;
	return len;
}
571 
/*
 * This is an implementation of wcwidth() and wcswidth() as defined in
 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
 * <http://www.unix.org/online.html>
 *
 * Markus Kuhn -- 2001-09-08 -- public domain
 *
 * customised for PostgreSQL
 *
 * original available at: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 */
583 
/*
 * One closed range [first, last] of code points in an interval table
 * searched by mbbisearch().  The 16-bit fields cannot hold values above
 * 0xFFFF.
 */
struct mbinterval
{
	unsigned short first;
	unsigned short last;
};
589 
590 /* auxiliary function for binary search in interval table */
591 static int
mbbisearch(pg_wchar ucs,const struct mbinterval * table,int max)592 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
593 {
594 	int			min = 0;
595 	int			mid;
596 
597 	if (ucs < table[0].first || ucs > table[max].last)
598 		return 0;
599 	while (max >= min)
600 	{
601 		mid = (min + max) / 2;
602 		if (ucs > table[mid].last)
603 			min = mid + 1;
604 		else if (ucs < table[mid].first)
605 			max = mid - 1;
606 		else
607 			return 1;
608 	}
609 
610 	return 0;
611 }
612 
613 
/*
 * The following functions define the column width of an ISO 10646
 * character as follows:
 *
 *	  - The null character (U+0000) has a column width of 0.
 *
 *	  - Other C0/C1 control characters and DEL will lead to a return
 *		value of -1.
 *
 *	  - Non-spacing and enclosing combining characters (general
 *		category code Mn or Me in the Unicode database) have a
 *		column width of 0.
 *
 *	  - Other format characters (general category code Cf in the Unicode
 *		database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
 *
 *	  - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
 *		have a column width of 0.
 *
 *	  - Spacing characters in the East Asian Wide (W) or East Asian
 *		FullWidth (F) category as defined in Unicode Technical
 *		Report #11 have a column width of 2.
 *
 *	  - All remaining characters (including all printable
 *		ISO 8859-1 and WGL4 characters, Unicode control characters,
 *		etc.) have a column width of 1.
 *
 * This implementation assumes that wchar_t characters are encoded
 * in ISO 10646.
 */
643 
644 static int
ucs_wcwidth(pg_wchar ucs)645 ucs_wcwidth(pg_wchar ucs)
646 {
647 #include "common/unicode_combining_table.h"
648 
649 	/* test for 8-bit control characters */
650 	if (ucs == 0)
651 		return 0;
652 
653 	if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
654 		return -1;
655 
656 	/* binary search in table of non-spacing characters */
657 	if (mbbisearch(ucs, combining,
658 				   sizeof(combining) / sizeof(struct mbinterval) - 1))
659 		return 0;
660 
661 	/*
662 	 * if we arrive here, ucs is not a combining or C0/C1 control character
663 	 */
664 
665 	return 1 +
666 		(ucs >= 0x1100 &&
667 		 (ucs <= 0x115f ||		/* Hangul Jamo init. consonants */
668 		  (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
669 		   ucs != 0x303f) ||	/* CJK ... Yi */
670 		  (ucs >= 0xac00 && ucs <= 0xd7a3) ||	/* Hangul Syllables */
671 		  (ucs >= 0xf900 && ucs <= 0xfaff) ||	/* CJK Compatibility
672 												 * Ideographs */
673 		  (ucs >= 0xfe30 && ucs <= 0xfe6f) ||	/* CJK Compatibility Forms */
674 		  (ucs >= 0xff00 && ucs <= 0xff5f) ||	/* Fullwidth Forms */
675 		  (ucs >= 0xffe0 && ucs <= 0xffe6) ||
676 		  (ucs >= 0x20000 && ucs <= 0x2ffff)));
677 }
678 
679 /*
680  * Convert a UTF-8 character to a Unicode code point.
681  * This is a one-character version of pg_utf2wchar_with_len.
682  *
683  * No error checks here, c must point to a long-enough string.
684  */
685 pg_wchar
utf8_to_unicode(const unsigned char * c)686 utf8_to_unicode(const unsigned char *c)
687 {
688 	if ((*c & 0x80) == 0)
689 		return (pg_wchar) c[0];
690 	else if ((*c & 0xe0) == 0xc0)
691 		return (pg_wchar) (((c[0] & 0x1f) << 6) |
692 						   (c[1] & 0x3f));
693 	else if ((*c & 0xf0) == 0xe0)
694 		return (pg_wchar) (((c[0] & 0x0f) << 12) |
695 						   ((c[1] & 0x3f) << 6) |
696 						   (c[2] & 0x3f));
697 	else if ((*c & 0xf8) == 0xf0)
698 		return (pg_wchar) (((c[0] & 0x07) << 18) |
699 						   ((c[1] & 0x3f) << 12) |
700 						   ((c[2] & 0x3f) << 6) |
701 						   (c[3] & 0x3f));
702 	else
703 		/* that is an invalid code on purpose */
704 		return 0xffffffff;
705 }
706 
/*
 * Display width of the UTF-8 character at s: decode it with
 * utf8_to_unicode() and look up the width with ucs_wcwidth().
 */
static int
pg_utf_dsplen(const unsigned char *s)
{
	return ucs_wcwidth(utf8_to_unicode(s));
}
712 
713 /*
714  * convert mule internal code to pg_wchar
715  * caller should allocate enough space for "to"
716  * len: length of from.
717  * "from" not necessarily null terminated.
718  */
719 static int
pg_mule2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)720 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
721 {
722 	int			cnt = 0;
723 
724 	while (len > 0 && *from)
725 	{
726 		if (IS_LC1(*from) && len >= 2)
727 		{
728 			*to = *from++ << 16;
729 			*to |= *from++;
730 			len -= 2;
731 		}
732 		else if (IS_LCPRV1(*from) && len >= 3)
733 		{
734 			from++;
735 			*to = *from++ << 16;
736 			*to |= *from++;
737 			len -= 3;
738 		}
739 		else if (IS_LC2(*from) && len >= 3)
740 		{
741 			*to = *from++ << 16;
742 			*to |= *from++ << 8;
743 			*to |= *from++;
744 			len -= 3;
745 		}
746 		else if (IS_LCPRV2(*from) && len >= 4)
747 		{
748 			from++;
749 			*to = *from++ << 16;
750 			*to |= *from++ << 8;
751 			*to |= *from++;
752 			len -= 4;
753 		}
754 		else
755 		{						/* assume ASCII */
756 			*to = (unsigned char) *from++;
757 			len--;
758 		}
759 		to++;
760 		cnt++;
761 	}
762 	*to = 0;
763 	return cnt;
764 }
765 
766 /*
767  * convert pg_wchar to mule internal code
768  * caller should allocate enough space for "to"
769  * len: length of from.
770  * "from" not necessarily null terminated.
771  */
772 static int
pg_wchar2mule_with_len(const pg_wchar * from,unsigned char * to,int len)773 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
774 {
775 	int			cnt = 0;
776 
777 	while (len > 0 && *from)
778 	{
779 		unsigned char lb;
780 
781 		lb = (*from >> 16) & 0xff;
782 		if (IS_LC1(lb))
783 		{
784 			*to++ = lb;
785 			*to++ = *from & 0xff;
786 			cnt += 2;
787 		}
788 		else if (IS_LC2(lb))
789 		{
790 			*to++ = lb;
791 			*to++ = (*from >> 8) & 0xff;
792 			*to++ = *from & 0xff;
793 			cnt += 3;
794 		}
795 		else if (IS_LCPRV1_A_RANGE(lb))
796 		{
797 			*to++ = LCPRV1_A;
798 			*to++ = lb;
799 			*to++ = *from & 0xff;
800 			cnt += 3;
801 		}
802 		else if (IS_LCPRV1_B_RANGE(lb))
803 		{
804 			*to++ = LCPRV1_B;
805 			*to++ = lb;
806 			*to++ = *from & 0xff;
807 			cnt += 3;
808 		}
809 		else if (IS_LCPRV2_A_RANGE(lb))
810 		{
811 			*to++ = LCPRV2_A;
812 			*to++ = lb;
813 			*to++ = (*from >> 8) & 0xff;
814 			*to++ = *from & 0xff;
815 			cnt += 4;
816 		}
817 		else if (IS_LCPRV2_B_RANGE(lb))
818 		{
819 			*to++ = LCPRV2_B;
820 			*to++ = lb;
821 			*to++ = (*from >> 8) & 0xff;
822 			*to++ = *from & 0xff;
823 			cnt += 4;
824 		}
825 		else
826 		{
827 			*to++ = *from & 0xff;
828 			cnt += 1;
829 		}
830 		from++;
831 		len--;
832 	}
833 	*to = 0;
834 	return cnt;
835 }
836 
/*
 * Byte length of a mule-internal-code character, decided from its first
 * byte: 2 for IS_LC1, 3 for IS_LCPRV1 or IS_LC2, 4 for IS_LCPRV2, and 1
 * otherwise (assumed ASCII).
 *
 * Exported for direct use by conv.c.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s))
		return 3;
	if (IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;
	return 1;					/* assume ASCII */
}
855 
/*
 * Display width of a mule-internal-code character, decided from its first
 * byte: 1 column for IS_LC1 / IS_LCPRV1, 2 columns for IS_LC2 / IS_LCPRV2,
 * and 1 otherwise.  Unlike the other dsplen functions, the fallback does
 * not consult pg_ascii_dsplen().
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	int			len;

	/*
	 * Note: it's not really appropriate to assume that all multibyte charsets
	 * are double-wide on screen.  But this seems an okay approximation for
	 * the MULE charsets we currently support.
	 */

	if (IS_LC1(*s))
		len = 1;
	else if (IS_LCPRV1(*s))
		len = 1;
	else if (IS_LC2(*s))
		len = 2;
	else if (IS_LCPRV2(*s))
		len = 2;
	else
		len = 1;				/* assume ASCII */

	return len;
}
880 
881 /*
882  * ISO8859-1
883  */
884 static int
pg_latin12wchar_with_len(const unsigned char * from,pg_wchar * to,int len)885 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
886 {
887 	int			cnt = 0;
888 
889 	while (len > 0 && *from)
890 	{
891 		*to++ = *from++;
892 		len--;
893 		cnt++;
894 	}
895 	*to = 0;
896 	return cnt;
897 }
898 
899 /*
900  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
901  * high bits.
902  * caller should allocate enough space for "to"
903  * len: length of from.
904  * "from" not necessarily null terminated.
905  */
906 static int
pg_wchar2single_with_len(const pg_wchar * from,unsigned char * to,int len)907 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
908 {
909 	int			cnt = 0;
910 
911 	while (len > 0 && *from)
912 	{
913 		*to++ = *from++;
914 		len--;
915 		cnt++;
916 	}
917 	*to = 0;
918 	return cnt;
919 }
920 
/*
 * Byte length of a single-byte-encoding character: always 1; "s" is not
 * examined.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	return 1;
}
926 
/*
 * Display width of a single-byte-encoding character; uses the same rules as
 * pg_ascii_dsplen().
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	return pg_ascii_dsplen(s);
}
932 
/*
 * SJIS
 */

/*
 * Byte length of an SJIS character, decided from its first byte: 1 for
 * 0xa1..0xdf (single-byte kana), 2 for any other byte with the high bit
 * set, else 1.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;				/* 1 byte kana? */
	if (IS_HIGHBIT_SET(*s))
		return 2;				/* kanji? */
	return 1;					/* should be ASCII */
}
949 
/*
 * Display width of an SJIS character, decided from its first byte: 1 column
 * for 0xa1..0xdf (single-byte kana), 2 for any other high-bit byte, and
 * otherwise the result of pg_ascii_dsplen().
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;				/* 1 byte kana? */
	if (IS_HIGHBIT_SET(*s))
		return 2;				/* kanji? */
	return pg_ascii_dsplen(s);	/* should be ASCII */
}
963 
/*
 * Big5
 */

/*
 * Byte length of a Big5 character: 2 if the first byte has its high bit
 * set, otherwise 1 (should be ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
978 
/*
 * Display width of a Big5 character: 2 columns if the first byte has its
 * high bit set, otherwise the result of pg_ascii_dsplen().
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;
	return pg_ascii_dsplen(s);	/* should be ASCII */
}
990 
/*
 * GBK
 */

/*
 * Byte length of a GBK character: 2 if the first byte has its high bit set,
 * otherwise 1 (should be ASCII).
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1005 
/*
 * Display width of a GBK character: 2 columns if the first byte has its
 * high bit set, otherwise the result of pg_ascii_dsplen().
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;
	return pg_ascii_dsplen(s);	/* should be ASCII */
}
1017 
/*
 * UHC
 */

/*
 * Byte length of a UHC character: 2 if the first byte has its high bit set,
 * otherwise 1 (should be ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1032 
/*
 * Display width of a UHC character: 2 columns if the first byte has its
 * high bit set, otherwise the result of pg_ascii_dsplen().
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;
	return pg_ascii_dsplen(s);	/* should be ASCII */
}
1044 
/*
 * GB18030
 *	Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 */
1049 
/*
 * Byte length of a GB18030 character: 1 for ASCII, 4 if the second byte is
 * in 0x30..0x39, else 2.
 *
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input, so s[1] must be readable whenever s[0] has its high bit set.
 * However, if you only pass the first byte of a multi-byte string, and \0
 * as the second byte, this still works in a predictable way: a 4-byte
 * character will be reported as two 2-byte characters.  That's enough for
 * all current uses, as a client-only encoding.  It works that way because
 * in any valid 4-byte GB18030-encoded character, the third and fourth byte
 * look like a 2-byte encoded character when looked at separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(*s))
		return 1;				/* ASCII */
	if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
		return 4;
	return 2;
}
1073 
/*
 * Display width of a GB18030 character: 2 columns if the first byte has its
 * high bit set, otherwise the result of pg_ascii_dsplen().
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;
	return pg_ascii_dsplen(s);	/* ASCII */
}
1085 
/*
 *-------------------------------------------------------------------
 * multibyte sequence validators
 *
 * These functions accept "s", a pointer to the first byte of a string,
 * and "len", the remaining length of the string.  If there is a validly
 * encoded character beginning at *s, return its length in bytes; else
 * return -1.
 *
 * The functions can assume that len > 0 and that *s != '\0', but they must
 * test for and reject zeroes in any additional bytes of a multibyte
 * character.
 *
 * Note that this definition allows the function for a single-byte
 * encoding to be just "return 1".
 *-------------------------------------------------------------------
 */
1102 
/*
 * Verifier for SQL_ASCII: every byte is accepted as a 1-byte character, so
 * neither "s" nor "len" is examined.
 */
static int
pg_ascii_verifier(const unsigned char *s, int len)
{
	return 1;
}
1108 
/* True if byte value c lies in 0xa1..0xfe; note that c is evaluated twice. */
#define IS_EUC_RANGE_VALID(c)	((c) >= 0xa1 && (c) <= 0xfe)
1110 
1111 static int
pg_eucjp_verifier(const unsigned char * s,int len)1112 pg_eucjp_verifier(const unsigned char *s, int len)
1113 {
1114 	int			l;
1115 	unsigned char c1,
1116 				c2;
1117 
1118 	c1 = *s++;
1119 
1120 	switch (c1)
1121 	{
1122 		case SS2:				/* JIS X 0201 */
1123 			l = 2;
1124 			if (l > len)
1125 				return -1;
1126 			c2 = *s++;
1127 			if (c2 < 0xa1 || c2 > 0xdf)
1128 				return -1;
1129 			break;
1130 
1131 		case SS3:				/* JIS X 0212 */
1132 			l = 3;
1133 			if (l > len)
1134 				return -1;
1135 			c2 = *s++;
1136 			if (!IS_EUC_RANGE_VALID(c2))
1137 				return -1;
1138 			c2 = *s++;
1139 			if (!IS_EUC_RANGE_VALID(c2))
1140 				return -1;
1141 			break;
1142 
1143 		default:
1144 			if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1145 			{
1146 				l = 2;
1147 				if (l > len)
1148 					return -1;
1149 				if (!IS_EUC_RANGE_VALID(c1))
1150 					return -1;
1151 				c2 = *s++;
1152 				if (!IS_EUC_RANGE_VALID(c2))
1153 					return -1;
1154 			}
1155 			else
1156 				/* must be ASCII */
1157 			{
1158 				l = 1;
1159 			}
1160 			break;
1161 	}
1162 
1163 	return l;
1164 }
1165 
/*
 * Verify one EUC_KR character starting at s, with "len" bytes available.
 *
 * A byte without the high bit is accepted as a 1-byte character.  A
 * high-bit lead byte starts a 2-byte character, which is valid only if
 * both bytes are in 0xa1..0xfe and it fits within "len".
 *
 * Returns the character's length in bytes, or -1 if invalid.
 */
static int
pg_euckr_verifier(const unsigned char *s, int len)
{
	int			l;
	unsigned char c1,
				c2;

	c1 = *s++;

	if (IS_HIGHBIT_SET(c1))
	{
		l = 2;
		if (l > len)
			return -1;
		if (!IS_EUC_RANGE_VALID(c1))
			return -1;
		c2 = *s++;
		if (!IS_EUC_RANGE_VALID(c2))
			return -1;
	}
	else
		/* must be ASCII */
	{
		l = 1;
	}

	return l;
}
1194 
/* EUC-CN byte sequences are exactly the same as EUC-KR, so share the verifier */
#define pg_euccn_verifier	pg_euckr_verifier
1197 
1198 static int
pg_euctw_verifier(const unsigned char * s,int len)1199 pg_euctw_verifier(const unsigned char *s, int len)
1200 {
1201 	int			l;
1202 	unsigned char c1,
1203 				c2;
1204 
1205 	c1 = *s++;
1206 
1207 	switch (c1)
1208 	{
1209 		case SS2:				/* CNS 11643 Plane 1-7 */
1210 			l = 4;
1211 			if (l > len)
1212 				return -1;
1213 			c2 = *s++;
1214 			if (c2 < 0xa1 || c2 > 0xa7)
1215 				return -1;
1216 			c2 = *s++;
1217 			if (!IS_EUC_RANGE_VALID(c2))
1218 				return -1;
1219 			c2 = *s++;
1220 			if (!IS_EUC_RANGE_VALID(c2))
1221 				return -1;
1222 			break;
1223 
1224 		case SS3:				/* unused */
1225 			return -1;
1226 
1227 		default:
1228 			if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1229 			{
1230 				l = 2;
1231 				if (l > len)
1232 					return -1;
1233 				/* no further range check on c1? */
1234 				c2 = *s++;
1235 				if (!IS_EUC_RANGE_VALID(c2))
1236 					return -1;
1237 			}
1238 			else
1239 				/* must be ASCII */
1240 			{
1241 				l = 1;
1242 			}
1243 			break;
1244 	}
1245 	return l;
1246 }
1247 
/*
 * Verify one JOHAB character starting at s, with "len" bytes available.
 *
 * The length comes from pg_johab_mblen().  Returns -1 if the character does
 * not fit in "len".  A first byte without the high bit is accepted as is;
 * otherwise every trailing byte must be in 0xa1..0xfe.
 *
 * Returns the character's length in bytes, or -1 if invalid.
 */
static int
pg_johab_verifier(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c;

	l = mbl = pg_johab_mblen(s);

	if (len < l)
		return -1;

	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	/* check each trailing byte of the multibyte character */
	while (--l > 0)
	{
		c = *++s;
		if (!IS_EUC_RANGE_VALID(c))
			return -1;
	}
	return mbl;
}
1271 
/*
 * Verify one mule-internal-code character starting at s, with "len" bytes
 * available.
 *
 * The length comes from pg_mule_mblen().  Returns -1 if the character does
 * not fit in "len" or if any trailing byte lacks the high bit (which also
 * rejects embedded zero bytes); otherwise returns the length in bytes.
 */
static int
pg_mule_verifier(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c;

	l = mbl = pg_mule_mblen(s);

	if (len < l)
		return -1;

	/* check each trailing byte of the multibyte character */
	while (--l > 0)
	{
		c = *++s;
		if (!IS_HIGHBIT_SET(c))
			return -1;
	}
	return mbl;
}
1292 
/*
 * Verifier for single-byte encodings: every byte is accepted as a 1-byte
 * character, so neither "s" nor "len" is examined.
 */
static int
pg_latin1_verifier(const unsigned char *s, int len)
{
	return 1;
}
1298 
/*
 * Verify one SJIS character starting at s, with "len" bytes available.
 *
 * The length comes from pg_sjis_mblen().  Returns -1 if the character does
 * not fit in "len".  One-byte characters are accepted without further
 * checks; for two-byte characters the first byte must satisfy ISSJISHEAD
 * and the second ISSJISTAIL.
 *
 * Returns the character's length in bytes, or -1 if invalid.
 */
static int
pg_sjis_verifier(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c1,
				c2;

	l = mbl = pg_sjis_mblen(s);

	if (len < l)
		return -1;

	if (l == 1)					/* pg_sjis_mblen already verified it */
		return mbl;

	c1 = *s++;
	c2 = *s;
	if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
		return -1;
	return mbl;
}
1321 
/*
 * Verify the BIG5 character starting at s, with len bytes available.
 *
 * Returns the length reported by pg_big5_mblen(), or -1 if fewer than that
 * many bytes are available or a trailing byte is NUL.
 */
static int
pg_big5_verifier(const unsigned char *s, int len)
{
	const int	charlen = pg_big5_mblen(s);
	int			pos;

	if (charlen > len)
		return -1;

	/* the only check on trailing bytes is that none of them is NUL */
	for (pos = 1; pos < charlen; pos++)
	{
		if (s[pos] == '\0')
			return -1;
	}

	return charlen;
}
1341 
/*
 * Verify the GBK character starting at s, with len bytes available.
 *
 * Returns the length reported by pg_gbk_mblen(), or -1 if fewer than that
 * many bytes are available or a trailing byte is NUL.
 */
static int
pg_gbk_verifier(const unsigned char *s, int len)
{
	const int	charlen = pg_gbk_mblen(s);
	int			pos;

	if (charlen > len)
		return -1;

	/* the only check on trailing bytes is that none of them is NUL */
	for (pos = 1; pos < charlen; pos++)
	{
		if (s[pos] == '\0')
			return -1;
	}

	return charlen;
}
1361 
/*
 * Verify the UHC character starting at s, with len bytes available.
 *
 * Returns the length reported by pg_uhc_mblen(), or -1 if fewer than that
 * many bytes are available or a trailing byte is NUL.
 */
static int
pg_uhc_verifier(const unsigned char *s, int len)
{
	const int	charlen = pg_uhc_mblen(s);
	int			pos;

	if (charlen > len)
		return -1;

	/* the only check on trailing bytes is that none of them is NUL */
	for (pos = 1; pos < charlen; pos++)
	{
		if (s[pos] == '\0')
			return -1;
	}

	return charlen;
}
1381 
/*
 * Verify the GB18030 character starting at s, with len bytes available.
 *
 * Returns 1, 2 or 4 for an acceptable character and -1 otherwise.  Bytes
 * past the first are read only after len has been checked to cover them.
 */
static int
pg_gb18030_verifier(const unsigned char *s, int len)
{
	/* ASCII */
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;

	/* a second byte in 0x30..0x39 announces the 4-byte form */
	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		if (s[0] >= 0x81 && s[0] <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	/* otherwise it has to be the 2-byte form */
	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1412 
/*
 * Verify the UTF-8 character starting at s, with len bytes available.
 *
 * Returns the length reported by pg_utf_mblen(), or -1 if fewer than that
 * many bytes are available or pg_utf8_islegal() rejects the sequence.
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	const int	charlen = pg_utf_mblen(s);

	/* the length test must come first: islegal reads charlen bytes */
	if (charlen > len || !pg_utf8_islegal(s, charlen))
		return -1;

	return charlen;
}
1426 
1427 /*
1428  * Check for validity of a single UTF-8 encoded character
1429  *
1430  * This directly implements the rules in RFC3629.  The bizarre-looking
1431  * restrictions on the second byte are meant to ensure that there isn't
1432  * more than one encoding of a given Unicode character point; that is,
1433  * you may not use a longer-than-necessary byte sequence with high order
1434  * zero bits to represent a character that would fit in fewer bytes.
1435  * To do otherwise is to create security hazards (eg, create an apparent
1436  * non-ASCII character that decodes to plain ASCII).
1437  *
1438  * length is assumed to have been obtained by pg_utf_mblen(), and the
1439  * caller must have checked that that many bytes are present in the buffer.
1440  */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	unsigned char second_min = 0x80;
	unsigned char second_max = 0xBF;
	int			idx;

	/* only 1..4 byte sequences are accepted; lengths 5 and 6 are rejected */
	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes (if any) must be plain continuation bytes */
	for (idx = length - 1; idx >= 2; idx--)
	{
		if (source[idx] < 0x80 || source[idx] > 0xBF)
			return false;
	}

	lead = source[0];

	if (length >= 2)
	{
		/*
		 * The permitted range of the second byte narrows for four specific
		 * lead bytes; every other lead byte takes a normal continuation byte.
		 */
		if (lead == 0xE0)
			second_min = 0xA0;
		else if (lead == 0xED)
			second_max = 0x9F;
		else if (lead == 0xF0)
			second_min = 0x90;
		else if (lead == 0xF4)
			second_max = 0x8F;

		if (source[1] < second_min || source[1] > second_max)
			return false;
	}

	/* lead byte: either ASCII, or within 0xC2..0xF4 */
	return lead < 0x80 || (lead >= 0xC2 && lead <= 0xF4);
}
1497 
1498 
1499 /*
1500  *-------------------------------------------------------------------
1501  * encoding info table
1502  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
1503  *-------------------------------------------------------------------
1504  */
1505 const pg_wchar_tbl pg_wchar_table[] = {
1506 	{pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
1507 	{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},	/* PG_EUC_JP */
1508 	{pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2},	/* PG_EUC_CN */
1509 	{pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},	/* PG_EUC_KR */
1510 	{pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4},	/* PG_EUC_TW */
1511 	{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},	/* PG_EUC_JIS_2004 */
1512 	{pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4},	/* PG_UTF8 */
1513 	{pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4},	/* PG_MULE_INTERNAL */
1514 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
1515 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
1516 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
1517 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
1518 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
1519 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
1520 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
1521 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
1522 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
1523 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
1524 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
1525 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
1526 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
1527 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
1528 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
1529 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
1530 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
1531 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
1532 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
1533 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
1534 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
1535 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
1536 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
1537 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
1538 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
1539 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
1540 	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
1541 	{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
1542 	{0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
1543 	{0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},	/* PG_GBK */
1544 	{0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},	/* PG_UHC */
1545 	{0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4},	/* PG_GB18030 */
1546 	{0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3},	/* PG_JOHAB */
1547 	{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}	/* PG_SHIFT_JIS_2004 */
1548 };
1549 
1550 /*
1551  * Returns the byte length of a multibyte character.
1552  *
1553  * Caution: when dealing with text that is not certainly valid in the
1554  * specified encoding, the result may exceed the actual remaining
1555  * string length.  Callers that are not prepared to deal with that
1556  * should use pg_encoding_mblen_bounded() instead.
1557  */
1558 int
pg_encoding_mblen(int encoding,const char * mbstr)1559 pg_encoding_mblen(int encoding, const char *mbstr)
1560 {
1561 	return (PG_VALID_ENCODING(encoding) ?
1562 			pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1563 			pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1564 }
1565 
1566 /*
1567  * Returns the byte length of a multibyte character; but not more than
1568  * the distance to end of string.
1569  */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	int			charlen = pg_encoding_mblen(encoding, mbstr);

	/* strnlen() stops at a terminating NUL, capping the reported length */
	return strnlen(mbstr, charlen);
}
1575 
1576 /*
1577  * Returns the display length of a multibyte character.
1578  */
1579 int
pg_encoding_dsplen(int encoding,const char * mbstr)1580 pg_encoding_dsplen(int encoding, const char *mbstr)
1581 {
1582 	return (PG_VALID_ENCODING(encoding) ?
1583 			pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1584 			pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1585 }
1586 
1587 /*
1588  * Verify the first multibyte character of the given string.
1589  * Return its byte length if good, -1 if bad.  (See comments above for
1590  * full details of the mbverify API.)
1591  */
1592 int
pg_encoding_verifymb(int encoding,const char * mbstr,int len)1593 pg_encoding_verifymb(int encoding, const char *mbstr, int len)
1594 {
1595 	return (PG_VALID_ENCODING(encoding) ?
1596 			pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
1597 			pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
1598 }
1599 
1600 /*
1601  * fetch maximum length of a given encoding
1602  */
1603 int
pg_encoding_max_length(int encoding)1604 pg_encoding_max_length(int encoding)
1605 {
1606 	Assert(PG_VALID_ENCODING(encoding));
1607 
1608 	return pg_wchar_table[encoding].maxmblen;
1609 }
1610