1 /*-------------------------------------------------------------------------
2 *
3 * wchar.c
4 * Functions for working with multibyte characters in various encodings.
5 *
6 * Portions Copyright (c) 1998-2020, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/common/wchar.c
10 *
11 *-------------------------------------------------------------------------
12 */
13 #include "c.h"
14
15 #include "mb/pg_wchar.h"
16
17
18 /*
19 * Operations on multi-byte encodings are driven by a table of helper
20 * functions.
21 *
 * To add support for an encoding, define mblen(), dsplen() and verifier()
 * for the encoding.  For server encodings, also define mb2wchar() and
 * wchar2mb() conversion functions.
25 *
26 * These functions generally assume that their input is validly formed.
27 * The "verifier" functions, further down in the file, have to be more
28 * paranoid.
29 *
30 * We expect that mblen() does not need to examine more than the first byte
31 * of the character to discover the correct length. GB18030 is an exception
32 * to that rule, though, as it also looks at second byte. But even that
33 * behaves in a predictable way, if you only pass the first byte: it will
34 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
35 * good enough for all current uses.
36 *
37 * Note: for the display output of psql to work properly, the return values
38 * of the dsplen functions must conform to the Unicode standard. In particular
39 * the NUL character is zero width and control characters are generally
40 * width -1. It is recommended that non-ASCII encodings refer their ASCII
41 * subset to the ASCII routines to ensure consistency.
42 */
43
44 /*
45 * SQL/ASCII
46 */
47 static int
pg_ascii2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)48 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
49 {
50 int cnt = 0;
51
52 while (len > 0 && *from)
53 {
54 *to++ = *from++;
55 len--;
56 cnt++;
57 }
58 *to = 0;
59 return cnt;
60 }
61
/*
 * Byte length of the character at "s": always 1, the input byte is not
 * examined.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;					/* argument intentionally unused */

	return 1;
}
67
/*
 * Display width of the single-byte character at "s":
 *   0  for NUL,
 *  -1  for bytes below 0x20 and for 0x7f (control characters),
 *   1  for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
78
79 /*
80 * EUC
81 */
82 static int
pg_euc2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)83 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
84 {
85 int cnt = 0;
86
87 while (len > 0 && *from)
88 {
89 if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
90 * KANA") */
91 {
92 from++;
93 *to = (SS2 << 8) | *from++;
94 len -= 2;
95 }
96 else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
97 {
98 from++;
99 *to = (SS3 << 16) | (*from++ << 8);
100 *to |= *from++;
101 len -= 3;
102 }
103 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
104 {
105 *to = *from++ << 8;
106 *to |= *from++;
107 len -= 2;
108 }
109 else /* must be ASCII */
110 {
111 *to = *from++;
112 len--;
113 }
114 to++;
115 cnt++;
116 }
117 *to = 0;
118 return cnt;
119 }
120
121 static inline int
pg_euc_mblen(const unsigned char * s)122 pg_euc_mblen(const unsigned char *s)
123 {
124 int len;
125
126 if (*s == SS2)
127 len = 2;
128 else if (*s == SS3)
129 len = 3;
130 else if (IS_HIGHBIT_SET(*s))
131 len = 2;
132 else
133 len = 1;
134 return len;
135 }
136
137 static inline int
pg_euc_dsplen(const unsigned char * s)138 pg_euc_dsplen(const unsigned char *s)
139 {
140 int len;
141
142 if (*s == SS2)
143 len = 2;
144 else if (*s == SS3)
145 len = 2;
146 else if (IS_HIGHBIT_SET(*s))
147 len = 2;
148 else
149 len = pg_ascii_dsplen(s);
150 return len;
151 }
152
153 /*
154 * EUC_JP
155 */
156 static int
pg_eucjp2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)157 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
158 {
159 return pg_euc2wchar_with_len(from, to, len);
160 }
161
/* Byte length of an EUC_JP character: same rules as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
167
168 static int
pg_eucjp_dsplen(const unsigned char * s)169 pg_eucjp_dsplen(const unsigned char *s)
170 {
171 int len;
172
173 if (*s == SS2)
174 len = 1;
175 else if (*s == SS3)
176 len = 2;
177 else if (IS_HIGHBIT_SET(*s))
178 len = 2;
179 else
180 len = pg_ascii_dsplen(s);
181 return len;
182 }
183
184 /*
185 * EUC_KR
186 */
187 static int
pg_euckr2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)188 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
189 {
190 return pg_euc2wchar_with_len(from, to, len);
191 }
192
/* Byte length of an EUC_KR character: same rules as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
198
/* Display width of an EUC_KR character: same rules as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
204
205 /*
206 * EUC_CN
207 *
208 */
209 static int
pg_euccn2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)210 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
211 {
212 int cnt = 0;
213
214 while (len > 0 && *from)
215 {
216 if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
217 {
218 from++;
219 *to = (SS2 << 16) | (*from++ << 8);
220 *to |= *from++;
221 len -= 3;
222 }
223 else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
224 {
225 from++;
226 *to = (SS3 << 16) | (*from++ << 8);
227 *to |= *from++;
228 len -= 3;
229 }
230 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
231 {
232 *to = *from++ << 8;
233 *to |= *from++;
234 len -= 2;
235 }
236 else
237 {
238 *to = *from++;
239 len--;
240 }
241 to++;
242 cnt++;
243 }
244 *to = 0;
245 return cnt;
246 }
247
/*
 * Byte length of an EUC_CN character: 2 when the first byte has its high
 * bit set, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
259
/*
 * Display width of an EUC_CN character: 2 for a high-bit lead byte,
 * otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
271
272 /*
273 * EUC_TW
274 *
275 */
276 static int
pg_euctw2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)277 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
278 {
279 int cnt = 0;
280
281 while (len > 0 && *from)
282 {
283 if (*from == SS2 && len >= 4) /* code set 2 */
284 {
285 from++;
286 *to = (((uint32) SS2) << 24) | (*from++ << 16);
287 *to |= *from++ << 8;
288 *to |= *from++;
289 len -= 4;
290 }
291 else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
292 {
293 from++;
294 *to = (SS3 << 16) | (*from++ << 8);
295 *to |= *from++;
296 len -= 3;
297 }
298 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
299 {
300 *to = *from++ << 8;
301 *to |= *from++;
302 len -= 2;
303 }
304 else
305 {
306 *to = *from++;
307 len--;
308 }
309 to++;
310 cnt++;
311 }
312 *to = 0;
313 return cnt;
314 }
315
316 static int
pg_euctw_mblen(const unsigned char * s)317 pg_euctw_mblen(const unsigned char *s)
318 {
319 int len;
320
321 if (*s == SS2)
322 len = 4;
323 else if (*s == SS3)
324 len = 3;
325 else if (IS_HIGHBIT_SET(*s))
326 len = 2;
327 else
328 len = 1;
329 return len;
330 }
331
332 static int
pg_euctw_dsplen(const unsigned char * s)333 pg_euctw_dsplen(const unsigned char *s)
334 {
335 int len;
336
337 if (*s == SS2)
338 len = 2;
339 else if (*s == SS3)
340 len = 2;
341 else if (IS_HIGHBIT_SET(*s))
342 len = 2;
343 else
344 len = pg_ascii_dsplen(s);
345 return len;
346 }
347
348 /*
349 * Convert pg_wchar to EUC_* encoding.
350 * caller must allocate enough space for "to", including a trailing zero!
351 * len: length of from.
352 * "from" not necessarily null terminated.
353 */
354 static int
pg_wchar2euc_with_len(const pg_wchar * from,unsigned char * to,int len)355 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
356 {
357 int cnt = 0;
358
359 while (len > 0 && *from)
360 {
361 unsigned char c;
362
363 if ((c = (*from >> 24)))
364 {
365 *to++ = c;
366 *to++ = (*from >> 16) & 0xff;
367 *to++ = (*from >> 8) & 0xff;
368 *to++ = *from & 0xff;
369 cnt += 4;
370 }
371 else if ((c = (*from >> 16)))
372 {
373 *to++ = c;
374 *to++ = (*from >> 8) & 0xff;
375 *to++ = *from & 0xff;
376 cnt += 3;
377 }
378 else if ((c = (*from >> 8)))
379 {
380 *to++ = c;
381 *to++ = *from & 0xff;
382 cnt += 2;
383 }
384 else
385 {
386 *to++ = *from;
387 cnt++;
388 }
389 from++;
390 len--;
391 }
392 *to = 0;
393 return cnt;
394 }
395
396
397 /*
398 * JOHAB
399 */
/* Byte length of a JOHAB character: uses the generic EUC rules. */
static int
pg_johab_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
405
/* Display width of a JOHAB character: uses the generic EUC rules. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
411
412 /*
413 * convert UTF8 string to pg_wchar (UCS-4)
414 * caller must allocate enough space for "to", including a trailing zero!
415 * len: length of from.
416 * "from" not necessarily null terminated.
417 */
418 static int
pg_utf2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)419 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
420 {
421 int cnt = 0;
422 uint32 c1,
423 c2,
424 c3,
425 c4;
426
427 while (len > 0 && *from)
428 {
429 if ((*from & 0x80) == 0)
430 {
431 *to = *from++;
432 len--;
433 }
434 else if ((*from & 0xe0) == 0xc0)
435 {
436 if (len < 2)
437 break; /* drop trailing incomplete char */
438 c1 = *from++ & 0x1f;
439 c2 = *from++ & 0x3f;
440 *to = (c1 << 6) | c2;
441 len -= 2;
442 }
443 else if ((*from & 0xf0) == 0xe0)
444 {
445 if (len < 3)
446 break; /* drop trailing incomplete char */
447 c1 = *from++ & 0x0f;
448 c2 = *from++ & 0x3f;
449 c3 = *from++ & 0x3f;
450 *to = (c1 << 12) | (c2 << 6) | c3;
451 len -= 3;
452 }
453 else if ((*from & 0xf8) == 0xf0)
454 {
455 if (len < 4)
456 break; /* drop trailing incomplete char */
457 c1 = *from++ & 0x07;
458 c2 = *from++ & 0x3f;
459 c3 = *from++ & 0x3f;
460 c4 = *from++ & 0x3f;
461 *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
462 len -= 4;
463 }
464 else
465 {
466 /* treat a bogus char as length 1; not ours to raise error */
467 *to = *from++;
468 len--;
469 }
470 to++;
471 cnt++;
472 }
473 *to = 0;
474 return cnt;
475 }
476
477
478 /*
479 * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
480 * space allocated.
481 */
482 unsigned char *
unicode_to_utf8(pg_wchar c,unsigned char * utf8string)483 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
484 {
485 if (c <= 0x7F)
486 {
487 utf8string[0] = c;
488 }
489 else if (c <= 0x7FF)
490 {
491 utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
492 utf8string[1] = 0x80 | (c & 0x3F);
493 }
494 else if (c <= 0xFFFF)
495 {
496 utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
497 utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
498 utf8string[2] = 0x80 | (c & 0x3F);
499 }
500 else
501 {
502 utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
503 utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
504 utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
505 utf8string[3] = 0x80 | (c & 0x3F);
506 }
507
508 return utf8string;
509 }
510
511 /*
512 * Trivial conversion from pg_wchar to UTF-8.
513 * caller should allocate enough space for "to"
514 * len: length of from.
515 * "from" not necessarily null terminated.
516 */
517 static int
pg_wchar2utf_with_len(const pg_wchar * from,unsigned char * to,int len)518 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
519 {
520 int cnt = 0;
521
522 while (len > 0 && *from)
523 {
524 int char_len;
525
526 unicode_to_utf8(*from, to);
527 char_len = pg_utf_mblen(to);
528 cnt += char_len;
529 to += char_len;
530 from++;
531 len--;
532 }
533 *to = 0;
534 return cnt;
535 }
536
/*
 * Return the byte length of the UTF8 character pointed to by s.
 *
 * Only the first byte is examined.
 *
 * Note: the current implementation does not support UTF8 sequences of
 * more than 4 bytes, so this never returns a value larger than 4.  Any
 * leading byte that is either flat-out illegal or indicates a length
 * larger than we support yields 1.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	/* illegal or unsupported lead byte */
	return 1;
}
571
572 /*
573 * This is an implementation of wcwidth() and wcswidth() as defined in
574 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
575 * <http://www.unix.org/online.html>
576 *
577 * Markus Kuhn -- 2001-09-08 -- public domain
578 *
579 * customised for PostgreSQL
580 *
581 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
582 */
583
/*
 * One closed interval [first, last] of character codes, as stored in the
 * lookup tables searched by mbbisearch().
 */
struct mbinterval
{
	unsigned short first;		/* lowest code in the interval */
	unsigned short last;		/* highest code in the interval */
};
589
590 /* auxiliary function for binary search in interval table */
591 static int
mbbisearch(pg_wchar ucs,const struct mbinterval * table,int max)592 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
593 {
594 int min = 0;
595 int mid;
596
597 if (ucs < table[0].first || ucs > table[max].last)
598 return 0;
599 while (max >= min)
600 {
601 mid = (min + max) / 2;
602 if (ucs > table[mid].last)
603 min = mid + 1;
604 else if (ucs < table[mid].first)
605 max = mid - 1;
606 else
607 return 1;
608 }
609
610 return 0;
611 }
612
613
614 /* The following functions define the column width of an ISO 10646
615 * character as follows:
616 *
617 * - The null character (U+0000) has a column width of 0.
618 *
619 * - Other C0/C1 control characters and DEL will lead to a return
620 * value of -1.
621 *
622 * - Non-spacing and enclosing combining characters (general
623 * category code Mn or Me in the Unicode database) have a
624 * column width of 0.
625 *
626 * - Other format characters (general category code Cf in the Unicode
627 * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
628 *
629 * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
630 * have a column width of 0.
631 *
632 * - Spacing characters in the East Asian Wide (W) or East Asian
633 * FullWidth (F) category as defined in Unicode Technical
634 * Report #11 have a column width of 2.
635 *
636 * - All remaining characters (including all printable
637 * ISO 8859-1 and WGL4 characters, Unicode control characters,
638 * etc.) have a column width of 1.
639 *
640 * This implementation assumes that wchar_t characters are encoded
641 * in ISO 10646.
642 */
643
644 static int
ucs_wcwidth(pg_wchar ucs)645 ucs_wcwidth(pg_wchar ucs)
646 {
647 #include "common/unicode_combining_table.h"
648
649 /* test for 8-bit control characters */
650 if (ucs == 0)
651 return 0;
652
653 if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
654 return -1;
655
656 /* binary search in table of non-spacing characters */
657 if (mbbisearch(ucs, combining,
658 sizeof(combining) / sizeof(struct mbinterval) - 1))
659 return 0;
660
661 /*
662 * if we arrive here, ucs is not a combining or C0/C1 control character
663 */
664
665 return 1 +
666 (ucs >= 0x1100 &&
667 (ucs <= 0x115f || /* Hangul Jamo init. consonants */
668 (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
669 ucs != 0x303f) || /* CJK ... Yi */
670 (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
671 (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
672 * Ideographs */
673 (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
674 (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
675 (ucs >= 0xffe0 && ucs <= 0xffe6) ||
676 (ucs >= 0x20000 && ucs <= 0x2ffff)));
677 }
678
679 /*
680 * Convert a UTF-8 character to a Unicode code point.
681 * This is a one-character version of pg_utf2wchar_with_len.
682 *
683 * No error checks here, c must point to a long-enough string.
684 */
685 pg_wchar
utf8_to_unicode(const unsigned char * c)686 utf8_to_unicode(const unsigned char *c)
687 {
688 if ((*c & 0x80) == 0)
689 return (pg_wchar) c[0];
690 else if ((*c & 0xe0) == 0xc0)
691 return (pg_wchar) (((c[0] & 0x1f) << 6) |
692 (c[1] & 0x3f));
693 else if ((*c & 0xf0) == 0xe0)
694 return (pg_wchar) (((c[0] & 0x0f) << 12) |
695 ((c[1] & 0x3f) << 6) |
696 (c[2] & 0x3f));
697 else if ((*c & 0xf8) == 0xf0)
698 return (pg_wchar) (((c[0] & 0x07) << 18) |
699 ((c[1] & 0x3f) << 12) |
700 ((c[2] & 0x3f) << 6) |
701 (c[3] & 0x3f));
702 else
703 /* that is an invalid code on purpose */
704 return 0xffffffff;
705 }
706
707 static int
pg_utf_dsplen(const unsigned char * s)708 pg_utf_dsplen(const unsigned char *s)
709 {
710 return ucs_wcwidth(utf8_to_unicode(s));
711 }
712
713 /*
714 * convert mule internal code to pg_wchar
715 * caller should allocate enough space for "to"
716 * len: length of from.
717 * "from" not necessarily null terminated.
718 */
719 static int
pg_mule2wchar_with_len(const unsigned char * from,pg_wchar * to,int len)720 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
721 {
722 int cnt = 0;
723
724 while (len > 0 && *from)
725 {
726 if (IS_LC1(*from) && len >= 2)
727 {
728 *to = *from++ << 16;
729 *to |= *from++;
730 len -= 2;
731 }
732 else if (IS_LCPRV1(*from) && len >= 3)
733 {
734 from++;
735 *to = *from++ << 16;
736 *to |= *from++;
737 len -= 3;
738 }
739 else if (IS_LC2(*from) && len >= 3)
740 {
741 *to = *from++ << 16;
742 *to |= *from++ << 8;
743 *to |= *from++;
744 len -= 3;
745 }
746 else if (IS_LCPRV2(*from) && len >= 4)
747 {
748 from++;
749 *to = *from++ << 16;
750 *to |= *from++ << 8;
751 *to |= *from++;
752 len -= 4;
753 }
754 else
755 { /* assume ASCII */
756 *to = (unsigned char) *from++;
757 len--;
758 }
759 to++;
760 cnt++;
761 }
762 *to = 0;
763 return cnt;
764 }
765
766 /*
767 * convert pg_wchar to mule internal code
768 * caller should allocate enough space for "to"
769 * len: length of from.
770 * "from" not necessarily null terminated.
771 */
772 static int
pg_wchar2mule_with_len(const pg_wchar * from,unsigned char * to,int len)773 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
774 {
775 int cnt = 0;
776
777 while (len > 0 && *from)
778 {
779 unsigned char lb;
780
781 lb = (*from >> 16) & 0xff;
782 if (IS_LC1(lb))
783 {
784 *to++ = lb;
785 *to++ = *from & 0xff;
786 cnt += 2;
787 }
788 else if (IS_LC2(lb))
789 {
790 *to++ = lb;
791 *to++ = (*from >> 8) & 0xff;
792 *to++ = *from & 0xff;
793 cnt += 3;
794 }
795 else if (IS_LCPRV1_A_RANGE(lb))
796 {
797 *to++ = LCPRV1_A;
798 *to++ = lb;
799 *to++ = *from & 0xff;
800 cnt += 3;
801 }
802 else if (IS_LCPRV1_B_RANGE(lb))
803 {
804 *to++ = LCPRV1_B;
805 *to++ = lb;
806 *to++ = *from & 0xff;
807 cnt += 3;
808 }
809 else if (IS_LCPRV2_A_RANGE(lb))
810 {
811 *to++ = LCPRV2_A;
812 *to++ = lb;
813 *to++ = (*from >> 8) & 0xff;
814 *to++ = *from & 0xff;
815 cnt += 4;
816 }
817 else if (IS_LCPRV2_B_RANGE(lb))
818 {
819 *to++ = LCPRV2_B;
820 *to++ = lb;
821 *to++ = (*from >> 8) & 0xff;
822 *to++ = *from & 0xff;
823 cnt += 4;
824 }
825 else
826 {
827 *to++ = *from & 0xff;
828 cnt += 1;
829 }
830 from++;
831 len--;
832 }
833 *to = 0;
834 return cnt;
835 }
836
/*
 * Byte length of a mule internal code character, decided from its first
 * byte only.  Exported for direct use by conv.c.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s) || IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;

	return 1;					/* assume ASCII */
}
855
/*
 * Display width of a mule internal code character.
 *
 * Note: it's not really appropriate to assume that all multibyte charsets
 * are double-wide on screen.  But this seems an okay approximation for
 * the MULE charsets we currently support.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	if (IS_LC1(*s) || IS_LCPRV1(*s))
		return 1;
	if (IS_LC2(*s) || IS_LCPRV2(*s))
		return 2;

	return 1;					/* assume ASCII */
}
880
881 /*
882 * ISO8859-1
883 */
884 static int
pg_latin12wchar_with_len(const unsigned char * from,pg_wchar * to,int len)885 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
886 {
887 int cnt = 0;
888
889 while (len > 0 && *from)
890 {
891 *to++ = *from++;
892 len--;
893 cnt++;
894 }
895 *to = 0;
896 return cnt;
897 }
898
899 /*
900 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
901 * high bits.
902 * caller should allocate enough space for "to"
903 * len: length of from.
904 * "from" not necessarily null terminated.
905 */
906 static int
pg_wchar2single_with_len(const pg_wchar * from,unsigned char * to,int len)907 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
908 {
909 int cnt = 0;
910
911 while (len > 0 && *from)
912 {
913 *to++ = *from++;
914 len--;
915 cnt++;
916 }
917 *to = 0;
918 return cnt;
919 }
920
/*
 * Byte length of the character at "s": always 1, the input byte is not
 * examined.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;					/* argument intentionally unused */

	return 1;
}
926
/* Display width of a single-byte character: same rules as for ASCII. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
932
933 /*
934 * SJIS
935 */
/*
 * Byte length of an SJIS character, decided from its first byte only:
 * 0xa1..0xdf -> 1, any other high-bit byte -> 2, otherwise 1.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;				/* 1 byte kana? */

	/* kanji if the high bit is set, otherwise should be ASCII */
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
949
/*
 * Display width of an SJIS character: 1 for a first byte in 0xa1..0xdf,
 * 2 for any other high-bit first byte, otherwise whatever
 * pg_ascii_dsplen() reports.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;				/* 1 byte kana? */
	if (IS_HIGHBIT_SET(*s))
		return 2;				/* kanji? */

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
963
964 /*
965 * Big5
966 */
/*
 * Byte length of a Big5 character: 2 when the first byte has its high bit
 * set, otherwise 1 (should be ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
978
/*
 * Display width of a Big5 character: 2 for a high-bit first byte,
 * otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
990
991 /*
992 * GBK
993 */
/*
 * Byte length of a GBK character: 2 when the first byte has its high bit
 * set, otherwise 1 (should be ASCII).
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1005
/*
 * Display width of a GBK character: 2 for a high-bit first byte,
 * otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
1017
1018 /*
1019 * UHC
1020 */
/*
 * Byte length of a UHC character: 2 when the first byte has its high bit
 * set, otherwise 1 (should be ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1032
/*
 * Display width of a UHC character: 2 for a high-bit first byte,
 * otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
1044
1045 /*
1046 * GB18030
1047 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1048 */
1049
/*
 * Byte length of a GB18030 character.
 *
 * Unlike all other mblen() functions, this one also looks at the second
 * byte of the input.  However, if you pass only the first byte of a
 * multi-byte string, with \0 as the second byte, it still behaves
 * predictably: a 4-byte character is reported as two 2-byte characters.
 * That is enough for all current uses, as a client-only encoding.  It
 * works because in any valid 4-byte GB18030-encoded character, the third
 * and fourth bytes look like a 2-byte encoded character when looked at
 * separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	/* a second byte in 0x30..0x39 marks the 4-byte form */
	if (s[1] >= 0x30 && s[1] <= 0x39)
		return 4;

	return 2;
}
1073
/*
 * Display width of a GB18030 character: 2 for a high-bit first byte,
 * otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1085
1086 /*
1087 *-------------------------------------------------------------------
1088 * multibyte sequence validators
1089 *
1090 * These functions accept "s", a pointer to the first byte of a string,
1091 * and "len", the remaining length of the string. If there is a validly
1092 * encoded character beginning at *s, return its length in bytes; else
1093 * return -1.
1094 *
1095 * The functions can assume that len > 0 and that *s != '\0', but they must
1096 * test for and reject zeroes in any additional bytes of a multibyte character.
1097 *
1098 * Note that this definition allows the function for a single-byte
1099 * encoding to be just "return 1".
1100 *-------------------------------------------------------------------
1101 */
1102
/*
 * Verifier for a single-byte encoding: always reports a valid character
 * of length 1.  Neither argument is examined.
 */
static int
pg_ascii_verifier(const unsigned char *s, int len)
{
	(void) s;					/* arguments intentionally unused */
	(void) len;

	return 1;
}
1108
/*
 * True if byte "c" lies in 0xa1..0xfe, the range the EUC verifiers accept
 * for the bytes of a multibyte character.  Evaluates "c" twice.
 */
#define IS_EUC_RANGE_VALID(c)	((c) >= 0xa1 && (c) <= 0xfe)
1110
1111 static int
pg_eucjp_verifier(const unsigned char * s,int len)1112 pg_eucjp_verifier(const unsigned char *s, int len)
1113 {
1114 int l;
1115 unsigned char c1,
1116 c2;
1117
1118 c1 = *s++;
1119
1120 switch (c1)
1121 {
1122 case SS2: /* JIS X 0201 */
1123 l = 2;
1124 if (l > len)
1125 return -1;
1126 c2 = *s++;
1127 if (c2 < 0xa1 || c2 > 0xdf)
1128 return -1;
1129 break;
1130
1131 case SS3: /* JIS X 0212 */
1132 l = 3;
1133 if (l > len)
1134 return -1;
1135 c2 = *s++;
1136 if (!IS_EUC_RANGE_VALID(c2))
1137 return -1;
1138 c2 = *s++;
1139 if (!IS_EUC_RANGE_VALID(c2))
1140 return -1;
1141 break;
1142
1143 default:
1144 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1145 {
1146 l = 2;
1147 if (l > len)
1148 return -1;
1149 if (!IS_EUC_RANGE_VALID(c1))
1150 return -1;
1151 c2 = *s++;
1152 if (!IS_EUC_RANGE_VALID(c2))
1153 return -1;
1154 }
1155 else
1156 /* must be ASCII */
1157 {
1158 l = 1;
1159 }
1160 break;
1161 }
1162
1163 return l;
1164 }
1165
/*
 * Verify one EUC_KR character starting at "s", with "len" bytes available.
 * Returns the character's length in bytes (1 or 2), or -1 if it is
 * truncated or contains an out-of-range byte.
 */
static int
pg_euckr_verifier(const unsigned char *s, int len)
{
	const unsigned char lead = s[0];

	/* must be ASCII */
	if (!IS_HIGHBIT_SET(lead))
		return 1;

	/* two-byte character: both bytes must be in the valid EUC range */
	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(lead) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;
	return 2;
}

/* EUC-CN byte sequences are exactly the same as EUC-KR */
#define pg_euccn_verifier	pg_euckr_verifier
1197
1198 static int
pg_euctw_verifier(const unsigned char * s,int len)1199 pg_euctw_verifier(const unsigned char *s, int len)
1200 {
1201 int l;
1202 unsigned char c1,
1203 c2;
1204
1205 c1 = *s++;
1206
1207 switch (c1)
1208 {
1209 case SS2: /* CNS 11643 Plane 1-7 */
1210 l = 4;
1211 if (l > len)
1212 return -1;
1213 c2 = *s++;
1214 if (c2 < 0xa1 || c2 > 0xa7)
1215 return -1;
1216 c2 = *s++;
1217 if (!IS_EUC_RANGE_VALID(c2))
1218 return -1;
1219 c2 = *s++;
1220 if (!IS_EUC_RANGE_VALID(c2))
1221 return -1;
1222 break;
1223
1224 case SS3: /* unused */
1225 return -1;
1226
1227 default:
1228 if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1229 {
1230 l = 2;
1231 if (l > len)
1232 return -1;
1233 /* no further range check on c1? */
1234 c2 = *s++;
1235 if (!IS_EUC_RANGE_VALID(c2))
1236 return -1;
1237 }
1238 else
1239 /* must be ASCII */
1240 {
1241 l = 1;
1242 }
1243 break;
1244 }
1245 return l;
1246 }
1247
/*
 * Verify one JOHAB character starting at "s", with "len" bytes available.
 * Returns the character's length in bytes, or -1 if it is truncated or a
 * trailing byte is outside the valid EUC range.
 */
static int
pg_johab_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* a first byte without the high bit needs no further checks */
	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	/* every byte after the first must be in the valid EUC range */
	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1271
/*
 * Verify one mule internal code character starting at "s", with "len"
 * bytes available.  Returns the character's length in bytes, or -1 if it
 * is truncated or a trailing byte lacks the high bit.
 */
static int
pg_mule_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* every byte after the first must have its high bit set */
	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}

	return mbl;
}
1292
/*
 * Verifier for a single-byte encoding: always reports a valid character
 * of length 1.  Neither argument is examined.
 */
static int
pg_latin1_verifier(const unsigned char *s, int len)
{
	(void) s;					/* arguments intentionally unused */
	(void) len;

	return 1;
}
1298
/*
 * Verify one SJIS character starting at "s", with "len" bytes available.
 * Returns the character's length in bytes, or -1 if it is truncated or
 * its two bytes fail the ISSJISHEAD/ISSJISTAIL checks.
 */
static int
pg_sjis_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char head;
	unsigned char tail;

	if (len < mbl)
		return -1;

	if (mbl == 1)				/* pg_sjis_mblen already verified it */
		return mbl;

	head = s[0];
	tail = s[1];
	if (ISSJISHEAD(head) && ISSJISTAIL(tail))
		return mbl;

	return -1;
}
1321
/*
 * Verify one Big5 character starting at "s", with "len" bytes available.
 * Returns the character's length in bytes, or -1 if it is truncated or
 * has a zero byte after the first byte.
 */
static int
pg_big5_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* reject embedded zeroes in the trailing bytes */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1341
/*
 * Verify one GBK character starting at "s", with "len" bytes available.
 * Returns the character's length in bytes, or -1 if it is truncated or
 * has a zero byte after the first byte.
 */
static int
pg_gbk_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* reject embedded zeroes in the trailing bytes */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1361
/*
 * Verify one UHC character starting at "s", with "len" bytes available.
 * Returns the character's length in bytes, or -1 if it is truncated or
 * has a zero byte after the first byte.
 */
static int
pg_uhc_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_uhc_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* reject embedded zeroes in the trailing bytes */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1381
/*
 * Verify one GB18030 character starting at "s", with "len" bytes
 * available.  Returns 1, 2 or 4 for a valid character, or -1 otherwise.
 *
 * Bytes after the first are only read once "len" shows they are present.
 */
static int
pg_gb18030_verifier(const unsigned char *s, int len)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (s[0] >= 0x81 && s[0] <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		/* Should be 2-byte, validate */
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1412
/*
 * Verify one UTF-8 character starting at "s", with "len" bytes available.
 * Returns the character's length in bytes, or -1 if it is truncated or
 * rejected by pg_utf8_islegal().
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_utf_mblen(s);

	/* the length check must come first: islegal reads mbl bytes */
	if (len < mbl || !pg_utf8_islegal(s, mbl))
		return -1;

	return mbl;
}
1426
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This follows the rules of RFC 3629.  The tighter limits on the second
 * byte after certain lead bytes exist so that each Unicode code point has
 * exactly one encoding: overlong forms (extra leading zero bits), the
 * surrogate range, and values past U+10FFFF are all rejected.  Accepting
 * overlong forms would be a security hazard, since an apparently non-ASCII
 * sequence could decode to plain ASCII.
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Only the first "length" bytes of source are examined.  Any length outside
 * 1..4 is rejected.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			i;

	/* lengths 5 and 6 (and anything else out of range) are rejected for now */
	if (length < 1 || length > 4)
		return false;

	lead = source[0];

	/* third and fourth bytes, when present, are plain continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	/* the second byte's permitted range depends on the lead byte */
	if (length >= 2)
	{
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* finally, the lead byte itself: no 0x80..0xC1, nothing above 0xF4 */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
1497
1498
1499 /*
1500 *-------------------------------------------------------------------
1501 * encoding info table
1502 * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
1503 *-------------------------------------------------------------------
1504 */
1505 const pg_wchar_tbl pg_wchar_table[] = {
1506 {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
1507 {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */
1508 {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */
1509 {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */
1510 {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */
1511 {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */
1512 {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */
1513 {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */
1514 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
1515 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
1516 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
1517 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
1518 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
1519 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
1520 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
1521 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
1522 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
1523 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
1524 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
1525 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
1526 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
1527 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
1528 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
1529 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
1530 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
1531 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
1532 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
1533 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
1534 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
1535 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
1536 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
1537 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
1538 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
1539 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
1540 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
1541 {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
1542 {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
1543 {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */
1544 {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */
1545 {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */
1546 {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */
1547 {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
1548 };
1549
1550 /*
1551 * Returns the byte length of a multibyte character.
1552 *
1553 * Caution: when dealing with text that is not certainly valid in the
1554 * specified encoding, the result may exceed the actual remaining
1555 * string length. Callers that are not prepared to deal with that
1556 * should use pg_encoding_mblen_bounded() instead.
1557 */
1558 int
pg_encoding_mblen(int encoding,const char * mbstr)1559 pg_encoding_mblen(int encoding, const char *mbstr)
1560 {
1561 return (PG_VALID_ENCODING(encoding) ?
1562 pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1563 pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1564 }
1565
/*
 * Returns the byte length of a multibyte character; but not more than
 * the distance to end of string.
 *
 * The nominal length from pg_encoding_mblen() is used as the upper bound
 * for strnlen(), so the result stops at the terminating NUL if that comes
 * first.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	int			nominal = pg_encoding_mblen(encoding, mbstr);

	return strnlen(mbstr, nominal);
}
1575
1576 /*
1577 * Returns the display length of a multibyte character.
1578 */
1579 int
pg_encoding_dsplen(int encoding,const char * mbstr)1580 pg_encoding_dsplen(int encoding, const char *mbstr)
1581 {
1582 return (PG_VALID_ENCODING(encoding) ?
1583 pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1584 pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1585 }
1586
1587 /*
1588 * Verify the first multibyte character of the given string.
1589 * Return its byte length if good, -1 if bad. (See comments above for
1590 * full details of the mbverify API.)
1591 */
1592 int
pg_encoding_verifymb(int encoding,const char * mbstr,int len)1593 pg_encoding_verifymb(int encoding, const char *mbstr, int len)
1594 {
1595 return (PG_VALID_ENCODING(encoding) ?
1596 pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
1597 pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
1598 }
1599
1600 /*
1601 * fetch maximum length of a given encoding
1602 */
1603 int
pg_encoding_max_length(int encoding)1604 pg_encoding_max_length(int encoding)
1605 {
1606 Assert(PG_VALID_ENCODING(encoding));
1607
1608 return pg_wchar_table[encoding].maxmblen;
1609 }
1610