1 /*-------------------------------------------------------------------------
2 *
3 * Utility functions for conversion procs.
4 *
5 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
7 *
8 * IDENTIFICATION
9 * src/backend/utils/mb/conv.c
10 *
11 *-------------------------------------------------------------------------
12 */
13 #include "postgres.h"
14 #include "mb/pg_wchar.h"
15
16
17 /*
18 * local2local: a generic single byte charset encoding
19 * conversion between two ASCII-superset encodings.
20 *
21 * l points to the source string of length len
22 * p is the output area (must be large enough!)
23 * src_encoding is the PG identifier for the source encoding
24 * dest_encoding is the PG identifier for the target encoding
25 * tab holds conversion entries for the source charset
26 * starting from 128 (0x80). each entry in the table holds the corresponding
27 * code point for the target charset, or 0 if there is no equivalent code.
28 */
29 void
local2local(const unsigned char * l,unsigned char * p,int len,int src_encoding,int dest_encoding,const unsigned char * tab)30 local2local(const unsigned char *l,
31 unsigned char *p,
32 int len,
33 int src_encoding,
34 int dest_encoding,
35 const unsigned char *tab)
36 {
37 unsigned char c1,
38 c2;
39
40 while (len > 0)
41 {
42 c1 = *l;
43 if (c1 == 0)
44 report_invalid_encoding(src_encoding, (const char *) l, len);
45 if (!IS_HIGHBIT_SET(c1))
46 *p++ = c1;
47 else
48 {
49 c2 = tab[c1 - HIGHBIT];
50 if (c2)
51 *p++ = c2;
52 else
53 report_untranslatable_char(src_encoding, dest_encoding,
54 (const char *) l, len);
55 }
56 l++;
57 len--;
58 }
59 *p = '\0';
60 }
61
/*
 * LATINn ---> MIC, for charsets whose local codes map directly to MIC.
 *
 * l		source string, len bytes long
 * p		output area (must be large enough!); the result is NUL-terminated
 * lc		mule character set id of the local encoding; written as a
 *			leading byte in front of every high-bit source byte
 * encoding	PG identifier of the local encoding
 *
 * A zero byte in the input is reported through report_invalid_encoding().
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* non-ASCII bytes get the charset id prepended */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}
	*p = '\0';
}
89
90 /*
91 * MIC ---> LATINn when the charset's local codes map directly to MIC
92 *
93 * mic points to the source string of length len
94 * p is the output area (must be large enough!)
95 * lc is the mule character set id for the local encoding
96 * encoding is the PG identifier for the local encoding
97 */
98 void
mic2latin(const unsigned char * mic,unsigned char * p,int len,int lc,int encoding)99 mic2latin(const unsigned char *mic, unsigned char *p, int len,
100 int lc, int encoding)
101 {
102 int c1;
103
104 while (len > 0)
105 {
106 c1 = *mic;
107 if (c1 == 0)
108 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
109 if (!IS_HIGHBIT_SET(c1))
110 {
111 /* easy for ASCII */
112 *p++ = c1;
113 mic++;
114 len--;
115 }
116 else
117 {
118 int l = pg_mic_mblen(mic);
119
120 if (len < l)
121 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
122 len);
123 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
124 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
125 (const char *) mic, len);
126 *p++ = mic[1];
127 mic += 2;
128 len -= 2;
129 }
130 }
131 *p = '\0';
132 }
133
134
135 /*
136 * ASCII ---> MIC
137 *
138 * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
139 * characters, here we must take a hard line because we don't know
140 * the appropriate MIC equivalent.
141 */
142 void
pg_ascii2mic(const unsigned char * l,unsigned char * p,int len)143 pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
144 {
145 int c1;
146
147 while (len > 0)
148 {
149 c1 = *l;
150 if (c1 == 0 || IS_HIGHBIT_SET(c1))
151 report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
152 *p++ = c1;
153 l++;
154 len--;
155 }
156 *p = '\0';
157 }
158
159 /*
160 * MIC ---> ASCII
161 */
162 void
pg_mic2ascii(const unsigned char * mic,unsigned char * p,int len)163 pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
164 {
165 int c1;
166
167 while (len > 0)
168 {
169 c1 = *mic;
170 if (c1 == 0 || IS_HIGHBIT_SET(c1))
171 report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
172 (const char *) mic, len);
173 *p++ = c1;
174 mic++;
175 len--;
176 }
177 *p = '\0';
178 }
179
180 /*
181 * latin2mic_with_table: a generic single byte charset encoding
182 * conversion from a local charset to the mule internal code.
183 *
184 * l points to the source string of length len
185 * p is the output area (must be large enough!)
186 * lc is the mule character set id for the local encoding
187 * encoding is the PG identifier for the local encoding
188 * tab holds conversion entries for the local charset
189 * starting from 128 (0x80). each entry in the table holds the corresponding
190 * code point for the mule encoding, or 0 if there is no equivalent code.
191 */
192 void
latin2mic_with_table(const unsigned char * l,unsigned char * p,int len,int lc,int encoding,const unsigned char * tab)193 latin2mic_with_table(const unsigned char *l,
194 unsigned char *p,
195 int len,
196 int lc,
197 int encoding,
198 const unsigned char *tab)
199 {
200 unsigned char c1,
201 c2;
202
203 while (len > 0)
204 {
205 c1 = *l;
206 if (c1 == 0)
207 report_invalid_encoding(encoding, (const char *) l, len);
208 if (!IS_HIGHBIT_SET(c1))
209 *p++ = c1;
210 else
211 {
212 c2 = tab[c1 - HIGHBIT];
213 if (c2)
214 {
215 *p++ = lc;
216 *p++ = c2;
217 }
218 else
219 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
220 (const char *) l, len);
221 }
222 l++;
223 len--;
224 }
225 *p = '\0';
226 }
227
228 /*
229 * mic2latin_with_table: a generic single byte charset encoding
230 * conversion from the mule internal code to a local charset.
231 *
232 * mic points to the source string of length len
233 * p is the output area (must be large enough!)
234 * lc is the mule character set id for the local encoding
235 * encoding is the PG identifier for the local encoding
236 * tab holds conversion entries for the mule internal code's second byte,
237 * starting from 128 (0x80). each entry in the table holds the corresponding
238 * code point for the local charset, or 0 if there is no equivalent code.
239 */
240 void
mic2latin_with_table(const unsigned char * mic,unsigned char * p,int len,int lc,int encoding,const unsigned char * tab)241 mic2latin_with_table(const unsigned char *mic,
242 unsigned char *p,
243 int len,
244 int lc,
245 int encoding,
246 const unsigned char *tab)
247 {
248 unsigned char c1,
249 c2;
250
251 while (len > 0)
252 {
253 c1 = *mic;
254 if (c1 == 0)
255 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
256 if (!IS_HIGHBIT_SET(c1))
257 {
258 /* easy for ASCII */
259 *p++ = c1;
260 mic++;
261 len--;
262 }
263 else
264 {
265 int l = pg_mic_mblen(mic);
266
267 if (len < l)
268 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
269 len);
270 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
271 (c2 = tab[mic[1] - HIGHBIT]) == 0)
272 {
273 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
274 (const char *) mic, len);
275 break; /* keep compiler quiet */
276 }
277 *p++ = c2;
278 mic += 2;
279 len -= 2;
280 }
281 }
282 *p = '\0';
283 }
284
285 /*
286 * comparison routine for bsearch()
287 * this routine is intended for UTF8 -> local code
288 */
289 static int
compare1(const void * p1,const void * p2)290 compare1(const void *p1, const void *p2)
291 {
292 uint32 v1,
293 v2;
294
295 v1 = *(const uint32 *) p1;
296 v2 = ((const pg_utf_to_local *) p2)->utf;
297 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
298 }
299
300 /*
301 * comparison routine for bsearch()
302 * this routine is intended for local code -> UTF8
303 */
304 static int
compare2(const void * p1,const void * p2)305 compare2(const void *p1, const void *p2)
306 {
307 uint32 v1,
308 v2;
309
310 v1 = *(const uint32 *) p1;
311 v2 = ((const pg_local_to_utf *) p2)->code;
312 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
313 }
314
315 /*
316 * comparison routine for bsearch()
317 * this routine is intended for combined UTF8 -> local code
318 */
319 static int
compare3(const void * p1,const void * p2)320 compare3(const void *p1, const void *p2)
321 {
322 uint32 s1,
323 s2,
324 d1,
325 d2;
326
327 s1 = *(const uint32 *) p1;
328 s2 = *((const uint32 *) p1 + 1);
329 d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330 d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331 return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
332 }
333
334 /*
335 * comparison routine for bsearch()
336 * this routine is intended for local code -> combined UTF8
337 */
338 static int
compare4(const void * p1,const void * p2)339 compare4(const void *p1, const void *p2)
340 {
341 uint32 v1,
342 v2;
343
344 v1 = *(const uint32 *) p1;
345 v2 = ((const pg_local_to_utf_combined *) p2)->code;
346 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
347 }
348
349 /*
350 * store 32bit character representation into multibyte stream
351 */
352 static inline unsigned char *
store_coded_char(unsigned char * dest,uint32 code)353 store_coded_char(unsigned char *dest, uint32 code)
354 {
355 if (code & 0xff000000)
356 *dest++ = code >> 24;
357 if (code & 0x00ff0000)
358 *dest++ = code >> 16;
359 if (code & 0x0000ff00)
360 *dest++ = code >> 8;
361 if (code & 0x000000ff)
362 *dest++ = code;
363 return dest;
364 }
365
366 /*
367 * UTF8 ---> local code
368 *
369 * utf: input string in UTF8 encoding (need not be null-terminated)
370 * len: length of input string (in bytes)
371 * iso: pointer to the output area (must be large enough!)
372 (output string will be null-terminated)
373 * map: conversion map for single characters
374 * mapsize: number of entries in the conversion map
375 * cmap: conversion map for combined characters
376 * (optional, pass NULL if none)
377 * cmapsize: number of entries in the conversion map for combined characters
378 * (optional, pass 0 if none)
379 * conv_func: algorithmic encoding conversion function
380 * (optional, pass NULL if none)
381 * encoding: PG identifier for the local encoding
382 *
383 * For each character, the cmap (if provided) is consulted first; if no match,
384 * the map is consulted next; if still no match, the conv_func (if provided)
385 * is applied. An error is raised if no match is found.
386 *
387 * See pg_wchar.h for more details about the data structures used here.
388 */
389 void
UtfToLocal(const unsigned char * utf,int len,unsigned char * iso,const pg_utf_to_local * map,int mapsize,const pg_utf_to_local_combined * cmap,int cmapsize,utf_local_conversion_func conv_func,int encoding)390 UtfToLocal(const unsigned char *utf, int len,
391 unsigned char *iso,
392 const pg_utf_to_local *map, int mapsize,
393 const pg_utf_to_local_combined *cmap, int cmapsize,
394 utf_local_conversion_func conv_func,
395 int encoding)
396 {
397 uint32 iutf;
398 int l;
399 const pg_utf_to_local *p;
400 const pg_utf_to_local_combined *cp;
401
402 if (!PG_VALID_ENCODING(encoding))
403 ereport(ERROR,
404 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
405 errmsg("invalid encoding number: %d", encoding)));
406
407 for (; len > 0; len -= l)
408 {
409 /* "break" cases all represent errors */
410 if (*utf == '\0')
411 break;
412
413 l = pg_utf_mblen(utf);
414 if (len < l)
415 break;
416
417 if (!pg_utf8_islegal(utf, l))
418 break;
419
420 if (l == 1)
421 {
422 /* ASCII case is easy, assume it's one-to-one conversion */
423 *iso++ = *utf++;
424 continue;
425 }
426
427 /* collect coded char of length l */
428 if (l == 2)
429 {
430 iutf = *utf++ << 8;
431 iutf |= *utf++;
432 }
433 else if (l == 3)
434 {
435 iutf = *utf++ << 16;
436 iutf |= *utf++ << 8;
437 iutf |= *utf++;
438 }
439 else if (l == 4)
440 {
441 iutf = *utf++ << 24;
442 iutf |= *utf++ << 16;
443 iutf |= *utf++ << 8;
444 iutf |= *utf++;
445 }
446 else
447 {
448 elog(ERROR, "unsupported character length %d", l);
449 iutf = 0; /* keep compiler quiet */
450 }
451
452 /* First, try with combined map if possible */
453 if (cmap && len > l)
454 {
455 const unsigned char *utf_save = utf;
456 int len_save = len;
457 int l_save = l;
458
459 /* collect next character, same as above */
460 len -= l;
461
462 l = pg_utf_mblen(utf);
463 if (len < l)
464 break;
465
466 if (!pg_utf8_islegal(utf, l))
467 break;
468
469 /* We assume ASCII character cannot be in combined map */
470 if (l > 1)
471 {
472 uint32 iutf2;
473 uint32 cutf[2];
474
475 if (l == 2)
476 {
477 iutf2 = *utf++ << 8;
478 iutf2 |= *utf++;
479 }
480 else if (l == 3)
481 {
482 iutf2 = *utf++ << 16;
483 iutf2 |= *utf++ << 8;
484 iutf2 |= *utf++;
485 }
486 else if (l == 4)
487 {
488 iutf2 = *utf++ << 24;
489 iutf2 |= *utf++ << 16;
490 iutf2 |= *utf++ << 8;
491 iutf2 |= *utf++;
492 }
493 else
494 {
495 elog(ERROR, "unsupported character length %d", l);
496 iutf2 = 0; /* keep compiler quiet */
497 }
498
499 cutf[0] = iutf;
500 cutf[1] = iutf2;
501
502 cp = bsearch(cutf, cmap, cmapsize,
503 sizeof(pg_utf_to_local_combined), compare3);
504
505 if (cp)
506 {
507 iso = store_coded_char(iso, cp->code);
508 continue;
509 }
510 }
511
512 /* fail, so back up to reprocess second character next time */
513 utf = utf_save;
514 len = len_save;
515 l = l_save;
516 }
517
518 /* Now check ordinary map */
519 p = bsearch(&iutf, map, mapsize,
520 sizeof(pg_utf_to_local), compare1);
521
522 if (p)
523 {
524 iso = store_coded_char(iso, p->code);
525 continue;
526 }
527
528 /* if there's a conversion function, try that */
529 if (conv_func)
530 {
531 uint32 converted = (*conv_func) (iutf);
532
533 if (converted)
534 {
535 iso = store_coded_char(iso, converted);
536 continue;
537 }
538 }
539
540 /* failed to translate this character */
541 report_untranslatable_char(PG_UTF8, encoding,
542 (const char *) (utf - l), len);
543 }
544
545 /* if we broke out of loop early, must be invalid input */
546 if (len > 0)
547 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
548
549 *iso = '\0';
550 }
551
552 /*
553 * local code ---> UTF8
554 *
555 * iso: input string in local encoding (need not be null-terminated)
556 * len: length of input string (in bytes)
557 * utf: pointer to the output area (must be large enough!)
558 (output string will be null-terminated)
559 * map: conversion map for single characters
560 * mapsize: number of entries in the conversion map
561 * cmap: conversion map for combined characters
562 * (optional, pass NULL if none)
563 * cmapsize: number of entries in the conversion map for combined characters
564 * (optional, pass 0 if none)
565 * conv_func: algorithmic encoding conversion function
566 * (optional, pass NULL if none)
567 * encoding: PG identifier for the local encoding
568 *
569 * For each character, the map is consulted first; if no match, the cmap
570 * (if provided) is consulted next; if still no match, the conv_func
571 * (if provided) is applied. An error is raised if no match is found.
572 *
573 * See pg_wchar.h for more details about the data structures used here.
574 */
575 void
LocalToUtf(const unsigned char * iso,int len,unsigned char * utf,const pg_local_to_utf * map,int mapsize,const pg_local_to_utf_combined * cmap,int cmapsize,utf_local_conversion_func conv_func,int encoding)576 LocalToUtf(const unsigned char *iso, int len,
577 unsigned char *utf,
578 const pg_local_to_utf *map, int mapsize,
579 const pg_local_to_utf_combined *cmap, int cmapsize,
580 utf_local_conversion_func conv_func,
581 int encoding)
582 {
583 uint32 iiso;
584 int l;
585 const pg_local_to_utf *p;
586 const pg_local_to_utf_combined *cp;
587
588 if (!PG_VALID_ENCODING(encoding))
589 ereport(ERROR,
590 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
591 errmsg("invalid encoding number: %d", encoding)));
592
593 for (; len > 0; len -= l)
594 {
595 /* "break" cases all represent errors */
596 if (*iso == '\0')
597 break;
598
599 if (!IS_HIGHBIT_SET(*iso))
600 {
601 /* ASCII case is easy, assume it's one-to-one conversion */
602 *utf++ = *iso++;
603 l = 1;
604 continue;
605 }
606
607 l = pg_encoding_verifymb(encoding, (const char *) iso, len);
608 if (l < 0)
609 break;
610
611 /* collect coded char of length l */
612 if (l == 1)
613 iiso = *iso++;
614 else if (l == 2)
615 {
616 iiso = *iso++ << 8;
617 iiso |= *iso++;
618 }
619 else if (l == 3)
620 {
621 iiso = *iso++ << 16;
622 iiso |= *iso++ << 8;
623 iiso |= *iso++;
624 }
625 else if (l == 4)
626 {
627 iiso = *iso++ << 24;
628 iiso |= *iso++ << 16;
629 iiso |= *iso++ << 8;
630 iiso |= *iso++;
631 }
632 else
633 {
634 elog(ERROR, "unsupported character length %d", l);
635 iiso = 0; /* keep compiler quiet */
636 }
637
638 /* First check ordinary map */
639 p = bsearch(&iiso, map, mapsize,
640 sizeof(pg_local_to_utf), compare2);
641
642 if (p)
643 {
644 utf = store_coded_char(utf, p->utf);
645 continue;
646 }
647
648 /* If there's a combined character map, try that */
649 if (cmap)
650 {
651 cp = bsearch(&iiso, cmap, cmapsize,
652 sizeof(pg_local_to_utf_combined), compare4);
653
654 if (cp)
655 {
656 utf = store_coded_char(utf, cp->utf1);
657 utf = store_coded_char(utf, cp->utf2);
658 continue;
659 }
660 }
661
662 /* if there's a conversion function, try that */
663 if (conv_func)
664 {
665 uint32 converted = (*conv_func) (iiso);
666
667 if (converted)
668 {
669 utf = store_coded_char(utf, converted);
670 continue;
671 }
672 }
673
674 /* failed to translate this character */
675 report_untranslatable_char(encoding, PG_UTF8,
676 (const char *) (iso - l), len);
677 }
678
679 /* if we broke out of loop early, must be invalid input */
680 if (len > 0)
681 report_invalid_encoding(encoding, (const char *) iso, len);
682
683 *utf = '\0';
684 }
685