1 /*-------------------------------------------------------------------------
2 *
3 * mbutils.c
4 * This file contains functions for encoding conversion.
5 *
6 * The string-conversion functions in this file share some API quirks.
7 * Note the following:
8 *
9 * The functions return a palloc'd, null-terminated string if conversion
10 * is required. However, if no conversion is performed, the given source
11 * string pointer is returned as-is.
12 *
13 * Although the presence of a length argument means that callers can pass
14 * non-null-terminated strings, care is required because the same string
15 * will be passed back if no conversion occurs. Such callers *must* check
16 * whether result == src and handle that case differently.
17 *
18 * If the source and destination encodings are the same, the source string
19 * is returned without any verification; it's assumed to be valid data.
20 * If that might not be the case, the caller is responsible for validating
21 * the string using a separate call to pg_verify_mbstr(). Whenever the
22 * source and destination encodings are different, the functions ensure that
23 * the result is validly encoded according to the destination encoding.
24 *
25 *
26 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
27 * Portions Copyright (c) 1994, Regents of the University of California
28 *
29 *
30 * IDENTIFICATION
31 * src/backend/utils/mb/mbutils.c
32 *
33 *-------------------------------------------------------------------------
 */
35 #include "postgres.h"
36
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/builtins.h"
41 #include "utils/memutils.h"
42 #include "utils/syscache.h"
43
44 /*
45 * We maintain a simple linked list caching the fmgr lookup info for the
46 * currently selected conversion functions, as well as any that have been
47 * selected previously in the current session. (We remember previous
48 * settings because we must be able to restore a previous setting during
49 * transaction rollback, without doing any fresh catalog accesses.)
50 *
51 * Since we'll never release this data, we just keep it in TopMemoryContext.
52 */
53 typedef struct ConvProcInfo
54 {
55 int s_encoding; /* server and client encoding IDs */
56 int c_encoding;
57 FmgrInfo to_server_info; /* lookup info for conversion procs */
58 FmgrInfo to_client_info;
59 } ConvProcInfo;
TDBZIP(PZIPDEF tdp)60
61 static List *ConvProcList = NIL; /* List of ConvProcInfo */
62
63 /*
64 * These variables point to the currently active conversion functions,
65 * or are NULL when no conversion is needed.
66 */
67 static FmgrInfo *ToServerConvProc = NULL;
68 static FmgrInfo *ToClientConvProc = NULL;
69
70 /*
MakeCol(PGLOBAL g,PCOLDEF cdp,PCOL cprec,int n)71 * This variable stores the conversion function to convert from UTF-8
72 * to the server encoding. It's NULL if the server encoding *is* UTF-8,
73 * or if we lack a conversion function for this.
74 */
75 static FmgrInfo *Utf8ToServerConvProc = NULL;
76
77 /*
78 * These variables track the currently-selected encodings.
79 */
80 static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
open(PGLOBAL g,const char * fn)81 static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
82 static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
83
84 /*
85 * During backend startup we can't set client encoding because we (a)
86 * can't look up the conversion functions, and (b) may not know the database
87 * encoding yet either. So SetClientEncoding() just accepts anything and
88 * remembers it for InitializeClientEncoding() to apply later.
89 */
90 static bool backend_startup_complete = false;
91 static int pending_client_encoding = PG_SQL_ASCII;
92
93
94 /* Internal functions */
95 static char *perform_default_encoding_conversion(const char *src,
close()96 int len, bool is_client_to_server);
97 static int cliplen(const char *str, int len, int limit);
98
99
100 /*
101 * Prepare for a future call to SetClientEncoding. Success should mean
102 * that SetClientEncoding is guaranteed to succeed for this encoding request.
103 *
104 * (But note that success before backend_startup_complete does not guarantee
105 * success after ...)
106 *
107 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
Cardinality(PGLOBAL g)108 */
109 int
110 PrepareClientEncoding(int encoding)
111 {
112 int current_server_encoding;
113 ListCell *lc;
114
115 if (!PG_VALID_FE_ENCODING(encoding))
116 return -1;
117
118 /* Can't do anything during startup, per notes above */
119 if (!backend_startup_complete)
120 return 0;
121
122 current_server_encoding = GetDatabaseEncoding();
123
124 /*
125 * Check for cases that require no conversion function.
126 */
127 if (current_server_encoding == encoding ||
128 current_server_encoding == PG_SQL_ASCII ||
129 encoding == PG_SQL_ASCII)
130 return 0;
131
132 if (IsTransactionState())
133 {
134 /*
135 * If we're in a live transaction, it's safe to access the catalogs,
136 * so look up the functions. We repeat the lookup even if the info is
137 * already cached, so that we can react to changes in the contents of
138 * pg_conversion.
139 */
140 Oid to_server_proc,
141 to_client_proc;
142 ConvProcInfo *convinfo;
143 MemoryContext oldcontext;
144
145 to_server_proc = FindDefaultConversionProc(encoding,
146 current_server_encoding);
147 if (!OidIsValid(to_server_proc))
148 return -1;
149 to_client_proc = FindDefaultConversionProc(current_server_encoding,
150 encoding);
151 if (!OidIsValid(to_client_proc))
152 return -1;
153
154 /*
155 * Load the fmgr info into TopMemoryContext (could still fail here)
156 */
157 convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
158 sizeof(ConvProcInfo));
159 convinfo->s_encoding = current_server_encoding;
160 convinfo->c_encoding = encoding;
161 fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
162 TopMemoryContext);
163 fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
164 TopMemoryContext);
165
166 /* Attach new info to head of list */
167 oldcontext = MemoryContextSwitchTo(TopMemoryContext);
168 ConvProcList = lcons(convinfo, ConvProcList);
169 MemoryContextSwitchTo(oldcontext);
170
171 /*
172 * We cannot yet remove any older entry for the same encoding pair,
173 * since it could still be in use. SetClientEncoding will clean up.
174 */
175
176 return 0; /* success */
177 }
178 else
179 {
180 /*
181 * If we're not in a live transaction, the only thing we can do is
182 * restore a previous setting using the cache. This covers all
183 * transaction-rollback cases. The only case it might not work for is
184 * trying to change client_encoding on the fly by editing
185 * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
186 * thing to do anyway.
187 */
188 foreach(lc, ConvProcList)
189 {
190 ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
191
192 if (oldinfo->s_encoding == current_server_encoding &&
193 oldinfo->c_encoding == encoding)
194 return 0;
195 }
196
197 return -1; /* it's not cached, so fail */
198 }
199 }
200
201 /*
202 * Set the active client encoding and set up the conversion-function pointers.
203 * PrepareClientEncoding should have been called previously for this encoding.
204 *
205 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
206 */
ZIPCOL(PCOLDEF cdp,PTDB tdbp,PCOL cprec,int i,PCSZ am)207 int
208 SetClientEncoding(int encoding)
209 {
210 int current_server_encoding;
211 bool found;
212 ListCell *lc;
213
214 if (!PG_VALID_FE_ENCODING(encoding))
215 return -1;
216
217 /* Can't do anything during startup, per notes above */
218 if (!backend_startup_complete)
219 {
220 pending_client_encoding = encoding;
221 return 0;
222 }
223
224 current_server_encoding = GetDatabaseEncoding();
225
226 /*
227 * Check for cases that require no conversion function.
228 */
229 if (current_server_encoding == encoding ||
230 current_server_encoding == PG_SQL_ASCII ||
231 encoding == PG_SQL_ASCII)
232 {
233 ClientEncoding = &pg_enc2name_tbl[encoding];
234 ToServerConvProc = NULL;
235 ToClientConvProc = NULL;
236 return 0;
237 }
238
239 /*
240 * Search the cache for the entry previously prepared by
241 * PrepareClientEncoding; if there isn't one, we lose. While at it,
242 * release any duplicate entries so that repeated Prepare/Set cycles don't
243 * leak memory.
244 */
245 found = false;
246 foreach(lc, ConvProcList)
247 {
248 ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
249
250 if (convinfo->s_encoding == current_server_encoding &&
251 convinfo->c_encoding == encoding)
252 {
253 if (!found)
254 {
255 /* Found newest entry, so set up */
256 ClientEncoding = &pg_enc2name_tbl[encoding];
257 ToServerConvProc = &convinfo->to_server_info;
258 ToClientConvProc = &convinfo->to_client_info;
259 found = true;
260 }
261 else
262 {
263 /* Duplicate entry, release it */
264 ConvProcList = foreach_delete_current(ConvProcList, lc);
265 pfree(convinfo);
266 }
267 }
268 }
269
270 if (found)
271 return 0; /* success */
272 else
273 return -1; /* it's not cached, so fail */
274 }
275
276 /*
277 * Initialize client encoding conversions.
278 * Called from InitPostgres() once during backend startup.
279 */
280 void
281 InitializeClientEncoding(void)
282 {
283 int current_server_encoding;
284
285 Assert(!backend_startup_complete);
286 backend_startup_complete = true;
287
288 if (PrepareClientEncoding(pending_client_encoding) < 0 ||
289 SetClientEncoding(pending_client_encoding) < 0)
290 {
291 /*
292 * Oops, the requested conversion is not available. We couldn't fail
293 * before, but we can now.
294 */
295 ereport(FATAL,
296 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
297 errmsg("conversion between %s and %s is not supported",
298 pg_enc2name_tbl[pending_client_encoding].name,
299 GetDatabaseEncodingName())));
300 }
301
302 /*
303 * Also look up the UTF8-to-server conversion function if needed. Since
304 * the server encoding is fixed within any one backend process, we don't
305 * have to do this more than once.
306 */
307 current_server_encoding = GetDatabaseEncoding();
308 if (current_server_encoding != PG_UTF8 &&
309 current_server_encoding != PG_SQL_ASCII)
310 {
311 Oid utf8_to_server_proc;
312
313 Assert(IsTransactionState());
314 utf8_to_server_proc =
315 FindDefaultConversionProc(PG_UTF8,
316 current_server_encoding);
317 /* If there's no such conversion, just leave the pointer as NULL */
318 if (OidIsValid(utf8_to_server_proc))
319 {
320 FmgrInfo *finfo;
321
322 finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
323 sizeof(FmgrInfo));
324 fmgr_info_cxt(utf8_to_server_proc, finfo,
325 TopMemoryContext);
326 /* Set Utf8ToServerConvProc only after data is fully valid */
327 Utf8ToServerConvProc = finfo;
328 }
329 }
330 }
331
332 /*
333 * returns the current client encoding
334 */
335 int
336 pg_get_client_encoding(void)
337 {
338 return ClientEncoding->encoding;
339 }
340
341 /*
342 * returns the current client encoding name
343 */
344 const char *
345 pg_get_client_encoding_name(void)
346 {
347 return ClientEncoding->name;
348 }
349
350 /*
351 * Convert src string to another encoding (general case).
352 *
353 * See the notes about string conversion functions at the top of this file.
354 */
355 unsigned char *
356 pg_do_encoding_conversion(unsigned char *src, int len,
357 int src_encoding, int dest_encoding)
358 {
359 unsigned char *result;
360 Oid proc;
361
362 if (len <= 0)
363 return src; /* empty string is always valid */
364
365 if (src_encoding == dest_encoding)
366 return src; /* no conversion required, assume valid */
367
368 if (dest_encoding == PG_SQL_ASCII)
369 return src; /* any string is valid in SQL_ASCII */
370
371 if (src_encoding == PG_SQL_ASCII)
372 {
373 /* No conversion is possible, but we must validate the result */
374 (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
375 return src;
376 }
377
378 if (!IsTransactionState()) /* shouldn't happen */
379 elog(ERROR, "cannot perform encoding conversion outside a transaction");
380
381 proc = FindDefaultConversionProc(src_encoding, dest_encoding);
382 if (!OidIsValid(proc))
383 ereport(ERROR,
384 (errcode(ERRCODE_UNDEFINED_FUNCTION),
385 errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
386 pg_encoding_to_char(src_encoding),
387 pg_encoding_to_char(dest_encoding))));
388
389 /*
390 * Allocate space for conversion result, being wary of integer overflow.
391 *
392 * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
393 * required space, so it might exceed MaxAllocSize even though the result
394 * would actually fit. We do not want to hand back a result string that
395 * exceeds MaxAllocSize, because callers might not cope gracefully --- but
396 * if we just allocate more than that, and don't use it, that's fine.
397 */
398 if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
399 ereport(ERROR,
400 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
401 errmsg("out of memory"),
402 errdetail("String of %d bytes is too long for encoding conversion.",
403 len)));
404
405 result = (unsigned char *)
406 MemoryContextAllocHuge(CurrentMemoryContext,
407 (Size) len * MAX_CONVERSION_GROWTH + 1);
408
409 OidFunctionCall5(proc,
410 Int32GetDatum(src_encoding),
411 Int32GetDatum(dest_encoding),
412 CStringGetDatum(src),
413 CStringGetDatum(result),
414 Int32GetDatum(len));
415
416 /*
417 * If the result is large, it's worth repalloc'ing to release any extra
418 * space we asked for. The cutoff here is somewhat arbitrary, but we
419 * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
420 */
421 if (len > 1000000)
422 {
423 Size resultlen = strlen((char *) result);
424
425 if (resultlen >= MaxAllocSize)
426 ereport(ERROR,
427 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
428 errmsg("out of memory"),
429 errdetail("String of %d bytes is too long for encoding conversion.",
430 len)));
431
432 result = (unsigned char *) repalloc(result, resultlen + 1);
433 }
434
435 return result;
436 }
437
438 /*
439 * Convert string to encoding encoding_name. The source
440 * encoding is the DB encoding.
441 *
442 * BYTEA convert_to(TEXT string, NAME encoding_name) */
443 Datum
444 pg_convert_to(PG_FUNCTION_ARGS)
445 {
446 Datum string = PG_GETARG_DATUM(0);
447 Datum dest_encoding_name = PG_GETARG_DATUM(1);
448 Datum src_encoding_name = DirectFunctionCall1(namein,
449 CStringGetDatum(DatabaseEncoding->name));
450 Datum result;
451
452 /*
453 * pg_convert expects a bytea as its first argument. We're passing it a
454 * text argument here, relying on the fact that they are both in fact
455 * varlena types, and thus structurally identical.
456 */
457 result = DirectFunctionCall3(pg_convert, string,
458 src_encoding_name, dest_encoding_name);
459
460 PG_RETURN_DATUM(result);
461 }
462
463 /*
464 * Convert string from encoding encoding_name. The destination
465 * encoding is the DB encoding.
466 *
467 * TEXT convert_from(BYTEA string, NAME encoding_name) */
468 Datum
469 pg_convert_from(PG_FUNCTION_ARGS)
470 {
471 Datum string = PG_GETARG_DATUM(0);
472 Datum src_encoding_name = PG_GETARG_DATUM(1);
473 Datum dest_encoding_name = DirectFunctionCall1(namein,
474 CStringGetDatum(DatabaseEncoding->name));
475 Datum result;
476
477 result = DirectFunctionCall3(pg_convert, string,
478 src_encoding_name, dest_encoding_name);
479
480 /*
481 * pg_convert returns a bytea, which we in turn return as text, relying on
482 * the fact that they are both in fact varlena types, and thus
483 * structurally identical. Although not all bytea values are valid text,
484 * in this case it will be because we've told pg_convert to return one
485 * that is valid as text in the current database encoding.
486 */
487 PG_RETURN_DATUM(result);
488 }
489
490 /*
491 * Convert string between two arbitrary encodings.
492 *
493 * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
494 */
495 Datum
496 pg_convert(PG_FUNCTION_ARGS)
497 {
498 bytea *string = PG_GETARG_BYTEA_PP(0);
499 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
500 int src_encoding = pg_char_to_encoding(src_encoding_name);
501 char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
502 int dest_encoding = pg_char_to_encoding(dest_encoding_name);
503 const char *src_str;
504 char *dest_str;
505 bytea *retval;
506 int len;
507
508 if (src_encoding < 0)
509 ereport(ERROR,
510 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
511 errmsg("invalid source encoding name \"%s\"",
512 src_encoding_name)));
513 if (dest_encoding < 0)
514 ereport(ERROR,
515 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
516 errmsg("invalid destination encoding name \"%s\"",
517 dest_encoding_name)));
518
519 /* make sure that source string is valid */
520 len = VARSIZE_ANY_EXHDR(string);
521 src_str = VARDATA_ANY(string);
522 pg_verify_mbstr_len(src_encoding, src_str, len, false);
523
524 /* perform conversion */
525 dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
526 len,
527 src_encoding,
528 dest_encoding);
529
530 /* update len if conversion actually happened */
531 if (dest_str != src_str)
532 len = strlen(dest_str);
533
534 /*
535 * build bytea data type structure.
536 */
537 retval = (bytea *) palloc(len + VARHDRSZ);
538 SET_VARSIZE(retval, len + VARHDRSZ);
539 memcpy(VARDATA(retval), dest_str, len);
540
541 if (dest_str != src_str)
542 pfree(dest_str);
543
544 /* free memory if allocated by the toaster */
545 PG_FREE_IF_COPY(string, 0);
546
547 PG_RETURN_BYTEA_P(retval);
548 }
549
550 /*
551 * get the length of the string considered as text in the specified
552 * encoding. Raises an error if the data is not valid in that
553 * encoding.
554 *
555 * INT4 length (BYTEA string, NAME src_encoding_name)
556 */
557 Datum
558 length_in_encoding(PG_FUNCTION_ARGS)
559 {
560 bytea *string = PG_GETARG_BYTEA_PP(0);
561 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
562 int src_encoding = pg_char_to_encoding(src_encoding_name);
563 const char *src_str;
564 int len;
565 int retval;
566
567 if (src_encoding < 0)
568 ereport(ERROR,
569 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
570 errmsg("invalid encoding name \"%s\"",
571 src_encoding_name)));
572
573 len = VARSIZE_ANY_EXHDR(string);
574 src_str = VARDATA_ANY(string);
575
576 retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
577
578 PG_RETURN_INT32(retval);
579 }
580
581 /*
582 * Get maximum multibyte character length in the specified encoding.
583 *
584 * Note encoding is specified numerically, not by name as above.
585 */
586 Datum
587 pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
588 {
589 int encoding = PG_GETARG_INT32(0);
590
591 if (PG_VALID_ENCODING(encoding))
592 PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
593 else
594 PG_RETURN_NULL();
595 }
596
597 /*
598 * Convert client encoding to server encoding.
599 *
600 * See the notes about string conversion functions at the top of this file.
601 */
602 char *
603 pg_client_to_server(const char *s, int len)
604 {
605 return pg_any_to_server(s, len, ClientEncoding->encoding);
606 }
607
608 /*
609 * Convert any encoding to server encoding.
610 *
611 * See the notes about string conversion functions at the top of this file.
612 *
613 * Unlike the other string conversion functions, this will apply validation
614 * even if encoding == DatabaseEncoding->encoding. This is because this is
615 * used to process data coming in from outside the database, and we never
616 * want to just assume validity.
617 */
618 char *
619 pg_any_to_server(const char *s, int len, int encoding)
620 {
621 if (len <= 0)
622 return unconstify(char *, s); /* empty string is always valid */
623
624 if (encoding == DatabaseEncoding->encoding ||
625 encoding == PG_SQL_ASCII)
626 {
627 /*
628 * No conversion is needed, but we must still validate the data.
629 */
630 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
631 return unconstify(char *, s);
632 }
633
634 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
635 {
636 /*
637 * No conversion is possible, but we must still validate the data,
638 * because the client-side code might have done string escaping using
639 * the selected client_encoding. If the client encoding is ASCII-safe
640 * then we just do a straight validation under that encoding. For an
641 * ASCII-unsafe encoding we have a problem: we dare not pass such data
642 * to the parser but we have no way to convert it. We compromise by
643 * rejecting the data if it contains any non-ASCII characters.
644 */
645 if (PG_VALID_BE_ENCODING(encoding))
646 (void) pg_verify_mbstr(encoding, s, len, false);
647 else
648 {
649 int i;
650
651 for (i = 0; i < len; i++)
652 {
653 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
654 ereport(ERROR,
655 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
656 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
657 pg_enc2name_tbl[PG_SQL_ASCII].name,
658 (unsigned char) s[i])));
659 }
660 }
661 return unconstify(char *, s);
662 }
663
664 /* Fast path if we can use cached conversion function */
665 if (encoding == ClientEncoding->encoding)
666 return perform_default_encoding_conversion(s, len, true);
667
668 /* General case ... will not work outside transactions */
669 return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
670 len,
671 encoding,
672 DatabaseEncoding->encoding);
673 }
674
675 /*
676 * Convert server encoding to client encoding.
677 *
678 * See the notes about string conversion functions at the top of this file.
679 */
680 char *
681 pg_server_to_client(const char *s, int len)
682 {
683 return pg_server_to_any(s, len, ClientEncoding->encoding);
684 }
685
686 /*
687 * Convert server encoding to any encoding.
688 *
689 * See the notes about string conversion functions at the top of this file.
690 */
691 char *
692 pg_server_to_any(const char *s, int len, int encoding)
693 {
694 if (len <= 0)
695 return unconstify(char *, s); /* empty string is always valid */
696
697 if (encoding == DatabaseEncoding->encoding ||
698 encoding == PG_SQL_ASCII)
699 return unconstify(char *, s); /* assume data is valid */
700
701 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
702 {
703 /* No conversion is possible, but we must validate the result */
704 (void) pg_verify_mbstr(encoding, s, len, false);
705 return unconstify(char *, s);
706 }
707
708 /* Fast path if we can use cached conversion function */
709 if (encoding == ClientEncoding->encoding)
710 return perform_default_encoding_conversion(s, len, false);
711
712 /* General case ... will not work outside transactions */
713 return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
714 len,
715 DatabaseEncoding->encoding,
716 encoding);
717 }
718
719 /*
720 * Perform default encoding conversion using cached FmgrInfo. Since
721 * this function does not access database at all, it is safe to call
722 * outside transactions. If the conversion has not been set up by
723 * SetClientEncoding(), no conversion is performed.
724 */
725 static char *
726 perform_default_encoding_conversion(const char *src, int len,
727 bool is_client_to_server)
728 {
729 char *result;
730 int src_encoding,
731 dest_encoding;
732 FmgrInfo *flinfo;
733
734 if (is_client_to_server)
735 {
736 src_encoding = ClientEncoding->encoding;
737 dest_encoding = DatabaseEncoding->encoding;
738 flinfo = ToServerConvProc;
739 }
740 else
741 {
742 src_encoding = DatabaseEncoding->encoding;
743 dest_encoding = ClientEncoding->encoding;
744 flinfo = ToClientConvProc;
745 }
746
747 if (flinfo == NULL)
748 return unconstify(char *, src);
749
750 /*
751 * Allocate space for conversion result, being wary of integer overflow.
752 * See comments in pg_do_encoding_conversion.
753 */
754 if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
755 ereport(ERROR,
756 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
757 errmsg("out of memory"),
758 errdetail("String of %d bytes is too long for encoding conversion.",
759 len)));
760
761 result = (char *)
762 MemoryContextAllocHuge(CurrentMemoryContext,
763 (Size) len * MAX_CONVERSION_GROWTH + 1);
764
765 FunctionCall5(flinfo,
766 Int32GetDatum(src_encoding),
767 Int32GetDatum(dest_encoding),
768 CStringGetDatum(src),
769 CStringGetDatum(result),
770 Int32GetDatum(len));
771
772 /*
773 * Release extra space if there might be a lot --- see comments in
774 * pg_do_encoding_conversion.
775 */
776 if (len > 1000000)
777 {
778 Size resultlen = strlen(result);
779
780 if (resultlen >= MaxAllocSize)
781 ereport(ERROR,
782 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
783 errmsg("out of memory"),
784 errdetail("String of %d bytes is too long for encoding conversion.",
785 len)));
786
787 result = (char *) repalloc(result, resultlen + 1);
788 }
789
790 return result;
791 }
792
793 /*
794 * Convert a single Unicode code point into a string in the server encoding.
795 *
796 * The code point given by "c" is converted and stored at *s, which must
797 * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
798 * The output will have a trailing '\0'. Throws error if the conversion
799 * cannot be performed.
800 *
801 * Note that this relies on having previously looked up any required
802 * conversion function. That's partly for speed but mostly because the parser
803 * may call this outside any transaction, or in an aborted transaction.
804 */
805 void
806 pg_unicode_to_server(pg_wchar c, unsigned char *s)
807 {
808 unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
809 int c_as_utf8_len;
810 int server_encoding;
811
812 /*
813 * Complain if invalid Unicode code point. The choice of errcode here is
814 * debatable, but really our caller should have checked this anyway.
815 */
816 if (!is_valid_unicode_codepoint(c))
817 ereport(ERROR,
818 (errcode(ERRCODE_SYNTAX_ERROR),
819 errmsg("invalid Unicode code point")));
820
821 /* Otherwise, if it's in ASCII range, conversion is trivial */
822 if (c <= 0x7F)
823 {
824 s[0] = (unsigned char) c;
825 s[1] = '\0';
826 return;
827 }
828
829 /* If the server encoding is UTF-8, we just need to reformat the code */
830 server_encoding = GetDatabaseEncoding();
831 if (server_encoding == PG_UTF8)
832 {
833 unicode_to_utf8(c, s);
834 s[pg_utf_mblen(s)] = '\0';
835 return;
836 }
837
838 /* For all other cases, we must have a conversion function available */
839 if (Utf8ToServerConvProc == NULL)
840 ereport(ERROR,
841 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
842 errmsg("conversion between %s and %s is not supported",
843 pg_enc2name_tbl[PG_UTF8].name,
844 GetDatabaseEncodingName())));
845
846 /* Construct UTF-8 source string */
847 unicode_to_utf8(c, c_as_utf8);
848 c_as_utf8_len = pg_utf_mblen(c_as_utf8);
849 c_as_utf8[c_as_utf8_len] = '\0';
850
851 /* Convert, or throw error if we can't */
852 FunctionCall5(Utf8ToServerConvProc,
853 Int32GetDatum(PG_UTF8),
854 Int32GetDatum(server_encoding),
855 CStringGetDatum(c_as_utf8),
856 CStringGetDatum(s),
857 Int32GetDatum(c_as_utf8_len));
858 }
859
860
861 /* convert a multibyte string to a wchar */
862 int
863 pg_mb2wchar(const char *from, pg_wchar *to)
864 {
865 return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
866 }
867
868 /* convert a multibyte string to a wchar with a limited length */
869 int
870 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
871 {
872 return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
873 }
874
875 /* same, with any encoding */
876 int
877 pg_encoding_mb2wchar_with_len(int encoding,
878 const char *from, pg_wchar *to, int len)
879 {
880 return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
881 }
882
883 /* convert a wchar string to a multibyte */
884 int
885 pg_wchar2mb(const pg_wchar *from, char *to)
886 {
887 return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
888 }
889
890 /* convert a wchar string to a multibyte with a limited length */
891 int
892 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
893 {
894 return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
895 }
896
897 /* same, with any encoding */
898 int
899 pg_encoding_wchar2mb_with_len(int encoding,
900 const pg_wchar *from, char *to, int len)
901 {
902 return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
903 }
904
905 /* returns the byte length of a multibyte character */
906 int
907 pg_mblen(const char *mbstr)
908 {
909 return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
910 }
911
912 /* returns the display length of a multibyte character */
913 int
914 pg_dsplen(const char *mbstr)
915 {
916 return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
917 }
918
/*
 * Number of characters (not bytes) in a null-terminated multibyte string
 * in the database encoding.
 */
int
pg_mbstrlen(const char *mbstr)
{
    const char *p;
    int         nchars = 0;

    /* In a single-byte encoding every byte is one character */
    if (pg_database_encoding_max_length() == 1)
        return strlen(mbstr);

    /* Step through the string one whole character at a time */
    for (p = mbstr; *p != '\0'; p += pg_mblen(p))
        nchars++;

    return nchars;
}
936
/*
 * Number of characters (not bytes) in a multibyte string that need not be
 * null-terminated.  "limit" is the number of bytes to examine; counting
 * also stops at a null byte.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
    int         nchars;

    /* In a single-byte encoding the byte count is returned unchanged */
    if (pg_database_encoding_max_length() == 1)
        return limit;

    for (nchars = 0; limit > 0 && *mbstr != '\0'; nchars++)
    {
        int         chlen = pg_mblen(mbstr);

        mbstr += chlen;
        limit -= chlen;
    }

    return nchars;
}
959
960 /*
961 * returns the byte length of a multibyte string
962 * (not necessarily NULL terminated)
963 * that is no longer than limit.
964 * this function does not break multibyte character boundary.
965 */
966 int
967 pg_mbcliplen(const char *mbstr, int len, int limit)
968 {
969 return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
970 len, limit);
971 }
972
973 /*
974 * pg_mbcliplen with specified encoding
975 */
976 int
977 pg_encoding_mbcliplen(int encoding, const char *mbstr,
978 int len, int limit)
979 {
980 mblen_converter mblen_fn;
981 int clen = 0;
982 int l;
983
984 /* optimization for single byte encoding */
985 if (pg_encoding_max_length(encoding) == 1)
986 return cliplen(mbstr, len, limit);
987
988 mblen_fn = pg_wchar_table[encoding].mblen;
989
990 while (len > 0 && *mbstr)
991 {
992 l = (*mblen_fn) ((const unsigned char *) mbstr);
993 if ((clen + l) > limit)
994 break;
995 clen += l;
996 if (clen == limit)
997 break;
998 len -= l;
999 mbstr += l;
1000 }
1001 return clen;
1002 }
1003
/*
 * Like pg_mbcliplen, except that "limit" counts characters rather than
 * bytes.  The return value is still a byte length.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
    int         kept_bytes = 0;
    int         seen_chars = 0;

    /* In a single-byte encoding characters and bytes coincide */
    if (pg_database_encoding_max_length() == 1)
        return cliplen(mbstr, len, limit);

    while (len > 0 && *mbstr != '\0')
    {
        int         chlen = pg_mblen(mbstr);

        /* Count this character, and stop if it exceeds the limit */
        seen_chars++;
        if (seen_chars > limit)
            return kept_bytes;

        kept_bytes += chlen;
        len -= chlen;
        mbstr += chlen;
    }

    return kept_bytes;
}
1031
/*
 * mbcliplen for any single-byte encoding: count the bytes of str up to the
 * smaller of len and limit, stopping early at a null byte.
 */
static int
cliplen(const char *str, int len, int limit)
{
    int         bound = (len < limit) ? len : limit;
    int         count;

    for (count = 0; count < bound; count++)
    {
        if (str[count] == '\0')
            break;
    }

    return count;
}
1043
1044 void
1045 SetDatabaseEncoding(int encoding)
1046 {
1047 if (!PG_VALID_BE_ENCODING(encoding))
1048 elog(ERROR, "invalid database encoding: %d", encoding);
1049
1050 DatabaseEncoding = &pg_enc2name_tbl[encoding];
1051 Assert(DatabaseEncoding->encoding == encoding);
1052 }
1053
1054 void
1055 SetMessageEncoding(int encoding)
1056 {
1057 /* Some calls happen before we can elog()! */
1058 Assert(PG_VALID_ENCODING(encoding));
1059
1060 MessageEncoding = &pg_enc2name_tbl[encoding];
1061 Assert(MessageEncoding->encoding == encoding);
1062 }
1063
1064 #ifdef ENABLE_NLS
1065 /*
1066 * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
1067 * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
1068 * fail for gettext-internal causes like out-of-memory.
1069 */
1070 static bool
1071 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
1072 {
1073 bool elog_ok = (CurrentMemoryContext != NULL);
1074 int i;
1075
1076 for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
1077 {
1078 if (pg_enc2gettext_tbl[i].encoding == encoding)
1079 {
1080 if (bind_textdomain_codeset(domainname,
1081 pg_enc2gettext_tbl[i].name) != NULL)
1082 return true;
1083
1084 if (elog_ok)
1085 elog(LOG, "bind_textdomain_codeset failed");
1086 else
1087 write_stderr("bind_textdomain_codeset failed");
1088
1089 break;
1090 }
1091 }
1092
1093 return false;
1094 }
1095
1096 /*
1097 * Bind a gettext message domain to the codeset corresponding to the database
1098 * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1099 * Return the MessageEncoding implied by the new settings.
1100 *
1101 * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1102 * When that matches the database encoding, we don't need to do anything. In
1103 * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1104 * database encoding, except for the C locale. (On Windows, we also permit a
1105 * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
1106 * gettext to the right codeset.
1107 *
1108 * On Windows, gettext defaults to the Windows ANSI code page. This is a
1109 * convenient departure for software that passes the strings to Windows ANSI
1110 * APIs, but we don't do that. Compel gettext to use database encoding or,
1111 * failing that, the LC_CTYPE encoding as it would on other platforms.
1112 *
1113 * This function is called before elog() and palloc() are usable.
1114 */
1115 int
1116 pg_bind_textdomain_codeset(const char *domainname)
1117 {
1118 bool elog_ok = (CurrentMemoryContext != NULL);
1119 int encoding = GetDatabaseEncoding();
1120 int new_msgenc;
1121
1122 #ifndef WIN32
1123 const char *ctype = setlocale(LC_CTYPE, NULL);
1124
1125 if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1126 #endif
1127 if (encoding != PG_SQL_ASCII &&
1128 raw_pg_bind_textdomain_codeset(domainname, encoding))
1129 return encoding;
1130
1131 new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
1132 if (new_msgenc < 0)
1133 new_msgenc = PG_SQL_ASCII;
1134
1135 #ifdef WIN32
1136 if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1137 /* On failure, the old message encoding remains valid. */
1138 return GetMessageEncoding();
1139 #endif
1140
1141 return new_msgenc;
1142 }
1143 #endif
1144
1145 /*
1146 * The database encoding, also called the server encoding, represents the
1147 * encoding of data stored in text-like data types. Affected types include
1148 * cstring, text, varchar, name, xml, and json.
1149 */
1150 int
1151 GetDatabaseEncoding(void)
1152 {
1153 return DatabaseEncoding->encoding;
1154 }
1155
1156 const char *
1157 GetDatabaseEncodingName(void)
1158 {
1159 return DatabaseEncoding->name;
1160 }
1161
1162 Datum
1163 getdatabaseencoding(PG_FUNCTION_ARGS)
1164 {
1165 return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1166 }
1167
1168 Datum
1169 pg_client_encoding(PG_FUNCTION_ARGS)
1170 {
1171 return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1172 }
1173
1174 Datum
1175 PG_char_to_encoding(PG_FUNCTION_ARGS)
1176 {
1177 Name s = PG_GETARG_NAME(0);
1178
1179 PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
1180 }
1181
1182 Datum
1183 PG_encoding_to_char(PG_FUNCTION_ARGS)
1184 {
1185 int32 encoding = PG_GETARG_INT32(0);
1186 const char *encoding_name = pg_encoding_to_char(encoding);
1187
1188 return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
1189 }
1190
1191 /*
1192 * gettext() returns messages in this encoding. This often matches the
1193 * database encoding, but it differs for SQL_ASCII databases, for processes
1194 * not attached to a database, and under a database encoding lacking iconv
1195 * support (MULE_INTERNAL).
1196 */
1197 int
1198 GetMessageEncoding(void)
1199 {
1200 return MessageEncoding->encoding;
1201 }
1202
1203
1204 /*
1205 * Generic character incrementer function.
1206 *
1207 * Not knowing anything about the properties of the encoding in use, we just
1208 * keep incrementing the last byte until we get a validly-encoded result,
1209 * or we run out of values to try. We don't bother to try incrementing
1210 * higher-order bytes, so there's no growth in runtime for wider characters.
1211 * (If we did try to do that, we'd need to consider the likelihood that 255
1212 * is not a valid final byte in the encoding.)
1213 */
1214 static bool
1215 pg_generic_charinc(unsigned char *charptr, int len)
1216 {
1217 unsigned char *lastbyte = charptr + len - 1;
1218 mbverifier mbverify;
1219
1220 /* We can just invoke the character verifier directly. */
1221 mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
1222
1223 while (*lastbyte < (unsigned char) 255)
1224 {
1225 (*lastbyte)++;
1226 if ((*mbverify) (charptr, len) == len)
1227 return true;
1228 }
1229
1230 return false;
1231 }
1232
/*
 * UTF-8 character incrementer function.
 *
 * A one-byte character is simply incremented, unless it is already 0x7F.
 *
 * In a multibyte character every byte after the first must lie in
 * 0x80..0xBF and the first byte in 0xC0..0xF4.  We increment the rightmost
 * byte that is still below its maximum and fail if there is none.  The
 * second byte gets a tighter maximum after a 0xED lead byte (to stay out of
 * the surrogate range, which must not occur in valid UTF-8) and after a
 * 0xF4 lead byte.
 *
 * Lower-order bytes are not reset to their minimums, since an exhaustive
 * search is unaffordable (see make_greater_string).
 *
 * Returns true if a byte was incremented in place, false otherwise.
 * Lengths outside 1..4 are rejected.
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char lead;
	int			pos;

	/* reject lengths 5 and 6 for now (and anything else out of range) */
	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes: plain continuation bytes, maximum 0xBF */
	for (pos = length - 1; pos >= 2; pos--)
	{
		if (charptr[pos] < 0xBF)
		{
			charptr[pos]++;
			return true;
		}
	}

	/* second byte: its maximum depends on the lead byte */
	if (length >= 2)
	{
		unsigned char ceiling = 0xBF;

		if (charptr[0] == 0xED)
			ceiling = 0x9F;
		else if (charptr[0] == 0xF4)
			ceiling = 0x8F;

		if (charptr[1] < ceiling)
		{
			charptr[1]++;
			return true;
		}
	}

	/* lead byte: refuse to step past the top of each length class */
	lead = charptr[0];
	if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
		return false;

	charptr[0]++;
	return true;
}
1305
1306 /*
1307 * EUC-JP character incrementer function.
1308 *
1309 * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1310 * representing JIS X 0201 characters with the second byte ranging between
1311 * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1312 * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1313 *
1314 * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1315 * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1316 * is incremented if possible, otherwise the second-to-last byte.
1317 *
1318 * If the sequence starts with a value other than the above and its MSB
1319 * is set, it must be a two-byte sequence representing JIS X 0208 characters
1320 * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1321 * incremented if possible, otherwise the second-to-last byte.
1322 *
1323 * Otherwise, the sequence is a single-byte ASCII character. It is
1324 * incremented up to 0x7f.
1325 */
1326 static bool
1327 pg_eucjp_increment(unsigned char *charptr, int length)
1328 {
1329 unsigned char c1,
1330 c2;
1331 int i;
1332
1333 c1 = *charptr;
1334
1335 switch (c1)
1336 {
1337 case SS2: /* JIS X 0201 */
1338 if (length != 2)
1339 return false;
1340
1341 c2 = charptr[1];
1342
1343 if (c2 >= 0xdf)
1344 charptr[0] = charptr[1] = 0xa1;
1345 else if (c2 < 0xa1)
1346 charptr[1] = 0xa1;
1347 else
1348 charptr[1]++;
1349 break;
1350
1351 case SS3: /* JIS X 0212 */
1352 if (length != 3)
1353 return false;
1354
1355 for (i = 2; i > 0; i--)
1356 {
1357 c2 = charptr[i];
1358 if (c2 < 0xa1)
1359 {
1360 charptr[i] = 0xa1;
1361 return true;
1362 }
1363 else if (c2 < 0xfe)
1364 {
1365 charptr[i]++;
1366 return true;
1367 }
1368 }
1369
1370 /* Out of 3-byte code region */
1371 return false;
1372
1373 default:
1374 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1375 {
1376 if (length != 2)
1377 return false;
1378
1379 for (i = 1; i >= 0; i--)
1380 {
1381 c2 = charptr[i];
1382 if (c2 < 0xa1)
1383 {
1384 charptr[i] = 0xa1;
1385 return true;
1386 }
1387 else if (c2 < 0xfe)
1388 {
1389 charptr[i]++;
1390 return true;
1391 }
1392 }
1393
1394 /* Out of 2 byte code region */
1395 return false;
1396 }
1397 else
1398 { /* ASCII, single byte */
1399 if (c1 > 0x7e)
1400 return false;
1401 (*charptr)++;
1402 }
1403 break;
1404 }
1405
1406 return true;
1407 }
1408
1409 /*
1410 * get the character incrementer for the encoding for the current database
1411 */
1412 mbcharacter_incrementer
1413 pg_database_encoding_character_incrementer(void)
1414 {
1415 /*
1416 * Eventually it might be best to add a field to pg_wchar_table[], but for
1417 * now we just use a switch.
1418 */
1419 switch (GetDatabaseEncoding())
1420 {
1421 case PG_UTF8:
1422 return pg_utf8_increment;
1423
1424 case PG_EUC_JP:
1425 return pg_eucjp_increment;
1426
1427 default:
1428 return pg_generic_charinc;
1429 }
1430 }
1431
1432 /*
1433 * fetch maximum length of the encoding for the current database
1434 */
1435 int
1436 pg_database_encoding_max_length(void)
1437 {
1438 return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1439 }
1440
/*
 * Verify mbstr to make sure that it is validly encoded in the current
 * database encoding.  Otherwise same as pg_verify_mbstr().
 *
 * Returns true when pg_verify_mbstr_len() reports a non-negative length.
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			nchars;

	nchars = pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError);

	return nchars >= 0;
}
1451
/*
 * Verify mbstr to make sure that it is validly encoded in the specified
 * encoding.
 *
 * Returns true when pg_verify_mbstr_len() reports a non-negative length.
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	int			nchars = pg_verify_mbstr_len(encoding, mbstr, len, noError);

	return nchars >= 0;
}
1461
1462 /*
1463 * Verify mbstr to make sure that it is validly encoded in the specified
1464 * encoding.
1465 *
1466 * mbstr is not necessarily zero terminated; length of mbstr is
1467 * specified by len.
1468 *
1469 * If OK, return length of string in the encoding.
1470 * If a problem is found, return -1 when noError is
1471 * true; when noError is false, ereport() a descriptive message.
1472 */
1473 int
1474 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1475 {
1476 mbverifier mbverify;
1477 int mb_len;
1478
1479 Assert(PG_VALID_ENCODING(encoding));
1480
1481 /*
1482 * In single-byte encodings, we need only reject nulls (\0).
1483 */
1484 if (pg_encoding_max_length(encoding) <= 1)
1485 {
1486 const char *nullpos = memchr(mbstr, 0, len);
1487
1488 if (nullpos == NULL)
1489 return len;
1490 if (noError)
1491 return -1;
1492 report_invalid_encoding(encoding, nullpos, 1);
1493 }
1494
1495 /* fetch function pointer just once */
1496 mbverify = pg_wchar_table[encoding].mbverify;
1497
1498 mb_len = 0;
1499
1500 while (len > 0)
1501 {
1502 int l;
1503
1504 /* fast path for ASCII-subset characters */
1505 if (!IS_HIGHBIT_SET(*mbstr))
1506 {
1507 if (*mbstr != '\0')
1508 {
1509 mb_len++;
1510 mbstr++;
1511 len--;
1512 continue;
1513 }
1514 if (noError)
1515 return -1;
1516 report_invalid_encoding(encoding, mbstr, len);
1517 }
1518
1519 l = (*mbverify) ((const unsigned char *) mbstr, len);
1520
1521 if (l < 0)
1522 {
1523 if (noError)
1524 return -1;
1525 report_invalid_encoding(encoding, mbstr, len);
1526 }
1527
1528 mbstr += l;
1529 len -= l;
1530 mb_len++;
1531 }
1532 return mb_len;
1533 }
1534
1535 /*
1536 * check_encoding_conversion_args: check arguments of a conversion function
1537 *
1538 * "expected" arguments can be either an encoding ID or -1 to indicate that
1539 * the caller will check whether it accepts the ID.
1540 *
1541 * Note: the errors here are not really user-facing, so elog instead of
1542 * ereport seems sufficient. Also, we trust that the "expected" encoding
1543 * arguments are valid encoding IDs, but we don't trust the actuals.
1544 */
1545 void
1546 check_encoding_conversion_args(int src_encoding,
1547 int dest_encoding,
1548 int len,
1549 int expected_src_encoding,
1550 int expected_dest_encoding)
1551 {
1552 if (!PG_VALID_ENCODING(src_encoding))
1553 elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1554 if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1555 elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1556 pg_enc2name_tbl[expected_src_encoding].name,
1557 pg_enc2name_tbl[src_encoding].name);
1558 if (!PG_VALID_ENCODING(dest_encoding))
1559 elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1560 if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1561 elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1562 pg_enc2name_tbl[expected_dest_encoding].name,
1563 pg_enc2name_tbl[dest_encoding].name);
1564 if (len < 0)
1565 elog(ERROR, "encoding conversion length must not be negative");
1566 }
1567
1568 /*
1569 * report_invalid_encoding: complain about invalid multibyte character
1570 *
1571 * note: len is remaining length of string, not length of character;
1572 * len must be greater than zero, as we always examine the first byte.
1573 */
1574 void
1575 report_invalid_encoding(int encoding, const char *mbstr, int len)
1576 {
1577 int l = pg_encoding_mblen(encoding, mbstr);
1578 char buf[8 * 5 + 1];
1579 char *p = buf;
1580 int j,
1581 jlimit;
1582
1583 jlimit = Min(l, len);
1584 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1585
1586 for (j = 0; j < jlimit; j++)
1587 {
1588 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1589 if (j < jlimit - 1)
1590 p += sprintf(p, " ");
1591 }
1592
1593 ereport(ERROR,
1594 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1595 errmsg("invalid byte sequence for encoding \"%s\": %s",
1596 pg_enc2name_tbl[encoding].name,
1597 buf)));
1598 }
1599
1600 /*
1601 * report_untranslatable_char: complain about untranslatable character
1602 *
1603 * note: len is remaining length of string, not length of character;
1604 * len must be greater than zero, as we always examine the first byte.
1605 */
1606 void
1607 report_untranslatable_char(int src_encoding, int dest_encoding,
1608 const char *mbstr, int len)
1609 {
1610 int l = pg_encoding_mblen(src_encoding, mbstr);
1611 char buf[8 * 5 + 1];
1612 char *p = buf;
1613 int j,
1614 jlimit;
1615
1616 jlimit = Min(l, len);
1617 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1618
1619 for (j = 0; j < jlimit; j++)
1620 {
1621 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1622 if (j < jlimit - 1)
1623 p += sprintf(p, " ");
1624 }
1625
1626 ereport(ERROR,
1627 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
1628 errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
1629 buf,
1630 pg_enc2name_tbl[src_encoding].name,
1631 pg_enc2name_tbl[dest_encoding].name)));
1632 }
1633
1634
1635 #ifdef WIN32
1636 /*
1637 * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1638 * string. The character length is also passed to utf16len if not
1639 * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1640 * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1641 */
1642 WCHAR *
1643 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1644 {
1645 int msgenc = GetMessageEncoding();
1646 WCHAR *utf16;
1647 int dstlen;
1648 UINT codepage;
1649
1650 if (msgenc == PG_SQL_ASCII)
1651 /* No conversion is possible, and SQL_ASCII is never utf16. */
1652 return NULL;
1653
1654 codepage = pg_enc2name_tbl[msgenc].codepage;
1655
1656 /*
1657 * Use MultiByteToWideChar directly if there is a corresponding codepage,
1658 * or double conversion through UTF8 if not. Double conversion is needed,
1659 * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1660 */
1661 if (codepage != 0)
1662 {
1663 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1664 dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1665 utf16[dstlen] = (WCHAR) 0;
1666 }
1667 else
1668 {
1669 char *utf8;
1670
1671 /*
1672 * XXX pg_do_encoding_conversion() requires a transaction. In the
1673 * absence of one, hope for the input to be valid UTF8.
1674 */
1675 if (IsTransactionState())
1676 {
1677 utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1678 len,
1679 msgenc,
1680 PG_UTF8);
1681 if (utf8 != str)
1682 len = strlen(utf8);
1683 }
1684 else
1685 utf8 = (char *) str;
1686
1687 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1688 dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1689 utf16[dstlen] = (WCHAR) 0;
1690
1691 if (utf8 != str)
1692 pfree(utf8);
1693 }
1694
1695 if (dstlen == 0 && len > 0)
1696 {
1697 pfree(utf16);
1698 return NULL; /* error */
1699 }
1700
1701 if (utf16len)
1702 *utf16len = dstlen;
1703 return utf16;
1704 }
1705
1706 #endif /* WIN32 */
1707