1 /*-------------------------------------------------------------------------
2 *
3 * mbutils.c
4 * This file contains functions for encoding conversion.
5 *
6 * The string-conversion functions in this file share some API quirks.
7 * Note the following:
8 *
9 * The functions return a palloc'd, null-terminated string if conversion
10 * is required. However, if no conversion is performed, the given source
11 * string pointer is returned as-is.
12 *
13 * Although the presence of a length argument means that callers can pass
14 * non-null-terminated strings, care is required because the same string
15 * will be passed back if no conversion occurs. Such callers *must* check
16 * whether result == src and handle that case differently.
17 *
18 * If the source and destination encodings are the same, the source string
19 * is returned without any verification; it's assumed to be valid data.
20 * If that might not be the case, the caller is responsible for validating
21 * the string using a separate call to pg_verify_mbstr(). Whenever the
22 * source and destination encodings are different, the functions ensure that
23 * the result is validly encoded according to the destination encoding.
24 *
25 *
26 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
27 * Portions Copyright (c) 1994, Regents of the University of California
28 *
29 *
30 * IDENTIFICATION
31 * src/backend/utils/mb/mbutils.c
32 *
33 *-------------------------------------------------------------------------
34 */
35 #include "postgres.h"
36
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/builtins.h"
41 #include "utils/memutils.h"
42 #include "utils/syscache.h"
43
/*
 * When converting strings between different encodings, we assume that space
 * for converted result is 4-to-1 growth in the worst case.  The growth rates
 * for currently supported encoding pairs are all within 3 (SJIS JIS X0201
 * half-width kana -> UTF8 is the worst case).  So "4" should be enough for
 * the moment.
 *
 * Note that this is not the same as the maximum character width in any
 * particular encoding.
 */
#define MAX_CONVERSION_GROWTH  4
54
55 /*
56 * We maintain a simple linked list caching the fmgr lookup info for the
57 * currently selected conversion functions, as well as any that have been
58 * selected previously in the current session. (We remember previous
59 * settings because we must be able to restore a previous setting during
60 * transaction rollback, without doing any fresh catalog accesses.)
61 *
62 * Since we'll never release this data, we just keep it in TopMemoryContext.
63 */
64 typedef struct ConvProcInfo
65 {
66 int s_encoding; /* server and client encoding IDs */
67 int c_encoding;
68 FmgrInfo to_server_info; /* lookup info for conversion procs */
69 FmgrInfo to_client_info;
70 } ConvProcInfo;
71
72 static List *ConvProcList = NIL; /* List of ConvProcInfo */
73
74 /*
75 * These variables point to the currently active conversion functions,
76 * or are NULL when no conversion is needed.
77 */
78 static FmgrInfo *ToServerConvProc = NULL;
79 static FmgrInfo *ToClientConvProc = NULL;
80
81 /*
82 * These variables track the currently-selected encodings.
83 */
84 static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
85 static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
86 static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
87
88 /*
89 * During backend startup we can't set client encoding because we (a)
90 * can't look up the conversion functions, and (b) may not know the database
91 * encoding yet either. So SetClientEncoding() just accepts anything and
92 * remembers it for InitializeClientEncoding() to apply later.
93 */
94 static bool backend_startup_complete = false;
95 static int pending_client_encoding = PG_SQL_ASCII;
96
97
98 /* Internal functions */
99 static char *perform_default_encoding_conversion(const char *src,
100 int len, bool is_client_to_server);
101 static int cliplen(const char *str, int len, int limit);
102
103
104 /*
105 * Prepare for a future call to SetClientEncoding. Success should mean
106 * that SetClientEncoding is guaranteed to succeed for this encoding request.
107 *
108 * (But note that success before backend_startup_complete does not guarantee
109 * success after ...)
110 *
111 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
112 */
113 int
PrepareClientEncoding(int encoding)114 PrepareClientEncoding(int encoding)
115 {
116 int current_server_encoding;
117 ListCell *lc;
118
119 if (!PG_VALID_FE_ENCODING(encoding))
120 return -1;
121
122 /* Can't do anything during startup, per notes above */
123 if (!backend_startup_complete)
124 return 0;
125
126 current_server_encoding = GetDatabaseEncoding();
127
128 /*
129 * Check for cases that require no conversion function.
130 */
131 if (current_server_encoding == encoding ||
132 current_server_encoding == PG_SQL_ASCII ||
133 encoding == PG_SQL_ASCII)
134 return 0;
135
136 if (IsTransactionState())
137 {
138 /*
139 * If we're in a live transaction, it's safe to access the catalogs,
140 * so look up the functions. We repeat the lookup even if the info is
141 * already cached, so that we can react to changes in the contents of
142 * pg_conversion.
143 */
144 Oid to_server_proc,
145 to_client_proc;
146 ConvProcInfo *convinfo;
147 MemoryContext oldcontext;
148
149 to_server_proc = FindDefaultConversionProc(encoding,
150 current_server_encoding);
151 if (!OidIsValid(to_server_proc))
152 return -1;
153 to_client_proc = FindDefaultConversionProc(current_server_encoding,
154 encoding);
155 if (!OidIsValid(to_client_proc))
156 return -1;
157
158 /*
159 * Load the fmgr info into TopMemoryContext (could still fail here)
160 */
161 convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
162 sizeof(ConvProcInfo));
163 convinfo->s_encoding = current_server_encoding;
164 convinfo->c_encoding = encoding;
165 fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
166 TopMemoryContext);
167 fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
168 TopMemoryContext);
169
170 /* Attach new info to head of list */
171 oldcontext = MemoryContextSwitchTo(TopMemoryContext);
172 ConvProcList = lcons(convinfo, ConvProcList);
173 MemoryContextSwitchTo(oldcontext);
174
175 /*
176 * We cannot yet remove any older entry for the same encoding pair,
177 * since it could still be in use. SetClientEncoding will clean up.
178 */
179
180 return 0; /* success */
181 }
182 else
183 {
184 /*
185 * If we're not in a live transaction, the only thing we can do is
186 * restore a previous setting using the cache. This covers all
187 * transaction-rollback cases. The only case it might not work for is
188 * trying to change client_encoding on the fly by editing
189 * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
190 * thing to do anyway.
191 */
192 foreach(lc, ConvProcList)
193 {
194 ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
195
196 if (oldinfo->s_encoding == current_server_encoding &&
197 oldinfo->c_encoding == encoding)
198 return 0;
199 }
200
201 return -1; /* it's not cached, so fail */
202 }
203 }
204
205 /*
206 * Set the active client encoding and set up the conversion-function pointers.
207 * PrepareClientEncoding should have been called previously for this encoding.
208 *
209 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
210 */
211 int
SetClientEncoding(int encoding)212 SetClientEncoding(int encoding)
213 {
214 int current_server_encoding;
215 bool found;
216 ListCell *lc;
217 ListCell *prev;
218 ListCell *next;
219
220 if (!PG_VALID_FE_ENCODING(encoding))
221 return -1;
222
223 /* Can't do anything during startup, per notes above */
224 if (!backend_startup_complete)
225 {
226 pending_client_encoding = encoding;
227 return 0;
228 }
229
230 current_server_encoding = GetDatabaseEncoding();
231
232 /*
233 * Check for cases that require no conversion function.
234 */
235 if (current_server_encoding == encoding ||
236 current_server_encoding == PG_SQL_ASCII ||
237 encoding == PG_SQL_ASCII)
238 {
239 ClientEncoding = &pg_enc2name_tbl[encoding];
240 ToServerConvProc = NULL;
241 ToClientConvProc = NULL;
242 return 0;
243 }
244
245 /*
246 * Search the cache for the entry previously prepared by
247 * PrepareClientEncoding; if there isn't one, we lose. While at it,
248 * release any duplicate entries so that repeated Prepare/Set cycles don't
249 * leak memory.
250 */
251 found = false;
252 prev = NULL;
253 for (lc = list_head(ConvProcList); lc; lc = next)
254 {
255 ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
256
257 next = lnext(lc);
258
259 if (convinfo->s_encoding == current_server_encoding &&
260 convinfo->c_encoding == encoding)
261 {
262 if (!found)
263 {
264 /* Found newest entry, so set up */
265 ClientEncoding = &pg_enc2name_tbl[encoding];
266 ToServerConvProc = &convinfo->to_server_info;
267 ToClientConvProc = &convinfo->to_client_info;
268 found = true;
269 }
270 else
271 {
272 /* Duplicate entry, release it */
273 ConvProcList = list_delete_cell(ConvProcList, lc, prev);
274 pfree(convinfo);
275 continue; /* prev mustn't advance */
276 }
277 }
278
279 prev = lc;
280 }
281
282 if (found)
283 return 0; /* success */
284 else
285 return -1; /* it's not cached, so fail */
286 }
287
288 /*
289 * Initialize client encoding conversions.
290 * Called from InitPostgres() once during backend startup.
291 */
292 void
InitializeClientEncoding(void)293 InitializeClientEncoding(void)
294 {
295 Assert(!backend_startup_complete);
296 backend_startup_complete = true;
297
298 if (PrepareClientEncoding(pending_client_encoding) < 0 ||
299 SetClientEncoding(pending_client_encoding) < 0)
300 {
301 /*
302 * Oops, the requested conversion is not available. We couldn't fail
303 * before, but we can now.
304 */
305 ereport(FATAL,
306 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
307 errmsg("conversion between %s and %s is not supported",
308 pg_enc2name_tbl[pending_client_encoding].name,
309 GetDatabaseEncodingName())));
310 }
311 }
312
313 /*
314 * returns the current client encoding
315 */
316 int
pg_get_client_encoding(void)317 pg_get_client_encoding(void)
318 {
319 return ClientEncoding->encoding;
320 }
321
322 /*
323 * returns the current client encoding name
324 */
325 const char *
pg_get_client_encoding_name(void)326 pg_get_client_encoding_name(void)
327 {
328 return ClientEncoding->name;
329 }
330
331 /*
332 * Convert src string to another encoding (general case).
333 *
334 * See the notes about string conversion functions at the top of this file.
335 */
336 unsigned char *
pg_do_encoding_conversion(unsigned char * src,int len,int src_encoding,int dest_encoding)337 pg_do_encoding_conversion(unsigned char *src, int len,
338 int src_encoding, int dest_encoding)
339 {
340 unsigned char *result;
341 Oid proc;
342
343 if (len <= 0)
344 return src; /* empty string is always valid */
345
346 if (src_encoding == dest_encoding)
347 return src; /* no conversion required, assume valid */
348
349 if (dest_encoding == PG_SQL_ASCII)
350 return src; /* any string is valid in SQL_ASCII */
351
352 if (src_encoding == PG_SQL_ASCII)
353 {
354 /* No conversion is possible, but we must validate the result */
355 (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
356 return src;
357 }
358
359 if (!IsTransactionState()) /* shouldn't happen */
360 elog(ERROR, "cannot perform encoding conversion outside a transaction");
361
362 proc = FindDefaultConversionProc(src_encoding, dest_encoding);
363 if (!OidIsValid(proc))
364 ereport(ERROR,
365 (errcode(ERRCODE_UNDEFINED_FUNCTION),
366 errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
367 pg_encoding_to_char(src_encoding),
368 pg_encoding_to_char(dest_encoding))));
369
370 /*
371 * Allocate space for conversion result, being wary of integer overflow.
372 *
373 * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
374 * required space, so it might exceed MaxAllocSize even though the result
375 * would actually fit. We do not want to hand back a result string that
376 * exceeds MaxAllocSize, because callers might not cope gracefully --- but
377 * if we just allocate more than that, and don't use it, that's fine.
378 */
379 if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
380 ereport(ERROR,
381 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
382 errmsg("out of memory"),
383 errdetail("String of %d bytes is too long for encoding conversion.",
384 len)));
385
386 result = (unsigned char *)
387 MemoryContextAllocHuge(CurrentMemoryContext,
388 (Size) len * MAX_CONVERSION_GROWTH + 1);
389
390 OidFunctionCall5(proc,
391 Int32GetDatum(src_encoding),
392 Int32GetDatum(dest_encoding),
393 CStringGetDatum(src),
394 CStringGetDatum(result),
395 Int32GetDatum(len));
396
397 /*
398 * If the result is large, it's worth repalloc'ing to release any extra
399 * space we asked for. The cutoff here is somewhat arbitrary, but we
400 * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
401 */
402 if (len > 1000000)
403 {
404 Size resultlen = strlen((char *) result);
405
406 if (resultlen >= MaxAllocSize)
407 ereport(ERROR,
408 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
409 errmsg("out of memory"),
410 errdetail("String of %d bytes is too long for encoding conversion.",
411 len)));
412
413 result = (unsigned char *) repalloc(result, resultlen + 1);
414 }
415
416 return result;
417 }
418
419 /*
420 * Convert string to encoding encoding_name. The source
421 * encoding is the DB encoding.
422 *
423 * BYTEA convert_to(TEXT string, NAME encoding_name) */
424 Datum
pg_convert_to(PG_FUNCTION_ARGS)425 pg_convert_to(PG_FUNCTION_ARGS)
426 {
427 Datum string = PG_GETARG_DATUM(0);
428 Datum dest_encoding_name = PG_GETARG_DATUM(1);
429 Datum src_encoding_name = DirectFunctionCall1(namein,
430 CStringGetDatum(DatabaseEncoding->name));
431 Datum result;
432
433 /*
434 * pg_convert expects a bytea as its first argument. We're passing it a
435 * text argument here, relying on the fact that they are both in fact
436 * varlena types, and thus structurally identical.
437 */
438 result = DirectFunctionCall3(pg_convert, string,
439 src_encoding_name, dest_encoding_name);
440
441 PG_RETURN_DATUM(result);
442 }
443
444 /*
445 * Convert string from encoding encoding_name. The destination
446 * encoding is the DB encoding.
447 *
448 * TEXT convert_from(BYTEA string, NAME encoding_name) */
449 Datum
pg_convert_from(PG_FUNCTION_ARGS)450 pg_convert_from(PG_FUNCTION_ARGS)
451 {
452 Datum string = PG_GETARG_DATUM(0);
453 Datum src_encoding_name = PG_GETARG_DATUM(1);
454 Datum dest_encoding_name = DirectFunctionCall1(namein,
455 CStringGetDatum(DatabaseEncoding->name));
456 Datum result;
457
458 result = DirectFunctionCall3(pg_convert, string,
459 src_encoding_name, dest_encoding_name);
460
461 /*
462 * pg_convert returns a bytea, which we in turn return as text, relying on
463 * the fact that they are both in fact varlena types, and thus
464 * structurally identical. Although not all bytea values are valid text,
465 * in this case it will be because we've told pg_convert to return one
466 * that is valid as text in the current database encoding.
467 */
468 PG_RETURN_DATUM(result);
469 }
470
471 /*
472 * Convert string between two arbitrary encodings.
473 *
474 * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
475 */
476 Datum
pg_convert(PG_FUNCTION_ARGS)477 pg_convert(PG_FUNCTION_ARGS)
478 {
479 bytea *string = PG_GETARG_BYTEA_PP(0);
480 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
481 int src_encoding = pg_char_to_encoding(src_encoding_name);
482 char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
483 int dest_encoding = pg_char_to_encoding(dest_encoding_name);
484 const char *src_str;
485 char *dest_str;
486 bytea *retval;
487 int len;
488
489 if (src_encoding < 0)
490 ereport(ERROR,
491 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
492 errmsg("invalid source encoding name \"%s\"",
493 src_encoding_name)));
494 if (dest_encoding < 0)
495 ereport(ERROR,
496 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
497 errmsg("invalid destination encoding name \"%s\"",
498 dest_encoding_name)));
499
500 /* make sure that source string is valid */
501 len = VARSIZE_ANY_EXHDR(string);
502 src_str = VARDATA_ANY(string);
503 pg_verify_mbstr_len(src_encoding, src_str, len, false);
504
505 /* perform conversion */
506 dest_str = (char *) pg_do_encoding_conversion((unsigned char *) src_str,
507 len,
508 src_encoding,
509 dest_encoding);
510
511 /* update len if conversion actually happened */
512 if (dest_str != src_str)
513 len = strlen(dest_str);
514
515 /*
516 * build bytea data type structure.
517 */
518 retval = (bytea *) palloc(len + VARHDRSZ);
519 SET_VARSIZE(retval, len + VARHDRSZ);
520 memcpy(VARDATA(retval), dest_str, len);
521
522 if (dest_str != src_str)
523 pfree(dest_str);
524
525 /* free memory if allocated by the toaster */
526 PG_FREE_IF_COPY(string, 0);
527
528 PG_RETURN_BYTEA_P(retval);
529 }
530
531 /*
532 * get the length of the string considered as text in the specified
533 * encoding. Raises an error if the data is not valid in that
534 * encoding.
535 *
536 * INT4 length (BYTEA string, NAME src_encoding_name)
537 */
538 Datum
length_in_encoding(PG_FUNCTION_ARGS)539 length_in_encoding(PG_FUNCTION_ARGS)
540 {
541 bytea *string = PG_GETARG_BYTEA_PP(0);
542 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
543 int src_encoding = pg_char_to_encoding(src_encoding_name);
544 const char *src_str;
545 int len;
546 int retval;
547
548 if (src_encoding < 0)
549 ereport(ERROR,
550 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
551 errmsg("invalid encoding name \"%s\"",
552 src_encoding_name)));
553
554 len = VARSIZE_ANY_EXHDR(string);
555 src_str = VARDATA_ANY(string);
556
557 retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
558
559 PG_RETURN_INT32(retval);
560 }
561
562 /*
563 * Get maximum multibyte character length in the specified encoding.
564 *
565 * Note encoding is specified numerically, not by name as above.
566 */
567 Datum
pg_encoding_max_length_sql(PG_FUNCTION_ARGS)568 pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
569 {
570 int encoding = PG_GETARG_INT32(0);
571
572 if (PG_VALID_ENCODING(encoding))
573 PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
574 else
575 PG_RETURN_NULL();
576 }
577
578 /*
579 * Convert client encoding to server encoding.
580 *
581 * See the notes about string conversion functions at the top of this file.
582 */
583 char *
pg_client_to_server(const char * s,int len)584 pg_client_to_server(const char *s, int len)
585 {
586 return pg_any_to_server(s, len, ClientEncoding->encoding);
587 }
588
589 /*
590 * Convert any encoding to server encoding.
591 *
592 * See the notes about string conversion functions at the top of this file.
593 *
594 * Unlike the other string conversion functions, this will apply validation
595 * even if encoding == DatabaseEncoding->encoding. This is because this is
596 * used to process data coming in from outside the database, and we never
597 * want to just assume validity.
598 */
599 char *
pg_any_to_server(const char * s,int len,int encoding)600 pg_any_to_server(const char *s, int len, int encoding)
601 {
602 if (len <= 0)
603 return (char *) s; /* empty string is always valid */
604
605 if (encoding == DatabaseEncoding->encoding ||
606 encoding == PG_SQL_ASCII)
607 {
608 /*
609 * No conversion is needed, but we must still validate the data.
610 */
611 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
612 return (char *) s;
613 }
614
615 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
616 {
617 /*
618 * No conversion is possible, but we must still validate the data,
619 * because the client-side code might have done string escaping using
620 * the selected client_encoding. If the client encoding is ASCII-safe
621 * then we just do a straight validation under that encoding. For an
622 * ASCII-unsafe encoding we have a problem: we dare not pass such data
623 * to the parser but we have no way to convert it. We compromise by
624 * rejecting the data if it contains any non-ASCII characters.
625 */
626 if (PG_VALID_BE_ENCODING(encoding))
627 (void) pg_verify_mbstr(encoding, s, len, false);
628 else
629 {
630 int i;
631
632 for (i = 0; i < len; i++)
633 {
634 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
635 ereport(ERROR,
636 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
637 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
638 pg_enc2name_tbl[PG_SQL_ASCII].name,
639 (unsigned char) s[i])));
640 }
641 }
642 return (char *) s;
643 }
644
645 /* Fast path if we can use cached conversion function */
646 if (encoding == ClientEncoding->encoding)
647 return perform_default_encoding_conversion(s, len, true);
648
649 /* General case ... will not work outside transactions */
650 return (char *) pg_do_encoding_conversion((unsigned char *) s,
651 len,
652 encoding,
653 DatabaseEncoding->encoding);
654 }
655
656 /*
657 * Convert server encoding to client encoding.
658 *
659 * See the notes about string conversion functions at the top of this file.
660 */
661 char *
pg_server_to_client(const char * s,int len)662 pg_server_to_client(const char *s, int len)
663 {
664 return pg_server_to_any(s, len, ClientEncoding->encoding);
665 }
666
667 /*
668 * Convert server encoding to any encoding.
669 *
670 * See the notes about string conversion functions at the top of this file.
671 */
672 char *
pg_server_to_any(const char * s,int len,int encoding)673 pg_server_to_any(const char *s, int len, int encoding)
674 {
675 if (len <= 0)
676 return (char *) s; /* empty string is always valid */
677
678 if (encoding == DatabaseEncoding->encoding ||
679 encoding == PG_SQL_ASCII)
680 return (char *) s; /* assume data is valid */
681
682 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
683 {
684 /* No conversion is possible, but we must validate the result */
685 (void) pg_verify_mbstr(encoding, s, len, false);
686 return (char *) s;
687 }
688
689 /* Fast path if we can use cached conversion function */
690 if (encoding == ClientEncoding->encoding)
691 return perform_default_encoding_conversion(s, len, false);
692
693 /* General case ... will not work outside transactions */
694 return (char *) pg_do_encoding_conversion((unsigned char *) s,
695 len,
696 DatabaseEncoding->encoding,
697 encoding);
698 }
699
700 /*
701 * Perform default encoding conversion using cached FmgrInfo. Since
702 * this function does not access database at all, it is safe to call
703 * outside transactions. If the conversion has not been set up by
704 * SetClientEncoding(), no conversion is performed.
705 */
706 static char *
perform_default_encoding_conversion(const char * src,int len,bool is_client_to_server)707 perform_default_encoding_conversion(const char *src, int len,
708 bool is_client_to_server)
709 {
710 char *result;
711 int src_encoding,
712 dest_encoding;
713 FmgrInfo *flinfo;
714
715 if (is_client_to_server)
716 {
717 src_encoding = ClientEncoding->encoding;
718 dest_encoding = DatabaseEncoding->encoding;
719 flinfo = ToServerConvProc;
720 }
721 else
722 {
723 src_encoding = DatabaseEncoding->encoding;
724 dest_encoding = ClientEncoding->encoding;
725 flinfo = ToClientConvProc;
726 }
727
728 if (flinfo == NULL)
729 return (char *) src;
730
731 /*
732 * Allocate space for conversion result, being wary of integer overflow.
733 * See comments in pg_do_encoding_conversion.
734 */
735 if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
736 ereport(ERROR,
737 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
738 errmsg("out of memory"),
739 errdetail("String of %d bytes is too long for encoding conversion.",
740 len)));
741
742 result = (char *)
743 MemoryContextAllocHuge(CurrentMemoryContext,
744 (Size) len * MAX_CONVERSION_GROWTH + 1);
745
746 FunctionCall5(flinfo,
747 Int32GetDatum(src_encoding),
748 Int32GetDatum(dest_encoding),
749 CStringGetDatum(src),
750 CStringGetDatum(result),
751 Int32GetDatum(len));
752
753 /*
754 * Release extra space if there might be a lot --- see comments in
755 * pg_do_encoding_conversion.
756 */
757 if (len > 1000000)
758 {
759 Size resultlen = strlen(result);
760
761 if (resultlen >= MaxAllocSize)
762 ereport(ERROR,
763 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
764 errmsg("out of memory"),
765 errdetail("String of %d bytes is too long for encoding conversion.",
766 len)));
767
768 result = (char *) repalloc(result, resultlen + 1);
769 }
770
771 return result;
772 }
773
774
775 /* convert a multibyte string to a wchar */
776 int
pg_mb2wchar(const char * from,pg_wchar * to)777 pg_mb2wchar(const char *from, pg_wchar *to)
778 {
779 return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
780 }
781
782 /* convert a multibyte string to a wchar with a limited length */
783 int
pg_mb2wchar_with_len(const char * from,pg_wchar * to,int len)784 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
785 {
786 return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
787 }
788
789 /* same, with any encoding */
790 int
pg_encoding_mb2wchar_with_len(int encoding,const char * from,pg_wchar * to,int len)791 pg_encoding_mb2wchar_with_len(int encoding,
792 const char *from, pg_wchar *to, int len)
793 {
794 return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
795 }
796
797 /* convert a wchar string to a multibyte */
798 int
pg_wchar2mb(const pg_wchar * from,char * to)799 pg_wchar2mb(const pg_wchar *from, char *to)
800 {
801 return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, pg_wchar_strlen(from));
802 }
803
804 /* convert a wchar string to a multibyte with a limited length */
805 int
pg_wchar2mb_with_len(const pg_wchar * from,char * to,int len)806 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
807 {
808 return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
809 }
810
811 /* same, with any encoding */
812 int
pg_encoding_wchar2mb_with_len(int encoding,const pg_wchar * from,char * to,int len)813 pg_encoding_wchar2mb_with_len(int encoding,
814 const pg_wchar *from, char *to, int len)
815 {
816 return (*pg_wchar_table[encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
817 }
818
819 /* returns the byte length of a multibyte character */
820 int
pg_mblen(const char * mbstr)821 pg_mblen(const char *mbstr)
822 {
823 return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
824 }
825
826 /* returns the display length of a multibyte character */
827 int
pg_dsplen(const char * mbstr)828 pg_dsplen(const char *mbstr)
829 {
830 return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
831 }
832
/*
 * returns the length (counted in wchars) of a multibyte string
 *
 * mbstr must be null-terminated.
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			len = 0;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* Step over one character at a time until the terminator */
	while (*mbstr)
	{
		mbstr += pg_mblen(mbstr);
		len++;
	}
	return len;
}
850
/*
 * returns the length (counted in wchars) of a multibyte string
 * (not necessarily NULL terminated)
 *
 * limit is the number of bytes available at mbstr.  Note that for a
 * single-byte database encoding the answer is simply limit; the string is
 * not scanned for an embedded zero byte in that case.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			len = 0;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	while (limit > 0 && *mbstr)
	{
		int			l = pg_mblen(mbstr);

		limit -= l;
		mbstr += l;
		len++;
	}
	return len;
}
873
874 /*
875 * returns the byte length of a multibyte string
876 * (not necessarily NULL terminated)
877 * that is no longer than limit.
878 * this function does not break multibyte character boundary.
879 */
880 int
pg_mbcliplen(const char * mbstr,int len,int limit)881 pg_mbcliplen(const char *mbstr, int len, int limit)
882 {
883 return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
884 len, limit);
885 }
886
887 /*
888 * pg_mbcliplen with specified encoding
889 */
890 int
pg_encoding_mbcliplen(int encoding,const char * mbstr,int len,int limit)891 pg_encoding_mbcliplen(int encoding, const char *mbstr,
892 int len, int limit)
893 {
894 mblen_converter mblen_fn;
895 int clen = 0;
896 int l;
897
898 /* optimization for single byte encoding */
899 if (pg_encoding_max_length(encoding) == 1)
900 return cliplen(mbstr, len, limit);
901
902 mblen_fn = pg_wchar_table[encoding].mblen;
903
904 while (len > 0 && *mbstr)
905 {
906 l = (*mblen_fn) ((const unsigned char *) mbstr);
907 if ((clen + l) > limit)
908 break;
909 clen += l;
910 if (clen == limit)
911 break;
912 len -= l;
913 mbstr += l;
914 }
915 return clen;
916 }
917
/*
 * Similar to pg_mbcliplen except the limit parameter specifies the
 * character length, not the byte length.
 *
 * Returns the byte length of at most "limit" leading characters of mbstr.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			clen = 0;
	int			nch = 0;
	int			l;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr)
	{
		l = pg_mblen(mbstr);
		nch++;
		/* Stop once this character would exceed the character limit */
		if (nch > limit)
			break;
		clen += l;
		len -= l;
		mbstr += l;
	}
	return clen;
}
945
/*
 * mbcliplen for any single-byte encoding
 *
 * Returns the number of leading nonzero bytes of str, looking at no more
 * than the smaller of len and limit bytes.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			l = 0;

	/* Never look past either bound (open-coded minimum of len and limit) */
	if (limit < len)
		len = limit;
	while (l < len && str[l])
		l++;
	return l;
}
957
958 void
SetDatabaseEncoding(int encoding)959 SetDatabaseEncoding(int encoding)
960 {
961 if (!PG_VALID_BE_ENCODING(encoding))
962 elog(ERROR, "invalid database encoding: %d", encoding);
963
964 DatabaseEncoding = &pg_enc2name_tbl[encoding];
965 Assert(DatabaseEncoding->encoding == encoding);
966 }
967
968 void
SetMessageEncoding(int encoding)969 SetMessageEncoding(int encoding)
970 {
971 /* Some calls happen before we can elog()! */
972 Assert(PG_VALID_ENCODING(encoding));
973
974 MessageEncoding = &pg_enc2name_tbl[encoding];
975 Assert(MessageEncoding->encoding == encoding);
976 }
977
978 #ifdef ENABLE_NLS
979 /*
980 * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
981 * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
982 * fail for gettext-internal causes like out-of-memory.
983 */
984 static bool
raw_pg_bind_textdomain_codeset(const char * domainname,int encoding)985 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
986 {
987 bool elog_ok = (CurrentMemoryContext != NULL);
988 int i;
989
990 for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
991 {
992 if (pg_enc2gettext_tbl[i].encoding == encoding)
993 {
994 if (bind_textdomain_codeset(domainname,
995 pg_enc2gettext_tbl[i].name) != NULL)
996 return true;
997
998 if (elog_ok)
999 elog(LOG, "bind_textdomain_codeset failed");
1000 else
1001 write_stderr("bind_textdomain_codeset failed");
1002
1003 break;
1004 }
1005 }
1006
1007 return false;
1008 }
1009
1010 /*
1011 * Bind a gettext message domain to the codeset corresponding to the database
1012 * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1013 * Return the MessageEncoding implied by the new settings.
1014 *
1015 * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1016 * When that matches the database encoding, we don't need to do anything. In
1017 * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1018 * database encoding, except for the C locale. (On Windows, we also permit a
1019 * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
1020 * gettext to the right codeset.
1021 *
1022 * On Windows, gettext defaults to the Windows ANSI code page. This is a
1023 * convenient departure for software that passes the strings to Windows ANSI
1024 * APIs, but we don't do that. Compel gettext to use database encoding or,
1025 * failing that, the LC_CTYPE encoding as it would on other platforms.
1026 *
1027 * This function is called before elog() and palloc() are usable.
1028 */
1029 int
pg_bind_textdomain_codeset(const char * domainname)1030 pg_bind_textdomain_codeset(const char *domainname)
1031 {
1032 bool elog_ok = (CurrentMemoryContext != NULL);
1033 int encoding = GetDatabaseEncoding();
1034 int new_msgenc;
1035
1036 #ifndef WIN32
1037 const char *ctype = setlocale(LC_CTYPE, NULL);
1038
1039 if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1040 #endif
1041 if (encoding != PG_SQL_ASCII &&
1042 raw_pg_bind_textdomain_codeset(domainname, encoding))
1043 return encoding;
1044
1045 new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
1046 if (new_msgenc < 0)
1047 new_msgenc = PG_SQL_ASCII;
1048
1049 #ifdef WIN32
1050 if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1051 /* On failure, the old message encoding remains valid. */
1052 return GetMessageEncoding();
1053 #endif
1054
1055 return new_msgenc;
1056 }
1057 #endif
1058
1059 /*
1060 * The database encoding, also called the server encoding, represents the
1061 * encoding of data stored in text-like data types. Affected types include
1062 * cstring, text, varchar, name, xml, and json.
1063 */
1064 int
GetDatabaseEncoding(void)1065 GetDatabaseEncoding(void)
1066 {
1067 return DatabaseEncoding->encoding;
1068 }
1069
1070 const char *
GetDatabaseEncodingName(void)1071 GetDatabaseEncodingName(void)
1072 {
1073 return DatabaseEncoding->name;
1074 }
1075
1076 Datum
getdatabaseencoding(PG_FUNCTION_ARGS)1077 getdatabaseencoding(PG_FUNCTION_ARGS)
1078 {
1079 return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1080 }
1081
1082 Datum
pg_client_encoding(PG_FUNCTION_ARGS)1083 pg_client_encoding(PG_FUNCTION_ARGS)
1084 {
1085 return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1086 }
1087
1088 /*
1089 * gettext() returns messages in this encoding. This often matches the
1090 * database encoding, but it differs for SQL_ASCII databases, for processes
1091 * not attached to a database, and under a database encoding lacking iconv
1092 * support (MULE_INTERNAL).
1093 */
1094 int
GetMessageEncoding(void)1095 GetMessageEncoding(void)
1096 {
1097 return MessageEncoding->encoding;
1098 }
1099
1100 #ifdef WIN32
1101 /*
1102 * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1103 * string. The character length is also passed to utf16len if not
1104 * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1105 * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1106 */
1107 WCHAR *
pgwin32_message_to_UTF16(const char * str,int len,int * utf16len)1108 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1109 {
1110 int msgenc = GetMessageEncoding();
1111 WCHAR *utf16;
1112 int dstlen;
1113 UINT codepage;
1114
1115 if (msgenc == PG_SQL_ASCII)
1116 /* No conversion is possible, and SQL_ASCII is never utf16. */
1117 return NULL;
1118
1119 codepage = pg_enc2name_tbl[msgenc].codepage;
1120
1121 /*
1122 * Use MultiByteToWideChar directly if there is a corresponding codepage,
1123 * or double conversion through UTF8 if not. Double conversion is needed,
1124 * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1125 */
1126 if (codepage != 0)
1127 {
1128 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1129 dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1130 utf16[dstlen] = (WCHAR) 0;
1131 }
1132 else
1133 {
1134 char *utf8;
1135
1136 /*
1137 * XXX pg_do_encoding_conversion() requires a transaction. In the
1138 * absence of one, hope for the input to be valid UTF8.
1139 */
1140 if (IsTransactionState())
1141 {
1142 utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1143 len,
1144 msgenc,
1145 PG_UTF8);
1146 if (utf8 != str)
1147 len = strlen(utf8);
1148 }
1149 else
1150 utf8 = (char *) str;
1151
1152 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1153 dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1154 utf16[dstlen] = (WCHAR) 0;
1155
1156 if (utf8 != str)
1157 pfree(utf8);
1158 }
1159
1160 if (dstlen == 0 && len > 0)
1161 {
1162 pfree(utf16);
1163 return NULL; /* error */
1164 }
1165
1166 if (utf16len)
1167 *utf16len = dstlen;
1168 return utf16;
1169 }
1170
1171 #endif
1172