1 /*-------------------------------------------------------------------------
2  *
3  * mbutils.c
4  *	  This file contains functions for encoding conversion.
5  *
6  * The string-conversion functions in this file share some API quirks.
7  * Note the following:
8  *
9  * The functions return a palloc'd, null-terminated string if conversion
10  * is required.  However, if no conversion is performed, the given source
11  * string pointer is returned as-is.
12  *
13  * Although the presence of a length argument means that callers can pass
14  * non-null-terminated strings, care is required because the same string
15  * will be passed back if no conversion occurs.  Such callers *must* check
16  * whether result == src and handle that case differently.
17  *
18  * If the source and destination encodings are the same, the source string
19  * is returned without any verification; it's assumed to be valid data.
20  * If that might not be the case, the caller is responsible for validating
21  * the string using a separate call to pg_verify_mbstr().  Whenever the
22  * source and destination encodings are different, the functions ensure that
23  * the result is validly encoded according to the destination encoding.
24  *
25  *
26  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  *
30  * IDENTIFICATION
31  *	  src/backend/utils/mb/mbutils.c
32  *
33  *-------------------------------------------------------------------------
34  */
35 #include "postgres.h"
36 
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/builtins.h"
41 #include "utils/memutils.h"
42 #include "utils/syscache.h"
43 
/*
 * When converting strings between different encodings, we assume that the
 * converted result grows by at most 4-to-1 in the worst case.  The growth
 * rates for the currently supported encoding pairs are all within 3 (SJIS
 * JIS X0201 half-width kana -> UTF8 is the worst case), so "4" should be
 * enough for the moment.
 *
 * Note that this is not the same as the maximum character width in any
 * particular encoding.
 */
#define MAX_CONVERSION_GROWTH  4
54 
/*
 * We maintain a simple linked list caching the fmgr lookup info for the
 * currently selected conversion functions, as well as any that have been
 * selected previously in the current session.  (We remember previous
 * settings because we must be able to restore a previous setting during
 * transaction rollback, without doing any fresh catalog accesses.)
 *
 * Since we'll never release this data, we just keep it in TopMemoryContext.
 */
typedef struct ConvProcInfo
{
	int			s_encoding;		/* server (database) encoding ID */
	int			c_encoding;		/* client encoding ID */
	FmgrInfo	to_server_info; /* lookup info for client-to-server proc */
	FmgrInfo	to_client_info; /* lookup info for server-to-client proc */
} ConvProcInfo;

/* List of ConvProcInfo; PrepareClientEncoding pushes new entries at the head */
static List *ConvProcList = NIL;
73 
/*
 * These variables point to the currently active conversion functions,
 * or are NULL when no conversion is needed.  When non-NULL they point into
 * a ConvProcInfo entry of ConvProcList (see SetClientEncoding).
 */
static FmgrInfo *ToServerConvProc = NULL;	/* client -> server */
static FmgrInfo *ToClientConvProc = NULL;	/* server -> client */
80 
/*
 * These variables track the currently-selected encodings.  Each starts out
 * as SQL_ASCII and is changed by SetClientEncoding, SetDatabaseEncoding and
 * SetMessageEncoding respectively.
 */
static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
87 
/*
 * During backend startup we can't set client encoding because we (a)
 * can't look up the conversion functions, and (b) may not know the database
 * encoding yet either.  So SetClientEncoding() just accepts anything and
 * remembers it for InitializeClientEncoding() to apply later.
 */
/* set to true by InitializeClientEncoding() */
static bool backend_startup_complete = false;
/* encoding requested before startup completed; SQL_ASCII if none was */
static int	pending_client_encoding = PG_SQL_ASCII;
96 
97 
/* Internal functions */

/* conversion through the cached client<->server conversion procs */
static char *perform_default_encoding_conversion(const char *src,
									int len, bool is_client_to_server);
/* byte-length clipping helper for single-byte encodings */
static int	cliplen(const char *str, int len, int limit);
102 
103 
104 /*
105  * Prepare for a future call to SetClientEncoding.  Success should mean
106  * that SetClientEncoding is guaranteed to succeed for this encoding request.
107  *
108  * (But note that success before backend_startup_complete does not guarantee
109  * success after ...)
110  *
111  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
112  */
113 int
PrepareClientEncoding(int encoding)114 PrepareClientEncoding(int encoding)
115 {
116 	int			current_server_encoding;
117 	ListCell   *lc;
118 
119 	if (!PG_VALID_FE_ENCODING(encoding))
120 		return -1;
121 
122 	/* Can't do anything during startup, per notes above */
123 	if (!backend_startup_complete)
124 		return 0;
125 
126 	current_server_encoding = GetDatabaseEncoding();
127 
128 	/*
129 	 * Check for cases that require no conversion function.
130 	 */
131 	if (current_server_encoding == encoding ||
132 		current_server_encoding == PG_SQL_ASCII ||
133 		encoding == PG_SQL_ASCII)
134 		return 0;
135 
136 	if (IsTransactionState())
137 	{
138 		/*
139 		 * If we're in a live transaction, it's safe to access the catalogs,
140 		 * so look up the functions.  We repeat the lookup even if the info is
141 		 * already cached, so that we can react to changes in the contents of
142 		 * pg_conversion.
143 		 */
144 		Oid			to_server_proc,
145 					to_client_proc;
146 		ConvProcInfo *convinfo;
147 		MemoryContext oldcontext;
148 
149 		to_server_proc = FindDefaultConversionProc(encoding,
150 												   current_server_encoding);
151 		if (!OidIsValid(to_server_proc))
152 			return -1;
153 		to_client_proc = FindDefaultConversionProc(current_server_encoding,
154 												   encoding);
155 		if (!OidIsValid(to_client_proc))
156 			return -1;
157 
158 		/*
159 		 * Load the fmgr info into TopMemoryContext (could still fail here)
160 		 */
161 		convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
162 													   sizeof(ConvProcInfo));
163 		convinfo->s_encoding = current_server_encoding;
164 		convinfo->c_encoding = encoding;
165 		fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
166 					  TopMemoryContext);
167 		fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
168 					  TopMemoryContext);
169 
170 		/* Attach new info to head of list */
171 		oldcontext = MemoryContextSwitchTo(TopMemoryContext);
172 		ConvProcList = lcons(convinfo, ConvProcList);
173 		MemoryContextSwitchTo(oldcontext);
174 
175 		/*
176 		 * We cannot yet remove any older entry for the same encoding pair,
177 		 * since it could still be in use.  SetClientEncoding will clean up.
178 		 */
179 
180 		return 0;				/* success */
181 	}
182 	else
183 	{
184 		/*
185 		 * If we're not in a live transaction, the only thing we can do is
186 		 * restore a previous setting using the cache.  This covers all
187 		 * transaction-rollback cases.  The only case it might not work for is
188 		 * trying to change client_encoding on the fly by editing
189 		 * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
190 		 * thing to do anyway.
191 		 */
192 		foreach(lc, ConvProcList)
193 		{
194 			ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
195 
196 			if (oldinfo->s_encoding == current_server_encoding &&
197 				oldinfo->c_encoding == encoding)
198 				return 0;
199 		}
200 
201 		return -1;				/* it's not cached, so fail */
202 	}
203 }
204 
205 /*
206  * Set the active client encoding and set up the conversion-function pointers.
207  * PrepareClientEncoding should have been called previously for this encoding.
208  *
209  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
210  */
211 int
SetClientEncoding(int encoding)212 SetClientEncoding(int encoding)
213 {
214 	int			current_server_encoding;
215 	bool		found;
216 	ListCell   *lc;
217 	ListCell   *prev;
218 	ListCell   *next;
219 
220 	if (!PG_VALID_FE_ENCODING(encoding))
221 		return -1;
222 
223 	/* Can't do anything during startup, per notes above */
224 	if (!backend_startup_complete)
225 	{
226 		pending_client_encoding = encoding;
227 		return 0;
228 	}
229 
230 	current_server_encoding = GetDatabaseEncoding();
231 
232 	/*
233 	 * Check for cases that require no conversion function.
234 	 */
235 	if (current_server_encoding == encoding ||
236 		current_server_encoding == PG_SQL_ASCII ||
237 		encoding == PG_SQL_ASCII)
238 	{
239 		ClientEncoding = &pg_enc2name_tbl[encoding];
240 		ToServerConvProc = NULL;
241 		ToClientConvProc = NULL;
242 		return 0;
243 	}
244 
245 	/*
246 	 * Search the cache for the entry previously prepared by
247 	 * PrepareClientEncoding; if there isn't one, we lose.  While at it,
248 	 * release any duplicate entries so that repeated Prepare/Set cycles don't
249 	 * leak memory.
250 	 */
251 	found = false;
252 	prev = NULL;
253 	for (lc = list_head(ConvProcList); lc; lc = next)
254 	{
255 		ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
256 
257 		next = lnext(lc);
258 
259 		if (convinfo->s_encoding == current_server_encoding &&
260 			convinfo->c_encoding == encoding)
261 		{
262 			if (!found)
263 			{
264 				/* Found newest entry, so set up */
265 				ClientEncoding = &pg_enc2name_tbl[encoding];
266 				ToServerConvProc = &convinfo->to_server_info;
267 				ToClientConvProc = &convinfo->to_client_info;
268 				found = true;
269 			}
270 			else
271 			{
272 				/* Duplicate entry, release it */
273 				ConvProcList = list_delete_cell(ConvProcList, lc, prev);
274 				pfree(convinfo);
275 				continue;		/* prev mustn't advance */
276 			}
277 		}
278 
279 		prev = lc;
280 	}
281 
282 	if (found)
283 		return 0;				/* success */
284 	else
285 		return -1;				/* it's not cached, so fail */
286 }
287 
288 /*
289  * Initialize client encoding conversions.
290  *		Called from InitPostgres() once during backend startup.
291  */
292 void
InitializeClientEncoding(void)293 InitializeClientEncoding(void)
294 {
295 	Assert(!backend_startup_complete);
296 	backend_startup_complete = true;
297 
298 	if (PrepareClientEncoding(pending_client_encoding) < 0 ||
299 		SetClientEncoding(pending_client_encoding) < 0)
300 	{
301 		/*
302 		 * Oops, the requested conversion is not available. We couldn't fail
303 		 * before, but we can now.
304 		 */
305 		ereport(FATAL,
306 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
307 				 errmsg("conversion between %s and %s is not supported",
308 						pg_enc2name_tbl[pending_client_encoding].name,
309 						GetDatabaseEncodingName())));
310 	}
311 }
312 
313 /*
314  * returns the current client encoding
315  */
316 int
pg_get_client_encoding(void)317 pg_get_client_encoding(void)
318 {
319 	return ClientEncoding->encoding;
320 }
321 
322 /*
323  * returns the current client encoding name
324  */
325 const char *
pg_get_client_encoding_name(void)326 pg_get_client_encoding_name(void)
327 {
328 	return ClientEncoding->name;
329 }
330 
331 /*
332  * Convert src string to another encoding (general case).
333  *
334  * See the notes about string conversion functions at the top of this file.
335  */
336 unsigned char *
pg_do_encoding_conversion(unsigned char * src,int len,int src_encoding,int dest_encoding)337 pg_do_encoding_conversion(unsigned char *src, int len,
338 						  int src_encoding, int dest_encoding)
339 {
340 	unsigned char *result;
341 	Oid			proc;
342 
343 	if (len <= 0)
344 		return src;				/* empty string is always valid */
345 
346 	if (src_encoding == dest_encoding)
347 		return src;				/* no conversion required, assume valid */
348 
349 	if (dest_encoding == PG_SQL_ASCII)
350 		return src;				/* any string is valid in SQL_ASCII */
351 
352 	if (src_encoding == PG_SQL_ASCII)
353 	{
354 		/* No conversion is possible, but we must validate the result */
355 		(void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
356 		return src;
357 	}
358 
359 	if (!IsTransactionState())	/* shouldn't happen */
360 		elog(ERROR, "cannot perform encoding conversion outside a transaction");
361 
362 	proc = FindDefaultConversionProc(src_encoding, dest_encoding);
363 	if (!OidIsValid(proc))
364 		ereport(ERROR,
365 				(errcode(ERRCODE_UNDEFINED_FUNCTION),
366 				 errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
367 						pg_encoding_to_char(src_encoding),
368 						pg_encoding_to_char(dest_encoding))));
369 
370 	/*
371 	 * Allocate space for conversion result, being wary of integer overflow.
372 	 *
373 	 * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
374 	 * required space, so it might exceed MaxAllocSize even though the result
375 	 * would actually fit.  We do not want to hand back a result string that
376 	 * exceeds MaxAllocSize, because callers might not cope gracefully --- but
377 	 * if we just allocate more than that, and don't use it, that's fine.
378 	 */
379 	if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
380 		ereport(ERROR,
381 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
382 				 errmsg("out of memory"),
383 				 errdetail("String of %d bytes is too long for encoding conversion.",
384 						   len)));
385 
386 	result = (unsigned char *)
387 		MemoryContextAllocHuge(CurrentMemoryContext,
388 							   (Size) len * MAX_CONVERSION_GROWTH + 1);
389 
390 	OidFunctionCall5(proc,
391 					 Int32GetDatum(src_encoding),
392 					 Int32GetDatum(dest_encoding),
393 					 CStringGetDatum(src),
394 					 CStringGetDatum(result),
395 					 Int32GetDatum(len));
396 
397 	/*
398 	 * If the result is large, it's worth repalloc'ing to release any extra
399 	 * space we asked for.  The cutoff here is somewhat arbitrary, but we
400 	 * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
401 	 */
402 	if (len > 1000000)
403 	{
404 		Size		resultlen = strlen((char *) result);
405 
406 		if (resultlen >= MaxAllocSize)
407 			ereport(ERROR,
408 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
409 					 errmsg("out of memory"),
410 					 errdetail("String of %d bytes is too long for encoding conversion.",
411 							   len)));
412 
413 		result = (unsigned char *) repalloc(result, resultlen + 1);
414 	}
415 
416 	return result;
417 }
418 
419 /*
420  * Convert string to encoding encoding_name. The source
421  * encoding is the DB encoding.
422  *
423  * BYTEA convert_to(TEXT string, NAME encoding_name) */
424 Datum
pg_convert_to(PG_FUNCTION_ARGS)425 pg_convert_to(PG_FUNCTION_ARGS)
426 {
427 	Datum		string = PG_GETARG_DATUM(0);
428 	Datum		dest_encoding_name = PG_GETARG_DATUM(1);
429 	Datum		src_encoding_name = DirectFunctionCall1(namein,
430 														CStringGetDatum(DatabaseEncoding->name));
431 	Datum		result;
432 
433 	/*
434 	 * pg_convert expects a bytea as its first argument. We're passing it a
435 	 * text argument here, relying on the fact that they are both in fact
436 	 * varlena types, and thus structurally identical.
437 	 */
438 	result = DirectFunctionCall3(pg_convert, string,
439 								 src_encoding_name, dest_encoding_name);
440 
441 	PG_RETURN_DATUM(result);
442 }
443 
444 /*
445  * Convert string from encoding encoding_name. The destination
446  * encoding is the DB encoding.
447  *
448  * TEXT convert_from(BYTEA string, NAME encoding_name) */
449 Datum
pg_convert_from(PG_FUNCTION_ARGS)450 pg_convert_from(PG_FUNCTION_ARGS)
451 {
452 	Datum		string = PG_GETARG_DATUM(0);
453 	Datum		src_encoding_name = PG_GETARG_DATUM(1);
454 	Datum		dest_encoding_name = DirectFunctionCall1(namein,
455 														 CStringGetDatum(DatabaseEncoding->name));
456 	Datum		result;
457 
458 	result = DirectFunctionCall3(pg_convert, string,
459 								 src_encoding_name, dest_encoding_name);
460 
461 	/*
462 	 * pg_convert returns a bytea, which we in turn return as text, relying on
463 	 * the fact that they are both in fact varlena types, and thus
464 	 * structurally identical. Although not all bytea values are valid text,
465 	 * in this case it will be because we've told pg_convert to return one
466 	 * that is valid as text in the current database encoding.
467 	 */
468 	PG_RETURN_DATUM(result);
469 }
470 
471 /*
472  * Convert string between two arbitrary encodings.
473  *
474  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
475  */
476 Datum
pg_convert(PG_FUNCTION_ARGS)477 pg_convert(PG_FUNCTION_ARGS)
478 {
479 	bytea	   *string = PG_GETARG_BYTEA_PP(0);
480 	char	   *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
481 	int			src_encoding = pg_char_to_encoding(src_encoding_name);
482 	char	   *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
483 	int			dest_encoding = pg_char_to_encoding(dest_encoding_name);
484 	const char *src_str;
485 	char	   *dest_str;
486 	bytea	   *retval;
487 	int			len;
488 
489 	if (src_encoding < 0)
490 		ereport(ERROR,
491 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
492 				 errmsg("invalid source encoding name \"%s\"",
493 						src_encoding_name)));
494 	if (dest_encoding < 0)
495 		ereport(ERROR,
496 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
497 				 errmsg("invalid destination encoding name \"%s\"",
498 						dest_encoding_name)));
499 
500 	/* make sure that source string is valid */
501 	len = VARSIZE_ANY_EXHDR(string);
502 	src_str = VARDATA_ANY(string);
503 	pg_verify_mbstr_len(src_encoding, src_str, len, false);
504 
505 	/* perform conversion */
506 	dest_str = (char *) pg_do_encoding_conversion((unsigned char *) src_str,
507 												  len,
508 												  src_encoding,
509 												  dest_encoding);
510 
511 	/* update len if conversion actually happened */
512 	if (dest_str != src_str)
513 		len = strlen(dest_str);
514 
515 	/*
516 	 * build bytea data type structure.
517 	 */
518 	retval = (bytea *) palloc(len + VARHDRSZ);
519 	SET_VARSIZE(retval, len + VARHDRSZ);
520 	memcpy(VARDATA(retval), dest_str, len);
521 
522 	if (dest_str != src_str)
523 		pfree(dest_str);
524 
525 	/* free memory if allocated by the toaster */
526 	PG_FREE_IF_COPY(string, 0);
527 
528 	PG_RETURN_BYTEA_P(retval);
529 }
530 
531 /*
532  * get the length of the string considered as text in the specified
533  * encoding. Raises an error if the data is not valid in that
534  * encoding.
535  *
536  * INT4 length (BYTEA string, NAME src_encoding_name)
537  */
538 Datum
length_in_encoding(PG_FUNCTION_ARGS)539 length_in_encoding(PG_FUNCTION_ARGS)
540 {
541 	bytea	   *string = PG_GETARG_BYTEA_PP(0);
542 	char	   *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
543 	int			src_encoding = pg_char_to_encoding(src_encoding_name);
544 	const char *src_str;
545 	int			len;
546 	int			retval;
547 
548 	if (src_encoding < 0)
549 		ereport(ERROR,
550 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
551 				 errmsg("invalid encoding name \"%s\"",
552 						src_encoding_name)));
553 
554 	len = VARSIZE_ANY_EXHDR(string);
555 	src_str = VARDATA_ANY(string);
556 
557 	retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
558 
559 	PG_RETURN_INT32(retval);
560 }
561 
562 /*
563  * Get maximum multibyte character length in the specified encoding.
564  *
565  * Note encoding is specified numerically, not by name as above.
566  */
567 Datum
pg_encoding_max_length_sql(PG_FUNCTION_ARGS)568 pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
569 {
570 	int			encoding = PG_GETARG_INT32(0);
571 
572 	if (PG_VALID_ENCODING(encoding))
573 		PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
574 	else
575 		PG_RETURN_NULL();
576 }
577 
578 /*
579  * Convert client encoding to server encoding.
580  *
581  * See the notes about string conversion functions at the top of this file.
582  */
583 char *
pg_client_to_server(const char * s,int len)584 pg_client_to_server(const char *s, int len)
585 {
586 	return pg_any_to_server(s, len, ClientEncoding->encoding);
587 }
588 
589 /*
590  * Convert any encoding to server encoding.
591  *
592  * See the notes about string conversion functions at the top of this file.
593  *
594  * Unlike the other string conversion functions, this will apply validation
595  * even if encoding == DatabaseEncoding->encoding.  This is because this is
596  * used to process data coming in from outside the database, and we never
597  * want to just assume validity.
598  */
599 char *
pg_any_to_server(const char * s,int len,int encoding)600 pg_any_to_server(const char *s, int len, int encoding)
601 {
602 	if (len <= 0)
603 		return (char *) s;		/* empty string is always valid */
604 
605 	if (encoding == DatabaseEncoding->encoding ||
606 		encoding == PG_SQL_ASCII)
607 	{
608 		/*
609 		 * No conversion is needed, but we must still validate the data.
610 		 */
611 		(void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
612 		return (char *) s;
613 	}
614 
615 	if (DatabaseEncoding->encoding == PG_SQL_ASCII)
616 	{
617 		/*
618 		 * No conversion is possible, but we must still validate the data,
619 		 * because the client-side code might have done string escaping using
620 		 * the selected client_encoding.  If the client encoding is ASCII-safe
621 		 * then we just do a straight validation under that encoding.  For an
622 		 * ASCII-unsafe encoding we have a problem: we dare not pass such data
623 		 * to the parser but we have no way to convert it.  We compromise by
624 		 * rejecting the data if it contains any non-ASCII characters.
625 		 */
626 		if (PG_VALID_BE_ENCODING(encoding))
627 			(void) pg_verify_mbstr(encoding, s, len, false);
628 		else
629 		{
630 			int			i;
631 
632 			for (i = 0; i < len; i++)
633 			{
634 				if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
635 					ereport(ERROR,
636 							(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
637 							 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
638 									pg_enc2name_tbl[PG_SQL_ASCII].name,
639 									(unsigned char) s[i])));
640 			}
641 		}
642 		return (char *) s;
643 	}
644 
645 	/* Fast path if we can use cached conversion function */
646 	if (encoding == ClientEncoding->encoding)
647 		return perform_default_encoding_conversion(s, len, true);
648 
649 	/* General case ... will not work outside transactions */
650 	return (char *) pg_do_encoding_conversion((unsigned char *) s,
651 											  len,
652 											  encoding,
653 											  DatabaseEncoding->encoding);
654 }
655 
656 /*
657  * Convert server encoding to client encoding.
658  *
659  * See the notes about string conversion functions at the top of this file.
660  */
661 char *
pg_server_to_client(const char * s,int len)662 pg_server_to_client(const char *s, int len)
663 {
664 	return pg_server_to_any(s, len, ClientEncoding->encoding);
665 }
666 
667 /*
668  * Convert server encoding to any encoding.
669  *
670  * See the notes about string conversion functions at the top of this file.
671  */
672 char *
pg_server_to_any(const char * s,int len,int encoding)673 pg_server_to_any(const char *s, int len, int encoding)
674 {
675 	if (len <= 0)
676 		return (char *) s;		/* empty string is always valid */
677 
678 	if (encoding == DatabaseEncoding->encoding ||
679 		encoding == PG_SQL_ASCII)
680 		return (char *) s;		/* assume data is valid */
681 
682 	if (DatabaseEncoding->encoding == PG_SQL_ASCII)
683 	{
684 		/* No conversion is possible, but we must validate the result */
685 		(void) pg_verify_mbstr(encoding, s, len, false);
686 		return (char *) s;
687 	}
688 
689 	/* Fast path if we can use cached conversion function */
690 	if (encoding == ClientEncoding->encoding)
691 		return perform_default_encoding_conversion(s, len, false);
692 
693 	/* General case ... will not work outside transactions */
694 	return (char *) pg_do_encoding_conversion((unsigned char *) s,
695 											  len,
696 											  DatabaseEncoding->encoding,
697 											  encoding);
698 }
699 
700 /*
701  *	Perform default encoding conversion using cached FmgrInfo. Since
702  *	this function does not access database at all, it is safe to call
703  *	outside transactions.  If the conversion has not been set up by
704  *	SetClientEncoding(), no conversion is performed.
705  */
706 static char *
perform_default_encoding_conversion(const char * src,int len,bool is_client_to_server)707 perform_default_encoding_conversion(const char *src, int len,
708 									bool is_client_to_server)
709 {
710 	char	   *result;
711 	int			src_encoding,
712 				dest_encoding;
713 	FmgrInfo   *flinfo;
714 
715 	if (is_client_to_server)
716 	{
717 		src_encoding = ClientEncoding->encoding;
718 		dest_encoding = DatabaseEncoding->encoding;
719 		flinfo = ToServerConvProc;
720 	}
721 	else
722 	{
723 		src_encoding = DatabaseEncoding->encoding;
724 		dest_encoding = ClientEncoding->encoding;
725 		flinfo = ToClientConvProc;
726 	}
727 
728 	if (flinfo == NULL)
729 		return (char *) src;
730 
731 	/*
732 	 * Allocate space for conversion result, being wary of integer overflow.
733 	 * See comments in pg_do_encoding_conversion.
734 	 */
735 	if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
736 		ereport(ERROR,
737 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
738 				 errmsg("out of memory"),
739 				 errdetail("String of %d bytes is too long for encoding conversion.",
740 						   len)));
741 
742 	result = (char *)
743 		MemoryContextAllocHuge(CurrentMemoryContext,
744 							   (Size) len * MAX_CONVERSION_GROWTH + 1);
745 
746 	FunctionCall5(flinfo,
747 				  Int32GetDatum(src_encoding),
748 				  Int32GetDatum(dest_encoding),
749 				  CStringGetDatum(src),
750 				  CStringGetDatum(result),
751 				  Int32GetDatum(len));
752 
753 	/*
754 	 * Release extra space if there might be a lot --- see comments in
755 	 * pg_do_encoding_conversion.
756 	 */
757 	if (len > 1000000)
758 	{
759 		Size		resultlen = strlen(result);
760 
761 		if (resultlen >= MaxAllocSize)
762 			ereport(ERROR,
763 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
764 					 errmsg("out of memory"),
765 					 errdetail("String of %d bytes is too long for encoding conversion.",
766 							   len)));
767 
768 		result = (char *) repalloc(result, resultlen + 1);
769 	}
770 
771 	return result;
772 }
773 
774 
775 /* convert a multibyte string to a wchar */
776 int
pg_mb2wchar(const char * from,pg_wchar * to)777 pg_mb2wchar(const char *from, pg_wchar *to)
778 {
779 	return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
780 }
781 
782 /* convert a multibyte string to a wchar with a limited length */
783 int
pg_mb2wchar_with_len(const char * from,pg_wchar * to,int len)784 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
785 {
786 	return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
787 }
788 
789 /* same, with any encoding */
790 int
pg_encoding_mb2wchar_with_len(int encoding,const char * from,pg_wchar * to,int len)791 pg_encoding_mb2wchar_with_len(int encoding,
792 							  const char *from, pg_wchar *to, int len)
793 {
794 	return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
795 }
796 
797 /* convert a wchar string to a multibyte */
798 int
pg_wchar2mb(const pg_wchar * from,char * to)799 pg_wchar2mb(const pg_wchar *from, char *to)
800 {
801 	return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, pg_wchar_strlen(from));
802 }
803 
804 /* convert a wchar string to a multibyte with a limited length */
805 int
pg_wchar2mb_with_len(const pg_wchar * from,char * to,int len)806 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
807 {
808 	return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
809 }
810 
811 /* same, with any encoding */
812 int
pg_encoding_wchar2mb_with_len(int encoding,const pg_wchar * from,char * to,int len)813 pg_encoding_wchar2mb_with_len(int encoding,
814 							  const pg_wchar *from, char *to, int len)
815 {
816 	return (*pg_wchar_table[encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
817 }
818 
819 /* returns the byte length of a multibyte character */
820 int
pg_mblen(const char * mbstr)821 pg_mblen(const char *mbstr)
822 {
823 	return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
824 }
825 
826 /* returns the display length of a multibyte character */
827 int
pg_dsplen(const char * mbstr)828 pg_dsplen(const char *mbstr)
829 {
830 	return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
831 }
832 
/*
 * Number of characters (wchars) in a null-terminated multibyte string.
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			nchars;

	/* In a single-byte encoding, characters and bytes coincide */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* Otherwise step over one character at a time */
	for (nchars = 0; *mbstr != '\0'; nchars++)
		mbstr += pg_mblen(mbstr);

	return nchars;
}
850 
/*
 * Number of characters (wchars) in a multibyte string of at most limit
 * bytes; the string need not be null-terminated.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	const char *cur = mbstr;
	int			remaining = limit;
	int			nchars = 0;

	/* In a single-byte encoding the byte limit is returned as the count */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	/* Stop at a null byte or once the byte budget is used up */
	while (remaining > 0 && *cur != '\0')
	{
		int			width = pg_mblen(cur);

		cur += width;
		remaining -= width;
		nchars++;
	}

	return nchars;
}
873 
874 /*
875  * returns the byte length of a multibyte string
876  * (not necessarily NULL terminated)
877  * that is no longer than limit.
878  * this function does not break multibyte character boundary.
879  */
880 int
pg_mbcliplen(const char * mbstr,int len,int limit)881 pg_mbcliplen(const char *mbstr, int len, int limit)
882 {
883 	return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
884 								 len, limit);
885 }
886 
887 /*
888  * pg_mbcliplen with specified encoding
889  */
890 int
pg_encoding_mbcliplen(int encoding,const char * mbstr,int len,int limit)891 pg_encoding_mbcliplen(int encoding, const char *mbstr,
892 					  int len, int limit)
893 {
894 	mblen_converter mblen_fn;
895 	int			clen = 0;
896 	int			l;
897 
898 	/* optimization for single byte encoding */
899 	if (pg_encoding_max_length(encoding) == 1)
900 		return cliplen(mbstr, len, limit);
901 
902 	mblen_fn = pg_wchar_table[encoding].mblen;
903 
904 	while (len > 0 && *mbstr)
905 	{
906 		l = (*mblen_fn) ((const unsigned char *) mbstr);
907 		if ((clen + l) > limit)
908 			break;
909 		clen += l;
910 		if (clen == limit)
911 			break;
912 		len -= l;
913 		mbstr += l;
914 	}
915 	return clen;
916 }
917 
/*
 * Like pg_mbcliplen, except that limit counts characters rather than bytes.
 * The result is still a byte length.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	const char *cur = mbstr;
	int			remaining = len;
	int			nbytes = 0;
	int			nchars;

	/* In a single-byte encoding, characters and bytes coincide */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	for (nchars = 0; remaining > 0 && *cur != '\0'; nchars++)
	{
		int			width = pg_mblen(cur);

		/* Stop once limit characters have been accepted */
		if (nchars >= limit)
			break;

		nbytes += width;
		remaining -= width;
		cur += width;
	}

	return nbytes;
}
945 
/*
 * mbcliplen for any single-byte encoding: count the bytes of str up to the
 * first null byte, but no more than the smaller of len and limit.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			bound = (len < limit) ? len : limit;
	int			n;

	for (n = 0; n < bound && str[n] != '\0'; n++)
		;

	return n;
}
957 
958 void
SetDatabaseEncoding(int encoding)959 SetDatabaseEncoding(int encoding)
960 {
961 	if (!PG_VALID_BE_ENCODING(encoding))
962 		elog(ERROR, "invalid database encoding: %d", encoding);
963 
964 	DatabaseEncoding = &pg_enc2name_tbl[encoding];
965 	Assert(DatabaseEncoding->encoding == encoding);
966 }
967 
968 void
SetMessageEncoding(int encoding)969 SetMessageEncoding(int encoding)
970 {
971 	/* Some calls happen before we can elog()! */
972 	Assert(PG_VALID_ENCODING(encoding));
973 
974 	MessageEncoding = &pg_enc2name_tbl[encoding];
975 	Assert(MessageEncoding->encoding == encoding);
976 }
977 
978 #ifdef ENABLE_NLS
979 /*
980  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
981  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
982  * fail for gettext-internal causes like out-of-memory.
983  */
984 static bool
raw_pg_bind_textdomain_codeset(const char * domainname,int encoding)985 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
986 {
987 	bool		elog_ok = (CurrentMemoryContext != NULL);
988 	int			i;
989 
990 	for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
991 	{
992 		if (pg_enc2gettext_tbl[i].encoding == encoding)
993 		{
994 			if (bind_textdomain_codeset(domainname,
995 										pg_enc2gettext_tbl[i].name) != NULL)
996 				return true;
997 
998 			if (elog_ok)
999 				elog(LOG, "bind_textdomain_codeset failed");
1000 			else
1001 				write_stderr("bind_textdomain_codeset failed");
1002 
1003 			break;
1004 		}
1005 	}
1006 
1007 	return false;
1008 }
1009 
1010 /*
1011  * Bind a gettext message domain to the codeset corresponding to the database
1012  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1013  * Return the MessageEncoding implied by the new settings.
1014  *
1015  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1016  * When that matches the database encoding, we don't need to do anything.  In
1017  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1018  * database encoding, except for the C locale.  (On Windows, we also permit a
1019  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
1020  * gettext to the right codeset.
1021  *
1022  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
1023  * convenient departure for software that passes the strings to Windows ANSI
1024  * APIs, but we don't do that.  Compel gettext to use database encoding or,
1025  * failing that, the LC_CTYPE encoding as it would on other platforms.
1026  *
1027  * This function is called before elog() and palloc() are usable.
1028  */
1029 int
pg_bind_textdomain_codeset(const char * domainname)1030 pg_bind_textdomain_codeset(const char *domainname)
1031 {
1032 	bool		elog_ok = (CurrentMemoryContext != NULL);
1033 	int			encoding = GetDatabaseEncoding();
1034 	int			new_msgenc;
1035 
1036 #ifndef WIN32
1037 	const char *ctype = setlocale(LC_CTYPE, NULL);
1038 
1039 	if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1040 #endif
1041 		if (encoding != PG_SQL_ASCII &&
1042 			raw_pg_bind_textdomain_codeset(domainname, encoding))
1043 			return encoding;
1044 
1045 	new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
1046 	if (new_msgenc < 0)
1047 		new_msgenc = PG_SQL_ASCII;
1048 
1049 #ifdef WIN32
1050 	if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1051 		/* On failure, the old message encoding remains valid. */
1052 		return GetMessageEncoding();
1053 #endif
1054 
1055 	return new_msgenc;
1056 }
1057 #endif
1058 
1059 /*
1060  * The database encoding, also called the server encoding, represents the
1061  * encoding of data stored in text-like data types.  Affected types include
1062  * cstring, text, varchar, name, xml, and json.
1063  */
1064 int
GetDatabaseEncoding(void)1065 GetDatabaseEncoding(void)
1066 {
1067 	return DatabaseEncoding->encoding;
1068 }
1069 
1070 const char *
GetDatabaseEncodingName(void)1071 GetDatabaseEncodingName(void)
1072 {
1073 	return DatabaseEncoding->name;
1074 }
1075 
1076 Datum
getdatabaseencoding(PG_FUNCTION_ARGS)1077 getdatabaseencoding(PG_FUNCTION_ARGS)
1078 {
1079 	return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1080 }
1081 
1082 Datum
pg_client_encoding(PG_FUNCTION_ARGS)1083 pg_client_encoding(PG_FUNCTION_ARGS)
1084 {
1085 	return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1086 }
1087 
1088 /*
1089  * gettext() returns messages in this encoding.  This often matches the
1090  * database encoding, but it differs for SQL_ASCII databases, for processes
1091  * not attached to a database, and under a database encoding lacking iconv
1092  * support (MULE_INTERNAL).
1093  */
1094 int
GetMessageEncoding(void)1095 GetMessageEncoding(void)
1096 {
1097 	return MessageEncoding->encoding;
1098 }
1099 
1100 #ifdef WIN32
1101 /*
1102  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1103  * string. The character length is also passed to utf16len if not
1104  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1105  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1106  */
1107 WCHAR *
pgwin32_message_to_UTF16(const char * str,int len,int * utf16len)1108 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1109 {
1110 	int			msgenc = GetMessageEncoding();
1111 	WCHAR	   *utf16;
1112 	int			dstlen;
1113 	UINT		codepage;
1114 
1115 	if (msgenc == PG_SQL_ASCII)
1116 		/* No conversion is possible, and SQL_ASCII is never utf16. */
1117 		return NULL;
1118 
1119 	codepage = pg_enc2name_tbl[msgenc].codepage;
1120 
1121 	/*
1122 	 * Use MultiByteToWideChar directly if there is a corresponding codepage,
1123 	 * or double conversion through UTF8 if not.  Double conversion is needed,
1124 	 * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1125 	 */
1126 	if (codepage != 0)
1127 	{
1128 		utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1129 		dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1130 		utf16[dstlen] = (WCHAR) 0;
1131 	}
1132 	else
1133 	{
1134 		char	   *utf8;
1135 
1136 		/*
1137 		 * XXX pg_do_encoding_conversion() requires a transaction.  In the
1138 		 * absence of one, hope for the input to be valid UTF8.
1139 		 */
1140 		if (IsTransactionState())
1141 		{
1142 			utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1143 													  len,
1144 													  msgenc,
1145 													  PG_UTF8);
1146 			if (utf8 != str)
1147 				len = strlen(utf8);
1148 		}
1149 		else
1150 			utf8 = (char *) str;
1151 
1152 		utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1153 		dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1154 		utf16[dstlen] = (WCHAR) 0;
1155 
1156 		if (utf8 != str)
1157 			pfree(utf8);
1158 	}
1159 
1160 	if (dstlen == 0 && len > 0)
1161 	{
1162 		pfree(utf16);
1163 		return NULL;			/* error */
1164 	}
1165 
1166 	if (utf16len)
1167 		*utf16len = dstlen;
1168 	return utf16;
1169 }
1170 
1171 #endif
1172