1 /*-------------------------------------------------------------------------
2  *
3  * mbutils.c
4  *	  This file contains functions for encoding conversion.
5  *
6  * The string-conversion functions in this file share some API quirks.
7  * Note the following:
8  *
9  * The functions return a palloc'd, null-terminated string if conversion
10  * is required.  However, if no conversion is performed, the given source
11  * string pointer is returned as-is.
12  *
13  * Although the presence of a length argument means that callers can pass
14  * non-null-terminated strings, care is required because the same string
15  * will be passed back if no conversion occurs.  Such callers *must* check
16  * whether result == src and handle that case differently.
17  *
18  * If the source and destination encodings are the same, the source string
19  * is returned without any verification; it's assumed to be valid data.
20  * If that might not be the case, the caller is responsible for validating
21  * the string using a separate call to pg_verify_mbstr().  Whenever the
22  * source and destination encodings are different, the functions ensure that
23  * the result is validly encoded according to the destination encoding.
24  *
25  *
26  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  *
30  * IDENTIFICATION
31  *	  src/backend/utils/mb/mbutils.c
32  *
33  *-------------------------------------------------------------------------
34  */
35 #include "postgres.h"
36 
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/builtins.h"
41 #include "utils/memutils.h"
42 #include "utils/syscache.h"
43 
44 /*
45  * We maintain a simple linked list caching the fmgr lookup info for the
46  * currently selected conversion functions, as well as any that have been
47  * selected previously in the current session.  (We remember previous
48  * settings because we must be able to restore a previous setting during
49  * transaction rollback, without doing any fresh catalog accesses.)
50  *
51  * Since we'll never release this data, we just keep it in TopMemoryContext.
52  */
53 typedef struct ConvProcInfo
54 {
55 	int			s_encoding;		/* server and client encoding IDs */
56 	int			c_encoding;
57 	FmgrInfo	to_server_info; /* lookup info for conversion procs */
58 	FmgrInfo	to_client_info;
59 } ConvProcInfo;
60 
61 static List *ConvProcList = NIL;	/* List of ConvProcInfo */
62 
63 /*
64  * These variables point to the currently active conversion functions,
65  * or are NULL when no conversion is needed.
66  */
67 static FmgrInfo *ToServerConvProc = NULL;
68 static FmgrInfo *ToClientConvProc = NULL;
69 
70 /*
71  * This variable stores the conversion function to convert from UTF-8
72  * to the server encoding.  It's NULL if the server encoding *is* UTF-8,
73  * or if we lack a conversion function for this.
74  */
75 static FmgrInfo *Utf8ToServerConvProc = NULL;
76 
77 /*
78  * These variables track the currently-selected encodings.
79  */
80 static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
81 static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
82 static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
83 
84 /*
85  * During backend startup we can't set client encoding because we (a)
86  * can't look up the conversion functions, and (b) may not know the database
87  * encoding yet either.  So SetClientEncoding() just accepts anything and
88  * remembers it for InitializeClientEncoding() to apply later.
89  */
90 static bool backend_startup_complete = false;
91 static int	pending_client_encoding = PG_SQL_ASCII;
92 
93 
94 /* Internal functions */
95 static char *perform_default_encoding_conversion(const char *src,
96 												 int len, bool is_client_to_server);
97 static int	cliplen(const char *str, int len, int limit);
98 
99 
100 /*
101  * Prepare for a future call to SetClientEncoding.  Success should mean
102  * that SetClientEncoding is guaranteed to succeed for this encoding request.
103  *
104  * (But note that success before backend_startup_complete does not guarantee
105  * success after ...)
106  *
107  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
108  */
109 int
PrepareClientEncoding(int encoding)110 PrepareClientEncoding(int encoding)
111 {
112 	int			current_server_encoding;
113 	ListCell   *lc;
114 
115 	if (!PG_VALID_FE_ENCODING(encoding))
116 		return -1;
117 
118 	/* Can't do anything during startup, per notes above */
119 	if (!backend_startup_complete)
120 		return 0;
121 
122 	current_server_encoding = GetDatabaseEncoding();
123 
124 	/*
125 	 * Check for cases that require no conversion function.
126 	 */
127 	if (current_server_encoding == encoding ||
128 		current_server_encoding == PG_SQL_ASCII ||
129 		encoding == PG_SQL_ASCII)
130 		return 0;
131 
132 	if (IsTransactionState())
133 	{
134 		/*
135 		 * If we're in a live transaction, it's safe to access the catalogs,
136 		 * so look up the functions.  We repeat the lookup even if the info is
137 		 * already cached, so that we can react to changes in the contents of
138 		 * pg_conversion.
139 		 */
140 		Oid			to_server_proc,
141 					to_client_proc;
142 		ConvProcInfo *convinfo;
143 		MemoryContext oldcontext;
144 
145 		to_server_proc = FindDefaultConversionProc(encoding,
146 												   current_server_encoding);
147 		if (!OidIsValid(to_server_proc))
148 			return -1;
149 		to_client_proc = FindDefaultConversionProc(current_server_encoding,
150 												   encoding);
151 		if (!OidIsValid(to_client_proc))
152 			return -1;
153 
154 		/*
155 		 * Load the fmgr info into TopMemoryContext (could still fail here)
156 		 */
157 		convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
158 													   sizeof(ConvProcInfo));
159 		convinfo->s_encoding = current_server_encoding;
160 		convinfo->c_encoding = encoding;
161 		fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
162 					  TopMemoryContext);
163 		fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
164 					  TopMemoryContext);
165 
166 		/* Attach new info to head of list */
167 		oldcontext = MemoryContextSwitchTo(TopMemoryContext);
168 		ConvProcList = lcons(convinfo, ConvProcList);
169 		MemoryContextSwitchTo(oldcontext);
170 
171 		/*
172 		 * We cannot yet remove any older entry for the same encoding pair,
173 		 * since it could still be in use.  SetClientEncoding will clean up.
174 		 */
175 
176 		return 0;				/* success */
177 	}
178 	else
179 	{
180 		/*
181 		 * If we're not in a live transaction, the only thing we can do is
182 		 * restore a previous setting using the cache.  This covers all
183 		 * transaction-rollback cases.  The only case it might not work for is
184 		 * trying to change client_encoding on the fly by editing
185 		 * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
186 		 * thing to do anyway.
187 		 */
188 		foreach(lc, ConvProcList)
189 		{
190 			ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
191 
192 			if (oldinfo->s_encoding == current_server_encoding &&
193 				oldinfo->c_encoding == encoding)
194 				return 0;
195 		}
196 
197 		return -1;				/* it's not cached, so fail */
198 	}
199 }
200 
201 /*
202  * Set the active client encoding and set up the conversion-function pointers.
203  * PrepareClientEncoding should have been called previously for this encoding.
204  *
205  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
206  */
207 int
SetClientEncoding(int encoding)208 SetClientEncoding(int encoding)
209 {
210 	int			current_server_encoding;
211 	bool		found;
212 	ListCell   *lc;
213 
214 	if (!PG_VALID_FE_ENCODING(encoding))
215 		return -1;
216 
217 	/* Can't do anything during startup, per notes above */
218 	if (!backend_startup_complete)
219 	{
220 		pending_client_encoding = encoding;
221 		return 0;
222 	}
223 
224 	current_server_encoding = GetDatabaseEncoding();
225 
226 	/*
227 	 * Check for cases that require no conversion function.
228 	 */
229 	if (current_server_encoding == encoding ||
230 		current_server_encoding == PG_SQL_ASCII ||
231 		encoding == PG_SQL_ASCII)
232 	{
233 		ClientEncoding = &pg_enc2name_tbl[encoding];
234 		ToServerConvProc = NULL;
235 		ToClientConvProc = NULL;
236 		return 0;
237 	}
238 
239 	/*
240 	 * Search the cache for the entry previously prepared by
241 	 * PrepareClientEncoding; if there isn't one, we lose.  While at it,
242 	 * release any duplicate entries so that repeated Prepare/Set cycles don't
243 	 * leak memory.
244 	 */
245 	found = false;
246 	foreach(lc, ConvProcList)
247 	{
248 		ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
249 
250 		if (convinfo->s_encoding == current_server_encoding &&
251 			convinfo->c_encoding == encoding)
252 		{
253 			if (!found)
254 			{
255 				/* Found newest entry, so set up */
256 				ClientEncoding = &pg_enc2name_tbl[encoding];
257 				ToServerConvProc = &convinfo->to_server_info;
258 				ToClientConvProc = &convinfo->to_client_info;
259 				found = true;
260 			}
261 			else
262 			{
263 				/* Duplicate entry, release it */
264 				ConvProcList = foreach_delete_current(ConvProcList, lc);
265 				pfree(convinfo);
266 			}
267 		}
268 	}
269 
270 	if (found)
271 		return 0;				/* success */
272 	else
273 		return -1;				/* it's not cached, so fail */
274 }
275 
276 /*
277  * Initialize client encoding conversions.
278  *		Called from InitPostgres() once during backend startup.
279  */
280 void
InitializeClientEncoding(void)281 InitializeClientEncoding(void)
282 {
283 	int			current_server_encoding;
284 
285 	Assert(!backend_startup_complete);
286 	backend_startup_complete = true;
287 
288 	if (PrepareClientEncoding(pending_client_encoding) < 0 ||
289 		SetClientEncoding(pending_client_encoding) < 0)
290 	{
291 		/*
292 		 * Oops, the requested conversion is not available. We couldn't fail
293 		 * before, but we can now.
294 		 */
295 		ereport(FATAL,
296 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
297 				 errmsg("conversion between %s and %s is not supported",
298 						pg_enc2name_tbl[pending_client_encoding].name,
299 						GetDatabaseEncodingName())));
300 	}
301 
302 	/*
303 	 * Also look up the UTF8-to-server conversion function if needed.  Since
304 	 * the server encoding is fixed within any one backend process, we don't
305 	 * have to do this more than once.
306 	 */
307 	current_server_encoding = GetDatabaseEncoding();
308 	if (current_server_encoding != PG_UTF8 &&
309 		current_server_encoding != PG_SQL_ASCII)
310 	{
311 		Oid			utf8_to_server_proc;
312 
313 		Assert(IsTransactionState());
314 		utf8_to_server_proc =
315 			FindDefaultConversionProc(PG_UTF8,
316 									  current_server_encoding);
317 		/* If there's no such conversion, just leave the pointer as NULL */
318 		if (OidIsValid(utf8_to_server_proc))
319 		{
320 			FmgrInfo   *finfo;
321 
322 			finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
323 													sizeof(FmgrInfo));
324 			fmgr_info_cxt(utf8_to_server_proc, finfo,
325 						  TopMemoryContext);
326 			/* Set Utf8ToServerConvProc only after data is fully valid */
327 			Utf8ToServerConvProc = finfo;
328 		}
329 	}
330 }
331 
332 /*
333  * returns the current client encoding
334  */
335 int
pg_get_client_encoding(void)336 pg_get_client_encoding(void)
337 {
338 	return ClientEncoding->encoding;
339 }
340 
341 /*
342  * returns the current client encoding name
343  */
344 const char *
pg_get_client_encoding_name(void)345 pg_get_client_encoding_name(void)
346 {
347 	return ClientEncoding->name;
348 }
349 
350 /*
351  * Convert src string to another encoding (general case).
352  *
353  * See the notes about string conversion functions at the top of this file.
354  */
355 unsigned char *
pg_do_encoding_conversion(unsigned char * src,int len,int src_encoding,int dest_encoding)356 pg_do_encoding_conversion(unsigned char *src, int len,
357 						  int src_encoding, int dest_encoding)
358 {
359 	unsigned char *result;
360 	Oid			proc;
361 
362 	if (len <= 0)
363 		return src;				/* empty string is always valid */
364 
365 	if (src_encoding == dest_encoding)
366 		return src;				/* no conversion required, assume valid */
367 
368 	if (dest_encoding == PG_SQL_ASCII)
369 		return src;				/* any string is valid in SQL_ASCII */
370 
371 	if (src_encoding == PG_SQL_ASCII)
372 	{
373 		/* No conversion is possible, but we must validate the result */
374 		(void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
375 		return src;
376 	}
377 
378 	if (!IsTransactionState())	/* shouldn't happen */
379 		elog(ERROR, "cannot perform encoding conversion outside a transaction");
380 
381 	proc = FindDefaultConversionProc(src_encoding, dest_encoding);
382 	if (!OidIsValid(proc))
383 		ereport(ERROR,
384 				(errcode(ERRCODE_UNDEFINED_FUNCTION),
385 				 errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
386 						pg_encoding_to_char(src_encoding),
387 						pg_encoding_to_char(dest_encoding))));
388 
389 	/*
390 	 * Allocate space for conversion result, being wary of integer overflow.
391 	 *
392 	 * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
393 	 * required space, so it might exceed MaxAllocSize even though the result
394 	 * would actually fit.  We do not want to hand back a result string that
395 	 * exceeds MaxAllocSize, because callers might not cope gracefully --- but
396 	 * if we just allocate more than that, and don't use it, that's fine.
397 	 */
398 	if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
399 		ereport(ERROR,
400 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
401 				 errmsg("out of memory"),
402 				 errdetail("String of %d bytes is too long for encoding conversion.",
403 						   len)));
404 
405 	result = (unsigned char *)
406 		MemoryContextAllocHuge(CurrentMemoryContext,
407 							   (Size) len * MAX_CONVERSION_GROWTH + 1);
408 
409 	(void) OidFunctionCall6(proc,
410 							Int32GetDatum(src_encoding),
411 							Int32GetDatum(dest_encoding),
412 							CStringGetDatum(src),
413 							CStringGetDatum(result),
414 							Int32GetDatum(len),
415 							BoolGetDatum(false));
416 
417 	/*
418 	 * If the result is large, it's worth repalloc'ing to release any extra
419 	 * space we asked for.  The cutoff here is somewhat arbitrary, but we
420 	 * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
421 	 */
422 	if (len > 1000000)
423 	{
424 		Size		resultlen = strlen((char *) result);
425 
426 		if (resultlen >= MaxAllocSize)
427 			ereport(ERROR,
428 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
429 					 errmsg("out of memory"),
430 					 errdetail("String of %d bytes is too long for encoding conversion.",
431 							   len)));
432 
433 		result = (unsigned char *) repalloc(result, resultlen + 1);
434 	}
435 
436 	return result;
437 }
438 
439 /*
440  * Convert src string to another encoding.
441  *
442  * This function has a different API than the other conversion functions.
443  * The caller should've looked up the conversion function using
444  * FindDefaultConversionProc().  Unlike the other functions, the converted
445  * result is not palloc'd.  It is written to the caller-supplied buffer
446  * instead.
447  *
448  * src_encoding   - encoding to convert from
449  * dest_encoding  - encoding to convert to
450  * src, srclen    - input buffer and its length in bytes
451  * dest, destlen  - destination buffer and its size in bytes
452  *
453  * The output is null-terminated.
454  *
455  * If destlen < srclen * MAX_CONVERSION_LENGTH + 1, the converted output
456  * wouldn't necessarily fit in the output buffer, and the function will not
457  * convert the whole input.
458  *
459  * TODO: The conversion function interface is not great.  Firstly, it
460  * would be nice to pass through the destination buffer size to the
461  * conversion function, so that if you pass a shorter destination buffer, it
462  * could still continue to fill up the whole buffer.  Currently, we have to
463  * assume worst case expansion and stop the conversion short, even if there
464  * is in fact space left in the destination buffer.  Secondly, it would be
465  * nice to return the number of bytes written to the caller, to avoid a call
466  * to strlen().
467  */
468 int
pg_do_encoding_conversion_buf(Oid proc,int src_encoding,int dest_encoding,unsigned char * src,int srclen,unsigned char * dest,int destlen,bool noError)469 pg_do_encoding_conversion_buf(Oid proc,
470 							  int src_encoding,
471 							  int dest_encoding,
472 							  unsigned char *src, int srclen,
473 							  unsigned char *dest, int destlen,
474 							  bool noError)
475 {
476 	Datum		result;
477 
478 	/*
479 	 * If the destination buffer is not large enough to hold the result in the
480 	 * worst case, limit the input size passed to the conversion function.
481 	 */
482 	if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
483 		srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
484 
485 	result = OidFunctionCall6(proc,
486 							  Int32GetDatum(src_encoding),
487 							  Int32GetDatum(dest_encoding),
488 							  CStringGetDatum(src),
489 							  CStringGetDatum(dest),
490 							  Int32GetDatum(srclen),
491 							  BoolGetDatum(noError));
492 	return DatumGetInt32(result);
493 }
494 
495 /*
496  * Convert string to encoding encoding_name. The source
497  * encoding is the DB encoding.
498  *
499  * BYTEA convert_to(TEXT string, NAME encoding_name) */
500 Datum
pg_convert_to(PG_FUNCTION_ARGS)501 pg_convert_to(PG_FUNCTION_ARGS)
502 {
503 	Datum		string = PG_GETARG_DATUM(0);
504 	Datum		dest_encoding_name = PG_GETARG_DATUM(1);
505 	Datum		src_encoding_name = DirectFunctionCall1(namein,
506 														CStringGetDatum(DatabaseEncoding->name));
507 	Datum		result;
508 
509 	/*
510 	 * pg_convert expects a bytea as its first argument. We're passing it a
511 	 * text argument here, relying on the fact that they are both in fact
512 	 * varlena types, and thus structurally identical.
513 	 */
514 	result = DirectFunctionCall3(pg_convert, string,
515 								 src_encoding_name, dest_encoding_name);
516 
517 	PG_RETURN_DATUM(result);
518 }
519 
520 /*
521  * Convert string from encoding encoding_name. The destination
522  * encoding is the DB encoding.
523  *
524  * TEXT convert_from(BYTEA string, NAME encoding_name) */
525 Datum
pg_convert_from(PG_FUNCTION_ARGS)526 pg_convert_from(PG_FUNCTION_ARGS)
527 {
528 	Datum		string = PG_GETARG_DATUM(0);
529 	Datum		src_encoding_name = PG_GETARG_DATUM(1);
530 	Datum		dest_encoding_name = DirectFunctionCall1(namein,
531 														 CStringGetDatum(DatabaseEncoding->name));
532 	Datum		result;
533 
534 	result = DirectFunctionCall3(pg_convert, string,
535 								 src_encoding_name, dest_encoding_name);
536 
537 	/*
538 	 * pg_convert returns a bytea, which we in turn return as text, relying on
539 	 * the fact that they are both in fact varlena types, and thus
540 	 * structurally identical. Although not all bytea values are valid text,
541 	 * in this case it will be because we've told pg_convert to return one
542 	 * that is valid as text in the current database encoding.
543 	 */
544 	PG_RETURN_DATUM(result);
545 }
546 
547 /*
548  * Convert string between two arbitrary encodings.
549  *
550  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
551  */
552 Datum
pg_convert(PG_FUNCTION_ARGS)553 pg_convert(PG_FUNCTION_ARGS)
554 {
555 	bytea	   *string = PG_GETARG_BYTEA_PP(0);
556 	char	   *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
557 	int			src_encoding = pg_char_to_encoding(src_encoding_name);
558 	char	   *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
559 	int			dest_encoding = pg_char_to_encoding(dest_encoding_name);
560 	const char *src_str;
561 	char	   *dest_str;
562 	bytea	   *retval;
563 	int			len;
564 
565 	if (src_encoding < 0)
566 		ereport(ERROR,
567 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
568 				 errmsg("invalid source encoding name \"%s\"",
569 						src_encoding_name)));
570 	if (dest_encoding < 0)
571 		ereport(ERROR,
572 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
573 				 errmsg("invalid destination encoding name \"%s\"",
574 						dest_encoding_name)));
575 
576 	/* make sure that source string is valid */
577 	len = VARSIZE_ANY_EXHDR(string);
578 	src_str = VARDATA_ANY(string);
579 	(void) pg_verify_mbstr(src_encoding, src_str, len, false);
580 
581 	/* perform conversion */
582 	dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
583 												  len,
584 												  src_encoding,
585 												  dest_encoding);
586 
587 	/* update len if conversion actually happened */
588 	if (dest_str != src_str)
589 		len = strlen(dest_str);
590 
591 	/*
592 	 * build bytea data type structure.
593 	 */
594 	retval = (bytea *) palloc(len + VARHDRSZ);
595 	SET_VARSIZE(retval, len + VARHDRSZ);
596 	memcpy(VARDATA(retval), dest_str, len);
597 
598 	if (dest_str != src_str)
599 		pfree(dest_str);
600 
601 	/* free memory if allocated by the toaster */
602 	PG_FREE_IF_COPY(string, 0);
603 
604 	PG_RETURN_BYTEA_P(retval);
605 }
606 
607 /*
608  * get the length of the string considered as text in the specified
609  * encoding. Raises an error if the data is not valid in that
610  * encoding.
611  *
612  * INT4 length (BYTEA string, NAME src_encoding_name)
613  */
614 Datum
length_in_encoding(PG_FUNCTION_ARGS)615 length_in_encoding(PG_FUNCTION_ARGS)
616 {
617 	bytea	   *string = PG_GETARG_BYTEA_PP(0);
618 	char	   *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
619 	int			src_encoding = pg_char_to_encoding(src_encoding_name);
620 	const char *src_str;
621 	int			len;
622 	int			retval;
623 
624 	if (src_encoding < 0)
625 		ereport(ERROR,
626 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
627 				 errmsg("invalid encoding name \"%s\"",
628 						src_encoding_name)));
629 
630 	len = VARSIZE_ANY_EXHDR(string);
631 	src_str = VARDATA_ANY(string);
632 
633 	retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
634 
635 	PG_RETURN_INT32(retval);
636 }
637 
638 /*
639  * Get maximum multibyte character length in the specified encoding.
640  *
641  * Note encoding is specified numerically, not by name as above.
642  */
643 Datum
pg_encoding_max_length_sql(PG_FUNCTION_ARGS)644 pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
645 {
646 	int			encoding = PG_GETARG_INT32(0);
647 
648 	if (PG_VALID_ENCODING(encoding))
649 		PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
650 	else
651 		PG_RETURN_NULL();
652 }
653 
654 /*
655  * Convert client encoding to server encoding.
656  *
657  * See the notes about string conversion functions at the top of this file.
658  */
659 char *
pg_client_to_server(const char * s,int len)660 pg_client_to_server(const char *s, int len)
661 {
662 	return pg_any_to_server(s, len, ClientEncoding->encoding);
663 }
664 
665 /*
666  * Convert any encoding to server encoding.
667  *
668  * See the notes about string conversion functions at the top of this file.
669  *
670  * Unlike the other string conversion functions, this will apply validation
671  * even if encoding == DatabaseEncoding->encoding.  This is because this is
672  * used to process data coming in from outside the database, and we never
673  * want to just assume validity.
674  */
675 char *
pg_any_to_server(const char * s,int len,int encoding)676 pg_any_to_server(const char *s, int len, int encoding)
677 {
678 	if (len <= 0)
679 		return unconstify(char *, s);	/* empty string is always valid */
680 
681 	if (encoding == DatabaseEncoding->encoding ||
682 		encoding == PG_SQL_ASCII)
683 	{
684 		/*
685 		 * No conversion is needed, but we must still validate the data.
686 		 */
687 		(void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
688 		return unconstify(char *, s);
689 	}
690 
691 	if (DatabaseEncoding->encoding == PG_SQL_ASCII)
692 	{
693 		/*
694 		 * No conversion is possible, but we must still validate the data,
695 		 * because the client-side code might have done string escaping using
696 		 * the selected client_encoding.  If the client encoding is ASCII-safe
697 		 * then we just do a straight validation under that encoding.  For an
698 		 * ASCII-unsafe encoding we have a problem: we dare not pass such data
699 		 * to the parser but we have no way to convert it.  We compromise by
700 		 * rejecting the data if it contains any non-ASCII characters.
701 		 */
702 		if (PG_VALID_BE_ENCODING(encoding))
703 			(void) pg_verify_mbstr(encoding, s, len, false);
704 		else
705 		{
706 			int			i;
707 
708 			for (i = 0; i < len; i++)
709 			{
710 				if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
711 					ereport(ERROR,
712 							(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
713 							 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
714 									pg_enc2name_tbl[PG_SQL_ASCII].name,
715 									(unsigned char) s[i])));
716 			}
717 		}
718 		return unconstify(char *, s);
719 	}
720 
721 	/* Fast path if we can use cached conversion function */
722 	if (encoding == ClientEncoding->encoding)
723 		return perform_default_encoding_conversion(s, len, true);
724 
725 	/* General case ... will not work outside transactions */
726 	return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
727 											  len,
728 											  encoding,
729 											  DatabaseEncoding->encoding);
730 }
731 
732 /*
733  * Convert server encoding to client encoding.
734  *
735  * See the notes about string conversion functions at the top of this file.
736  */
737 char *
pg_server_to_client(const char * s,int len)738 pg_server_to_client(const char *s, int len)
739 {
740 	return pg_server_to_any(s, len, ClientEncoding->encoding);
741 }
742 
743 /*
744  * Convert server encoding to any encoding.
745  *
746  * See the notes about string conversion functions at the top of this file.
747  */
748 char *
pg_server_to_any(const char * s,int len,int encoding)749 pg_server_to_any(const char *s, int len, int encoding)
750 {
751 	if (len <= 0)
752 		return unconstify(char *, s);	/* empty string is always valid */
753 
754 	if (encoding == DatabaseEncoding->encoding ||
755 		encoding == PG_SQL_ASCII)
756 		return unconstify(char *, s);	/* assume data is valid */
757 
758 	if (DatabaseEncoding->encoding == PG_SQL_ASCII)
759 	{
760 		/* No conversion is possible, but we must validate the result */
761 		(void) pg_verify_mbstr(encoding, s, len, false);
762 		return unconstify(char *, s);
763 	}
764 
765 	/* Fast path if we can use cached conversion function */
766 	if (encoding == ClientEncoding->encoding)
767 		return perform_default_encoding_conversion(s, len, false);
768 
769 	/* General case ... will not work outside transactions */
770 	return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
771 											  len,
772 											  DatabaseEncoding->encoding,
773 											  encoding);
774 }
775 
776 /*
777  *	Perform default encoding conversion using cached FmgrInfo. Since
778  *	this function does not access database at all, it is safe to call
779  *	outside transactions.  If the conversion has not been set up by
780  *	SetClientEncoding(), no conversion is performed.
781  */
782 static char *
perform_default_encoding_conversion(const char * src,int len,bool is_client_to_server)783 perform_default_encoding_conversion(const char *src, int len,
784 									bool is_client_to_server)
785 {
786 	char	   *result;
787 	int			src_encoding,
788 				dest_encoding;
789 	FmgrInfo   *flinfo;
790 
791 	if (is_client_to_server)
792 	{
793 		src_encoding = ClientEncoding->encoding;
794 		dest_encoding = DatabaseEncoding->encoding;
795 		flinfo = ToServerConvProc;
796 	}
797 	else
798 	{
799 		src_encoding = DatabaseEncoding->encoding;
800 		dest_encoding = ClientEncoding->encoding;
801 		flinfo = ToClientConvProc;
802 	}
803 
804 	if (flinfo == NULL)
805 		return unconstify(char *, src);
806 
807 	/*
808 	 * Allocate space for conversion result, being wary of integer overflow.
809 	 * See comments in pg_do_encoding_conversion.
810 	 */
811 	if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
812 		ereport(ERROR,
813 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
814 				 errmsg("out of memory"),
815 				 errdetail("String of %d bytes is too long for encoding conversion.",
816 						   len)));
817 
818 	result = (char *)
819 		MemoryContextAllocHuge(CurrentMemoryContext,
820 							   (Size) len * MAX_CONVERSION_GROWTH + 1);
821 
822 	FunctionCall6(flinfo,
823 				  Int32GetDatum(src_encoding),
824 				  Int32GetDatum(dest_encoding),
825 				  CStringGetDatum(src),
826 				  CStringGetDatum(result),
827 				  Int32GetDatum(len),
828 				  BoolGetDatum(false));
829 
830 	/*
831 	 * Release extra space if there might be a lot --- see comments in
832 	 * pg_do_encoding_conversion.
833 	 */
834 	if (len > 1000000)
835 	{
836 		Size		resultlen = strlen(result);
837 
838 		if (resultlen >= MaxAllocSize)
839 			ereport(ERROR,
840 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
841 					 errmsg("out of memory"),
842 					 errdetail("String of %d bytes is too long for encoding conversion.",
843 							   len)));
844 
845 		result = (char *) repalloc(result, resultlen + 1);
846 	}
847 
848 	return result;
849 }
850 
851 /*
852  * Convert a single Unicode code point into a string in the server encoding.
853  *
854  * The code point given by "c" is converted and stored at *s, which must
855  * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
856  * The output will have a trailing '\0'.  Throws error if the conversion
857  * cannot be performed.
858  *
859  * Note that this relies on having previously looked up any required
860  * conversion function.  That's partly for speed but mostly because the parser
861  * may call this outside any transaction, or in an aborted transaction.
862  */
863 void
pg_unicode_to_server(pg_wchar c,unsigned char * s)864 pg_unicode_to_server(pg_wchar c, unsigned char *s)
865 {
866 	unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
867 	int			c_as_utf8_len;
868 	int			server_encoding;
869 
870 	/*
871 	 * Complain if invalid Unicode code point.  The choice of errcode here is
872 	 * debatable, but really our caller should have checked this anyway.
873 	 */
874 	if (!is_valid_unicode_codepoint(c))
875 		ereport(ERROR,
876 				(errcode(ERRCODE_SYNTAX_ERROR),
877 				 errmsg("invalid Unicode code point")));
878 
879 	/* Otherwise, if it's in ASCII range, conversion is trivial */
880 	if (c <= 0x7F)
881 	{
882 		s[0] = (unsigned char) c;
883 		s[1] = '\0';
884 		return;
885 	}
886 
887 	/* If the server encoding is UTF-8, we just need to reformat the code */
888 	server_encoding = GetDatabaseEncoding();
889 	if (server_encoding == PG_UTF8)
890 	{
891 		unicode_to_utf8(c, s);
892 		s[pg_utf_mblen(s)] = '\0';
893 		return;
894 	}
895 
896 	/* For all other cases, we must have a conversion function available */
897 	if (Utf8ToServerConvProc == NULL)
898 		ereport(ERROR,
899 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
900 				 errmsg("conversion between %s and %s is not supported",
901 						pg_enc2name_tbl[PG_UTF8].name,
902 						GetDatabaseEncodingName())));
903 
904 	/* Construct UTF-8 source string */
905 	unicode_to_utf8(c, c_as_utf8);
906 	c_as_utf8_len = pg_utf_mblen(c_as_utf8);
907 	c_as_utf8[c_as_utf8_len] = '\0';
908 
909 	/* Convert, or throw error if we can't */
910 	FunctionCall6(Utf8ToServerConvProc,
911 				  Int32GetDatum(PG_UTF8),
912 				  Int32GetDatum(server_encoding),
913 				  CStringGetDatum(c_as_utf8),
914 				  CStringGetDatum(s),
915 				  Int32GetDatum(c_as_utf8_len),
916 				  BoolGetDatum(false));
917 }
918 
919 
920 /* convert a multibyte string to a wchar */
921 int
pg_mb2wchar(const char * from,pg_wchar * to)922 pg_mb2wchar(const char *from, pg_wchar *to)
923 {
924 	return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
925 }
926 
927 /* convert a multibyte string to a wchar with a limited length */
928 int
pg_mb2wchar_with_len(const char * from,pg_wchar * to,int len)929 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
930 {
931 	return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
932 }
933 
934 /* same, with any encoding */
935 int
pg_encoding_mb2wchar_with_len(int encoding,const char * from,pg_wchar * to,int len)936 pg_encoding_mb2wchar_with_len(int encoding,
937 							  const char *from, pg_wchar *to, int len)
938 {
939 	return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
940 }
941 
942 /* convert a wchar string to a multibyte */
943 int
pg_wchar2mb(const pg_wchar * from,char * to)944 pg_wchar2mb(const pg_wchar *from, char *to)
945 {
946 	return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
947 }
948 
949 /* convert a wchar string to a multibyte with a limited length */
950 int
pg_wchar2mb_with_len(const pg_wchar * from,char * to,int len)951 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
952 {
953 	return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
954 }
955 
956 /* same, with any encoding */
957 int
pg_encoding_wchar2mb_with_len(int encoding,const pg_wchar * from,char * to,int len)958 pg_encoding_wchar2mb_with_len(int encoding,
959 							  const pg_wchar *from, char *to, int len)
960 {
961 	return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
962 }
963 
964 /* returns the byte length of a multibyte character */
965 int
pg_mblen(const char * mbstr)966 pg_mblen(const char *mbstr)
967 {
968 	return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
969 }
970 
971 /* returns the display length of a multibyte character */
972 int
pg_dsplen(const char * mbstr)973 pg_dsplen(const char *mbstr)
974 {
975 	return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
976 }
977 
/*
 * pg_mbstrlen: length, counted in characters, of a null-terminated
 * multibyte string in the database encoding
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			nchars;

	/* In a single-byte encoding the byte count is the character count */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* Otherwise step over one character at a time until the terminator */
	for (nchars = 0; *mbstr != '\0'; nchars++)
		mbstr += pg_mblen(mbstr);

	return nchars;
}
995 
/*
 * pg_mbstrlen_with_len: length, counted in characters, of a multibyte
 * string that need not be null-terminated
 *
 * At most "limit" bytes are considered; counting also stops at a zero byte.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			nchars;

	/* In a single-byte encoding, "limit" is returned as the answer as-is */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	/* Test the remaining byte budget before touching the next byte */
	for (nchars = 0; limit > 0 && *mbstr != '\0'; nchars++)
	{
		int			chlen = pg_mblen(mbstr);

		mbstr += chlen;
		limit -= chlen;
	}

	return nchars;
}
1018 
1019 /*
1020  * returns the byte length of a multibyte string
1021  * (not necessarily NULL terminated)
1022  * that is no longer than limit.
1023  * this function does not break multibyte character boundary.
1024  */
1025 int
pg_mbcliplen(const char * mbstr,int len,int limit)1026 pg_mbcliplen(const char *mbstr, int len, int limit)
1027 {
1028 	return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
1029 								 len, limit);
1030 }
1031 
1032 /*
1033  * pg_mbcliplen with specified encoding
1034  */
1035 int
pg_encoding_mbcliplen(int encoding,const char * mbstr,int len,int limit)1036 pg_encoding_mbcliplen(int encoding, const char *mbstr,
1037 					  int len, int limit)
1038 {
1039 	mblen_converter mblen_fn;
1040 	int			clen = 0;
1041 	int			l;
1042 
1043 	/* optimization for single byte encoding */
1044 	if (pg_encoding_max_length(encoding) == 1)
1045 		return cliplen(mbstr, len, limit);
1046 
1047 	mblen_fn = pg_wchar_table[encoding].mblen;
1048 
1049 	while (len > 0 && *mbstr)
1050 	{
1051 		l = (*mblen_fn) ((const unsigned char *) mbstr);
1052 		if ((clen + l) > limit)
1053 			break;
1054 		clen += l;
1055 		if (clen == limit)
1056 			break;
1057 		len -= l;
1058 		mbstr += l;
1059 	}
1060 	return clen;
1061 }
1062 
/*
 * pg_mbcharcliplen: like pg_mbcliplen, except that "limit" is a number of
 * characters rather than a number of bytes.
 *
 * Returns the byte length of the first "limit" characters of mbstr, or
 * fewer if the "len" bytes run out or a zero byte is reached first.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			nbytes = 0;
	int			charno;

	/* With one byte per character the byte-oriented clip is equivalent */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	/* charno is the 1-based ordinal of the character under inspection */
	for (charno = 1; len > 0 && *mbstr != '\0'; charno++)
	{
		int			chlen = pg_mblen(mbstr);

		if (charno > limit)
			break;

		nbytes += chlen;
		len -= chlen;
		mbstr += chlen;
	}

	return nbytes;
}
1090 
/*
 * cliplen: mbcliplen for any single-byte encoding
 *
 * Returns the number of leading non-zero bytes of str, capped at the
 * smaller of "len" and "limit".
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			maxbytes = Min(len, limit);
	int			n;

	for (n = 0; n < maxbytes && str[n] != '\0'; n++)
		 /* just counting */ ;

	return n;
}
1102 
1103 void
SetDatabaseEncoding(int encoding)1104 SetDatabaseEncoding(int encoding)
1105 {
1106 	if (!PG_VALID_BE_ENCODING(encoding))
1107 		elog(ERROR, "invalid database encoding: %d", encoding);
1108 
1109 	DatabaseEncoding = &pg_enc2name_tbl[encoding];
1110 	Assert(DatabaseEncoding->encoding == encoding);
1111 }
1112 
1113 void
SetMessageEncoding(int encoding)1114 SetMessageEncoding(int encoding)
1115 {
1116 	/* Some calls happen before we can elog()! */
1117 	Assert(PG_VALID_ENCODING(encoding));
1118 
1119 	MessageEncoding = &pg_enc2name_tbl[encoding];
1120 	Assert(MessageEncoding->encoding == encoding);
1121 }
1122 
1123 #ifdef ENABLE_NLS
1124 /*
1125  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
1126  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
1127  * fail for gettext-internal causes like out-of-memory.
1128  */
1129 static bool
raw_pg_bind_textdomain_codeset(const char * domainname,int encoding)1130 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
1131 {
1132 	bool		elog_ok = (CurrentMemoryContext != NULL);
1133 	int			i;
1134 
1135 	for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
1136 	{
1137 		if (pg_enc2gettext_tbl[i].encoding == encoding)
1138 		{
1139 			if (bind_textdomain_codeset(domainname,
1140 										pg_enc2gettext_tbl[i].name) != NULL)
1141 				return true;
1142 
1143 			if (elog_ok)
1144 				elog(LOG, "bind_textdomain_codeset failed");
1145 			else
1146 				write_stderr("bind_textdomain_codeset failed");
1147 
1148 			break;
1149 		}
1150 	}
1151 
1152 	return false;
1153 }
1154 
1155 /*
1156  * Bind a gettext message domain to the codeset corresponding to the database
1157  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1158  * Return the MessageEncoding implied by the new settings.
1159  *
1160  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1161  * When that matches the database encoding, we don't need to do anything.  In
1162  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1163  * database encoding, except for the C locale.  (On Windows, we also permit a
1164  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
1165  * gettext to the right codeset.
1166  *
1167  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
1168  * convenient departure for software that passes the strings to Windows ANSI
1169  * APIs, but we don't do that.  Compel gettext to use database encoding or,
1170  * failing that, the LC_CTYPE encoding as it would on other platforms.
1171  *
1172  * This function is called before elog() and palloc() are usable.
1173  */
1174 int
pg_bind_textdomain_codeset(const char * domainname)1175 pg_bind_textdomain_codeset(const char *domainname)
1176 {
1177 	bool		elog_ok = (CurrentMemoryContext != NULL);
1178 	int			encoding = GetDatabaseEncoding();
1179 	int			new_msgenc;
1180 
1181 #ifndef WIN32
1182 	const char *ctype = setlocale(LC_CTYPE, NULL);
1183 
1184 	if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1185 #endif
1186 		if (encoding != PG_SQL_ASCII &&
1187 			raw_pg_bind_textdomain_codeset(domainname, encoding))
1188 			return encoding;
1189 
1190 	new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
1191 	if (new_msgenc < 0)
1192 		new_msgenc = PG_SQL_ASCII;
1193 
1194 #ifdef WIN32
1195 	if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1196 		/* On failure, the old message encoding remains valid. */
1197 		return GetMessageEncoding();
1198 #endif
1199 
1200 	return new_msgenc;
1201 }
1202 #endif
1203 
1204 /*
1205  * The database encoding, also called the server encoding, represents the
1206  * encoding of data stored in text-like data types.  Affected types include
1207  * cstring, text, varchar, name, xml, and json.
1208  */
1209 int
GetDatabaseEncoding(void)1210 GetDatabaseEncoding(void)
1211 {
1212 	return DatabaseEncoding->encoding;
1213 }
1214 
1215 const char *
GetDatabaseEncodingName(void)1216 GetDatabaseEncodingName(void)
1217 {
1218 	return DatabaseEncoding->name;
1219 }
1220 
1221 Datum
getdatabaseencoding(PG_FUNCTION_ARGS)1222 getdatabaseencoding(PG_FUNCTION_ARGS)
1223 {
1224 	return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1225 }
1226 
1227 Datum
pg_client_encoding(PG_FUNCTION_ARGS)1228 pg_client_encoding(PG_FUNCTION_ARGS)
1229 {
1230 	return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1231 }
1232 
1233 Datum
PG_char_to_encoding(PG_FUNCTION_ARGS)1234 PG_char_to_encoding(PG_FUNCTION_ARGS)
1235 {
1236 	Name		s = PG_GETARG_NAME(0);
1237 
1238 	PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
1239 }
1240 
1241 Datum
PG_encoding_to_char(PG_FUNCTION_ARGS)1242 PG_encoding_to_char(PG_FUNCTION_ARGS)
1243 {
1244 	int32		encoding = PG_GETARG_INT32(0);
1245 	const char *encoding_name = pg_encoding_to_char(encoding);
1246 
1247 	return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
1248 }
1249 
1250 /*
1251  * gettext() returns messages in this encoding.  This often matches the
1252  * database encoding, but it differs for SQL_ASCII databases, for processes
1253  * not attached to a database, and under a database encoding lacking iconv
1254  * support (MULE_INTERNAL).
1255  */
1256 int
GetMessageEncoding(void)1257 GetMessageEncoding(void)
1258 {
1259 	return MessageEncoding->encoding;
1260 }
1261 
1262 
1263 /*
1264  * Generic character incrementer function.
1265  *
1266  * Not knowing anything about the properties of the encoding in use, we just
1267  * keep incrementing the last byte until we get a validly-encoded result,
1268  * or we run out of values to try.  We don't bother to try incrementing
1269  * higher-order bytes, so there's no growth in runtime for wider characters.
1270  * (If we did try to do that, we'd need to consider the likelihood that 255
1271  * is not a valid final byte in the encoding.)
1272  */
1273 static bool
pg_generic_charinc(unsigned char * charptr,int len)1274 pg_generic_charinc(unsigned char *charptr, int len)
1275 {
1276 	unsigned char *lastbyte = charptr + len - 1;
1277 	mbchar_verifier mbverify;
1278 
1279 	/* We can just invoke the character verifier directly. */
1280 	mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
1281 
1282 	while (*lastbyte < (unsigned char) 255)
1283 	{
1284 		(*lastbyte)++;
1285 		if ((*mbverify) (charptr, len) == len)
1286 			return true;
1287 	}
1288 
1289 	return false;
1290 }
1291 
/*
 * UTF-8 character incrementer function.
 *
 * A one-byte character is simply incremented, unless it is already 0x7F.
 *
 * In a multibyte character, every byte after the first must lie between
 * 0x80 and 0xBF, and the first byte between 0xC0 and 0xF4.  Working from
 * the last byte towards the first, we increment the first byte found that
 * is below its maximum, and fail if there is none.  The second byte has a
 * tighter maximum after lead bytes 0xED and 0xF4; the former keeps us out
 * of the region used for surrogate pairs, which should not occur in valid
 * UTF-8.
 *
 * Lower-order bytes are not reset to their minimums, because an exhaustive
 * search is unaffordable (see make_greater_string).
 *
 * Lengths outside 1..4 are rejected (5- and 6-byte forms are not handled
 * for now).
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	int			pos;

	if (length < 1 || length > 4)
		return false;

	/* Try the trailing bytes, last one first */
	for (pos = length - 1; pos > 0; pos--)
	{
		unsigned char ceiling = 0xBF;

		/* The byte right after the lead byte may have a lower maximum */
		if (pos == 1)
		{
			if (charptr[0] == 0xED)
				ceiling = 0x9F;
			else if (charptr[0] == 0xF4)
				ceiling = 0x8F;
		}

		if (charptr[pos] < ceiling)
		{
			charptr[pos]++;
			return true;
		}
	}

	/* Only the lead byte is left; give up if it is at the top of its range */
	switch (charptr[0])
	{
		case 0x7F:
		case 0xDF:
		case 0xEF:
		case 0xF4:
			return false;
		default:
			break;
	}

	charptr[0]++;
	return true;
}
1364 
1365 /*
1366  * EUC-JP character incrementer function.
1367  *
1368  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1369  * representing JIS X 0201 characters with the second byte ranging between
1370  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
1371  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1372  *
1373  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1374  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
1375  * is incremented if possible, otherwise the second-to-last byte.
1376  *
1377  * If the sequence starts with a value other than the above and its MSB
1378  * is set, it must be a two-byte sequence representing JIS X 0208 characters
1379  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
1380  * incremented if possible, otherwise the second-to-last byte.
1381  *
1382  * Otherwise, the sequence is a single-byte ASCII character. It is
1383  * incremented up to 0x7f.
1384  */
1385 static bool
pg_eucjp_increment(unsigned char * charptr,int length)1386 pg_eucjp_increment(unsigned char *charptr, int length)
1387 {
1388 	unsigned char c1,
1389 				c2;
1390 	int			i;
1391 
1392 	c1 = *charptr;
1393 
1394 	switch (c1)
1395 	{
1396 		case SS2:				/* JIS X 0201 */
1397 			if (length != 2)
1398 				return false;
1399 
1400 			c2 = charptr[1];
1401 
1402 			if (c2 >= 0xdf)
1403 				charptr[0] = charptr[1] = 0xa1;
1404 			else if (c2 < 0xa1)
1405 				charptr[1] = 0xa1;
1406 			else
1407 				charptr[1]++;
1408 			break;
1409 
1410 		case SS3:				/* JIS X 0212 */
1411 			if (length != 3)
1412 				return false;
1413 
1414 			for (i = 2; i > 0; i--)
1415 			{
1416 				c2 = charptr[i];
1417 				if (c2 < 0xa1)
1418 				{
1419 					charptr[i] = 0xa1;
1420 					return true;
1421 				}
1422 				else if (c2 < 0xfe)
1423 				{
1424 					charptr[i]++;
1425 					return true;
1426 				}
1427 			}
1428 
1429 			/* Out of 3-byte code region */
1430 			return false;
1431 
1432 		default:
1433 			if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1434 			{
1435 				if (length != 2)
1436 					return false;
1437 
1438 				for (i = 1; i >= 0; i--)
1439 				{
1440 					c2 = charptr[i];
1441 					if (c2 < 0xa1)
1442 					{
1443 						charptr[i] = 0xa1;
1444 						return true;
1445 					}
1446 					else if (c2 < 0xfe)
1447 					{
1448 						charptr[i]++;
1449 						return true;
1450 					}
1451 				}
1452 
1453 				/* Out of 2 byte code region */
1454 				return false;
1455 			}
1456 			else
1457 			{					/* ASCII, single byte */
1458 				if (c1 > 0x7e)
1459 					return false;
1460 				(*charptr)++;
1461 			}
1462 			break;
1463 	}
1464 
1465 	return true;
1466 }
1467 
1468 /*
1469  * get the character incrementer for the encoding for the current database
1470  */
1471 mbcharacter_incrementer
pg_database_encoding_character_incrementer(void)1472 pg_database_encoding_character_incrementer(void)
1473 {
1474 	/*
1475 	 * Eventually it might be best to add a field to pg_wchar_table[], but for
1476 	 * now we just use a switch.
1477 	 */
1478 	switch (GetDatabaseEncoding())
1479 	{
1480 		case PG_UTF8:
1481 			return pg_utf8_increment;
1482 
1483 		case PG_EUC_JP:
1484 			return pg_eucjp_increment;
1485 
1486 		default:
1487 			return pg_generic_charinc;
1488 	}
1489 }
1490 
1491 /*
1492  * fetch maximum length of the encoding for the current database
1493  */
1494 int
pg_database_encoding_max_length(void)1495 pg_database_encoding_max_length(void)
1496 {
1497 	return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1498 }
1499 
/*
 * Check that mbstr is validly encoded in the current database encoding.
 * Apart from supplying the encoding, this is identical to
 * pg_verify_mbstr().
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			enc = GetDatabaseEncoding();

	return pg_verify_mbstr(enc, mbstr, len, noError);
}
1509 
1510 /*
1511  * Verify mbstr to make sure that it is validly encoded in the specified
1512  * encoding.
1513  */
1514 bool
pg_verify_mbstr(int encoding,const char * mbstr,int len,bool noError)1515 pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
1516 {
1517 	int			oklen;
1518 
1519 	Assert(PG_VALID_ENCODING(encoding));
1520 
1521 	oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
1522 	if (oklen != len)
1523 	{
1524 		if (noError)
1525 			return false;
1526 		report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
1527 	}
1528 	return true;
1529 }
1530 
1531 /*
1532  * Verify mbstr to make sure that it is validly encoded in the specified
1533  * encoding.
1534  *
1535  * mbstr is not necessarily zero terminated; length of mbstr is
1536  * specified by len.
1537  *
1538  * If OK, return length of string in the encoding.
1539  * If a problem is found, return -1 when noError is
1540  * true; when noError is false, ereport() a descriptive message.
1541  *
1542  * Note: We cannot use the faster encoding-specific mbverifystr() function
1543  * here, because we need to count the number of characters in the string.
1544  */
1545 int
pg_verify_mbstr_len(int encoding,const char * mbstr,int len,bool noError)1546 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1547 {
1548 	mbchar_verifier mbverifychar;
1549 	int			mb_len;
1550 
1551 	Assert(PG_VALID_ENCODING(encoding));
1552 
1553 	/*
1554 	 * In single-byte encodings, we need only reject nulls (\0).
1555 	 */
1556 	if (pg_encoding_max_length(encoding) <= 1)
1557 	{
1558 		const char *nullpos = memchr(mbstr, 0, len);
1559 
1560 		if (nullpos == NULL)
1561 			return len;
1562 		if (noError)
1563 			return -1;
1564 		report_invalid_encoding(encoding, nullpos, 1);
1565 	}
1566 
1567 	/* fetch function pointer just once */
1568 	mbverifychar = pg_wchar_table[encoding].mbverifychar;
1569 
1570 	mb_len = 0;
1571 
1572 	while (len > 0)
1573 	{
1574 		int			l;
1575 
1576 		/* fast path for ASCII-subset characters */
1577 		if (!IS_HIGHBIT_SET(*mbstr))
1578 		{
1579 			if (*mbstr != '\0')
1580 			{
1581 				mb_len++;
1582 				mbstr++;
1583 				len--;
1584 				continue;
1585 			}
1586 			if (noError)
1587 				return -1;
1588 			report_invalid_encoding(encoding, mbstr, len);
1589 		}
1590 
1591 		l = (*mbverifychar) ((const unsigned char *) mbstr, len);
1592 
1593 		if (l < 0)
1594 		{
1595 			if (noError)
1596 				return -1;
1597 			report_invalid_encoding(encoding, mbstr, len);
1598 		}
1599 
1600 		mbstr += l;
1601 		len -= l;
1602 		mb_len++;
1603 	}
1604 	return mb_len;
1605 }
1606 
1607 /*
1608  * check_encoding_conversion_args: check arguments of a conversion function
1609  *
1610  * "expected" arguments can be either an encoding ID or -1 to indicate that
1611  * the caller will check whether it accepts the ID.
1612  *
1613  * Note: the errors here are not really user-facing, so elog instead of
1614  * ereport seems sufficient.  Also, we trust that the "expected" encoding
1615  * arguments are valid encoding IDs, but we don't trust the actuals.
1616  */
1617 void
check_encoding_conversion_args(int src_encoding,int dest_encoding,int len,int expected_src_encoding,int expected_dest_encoding)1618 check_encoding_conversion_args(int src_encoding,
1619 							   int dest_encoding,
1620 							   int len,
1621 							   int expected_src_encoding,
1622 							   int expected_dest_encoding)
1623 {
1624 	if (!PG_VALID_ENCODING(src_encoding))
1625 		elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1626 	if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1627 		elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1628 			 pg_enc2name_tbl[expected_src_encoding].name,
1629 			 pg_enc2name_tbl[src_encoding].name);
1630 	if (!PG_VALID_ENCODING(dest_encoding))
1631 		elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1632 	if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1633 		elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1634 			 pg_enc2name_tbl[expected_dest_encoding].name,
1635 			 pg_enc2name_tbl[dest_encoding].name);
1636 	if (len < 0)
1637 		elog(ERROR, "encoding conversion length must not be negative");
1638 }
1639 
1640 /*
1641  * report_invalid_encoding: complain about invalid multibyte character
1642  *
1643  * note: len is remaining length of string, not length of character;
1644  * len must be greater than zero, as we always examine the first byte.
1645  */
1646 void
report_invalid_encoding(int encoding,const char * mbstr,int len)1647 report_invalid_encoding(int encoding, const char *mbstr, int len)
1648 {
1649 	int			l = pg_encoding_mblen(encoding, mbstr);
1650 	char		buf[8 * 5 + 1];
1651 	char	   *p = buf;
1652 	int			j,
1653 				jlimit;
1654 
1655 	jlimit = Min(l, len);
1656 	jlimit = Min(jlimit, 8);	/* prevent buffer overrun */
1657 
1658 	for (j = 0; j < jlimit; j++)
1659 	{
1660 		p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1661 		if (j < jlimit - 1)
1662 			p += sprintf(p, " ");
1663 	}
1664 
1665 	ereport(ERROR,
1666 			(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1667 			 errmsg("invalid byte sequence for encoding \"%s\": %s",
1668 					pg_enc2name_tbl[encoding].name,
1669 					buf)));
1670 }
1671 
1672 /*
1673  * report_untranslatable_char: complain about untranslatable character
1674  *
1675  * note: len is remaining length of string, not length of character;
1676  * len must be greater than zero, as we always examine the first byte.
1677  */
1678 void
report_untranslatable_char(int src_encoding,int dest_encoding,const char * mbstr,int len)1679 report_untranslatable_char(int src_encoding, int dest_encoding,
1680 						   const char *mbstr, int len)
1681 {
1682 	int			l = pg_encoding_mblen(src_encoding, mbstr);
1683 	char		buf[8 * 5 + 1];
1684 	char	   *p = buf;
1685 	int			j,
1686 				jlimit;
1687 
1688 	jlimit = Min(l, len);
1689 	jlimit = Min(jlimit, 8);	/* prevent buffer overrun */
1690 
1691 	for (j = 0; j < jlimit; j++)
1692 	{
1693 		p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1694 		if (j < jlimit - 1)
1695 			p += sprintf(p, " ");
1696 	}
1697 
1698 	ereport(ERROR,
1699 			(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
1700 			 errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
1701 					buf,
1702 					pg_enc2name_tbl[src_encoding].name,
1703 					pg_enc2name_tbl[dest_encoding].name)));
1704 }
1705 
1706 
1707 #ifdef WIN32
1708 /*
1709  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1710  * string. The character length is also passed to utf16len if not
1711  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1712  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1713  */
1714 WCHAR *
pgwin32_message_to_UTF16(const char * str,int len,int * utf16len)1715 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1716 {
1717 	int			msgenc = GetMessageEncoding();
1718 	WCHAR	   *utf16;
1719 	int			dstlen;
1720 	UINT		codepage;
1721 
1722 	if (msgenc == PG_SQL_ASCII)
1723 		/* No conversion is possible, and SQL_ASCII is never utf16. */
1724 		return NULL;
1725 
1726 	codepage = pg_enc2name_tbl[msgenc].codepage;
1727 
1728 	/*
1729 	 * Use MultiByteToWideChar directly if there is a corresponding codepage,
1730 	 * or double conversion through UTF8 if not.  Double conversion is needed,
1731 	 * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1732 	 */
1733 	if (codepage != 0)
1734 	{
1735 		utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1736 		dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1737 		utf16[dstlen] = (WCHAR) 0;
1738 	}
1739 	else
1740 	{
1741 		char	   *utf8;
1742 
1743 		/*
1744 		 * XXX pg_do_encoding_conversion() requires a transaction.  In the
1745 		 * absence of one, hope for the input to be valid UTF8.
1746 		 */
1747 		if (IsTransactionState())
1748 		{
1749 			utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1750 													  len,
1751 													  msgenc,
1752 													  PG_UTF8);
1753 			if (utf8 != str)
1754 				len = strlen(utf8);
1755 		}
1756 		else
1757 			utf8 = (char *) str;
1758 
1759 		utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1760 		dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1761 		utf16[dstlen] = (WCHAR) 0;
1762 
1763 		if (utf8 != str)
1764 			pfree(utf8);
1765 	}
1766 
1767 	if (dstlen == 0 && len > 0)
1768 	{
1769 		pfree(utf16);
1770 		return NULL;			/* error */
1771 	}
1772 
1773 	if (utf16len)
1774 		*utf16len = dstlen;
1775 	return utf16;
1776 }
1777 
1778 #endif							/* WIN32 */
1779