1 /*--------------------------------------------------------------------
2  * Symbols referenced in this file:
3  * - GetDatabaseEncoding
4  * - DatabaseEncoding
5  * - pg_get_client_encoding
6  * - ClientEncoding
7  * - pg_mbcliplen
8  * - pg_encoding_mbcliplen
9  * - cliplen
10  * - pg_mblen
11  * - pg_mbstrlen_with_len
12  * - SetDatabaseEncoding
13  *--------------------------------------------------------------------
14  */
15 
16 /*-------------------------------------------------------------------------
17  *
18  * mbutils.c
19  *	  This file contains functions for encoding conversion.
20  *
21  * The string-conversion functions in this file share some API quirks.
22  * Note the following:
23  *
24  * The functions return a palloc'd, null-terminated string if conversion
25  * is required.  However, if no conversion is performed, the given source
26  * string pointer is returned as-is.
27  *
28  * Although the presence of a length argument means that callers can pass
29  * non-null-terminated strings, care is required because the same string
30  * will be passed back if no conversion occurs.  Such callers *must* check
31  * whether result == src and handle that case differently.
32  *
33  * If the source and destination encodings are the same, the source string
34  * is returned without any verification; it's assumed to be valid data.
35  * If that might not be the case, the caller is responsible for validating
36  * the string using a separate call to pg_verify_mbstr().  Whenever the
37  * source and destination encodings are different, the functions ensure that
38  * the result is validly encoded according to the destination encoding.
39  *
40  *
41  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
42  * Portions Copyright (c) 1994, Regents of the University of California
43  *
44  *
45  * IDENTIFICATION
46  *	  src/backend/utils/mb/mbutils.c
47  *
48  *-------------------------------------------------------------------------
49  */
50 #include "postgres.h"
51 
52 #include "access/xact.h"
53 #include "catalog/namespace.h"
54 #include "mb/pg_wchar.h"
55 #include "utils/builtins.h"
56 #include "utils/memutils.h"
57 #include "utils/syscache.h"
58 
59 /*
 * When converting strings between different encodings, we assume that the
 * converted result needs at most 4-to-1 growth in the worst case.  The growth
 * rate for all currently supported encoding pairs is within 3 (SJIS JIS X0201
 * half-width kana -> UTF8 is the worst case), so "4" should be enough for now.
64  *
65  * Note that this is not the same as the maximum character width in any
66  * particular encoding.
67  */
68 #define MAX_CONVERSION_GROWTH  4
69 
70 /*
71  * We maintain a simple linked list caching the fmgr lookup info for the
72  * currently selected conversion functions, as well as any that have been
73  * selected previously in the current session.  (We remember previous
74  * settings because we must be able to restore a previous setting during
75  * transaction rollback, without doing any fresh catalog accesses.)
76  *
77  * Since we'll never release this data, we just keep it in TopMemoryContext.
78  */
/*
 * One cache entry: the fmgr lookup info for the pair of conversion functions
 * that goes with a particular (server encoding, client encoding) combination.
 */
typedef struct ConvProcInfo
{
	int			s_encoding;		/* server and client encoding IDs */
	int			c_encoding;		/* (s_encoding = server, c_encoding = client) */
	FmgrInfo	to_server_info; /* lookup info for conversion procs */
	FmgrInfo	to_client_info; /* NOTE(review): presumably the server->client
								 * direction, going by the name -- confirm */
} ConvProcInfo;
86 
87 	/* List of ConvProcInfo */
88 
89 /*
90  * These variables point to the currently active conversion functions,
91  * or are NULL when no conversion is needed.
92  */
93 
94 
95 
/*
 * These variables track the currently-selected encodings.
 *
 * Each one points at an entry of pg_enc2name_tbl[] and starts out at the
 * PG_SQL_ASCII entry.  Both are declared __thread, so every thread has its
 * own copy.
 */
/* current client encoding; read by pg_get_client_encoding() */
static __thread const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];

/* current database (server) encoding; changed by SetDatabaseEncoding() */
static __thread const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
102 
103 
104 
105 /*
106  * During backend startup we can't set client encoding because we (a)
107  * can't look up the conversion functions, and (b) may not know the database
108  * encoding yet either.  So SetClientEncoding() just accepts anything and
109  * remembers it for InitializeClientEncoding() to apply later.
110  */
111 
112 
113 
114 
/* Internal functions */

/*
 * NOTE(review): no definition of perform_default_encoding_conversion() is
 * visible in this excerpt of the file -- confirm it is still needed here.
 */
static char *perform_default_encoding_conversion(const char *src,
									int len, bool is_client_to_server);

/* byte-limited clipping for single-byte encodings; defined further down */
static int	cliplen(const char *str, int len, int limit);
119 
120 
121 /*
122  * Prepare for a future call to SetClientEncoding.  Success should mean
123  * that SetClientEncoding is guaranteed to succeed for this encoding request.
124  *
125  * (But note that success before backend_startup_complete does not guarantee
126  * success after ...)
127  *
128  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
129  */
130 
131 
132 /*
133  * Set the active client encoding and set up the conversion-function pointers.
134  * PrepareClientEncoding should have been called previously for this encoding.
135  *
136  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
137  */
138 
139 
140 /*
141  * Initialize client encoding conversions.
142  *		Called from InitPostgres() once during backend startup.
143  */
144 
145 
146 /*
147  * returns the current client encoding
148  */
149 int
pg_get_client_encoding(void)150 pg_get_client_encoding(void)
151 {
152 	return ClientEncoding->encoding;
153 }
154 
155 /*
156  * returns the current client encoding name
157  */
158 
159 
160 /*
161  * Convert src string to another encoding (general case).
162  *
163  * See the notes about string conversion functions at the top of this file.
164  */
165 
166 
167 /*
168  * Convert string to encoding encoding_name. The source
169  * encoding is the DB encoding.
170  *
171  * BYTEA convert_to(TEXT string, NAME encoding_name) */
172 
173 
174 /*
175  * Convert string from encoding encoding_name. The destination
176  * encoding is the DB encoding.
177  *
178  * TEXT convert_from(BYTEA string, NAME encoding_name) */
179 
180 
181 /*
182  * Convert string between two arbitrary encodings.
183  *
184  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
185  */
186 
187 
188 /*
189  * get the length of the string considered as text in the specified
190  * encoding. Raises an error if the data is not valid in that
191  * encoding.
192  *
193  * INT4 length (BYTEA string, NAME src_encoding_name)
194  */
195 
196 
197 /*
198  * Get maximum multibyte character length in the specified encoding.
199  *
200  * Note encoding is specified numerically, not by name as above.
201  */
202 
203 
204 /*
205  * Convert client encoding to server encoding.
206  *
207  * See the notes about string conversion functions at the top of this file.
208  */
209 
210 
211 /*
212  * Convert any encoding to server encoding.
213  *
214  * See the notes about string conversion functions at the top of this file.
215  *
216  * Unlike the other string conversion functions, this will apply validation
217  * even if encoding == DatabaseEncoding->encoding.  This is because this is
218  * used to process data coming in from outside the database, and we never
219  * want to just assume validity.
220  */
221 
222 
223 /*
224  * Convert server encoding to client encoding.
225  *
226  * See the notes about string conversion functions at the top of this file.
227  */
228 
229 
230 /*
231  * Convert server encoding to any encoding.
232  *
233  * See the notes about string conversion functions at the top of this file.
234  */
235 
236 
237 /*
238  *	Perform default encoding conversion using cached FmgrInfo. Since
239  *	this function does not access database at all, it is safe to call
240  *	outside transactions.  If the conversion has not been set up by
241  *	SetClientEncoding(), no conversion is performed.
242  */
243 
244 
245 
246 /* convert a multibyte string to a wchar */
247 
248 
249 /* convert a multibyte string to a wchar with a limited length */
250 
251 
252 /* same, with any encoding */
253 
254 
255 /* convert a wchar string to a multibyte */
256 
257 
258 /* convert a wchar string to a multibyte with a limited length */
259 
260 
261 /* same, with any encoding */
262 
263 
264 /* returns the byte length of a multibyte character */
265 int
pg_mblen(const char * mbstr)266 pg_mblen(const char *mbstr)
267 {
268 	return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
269 }
270 
271 /* returns the display length of a multibyte character */
272 
273 
274 /* returns the length (counted in wchars) of a multibyte string */
275 
276 
/*
 * pg_mbstrlen_with_len
 *		Count the characters (not bytes) in a multibyte string that is not
 *		necessarily NUL-terminated.
 *
 * "limit" is the number of bytes available at mbstr.  In the multibyte case,
 * counting stops once that many bytes have been consumed or a NUL byte is
 * found at a character boundary.  When the database encoding is single-byte
 * the result is simply "limit"; that shortcut does not look for a NUL byte.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	const char *cur = mbstr;
	int			remaining = limit;
	int			nchars;

	/* single-byte encoding: every byte is one character, nothing to scan */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	for (nchars = 0; remaining > 0 && *cur != '\0'; nchars++)
	{
		int			width = pg_mblen(cur);

		remaining -= width;
		cur += width;
	}

	return nchars;
}
299 
300 /*
301  * returns the byte length of a multibyte string
302  * (not necessarily NULL terminated)
303  * that is no longer than limit.
304  * this function does not break multibyte character boundary.
305  */
306 int
pg_mbcliplen(const char * mbstr,int len,int limit)307 pg_mbcliplen(const char *mbstr, int len, int limit)
308 {
309 	return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
310 								 len, limit);
311 }
312 
313 /*
314  * pg_mbcliplen with specified encoding
315  */
316 int
pg_encoding_mbcliplen(int encoding,const char * mbstr,int len,int limit)317 pg_encoding_mbcliplen(int encoding, const char *mbstr,
318 					  int len, int limit)
319 {
320 	mblen_converter mblen_fn;
321 	int			clen = 0;
322 	int			l;
323 
324 	/* optimization for single byte encoding */
325 	if (pg_encoding_max_length(encoding) == 1)
326 		return cliplen(mbstr, len, limit);
327 
328 	mblen_fn = pg_wchar_table[encoding].mblen;
329 
330 	while (len > 0 && *mbstr)
331 	{
332 		l = (*mblen_fn) ((const unsigned char *) mbstr);
333 		if ((clen + l) > limit)
334 			break;
335 		clen += l;
336 		if (clen == limit)
337 			break;
338 		len -= l;
339 		mbstr += l;
340 	}
341 	return clen;
342 }
343 
344 /*
345  * Similar to pg_mbcliplen except the limit parameter specifies the
346  * character length, not the byte length.
347  */
348 
349 
/*
 * cliplen
 *		mbcliplen for any single-byte encoding.
 *
 * Counts bytes from the start of str until a NUL byte is found or the smaller
 * of "len" and "limit" bytes have been counted, and returns that count.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			bound = (limit < len) ? limit : len;
	int			n;

	for (n = 0; n < bound; n++)
	{
		if (str[n] == '\0')
			break;
	}

	return n;
}
361 
362 void
SetDatabaseEncoding(int encoding)363 SetDatabaseEncoding(int encoding)
364 {
365 	if (!PG_VALID_BE_ENCODING(encoding))
366 		elog(ERROR, "invalid database encoding: %d", encoding);
367 
368 	DatabaseEncoding = &pg_enc2name_tbl[encoding];
369 	Assert(DatabaseEncoding->encoding == encoding);
370 }
371 
372 
373 
374 #ifdef ENABLE_NLS
375 /*
376  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
377  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
378  * fail for gettext-internal causes like out-of-memory.
379  */
380 static bool
raw_pg_bind_textdomain_codeset(const char * domainname,int encoding)381 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
382 {
383 	bool		elog_ok = (CurrentMemoryContext != NULL);
384 	int			i;
385 
386 	for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
387 	{
388 		if (pg_enc2gettext_tbl[i].encoding == encoding)
389 		{
390 			if (bind_textdomain_codeset(domainname,
391 										pg_enc2gettext_tbl[i].name) != NULL)
392 				return true;
393 
394 			if (elog_ok)
395 				elog(LOG, "bind_textdomain_codeset failed");
396 			else
397 				write_stderr("bind_textdomain_codeset failed");
398 
399 			break;
400 		}
401 	}
402 
403 	return false;
404 }
405 
406 /*
407  * Bind a gettext message domain to the codeset corresponding to the database
408  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
409  * Return the MessageEncoding implied by the new settings.
410  *
411  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
412  * When that matches the database encoding, we don't need to do anything.  In
413  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
414  * database encoding, except for the C locale.  (On Windows, we also permit a
415  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
416  * gettext to the right codeset.
417  *
418  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
419  * convenient departure for software that passes the strings to Windows ANSI
420  * APIs, but we don't do that.  Compel gettext to use database encoding or,
421  * failing that, the LC_CTYPE encoding as it would on other platforms.
422  *
423  * This function is called before elog() and palloc() are usable.
424  */
425 int
pg_bind_textdomain_codeset(const char * domainname)426 pg_bind_textdomain_codeset(const char *domainname)
427 {
428 	bool		elog_ok = (CurrentMemoryContext != NULL);
429 	int			encoding = GetDatabaseEncoding();
430 	int			new_msgenc;
431 
432 #ifndef WIN32
433 	const char *ctype = setlocale(LC_CTYPE, NULL);
434 
435 	if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
436 #endif
437 		if (encoding != PG_SQL_ASCII &&
438 			raw_pg_bind_textdomain_codeset(domainname, encoding))
439 			return encoding;
440 
441 	new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
442 	if (new_msgenc < 0)
443 		new_msgenc = PG_SQL_ASCII;
444 
445 #ifdef WIN32
446 	if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
447 		/* On failure, the old message encoding remains valid. */
448 		return GetMessageEncoding();
449 #endif
450 
451 	return new_msgenc;
452 }
453 #endif
454 
455 /*
456  * The database encoding, also called the server encoding, represents the
457  * encoding of data stored in text-like data types.  Affected types include
458  * cstring, text, varchar, name, xml, and json.
459  */
460 int
GetDatabaseEncoding(void)461 GetDatabaseEncoding(void)
462 {
463 	return DatabaseEncoding->encoding;
464 }
465 
466 
467 
468 
469 
470 
471 
472 /*
473  * gettext() returns messages in this encoding.  This often matches the
474  * database encoding, but it differs for SQL_ASCII databases, for processes
475  * not attached to a database, and under a database encoding lacking iconv
476  * support (MULE_INTERNAL).
477  */
478 
479 
480 #ifdef WIN32
481 /*
482  * Result is palloc'ed null-terminated utf16 string. The character length
483  * is also passed to utf16len if not null. Returns NULL iff failed.
484  */
485 WCHAR *
pgwin32_message_to_UTF16(const char * str,int len,int * utf16len)486 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
487 {
488 	WCHAR	   *utf16;
489 	int			dstlen;
490 	UINT		codepage;
491 
492 	codepage = pg_enc2name_tbl[GetMessageEncoding()].codepage;
493 
494 	/*
495 	 * Use MultiByteToWideChar directly if there is a corresponding codepage,
496 	 * or double conversion through UTF8 if not.  Double conversion is needed,
497 	 * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
498 	 */
499 	if (codepage != 0)
500 	{
501 		utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
502 		dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
503 		utf16[dstlen] = (WCHAR) 0;
504 	}
505 	else
506 	{
507 		char	   *utf8;
508 
509 		/*
510 		 * XXX pg_do_encoding_conversion() requires a transaction.  In the
511 		 * absence of one, hope for the input to be valid UTF8.
512 		 */
513 		if (IsTransactionState())
514 		{
515 			utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
516 													  len,
517 													  GetMessageEncoding(),
518 													  PG_UTF8);
519 			if (utf8 != str)
520 				len = strlen(utf8);
521 		}
522 		else
523 			utf8 = (char *) str;
524 
525 		utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
526 		dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
527 		utf16[dstlen] = (WCHAR) 0;
528 
529 		if (utf8 != str)
530 			pfree(utf8);
531 	}
532 
533 	if (dstlen == 0 && len > 0)
534 	{
535 		pfree(utf16);
536 		return NULL;			/* error */
537 	}
538 
539 	if (utf16len)
540 		*utf16len = dstlen;
541 	return utf16;
542 }
543 
544 #endif
545