1 /*--------------------------------------------------------------------
2 * Symbols referenced in this file:
3 * - GetDatabaseEncoding
4 * - DatabaseEncoding
5 * - pg_get_client_encoding
6 * - ClientEncoding
7 * - pg_mbcliplen
8 * - pg_encoding_mbcliplen
9 * - cliplen
10 * - pg_mblen
11 * - pg_mbstrlen_with_len
12 * - SetDatabaseEncoding
13 *--------------------------------------------------------------------
14 */
15
16 /*-------------------------------------------------------------------------
17 *
18 * mbutils.c
19 * This file contains functions for encoding conversion.
20 *
21 * The string-conversion functions in this file share some API quirks.
22 * Note the following:
23 *
24 * The functions return a palloc'd, null-terminated string if conversion
25 * is required. However, if no conversion is performed, the given source
26 * string pointer is returned as-is.
27 *
28 * Although the presence of a length argument means that callers can pass
29 * non-null-terminated strings, care is required because the same string
30 * will be passed back if no conversion occurs. Such callers *must* check
31 * whether result == src and handle that case differently.
32 *
33 * If the source and destination encodings are the same, the source string
34 * is returned without any verification; it's assumed to be valid data.
35 * If that might not be the case, the caller is responsible for validating
36 * the string using a separate call to pg_verify_mbstr(). Whenever the
37 * source and destination encodings are different, the functions ensure that
38 * the result is validly encoded according to the destination encoding.
39 *
40 *
41 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
42 * Portions Copyright (c) 1994, Regents of the University of California
43 *
44 *
45 * IDENTIFICATION
46 * src/backend/utils/mb/mbutils.c
47 *
48 *-------------------------------------------------------------------------
49 */
50 #include "postgres.h"
51
52 #include "access/xact.h"
53 #include "catalog/namespace.h"
54 #include "mb/pg_wchar.h"
55 #include "utils/builtins.h"
56 #include "utils/memutils.h"
57 #include "utils/syscache.h"
58
/*
 * Worst-case growth factor assumed when converting a string between two
 * different encodings: the converted result is taken to need at most this
 * many times the space of the source.  The rate for the currently supported
 * encoding pairs is within 3 (SJIS JIS X0201 half-width kana -> UTF8 is the
 * worst case), so "4" should be enough for the moment.
 *
 * Note that this is not the same as the maximum character width in any
 * particular encoding.
 */
#define MAX_CONVERSION_GROWTH	4
69
70 /*
71 * We maintain a simple linked list caching the fmgr lookup info for the
72 * currently selected conversion functions, as well as any that have been
73 * selected previously in the current session. (We remember previous
74 * settings because we must be able to restore a previous setting during
75 * transaction rollback, without doing any fresh catalog accesses.)
76 *
77 * Since we'll never release this data, we just keep it in TopMemoryContext.
78 */
79 typedef struct ConvProcInfo
80 {
81 int s_encoding; /* server and client encoding IDs */
82 int c_encoding;
83 FmgrInfo to_server_info; /* lookup info for conversion procs */
84 FmgrInfo to_client_info;
85 } ConvProcInfo;
86
87 /* List of ConvProcInfo */
88
89 /*
90 * These variables point to the currently active conversion functions,
91 * or are NULL when no conversion is needed.
92 */
93
94
95
96 /*
97 * These variables track the currently-selected encodings.
98 */
99 static __thread const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
100
101 static __thread const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
102
103
104
105 /*
106 * During backend startup we can't set client encoding because we (a)
107 * can't look up the conversion functions, and (b) may not know the database
108 * encoding yet either. So SetClientEncoding() just accepts anything and
109 * remembers it for InitializeClientEncoding() to apply later.
110 */
111
112
113
114
/* Prototypes for file-local helper functions */
static char *perform_default_encoding_conversion(const char *src, int len,
									bool is_client_to_server);
static int	cliplen(const char *str, int len, int limit);
119
120
121 /*
122 * Prepare for a future call to SetClientEncoding. Success should mean
123 * that SetClientEncoding is guaranteed to succeed for this encoding request.
124 *
125 * (But note that success before backend_startup_complete does not guarantee
126 * success after ...)
127 *
128 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
129 */
130
131
132 /*
133 * Set the active client encoding and set up the conversion-function pointers.
134 * PrepareClientEncoding should have been called previously for this encoding.
135 *
136 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
137 */
138
139
140 /*
141 * Initialize client encoding conversions.
142 * Called from InitPostgres() once during backend startup.
143 */
144
145
146 /*
147 * returns the current client encoding
148 */
149 int
pg_get_client_encoding(void)150 pg_get_client_encoding(void)
151 {
152 return ClientEncoding->encoding;
153 }
154
155 /*
156 * returns the current client encoding name
157 */
158
159
160 /*
161 * Convert src string to another encoding (general case).
162 *
163 * See the notes about string conversion functions at the top of this file.
164 */
165
166
167 /*
168 * Convert string to encoding encoding_name. The source
169 * encoding is the DB encoding.
170 *
171 * BYTEA convert_to(TEXT string, NAME encoding_name) */
172
173
174 /*
175 * Convert string from encoding encoding_name. The destination
176 * encoding is the DB encoding.
177 *
178 * TEXT convert_from(BYTEA string, NAME encoding_name) */
179
180
181 /*
182 * Convert string between two arbitrary encodings.
183 *
184 * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
185 */
186
187
188 /*
189 * get the length of the string considered as text in the specified
190 * encoding. Raises an error if the data is not valid in that
191 * encoding.
192 *
193 * INT4 length (BYTEA string, NAME src_encoding_name)
194 */
195
196
197 /*
198 * Get maximum multibyte character length in the specified encoding.
199 *
200 * Note encoding is specified numerically, not by name as above.
201 */
202
203
204 /*
205 * Convert client encoding to server encoding.
206 *
207 * See the notes about string conversion functions at the top of this file.
208 */
209
210
211 /*
212 * Convert any encoding to server encoding.
213 *
214 * See the notes about string conversion functions at the top of this file.
215 *
216 * Unlike the other string conversion functions, this will apply validation
217 * even if encoding == DatabaseEncoding->encoding. This is because this is
218 * used to process data coming in from outside the database, and we never
219 * want to just assume validity.
220 */
221
222
223 /*
224 * Convert server encoding to client encoding.
225 *
226 * See the notes about string conversion functions at the top of this file.
227 */
228
229
230 /*
231 * Convert server encoding to any encoding.
232 *
233 * See the notes about string conversion functions at the top of this file.
234 */
235
236
237 /*
238 * Perform default encoding conversion using cached FmgrInfo. Since
239 * this function does not access database at all, it is safe to call
240 * outside transactions. If the conversion has not been set up by
241 * SetClientEncoding(), no conversion is performed.
242 */
243
244
245
246 /* convert a multibyte string to a wchar */
247
248
249 /* convert a multibyte string to a wchar with a limited length */
250
251
252 /* same, with any encoding */
253
254
255 /* convert a wchar string to a multibyte */
256
257
258 /* convert a wchar string to a multibyte with a limited length */
259
260
261 /* same, with any encoding */
262
263
264 /* returns the byte length of a multibyte character */
265 int
pg_mblen(const char * mbstr)266 pg_mblen(const char *mbstr)
267 {
268 return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
269 }
270
271 /* returns the display length of a multibyte character */
272
273
274 /* returns the length (counted in wchars) of a multibyte string */
275
276
/*
 * Count the characters in a multibyte string that is not necessarily
 * null-terminated.
 *
 * mbstr: start of the string, in the database encoding
 * limit: number of bytes to examine
 *
 * For a single-byte database encoding the answer is simply "limit"; the
 * string contents are not inspected in that case.  Otherwise characters are
 * counted until either the byte budget is used up or a zero byte is reached.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	const char *cursor;
	int			remaining;
	int			nchars;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	cursor = mbstr;
	remaining = limit;

	for (nchars = 0; remaining > 0 && *cursor != '\0'; nchars++)
	{
		int			charbytes = pg_mblen(cursor);

		remaining -= charbytes;
		cursor += charbytes;
	}

	return nchars;
}
299
300 /*
301 * returns the byte length of a multibyte string
302 * (not necessarily NULL terminated)
303 * that is no longer than limit.
304 * this function does not break multibyte character boundary.
305 */
306 int
pg_mbcliplen(const char * mbstr,int len,int limit)307 pg_mbcliplen(const char *mbstr, int len, int limit)
308 {
309 return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
310 len, limit);
311 }
312
313 /*
314 * pg_mbcliplen with specified encoding
315 */
316 int
pg_encoding_mbcliplen(int encoding,const char * mbstr,int len,int limit)317 pg_encoding_mbcliplen(int encoding, const char *mbstr,
318 int len, int limit)
319 {
320 mblen_converter mblen_fn;
321 int clen = 0;
322 int l;
323
324 /* optimization for single byte encoding */
325 if (pg_encoding_max_length(encoding) == 1)
326 return cliplen(mbstr, len, limit);
327
328 mblen_fn = pg_wchar_table[encoding].mblen;
329
330 while (len > 0 && *mbstr)
331 {
332 l = (*mblen_fn) ((const unsigned char *) mbstr);
333 if ((clen + l) > limit)
334 break;
335 clen += l;
336 if (clen == limit)
337 break;
338 len -= l;
339 mbstr += l;
340 }
341 return clen;
342 }
343
344 /*
345 * Similar to pg_mbcliplen except the limit parameter specifies the
346 * character length, not the byte length.
347 */
348
349
/*
 * mbcliplen for any single-byte encoding.
 *
 * Returns the number of bytes before the first zero byte in str, capped at
 * the smaller of len and limit.  A non-positive cap yields 0.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			bound = (limit < len) ? limit : len;
	int			n;

	for (n = 0; n < bound; n++)
	{
		if (str[n] == '\0')
			break;
	}

	return n;
}
361
362 void
SetDatabaseEncoding(int encoding)363 SetDatabaseEncoding(int encoding)
364 {
365 if (!PG_VALID_BE_ENCODING(encoding))
366 elog(ERROR, "invalid database encoding: %d", encoding);
367
368 DatabaseEncoding = &pg_enc2name_tbl[encoding];
369 Assert(DatabaseEncoding->encoding == encoding);
370 }
371
372
373
374 #ifdef ENABLE_NLS
375 /*
376 * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
377 * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
378 * fail for gettext-internal causes like out-of-memory.
379 */
380 static bool
raw_pg_bind_textdomain_codeset(const char * domainname,int encoding)381 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
382 {
383 bool elog_ok = (CurrentMemoryContext != NULL);
384 int i;
385
386 for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
387 {
388 if (pg_enc2gettext_tbl[i].encoding == encoding)
389 {
390 if (bind_textdomain_codeset(domainname,
391 pg_enc2gettext_tbl[i].name) != NULL)
392 return true;
393
394 if (elog_ok)
395 elog(LOG, "bind_textdomain_codeset failed");
396 else
397 write_stderr("bind_textdomain_codeset failed");
398
399 break;
400 }
401 }
402
403 return false;
404 }
405
406 /*
407 * Bind a gettext message domain to the codeset corresponding to the database
408 * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
409 * Return the MessageEncoding implied by the new settings.
410 *
411 * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
412 * When that matches the database encoding, we don't need to do anything. In
413 * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
414 * database encoding, except for the C locale. (On Windows, we also permit a
415 * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
416 * gettext to the right codeset.
417 *
418 * On Windows, gettext defaults to the Windows ANSI code page. This is a
419 * convenient departure for software that passes the strings to Windows ANSI
420 * APIs, but we don't do that. Compel gettext to use database encoding or,
421 * failing that, the LC_CTYPE encoding as it would on other platforms.
422 *
423 * This function is called before elog() and palloc() are usable.
424 */
425 int
pg_bind_textdomain_codeset(const char * domainname)426 pg_bind_textdomain_codeset(const char *domainname)
427 {
428 bool elog_ok = (CurrentMemoryContext != NULL);
429 int encoding = GetDatabaseEncoding();
430 int new_msgenc;
431
432 #ifndef WIN32
433 const char *ctype = setlocale(LC_CTYPE, NULL);
434
435 if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
436 #endif
437 if (encoding != PG_SQL_ASCII &&
438 raw_pg_bind_textdomain_codeset(domainname, encoding))
439 return encoding;
440
441 new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
442 if (new_msgenc < 0)
443 new_msgenc = PG_SQL_ASCII;
444
445 #ifdef WIN32
446 if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
447 /* On failure, the old message encoding remains valid. */
448 return GetMessageEncoding();
449 #endif
450
451 return new_msgenc;
452 }
453 #endif
454
455 /*
456 * The database encoding, also called the server encoding, represents the
457 * encoding of data stored in text-like data types. Affected types include
458 * cstring, text, varchar, name, xml, and json.
459 */
460 int
GetDatabaseEncoding(void)461 GetDatabaseEncoding(void)
462 {
463 return DatabaseEncoding->encoding;
464 }
465
466
467
468
469
470
471
472 /*
473 * gettext() returns messages in this encoding. This often matches the
474 * database encoding, but it differs for SQL_ASCII databases, for processes
475 * not attached to a database, and under a database encoding lacking iconv
476 * support (MULE_INTERNAL).
477 */
478
479
480 #ifdef WIN32
481 /*
482 * Result is palloc'ed null-terminated utf16 string. The character length
483 * is also passed to utf16len if not null. Returns NULL iff failed.
484 */
485 WCHAR *
pgwin32_message_to_UTF16(const char * str,int len,int * utf16len)486 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
487 {
488 WCHAR *utf16;
489 int dstlen;
490 UINT codepage;
491
492 codepage = pg_enc2name_tbl[GetMessageEncoding()].codepage;
493
494 /*
495 * Use MultiByteToWideChar directly if there is a corresponding codepage,
496 * or double conversion through UTF8 if not. Double conversion is needed,
497 * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
498 */
499 if (codepage != 0)
500 {
501 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
502 dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
503 utf16[dstlen] = (WCHAR) 0;
504 }
505 else
506 {
507 char *utf8;
508
509 /*
510 * XXX pg_do_encoding_conversion() requires a transaction. In the
511 * absence of one, hope for the input to be valid UTF8.
512 */
513 if (IsTransactionState())
514 {
515 utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
516 len,
517 GetMessageEncoding(),
518 PG_UTF8);
519 if (utf8 != str)
520 len = strlen(utf8);
521 }
522 else
523 utf8 = (char *) str;
524
525 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
526 dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
527 utf16[dstlen] = (WCHAR) 0;
528
529 if (utf8 != str)
530 pfree(utf8);
531 }
532
533 if (dstlen == 0 && len > 0)
534 {
535 pfree(utf16);
536 return NULL; /* error */
537 }
538
539 if (utf16len)
540 *utf16len = dstlen;
541 return utf16;
542 }
543
544 #endif
545