1 /*-------------------------------------------------------------------------
2 *
3 * mbutils.c
4 * This file contains functions for encoding conversion.
5 *
6 * The string-conversion functions in this file share some API quirks.
7 * Note the following:
8 *
9 * The functions return a palloc'd, null-terminated string if conversion
10 * is required. However, if no conversion is performed, the given source
11 * string pointer is returned as-is.
12 *
13 * Although the presence of a length argument means that callers can pass
14 * non-null-terminated strings, care is required because the same string
15 * will be passed back if no conversion occurs. Such callers *must* check
16 * whether result == src and handle that case differently.
17 *
18 * If the source and destination encodings are the same, the source string
19 * is returned without any verification; it's assumed to be valid data.
20 * If that might not be the case, the caller is responsible for validating
21 * the string using a separate call to pg_verify_mbstr(). Whenever the
22 * source and destination encodings are different, the functions ensure that
23 * the result is validly encoded according to the destination encoding.
24 *
25 *
26 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
27 * Portions Copyright (c) 1994, Regents of the University of California
28 *
29 *
30 * IDENTIFICATION
31 * src/backend/utils/mb/mbutils.c
32 *
33 *-------------------------------------------------------------------------
34 */
35 #include "postgres.h"
36
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/builtins.h"
41 #include "utils/memutils.h"
42 #include "utils/syscache.h"
43
44 /*
45 * We maintain a simple linked list caching the fmgr lookup info for the
46 * currently selected conversion functions, as well as any that have been
47 * selected previously in the current session. (We remember previous
48 * settings because we must be able to restore a previous setting during
49 * transaction rollback, without doing any fresh catalog accesses.)
50 *
51 * Since we'll never release this data, we just keep it in TopMemoryContext.
52 */
53 typedef struct ConvProcInfo
54 {
55 int s_encoding; /* server and client encoding IDs */
56 int c_encoding;
57 FmgrInfo to_server_info; /* lookup info for conversion procs */
58 FmgrInfo to_client_info;
59 } ConvProcInfo;
60
61 static List *ConvProcList = NIL; /* List of ConvProcInfo */
62
63 /*
64 * These variables point to the currently active conversion functions,
65 * or are NULL when no conversion is needed.
66 */
67 static FmgrInfo *ToServerConvProc = NULL;
68 static FmgrInfo *ToClientConvProc = NULL;
69
70 /*
71 * This variable stores the conversion function to convert from UTF-8
72 * to the server encoding. It's NULL if the server encoding *is* UTF-8,
73 * or if we lack a conversion function for this.
74 */
75 static FmgrInfo *Utf8ToServerConvProc = NULL;
76
77 /*
78 * These variables track the currently-selected encodings.
79 */
80 static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
81 static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
82 static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
83
84 /*
85 * During backend startup we can't set client encoding because we (a)
86 * can't look up the conversion functions, and (b) may not know the database
87 * encoding yet either. So SetClientEncoding() just accepts anything and
88 * remembers it for InitializeClientEncoding() to apply later.
89 */
90 static bool backend_startup_complete = false;
91 static int pending_client_encoding = PG_SQL_ASCII;
92
93
94 /* Internal functions */
95 static char *perform_default_encoding_conversion(const char *src,
96 int len, bool is_client_to_server);
97 static int cliplen(const char *str, int len, int limit);
98
99
100 /*
101 * Prepare for a future call to SetClientEncoding. Success should mean
102 * that SetClientEncoding is guaranteed to succeed for this encoding request.
103 *
104 * (But note that success before backend_startup_complete does not guarantee
105 * success after ...)
106 *
107 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
108 */
109 int
PrepareClientEncoding(int encoding)110 PrepareClientEncoding(int encoding)
111 {
112 int current_server_encoding;
113 ListCell *lc;
114
115 if (!PG_VALID_FE_ENCODING(encoding))
116 return -1;
117
118 /* Can't do anything during startup, per notes above */
119 if (!backend_startup_complete)
120 return 0;
121
122 current_server_encoding = GetDatabaseEncoding();
123
124 /*
125 * Check for cases that require no conversion function.
126 */
127 if (current_server_encoding == encoding ||
128 current_server_encoding == PG_SQL_ASCII ||
129 encoding == PG_SQL_ASCII)
130 return 0;
131
132 if (IsTransactionState())
133 {
134 /*
135 * If we're in a live transaction, it's safe to access the catalogs,
136 * so look up the functions. We repeat the lookup even if the info is
137 * already cached, so that we can react to changes in the contents of
138 * pg_conversion.
139 */
140 Oid to_server_proc,
141 to_client_proc;
142 ConvProcInfo *convinfo;
143 MemoryContext oldcontext;
144
145 to_server_proc = FindDefaultConversionProc(encoding,
146 current_server_encoding);
147 if (!OidIsValid(to_server_proc))
148 return -1;
149 to_client_proc = FindDefaultConversionProc(current_server_encoding,
150 encoding);
151 if (!OidIsValid(to_client_proc))
152 return -1;
153
154 /*
155 * Load the fmgr info into TopMemoryContext (could still fail here)
156 */
157 convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
158 sizeof(ConvProcInfo));
159 convinfo->s_encoding = current_server_encoding;
160 convinfo->c_encoding = encoding;
161 fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
162 TopMemoryContext);
163 fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
164 TopMemoryContext);
165
166 /* Attach new info to head of list */
167 oldcontext = MemoryContextSwitchTo(TopMemoryContext);
168 ConvProcList = lcons(convinfo, ConvProcList);
169 MemoryContextSwitchTo(oldcontext);
170
171 /*
172 * We cannot yet remove any older entry for the same encoding pair,
173 * since it could still be in use. SetClientEncoding will clean up.
174 */
175
176 return 0; /* success */
177 }
178 else
179 {
180 /*
181 * If we're not in a live transaction, the only thing we can do is
182 * restore a previous setting using the cache. This covers all
183 * transaction-rollback cases. The only case it might not work for is
184 * trying to change client_encoding on the fly by editing
185 * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
186 * thing to do anyway.
187 */
188 foreach(lc, ConvProcList)
189 {
190 ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
191
192 if (oldinfo->s_encoding == current_server_encoding &&
193 oldinfo->c_encoding == encoding)
194 return 0;
195 }
196
197 return -1; /* it's not cached, so fail */
198 }
199 }
200
201 /*
202 * Set the active client encoding and set up the conversion-function pointers.
203 * PrepareClientEncoding should have been called previously for this encoding.
204 *
205 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
206 */
207 int
SetClientEncoding(int encoding)208 SetClientEncoding(int encoding)
209 {
210 int current_server_encoding;
211 bool found;
212 ListCell *lc;
213
214 if (!PG_VALID_FE_ENCODING(encoding))
215 return -1;
216
217 /* Can't do anything during startup, per notes above */
218 if (!backend_startup_complete)
219 {
220 pending_client_encoding = encoding;
221 return 0;
222 }
223
224 current_server_encoding = GetDatabaseEncoding();
225
226 /*
227 * Check for cases that require no conversion function.
228 */
229 if (current_server_encoding == encoding ||
230 current_server_encoding == PG_SQL_ASCII ||
231 encoding == PG_SQL_ASCII)
232 {
233 ClientEncoding = &pg_enc2name_tbl[encoding];
234 ToServerConvProc = NULL;
235 ToClientConvProc = NULL;
236 return 0;
237 }
238
239 /*
240 * Search the cache for the entry previously prepared by
241 * PrepareClientEncoding; if there isn't one, we lose. While at it,
242 * release any duplicate entries so that repeated Prepare/Set cycles don't
243 * leak memory.
244 */
245 found = false;
246 foreach(lc, ConvProcList)
247 {
248 ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
249
250 if (convinfo->s_encoding == current_server_encoding &&
251 convinfo->c_encoding == encoding)
252 {
253 if (!found)
254 {
255 /* Found newest entry, so set up */
256 ClientEncoding = &pg_enc2name_tbl[encoding];
257 ToServerConvProc = &convinfo->to_server_info;
258 ToClientConvProc = &convinfo->to_client_info;
259 found = true;
260 }
261 else
262 {
263 /* Duplicate entry, release it */
264 ConvProcList = foreach_delete_current(ConvProcList, lc);
265 pfree(convinfo);
266 }
267 }
268 }
269
270 if (found)
271 return 0; /* success */
272 else
273 return -1; /* it's not cached, so fail */
274 }
275
276 /*
277 * Initialize client encoding conversions.
278 * Called from InitPostgres() once during backend startup.
279 */
280 void
InitializeClientEncoding(void)281 InitializeClientEncoding(void)
282 {
283 int current_server_encoding;
284
285 Assert(!backend_startup_complete);
286 backend_startup_complete = true;
287
288 if (PrepareClientEncoding(pending_client_encoding) < 0 ||
289 SetClientEncoding(pending_client_encoding) < 0)
290 {
291 /*
292 * Oops, the requested conversion is not available. We couldn't fail
293 * before, but we can now.
294 */
295 ereport(FATAL,
296 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
297 errmsg("conversion between %s and %s is not supported",
298 pg_enc2name_tbl[pending_client_encoding].name,
299 GetDatabaseEncodingName())));
300 }
301
302 /*
303 * Also look up the UTF8-to-server conversion function if needed. Since
304 * the server encoding is fixed within any one backend process, we don't
305 * have to do this more than once.
306 */
307 current_server_encoding = GetDatabaseEncoding();
308 if (current_server_encoding != PG_UTF8 &&
309 current_server_encoding != PG_SQL_ASCII)
310 {
311 Oid utf8_to_server_proc;
312
313 Assert(IsTransactionState());
314 utf8_to_server_proc =
315 FindDefaultConversionProc(PG_UTF8,
316 current_server_encoding);
317 /* If there's no such conversion, just leave the pointer as NULL */
318 if (OidIsValid(utf8_to_server_proc))
319 {
320 FmgrInfo *finfo;
321
322 finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
323 sizeof(FmgrInfo));
324 fmgr_info_cxt(utf8_to_server_proc, finfo,
325 TopMemoryContext);
326 /* Set Utf8ToServerConvProc only after data is fully valid */
327 Utf8ToServerConvProc = finfo;
328 }
329 }
330 }
331
332 /*
333 * returns the current client encoding
334 */
335 int
pg_get_client_encoding(void)336 pg_get_client_encoding(void)
337 {
338 return ClientEncoding->encoding;
339 }
340
341 /*
342 * returns the current client encoding name
343 */
344 const char *
pg_get_client_encoding_name(void)345 pg_get_client_encoding_name(void)
346 {
347 return ClientEncoding->name;
348 }
349
350 /*
351 * Convert src string to another encoding (general case).
352 *
353 * See the notes about string conversion functions at the top of this file.
354 */
355 unsigned char *
pg_do_encoding_conversion(unsigned char * src,int len,int src_encoding,int dest_encoding)356 pg_do_encoding_conversion(unsigned char *src, int len,
357 int src_encoding, int dest_encoding)
358 {
359 unsigned char *result;
360 Oid proc;
361
362 if (len <= 0)
363 return src; /* empty string is always valid */
364
365 if (src_encoding == dest_encoding)
366 return src; /* no conversion required, assume valid */
367
368 if (dest_encoding == PG_SQL_ASCII)
369 return src; /* any string is valid in SQL_ASCII */
370
371 if (src_encoding == PG_SQL_ASCII)
372 {
373 /* No conversion is possible, but we must validate the result */
374 (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
375 return src;
376 }
377
378 if (!IsTransactionState()) /* shouldn't happen */
379 elog(ERROR, "cannot perform encoding conversion outside a transaction");
380
381 proc = FindDefaultConversionProc(src_encoding, dest_encoding);
382 if (!OidIsValid(proc))
383 ereport(ERROR,
384 (errcode(ERRCODE_UNDEFINED_FUNCTION),
385 errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
386 pg_encoding_to_char(src_encoding),
387 pg_encoding_to_char(dest_encoding))));
388
389 /*
390 * Allocate space for conversion result, being wary of integer overflow.
391 *
392 * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
393 * required space, so it might exceed MaxAllocSize even though the result
394 * would actually fit. We do not want to hand back a result string that
395 * exceeds MaxAllocSize, because callers might not cope gracefully --- but
396 * if we just allocate more than that, and don't use it, that's fine.
397 */
398 if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
399 ereport(ERROR,
400 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
401 errmsg("out of memory"),
402 errdetail("String of %d bytes is too long for encoding conversion.",
403 len)));
404
405 result = (unsigned char *)
406 MemoryContextAllocHuge(CurrentMemoryContext,
407 (Size) len * MAX_CONVERSION_GROWTH + 1);
408
409 (void) OidFunctionCall6(proc,
410 Int32GetDatum(src_encoding),
411 Int32GetDatum(dest_encoding),
412 CStringGetDatum(src),
413 CStringGetDatum(result),
414 Int32GetDatum(len),
415 BoolGetDatum(false));
416
417 /*
418 * If the result is large, it's worth repalloc'ing to release any extra
419 * space we asked for. The cutoff here is somewhat arbitrary, but we
420 * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
421 */
422 if (len > 1000000)
423 {
424 Size resultlen = strlen((char *) result);
425
426 if (resultlen >= MaxAllocSize)
427 ereport(ERROR,
428 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
429 errmsg("out of memory"),
430 errdetail("String of %d bytes is too long for encoding conversion.",
431 len)));
432
433 result = (unsigned char *) repalloc(result, resultlen + 1);
434 }
435
436 return result;
437 }
438
439 /*
440 * Convert src string to another encoding.
441 *
442 * This function has a different API than the other conversion functions.
443 * The caller should've looked up the conversion function using
444 * FindDefaultConversionProc(). Unlike the other functions, the converted
445 * result is not palloc'd. It is written to the caller-supplied buffer
446 * instead.
447 *
448 * src_encoding - encoding to convert from
449 * dest_encoding - encoding to convert to
450 * src, srclen - input buffer and its length in bytes
451 * dest, destlen - destination buffer and its size in bytes
452 *
453 * The output is null-terminated.
454 *
455 * If destlen < srclen * MAX_CONVERSION_LENGTH + 1, the converted output
456 * wouldn't necessarily fit in the output buffer, and the function will not
457 * convert the whole input.
458 *
459 * TODO: The conversion function interface is not great. Firstly, it
460 * would be nice to pass through the destination buffer size to the
461 * conversion function, so that if you pass a shorter destination buffer, it
462 * could still continue to fill up the whole buffer. Currently, we have to
463 * assume worst case expansion and stop the conversion short, even if there
464 * is in fact space left in the destination buffer. Secondly, it would be
465 * nice to return the number of bytes written to the caller, to avoid a call
466 * to strlen().
467 */
468 int
pg_do_encoding_conversion_buf(Oid proc,int src_encoding,int dest_encoding,unsigned char * src,int srclen,unsigned char * dest,int destlen,bool noError)469 pg_do_encoding_conversion_buf(Oid proc,
470 int src_encoding,
471 int dest_encoding,
472 unsigned char *src, int srclen,
473 unsigned char *dest, int destlen,
474 bool noError)
475 {
476 Datum result;
477
478 /*
479 * If the destination buffer is not large enough to hold the result in the
480 * worst case, limit the input size passed to the conversion function.
481 */
482 if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
483 srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
484
485 result = OidFunctionCall6(proc,
486 Int32GetDatum(src_encoding),
487 Int32GetDatum(dest_encoding),
488 CStringGetDatum(src),
489 CStringGetDatum(dest),
490 Int32GetDatum(srclen),
491 BoolGetDatum(noError));
492 return DatumGetInt32(result);
493 }
494
495 /*
496 * Convert string to encoding encoding_name. The source
497 * encoding is the DB encoding.
498 *
499 * BYTEA convert_to(TEXT string, NAME encoding_name) */
500 Datum
pg_convert_to(PG_FUNCTION_ARGS)501 pg_convert_to(PG_FUNCTION_ARGS)
502 {
503 Datum string = PG_GETARG_DATUM(0);
504 Datum dest_encoding_name = PG_GETARG_DATUM(1);
505 Datum src_encoding_name = DirectFunctionCall1(namein,
506 CStringGetDatum(DatabaseEncoding->name));
507 Datum result;
508
509 /*
510 * pg_convert expects a bytea as its first argument. We're passing it a
511 * text argument here, relying on the fact that they are both in fact
512 * varlena types, and thus structurally identical.
513 */
514 result = DirectFunctionCall3(pg_convert, string,
515 src_encoding_name, dest_encoding_name);
516
517 PG_RETURN_DATUM(result);
518 }
519
520 /*
521 * Convert string from encoding encoding_name. The destination
522 * encoding is the DB encoding.
523 *
524 * TEXT convert_from(BYTEA string, NAME encoding_name) */
525 Datum
pg_convert_from(PG_FUNCTION_ARGS)526 pg_convert_from(PG_FUNCTION_ARGS)
527 {
528 Datum string = PG_GETARG_DATUM(0);
529 Datum src_encoding_name = PG_GETARG_DATUM(1);
530 Datum dest_encoding_name = DirectFunctionCall1(namein,
531 CStringGetDatum(DatabaseEncoding->name));
532 Datum result;
533
534 result = DirectFunctionCall3(pg_convert, string,
535 src_encoding_name, dest_encoding_name);
536
537 /*
538 * pg_convert returns a bytea, which we in turn return as text, relying on
539 * the fact that they are both in fact varlena types, and thus
540 * structurally identical. Although not all bytea values are valid text,
541 * in this case it will be because we've told pg_convert to return one
542 * that is valid as text in the current database encoding.
543 */
544 PG_RETURN_DATUM(result);
545 }
546
547 /*
548 * Convert string between two arbitrary encodings.
549 *
550 * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
551 */
552 Datum
pg_convert(PG_FUNCTION_ARGS)553 pg_convert(PG_FUNCTION_ARGS)
554 {
555 bytea *string = PG_GETARG_BYTEA_PP(0);
556 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
557 int src_encoding = pg_char_to_encoding(src_encoding_name);
558 char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
559 int dest_encoding = pg_char_to_encoding(dest_encoding_name);
560 const char *src_str;
561 char *dest_str;
562 bytea *retval;
563 int len;
564
565 if (src_encoding < 0)
566 ereport(ERROR,
567 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
568 errmsg("invalid source encoding name \"%s\"",
569 src_encoding_name)));
570 if (dest_encoding < 0)
571 ereport(ERROR,
572 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
573 errmsg("invalid destination encoding name \"%s\"",
574 dest_encoding_name)));
575
576 /* make sure that source string is valid */
577 len = VARSIZE_ANY_EXHDR(string);
578 src_str = VARDATA_ANY(string);
579 (void) pg_verify_mbstr(src_encoding, src_str, len, false);
580
581 /* perform conversion */
582 dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
583 len,
584 src_encoding,
585 dest_encoding);
586
587 /* update len if conversion actually happened */
588 if (dest_str != src_str)
589 len = strlen(dest_str);
590
591 /*
592 * build bytea data type structure.
593 */
594 retval = (bytea *) palloc(len + VARHDRSZ);
595 SET_VARSIZE(retval, len + VARHDRSZ);
596 memcpy(VARDATA(retval), dest_str, len);
597
598 if (dest_str != src_str)
599 pfree(dest_str);
600
601 /* free memory if allocated by the toaster */
602 PG_FREE_IF_COPY(string, 0);
603
604 PG_RETURN_BYTEA_P(retval);
605 }
606
607 /*
608 * get the length of the string considered as text in the specified
609 * encoding. Raises an error if the data is not valid in that
610 * encoding.
611 *
612 * INT4 length (BYTEA string, NAME src_encoding_name)
613 */
614 Datum
length_in_encoding(PG_FUNCTION_ARGS)615 length_in_encoding(PG_FUNCTION_ARGS)
616 {
617 bytea *string = PG_GETARG_BYTEA_PP(0);
618 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
619 int src_encoding = pg_char_to_encoding(src_encoding_name);
620 const char *src_str;
621 int len;
622 int retval;
623
624 if (src_encoding < 0)
625 ereport(ERROR,
626 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
627 errmsg("invalid encoding name \"%s\"",
628 src_encoding_name)));
629
630 len = VARSIZE_ANY_EXHDR(string);
631 src_str = VARDATA_ANY(string);
632
633 retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
634
635 PG_RETURN_INT32(retval);
636 }
637
638 /*
639 * Get maximum multibyte character length in the specified encoding.
640 *
641 * Note encoding is specified numerically, not by name as above.
642 */
643 Datum
pg_encoding_max_length_sql(PG_FUNCTION_ARGS)644 pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
645 {
646 int encoding = PG_GETARG_INT32(0);
647
648 if (PG_VALID_ENCODING(encoding))
649 PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
650 else
651 PG_RETURN_NULL();
652 }
653
654 /*
655 * Convert client encoding to server encoding.
656 *
657 * See the notes about string conversion functions at the top of this file.
658 */
659 char *
pg_client_to_server(const char * s,int len)660 pg_client_to_server(const char *s, int len)
661 {
662 return pg_any_to_server(s, len, ClientEncoding->encoding);
663 }
664
665 /*
666 * Convert any encoding to server encoding.
667 *
668 * See the notes about string conversion functions at the top of this file.
669 *
670 * Unlike the other string conversion functions, this will apply validation
671 * even if encoding == DatabaseEncoding->encoding. This is because this is
672 * used to process data coming in from outside the database, and we never
673 * want to just assume validity.
674 */
675 char *
pg_any_to_server(const char * s,int len,int encoding)676 pg_any_to_server(const char *s, int len, int encoding)
677 {
678 if (len <= 0)
679 return unconstify(char *, s); /* empty string is always valid */
680
681 if (encoding == DatabaseEncoding->encoding ||
682 encoding == PG_SQL_ASCII)
683 {
684 /*
685 * No conversion is needed, but we must still validate the data.
686 */
687 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
688 return unconstify(char *, s);
689 }
690
691 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
692 {
693 /*
694 * No conversion is possible, but we must still validate the data,
695 * because the client-side code might have done string escaping using
696 * the selected client_encoding. If the client encoding is ASCII-safe
697 * then we just do a straight validation under that encoding. For an
698 * ASCII-unsafe encoding we have a problem: we dare not pass such data
699 * to the parser but we have no way to convert it. We compromise by
700 * rejecting the data if it contains any non-ASCII characters.
701 */
702 if (PG_VALID_BE_ENCODING(encoding))
703 (void) pg_verify_mbstr(encoding, s, len, false);
704 else
705 {
706 int i;
707
708 for (i = 0; i < len; i++)
709 {
710 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
711 ereport(ERROR,
712 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
713 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
714 pg_enc2name_tbl[PG_SQL_ASCII].name,
715 (unsigned char) s[i])));
716 }
717 }
718 return unconstify(char *, s);
719 }
720
721 /* Fast path if we can use cached conversion function */
722 if (encoding == ClientEncoding->encoding)
723 return perform_default_encoding_conversion(s, len, true);
724
725 /* General case ... will not work outside transactions */
726 return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
727 len,
728 encoding,
729 DatabaseEncoding->encoding);
730 }
731
732 /*
733 * Convert server encoding to client encoding.
734 *
735 * See the notes about string conversion functions at the top of this file.
736 */
737 char *
pg_server_to_client(const char * s,int len)738 pg_server_to_client(const char *s, int len)
739 {
740 return pg_server_to_any(s, len, ClientEncoding->encoding);
741 }
742
743 /*
744 * Convert server encoding to any encoding.
745 *
746 * See the notes about string conversion functions at the top of this file.
747 */
748 char *
pg_server_to_any(const char * s,int len,int encoding)749 pg_server_to_any(const char *s, int len, int encoding)
750 {
751 if (len <= 0)
752 return unconstify(char *, s); /* empty string is always valid */
753
754 if (encoding == DatabaseEncoding->encoding ||
755 encoding == PG_SQL_ASCII)
756 return unconstify(char *, s); /* assume data is valid */
757
758 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
759 {
760 /* No conversion is possible, but we must validate the result */
761 (void) pg_verify_mbstr(encoding, s, len, false);
762 return unconstify(char *, s);
763 }
764
765 /* Fast path if we can use cached conversion function */
766 if (encoding == ClientEncoding->encoding)
767 return perform_default_encoding_conversion(s, len, false);
768
769 /* General case ... will not work outside transactions */
770 return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
771 len,
772 DatabaseEncoding->encoding,
773 encoding);
774 }
775
776 /*
777 * Perform default encoding conversion using cached FmgrInfo. Since
778 * this function does not access database at all, it is safe to call
779 * outside transactions. If the conversion has not been set up by
780 * SetClientEncoding(), no conversion is performed.
781 */
782 static char *
perform_default_encoding_conversion(const char * src,int len,bool is_client_to_server)783 perform_default_encoding_conversion(const char *src, int len,
784 bool is_client_to_server)
785 {
786 char *result;
787 int src_encoding,
788 dest_encoding;
789 FmgrInfo *flinfo;
790
791 if (is_client_to_server)
792 {
793 src_encoding = ClientEncoding->encoding;
794 dest_encoding = DatabaseEncoding->encoding;
795 flinfo = ToServerConvProc;
796 }
797 else
798 {
799 src_encoding = DatabaseEncoding->encoding;
800 dest_encoding = ClientEncoding->encoding;
801 flinfo = ToClientConvProc;
802 }
803
804 if (flinfo == NULL)
805 return unconstify(char *, src);
806
807 /*
808 * Allocate space for conversion result, being wary of integer overflow.
809 * See comments in pg_do_encoding_conversion.
810 */
811 if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
812 ereport(ERROR,
813 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
814 errmsg("out of memory"),
815 errdetail("String of %d bytes is too long for encoding conversion.",
816 len)));
817
818 result = (char *)
819 MemoryContextAllocHuge(CurrentMemoryContext,
820 (Size) len * MAX_CONVERSION_GROWTH + 1);
821
822 FunctionCall6(flinfo,
823 Int32GetDatum(src_encoding),
824 Int32GetDatum(dest_encoding),
825 CStringGetDatum(src),
826 CStringGetDatum(result),
827 Int32GetDatum(len),
828 BoolGetDatum(false));
829
830 /*
831 * Release extra space if there might be a lot --- see comments in
832 * pg_do_encoding_conversion.
833 */
834 if (len > 1000000)
835 {
836 Size resultlen = strlen(result);
837
838 if (resultlen >= MaxAllocSize)
839 ereport(ERROR,
840 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
841 errmsg("out of memory"),
842 errdetail("String of %d bytes is too long for encoding conversion.",
843 len)));
844
845 result = (char *) repalloc(result, resultlen + 1);
846 }
847
848 return result;
849 }
850
851 /*
852 * Convert a single Unicode code point into a string in the server encoding.
853 *
854 * The code point given by "c" is converted and stored at *s, which must
855 * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
856 * The output will have a trailing '\0'. Throws error if the conversion
857 * cannot be performed.
858 *
859 * Note that this relies on having previously looked up any required
860 * conversion function. That's partly for speed but mostly because the parser
861 * may call this outside any transaction, or in an aborted transaction.
862 */
863 void
pg_unicode_to_server(pg_wchar c,unsigned char * s)864 pg_unicode_to_server(pg_wchar c, unsigned char *s)
865 {
866 unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
867 int c_as_utf8_len;
868 int server_encoding;
869
870 /*
871 * Complain if invalid Unicode code point. The choice of errcode here is
872 * debatable, but really our caller should have checked this anyway.
873 */
874 if (!is_valid_unicode_codepoint(c))
875 ereport(ERROR,
876 (errcode(ERRCODE_SYNTAX_ERROR),
877 errmsg("invalid Unicode code point")));
878
879 /* Otherwise, if it's in ASCII range, conversion is trivial */
880 if (c <= 0x7F)
881 {
882 s[0] = (unsigned char) c;
883 s[1] = '\0';
884 return;
885 }
886
887 /* If the server encoding is UTF-8, we just need to reformat the code */
888 server_encoding = GetDatabaseEncoding();
889 if (server_encoding == PG_UTF8)
890 {
891 unicode_to_utf8(c, s);
892 s[pg_utf_mblen(s)] = '\0';
893 return;
894 }
895
896 /* For all other cases, we must have a conversion function available */
897 if (Utf8ToServerConvProc == NULL)
898 ereport(ERROR,
899 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
900 errmsg("conversion between %s and %s is not supported",
901 pg_enc2name_tbl[PG_UTF8].name,
902 GetDatabaseEncodingName())));
903
904 /* Construct UTF-8 source string */
905 unicode_to_utf8(c, c_as_utf8);
906 c_as_utf8_len = pg_utf_mblen(c_as_utf8);
907 c_as_utf8[c_as_utf8_len] = '\0';
908
909 /* Convert, or throw error if we can't */
910 FunctionCall6(Utf8ToServerConvProc,
911 Int32GetDatum(PG_UTF8),
912 Int32GetDatum(server_encoding),
913 CStringGetDatum(c_as_utf8),
914 CStringGetDatum(s),
915 Int32GetDatum(c_as_utf8_len),
916 BoolGetDatum(false));
917 }
918
919
920 /* convert a multibyte string to a wchar */
921 int
pg_mb2wchar(const char * from,pg_wchar * to)922 pg_mb2wchar(const char *from, pg_wchar *to)
923 {
924 return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
925 }
926
927 /* convert a multibyte string to a wchar with a limited length */
928 int
pg_mb2wchar_with_len(const char * from,pg_wchar * to,int len)929 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
930 {
931 return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
932 }
933
934 /* same, with any encoding */
935 int
pg_encoding_mb2wchar_with_len(int encoding,const char * from,pg_wchar * to,int len)936 pg_encoding_mb2wchar_with_len(int encoding,
937 const char *from, pg_wchar *to, int len)
938 {
939 return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
940 }
941
942 /* convert a wchar string to a multibyte */
943 int
pg_wchar2mb(const pg_wchar * from,char * to)944 pg_wchar2mb(const pg_wchar *from, char *to)
945 {
946 return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
947 }
948
949 /* convert a wchar string to a multibyte with a limited length */
950 int
pg_wchar2mb_with_len(const pg_wchar * from,char * to,int len)951 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
952 {
953 return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
954 }
955
956 /* same, with any encoding */
957 int
pg_encoding_wchar2mb_with_len(int encoding,const pg_wchar * from,char * to,int len)958 pg_encoding_wchar2mb_with_len(int encoding,
959 const pg_wchar *from, char *to, int len)
960 {
961 return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
962 }
963
964 /* returns the byte length of a multibyte character */
965 int
pg_mblen(const char * mbstr)966 pg_mblen(const char *mbstr)
967 {
968 return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
969 }
970
971 /* returns the display length of a multibyte character */
972 int
pg_dsplen(const char * mbstr)973 pg_dsplen(const char *mbstr)
974 {
975 return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
976 }
977
/*
 * Return the length, counted in characters rather than bytes, of a
 * NUL-terminated multibyte string in the database encoding.
 *
 * pg_mblen() is given only a pointer, so it cannot know where the string
 * ends.  If the string ends in a truncated multibyte character, blindly
 * adding the reported length would carry the scan past the terminating NUL
 * and into whatever memory follows.  To avoid that, each character is
 * consumed one byte at a time and the scan stops at the terminator.  A
 * truncated trailing character is still counted as one character.
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			len = 0;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	while (*mbstr)
	{
		int			l = pg_mblen(mbstr);

		/* consume this character, but never step over the terminator */
		do
		{
			mbstr++;
		} while (--l > 0 && *mbstr);

		len++;
	}
	return len;
}
995
/*
 * Return the length, counted in characters, of a multibyte string that is
 * not necessarily NUL-terminated.
 *
 * "limit" is the number of bytes available.  For a single-byte database
 * encoding, limit itself is returned without looking at the data.
 * Otherwise characters are counted until limit is used up or a NUL byte
 * is found.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			nchars;

	/* single-byte encodings: one character per byte */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	for (nchars = 0; limit > 0 && *mbstr != '\0'; nchars++)
	{
		int			chlen = pg_mblen(mbstr);

		mbstr += chlen;
		limit -= chlen;
	}

	return nchars;
}
1018
1019 /*
1020 * returns the byte length of a multibyte string
1021 * (not necessarily NULL terminated)
1022 * that is no longer than limit.
1023 * this function does not break multibyte character boundary.
1024 */
1025 int
pg_mbcliplen(const char * mbstr,int len,int limit)1026 pg_mbcliplen(const char *mbstr, int len, int limit)
1027 {
1028 return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
1029 len, limit);
1030 }
1031
1032 /*
1033 * pg_mbcliplen with specified encoding
1034 */
1035 int
pg_encoding_mbcliplen(int encoding,const char * mbstr,int len,int limit)1036 pg_encoding_mbcliplen(int encoding, const char *mbstr,
1037 int len, int limit)
1038 {
1039 mblen_converter mblen_fn;
1040 int clen = 0;
1041 int l;
1042
1043 /* optimization for single byte encoding */
1044 if (pg_encoding_max_length(encoding) == 1)
1045 return cliplen(mbstr, len, limit);
1046
1047 mblen_fn = pg_wchar_table[encoding].mblen;
1048
1049 while (len > 0 && *mbstr)
1050 {
1051 l = (*mblen_fn) ((const unsigned char *) mbstr);
1052 if ((clen + l) > limit)
1053 break;
1054 clen += l;
1055 if (clen == limit)
1056 break;
1057 len -= l;
1058 mbstr += l;
1059 }
1060 return clen;
1061 }
1062
/*
 * Similar to pg_mbcliplen except the limit parameter specifies the
 * character length, not the byte length.
 *
 * Returns the number of bytes occupied by the first "limit" characters of
 * mbstr (database encoding), or by all of them if there are fewer.  At most
 * "len" bytes are considered, and scanning also stops at a NUL byte.
 *
 * pg_mblen() sees only a pointer, so for a truncated final character it
 * can report more bytes than remain.  Such a character is left out, so
 * that the result never exceeds "len".
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			clen = 0;
	int			nch = 0;
	int			l;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr)
	{
		l = pg_mblen(mbstr);

		/* truncated trailing character: do not claim bytes we don't have */
		if (l > len)
			break;

		nch++;
		if (nch > limit)
			break;
		clen += l;
		len -= l;
		mbstr += l;
	}
	return clen;
}
1090
/*
 * mbcliplen for any single-byte encoding: count bytes of str up to the
 * smaller of len and limit, stopping early at a NUL byte.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			bound = Min(len, limit);
	int			n;

	for (n = 0; n < bound; n++)
	{
		if (str[n] == '\0')
			break;
	}

	return n;
}
1102
1103 void
SetDatabaseEncoding(int encoding)1104 SetDatabaseEncoding(int encoding)
1105 {
1106 if (!PG_VALID_BE_ENCODING(encoding))
1107 elog(ERROR, "invalid database encoding: %d", encoding);
1108
1109 DatabaseEncoding = &pg_enc2name_tbl[encoding];
1110 Assert(DatabaseEncoding->encoding == encoding);
1111 }
1112
1113 void
SetMessageEncoding(int encoding)1114 SetMessageEncoding(int encoding)
1115 {
1116 /* Some calls happen before we can elog()! */
1117 Assert(PG_VALID_ENCODING(encoding));
1118
1119 MessageEncoding = &pg_enc2name_tbl[encoding];
1120 Assert(MessageEncoding->encoding == encoding);
1121 }
1122
1123 #ifdef ENABLE_NLS
1124 /*
1125 * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
1126 * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
1127 * fail for gettext-internal causes like out-of-memory.
1128 */
1129 static bool
raw_pg_bind_textdomain_codeset(const char * domainname,int encoding)1130 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
1131 {
1132 bool elog_ok = (CurrentMemoryContext != NULL);
1133 int i;
1134
1135 for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
1136 {
1137 if (pg_enc2gettext_tbl[i].encoding == encoding)
1138 {
1139 if (bind_textdomain_codeset(domainname,
1140 pg_enc2gettext_tbl[i].name) != NULL)
1141 return true;
1142
1143 if (elog_ok)
1144 elog(LOG, "bind_textdomain_codeset failed");
1145 else
1146 write_stderr("bind_textdomain_codeset failed");
1147
1148 break;
1149 }
1150 }
1151
1152 return false;
1153 }
1154
1155 /*
1156 * Bind a gettext message domain to the codeset corresponding to the database
1157 * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1158 * Return the MessageEncoding implied by the new settings.
1159 *
1160 * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1161 * When that matches the database encoding, we don't need to do anything. In
1162 * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1163 * database encoding, except for the C locale. (On Windows, we also permit a
1164 * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
1165 * gettext to the right codeset.
1166 *
1167 * On Windows, gettext defaults to the Windows ANSI code page. This is a
1168 * convenient departure for software that passes the strings to Windows ANSI
1169 * APIs, but we don't do that. Compel gettext to use database encoding or,
1170 * failing that, the LC_CTYPE encoding as it would on other platforms.
1171 *
1172 * This function is called before elog() and palloc() are usable.
1173 */
1174 int
pg_bind_textdomain_codeset(const char * domainname)1175 pg_bind_textdomain_codeset(const char *domainname)
1176 {
1177 bool elog_ok = (CurrentMemoryContext != NULL);
1178 int encoding = GetDatabaseEncoding();
1179 int new_msgenc;
1180
1181 #ifndef WIN32
1182 const char *ctype = setlocale(LC_CTYPE, NULL);
1183
1184 if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1185 #endif
1186 if (encoding != PG_SQL_ASCII &&
1187 raw_pg_bind_textdomain_codeset(domainname, encoding))
1188 return encoding;
1189
1190 new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
1191 if (new_msgenc < 0)
1192 new_msgenc = PG_SQL_ASCII;
1193
1194 #ifdef WIN32
1195 if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1196 /* On failure, the old message encoding remains valid. */
1197 return GetMessageEncoding();
1198 #endif
1199
1200 return new_msgenc;
1201 }
1202 #endif
1203
1204 /*
1205 * The database encoding, also called the server encoding, represents the
1206 * encoding of data stored in text-like data types. Affected types include
1207 * cstring, text, varchar, name, xml, and json.
1208 */
1209 int
GetDatabaseEncoding(void)1210 GetDatabaseEncoding(void)
1211 {
1212 return DatabaseEncoding->encoding;
1213 }
1214
1215 const char *
GetDatabaseEncodingName(void)1216 GetDatabaseEncodingName(void)
1217 {
1218 return DatabaseEncoding->name;
1219 }
1220
1221 Datum
getdatabaseencoding(PG_FUNCTION_ARGS)1222 getdatabaseencoding(PG_FUNCTION_ARGS)
1223 {
1224 return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1225 }
1226
1227 Datum
pg_client_encoding(PG_FUNCTION_ARGS)1228 pg_client_encoding(PG_FUNCTION_ARGS)
1229 {
1230 return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1231 }
1232
1233 Datum
PG_char_to_encoding(PG_FUNCTION_ARGS)1234 PG_char_to_encoding(PG_FUNCTION_ARGS)
1235 {
1236 Name s = PG_GETARG_NAME(0);
1237
1238 PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
1239 }
1240
1241 Datum
PG_encoding_to_char(PG_FUNCTION_ARGS)1242 PG_encoding_to_char(PG_FUNCTION_ARGS)
1243 {
1244 int32 encoding = PG_GETARG_INT32(0);
1245 const char *encoding_name = pg_encoding_to_char(encoding);
1246
1247 return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
1248 }
1249
1250 /*
1251 * gettext() returns messages in this encoding. This often matches the
1252 * database encoding, but it differs for SQL_ASCII databases, for processes
1253 * not attached to a database, and under a database encoding lacking iconv
1254 * support (MULE_INTERNAL).
1255 */
1256 int
GetMessageEncoding(void)1257 GetMessageEncoding(void)
1258 {
1259 return MessageEncoding->encoding;
1260 }
1261
1262
1263 /*
1264 * Generic character incrementer function.
1265 *
1266 * Not knowing anything about the properties of the encoding in use, we just
1267 * keep incrementing the last byte until we get a validly-encoded result,
1268 * or we run out of values to try. We don't bother to try incrementing
1269 * higher-order bytes, so there's no growth in runtime for wider characters.
1270 * (If we did try to do that, we'd need to consider the likelihood that 255
1271 * is not a valid final byte in the encoding.)
1272 */
1273 static bool
pg_generic_charinc(unsigned char * charptr,int len)1274 pg_generic_charinc(unsigned char *charptr, int len)
1275 {
1276 unsigned char *lastbyte = charptr + len - 1;
1277 mbchar_verifier mbverify;
1278
1279 /* We can just invoke the character verifier directly. */
1280 mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
1281
1282 while (*lastbyte < (unsigned char) 255)
1283 {
1284 (*lastbyte)++;
1285 if ((*mbverify) (charptr, len) == len)
1286 return true;
1287 }
1288
1289 return false;
1290 }
1291
/*
 * UTF-8 character incrementer function.
 *
 * charptr points at one character of "length" bytes, which is modified in
 * place.  Returns true if a byte was incremented, false if not (in which
 * case the character is left untouched).
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  Working from
 * the last byte toward the first, we increment the first byte found that is
 * not already at its maximum value.  If there is no such byte, we simply
 * fail.  The second byte has a tighter maximum after lead bytes 0xED (to
 * skip the region used for surrogate pair handling, which should not occur
 * in valid UTF-8) and 0xF4.
 *
 * Note that lower-order bytes are not reset to their minimums, since we
 * can't afford to make an exhaustive search (see make_greater_string).
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	int			pos;

	/* only lengths 1..4 are handled; lengths 5 and 6 are rejected for now */
	if (length < 1 || length > 4)
		return false;

	/* fourth and third bytes, where present: plain continuation bytes */
	for (pos = length - 1; pos >= 2; pos--)
	{
		if (charptr[pos] < 0xBF)
		{
			charptr[pos]++;
			return true;
		}
	}

	/* second byte, where present: ceiling depends on the lead byte */
	if (length >= 2)
	{
		unsigned char ceiling;

		if (charptr[0] == 0xED)
			ceiling = 0x9F;
		else if (charptr[0] == 0xF4)
			ceiling = 0x8F;
		else
			ceiling = 0xBF;

		if (charptr[1] < ceiling)
		{
			charptr[1]++;
			return true;
		}
	}

	/* lead byte: refuse to leave the range for this character length */
	switch (charptr[0])
	{
		case 0x7F:
		case 0xDF:
		case 0xEF:
		case 0xF4:
			return false;
		default:
			break;
	}

	charptr[0]++;
	return true;
}
1364
1365 /*
1366 * EUC-JP character incrementer function.
1367 *
1368 * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1369 * representing JIS X 0201 characters with the second byte ranging between
1370 * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1371 * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1372 *
1373 * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1374 * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1375 * is incremented if possible, otherwise the second-to-last byte.
1376 *
1377 * If the sequence starts with a value other than the above and its MSB
1378 * is set, it must be a two-byte sequence representing JIS X 0208 characters
1379 * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1380 * incremented if possible, otherwise the second-to-last byte.
1381 *
1382 * Otherwise, the sequence is a single-byte ASCII character. It is
1383 * incremented up to 0x7f.
1384 */
1385 static bool
pg_eucjp_increment(unsigned char * charptr,int length)1386 pg_eucjp_increment(unsigned char *charptr, int length)
1387 {
1388 unsigned char c1,
1389 c2;
1390 int i;
1391
1392 c1 = *charptr;
1393
1394 switch (c1)
1395 {
1396 case SS2: /* JIS X 0201 */
1397 if (length != 2)
1398 return false;
1399
1400 c2 = charptr[1];
1401
1402 if (c2 >= 0xdf)
1403 charptr[0] = charptr[1] = 0xa1;
1404 else if (c2 < 0xa1)
1405 charptr[1] = 0xa1;
1406 else
1407 charptr[1]++;
1408 break;
1409
1410 case SS3: /* JIS X 0212 */
1411 if (length != 3)
1412 return false;
1413
1414 for (i = 2; i > 0; i--)
1415 {
1416 c2 = charptr[i];
1417 if (c2 < 0xa1)
1418 {
1419 charptr[i] = 0xa1;
1420 return true;
1421 }
1422 else if (c2 < 0xfe)
1423 {
1424 charptr[i]++;
1425 return true;
1426 }
1427 }
1428
1429 /* Out of 3-byte code region */
1430 return false;
1431
1432 default:
1433 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1434 {
1435 if (length != 2)
1436 return false;
1437
1438 for (i = 1; i >= 0; i--)
1439 {
1440 c2 = charptr[i];
1441 if (c2 < 0xa1)
1442 {
1443 charptr[i] = 0xa1;
1444 return true;
1445 }
1446 else if (c2 < 0xfe)
1447 {
1448 charptr[i]++;
1449 return true;
1450 }
1451 }
1452
1453 /* Out of 2 byte code region */
1454 return false;
1455 }
1456 else
1457 { /* ASCII, single byte */
1458 if (c1 > 0x7e)
1459 return false;
1460 (*charptr)++;
1461 }
1462 break;
1463 }
1464
1465 return true;
1466 }
1467
1468 /*
1469 * get the character incrementer for the encoding for the current database
1470 */
1471 mbcharacter_incrementer
pg_database_encoding_character_incrementer(void)1472 pg_database_encoding_character_incrementer(void)
1473 {
1474 /*
1475 * Eventually it might be best to add a field to pg_wchar_table[], but for
1476 * now we just use a switch.
1477 */
1478 switch (GetDatabaseEncoding())
1479 {
1480 case PG_UTF8:
1481 return pg_utf8_increment;
1482
1483 case PG_EUC_JP:
1484 return pg_eucjp_increment;
1485
1486 default:
1487 return pg_generic_charinc;
1488 }
1489 }
1490
1491 /*
1492 * fetch maximum length of the encoding for the current database
1493 */
1494 int
pg_database_encoding_max_length(void)1495 pg_database_encoding_max_length(void)
1496 {
1497 return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1498 }
1499
/*
 * Verify that mbstr is validly encoded in the current database encoding.
 * This simply calls pg_verify_mbstr() with the database encoding, so the
 * arguments and result are the same as there.
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			enc = GetDatabaseEncoding();

	return pg_verify_mbstr(enc, mbstr, len, noError);
}
1509
1510 /*
1511 * Verify mbstr to make sure that it is validly encoded in the specified
1512 * encoding.
1513 */
1514 bool
pg_verify_mbstr(int encoding,const char * mbstr,int len,bool noError)1515 pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
1516 {
1517 int oklen;
1518
1519 Assert(PG_VALID_ENCODING(encoding));
1520
1521 oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
1522 if (oklen != len)
1523 {
1524 if (noError)
1525 return false;
1526 report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
1527 }
1528 return true;
1529 }
1530
1531 /*
1532 * Verify mbstr to make sure that it is validly encoded in the specified
1533 * encoding.
1534 *
1535 * mbstr is not necessarily zero terminated; length of mbstr is
1536 * specified by len.
1537 *
1538 * If OK, return length of string in the encoding.
1539 * If a problem is found, return -1 when noError is
1540 * true; when noError is false, ereport() a descriptive message.
1541 *
1542 * Note: We cannot use the faster encoding-specific mbverifystr() function
1543 * here, because we need to count the number of characters in the string.
1544 */
1545 int
pg_verify_mbstr_len(int encoding,const char * mbstr,int len,bool noError)1546 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1547 {
1548 mbchar_verifier mbverifychar;
1549 int mb_len;
1550
1551 Assert(PG_VALID_ENCODING(encoding));
1552
1553 /*
1554 * In single-byte encodings, we need only reject nulls (\0).
1555 */
1556 if (pg_encoding_max_length(encoding) <= 1)
1557 {
1558 const char *nullpos = memchr(mbstr, 0, len);
1559
1560 if (nullpos == NULL)
1561 return len;
1562 if (noError)
1563 return -1;
1564 report_invalid_encoding(encoding, nullpos, 1);
1565 }
1566
1567 /* fetch function pointer just once */
1568 mbverifychar = pg_wchar_table[encoding].mbverifychar;
1569
1570 mb_len = 0;
1571
1572 while (len > 0)
1573 {
1574 int l;
1575
1576 /* fast path for ASCII-subset characters */
1577 if (!IS_HIGHBIT_SET(*mbstr))
1578 {
1579 if (*mbstr != '\0')
1580 {
1581 mb_len++;
1582 mbstr++;
1583 len--;
1584 continue;
1585 }
1586 if (noError)
1587 return -1;
1588 report_invalid_encoding(encoding, mbstr, len);
1589 }
1590
1591 l = (*mbverifychar) ((const unsigned char *) mbstr, len);
1592
1593 if (l < 0)
1594 {
1595 if (noError)
1596 return -1;
1597 report_invalid_encoding(encoding, mbstr, len);
1598 }
1599
1600 mbstr += l;
1601 len -= l;
1602 mb_len++;
1603 }
1604 return mb_len;
1605 }
1606
1607 /*
1608 * check_encoding_conversion_args: check arguments of a conversion function
1609 *
1610 * "expected" arguments can be either an encoding ID or -1 to indicate that
1611 * the caller will check whether it accepts the ID.
1612 *
1613 * Note: the errors here are not really user-facing, so elog instead of
1614 * ereport seems sufficient. Also, we trust that the "expected" encoding
1615 * arguments are valid encoding IDs, but we don't trust the actuals.
1616 */
1617 void
check_encoding_conversion_args(int src_encoding,int dest_encoding,int len,int expected_src_encoding,int expected_dest_encoding)1618 check_encoding_conversion_args(int src_encoding,
1619 int dest_encoding,
1620 int len,
1621 int expected_src_encoding,
1622 int expected_dest_encoding)
1623 {
1624 if (!PG_VALID_ENCODING(src_encoding))
1625 elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1626 if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1627 elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1628 pg_enc2name_tbl[expected_src_encoding].name,
1629 pg_enc2name_tbl[src_encoding].name);
1630 if (!PG_VALID_ENCODING(dest_encoding))
1631 elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1632 if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1633 elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1634 pg_enc2name_tbl[expected_dest_encoding].name,
1635 pg_enc2name_tbl[dest_encoding].name);
1636 if (len < 0)
1637 elog(ERROR, "encoding conversion length must not be negative");
1638 }
1639
1640 /*
1641 * report_invalid_encoding: complain about invalid multibyte character
1642 *
1643 * note: len is remaining length of string, not length of character;
1644 * len must be greater than zero, as we always examine the first byte.
1645 */
1646 void
report_invalid_encoding(int encoding,const char * mbstr,int len)1647 report_invalid_encoding(int encoding, const char *mbstr, int len)
1648 {
1649 int l = pg_encoding_mblen(encoding, mbstr);
1650 char buf[8 * 5 + 1];
1651 char *p = buf;
1652 int j,
1653 jlimit;
1654
1655 jlimit = Min(l, len);
1656 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1657
1658 for (j = 0; j < jlimit; j++)
1659 {
1660 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1661 if (j < jlimit - 1)
1662 p += sprintf(p, " ");
1663 }
1664
1665 ereport(ERROR,
1666 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1667 errmsg("invalid byte sequence for encoding \"%s\": %s",
1668 pg_enc2name_tbl[encoding].name,
1669 buf)));
1670 }
1671
1672 /*
1673 * report_untranslatable_char: complain about untranslatable character
1674 *
1675 * note: len is remaining length of string, not length of character;
1676 * len must be greater than zero, as we always examine the first byte.
1677 */
1678 void
report_untranslatable_char(int src_encoding,int dest_encoding,const char * mbstr,int len)1679 report_untranslatable_char(int src_encoding, int dest_encoding,
1680 const char *mbstr, int len)
1681 {
1682 int l = pg_encoding_mblen(src_encoding, mbstr);
1683 char buf[8 * 5 + 1];
1684 char *p = buf;
1685 int j,
1686 jlimit;
1687
1688 jlimit = Min(l, len);
1689 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1690
1691 for (j = 0; j < jlimit; j++)
1692 {
1693 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1694 if (j < jlimit - 1)
1695 p += sprintf(p, " ");
1696 }
1697
1698 ereport(ERROR,
1699 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
1700 errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
1701 buf,
1702 pg_enc2name_tbl[src_encoding].name,
1703 pg_enc2name_tbl[dest_encoding].name)));
1704 }
1705
1706
1707 #ifdef WIN32
1708 /*
1709 * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1710 * string. The character length is also passed to utf16len if not
1711 * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1712 * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1713 */
1714 WCHAR *
pgwin32_message_to_UTF16(const char * str,int len,int * utf16len)1715 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1716 {
1717 int msgenc = GetMessageEncoding();
1718 WCHAR *utf16;
1719 int dstlen;
1720 UINT codepage;
1721
1722 if (msgenc == PG_SQL_ASCII)
1723 /* No conversion is possible, and SQL_ASCII is never utf16. */
1724 return NULL;
1725
1726 codepage = pg_enc2name_tbl[msgenc].codepage;
1727
1728 /*
1729 * Use MultiByteToWideChar directly if there is a corresponding codepage,
1730 * or double conversion through UTF8 if not. Double conversion is needed,
1731 * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1732 */
1733 if (codepage != 0)
1734 {
1735 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1736 dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1737 utf16[dstlen] = (WCHAR) 0;
1738 }
1739 else
1740 {
1741 char *utf8;
1742
1743 /*
1744 * XXX pg_do_encoding_conversion() requires a transaction. In the
1745 * absence of one, hope for the input to be valid UTF8.
1746 */
1747 if (IsTransactionState())
1748 {
1749 utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1750 len,
1751 msgenc,
1752 PG_UTF8);
1753 if (utf8 != str)
1754 len = strlen(utf8);
1755 }
1756 else
1757 utf8 = (char *) str;
1758
1759 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1760 dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1761 utf16[dstlen] = (WCHAR) 0;
1762
1763 if (utf8 != str)
1764 pfree(utf8);
1765 }
1766
1767 if (dstlen == 0 && len > 0)
1768 {
1769 pfree(utf16);
1770 return NULL; /* error */
1771 }
1772
1773 if (utf16len)
1774 *utf16len = dstlen;
1775 return utf16;
1776 }
1777
1778 #endif /* WIN32 */
1779