1 /*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities
4 *
5 * Portions Copyright (c) 2002-2020, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12 /*----------
13 * Here is how the locale stuff is handled: LC_COLLATE and LC_CTYPE
14 * are fixed at CREATE DATABASE time, stored in pg_database, and cannot
15 * be changed. Thus, the effects of strcoll(), strxfrm(), isupper(),
16 * toupper(), etc. are always in the same fixed locale.
17 *
18 * LC_MESSAGES is settable at run time and will take effect
19 * immediately.
20 *
21 * The other categories, LC_MONETARY, LC_NUMERIC, and LC_TIME are also
22 * settable at run-time. However, we don't actually set those locale
23 * categories permanently. This would have bizarre effects like no
24 * longer accepting standard floating-point literals in some locales.
25 * Instead, we only set these locale categories briefly when needed,
26 * cache the required information obtained from localeconv() or
27 * strftime(), and then set the locale categories back to "C".
28 * The cached information is only used by the formatting functions
29 * (to_char, etc.) and the money type. For the user, this should all be
30 * transparent.
31 *
32 * !!! NOW HEAR THIS !!!
33 *
34 * We've been bitten repeatedly by this bug, so let's try to keep it in
35 * mind in future: on some platforms, the locale functions return pointers
36 * to static data that will be overwritten by any later locale function.
37 * Thus, for example, the obvious-looking sequence
38 * save = setlocale(category, NULL);
39 * if (!setlocale(category, value))
40 * fail = true;
41 * setlocale(category, save);
42 * DOES NOT WORK RELIABLY: on some platforms the second setlocale() call
43 * will change the memory save is pointing at. To do this sort of thing
44 * safely, you *must* pstrdup what setlocale returns the first time.
45 *
46 * The POSIX locale standard is available here:
47 *
48 * http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html
49 *----------
50 */
51
52
53 #include "postgres.h"
54
55 #include <time.h>
56
57 #include "access/htup_details.h"
58 #include "catalog/pg_collation.h"
59 #include "catalog/pg_control.h"
60 #include "mb/pg_wchar.h"
61 #include "utils/builtins.h"
62 #include "utils/formatting.h"
63 #include "utils/hsearch.h"
64 #include "utils/lsyscache.h"
65 #include "utils/memutils.h"
66 #include "utils/pg_locale.h"
67 #include "utils/syscache.h"
68
69 #ifdef USE_ICU
70 #include <unicode/ucnv.h>
71 #endif
72
73 #ifdef __GLIBC__
74 #include <gnu/libc-version.h>
75 #endif
76
77 #ifdef WIN32
78 /*
79 * This Windows file defines StrNCpy. We don't need it here, so we undefine
80 * it to keep the compiler quiet, and undefine it again after the file is
81 * included, so we don't accidentally use theirs.
82 */
83 #undef StrNCpy
84 #include <shlwapi.h>
85 #ifdef StrNCpy
86 #undef StrNCpy
87 #endif
88 #endif
89
/*
 * Capacity of the buffer reserved for each individual localized day or
 * month name produced by strftime() (or by wcsftime() on Windows).
 */
#define MAX_L10N_DATA 80
91
92
/*
 * GUC settings
 *
 * locale_monetary, locale_numeric and locale_time are the locale names that
 * PGLC_localeconv() and cache_locale_time() hand to setlocale() while they
 * temporarily switch LC_MONETARY, LC_NUMERIC and LC_TIME.
 */
char	   *locale_messages;	/* presumably the LC_MESSAGES counterpart */
char	   *locale_monetary;
char	   *locale_numeric;
char	   *locale_time;
98
/*
 * lc_time localization cache.
 *
 * We use only the first 7 or 12 entries of these arrays.  The last array
 * element is left as NULL for the convenience of outside code that wants
 * to sequentially scan these arrays.
 *
 * The entries are filled in by cache_locale_time(), which stores each name
 * as a string allocated in TopMemoryContext.
 */
char	   *localized_abbrev_days[7 + 1];
char	   *localized_full_days[7 + 1];
char	   *localized_abbrev_months[12 + 1];
char	   *localized_full_months[12 + 1];

/*
 * indicates whether locale information cache is valid
 *
 * CurrentLocaleConvValid guards the result of PGLC_localeconv() and is
 * cleared by assign_locale_monetary() and assign_locale_numeric().
 * CurrentLCTimeValid guards the arrays above and is cleared by
 * assign_locale_time().
 */
static bool CurrentLocaleConvValid = false;
static bool CurrentLCTimeValid = false;
114
/*
 * Environment variable storage area
 *
 * pg_perm_setlocale() formats "LC_XXX=value" into the buffer belonging to
 * the category being set and passes that buffer to putenv().  One buffer
 * per category, each LC_ENV_BUFSIZE bytes long.
 */

#define LC_ENV_BUFSIZE (NAMEDATALEN + 20)

static char lc_collate_envbuf[LC_ENV_BUFSIZE];
static char lc_ctype_envbuf[LC_ENV_BUFSIZE];

#ifdef LC_MESSAGES
static char lc_messages_envbuf[LC_ENV_BUFSIZE];
#endif
static char lc_monetary_envbuf[LC_ENV_BUFSIZE];
static char lc_numeric_envbuf[LC_ENV_BUFSIZE];
static char lc_time_envbuf[LC_ENV_BUFSIZE];
128
/*
 * Cache for collation-related knowledge
 *
 * An entry records, for one collation (identified by its pg_collation OID),
 * whether its LC_COLLATE and LC_CTYPE are "C" and, if available, the
 * locale object to use for it.
 */

typedef struct
{
	Oid			collid;			/* hash key: pg_collation OID */
	bool		collate_is_c;	/* is collation's LC_COLLATE C? */
	bool		ctype_is_c;		/* is collation's LC_CTYPE C? */
	bool		flags_valid;	/* true if above flags are valid */
	pg_locale_t locale;			/* locale_t struct, or 0 if not valid */
} collation_cache_entry;

/* the cache itself; initially NULL */
static HTAB *collation_cache = NULL;
141
142
/* Prototypes for file-local helpers, each compiled only where applicable */
#if defined(WIN32) && defined(LC_MESSAGES)
static char *IsoLocaleName(const char *);	/* MSVC specific */
#endif

#ifdef USE_ICU
static void icu_set_collation_attributes(UCollator *collator, const char *loc);
#endif
150
151 /*
152 * pg_perm_setlocale
153 *
154 * This wraps the libc function setlocale(), with two additions. First, when
155 * changing LC_CTYPE, update gettext's encoding for the current message
156 * domain. GNU gettext automatically tracks LC_CTYPE on most platforms, but
157 * not on Windows. Second, if the operation is successful, the corresponding
158 * LC_XXX environment variable is set to match. By setting the environment
159 * variable, we ensure that any subsequent use of setlocale(..., "") will
160 * preserve the settings made through this routine. Of course, LC_ALL must
161 * also be unset to fully ensure that, but that has to be done elsewhere after
162 * all the individual LC_XXX variables have been set correctly. (Thank you
163 * Perl for making this kluge necessary.)
164 */
165 char *
pg_perm_setlocale(int category,const char * locale)166 pg_perm_setlocale(int category, const char *locale)
167 {
168 char *result;
169 const char *envvar;
170 char *envbuf;
171
172 #ifndef WIN32
173 result = setlocale(category, locale);
174 #else
175
176 /*
177 * On Windows, setlocale(LC_MESSAGES) does not work, so just assume that
178 * the given value is good and set it in the environment variables. We
179 * must ignore attempts to set to "", which means "keep using the old
180 * environment value".
181 */
182 #ifdef LC_MESSAGES
183 if (category == LC_MESSAGES)
184 {
185 result = (char *) locale;
186 if (locale == NULL || locale[0] == '\0')
187 return result;
188 }
189 else
190 #endif
191 result = setlocale(category, locale);
192 #endif /* WIN32 */
193
194 if (result == NULL)
195 return result; /* fall out immediately on failure */
196
197 /*
198 * Use the right encoding in translated messages. Under ENABLE_NLS, let
199 * pg_bind_textdomain_codeset() figure it out. Under !ENABLE_NLS, message
200 * format strings are ASCII, but database-encoding strings may enter the
201 * message via %s. This makes the overall message encoding equal to the
202 * database encoding.
203 */
204 if (category == LC_CTYPE)
205 {
206 static char save_lc_ctype[LC_ENV_BUFSIZE];
207
208 /* copy setlocale() return value before callee invokes it again */
209 strlcpy(save_lc_ctype, result, sizeof(save_lc_ctype));
210 result = save_lc_ctype;
211
212 #ifdef ENABLE_NLS
213 SetMessageEncoding(pg_bind_textdomain_codeset(textdomain(NULL)));
214 #else
215 SetMessageEncoding(GetDatabaseEncoding());
216 #endif
217 }
218
219 switch (category)
220 {
221 case LC_COLLATE:
222 envvar = "LC_COLLATE";
223 envbuf = lc_collate_envbuf;
224 break;
225 case LC_CTYPE:
226 envvar = "LC_CTYPE";
227 envbuf = lc_ctype_envbuf;
228 break;
229 #ifdef LC_MESSAGES
230 case LC_MESSAGES:
231 envvar = "LC_MESSAGES";
232 envbuf = lc_messages_envbuf;
233 #ifdef WIN32
234 result = IsoLocaleName(locale);
235 if (result == NULL)
236 result = (char *) locale;
237 elog(DEBUG3, "IsoLocaleName() executed; locale: \"%s\"", result);
238 #endif /* WIN32 */
239 break;
240 #endif /* LC_MESSAGES */
241 case LC_MONETARY:
242 envvar = "LC_MONETARY";
243 envbuf = lc_monetary_envbuf;
244 break;
245 case LC_NUMERIC:
246 envvar = "LC_NUMERIC";
247 envbuf = lc_numeric_envbuf;
248 break;
249 case LC_TIME:
250 envvar = "LC_TIME";
251 envbuf = lc_time_envbuf;
252 break;
253 default:
254 elog(FATAL, "unrecognized LC category: %d", category);
255 envvar = NULL; /* keep compiler quiet */
256 envbuf = NULL;
257 return NULL;
258 }
259
260 snprintf(envbuf, LC_ENV_BUFSIZE - 1, "%s=%s", envvar, result);
261
262 if (putenv(envbuf))
263 return NULL;
264
265 return result;
266 }
267
268
269 /*
270 * Is the locale name valid for the locale category?
271 *
272 * If successful, and canonname isn't NULL, a palloc'd copy of the locale's
273 * canonical name is stored there. This is especially useful for figuring out
274 * what locale name "" means (ie, the server environment value). (Actually,
275 * it seems that on most implementations that's the only thing it's good for;
276 * we could wish that setlocale gave back a canonically spelled version of
277 * the locale name, but typically it doesn't.)
278 */
279 bool
check_locale(int category,const char * locale,char ** canonname)280 check_locale(int category, const char *locale, char **canonname)
281 {
282 char *save;
283 char *res;
284
285 if (canonname)
286 *canonname = NULL; /* in case of failure */
287
288 save = setlocale(category, NULL);
289 if (!save)
290 return false; /* won't happen, we hope */
291
292 /* save may be pointing at a modifiable scratch variable, see above. */
293 save = pstrdup(save);
294
295 /* set the locale with setlocale, to see if it accepts it. */
296 res = setlocale(category, locale);
297
298 /* save canonical name if requested. */
299 if (res && canonname)
300 *canonname = pstrdup(res);
301
302 /* restore old value. */
303 if (!setlocale(category, save))
304 elog(WARNING, "failed to restore old locale \"%s\"", save);
305 pfree(save);
306
307 return (res != NULL);
308 }
309
310
311 /*
312 * GUC check/assign hooks
313 *
314 * For most locale categories, the assign hook doesn't actually set the locale
315 * permanently, just reset flags so that the next use will cache the
316 * appropriate values. (See explanation at the top of this file.)
317 *
318 * Note: we accept value = "" as selecting the postmaster's environment
319 * value, whatever it was (so long as the environment setting is legal).
320 * This will have been locked down by an earlier call to pg_perm_setlocale.
321 */
322 bool
check_locale_monetary(char ** newval,void ** extra,GucSource source)323 check_locale_monetary(char **newval, void **extra, GucSource source)
324 {
325 return check_locale(LC_MONETARY, *newval, NULL);
326 }
327
328 void
assign_locale_monetary(const char * newval,void * extra)329 assign_locale_monetary(const char *newval, void *extra)
330 {
331 CurrentLocaleConvValid = false;
332 }
333
334 bool
check_locale_numeric(char ** newval,void ** extra,GucSource source)335 check_locale_numeric(char **newval, void **extra, GucSource source)
336 {
337 return check_locale(LC_NUMERIC, *newval, NULL);
338 }
339
340 void
assign_locale_numeric(const char * newval,void * extra)341 assign_locale_numeric(const char *newval, void *extra)
342 {
343 CurrentLocaleConvValid = false;
344 }
345
346 bool
check_locale_time(char ** newval,void ** extra,GucSource source)347 check_locale_time(char **newval, void **extra, GucSource source)
348 {
349 return check_locale(LC_TIME, *newval, NULL);
350 }
351
352 void
assign_locale_time(const char * newval,void * extra)353 assign_locale_time(const char *newval, void *extra)
354 {
355 CurrentLCTimeValid = false;
356 }
357
358 /*
359 * We allow LC_MESSAGES to actually be set globally.
360 *
361 * Note: we normally disallow value = "" because it wouldn't have consistent
362 * semantics (it'd effectively just use the previous value). However, this
363 * is the value passed for PGC_S_DEFAULT, so don't complain in that case,
364 * not even if the attempted setting fails due to invalid environment value.
365 * The idea there is just to accept the environment setting *if possible*
366 * during startup, until we can read the proper value from postgresql.conf.
367 */
368 bool
check_locale_messages(char ** newval,void ** extra,GucSource source)369 check_locale_messages(char **newval, void **extra, GucSource source)
370 {
371 if (**newval == '\0')
372 {
373 if (source == PGC_S_DEFAULT)
374 return true;
375 else
376 return false;
377 }
378
379 /*
380 * LC_MESSAGES category does not exist everywhere, but accept it anyway
381 *
382 * On Windows, we can't even check the value, so accept blindly
383 */
384 #if defined(LC_MESSAGES) && !defined(WIN32)
385 return check_locale(LC_MESSAGES, *newval, NULL);
386 #else
387 return true;
388 #endif
389 }
390
/*
 * GUC assign hook for lc_messages: install the new value for real, via
 * pg_perm_setlocale().
 */
void
assign_locale_messages(const char *newval, void *extra)
{
	/*
	 * Platforms without an LC_MESSAGES category simply do nothing here.  A
	 * failure to apply the value is deliberately ignored; see the comments
	 * for check_locale_messages.
	 */
#ifdef LC_MESSAGES
	(void) pg_perm_setlocale(LC_MESSAGES, newval);
#endif
}
402
403
/*
 * Frees the malloced content of a struct lconv.  (But not the struct
 * itself.)  It's important that this not throw elog(ERROR).
 *
 * Only the string members this file fills in are touched; the list must
 * match struct_lconv_is_valid().  Members may be NULL, since free(NULL) is a
 * no-op.  Each freed member is reset to NULL so that the struct never keeps
 * dangling pointers and calling this again on the same struct is harmless.
 */
static void
free_struct_lconv(struct lconv *s)
{
	char	  **members[] = {
		&s->decimal_point,
		&s->thousands_sep,
		&s->grouping,
		&s->int_curr_symbol,
		&s->currency_symbol,
		&s->mon_decimal_point,
		&s->mon_thousands_sep,
		&s->mon_grouping,
		&s->positive_sign,
		&s->negative_sign
	};
	size_t		i;

	for (i = 0; i < sizeof(members) / sizeof(members[0]); i++)
	{
		free(*members[i]);
		*members[i] = NULL;
	}
}
432
/*
 * Report whether every string member of a struct lconv that we care about
 * has been filled in (is non-NULL).  The set of members examined must stay
 * in sync with free_struct_lconv().
 */
static bool
struct_lconv_is_valid(struct lconv *s)
{
	return s->decimal_point != NULL &&
		s->thousands_sep != NULL &&
		s->grouping != NULL &&
		s->int_curr_symbol != NULL &&
		s->currency_symbol != NULL &&
		s->mon_decimal_point != NULL &&
		s->mon_thousands_sep != NULL &&
		s->mon_grouping != NULL &&
		s->positive_sign != NULL &&
		s->negative_sign != NULL;
}
462
463
464 /*
465 * Convert the strdup'd string at *str from the specified encoding to the
466 * database encoding.
467 */
468 static void
db_encoding_convert(int encoding,char ** str)469 db_encoding_convert(int encoding, char **str)
470 {
471 char *pstr;
472 char *mstr;
473
474 /* convert the string to the database encoding */
475 pstr = pg_any_to_server(*str, strlen(*str), encoding);
476 if (pstr == *str)
477 return; /* no conversion happened */
478
479 /* need it malloc'd not palloc'd */
480 mstr = strdup(pstr);
481 if (mstr == NULL)
482 ereport(ERROR,
483 (errcode(ERRCODE_OUT_OF_MEMORY),
484 errmsg("out of memory")));
485
486 /* replace old string */
487 free(*str);
488 *str = mstr;
489
490 pfree(pstr);
491 }
492
493
/*
 * Return the POSIX lconv struct (contains number/money formatting
 * information) with locale information for all categories.
 *
 * The numeric members (decimal_point, thousands_sep, grouping) are collected
 * with LC_NUMERIC set to locale_numeric, and the monetary members with
 * LC_MONETARY set to locale_monetary; the text members are then converted to
 * the database encoding.
 *
 * The result points to a static struct that is reused until
 * CurrentLocaleConvValid is cleared; on the next refresh its strings are
 * freed and rebuilt by this function.
 *
 * Throws ERROR if the prevailing locale can't be queried, if memory runs
 * out, or if encoding conversion fails; failure to restore the prevailing
 * locale settings is FATAL.
 */
struct lconv *
PGLC_localeconv(void)
{
	static struct lconv CurrentLocaleConv;
	static bool CurrentLocaleConvAllocated = false;
	struct lconv *extlconv;
	struct lconv worklconv;
	char	   *save_lc_monetary;
	char	   *save_lc_numeric;
#ifdef WIN32
	char	   *save_lc_ctype;
#endif

	/* Did we do it already? */
	if (CurrentLocaleConvValid)
		return &CurrentLocaleConv;

	/* Free any already-allocated storage */
	if (CurrentLocaleConvAllocated)
	{
		free_struct_lconv(&CurrentLocaleConv);
		CurrentLocaleConvAllocated = false;
	}

	/*
	 * This is tricky because we really don't want to risk throwing error
	 * while the locale is set to other than our usual settings.  Therefore,
	 * the process is: collect the usual settings, set locale to special
	 * setting, copy relevant data into worklconv using strdup(), restore
	 * normal settings, convert data to desired encoding, and finally stash
	 * the collected data in CurrentLocaleConv.  This makes it safe if we
	 * throw an error during encoding conversion or run out of memory anywhere
	 * in the process.  All data pointed to by struct lconv members is
	 * allocated with strdup, to avoid premature elog(ERROR) and to allow
	 * using a single cleanup routine.
	 */
	memset(&worklconv, 0, sizeof(worklconv));

	/* Save prevailing values of monetary and numeric locales */
	save_lc_monetary = setlocale(LC_MONETARY, NULL);
	if (!save_lc_monetary)
		elog(ERROR, "setlocale(NULL) failed");
	/* copy it: a later setlocale() call may overwrite the returned string */
	save_lc_monetary = pstrdup(save_lc_monetary);

	save_lc_numeric = setlocale(LC_NUMERIC, NULL);
	if (!save_lc_numeric)
		elog(ERROR, "setlocale(NULL) failed");
	save_lc_numeric = pstrdup(save_lc_numeric);

#ifdef WIN32

	/*
	 * The POSIX standard explicitly says that it is undefined what happens if
	 * LC_MONETARY or LC_NUMERIC imply an encoding (codeset) different from
	 * that implied by LC_CTYPE.  In practice, all Unix-ish platforms seem to
	 * believe that localeconv() should return strings that are encoded in the
	 * codeset implied by the LC_MONETARY or LC_NUMERIC locale name.  Hence,
	 * once we have successfully collected the localeconv() results, we will
	 * convert them from that codeset to the desired server encoding.
	 *
	 * Windows, of course, resolutely does things its own way; on that
	 * platform LC_CTYPE has to match LC_MONETARY/LC_NUMERIC to get sane
	 * results.  Hence, we must temporarily set that category as well.
	 */

	/* Save prevailing value of ctype locale */
	save_lc_ctype = setlocale(LC_CTYPE, NULL);
	if (!save_lc_ctype)
		elog(ERROR, "setlocale(NULL) failed");
	save_lc_ctype = pstrdup(save_lc_ctype);

	/* Here begins the critical section where we must not throw error */

	/* use numeric to set the ctype */
	setlocale(LC_CTYPE, locale_numeric);
#endif

	/* Get formatting information for numeric */
	setlocale(LC_NUMERIC, locale_numeric);
	extlconv = localeconv();

	/*
	 * Must copy data now in case setlocale() overwrites it.  strdup()
	 * failures are not checked here, because we must not throw an error
	 * yet; they are detected later by struct_lconv_is_valid().
	 */
	worklconv.decimal_point = strdup(extlconv->decimal_point);
	worklconv.thousands_sep = strdup(extlconv->thousands_sep);
	worklconv.grouping = strdup(extlconv->grouping);

#ifdef WIN32
	/* use monetary to set the ctype */
	setlocale(LC_CTYPE, locale_monetary);
#endif

	/* Get formatting information for monetary */
	setlocale(LC_MONETARY, locale_monetary);
	extlconv = localeconv();

	/* Must copy data now in case setlocale() overwrites it */
	worklconv.int_curr_symbol = strdup(extlconv->int_curr_symbol);
	worklconv.currency_symbol = strdup(extlconv->currency_symbol);
	worklconv.mon_decimal_point = strdup(extlconv->mon_decimal_point);
	worklconv.mon_thousands_sep = strdup(extlconv->mon_thousands_sep);
	worklconv.mon_grouping = strdup(extlconv->mon_grouping);
	worklconv.positive_sign = strdup(extlconv->positive_sign);
	worklconv.negative_sign = strdup(extlconv->negative_sign);
	/* Copy scalar fields as well */
	worklconv.int_frac_digits = extlconv->int_frac_digits;
	worklconv.frac_digits = extlconv->frac_digits;
	worklconv.p_cs_precedes = extlconv->p_cs_precedes;
	worklconv.p_sep_by_space = extlconv->p_sep_by_space;
	worklconv.n_cs_precedes = extlconv->n_cs_precedes;
	worklconv.n_sep_by_space = extlconv->n_sep_by_space;
	worklconv.p_sign_posn = extlconv->p_sign_posn;
	worklconv.n_sign_posn = extlconv->n_sign_posn;

	/*
	 * Restore the prevailing locale settings; failure to do so is fatal.
	 * Possibly we could limp along with nondefault LC_MONETARY or LC_NUMERIC,
	 * but proceeding with the wrong value of LC_CTYPE would certainly be bad
	 * news; and considering that the prevailing LC_MONETARY and LC_NUMERIC
	 * are almost certainly "C", there's really no reason that restoring those
	 * should fail.
	 */
#ifdef WIN32
	if (!setlocale(LC_CTYPE, save_lc_ctype))
		elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
#endif
	if (!setlocale(LC_MONETARY, save_lc_monetary))
		elog(FATAL, "failed to restore LC_MONETARY to \"%s\"", save_lc_monetary);
	if (!setlocale(LC_NUMERIC, save_lc_numeric))
		elog(FATAL, "failed to restore LC_NUMERIC to \"%s\"", save_lc_numeric);

	/*
	 * At this point we've done our best to clean up, and can call functions
	 * that might possibly throw errors with a clean conscience.  But let's
	 * make sure we don't leak any already-strdup'd fields in worklconv.
	 */
	PG_TRY();
	{
		int			encoding;

		/* Release the pstrdup'd locale names */
		pfree(save_lc_monetary);
		pfree(save_lc_numeric);
#ifdef WIN32
		pfree(save_lc_ctype);
#endif

		/* If any of the preceding strdup calls failed, complain now. */
		if (!struct_lconv_is_valid(&worklconv))
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));

		/*
		 * Now we must perform encoding conversion from whatever's associated
		 * with the locales into the database encoding.  If we can't identify
		 * the encoding implied by LC_NUMERIC or LC_MONETARY (ie we get -1),
		 * use PG_SQL_ASCII, which will result in just validating that the
		 * strings are OK in the database encoding.
		 */
		encoding = pg_get_encoding_from_locale(locale_numeric, true);
		if (encoding < 0)
			encoding = PG_SQL_ASCII;

		db_encoding_convert(encoding, &worklconv.decimal_point);
		db_encoding_convert(encoding, &worklconv.thousands_sep);
		/* grouping is not text and does not require conversion */

		encoding = pg_get_encoding_from_locale(locale_monetary, true);
		if (encoding < 0)
			encoding = PG_SQL_ASCII;

		db_encoding_convert(encoding, &worklconv.int_curr_symbol);
		db_encoding_convert(encoding, &worklconv.currency_symbol);
		db_encoding_convert(encoding, &worklconv.mon_decimal_point);
		db_encoding_convert(encoding, &worklconv.mon_thousands_sep);
		/* mon_grouping is not text and does not require conversion */
		db_encoding_convert(encoding, &worklconv.positive_sign);
		db_encoding_convert(encoding, &worklconv.negative_sign);
	}
	PG_CATCH();
	{
		/* on any error, give back what was strdup'd, then pass the error on */
		free_struct_lconv(&worklconv);
		PG_RE_THROW();
	}
	PG_END_TRY();

	/*
	 * Everything is good, so save the results.
	 */
	CurrentLocaleConv = worklconv;
	CurrentLocaleConvAllocated = true;
	CurrentLocaleConvValid = true;
	return &CurrentLocaleConv;
}
692
693 #ifdef WIN32
694 /*
695 * On Windows, strftime() returns its output in encoding CP_ACP (the default
696 * operating system codepage for the computer), which is likely different
697 * from SERVER_ENCODING. This is especially important in Japanese versions
698 * of Windows which will use SJIS encoding, which we don't support as a
699 * server encoding.
700 *
701 * So, instead of using strftime(), use wcsftime() to return the value in
702 * wide characters (internally UTF16) and then convert to UTF8, which we
703 * know how to handle directly.
704 *
705 * Note that this only affects the calls to strftime() in this file, which are
706 * used to get the locale-aware strings. Other parts of the backend use
707 * pg_strftime(), which isn't locale-aware and does not need to be replaced.
708 */
709 static size_t
strftime_win32(char * dst,size_t dstlen,const char * format,const struct tm * tm)710 strftime_win32(char *dst, size_t dstlen,
711 const char *format, const struct tm *tm)
712 {
713 size_t len;
714 wchar_t wformat[8]; /* formats used below need 3 chars */
715 wchar_t wbuf[MAX_L10N_DATA];
716
717 /*
718 * Get a wchar_t version of the format string. We only actually use
719 * plain-ASCII formats in this file, so we can say that they're UTF8.
720 */
721 len = MultiByteToWideChar(CP_UTF8, 0, format, -1,
722 wformat, lengthof(wformat));
723 if (len == 0)
724 elog(ERROR, "could not convert format string from UTF-8: error code %lu",
725 GetLastError());
726
727 len = wcsftime(wbuf, MAX_L10N_DATA, wformat, tm);
728 if (len == 0)
729 {
730 /*
731 * wcsftime failed, possibly because the result would not fit in
732 * MAX_L10N_DATA. Return 0 with the contents of dst unspecified.
733 */
734 return 0;
735 }
736
737 len = WideCharToMultiByte(CP_UTF8, 0, wbuf, len, dst, dstlen - 1,
738 NULL, NULL);
739 if (len == 0)
740 elog(ERROR, "could not convert string to UTF-8: error code %lu",
741 GetLastError());
742
743 dst[len] = '\0';
744
745 return len;
746 }
747
748 /* redefine strftime() */
749 #define strftime(a,b,c,d) strftime_win32(a,b,c,d)
750 #endif /* WIN32 */
751
752 /*
753 * Subroutine for cache_locale_time().
754 * Convert the given string from encoding "encoding" to the database
755 * encoding, and store the result at *dst, replacing any previous value.
756 */
757 static void
cache_single_string(char ** dst,const char * src,int encoding)758 cache_single_string(char **dst, const char *src, int encoding)
759 {
760 char *ptr;
761 char *olddst;
762
763 /* Convert the string to the database encoding, or validate it's OK */
764 ptr = pg_any_to_server(src, strlen(src), encoding);
765
766 /* Store the string in long-lived storage, replacing any previous value */
767 olddst = *dst;
768 *dst = MemoryContextStrdup(TopMemoryContext, ptr);
769 if (olddst)
770 pfree(olddst);
771
772 /* Might as well clean up any palloc'd conversion result, too */
773 if (ptr != src)
774 pfree(ptr);
775 }
776
/*
 * Update the lc_time localization cache variables if needed.
 *
 * Does nothing when CurrentLCTimeValid is already set.  Otherwise it
 * temporarily switches LC_TIME (and, on Windows, LC_CTYPE) to locale_time,
 * collects the abbreviated and full day and month names with strftime(),
 * restores the previous locale, and stores the names, converted to the
 * database encoding, in localized_abbrev_days[], localized_full_days[],
 * localized_abbrev_months[] and localized_full_months[].
 *
 * Throws ERROR if the prevailing locale can't be queried, if strftime()
 * fails, or if encoding conversion fails; failure to restore the prevailing
 * locale settings is FATAL.
 */
void
cache_locale_time(void)
{
	/* one MAX_L10N_DATA-sized slot per abbreviated/full day and month name */
	char		buf[(2 * 7 + 2 * 12) * MAX_L10N_DATA];
	char	   *bufptr;
	time_t		timenow;
	struct tm  *timeinfo;
	bool		strftimefail = false;
	int			encoding;
	int			i;
	char	   *save_lc_time;
#ifdef WIN32
	char	   *save_lc_ctype;
#endif

	/* did we do this already? */
	if (CurrentLCTimeValid)
		return;

	elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);

	/*
	 * As in PGLC_localeconv(), it's critical that we not throw error while
	 * libc's locale settings have nondefault values.  Hence, we just call
	 * strftime() within the critical section, and then convert and save its
	 * results afterwards.
	 */

	/* Save prevailing value of time locale */
	save_lc_time = setlocale(LC_TIME, NULL);
	if (!save_lc_time)
		elog(ERROR, "setlocale(NULL) failed");
	/* copy it: a later setlocale() call may overwrite the returned string */
	save_lc_time = pstrdup(save_lc_time);

#ifdef WIN32

	/*
	 * On Windows, it appears that wcsftime() internally uses LC_CTYPE, so we
	 * must set it here.  This code looks the same as what PGLC_localeconv()
	 * does, but the underlying reason is different: this does NOT determine
	 * the encoding we'll get back from strftime_win32().
	 */

	/* Save prevailing value of ctype locale */
	save_lc_ctype = setlocale(LC_CTYPE, NULL);
	if (!save_lc_ctype)
		elog(ERROR, "setlocale(NULL) failed");
	save_lc_ctype = pstrdup(save_lc_ctype);

	/* use lc_time to set the ctype */
	setlocale(LC_CTYPE, locale_time);
#endif

	setlocale(LC_TIME, locale_time);

	/* We use times close to current time as data for strftime(). */
	timenow = time(NULL);
	timeinfo = localtime(&timenow);

	/* Store the strftime results in MAX_L10N_DATA-sized portions of buf[] */
	bufptr = buf;

	/*
	 * MAX_L10N_DATA is sufficient buffer space for every known locale, and
	 * POSIX defines no strftime() errors.  (Buffer space exhaustion is not an
	 * error.)  An implementation might report errors (e.g. ENOMEM) by
	 * returning 0 (or, less plausibly, a negative value) and setting errno.
	 * Report errno just in case the implementation did that, but clear it in
	 * advance of the calls so we don't emit a stale, unrelated errno.
	 */
	errno = 0;

	/*
	 * localized days
	 *
	 * Failures are only noted in strftimefail; the error is raised after the
	 * locale has been restored.  Slots alternate abbreviated, full.
	 */
	for (i = 0; i < 7; i++)
	{
		timeinfo->tm_wday = i;
		if (strftime(bufptr, MAX_L10N_DATA, "%a", timeinfo) <= 0)
			strftimefail = true;
		bufptr += MAX_L10N_DATA;
		if (strftime(bufptr, MAX_L10N_DATA, "%A", timeinfo) <= 0)
			strftimefail = true;
		bufptr += MAX_L10N_DATA;
	}

	/* localized months, again alternating abbreviated and full names */
	for (i = 0; i < 12; i++)
	{
		timeinfo->tm_mon = i;
		timeinfo->tm_mday = 1;	/* make sure we don't have invalid date */
		if (strftime(bufptr, MAX_L10N_DATA, "%b", timeinfo) <= 0)
			strftimefail = true;
		bufptr += MAX_L10N_DATA;
		if (strftime(bufptr, MAX_L10N_DATA, "%B", timeinfo) <= 0)
			strftimefail = true;
		bufptr += MAX_L10N_DATA;
	}

	/*
	 * Restore the prevailing locale settings; as in PGLC_localeconv(),
	 * failure to do so is fatal.
	 */
#ifdef WIN32
	if (!setlocale(LC_CTYPE, save_lc_ctype))
		elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
#endif
	if (!setlocale(LC_TIME, save_lc_time))
		elog(FATAL, "failed to restore LC_TIME to \"%s\"", save_lc_time);

	/*
	 * At this point we've done our best to clean up, and can throw errors, or
	 * call functions that might throw errors, with a clean conscience.
	 */
	if (strftimefail)
		elog(ERROR, "strftime() failed: %m");

	/* Release the pstrdup'd locale names */
	pfree(save_lc_time);
#ifdef WIN32
	pfree(save_lc_ctype);
#endif

#ifndef WIN32

	/*
	 * As in PGLC_localeconv(), we must convert strftime()'s output from the
	 * encoding implied by LC_TIME to the database encoding.  If we can't
	 * identify the LC_TIME encoding, just perform encoding validation.
	 */
	encoding = pg_get_encoding_from_locale(locale_time, true);
	if (encoding < 0)
		encoding = PG_SQL_ASCII;

#else

	/*
	 * On Windows, strftime_win32() always returns UTF8 data, so convert from
	 * that if necessary.
	 */
	encoding = PG_UTF8;

#endif							/* WIN32 */

	/* walk buf[] again in the same order in which it was filled */
	bufptr = buf;

	/* localized days */
	for (i = 0; i < 7; i++)
	{
		cache_single_string(&localized_abbrev_days[i], bufptr, encoding);
		bufptr += MAX_L10N_DATA;
		cache_single_string(&localized_full_days[i], bufptr, encoding);
		bufptr += MAX_L10N_DATA;
	}
	/* keep the NULL terminators that sequential scanners rely on */
	localized_abbrev_days[7] = NULL;
	localized_full_days[7] = NULL;

	/* localized months */
	for (i = 0; i < 12; i++)
	{
		cache_single_string(&localized_abbrev_months[i], bufptr, encoding);
		bufptr += MAX_L10N_DATA;
		cache_single_string(&localized_full_months[i], bufptr, encoding);
		bufptr += MAX_L10N_DATA;
	}
	localized_abbrev_months[12] = NULL;
	localized_full_months[12] = NULL;

	CurrentLCTimeValid = true;
}
948
949
950 #if defined(WIN32) && defined(LC_MESSAGES)
951 /*
952 * Convert a Windows setlocale() argument to a Unix-style one.
953 *
954 * Regardless of platform, we install message catalogs under a Unix-style
955 * LL[_CC][.ENCODING][@VARIANT] naming convention. Only LC_MESSAGES settings
956 * following that style will elicit localized interface strings.
957 *
958 * Before Visual Studio 2012 (msvcr110.dll), Windows setlocale() accepted "C"
959 * (but not "c") and strings of the form <Language>[_<Country>][.<CodePage>],
960 * case-insensitive. setlocale() returns the fully-qualified form; for
961 * example, setlocale("thaI") returns "Thai_Thailand.874". Internally,
962 * setlocale() and _create_locale() select a "locale identifier"[1] and store
963 * it in an undocumented _locale_t field. From that LCID, we can retrieve the
964 * ISO 639 language and the ISO 3166 country. Character encoding does not
965 * matter, because the server and client encodings govern that.
966 *
967 * Windows Vista introduced the "locale name" concept[2], closely following
968 * RFC 4646. Locale identifiers are now deprecated. Starting with Visual
969 * Studio 2012, setlocale() accepts locale names in addition to the strings it
970 * accepted historically. It does not standardize them; setlocale("Th-tH")
971 * returns "Th-tH". setlocale(category, "") still returns a traditional
972 * string. Furthermore, msvcr110.dll changed the undocumented _locale_t
973 * content to carry locale names instead of locale identifiers.
974 *
 * Visual Studio 2015 should still be able to do the same as Visual Studio
 * 2012, but the declaration of locale_name is missing from _locale_t, so
 * that code fails to compile.  We fall back instead to enumerating all
 * system locales with EnumSystemLocalesEx() to find the required locale
 * name.  If the input argument is already a Unix-style name, we can get the
 * ISO locale name directly by calling GetLocaleInfoEx() with an LCType of
 * LOCALE_SNAME.
982 *
983 * MinGW headers declare _create_locale(), but msvcrt.dll lacks that symbol in
984 * releases before Windows 8. IsoLocaleName() always fails in a MinGW-built
985 * postgres.exe, so only Unix-style values of the lc_messages GUC can elicit
986 * localized messages. In particular, every lc_messages setting that initdb
987 * can select automatically will yield only C-locale messages. XXX This could
988 * be fixed by running the fully-qualified locale name through a lookup table.
989 *
990 * This function returns a pointer to a static buffer bearing the converted
991 * name or NULL if conversion fails.
992 *
993 * [1] https://docs.microsoft.com/en-us/windows/win32/intl/locale-identifiers
994 * [2] https://docs.microsoft.com/en-us/windows/win32/intl/locale-names
995 */
996
997 #if _MSC_VER >= 1900
998 /*
999 * Callback function for EnumSystemLocalesEx() in get_iso_localename().
1000 *
1001 * This function enumerates all system locales, searching for one that matches
1002 * an input with the format: <Language>[_<Country>], e.g.
1003 * English[_United States]
1004 *
 * The LPARAM input is an array of three wchar_t pointers.  The first points
 * to the locale_name we want to match, the second to an allocated buffer
 * that receives the name of the matching system locale if one is found, and
 * the third to the search status: 1 if a match was found, 0 otherwise.
1009 */
1010 static BOOL CALLBACK
search_locale_enum(LPWSTR pStr,DWORD dwFlags,LPARAM lparam)1011 search_locale_enum(LPWSTR pStr, DWORD dwFlags, LPARAM lparam)
1012 {
1013 wchar_t test_locale[LOCALE_NAME_MAX_LENGTH];
1014 wchar_t **argv;
1015
1016 (void) (dwFlags);
1017
1018 argv = (wchar_t **) lparam;
1019 *argv[2] = (wchar_t) 0;
1020
1021 memset(test_locale, 0, sizeof(test_locale));
1022
1023 /* Get the name of the <Language> in English */
1024 if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHLANGUAGENAME,
1025 test_locale, LOCALE_NAME_MAX_LENGTH))
1026 {
1027 /*
1028 * If the enumerated locale does not have a hyphen ("en") OR the
1029 * lc_message input does not have an underscore ("English"), we only
1030 * need to compare the <Language> tags.
1031 */
1032 if (wcsrchr(pStr, '-') == NULL || wcsrchr(argv[0], '_') == NULL)
1033 {
1034 if (_wcsicmp(argv[0], test_locale) == 0)
1035 {
1036 wcscpy(argv[1], pStr);
1037 *argv[2] = (wchar_t) 1;
1038 return FALSE;
1039 }
1040 }
1041
1042 /*
1043 * We have to compare a full <Language>_<Country> tag, so we append
1044 * the underscore and name of the country/region in English, e.g.
1045 * "English_United States".
1046 */
1047 else
1048 {
1049 size_t len;
1050
1051 wcscat(test_locale, L"_");
1052 len = wcslen(test_locale);
1053 if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHCOUNTRYNAME,
1054 test_locale + len,
1055 LOCALE_NAME_MAX_LENGTH - len))
1056 {
1057 if (_wcsicmp(argv[0], test_locale) == 0)
1058 {
1059 wcscpy(argv[1], pStr);
1060 *argv[2] = (wchar_t) 1;
1061 return FALSE;
1062 }
1063 }
1064 }
1065 }
1066
1067 return TRUE;
1068 }
1069
1070 /*
1071 * This function converts a Windows locale name to an ISO formatted version
1072 * for Visual Studio 2015 or greater.
1073 *
1074 * Returns NULL, if no valid conversion was found.
1075 */
1076 static char *
get_iso_localename(const char * winlocname)1077 get_iso_localename(const char *winlocname)
1078 {
1079 wchar_t wc_locale_name[LOCALE_NAME_MAX_LENGTH];
1080 wchar_t buffer[LOCALE_NAME_MAX_LENGTH];
1081 static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1082 char *period;
1083 int len;
1084 int ret_val;
1085
1086 /*
1087 * Valid locales have the following syntax:
1088 * <Language>[_<Country>[.<CodePage>]]
1089 *
1090 * GetLocaleInfoEx can only take locale name without code-page and for the
1091 * purpose of this API the code-page doesn't matter.
1092 */
1093 period = strchr(winlocname, '.');
1094 if (period != NULL)
1095 len = period - winlocname;
1096 else
1097 len = pg_mbstrlen(winlocname);
1098
1099 memset(wc_locale_name, 0, sizeof(wc_locale_name));
1100 memset(buffer, 0, sizeof(buffer));
1101 MultiByteToWideChar(CP_ACP, 0, winlocname, len, wc_locale_name,
1102 LOCALE_NAME_MAX_LENGTH);
1103
1104 /*
1105 * If the lc_messages is already an Unix-style string, we have a direct
1106 * match with LOCALE_SNAME, e.g. en-US, en_US.
1107 */
1108 ret_val = GetLocaleInfoEx(wc_locale_name, LOCALE_SNAME, (LPWSTR) &buffer,
1109 LOCALE_NAME_MAX_LENGTH);
1110 if (!ret_val)
1111 {
1112 /*
1113 * Search for a locale in the system that matches language and country
1114 * name.
1115 */
1116 wchar_t *argv[3];
1117
1118 argv[0] = wc_locale_name;
1119 argv[1] = buffer;
1120 argv[2] = (wchar_t *) &ret_val;
1121 EnumSystemLocalesEx(search_locale_enum, LOCALE_WINDOWS, (LPARAM) argv,
1122 NULL);
1123 }
1124
1125 if (ret_val)
1126 {
1127 size_t rc;
1128 char *hyphen;
1129
1130 /* Locale names use only ASCII, any conversion locale suffices. */
1131 rc = wchar2char(iso_lc_messages, buffer, sizeof(iso_lc_messages), NULL);
1132 if (rc == -1 || rc == sizeof(iso_lc_messages))
1133 return NULL;
1134
1135 /*
1136 * Simply replace the hyphen with an underscore. See comments in
1137 * IsoLocaleName.
1138 */
1139 hyphen = strchr(iso_lc_messages, '-');
1140 if (hyphen)
1141 *hyphen = '_';
1142 return iso_lc_messages;
1143 }
1144
1145 return NULL;
1146 }
1147 #endif /* _MSC_VER >= 1900 */
1148
static char *
IsoLocaleName(const char *winlocname)
{
#if defined(_MSC_VER)
	static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];

	/* "C" and "POSIX", in any letter case, both map to plain "C". */
	if (pg_strcasecmp("c", winlocname) == 0 ||
		pg_strcasecmp("posix", winlocname) == 0)
	{
		strcpy(iso_lc_messages, "C");
		return iso_lc_messages;
	}

#if (_MSC_VER >= 1900)			/* Visual Studio 2015 or later */
	return get_iso_localename(winlocname);
#else
	{
		_locale_t	loct;
		size_t		rc;
		char	   *hyphen;

		loct = _create_locale(LC_CTYPE, winlocname);
		if (loct == NULL)
			return NULL;

		/* Locale names use only ASCII, any conversion locale suffices. */
		rc = wchar2char(iso_lc_messages, loct->locinfo->locale_name[LC_CTYPE],
						sizeof(iso_lc_messages), NULL);
		_free_locale(loct);
		if (rc == -1 || rc == sizeof(iso_lc_messages))
			return NULL;

		/*
		 * The message catalogs live on a case-insensitive filesystem, so
		 * letter case need not be standardized here.  As long as we ship no
		 * message catalog for which it would matter, the script/variant
		 * portion need not be translated either (e.g. uz-Cyrl-UZ to
		 * uz_UZ@cyrillic).  Replacing the hyphen with an underscore is
		 * enough.
		 *
		 * The locale name can be less specific than what earlier Visual
		 * Studio releases would yield; for example, French_France.1252
		 * gives just "fr".  That does not affect any of the country-specific
		 * message catalogs available as of this writing (pt_BR, zh_CN,
		 * zh_TW).
		 */
		hyphen = strchr(iso_lc_messages, '-');
		if (hyphen)
			*hyphen = '_';
		return iso_lc_messages;
	}
#endif							/* Visual Studio 2015 or later */
#else
	return NULL;				/* Not supported on this version of msvc/mingw */
#endif							/* defined(_MSC_VER) */
}
1205 #endif /* WIN32 && LC_MESSAGES */
1206
1207
1208 /*
1209 * Detect aging strxfrm() implementations that, in a subset of locales, write
1210 * past the specified buffer length. Affected users must update OS packages
1211 * before using PostgreSQL 9.5 or later.
1212 *
1213 * Assume that the bug can come and go from one postmaster startup to another
1214 * due to physical replication among diverse machines. Assume that the bug's
1215 * presence will not change during the life of a particular postmaster. Given
1216 * those assumptions, call this no less than once per postmaster startup per
1217 * LC_COLLATE setting used. No known-affected system offers strxfrm_l(), so
1218 * there is no need to consider pg_collation locales.
1219 */
1220 void
check_strxfrm_bug(void)1221 check_strxfrm_bug(void)
1222 {
1223 char buf[32];
1224 const int canary = 0x7F;
1225 bool ok = true;
1226
1227 /*
1228 * Given a two-byte ASCII string and length limit 7, 8 or 9, Solaris 10
1229 * 05/08 returns 18 and modifies 10 bytes. It respects limits above or
1230 * below that range.
1231 *
1232 * The bug is present in Solaris 8 as well; it is absent in Solaris 10
1233 * 01/13 and Solaris 11.2. Affected locales include is_IS.ISO8859-1,
1234 * en_US.UTF-8, en_US.ISO8859-1, and ru_RU.KOI8-R. Unaffected locales
1235 * include de_DE.UTF-8, de_DE.ISO8859-1, zh_TW.UTF-8, and C.
1236 */
1237 buf[7] = canary;
1238 (void) strxfrm(buf, "ab", 7);
1239 if (buf[7] != canary)
1240 ok = false;
1241
1242 /*
1243 * illumos bug #1594 was present in the source tree from 2010-10-11 to
1244 * 2012-02-01. Given an ASCII string of any length and length limit 1,
1245 * affected systems ignore the length limit and modify a number of bytes
1246 * one less than the return value. The problem inputs for this bug do not
1247 * overlap those for the Solaris bug, hence a distinct test.
1248 *
1249 * Affected systems include smartos-20110926T021612Z. Affected locales
1250 * include en_US.ISO8859-1 and en_US.UTF-8. Unaffected locales include C.
1251 */
1252 buf[1] = canary;
1253 (void) strxfrm(buf, "a", 1);
1254 if (buf[1] != canary)
1255 ok = false;
1256
1257 if (!ok)
1258 ereport(ERROR,
1259 (errcode(ERRCODE_SYSTEM_ERROR),
1260 errmsg_internal("strxfrm(), in locale \"%s\", writes past the specified array length",
1261 setlocale(LC_COLLATE, NULL)),
1262 errhint("Apply system library package updates.")));
1263 }
1264
1265
1266 /*
1267 * Cache mechanism for collation information.
1268 *
1269 * We cache two flags: whether the collation's LC_COLLATE or LC_CTYPE is C
1270 * (or POSIX), so we can optimize a few code paths in various places.
1271 * For the built-in C and POSIX collations, we can know that without even
1272 * doing a cache lookup, but we want to support aliases for C/POSIX too.
1273 * For the "default" collation, there are separate static cache variables,
1274 * since consulting the pg_collation catalog doesn't tell us what we need.
1275 *
1276 * Also, if a pg_locale_t has been requested for a collation, we cache that
1277 * for the life of a backend.
1278 *
1279 * Note that some code relies on the flags not reporting false negatives
1280 * (that is, saying it's not C when it is). For example, char2wchar()
1281 * could fail if the locale is C, so str_tolower() shouldn't call it
1282 * in that case.
1283 *
1284 * Note that we currently lack any way to flush the cache. Since we don't
1285 * support ALTER COLLATION, this is OK. The worst case is that someone
1286 * drops a collation, and a useless cache entry hangs around in existing
1287 * backends.
1288 */
1289
1290 static collation_cache_entry *
lookup_collation_cache(Oid collation,bool set_flags)1291 lookup_collation_cache(Oid collation, bool set_flags)
1292 {
1293 collation_cache_entry *cache_entry;
1294 bool found;
1295
1296 Assert(OidIsValid(collation));
1297 Assert(collation != DEFAULT_COLLATION_OID);
1298
1299 if (collation_cache == NULL)
1300 {
1301 /* First time through, initialize the hash table */
1302 HASHCTL ctl;
1303
1304 memset(&ctl, 0, sizeof(ctl));
1305 ctl.keysize = sizeof(Oid);
1306 ctl.entrysize = sizeof(collation_cache_entry);
1307 collation_cache = hash_create("Collation cache", 100, &ctl,
1308 HASH_ELEM | HASH_BLOBS);
1309 }
1310
1311 cache_entry = hash_search(collation_cache, &collation, HASH_ENTER, &found);
1312 if (!found)
1313 {
1314 /*
1315 * Make sure cache entry is marked invalid, in case we fail before
1316 * setting things.
1317 */
1318 cache_entry->flags_valid = false;
1319 cache_entry->locale = 0;
1320 }
1321
1322 if (set_flags && !cache_entry->flags_valid)
1323 {
1324 /* Attempt to set the flags */
1325 HeapTuple tp;
1326 Form_pg_collation collform;
1327 const char *collcollate;
1328 const char *collctype;
1329
1330 tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collation));
1331 if (!HeapTupleIsValid(tp))
1332 elog(ERROR, "cache lookup failed for collation %u", collation);
1333 collform = (Form_pg_collation) GETSTRUCT(tp);
1334
1335 collcollate = NameStr(collform->collcollate);
1336 collctype = NameStr(collform->collctype);
1337
1338 cache_entry->collate_is_c = ((strcmp(collcollate, "C") == 0) ||
1339 (strcmp(collcollate, "POSIX") == 0));
1340 cache_entry->ctype_is_c = ((strcmp(collctype, "C") == 0) ||
1341 (strcmp(collctype, "POSIX") == 0));
1342
1343 cache_entry->flags_valid = true;
1344
1345 ReleaseSysCache(tp);
1346 }
1347
1348 return cache_entry;
1349 }
1350
1351
1352 /*
1353 * Detect whether collation's LC_COLLATE property is C
1354 */
1355 bool
lc_collate_is_c(Oid collation)1356 lc_collate_is_c(Oid collation)
1357 {
1358 /*
1359 * If we're asked about "collation 0", return false, so that the code will
1360 * go into the non-C path and report that the collation is bogus.
1361 */
1362 if (!OidIsValid(collation))
1363 return false;
1364
1365 /*
1366 * If we're asked about the default collation, we have to inquire of the C
1367 * library. Cache the result so we only have to compute it once.
1368 */
1369 if (collation == DEFAULT_COLLATION_OID)
1370 {
1371 static int result = -1;
1372 char *localeptr;
1373
1374 if (result >= 0)
1375 return (bool) result;
1376 localeptr = setlocale(LC_COLLATE, NULL);
1377 if (!localeptr)
1378 elog(ERROR, "invalid LC_COLLATE setting");
1379
1380 if (strcmp(localeptr, "C") == 0)
1381 result = true;
1382 else if (strcmp(localeptr, "POSIX") == 0)
1383 result = true;
1384 else
1385 result = false;
1386 return (bool) result;
1387 }
1388
1389 /*
1390 * If we're asked about the built-in C/POSIX collations, we know that.
1391 */
1392 if (collation == C_COLLATION_OID ||
1393 collation == POSIX_COLLATION_OID)
1394 return true;
1395
1396 /*
1397 * Otherwise, we have to consult pg_collation, but we cache that.
1398 */
1399 return (lookup_collation_cache(collation, true))->collate_is_c;
1400 }
1401
1402 /*
1403 * Detect whether collation's LC_CTYPE property is C
1404 */
1405 bool
lc_ctype_is_c(Oid collation)1406 lc_ctype_is_c(Oid collation)
1407 {
1408 /*
1409 * If we're asked about "collation 0", return false, so that the code will
1410 * go into the non-C path and report that the collation is bogus.
1411 */
1412 if (!OidIsValid(collation))
1413 return false;
1414
1415 /*
1416 * If we're asked about the default collation, we have to inquire of the C
1417 * library. Cache the result so we only have to compute it once.
1418 */
1419 if (collation == DEFAULT_COLLATION_OID)
1420 {
1421 static int result = -1;
1422 char *localeptr;
1423
1424 if (result >= 0)
1425 return (bool) result;
1426 localeptr = setlocale(LC_CTYPE, NULL);
1427 if (!localeptr)
1428 elog(ERROR, "invalid LC_CTYPE setting");
1429
1430 if (strcmp(localeptr, "C") == 0)
1431 result = true;
1432 else if (strcmp(localeptr, "POSIX") == 0)
1433 result = true;
1434 else
1435 result = false;
1436 return (bool) result;
1437 }
1438
1439 /*
1440 * If we're asked about the built-in C/POSIX collations, we know that.
1441 */
1442 if (collation == C_COLLATION_OID ||
1443 collation == POSIX_COLLATION_OID)
1444 return true;
1445
1446 /*
1447 * Otherwise, we have to consult pg_collation, but we cache that.
1448 */
1449 return (lookup_collation_cache(collation, true))->ctype_is_c;
1450 }
1451
1452
1453 /* simple subroutine for reporting errors from newlocale() */
1454 #ifdef HAVE_LOCALE_T
1455 static void
report_newlocale_failure(const char * localename)1456 report_newlocale_failure(const char *localename)
1457 {
1458 int save_errno;
1459
1460 /*
1461 * Windows doesn't provide any useful error indication from
1462 * _create_locale(), and BSD-derived platforms don't seem to feel they
1463 * need to set errno either (even though POSIX is pretty clear that
1464 * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1465 * is what to report.
1466 */
1467 if (errno == 0)
1468 errno = ENOENT;
1469
1470 /*
1471 * ENOENT means "no such locale", not "no such file", so clarify that
1472 * errno with an errdetail message.
1473 */
1474 save_errno = errno; /* auxiliary funcs might change errno */
1475 ereport(ERROR,
1476 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1477 errmsg("could not create locale \"%s\": %m",
1478 localename),
1479 (save_errno == ENOENT ?
1480 errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1481 localename) : 0)));
1482 }
1483 #endif /* HAVE_LOCALE_T */
1484
1485
1486 /*
1487 * Create a locale_t from a collation OID. Results are cached for the
1488 * lifetime of the backend. Thus, do not free the result with freelocale().
1489 *
1490 * As a special optimization, the default/database collation returns 0.
1491 * Callers should then revert to the non-locale_t-enabled code path.
1492 * In fact, they shouldn't call this function at all when they are dealing
1493 * with the default locale. That can save quite a bit in hotspots.
1494 * Also, callers should avoid calling this before going down a C/POSIX
1495 * fastpath, because such a fastpath should work even on platforms without
1496 * locale_t support in the C library.
1497 *
1498 * For simplicity, we always generate COLLATE + CTYPE even though we
1499 * might only need one of them. Since this is called only once per session,
1500 * it shouldn't cost much.
1501 */
1502 pg_locale_t
pg_newlocale_from_collation(Oid collid)1503 pg_newlocale_from_collation(Oid collid)
1504 {
1505 collation_cache_entry *cache_entry;
1506
1507 /* Callers must pass a valid OID */
1508 Assert(OidIsValid(collid));
1509
1510 /* Return 0 for "default" collation, just in case caller forgets */
1511 if (collid == DEFAULT_COLLATION_OID)
1512 return (pg_locale_t) 0;
1513
1514 cache_entry = lookup_collation_cache(collid, false);
1515
1516 if (cache_entry->locale == 0)
1517 {
1518 /* We haven't computed this yet in this session, so do it */
1519 HeapTuple tp;
1520 Form_pg_collation collform;
1521 const char *collcollate;
1522 const char *collctype pg_attribute_unused();
1523 struct pg_locale_struct result;
1524 pg_locale_t resultp;
1525 Datum collversion;
1526 bool isnull;
1527
1528 tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
1529 if (!HeapTupleIsValid(tp))
1530 elog(ERROR, "cache lookup failed for collation %u", collid);
1531 collform = (Form_pg_collation) GETSTRUCT(tp);
1532
1533 collcollate = NameStr(collform->collcollate);
1534 collctype = NameStr(collform->collctype);
1535
1536 /* We'll fill in the result struct locally before allocating memory */
1537 memset(&result, 0, sizeof(result));
1538 result.provider = collform->collprovider;
1539 result.deterministic = collform->collisdeterministic;
1540
1541 if (collform->collprovider == COLLPROVIDER_LIBC)
1542 {
1543 #ifdef HAVE_LOCALE_T
1544 locale_t loc;
1545
1546 if (strcmp(collcollate, collctype) == 0)
1547 {
1548 /* Normal case where they're the same */
1549 errno = 0;
1550 #ifndef WIN32
1551 loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collcollate,
1552 NULL);
1553 #else
1554 loc = _create_locale(LC_ALL, collcollate);
1555 #endif
1556 if (!loc)
1557 report_newlocale_failure(collcollate);
1558 }
1559 else
1560 {
1561 #ifndef WIN32
1562 /* We need two newlocale() steps */
1563 locale_t loc1;
1564
1565 errno = 0;
1566 loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL);
1567 if (!loc1)
1568 report_newlocale_failure(collcollate);
1569 errno = 0;
1570 loc = newlocale(LC_CTYPE_MASK, collctype, loc1);
1571 if (!loc)
1572 report_newlocale_failure(collctype);
1573 #else
1574
1575 /*
1576 * XXX The _create_locale() API doesn't appear to support
1577 * this. Could perhaps be worked around by changing
1578 * pg_locale_t to contain two separate fields.
1579 */
1580 ereport(ERROR,
1581 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1582 errmsg("collations with different collate and ctype values are not supported on this platform")));
1583 #endif
1584 }
1585
1586 result.info.lt = loc;
1587 #else /* not HAVE_LOCALE_T */
1588 /* platform that doesn't support locale_t */
1589 ereport(ERROR,
1590 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1591 errmsg("collation provider LIBC is not supported on this platform")));
1592 #endif /* not HAVE_LOCALE_T */
1593 }
1594 else if (collform->collprovider == COLLPROVIDER_ICU)
1595 {
1596 #ifdef USE_ICU
1597 UCollator *collator;
1598 UErrorCode status;
1599
1600 if (strcmp(collcollate, collctype) != 0)
1601 ereport(ERROR,
1602 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1603 errmsg("collations with different collate and ctype values are not supported by ICU")));
1604
1605 status = U_ZERO_ERROR;
1606 collator = ucol_open(collcollate, &status);
1607 if (U_FAILURE(status))
1608 ereport(ERROR,
1609 (errmsg("could not open collator for locale \"%s\": %s",
1610 collcollate, u_errorName(status))));
1611
1612 if (U_ICU_VERSION_MAJOR_NUM < 54)
1613 icu_set_collation_attributes(collator, collcollate);
1614
1615 /* We will leak this string if we get an error below :-( */
1616 result.info.icu.locale = MemoryContextStrdup(TopMemoryContext,
1617 collcollate);
1618 result.info.icu.ucol = collator;
1619 #else /* not USE_ICU */
1620 /* could get here if a collation was created by a build with ICU */
1621 ereport(ERROR,
1622 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1623 errmsg("ICU is not supported in this build"), \
1624 errhint("You need to rebuild PostgreSQL using --with-icu.")));
1625 #endif /* not USE_ICU */
1626 }
1627
1628 collversion = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
1629 &isnull);
1630 if (!isnull)
1631 {
1632 char *actual_versionstr;
1633 char *collversionstr;
1634
1635 actual_versionstr = get_collation_actual_version(collform->collprovider, collcollate);
1636 if (!actual_versionstr)
1637 {
1638 /*
1639 * This could happen when specifying a version in CREATE
1640 * COLLATION for a libc locale, or manually creating a mess in
1641 * the catalogs.
1642 */
1643 ereport(ERROR,
1644 (errmsg("collation \"%s\" has no actual version, but a version was specified",
1645 NameStr(collform->collname))));
1646 }
1647 collversionstr = TextDatumGetCString(collversion);
1648
1649 if (strcmp(actual_versionstr, collversionstr) != 0)
1650 ereport(WARNING,
1651 (errmsg("collation \"%s\" has version mismatch",
1652 NameStr(collform->collname)),
1653 errdetail("The collation in the database was created using version %s, "
1654 "but the operating system provides version %s.",
1655 collversionstr, actual_versionstr),
1656 errhint("Rebuild all objects affected by this collation and run "
1657 "ALTER COLLATION %s REFRESH VERSION, "
1658 "or build PostgreSQL with the right library version.",
1659 quote_qualified_identifier(get_namespace_name(collform->collnamespace),
1660 NameStr(collform->collname)))));
1661 }
1662
1663 ReleaseSysCache(tp);
1664
1665 /* We'll keep the pg_locale_t structures in TopMemoryContext */
1666 resultp = MemoryContextAlloc(TopMemoryContext, sizeof(*resultp));
1667 *resultp = result;
1668
1669 cache_entry->locale = resultp;
1670 }
1671
1672 return cache_entry->locale;
1673 }
1674
1675 /*
1676 * Get provider-specific collation version string for the given collation from
1677 * the operating system/library.
1678 */
1679 char *
get_collation_actual_version(char collprovider,const char * collcollate)1680 get_collation_actual_version(char collprovider, const char *collcollate)
1681 {
1682 char *collversion = NULL;
1683
1684 #ifdef USE_ICU
1685 if (collprovider == COLLPROVIDER_ICU)
1686 {
1687 UCollator *collator;
1688 UErrorCode status;
1689 UVersionInfo versioninfo;
1690 char buf[U_MAX_VERSION_STRING_LENGTH];
1691
1692 status = U_ZERO_ERROR;
1693 collator = ucol_open(collcollate, &status);
1694 if (U_FAILURE(status))
1695 ereport(ERROR,
1696 (errmsg("could not open collator for locale \"%s\": %s",
1697 collcollate, u_errorName(status))));
1698 ucol_getVersion(collator, versioninfo);
1699 ucol_close(collator);
1700
1701 u_versionToString(versioninfo, buf);
1702 collversion = pstrdup(buf);
1703 }
1704 else
1705 #endif
1706 if (collprovider == COLLPROVIDER_LIBC)
1707 {
1708 #if defined(__GLIBC__)
1709 char *copy = pstrdup(collcollate);
1710 char *copy_suffix = strstr(copy, ".");
1711 bool need_version = true;
1712
1713 /*
1714 * Check for names like C.UTF-8 by chopping off the encoding suffix on
1715 * our temporary copy, so we can skip the version.
1716 */
1717 if (copy_suffix)
1718 *copy_suffix = '\0';
1719 if (pg_strcasecmp("c", copy) == 0 ||
1720 pg_strcasecmp("posix", copy) == 0)
1721 need_version = false;
1722 pfree(copy);
1723 if (!need_version)
1724 return NULL;
1725
1726 /* Use the glibc version because we don't have anything better. */
1727 collversion = pstrdup(gnu_get_libc_version());
1728 #elif defined(WIN32) && _WIN32_WINNT >= 0x0600
1729 /*
1730 * If we are targeting Windows Vista and above, we can ask for a name
1731 * given a collation name (earlier versions required a location code
1732 * that we don't have).
1733 */
1734 NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
1735 WCHAR wide_collcollate[LOCALE_NAME_MAX_LENGTH];
1736
1737 /* These would be invalid arguments, but have no version. */
1738 if (pg_strcasecmp("c", collcollate) == 0 ||
1739 pg_strcasecmp("posix", collcollate) == 0)
1740 return NULL;
1741
1742 /* For all other names, ask the OS. */
1743 MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
1744 LOCALE_NAME_MAX_LENGTH);
1745 if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
1746 {
1747 /*
1748 * GetNLSVersionEx() wants a language tag such as "en-US", not a
1749 * locale name like "English_United States.1252". Until those
1750 * values can be prevented from entering the system, or 100%
1751 * reliably converted to the more useful tag format, tolerate the
1752 * resulting error and report that we have no version data.
1753 */
1754 if (GetLastError() == ERROR_INVALID_PARAMETER)
1755 return NULL;
1756
1757 ereport(ERROR,
1758 (errmsg("could not get collation version for locale \"%s\": error code %lu",
1759 collcollate,
1760 GetLastError())));
1761 }
1762 collversion = psprintf("%d.%d,%d.%d",
1763 (version.dwNLSVersion >> 8) & 0xFFFF,
1764 version.dwNLSVersion & 0xFF,
1765 (version.dwDefinedVersion >> 8) & 0xFFFF,
1766 version.dwDefinedVersion & 0xFF);
1767 #endif
1768 }
1769
1770 return collversion;
1771 }
1772
1773
1774 #ifdef USE_ICU
1775 /*
1776 * Converter object for converting between ICU's UChar strings and C strings
1777 * in database encoding. Since the database encoding doesn't change, we only
1778 * need one of these per session.
1779 */
1780 static UConverter *icu_converter = NULL;
1781
1782 static void
init_icu_converter(void)1783 init_icu_converter(void)
1784 {
1785 const char *icu_encoding_name;
1786 UErrorCode status;
1787 UConverter *conv;
1788
1789 if (icu_converter)
1790 return; /* already done */
1791
1792 icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
1793 if (!icu_encoding_name)
1794 ereport(ERROR,
1795 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1796 errmsg("encoding \"%s\" not supported by ICU",
1797 pg_encoding_to_char(GetDatabaseEncoding()))));
1798
1799 status = U_ZERO_ERROR;
1800 conv = ucnv_open(icu_encoding_name, &status);
1801 if (U_FAILURE(status))
1802 ereport(ERROR,
1803 (errmsg("could not open ICU converter for encoding \"%s\": %s",
1804 icu_encoding_name, u_errorName(status))));
1805
1806 icu_converter = conv;
1807 }
1808
1809 /*
1810 * Convert a string in the database encoding into a string of UChars.
1811 *
1812 * The source string at buff is of length nbytes
1813 * (it needn't be nul-terminated)
1814 *
1815 * *buff_uchar receives a pointer to the palloc'd result string, and
1816 * the function's result is the number of UChars generated.
1817 *
1818 * The result string is nul-terminated, though most callers rely on the
1819 * result length instead.
1820 */
1821 int32_t
icu_to_uchar(UChar ** buff_uchar,const char * buff,size_t nbytes)1822 icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
1823 {
1824 UErrorCode status;
1825 int32_t len_uchar;
1826
1827 init_icu_converter();
1828
1829 status = U_ZERO_ERROR;
1830 len_uchar = ucnv_toUChars(icu_converter, NULL, 0,
1831 buff, nbytes, &status);
1832 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
1833 ereport(ERROR,
1834 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1835
1836 *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
1837
1838 status = U_ZERO_ERROR;
1839 len_uchar = ucnv_toUChars(icu_converter, *buff_uchar, len_uchar + 1,
1840 buff, nbytes, &status);
1841 if (U_FAILURE(status))
1842 ereport(ERROR,
1843 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1844
1845 return len_uchar;
1846 }
1847
1848 /*
1849 * Convert a string of UChars into the database encoding.
1850 *
1851 * The source string at buff_uchar is of length len_uchar
1852 * (it needn't be nul-terminated)
1853 *
1854 * *result receives a pointer to the palloc'd result string, and the
1855 * function's result is the number of bytes generated (not counting nul).
1856 *
1857 * The result string is nul-terminated.
1858 */
1859 int32_t
icu_from_uchar(char ** result,const UChar * buff_uchar,int32_t len_uchar)1860 icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
1861 {
1862 UErrorCode status;
1863 int32_t len_result;
1864
1865 init_icu_converter();
1866
1867 status = U_ZERO_ERROR;
1868 len_result = ucnv_fromUChars(icu_converter, NULL, 0,
1869 buff_uchar, len_uchar, &status);
1870 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
1871 ereport(ERROR,
1872 (errmsg("%s failed: %s", "ucnv_fromUChars",
1873 u_errorName(status))));
1874
1875 *result = palloc(len_result + 1);
1876
1877 status = U_ZERO_ERROR;
1878 len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
1879 buff_uchar, len_uchar, &status);
1880 if (U_FAILURE(status))
1881 ereport(ERROR,
1882 (errmsg("%s failed: %s", "ucnv_fromUChars",
1883 u_errorName(status))));
1884
1885 return len_result;
1886 }
1887
1888 /*
1889 * Parse collation attributes and apply them to the open collator. This takes
1890 * a string like "und@colStrength=primary;colCaseLevel=yes" and parses and
1891 * applies the key-value arguments.
1892 *
1893 * Starting with ICU version 54, the attributes are processed automatically by
1894 * ucol_open(), so this is only necessary for emulating this behavior on older
1895 * versions.
1896 */
pg_attribute_unused()1897 pg_attribute_unused()
1898 static void
1899 icu_set_collation_attributes(UCollator *collator, const char *loc)
1900 {
1901 char *str = asc_tolower(loc, strlen(loc));
1902
1903 str = strchr(str, '@');
1904 if (!str)
1905 return;
1906 str++;
1907
1908 for (char *token = strtok(str, ";"); token; token = strtok(NULL, ";"))
1909 {
1910 char *e = strchr(token, '=');
1911
1912 if (e)
1913 {
1914 char *name;
1915 char *value;
1916 UColAttribute uattr;
1917 UColAttributeValue uvalue;
1918 UErrorCode status;
1919
1920 status = U_ZERO_ERROR;
1921
1922 *e = '\0';
1923 name = token;
1924 value = e + 1;
1925
1926 /*
1927 * See attribute name and value lists in ICU i18n/coll.cpp
1928 */
1929 if (strcmp(name, "colstrength") == 0)
1930 uattr = UCOL_STRENGTH;
1931 else if (strcmp(name, "colbackwards") == 0)
1932 uattr = UCOL_FRENCH_COLLATION;
1933 else if (strcmp(name, "colcaselevel") == 0)
1934 uattr = UCOL_CASE_LEVEL;
1935 else if (strcmp(name, "colcasefirst") == 0)
1936 uattr = UCOL_CASE_FIRST;
1937 else if (strcmp(name, "colalternate") == 0)
1938 uattr = UCOL_ALTERNATE_HANDLING;
1939 else if (strcmp(name, "colnormalization") == 0)
1940 uattr = UCOL_NORMALIZATION_MODE;
1941 else if (strcmp(name, "colnumeric") == 0)
1942 uattr = UCOL_NUMERIC_COLLATION;
1943 else
1944 /* ignore if unknown */
1945 continue;
1946
1947 if (strcmp(value, "primary") == 0)
1948 uvalue = UCOL_PRIMARY;
1949 else if (strcmp(value, "secondary") == 0)
1950 uvalue = UCOL_SECONDARY;
1951 else if (strcmp(value, "tertiary") == 0)
1952 uvalue = UCOL_TERTIARY;
1953 else if (strcmp(value, "quaternary") == 0)
1954 uvalue = UCOL_QUATERNARY;
1955 else if (strcmp(value, "identical") == 0)
1956 uvalue = UCOL_IDENTICAL;
1957 else if (strcmp(value, "no") == 0)
1958 uvalue = UCOL_OFF;
1959 else if (strcmp(value, "yes") == 0)
1960 uvalue = UCOL_ON;
1961 else if (strcmp(value, "shifted") == 0)
1962 uvalue = UCOL_SHIFTED;
1963 else if (strcmp(value, "non-ignorable") == 0)
1964 uvalue = UCOL_NON_IGNORABLE;
1965 else if (strcmp(value, "lower") == 0)
1966 uvalue = UCOL_LOWER_FIRST;
1967 else if (strcmp(value, "upper") == 0)
1968 uvalue = UCOL_UPPER_FIRST;
1969 else
1970 status = U_ILLEGAL_ARGUMENT_ERROR;
1971
1972 if (status == U_ZERO_ERROR)
1973 ucol_setAttribute(collator, uattr, uvalue, &status);
1974
1975 /*
1976 * Pretend the error came from ucol_open(), for consistent error
1977 * message across ICU versions.
1978 */
1979 if (U_FAILURE(status))
1980 ereport(ERROR,
1981 (errmsg("could not open collator for locale \"%s\": %s",
1982 loc, u_errorName(status))));
1983 }
1984 }
1985 }
1986
1987 #endif /* USE_ICU */
1988
1989 /*
1990 * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
1991 * Therefore we keep them here rather than with the mbutils code.
1992 */
1993
1994 /*
1995 * wchar2char --- convert wide characters to multibyte format
1996 *
1997 * This has the same API as the standard wcstombs_l() function; in particular,
1998 * tolen is the maximum number of bytes to store at *to, and *from must be
1999 * zero-terminated. The output will be zero-terminated iff there is room.
2000 */
2001 size_t
wchar2char(char * to,const wchar_t * from,size_t tolen,pg_locale_t locale)2002 wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
2003 {
2004 size_t result;
2005
2006 Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2007
2008 if (tolen == 0)
2009 return 0;
2010
2011 #ifdef WIN32
2012
2013 /*
2014 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
2015 * for some reason mbstowcs and wcstombs won't do this for us, so we use
2016 * MultiByteToWideChar().
2017 */
2018 if (GetDatabaseEncoding() == PG_UTF8)
2019 {
2020 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
2021 NULL, NULL);
2022 /* A zero return is failure */
2023 if (result <= 0)
2024 result = -1;
2025 else
2026 {
2027 Assert(result <= tolen);
2028 /* Microsoft counts the zero terminator in the result */
2029 result--;
2030 }
2031 }
2032 else
2033 #endif /* WIN32 */
2034 if (locale == (pg_locale_t) 0)
2035 {
2036 /* Use wcstombs directly for the default locale */
2037 result = wcstombs(to, from, tolen);
2038 }
2039 else
2040 {
2041 #ifdef HAVE_LOCALE_T
2042 #ifdef HAVE_WCSTOMBS_L
2043 /* Use wcstombs_l for nondefault locales */
2044 result = wcstombs_l(to, from, tolen, locale->info.lt);
2045 #else /* !HAVE_WCSTOMBS_L */
2046 /* We have to temporarily set the locale as current ... ugh */
2047 locale_t save_locale = uselocale(locale->info.lt);
2048
2049 result = wcstombs(to, from, tolen);
2050
2051 uselocale(save_locale);
2052 #endif /* HAVE_WCSTOMBS_L */
2053 #else /* !HAVE_LOCALE_T */
2054 /* Can't have locale != 0 without HAVE_LOCALE_T */
2055 elog(ERROR, "wcstombs_l is not available");
2056 result = 0; /* keep compiler quiet */
2057 #endif /* HAVE_LOCALE_T */
2058 }
2059
2060 return result;
2061 }
2062
2063 /*
2064 * char2wchar --- convert multibyte characters to wide characters
2065 *
2066 * This has almost the API of mbstowcs_l(), except that *from need not be
2067 * null-terminated; instead, the number of input bytes is specified as
2068 * fromlen. Also, we ereport() rather than returning -1 for invalid
2069 * input encoding. tolen is the maximum number of wchar_t's to store at *to.
2070 * The output will be zero-terminated iff there is room.
2071 */
2072 size_t
char2wchar(wchar_t * to,size_t tolen,const char * from,size_t fromlen,pg_locale_t locale)2073 char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
2074 pg_locale_t locale)
2075 {
2076 size_t result;
2077
2078 Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2079
2080 if (tolen == 0)
2081 return 0;
2082
2083 #ifdef WIN32
2084 /* See WIN32 "Unicode" comment above */
2085 if (GetDatabaseEncoding() == PG_UTF8)
2086 {
2087 /* Win32 API does not work for zero-length input */
2088 if (fromlen == 0)
2089 result = 0;
2090 else
2091 {
2092 result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
2093 /* A zero return is failure */
2094 if (result == 0)
2095 result = -1;
2096 }
2097
2098 if (result != -1)
2099 {
2100 Assert(result < tolen);
2101 /* Append trailing null wchar (MultiByteToWideChar() does not) */
2102 to[result] = 0;
2103 }
2104 }
2105 else
2106 #endif /* WIN32 */
2107 {
2108 /* mbstowcs requires ending '\0' */
2109 char *str = pnstrdup(from, fromlen);
2110
2111 if (locale == (pg_locale_t) 0)
2112 {
2113 /* Use mbstowcs directly for the default locale */
2114 result = mbstowcs(to, str, tolen);
2115 }
2116 else
2117 {
2118 #ifdef HAVE_LOCALE_T
2119 #ifdef HAVE_MBSTOWCS_L
2120 /* Use mbstowcs_l for nondefault locales */
2121 result = mbstowcs_l(to, str, tolen, locale->info.lt);
2122 #else /* !HAVE_MBSTOWCS_L */
2123 /* We have to temporarily set the locale as current ... ugh */
2124 locale_t save_locale = uselocale(locale->info.lt);
2125
2126 result = mbstowcs(to, str, tolen);
2127
2128 uselocale(save_locale);
2129 #endif /* HAVE_MBSTOWCS_L */
2130 #else /* !HAVE_LOCALE_T */
2131 /* Can't have locale != 0 without HAVE_LOCALE_T */
2132 elog(ERROR, "mbstowcs_l is not available");
2133 result = 0; /* keep compiler quiet */
2134 #endif /* HAVE_LOCALE_T */
2135 }
2136
2137 pfree(str);
2138 }
2139
2140 if (result == -1)
2141 {
2142 /*
2143 * Invalid multibyte character encountered. We try to give a useful
2144 * error message by letting pg_verifymbstr check the string. But it's
2145 * possible that the string is OK to us, and not OK to mbstowcs ---
2146 * this suggests that the LC_CTYPE locale is different from the
2147 * database encoding. Give a generic error message if pg_verifymbstr
2148 * can't find anything wrong.
2149 */
2150 pg_verifymbstr(from, fromlen, false); /* might not return */
2151 /* but if it does ... */
2152 ereport(ERROR,
2153 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
2154 errmsg("invalid multibyte character for locale"),
2155 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
2156 }
2157
2158 return result;
2159 }
2160