xref: /openbsd/gnu/usr.bin/perl/locale.c (revision 3bef86f7)
1 /*    locale.c
2  *
3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  *      A Elbereth Gilthoniel,
13  *      silivren penna míriel
14  *      o menel aglar elenath!
15  *      Na-chaered palan-díriel
16  *      o galadhremmin ennorath,
17  *      Fanuilos, le linnathon
18  *      nef aear, si nef aearon!
19  *
20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
21  */
22 
23 /* utility functions for handling locale-specific stuff like what
24  * character represents the decimal point.
25  *
26  * All C programs have an underlying locale.  Perl code generally doesn't pay
27  * any attention to it except within the scope of a 'use locale'.  For most
28  * categories, it accomplishes this by just using different operations if it is
29  * in such scope than if not.  However, various libc functions called by Perl
30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31  * are used to toggle between the current locale and the C locale depending on
32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
33  * switched to the C locale for outputting the message unless within the scope
34  * of 'use locale'.
35  *
36  * This code now has multi-thread-safe locale handling on systems that support
37  * that.  This is completely transparent to most XS code.  On earlier systems,
38  * it would be possible to emulate thread-safe locales, but this likely would
39  * involve a lot of locale switching, and would require XS code changes.
40  * Macros could be written so that the code wouldn't have to know which type of
41  * system is being used.  It's unlikely that we would ever do that, since most
42  * modern systems support thread-safe locales, but there was code written to
43  * this end, and is retained, #ifdef'd out.
44  */
45 
46 #include "EXTERN.h"
47 #define PERL_IN_LOCALE_C
48 #include "perl_langinfo.h"
49 #include "perl.h"
50 
51 #include "reentr.h"
52 
53 #ifdef I_WCHAR
54 #  include <wchar.h>
55 #endif
56 #ifdef I_WCTYPE
57 #  include <wctype.h>
58 #endif
59 
60 /* If the environment says to, we can output debugging information during
61  * initialization.  This is done before option parsing, and before any thread
62  * creation, so can be a file-level static */
63 #if ! defined(DEBUGGING)
64 #  define debug_initialization 0
65 #  define DEBUG_INITIALIZATION_set(v)
66 #else
67 static bool debug_initialization = FALSE;
68 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
69 #endif
70 
71 
72 /* Returns the Unix errno portion; ignoring any others.  This is a macro here
73  * instead of putting it into perl.h, because unclear to khw what should be
74  * done generally. */
75 #define GET_ERRNO   saved_errno
76 
77 /* strlen() of a literal string constant.  We might want this more general,
78  * but using it in just this file for now.  A problem with more generality is
79  * the compiler warnings about comparing unlike signs */
80 #define STRLENs(s)  (sizeof("" s "") - 1)
81 
82 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
83  * return of setlocale(), then this is extremely likely to be the C or POSIX
84  * locale.  However, the output of setlocale() is documented to be opaque, but
85  * the odds are extremely small that it would return these two strings for some
86  * other locale.  Note that VMS in these two locales includes many non-ASCII
87  * characters as controls and punctuation (below are hex bytes):
88  *   cntrl:  84-97 9B-9F
89  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
90  * Oddly, none there are listed as alphas, though some represent alphabetics
91  * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
92 #define isNAME_C_OR_POSIX(name)                                              \
93                              (   (name) != NULL                              \
94                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
95                                    || strEQ((name), "POSIX")))
96 
97 #ifdef USE_LOCALE
98 
99 /* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
100  * looked up.  This is in the form of a C string:  */
101 
102 #define UTF8NESS_SEP     "\v"
103 #define UTF8NESS_PREFIX  "\f"
104 
105 /* So, the string looks like:
106  *
107  *      \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
108  *
109  * where the digit 0 after the \a indicates that the locale starting just
110  * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
111 
112 STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
113 STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
114 
115 #define C_and_POSIX_utf8ness    UTF8NESS_SEP "C"     UTF8NESS_PREFIX "0"    \
116                                 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
117 
118 /* The cache is initialized to C_and_POSIX_utf8ness at start up.  These are
119  * kept there always.  The remining portion of the cache is LRU, with the
120  * oldest looked-up locale at the tail end */
121 
122 STATIC char *
123 S_stdize_locale(pTHX_ char *locs)
124 {
125     /* Standardize the locale name from a string returned by 'setlocale',
126      * possibly modifying that string.
127      *
128      * The typical return value of setlocale() is either
129      * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
130      * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
131      *     (the space-separated values represent the various sublocales,
132      *      in some unspecified order).  This is not handled by this function.
133      *
134      * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
135      * which is harmful for further use of the string in setlocale().  This
136      * function removes the trailing new line and everything up through the '='
137      * */
138 
139     const char * const s = strchr(locs, '=');
140     bool okay = TRUE;
141 
142     PERL_ARGS_ASSERT_STDIZE_LOCALE;
143 
144     if (s) {
145         const char * const t = strchr(s, '.');
146         okay = FALSE;
147         if (t) {
148             const char * const u = strchr(t, '\n');
149             if (u && (u[1] == 0)) {
150                 const STRLEN len = u - s;
151                 Move(s + 1, locs, len, char);
152                 locs[len] = 0;
153                 okay = TRUE;
154             }
155         }
156     }
157 
158     if (!okay)
159         Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
160 
161     return locs;
162 }
163 
164 /* Two parallel arrays; first the locale categories Perl uses on this system;
165  * the second array is their names.  These arrays are in mostly arbitrary
166  * order. */
167 
168 const int categories[] = {
169 
170 #    ifdef USE_LOCALE_NUMERIC
171                              LC_NUMERIC,
172 #    endif
173 #    ifdef USE_LOCALE_CTYPE
174                              LC_CTYPE,
175 #    endif
176 #    ifdef USE_LOCALE_COLLATE
177                              LC_COLLATE,
178 #    endif
179 #    ifdef USE_LOCALE_TIME
180                              LC_TIME,
181 #    endif
182 #    ifdef USE_LOCALE_MESSAGES
183                              LC_MESSAGES,
184 #    endif
185 #    ifdef USE_LOCALE_MONETARY
186                              LC_MONETARY,
187 #    endif
188 #    ifdef USE_LOCALE_ADDRESS
189                              LC_ADDRESS,
190 #    endif
191 #    ifdef USE_LOCALE_IDENTIFICATION
192                              LC_IDENTIFICATION,
193 #    endif
194 #    ifdef USE_LOCALE_MEASUREMENT
195                              LC_MEASUREMENT,
196 #    endif
197 #    ifdef USE_LOCALE_PAPER
198                              LC_PAPER,
199 #    endif
200 #    ifdef USE_LOCALE_TELEPHONE
201                              LC_TELEPHONE,
202 #    endif
203 #    ifdef USE_LOCALE_SYNTAX
204                              LC_SYNTAX,
205 #    endif
206 #    ifdef USE_LOCALE_TOD
207                              LC_TOD,
208 #    endif
209 #    ifdef LC_ALL
210                              LC_ALL,
211 #    endif
212                             -1  /* Placeholder because C doesn't allow a
213                                    trailing comma, and it would get complicated
214                                    with all the #ifdef's */
215 };
216 
217 /* The top-most real element is LC_ALL */
218 
219 const char * const category_names[] = {
220 
221 #    ifdef USE_LOCALE_NUMERIC
222                                  "LC_NUMERIC",
223 #    endif
224 #    ifdef USE_LOCALE_CTYPE
225                                  "LC_CTYPE",
226 #    endif
227 #    ifdef USE_LOCALE_COLLATE
228                                  "LC_COLLATE",
229 #    endif
230 #    ifdef USE_LOCALE_TIME
231                                  "LC_TIME",
232 #    endif
233 #    ifdef USE_LOCALE_MESSAGES
234                                  "LC_MESSAGES",
235 #    endif
236 #    ifdef USE_LOCALE_MONETARY
237                                  "LC_MONETARY",
238 #    endif
239 #    ifdef USE_LOCALE_ADDRESS
240                                  "LC_ADDRESS",
241 #    endif
242 #    ifdef USE_LOCALE_IDENTIFICATION
243                                  "LC_IDENTIFICATION",
244 #    endif
245 #    ifdef USE_LOCALE_MEASUREMENT
246                                  "LC_MEASUREMENT",
247 #    endif
248 #    ifdef USE_LOCALE_PAPER
249                                  "LC_PAPER",
250 #    endif
251 #    ifdef USE_LOCALE_TELEPHONE
252                                  "LC_TELEPHONE",
253 #    endif
254 #    ifdef USE_LOCALE_SYNTAX
255                                  "LC_SYNTAX",
256 #    endif
257 #    ifdef USE_LOCALE_TOD
258                                  "LC_TOD",
259 #    endif
260 #    ifdef LC_ALL
261                                  "LC_ALL",
262 #    endif
263                                  NULL  /* Placeholder */
264                             };
265 
266 #  ifdef LC_ALL
267 
268     /* On systems with LC_ALL, it is kept in the highest index position.  (-2
269      * to account for the final unused placeholder element.) */
270 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
271 
272 #  else
273 
274     /* On systems without LC_ALL, we pretend it is there, one beyond the real
275      * top element, hence in the unused placeholder element. */
276 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
277 
278 #  endif
279 
280 /* Pretending there is an LC_ALL element just above allows us to avoid most
281  * special cases.  Most loops through these arrays in the code below are
282  * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'.  They will work
283  * on either type of system.  But the code must be written to not access the
284  * element at 'LC_ALL_INDEX' except on platforms that have it.  This can be
285  * checked for at compile time by using the #define LC_ALL_INDEX which is only
286  * defined if we do have LC_ALL. */
287 
288 STATIC const char *
289 S_category_name(const int category)
290 {
291     unsigned int i;
292 
293 #ifdef LC_ALL
294 
295     if (category == LC_ALL) {
296         return "LC_ALL";
297     }
298 
299 #endif
300 
301     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
302         if (category == categories[i]) {
303             return category_names[i];
304         }
305     }
306 
307     {
308         const char suffix[] = " (unknown)";
309         int temp = category;
310         Size_t length = sizeof(suffix) + 1;
311         char * unknown;
312         dTHX;
313 
314         if (temp < 0) {
315             length++;
316             temp = - temp;
317         }
318 
319         /* Calculate the number of digits */
320         while (temp >= 10) {
321             temp /= 10;
322             length++;
323         }
324 
325         Newx(unknown, length, char);
326         my_snprintf(unknown, length, "%d%s", category, suffix);
327         SAVEFREEPV(unknown);
328         return unknown;
329     }
330 }
331 
332 /* Now create LC_foo_INDEX #defines for just those categories on this system */
333 #  ifdef USE_LOCALE_NUMERIC
334 #    define LC_NUMERIC_INDEX            0
335 #    define _DUMMY_NUMERIC              LC_NUMERIC_INDEX
336 #  else
337 #    define _DUMMY_NUMERIC              -1
338 #  endif
339 #  ifdef USE_LOCALE_CTYPE
340 #    define LC_CTYPE_INDEX              _DUMMY_NUMERIC + 1
341 #    define _DUMMY_CTYPE                LC_CTYPE_INDEX
342 #  else
343 #    define _DUMMY_CTYPE                _DUMMY_NUMERIC
344 #  endif
345 #  ifdef USE_LOCALE_COLLATE
346 #    define LC_COLLATE_INDEX            _DUMMY_CTYPE + 1
347 #    define _DUMMY_COLLATE              LC_COLLATE_INDEX
348 #  else
349 #    define _DUMMY_COLLATE              _DUMMY_CTYPE
350 #  endif
351 #  ifdef USE_LOCALE_TIME
352 #    define LC_TIME_INDEX               _DUMMY_COLLATE + 1
353 #    define _DUMMY_TIME                 LC_TIME_INDEX
354 #  else
355 #    define _DUMMY_TIME                 _DUMMY_COLLATE
356 #  endif
357 #  ifdef USE_LOCALE_MESSAGES
358 #    define LC_MESSAGES_INDEX           _DUMMY_TIME + 1
359 #    define _DUMMY_MESSAGES             LC_MESSAGES_INDEX
360 #  else
361 #    define _DUMMY_MESSAGES             _DUMMY_TIME
362 #  endif
363 #  ifdef USE_LOCALE_MONETARY
364 #    define LC_MONETARY_INDEX           _DUMMY_MESSAGES + 1
365 #    define _DUMMY_MONETARY             LC_MONETARY_INDEX
366 #  else
367 #    define _DUMMY_MONETARY             _DUMMY_MESSAGES
368 #  endif
369 #  ifdef USE_LOCALE_ADDRESS
370 #    define LC_ADDRESS_INDEX            _DUMMY_MONETARY + 1
371 #    define _DUMMY_ADDRESS              LC_ADDRESS_INDEX
372 #  else
373 #    define _DUMMY_ADDRESS              _DUMMY_MONETARY
374 #  endif
375 #  ifdef USE_LOCALE_IDENTIFICATION
376 #    define LC_IDENTIFICATION_INDEX     _DUMMY_ADDRESS + 1
377 #    define _DUMMY_IDENTIFICATION       LC_IDENTIFICATION_INDEX
378 #  else
379 #    define _DUMMY_IDENTIFICATION       _DUMMY_ADDRESS
380 #  endif
381 #  ifdef USE_LOCALE_MEASUREMENT
382 #    define LC_MEASUREMENT_INDEX        _DUMMY_IDENTIFICATION + 1
383 #    define _DUMMY_MEASUREMENT          LC_MEASUREMENT_INDEX
384 #  else
385 #    define _DUMMY_MEASUREMENT          _DUMMY_IDENTIFICATION
386 #  endif
387 #  ifdef USE_LOCALE_PAPER
388 #    define LC_PAPER_INDEX              _DUMMY_MEASUREMENT + 1
389 #    define _DUMMY_PAPER                LC_PAPER_INDEX
390 #  else
391 #    define _DUMMY_PAPER                _DUMMY_MEASUREMENT
392 #  endif
393 #  ifdef USE_LOCALE_TELEPHONE
394 #    define LC_TELEPHONE_INDEX          _DUMMY_PAPER + 1
395 #    define _DUMMY_TELEPHONE            LC_TELEPHONE_INDEX
396 #  else
397 #    define _DUMMY_TELEPHONE            _DUMMY_PAPER
398 #  endif
399 #  ifdef USE_LOCALE_SYNTAX
400 #    define LC_SYNTAX_INDEX             _DUMMY_TELEPHONE + 1
401 #    define _DUMMY_SYNTAX               LC_SYNTAX_INDEX
402 #  else
403 #    define _DUMMY_SYNTAX               _DUMMY_TELEPHONE
404 #  endif
405 #  ifdef USE_LOCALE_TOD
406 #    define LC_TOD_INDEX                _DUMMY_SYNTAX + 1
407 #    define _DUMMY_TOD                  LC_TOD_INDEX
408 #  else
409 #    define _DUMMY_TOD                  _DUMMY_SYNTAX
410 #  endif
411 #  ifdef LC_ALL
412 #    define LC_ALL_INDEX                _DUMMY_TOD + 1
413 #  endif
414 #endif /* ifdef USE_LOCALE */
415 
416 /* Windows requres a customized base-level setlocale() */
417 #ifdef WIN32
418 #  define my_setlocale(cat, locale) win32_setlocale(cat, locale)
419 #else
420 #  define my_setlocale(cat, locale) setlocale(cat, locale)
421 #endif
422 
423 #ifndef USE_POSIX_2008_LOCALE
424 
425 /* "do_setlocale_c" is intended to be called when the category is a constant
426  * known at compile time; "do_setlocale_r", not known until run time  */
427 #  define do_setlocale_c(cat, locale) my_setlocale(cat, locale)
428 #  define do_setlocale_r(cat, locale) my_setlocale(cat, locale)
429 #  define FIX_GLIBC_LC_MESSAGES_BUG(i)
430 
431 #else   /* Below uses POSIX 2008 */
432 
433 /* We emulate setlocale with our own function.  LC_foo is not valid for the
434  * POSIX 2008 functions.  Instead LC_foo_MASK is used, which we use an array
435  * lookup to convert to.  At compile time we have defined LC_foo_INDEX as the
436  * proper offset into the array 'category_masks[]'.  At runtime, we have to
437  * search through the array (as the actual numbers may not be small contiguous
438  * positive integers which would lend themselves to array lookup). */
439 #  define do_setlocale_c(cat, locale)                                       \
440                         emulate_setlocale(cat, locale, cat ## _INDEX, TRUE)
441 #  define do_setlocale_r(cat, locale) emulate_setlocale(cat, locale, 0, FALSE)
442 
443 #  if ! defined(__GLIBC__) || ! defined(USE_LOCALE_MESSAGES)
444 
445 #    define FIX_GLIBC_LC_MESSAGES_BUG(i)
446 
447 #  else /* Invalidate glibc cache of loaded translations, see [perl #134264] */
448 
449 #    include <libintl.h>
450 #    define FIX_GLIBC_LC_MESSAGES_BUG(i)                                        \
451         STMT_START {                                                        \
452             if ((i) == LC_MESSAGES_INDEX) {                                 \
453                 textdomain(textdomain(NULL));                               \
454             }                                                               \
455         } STMT_END
456 
457 #  endif
458 
459 /* A third array, parallel to the ones above to map from category to its
460  * equivalent mask */
461 const int category_masks[] = {
462 #  ifdef USE_LOCALE_NUMERIC
463                                 LC_NUMERIC_MASK,
464 #  endif
465 #  ifdef USE_LOCALE_CTYPE
466                                 LC_CTYPE_MASK,
467 #  endif
468 #  ifdef USE_LOCALE_COLLATE
469                                 LC_COLLATE_MASK,
470 #  endif
471 #  ifdef USE_LOCALE_TIME
472                                 LC_TIME_MASK,
473 #  endif
474 #  ifdef USE_LOCALE_MESSAGES
475                                 LC_MESSAGES_MASK,
476 #  endif
477 #  ifdef USE_LOCALE_MONETARY
478                                 LC_MONETARY_MASK,
479 #  endif
480 #  ifdef USE_LOCALE_ADDRESS
481                                 LC_ADDRESS_MASK,
482 #  endif
483 #  ifdef USE_LOCALE_IDENTIFICATION
484                                 LC_IDENTIFICATION_MASK,
485 #  endif
486 #  ifdef USE_LOCALE_MEASUREMENT
487                                 LC_MEASUREMENT_MASK,
488 #  endif
489 #  ifdef USE_LOCALE_PAPER
490                                 LC_PAPER_MASK,
491 #  endif
492 #  ifdef USE_LOCALE_TELEPHONE
493                                 LC_TELEPHONE_MASK,
494 #  endif
495 #  ifdef USE_LOCALE_SYNTAX
496                                 LC_SYNTAX_MASK,
497 #  endif
498 #  ifdef USE_LOCALE_TOD
499                                 LC_TOD_MASK,
500 #  endif
501                                 /* LC_ALL can't be turned off by a Configure
502                                  * option, and in Posix 2008, should always be
503                                  * here, so compile it in unconditionally.
504                                  * This could catch some glitches at compile
505                                  * time */
506                                 LC_ALL_MASK
507                             };
508 
509 STATIC const char *
510 S_emulate_setlocale(const int category,
511                     const char * locale,
512                     unsigned int index,
513                     const bool is_index_valid
514                    )
515 {
516     /* This function effectively performs a setlocale() on just the current
517      * thread; thus it is thread-safe.  It does this by using the POSIX 2008
518      * locale functions to emulate the behavior of setlocale().  Similar to
519      * regular setlocale(), the return from this function points to memory that
520      * can be overwritten by other system calls, so needs to be copied
521      * immediately if you need to retain it.  The difference here is that
522      * system calls besides another setlocale() can overwrite it.
523      *
524      * By doing this, most locale-sensitive functions become thread-safe.  The
525      * exceptions are mostly those that return a pointer to static memory.
526      *
527      * This function takes the same parameters, 'category' and 'locale', that
528      * the regular setlocale() function does, but it also takes two additional
529      * ones.  This is because the 2008 functions don't use a category; instead
530      * they use a corresponding mask.  Because this function operates in both
531      * worlds, it may need one or the other or both.  This function can
532      * calculate the mask from the input category, but to avoid this
533      * calculation, if the caller knows at compile time what the mask is, it
534      * can pass it, setting 'is_index_valid' to TRUE; otherwise the mask
535      * parameter is ignored.
536      *
537      * POSIX 2008, for some sick reason, chose not to provide a method to find
538      * the category name of a locale.  Some vendors have created a
539      * querylocale() function to do just that.  This function is a lot simpler
540      * to implement on systems that have this.  Otherwise, we have to keep
541      * track of what the locale has been set to, so that we can return its
542      * name to emulate setlocale().  It's also possible for C code in some
543      * library to change the locale without us knowing it, though as of
544      * September 2017, there are no occurrences in CPAN of uselocale().  Some
545      * libraries do use setlocale(), but that changes the global locale, and
546      * threads using per-thread locales will just ignore those changes.
547      * Another problem is that without querylocale(), we have to guess at what
548      * was meant by setting a locale of "".  We handle this by not actually
549      * ever setting to "" (unless querylocale exists), but to emulate what we
550      * think should happen for "".
551      */
552 
553     int mask;
554     locale_t old_obj;
555     locale_t new_obj;
556     const char * safelocale = locale ? locale : "(null)";
557     dTHX;
558 
559 #  ifdef DEBUGGING
560 
561     if (DEBUG_Lv_TEST || debug_initialization) {
562         PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale input=%d (%s), \"%s\", %d, %d\n", __FILE__, __LINE__, category, category_name(category), safelocale, index, is_index_valid);
563     }
564 
565 #  endif
566 
567     /* If the input mask might be incorrect, calculate the correct one */
568     if (! is_index_valid) {
569         unsigned int i;
570 
571 #  ifdef DEBUGGING
572 
573         if (DEBUG_Lv_TEST || debug_initialization) {
574             PerlIO_printf(Perl_debug_log, "%s:%d: finding index of category %d (%s)\n", __FILE__, __LINE__, category, category_name(category));
575         }
576 
577 #  endif
578 
579         for (i = 0; i <= LC_ALL_INDEX; i++) {
580             if (category == categories[i]) {
581                 index = i;
582                 goto found_index;
583             }
584         }
585 
586         /* Here, we don't know about this category, so can't handle it.
587          * Fallback to the early POSIX usages */
588         Perl_warner(aTHX_ packWARN(WARN_LOCALE),
589                             "Unknown locale category %d; can't set it to %s\n",
590                                                      category, safelocale);
591         return NULL;
592 
593       found_index: ;
594 
595 #  ifdef DEBUGGING
596 
597         if (DEBUG_Lv_TEST || debug_initialization) {
598             PerlIO_printf(Perl_debug_log, "%s:%d: index is %d for %s\n", __FILE__, __LINE__, index, category_name(category));
599         }
600 
601 #  endif
602 
603     }
604 
605     mask = category_masks[index];
606 
607 #  ifdef DEBUGGING
608 
609     if (DEBUG_Lv_TEST || debug_initialization) {
610         PerlIO_printf(Perl_debug_log, "%s:%d: category name is %s; mask is 0x%x\n", __FILE__, __LINE__, category_names[index], mask);
611     }
612 
613 #  endif
614 
615     /* If just querying what the existing locale is ... */
616     if (locale == NULL) {
617         locale_t cur_obj = uselocale((locale_t) 0);
618 
619 #  ifdef DEBUGGING
620 
621         if (DEBUG_Lv_TEST || debug_initialization) {
622             PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale querying %p\n", __FILE__, __LINE__, cur_obj);
623         }
624 
625 #  endif
626 
627         if (cur_obj == LC_GLOBAL_LOCALE) {
628             return my_setlocale(category, NULL);
629         }
630 
631 #  ifdef HAS_QUERYLOCALE
632 
633         return (char *) querylocale(mask, cur_obj);
634 
635 #  else
636 
637         /* If this assert fails, adjust the size of curlocales in intrpvar.h */
638         STATIC_ASSERT_STMT(C_ARRAY_LENGTH(PL_curlocales) > LC_ALL_INDEX);
639 
640 #    if   defined(_NL_LOCALE_NAME)                                          \
641      &&   defined(DEBUGGING)                                                \
642           /* On systems that accept any locale name, the real underlying    \
643            * locale is often returned by this internal function, so we      \
644            * can't use it */                                                \
645      && ! defined(SETLOCALE_ACCEPTS_ANY_LOCALE_NAME)
646         {
647             /* Internal glibc for querylocale(), but doesn't handle
648              * empty-string ("") locale properly; who knows what other
649              * glitches.  Check for it now, under debug. */
650 
651             char * temp_name = nl_langinfo_l(_NL_LOCALE_NAME(category),
652                                              uselocale((locale_t) 0));
653             /*
654             PerlIO_printf(Perl_debug_log, "%s:%d: temp_name=%s\n", __FILE__, __LINE__, temp_name ? temp_name : "NULL");
655             PerlIO_printf(Perl_debug_log, "%s:%d: index=%d\n", __FILE__, __LINE__, index);
656             PerlIO_printf(Perl_debug_log, "%s:%d: PL_curlocales[index]=%s\n", __FILE__, __LINE__, PL_curlocales[index]);
657             */
658             if (temp_name && PL_curlocales[index] && strNE(temp_name, "")) {
659                 if (         strNE(PL_curlocales[index], temp_name)
660                     && ! (   isNAME_C_OR_POSIX(temp_name)
661                           && isNAME_C_OR_POSIX(PL_curlocales[index]))) {
662 
663 #      ifdef USE_C_BACKTRACE
664 
665                     dump_c_backtrace(Perl_debug_log, 20, 1);
666 
667 #      endif
668 
669                     Perl_croak(aTHX_ "panic: Mismatch between what Perl thinks %s is"
670                                      " (%s) and what internal glibc thinks"
671                                      " (%s)\n", category_names[index],
672                                      PL_curlocales[index], temp_name);
673                 }
674 
675                 return temp_name;
676             }
677         }
678 
679 #    endif
680 
681         /* Without querylocale(), we have to use our record-keeping we've
682          *  done. */
683 
684         if (category != LC_ALL) {
685 
686 #    ifdef DEBUGGING
687 
688             if (DEBUG_Lv_TEST || debug_initialization) {
689                 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[index]);
690             }
691 
692 #    endif
693 
694             return PL_curlocales[index];
695         }
696         else {  /* For LC_ALL */
697             unsigned int i;
698             Size_t names_len = 0;
699             char * all_string;
700             bool are_all_categories_the_same_locale = TRUE;
701 
702             /* If we have a valid LC_ALL value, just return it */
703             if (PL_curlocales[LC_ALL_INDEX]) {
704 
705 #    ifdef DEBUGGING
706 
707                 if (DEBUG_Lv_TEST || debug_initialization) {
708                     PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[LC_ALL_INDEX]);
709                 }
710 
711 #    endif
712 
713                 return PL_curlocales[LC_ALL_INDEX];
714             }
715 
716             /* Otherwise, we need to construct a string of name=value pairs.
717              * We use the glibc syntax, like
718              *      LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
719              *  First calculate the needed size.  Along the way, check if all
720              *  the locale names are the same */
721             for (i = 0; i < LC_ALL_INDEX; i++) {
722 
723 #    ifdef DEBUGGING
724 
725                 if (DEBUG_Lv_TEST || debug_initialization) {
726                     PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]);
727                 }
728 
729 #    endif
730 
731                 names_len += strlen(category_names[i])
732                           + 1                       /* '=' */
733                           + strlen(PL_curlocales[i])
734                           + 1;                      /* ';' */
735 
736                 if (i > 0 && strNE(PL_curlocales[i], PL_curlocales[i-1])) {
737                     are_all_categories_the_same_locale = FALSE;
738                 }
739             }
740 
741             /* If they are the same, we don't actually have to construct the
742              * string; we just make the entry in LC_ALL_INDEX valid, and be
743              * that single name */
744             if (are_all_categories_the_same_locale) {
745                 PL_curlocales[LC_ALL_INDEX] = savepv(PL_curlocales[0]);
746                 return PL_curlocales[LC_ALL_INDEX];
747             }
748 
749             names_len++;    /* Trailing '\0' */
750             SAVEFREEPV(Newx(all_string, names_len, char));
751             *all_string = '\0';
752 
753             /* Then fill in the string */
754             for (i = 0; i < LC_ALL_INDEX; i++) {
755 
756 #    ifdef DEBUGGING
757 
758                 if (DEBUG_Lv_TEST || debug_initialization) {
759                     PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]);
760                 }
761 
762 #    endif
763 
764                 my_strlcat(all_string, category_names[i], names_len);
765                 my_strlcat(all_string, "=", names_len);
766                 my_strlcat(all_string, PL_curlocales[i], names_len);
767                 my_strlcat(all_string, ";", names_len);
768             }
769 
770 #    ifdef DEBUGGING
771 
772             if (DEBUG_L_TEST || debug_initialization) {
773                 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, all_string);
774             }
775 
776     #endif
777 
778             return all_string;
779         }
780 
781 #    ifdef EINVAL
782 
783         SETERRNO(EINVAL, LIB_INVARG);
784 
785 #    endif
786 
787         return NULL;
788 
789 #  endif
790 
791     }   /* End of this being setlocale(LC_foo, NULL) */
792 
793     /* Here, we are switching locales. */
794 
795 #  ifndef HAS_QUERYLOCALE
796 
797     if (strEQ(locale, "")) {
798 
799         /* For non-querylocale() systems, we do the setting of "" ourselves to
800          * be sure that we really know what's going on.  We follow the Linux
801          * documented behavior (but if that differs from the actual behavior,
802          * this won't work exactly as the OS implements).  We go out and
803          * examine the environment based on our understanding of how the system
804          * works, and use that to figure things out */
805 
806         const char * const lc_all = PerlEnv_getenv("LC_ALL");
807 
808         /* Use any "LC_ALL" environment variable, as it overrides everything
809          * else. */
810         if (lc_all && strNE(lc_all, "")) {
811             locale = lc_all;
812         }
813         else {
814 
815             /* Otherwise, we need to dig deeper.  Unless overridden, the
816              * default is the LANG environment variable; if it doesn't exist,
817              * then "C" */
818 
819             const char * default_name;
820 
821             default_name = PerlEnv_getenv("LANG");
822 
823             if (! default_name || strEQ(default_name, "")) {
824                 default_name = "C";
825             }
826 
827             if (category != LC_ALL) {
828                 const char * const name = PerlEnv_getenv(category_names[index]);
829 
830                 /* Here we are setting a single category.  Assume will have the
831                  * default name */
832                 locale = default_name;
833 
834                 /* But then look for an overriding environment variable */
835                 if (name && strNE(name, "")) {
836                     locale = name;
837                 }
838             }
839             else {
840                 bool did_override = FALSE;
841                 unsigned int i;
842 
843                 /* Here, we are getting LC_ALL.  Any categories that don't have
844                  * a corresponding environment variable set should be set to
845                  * LANG, or to "C" if there is no LANG.  If no individual
846                  * categories differ from this, we can just set LC_ALL.  This
847                  * is buggy on systems that have extra categories that we don't
848                  * know about.  If there is an environment variable that sets
849                  * that category, we won't know to look for it, and so our use
850                  * of LANG or "C" improperly overrides it.  On the other hand,
851                  * if we don't do what is done here, and there is no
852                  * environment variable, the category's locale should be set to
853                  * LANG or "C".  So there is no good solution.  khw thinks the
854                  * best is to look at systems to see what categories they have,
855                  * and include them, and then to assume that we know the
856                  * complete set */
857 
858                 for (i = 0; i < LC_ALL_INDEX; i++) {
859                     const char * const env_override
860                                             = PerlEnv_getenv(category_names[i]);
861                     const char * this_locale = (   env_override
862                                                 && strNE(env_override, ""))
863                                                ? env_override
864                                                : default_name;
865                     if (! emulate_setlocale(categories[i], this_locale, i, TRUE))
866                     {
867                         return NULL;
868                     }
869 
870                     if (strNE(this_locale, default_name)) {
871                         did_override = TRUE;
872                     }
873                 }
874 
875                 /* If all the categories are the same, we can set LC_ALL to
876                  * that */
877                 if (! did_override) {
878                     locale = default_name;
879                 }
880                 else {
881 
882                     /* Here, LC_ALL is no longer valid, as some individual
883                      * categories don't match it.  We call ourselves
884                      * recursively, as that will execute the code that
885                      * generates the proper locale string for this situation.
886                      * We don't do the remainder of this function, as that is
887                      * to update our records, and we've just done that for the
888                      * individual categories in the loop above, and doing so
889                      * would cause LC_ALL to be done as well */
890                     return emulate_setlocale(LC_ALL, NULL, LC_ALL_INDEX, TRUE);
891                 }
892             }
893         }
894     }   /* End of this being setlocale(LC_foo, "") */
895     else if (strchr(locale, ';')) {
896 
897         /* LC_ALL may actually incude a conglomeration of various categories.
898          * Without querylocale, this code uses the glibc (as of this writing)
899          * syntax for representing that, but that is not a stable API, and
900          * other platforms do it differently, so we have to handle all cases
901          * ourselves */
902 
903         unsigned int i;
904         const char * s = locale;
905         const char * e = locale + strlen(locale);
906         const char * p = s;
907         const char * category_end;
908         const char * name_start;
909         const char * name_end;
910 
911         /* If the string that gives what to set doesn't include all categories,
912          * the omitted ones get set to "C".  To get this behavior, first set
913          * all the individual categories to "C", and override the furnished
914          * ones below */
915         for (i = 0; i < LC_ALL_INDEX; i++) {
916             if (! emulate_setlocale(categories[i], "C", i, TRUE)) {
917                 return NULL;
918             }
919         }
920 
921         while (s < e) {
922 
923             /* Parse through the category */
924             while (isWORDCHAR(*p)) {
925                 p++;
926             }
927             category_end = p;
928 
929             if (*p++ != '=') {
930                 Perl_croak(aTHX_
931                     "panic: %s: %d: Unexpected character in locale name '%02X",
932                     __FILE__, __LINE__, *(p-1));
933             }
934 
935             /* Parse through the locale name */
936             name_start = p;
937             while (p < e && *p != ';') {
938                 if (! isGRAPH(*p)) {
939                     Perl_croak(aTHX_
940                         "panic: %s: %d: Unexpected character in locale name '%02X",
941                         __FILE__, __LINE__, *(p-1));
942                 }
943                 p++;
944             }
945             name_end = p;
946 
947             /* Space past the semi-colon */
948             if (p < e) {
949                 p++;
950             }
951 
952             /* Find the index of the category name in our lists */
953             for (i = 0; i < LC_ALL_INDEX; i++) {
954                 char * individ_locale;
955 
956                 /* Keep going if this isn't the index.  The strnNE() avoids a
957                  * Perl_form(), but would fail if ever a category name could be
958                  * a substring of another one, like if there were a
959                  * "LC_TIME_DATE" */
960                 if strnNE(s, category_names[i], category_end - s) {
961                     continue;
962                 }
963 
964                 /* If this index is for the single category we're changing, we
965                  * have found the locale to set it to. */
966                 if (category == categories[i]) {
967                     locale = Perl_form(aTHX_ "%.*s",
968                                              (int) (name_end - name_start),
969                                              name_start);
970                     goto ready_to_set;
971                 }
972 
973                 assert(category == LC_ALL);
974                 individ_locale = Perl_form(aTHX_ "%.*s",
975                                     (int) (name_end - name_start), name_start);
976                 if (! emulate_setlocale(categories[i], individ_locale, i, TRUE))
977                 {
978                     return NULL;
979                 }
980             }
981 
982             s = p;
983         }
984 
985         /* Here we have set all the individual categories by recursive calls.
986          * These collectively should have fixed up LC_ALL, so can just query
987          * what that now is */
988         assert(category == LC_ALL);
989 
990         return do_setlocale_c(LC_ALL, NULL);
991     }   /* End of this being setlocale(LC_ALL,
992            "LC_CTYPE=foo;LC_NUMERIC=bar;...") */
993 
994   ready_to_set: ;
995 
996     /* Here at the end of having to deal with the absence of querylocale().
997      * Some cases have already been fully handled by recursive calls to this
998      * function.  But at this point, we haven't dealt with those, but are now
999      * prepared to, knowing what the locale name to set this category to is.
1000      * This would have come for free if this system had had querylocale() */
1001 
1002 #  endif  /* end of ! querylocale */
1003 
1004     assert(PL_C_locale_obj);
1005 
1006     /* Switching locales generally entails freeing the current one's space (at
1007      * the C library's discretion).  We need to stop using that locale before
1008      * the switch.  So switch to a known locale object that we don't otherwise
1009      * mess with.  This returns the locale object in effect at the time of the
1010      * switch. */
1011     old_obj = uselocale(PL_C_locale_obj);
1012 
1013 #  ifdef DEBUGGING
1014 
1015     if (DEBUG_Lv_TEST || debug_initialization) {
1016         PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale was using %p\n", __FILE__, __LINE__, old_obj);
1017     }
1018 
1019 #  endif
1020 
1021     if (! old_obj) {
1022 
1023 #  ifdef DEBUGGING
1024 
1025         if (DEBUG_L_TEST || debug_initialization) {
1026             dSAVE_ERRNO;
1027             PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to C failed: %d\n", __FILE__, __LINE__, GET_ERRNO);
1028             RESTORE_ERRNO;
1029         }
1030 
1031 #  endif
1032 
1033         return NULL;
1034     }
1035 
1036 #  ifdef DEBUGGING
1037 
1038     if (DEBUG_Lv_TEST || debug_initialization) {
1039         PerlIO_printf(Perl_debug_log,
1040                       "%s:%d: emulate_setlocale now using %p\n",
1041                       __FILE__, __LINE__, PL_C_locale_obj);
1042     }
1043 
1044 #  endif
1045 
1046     /* If this call is to switch to the LC_ALL C locale, it already exists, and
1047      * in fact, we already have switched to it (in preparation for what
1048      * normally is to come).  But since we're already there, continue to use
1049      * it instead of trying to create a new locale */
1050     if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(locale)) {
1051 
1052 #  ifdef DEBUGGING
1053 
1054         if (DEBUG_Lv_TEST || debug_initialization) {
1055             PerlIO_printf(Perl_debug_log,
1056                           "%s:%d: will stay in C object\n", __FILE__, __LINE__);
1057         }
1058 
1059 #  endif
1060 
1061         new_obj = PL_C_locale_obj;
1062 
1063         /* We already had switched to the C locale in preparation for freeing
1064          * 'old_obj' */
1065         if (old_obj != LC_GLOBAL_LOCALE && old_obj != PL_C_locale_obj) {
1066             freelocale(old_obj);
1067         }
1068     }
1069     else {
1070         /* If we weren't in a thread safe locale, set so that newlocale() below
1071          * which uses 'old_obj', uses an empty one.  Same for our reserved C
1072          * object.  The latter is defensive coding, so that, even if there is
1073          * some bug, we will never end up trying to modify either of these, as
1074          * if passed to newlocale(), they can be. */
1075         if (old_obj == LC_GLOBAL_LOCALE || old_obj == PL_C_locale_obj) {
1076             old_obj = (locale_t) 0;
1077         }
1078 
1079         /* Ready to create a new locale by modification of the exising one */
1080         new_obj = newlocale(mask, locale, old_obj);
1081 
1082         if (! new_obj) {
1083             dSAVE_ERRNO;
1084 
1085 #  ifdef DEBUGGING
1086 
1087             if (DEBUG_L_TEST || debug_initialization) {
1088                 PerlIO_printf(Perl_debug_log,
1089                               "%s:%d: emulate_setlocale creating new object"
1090                               " failed: %d\n", __FILE__, __LINE__, GET_ERRNO);
1091             }
1092 
1093 #  endif
1094 
1095             if (! uselocale(old_obj)) {
1096 
1097 #  ifdef DEBUGGING
1098 
1099                 if (DEBUG_L_TEST || debug_initialization) {
1100                     PerlIO_printf(Perl_debug_log,
1101                                   "%s:%d: switching back failed: %d\n",
1102                                   __FILE__, __LINE__, GET_ERRNO);
1103                 }
1104 
1105 #  endif
1106 
1107             }
1108             RESTORE_ERRNO;
1109             return NULL;
1110         }
1111 
1112 #  ifdef DEBUGGING
1113 
1114         if (DEBUG_Lv_TEST || debug_initialization) {
1115             PerlIO_printf(Perl_debug_log,
1116                           "%s:%d: emulate_setlocale created %p",
1117                           __FILE__, __LINE__, new_obj);
1118             if (old_obj) {
1119                 PerlIO_printf(Perl_debug_log,
1120                               "; should have freed %p", old_obj);
1121             }
1122             PerlIO_printf(Perl_debug_log, "\n");
1123         }
1124 
1125 #  endif
1126 
1127         /* And switch into it */
1128         if (! uselocale(new_obj)) {
1129             dSAVE_ERRNO;
1130 
1131 #  ifdef DEBUGGING
1132 
1133             if (DEBUG_L_TEST || debug_initialization) {
1134                 PerlIO_printf(Perl_debug_log,
1135                               "%s:%d: emulate_setlocale switching to new object"
1136                               " failed\n", __FILE__, __LINE__);
1137             }
1138 
1139 #  endif
1140 
1141             if (! uselocale(old_obj)) {
1142 
1143 #  ifdef DEBUGGING
1144 
1145                 if (DEBUG_L_TEST || debug_initialization) {
1146                     PerlIO_printf(Perl_debug_log,
1147                                   "%s:%d: switching back failed: %d\n",
1148                                   __FILE__, __LINE__, GET_ERRNO);
1149                 }
1150 
1151 #  endif
1152 
1153             }
1154             freelocale(new_obj);
1155             RESTORE_ERRNO;
1156             return NULL;
1157         }
1158     }
1159 
1160 #  ifdef DEBUGGING
1161 
1162     if (DEBUG_Lv_TEST || debug_initialization) {
1163         PerlIO_printf(Perl_debug_log,
1164                       "%s:%d: emulate_setlocale now using %p\n",
1165                       __FILE__, __LINE__, new_obj);
1166     }
1167 
1168 #  endif
1169 
1170     /* We are done, except for updating our records (if the system doesn't keep
1171      * them) and in the case of locale "", we don't actually know what the
1172      * locale that got switched to is, as it came from the environment.  So
1173      * have to find it */
1174 
1175 #  ifdef HAS_QUERYLOCALE
1176 
1177     if (strEQ(locale, "")) {
1178         locale = querylocale(mask, new_obj);
1179     }
1180 
1181 #  else
1182 
1183     /* Here, 'locale' is the return value */
1184 
1185     /* Without querylocale(), we have to update our records */
1186 
1187     if (category == LC_ALL) {
1188         unsigned int i;
1189 
1190         /* For LC_ALL, we change all individual categories to correspond */
1191                               /* PL_curlocales is a parallel array, so has same
1192                                * length as 'categories' */
1193         for (i = 0; i <= LC_ALL_INDEX; i++) {
1194             Safefree(PL_curlocales[i]);
1195             PL_curlocales[i] = savepv(locale);
1196         }
1197 
1198         FIX_GLIBC_LC_MESSAGES_BUG(LC_MESSAGES_INDEX);
1199     }
1200     else {
1201 
1202         /* For a single category, if it's not the same as the one in LC_ALL, we
1203          * nullify LC_ALL */
1204 
1205         if (PL_curlocales[LC_ALL_INDEX] && strNE(PL_curlocales[LC_ALL_INDEX], locale)) {
1206             Safefree(PL_curlocales[LC_ALL_INDEX]);
1207             PL_curlocales[LC_ALL_INDEX] = NULL;
1208         }
1209 
1210         /* Then update the category's record */
1211         Safefree(PL_curlocales[index]);
1212         PL_curlocales[index] = savepv(locale);
1213 
1214         FIX_GLIBC_LC_MESSAGES_BUG(index);
1215     }
1216 
1217 #  endif
1218 
1219     return locale;
1220 }
1221 
1222 #endif /* USE_POSIX_2008_LOCALE */
1223 
1224 #ifdef USE_LOCALE
1225 
1226 STATIC void
1227 S_set_numeric_radix(pTHX_ const bool use_locale)
1228 {
1229     /* If 'use_locale' is FALSE, set to use a dot for the radix character.  If
1230      * TRUE, use the radix character derived from the current locale */
1231 
1232 #if defined(USE_LOCALE_NUMERIC) && (   defined(HAS_LOCALECONV)              \
1233                                     || defined(HAS_NL_LANGINFO))
1234 
1235     const char * radix = (use_locale)
1236                          ? my_nl_langinfo(RADIXCHAR, FALSE)
1237                                         /* FALSE => already in dest locale */
1238                          : ".";
1239 
1240         sv_setpv(PL_numeric_radix_sv, radix);
1241 
1242     /* If this is valid UTF-8 that isn't totally ASCII, and we are in
1243         * a UTF-8 locale, then mark the radix as being in UTF-8 */
1244     if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
1245                                             SvCUR(PL_numeric_radix_sv))
1246         && _is_cur_LC_category_utf8(LC_NUMERIC))
1247     {
1248         SvUTF8_on(PL_numeric_radix_sv);
1249     }
1250 
1251 #  ifdef DEBUGGING
1252 
1253     if (DEBUG_L_TEST || debug_initialization) {
1254         PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
1255                                            SvPVX(PL_numeric_radix_sv),
1256                                            cBOOL(SvUTF8(PL_numeric_radix_sv)));
1257     }
1258 
1259 #  endif
1260 #else
1261 
1262     PERL_UNUSED_ARG(use_locale);
1263 
1264 #endif /* USE_LOCALE_NUMERIC and can find the radix char */
1265 
1266 }
1267 
1268 STATIC void
1269 S_new_numeric(pTHX_ const char *newnum)
1270 {
1271 
1272 #ifndef USE_LOCALE_NUMERIC
1273 
1274     PERL_UNUSED_ARG(newnum);
1275 
1276 #else
1277 
1278     /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
1279      * core Perl this and that 'newnum' is the name of the new locale.
1280      * It installs this locale as the current underlying default.
1281      *
1282      * The default locale and the C locale can be toggled between by use of the
1283      * set_numeric_underlying() and set_numeric_standard() functions, which
1284      * should probably not be called directly, but only via macros like
1285      * SET_NUMERIC_STANDARD() in perl.h.
1286      *
1287      * The toggling is necessary mainly so that a non-dot radix decimal point
1288      * character can be output, while allowing internal calculations to use a
1289      * dot.
1290      *
1291      * This sets several interpreter-level variables:
1292      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
1293      * PL_numeric_underlying  A boolean indicating if the toggled state is such
1294      *                  that the current locale is the program's underlying
1295      *                  locale
1296      * PL_numeric_standard An int indicating if the toggled state is such
1297      *                  that the current locale is the C locale or
1298      *                  indistinguishable from the C locale.  If non-zero, it
1299      *                  is in C; if > 1, it means it may not be toggled away
1300      *                  from C.
1301      * PL_numeric_underlying_is_standard   A bool kept by this function
1302      *                  indicating that the underlying locale and the standard
1303      *                  C locale are indistinguishable for the purposes of
1304      *                  LC_NUMERIC.  This happens when both of the above two
1305      *                  variables are true at the same time.  (Toggling is a
1306      *                  no-op under these circumstances.)  This variable is
1307      *                  used to avoid having to recalculate.
1308      */
1309 
1310     char *save_newnum;
1311 
1312     if (! newnum) {
1313         Safefree(PL_numeric_name);
1314         PL_numeric_name = NULL;
1315         PL_numeric_standard = TRUE;
1316         PL_numeric_underlying = TRUE;
1317         PL_numeric_underlying_is_standard = TRUE;
1318         return;
1319     }
1320 
1321     save_newnum = stdize_locale(savepv(newnum));
1322     PL_numeric_underlying = TRUE;
1323     PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
1324 
1325 #ifndef TS_W32_BROKEN_LOCALECONV
1326 
1327     /* If its name isn't C nor POSIX, it could still be indistinguishable from
1328      * them.  But on broken Windows systems calling my_nl_langinfo() for
1329      * THOUSEP can currently (but rarely) cause a race, so avoid doing that,
1330      * and just always change the locale if not C nor POSIX on those systems */
1331     if (! PL_numeric_standard) {
1332         PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(RADIXCHAR,
1333                                             FALSE /* Don't toggle locale */  ))
1334                                  && strEQ("",  my_nl_langinfo(THOUSEP, FALSE)));
1335     }
1336 
1337 #endif
1338 
1339     /* Save the new name if it isn't the same as the previous one, if any */
1340     if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
1341         Safefree(PL_numeric_name);
1342         PL_numeric_name = save_newnum;
1343     }
1344     else {
1345         Safefree(save_newnum);
1346     }
1347 
1348     PL_numeric_underlying_is_standard = PL_numeric_standard;
1349 
1350 #  ifdef HAS_POSIX_2008_LOCALE
1351 
1352     PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1353                                           PL_numeric_name,
1354                                           PL_underlying_numeric_obj);
1355 
1356 #endif
1357 
1358     if (DEBUG_L_TEST || debug_initialization) {
1359         PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name);
1360     }
1361 
1362     /* Keep LC_NUMERIC in the C locale.  This is for XS modules, so they don't
1363      * have to worry about the radix being a non-dot.  (Core operations that
1364      * need the underlying locale change to it temporarily). */
1365     if (PL_numeric_standard) {
1366         set_numeric_radix(0);
1367     }
1368     else {
1369         set_numeric_standard();
1370     }
1371 
1372 #endif /* USE_LOCALE_NUMERIC */
1373 
1374 }
1375 
1376 void
1377 Perl_set_numeric_standard(pTHX)
1378 {
1379 
1380 #ifdef USE_LOCALE_NUMERIC
1381 
1382     /* Toggle the LC_NUMERIC locale to C.  Most code should use the macros like
1383      * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly.  The
1384      * macro avoids calling this routine if toggling isn't necessary according
1385      * to our records (which could be wrong if some XS code has changed the
1386      * locale behind our back) */
1387 
1388 #  ifdef DEBUGGING
1389 
1390     if (DEBUG_L_TEST || debug_initialization) {
1391         PerlIO_printf(Perl_debug_log,
1392                           "Setting LC_NUMERIC locale to standard C\n");
1393     }
1394 
1395 #  endif
1396 
1397     do_setlocale_c(LC_NUMERIC, "C");
1398     PL_numeric_standard = TRUE;
1399     PL_numeric_underlying = PL_numeric_underlying_is_standard;
1400     set_numeric_radix(0);
1401 
1402 #endif /* USE_LOCALE_NUMERIC */
1403 
1404 }
1405 
1406 void
1407 Perl_set_numeric_underlying(pTHX)
1408 {
1409 
1410 #ifdef USE_LOCALE_NUMERIC
1411 
1412     /* Toggle the LC_NUMERIC locale to the current underlying default.  Most
1413      * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
1414      * instead of calling this directly.  The macro avoids calling this routine
1415      * if toggling isn't necessary according to our records (which could be
1416      * wrong if some XS code has changed the locale behind our back) */
1417 
1418 #  ifdef DEBUGGING
1419 
1420     if (DEBUG_L_TEST || debug_initialization) {
1421         PerlIO_printf(Perl_debug_log,
1422                           "Setting LC_NUMERIC locale to %s\n",
1423                           PL_numeric_name);
1424     }
1425 
1426 #  endif
1427 
1428     do_setlocale_c(LC_NUMERIC, PL_numeric_name);
1429     PL_numeric_standard = PL_numeric_underlying_is_standard;
1430     PL_numeric_underlying = TRUE;
1431     set_numeric_radix(! PL_numeric_standard);
1432 
1433 #endif /* USE_LOCALE_NUMERIC */
1434 
1435 }
1436 
1437 /*
1438  * Set up for a new ctype locale.
1439  */
1440 STATIC void
1441 S_new_ctype(pTHX_ const char *newctype)
1442 {
1443 
1444 #ifndef USE_LOCALE_CTYPE
1445 
1446     PERL_UNUSED_ARG(newctype);
1447     PERL_UNUSED_CONTEXT;
1448 
1449 #else
1450 
1451     /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
1452      * core Perl this and that 'newctype' is the name of the new locale.
1453      *
1454      * This function sets up the folding arrays for all 256 bytes, assuming
1455      * that tofold() is tolc() since fold case is not a concept in POSIX,
1456      *
1457      * Any code changing the locale (outside this file) should use
1458      * Perl_setlocale or POSIX::setlocale, which call this function.  Therefore
1459      * this function should be called directly only from this file and from
1460      * POSIX::setlocale() */
1461 
1462     unsigned int i;
1463 
1464     /* Don't check for problems if we are suppressing the warnings */
1465     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
1466     bool maybe_utf8_turkic = FALSE;
1467 
1468     PERL_ARGS_ASSERT_NEW_CTYPE;
1469 
1470     /* We will replace any bad locale warning with 1) nothing if the new one is
1471      * ok; or 2) a new warning for the bad new locale */
1472     if (PL_warn_locale) {
1473         SvREFCNT_dec_NN(PL_warn_locale);
1474         PL_warn_locale = NULL;
1475     }
1476 
1477     PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
1478 
1479     /* A UTF-8 locale gets standard rules.  But note that code still has to
1480      * handle this specially because of the three problematic code points */
1481     if (PL_in_utf8_CTYPE_locale) {
1482         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
1483 
1484         /* UTF-8 locales can have special handling for 'I' and 'i' if they are
1485          * Turkic.  Make sure these two are the only anomalies.  (We don't use
1486          * towupper and towlower because they aren't in C89.) */
1487 
1488 #if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
1489 
1490         if (towupper('i') == 0x130 && towlower('I') == 0x131) {
1491 
1492 #else
1493 
1494         if (toupper('i') == 'i' && tolower('I') == 'I') {
1495 
1496 #endif
1497             check_for_problems = TRUE;
1498             maybe_utf8_turkic = TRUE;
1499         }
1500     }
1501 
1502     /* We don't populate the other lists if a UTF-8 locale, but do check that
1503      * everything works as expected, unless checking turned off */
1504     if (check_for_problems || ! PL_in_utf8_CTYPE_locale) {
1505         /* Assume enough space for every character being bad.  4 spaces each
1506          * for the 94 printable characters that are output like "'x' "; and 5
1507          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
1508          * NUL */
1509         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
1510         bool multi_byte_locale = FALSE;     /* Assume is a single-byte locale
1511                                                to start */
1512         unsigned int bad_count = 0;         /* Count of bad characters */
1513 
1514         for (i = 0; i < 256; i++) {
1515             if (! PL_in_utf8_CTYPE_locale) {
1516                 if (isupper(i))
1517                     PL_fold_locale[i] = (U8) tolower(i);
1518                 else if (islower(i))
1519                     PL_fold_locale[i] = (U8) toupper(i);
1520                 else
1521                     PL_fold_locale[i] = (U8) i;
1522             }
1523 
1524             /* If checking for locale problems, see if the native ASCII-range
1525              * printables plus \n and \t are in their expected categories in
1526              * the new locale.  If not, this could mean big trouble, upending
1527              * Perl's and most programs' assumptions, like having a
1528              * metacharacter with special meaning become a \w.  Fortunately,
1529              * it's very rare to find locales that aren't supersets of ASCII
1530              * nowadays.  It isn't a problem for most controls to be changed
1531              * into something else; we check only \n and \t, though perhaps \r
1532              * could be an issue as well. */
1533             if (    check_for_problems
1534                 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
1535             {
1536                 bool is_bad = FALSE;
1537                 char name[4] = { '\0' };
1538 
1539                 /* Convert the name into a string */
1540                 if (isGRAPH_A(i)) {
1541                     name[0] = i;
1542                     name[1] = '\0';
1543                 }
1544                 else if (i == '\n') {
1545                     my_strlcpy(name, "\\n", sizeof(name));
1546                 }
1547                 else if (i == '\t') {
1548                     my_strlcpy(name, "\\t", sizeof(name));
1549                 }
1550                 else {
1551                     assert(i == ' ');
1552                     my_strlcpy(name, "' '", sizeof(name));
1553                 }
1554 
1555                 /* Check each possibe class */
1556                 if (UNLIKELY(cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i))))  {
1557                     is_bad = TRUE;
1558                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1559                                           "isalnum('%s') unexpectedly is %d\n",
1560                                           name, cBOOL(isalnum(i))));
1561                 }
1562                 if (UNLIKELY(cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i))))  {
1563                     is_bad = TRUE;
1564                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1565                                           "isalpha('%s') unexpectedly is %d\n",
1566                                           name, cBOOL(isalpha(i))));
1567                 }
1568                 if (UNLIKELY(cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i))))  {
1569                     is_bad = TRUE;
1570                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1571                                           "isdigit('%s') unexpectedly is %d\n",
1572                                           name, cBOOL(isdigit(i))));
1573                 }
1574                 if (UNLIKELY(cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i))))  {
1575                     is_bad = TRUE;
1576                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1577                                           "isgraph('%s') unexpectedly is %d\n",
1578                                           name, cBOOL(isgraph(i))));
1579                 }
1580                 if (UNLIKELY(cBOOL(islower(i)) != cBOOL(isLOWER_A(i))))  {
1581                     is_bad = TRUE;
1582                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1583                                           "islower('%s') unexpectedly is %d\n",
1584                                           name, cBOOL(islower(i))));
1585                 }
1586                 if (UNLIKELY(cBOOL(isprint(i)) != cBOOL(isPRINT_A(i))))  {
1587                     is_bad = TRUE;
1588                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1589                                           "isprint('%s') unexpectedly is %d\n",
1590                                           name, cBOOL(isprint(i))));
1591                 }
1592                 if (UNLIKELY(cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i))))  {
1593                     is_bad = TRUE;
1594                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1595                                           "ispunct('%s') unexpectedly is %d\n",
1596                                           name, cBOOL(ispunct(i))));
1597                 }
1598                 if (UNLIKELY(cBOOL(isspace(i)) != cBOOL(isSPACE_A(i))))  {
1599                     is_bad = TRUE;
1600                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1601                                           "isspace('%s') unexpectedly is %d\n",
1602                                           name, cBOOL(isspace(i))));
1603                 }
1604                 if (UNLIKELY(cBOOL(isupper(i)) != cBOOL(isUPPER_A(i))))  {
1605                     is_bad = TRUE;
1606                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1607                                           "isupper('%s') unexpectedly is %d\n",
1608                                           name, cBOOL(isupper(i))));
1609                 }
1610                 if (UNLIKELY(cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i))))  {
1611                     is_bad = TRUE;
1612                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1613                                           "isxdigit('%s') unexpectedly is %d\n",
1614                                           name, cBOOL(isxdigit(i))));
1615                 }
1616                 if (UNLIKELY(tolower(i) != (int) toLOWER_A(i))) {
1617                     is_bad = TRUE;
1618                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1619                             "tolower('%s')=0x%x instead of the expected 0x%x\n",
1620                             name, tolower(i), (int) toLOWER_A(i)));
1621                 }
1622                 if (UNLIKELY(toupper(i) != (int) toUPPER_A(i))) {
1623                     is_bad = TRUE;
1624                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1625                             "toupper('%s')=0x%x instead of the expected 0x%x\n",
1626                             name, toupper(i), (int) toUPPER_A(i)));
1627                 }
1628                 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i))))  {
1629                     is_bad = TRUE;
1630                     DEBUG_L(PerlIO_printf(Perl_debug_log,
1631                                 "'\\n' (=%02X) is not a control\n", (int) i));
1632                 }
1633 
1634                 /* Add to the list;  Separate multiple entries with a blank */
1635                 if (is_bad) {
1636                     if (bad_count) {
1637                         my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
1638                     }
1639                     my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
1640                     bad_count++;
1641                 }
1642             }
1643         }
1644 
1645         if (bad_count == 2 && maybe_utf8_turkic) {
1646             bad_count = 0;
1647             *bad_chars_list = '\0';
1648             PL_fold_locale['I'] = 'I';
1649             PL_fold_locale['i'] = 'i';
1650             PL_in_utf8_turkic_locale = TRUE;
1651             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s is turkic\n",
1652                                                  __FILE__, __LINE__, newctype));
1653         }
1654         else {
1655             PL_in_utf8_turkic_locale = FALSE;
1656         }
1657 
1658 #  ifdef MB_CUR_MAX
1659 
1660         /* We only handle single-byte locales (outside of UTF-8 ones; so if
1661          * this locale requires more than one byte, there are going to be
1662          * problems. */
1663         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1664                  "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
1665                  __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
1666 
1667         if (   check_for_problems && MB_CUR_MAX > 1
1668             && ! PL_in_utf8_CTYPE_locale
1669 
1670                /* Some platforms return MB_CUR_MAX > 1 for even the "C"
1671                 * locale.  Just assume that the implementation for them (plus
1672                 * for POSIX) is correct and the > 1 value is spurious.  (Since
1673                 * these are specially handled to never be considered UTF-8
1674                 * locales, as long as this is the only problem, everything
1675                 * should work fine */
1676             && strNE(newctype, "C") && strNE(newctype, "POSIX"))
1677         {
1678             multi_byte_locale = TRUE;
1679         }
1680 
1681 #  endif
1682 
1683         /* If we found problems and we want them output, do so */
1684         if (   (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale))
1685             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
1686         {
1687             if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
1688                 PL_warn_locale = Perl_newSVpvf(aTHX_
1689                      "Locale '%s' contains (at least) the following characters"
1690                      " which have\nunexpected meanings: %s\nThe Perl program"
1691                      " will use the expected meanings",
1692                       newctype, bad_chars_list);
1693             }
1694             else {
1695                 PL_warn_locale = Perl_newSVpvf(aTHX_
1696                              "Locale '%s' may not work well.%s%s%s\n",
1697                              newctype,
1698                              (multi_byte_locale)
1699                               ? "  Some characters in it are not recognized by"
1700                                 " Perl."
1701                               : "",
1702                              (bad_count)
1703                               ? "\nThe following characters (and maybe others)"
1704                                 " may not have the same meaning as the Perl"
1705                                 " program expects:\n"
1706                               : "",
1707                              (bad_count)
1708                               ? bad_chars_list
1709                               : ""
1710                             );
1711             }
1712 
1713 #  ifdef HAS_NL_LANGINFO
1714 
1715             Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
1716                                     /* parameter FALSE is a don't care here */
1717                                     my_nl_langinfo(CODESET, FALSE));
1718 
1719 #  endif
1720 
1721             Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
1722 
1723             /* If we are actually in the scope of the locale or are debugging,
1724              * output the message now.  If not in that scope, we save the
1725              * message to be output at the first operation using this locale,
1726              * if that actually happens.  Most programs don't use locales, so
1727              * they are immune to bad ones.  */
1728             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
1729 
1730                 /* The '0' below suppresses a bogus gcc compiler warning */
1731                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
1732 
1733                 if (IN_LC(LC_CTYPE)) {
1734                     SvREFCNT_dec_NN(PL_warn_locale);
1735                     PL_warn_locale = NULL;
1736                 }
1737             }
1738         }
1739     }
1740 
1741 #endif /* USE_LOCALE_CTYPE */
1742 
1743 }
1744 
1745 void
1746 Perl__warn_problematic_locale()
1747 {
1748 
1749 #ifdef USE_LOCALE_CTYPE
1750 
1751     dTHX;
1752 
1753     /* Internal-to-core function that outputs the message in PL_warn_locale,
1754      * and then NULLS it.  Should be called only through the macro
1755      * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
1756 
1757     if (PL_warn_locale) {
1758         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
1759                              SvPVX(PL_warn_locale),
1760                              0 /* dummy to avoid compiler warning */ );
1761         SvREFCNT_dec_NN(PL_warn_locale);
1762         PL_warn_locale = NULL;
1763     }
1764 
1765 #endif
1766 
1767 }
1768 
1769 STATIC void
1770 S_new_collate(pTHX_ const char *newcoll)
1771 {
1772 
1773 #ifndef USE_LOCALE_COLLATE
1774 
1775     PERL_UNUSED_ARG(newcoll);
1776     PERL_UNUSED_CONTEXT;
1777 
1778 #else
1779 
1780     /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
1781      * core Perl this and that 'newcoll' is the name of the new locale.
1782      *
1783      * The design of locale collation is that every locale change is given an
1784      * index 'PL_collation_ix'.  The first time a string particpates in an
1785      * operation that requires collation while locale collation is active, it
1786      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
1787      * magic includes the collation index, and the transformation of the string
1788      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
1789      * instead of the string itself.  If a string changes, the magic is
1790      * cleared.  The next time the locale changes, the index is incremented,
1791      * and so we know during a comparison that the transformation is not
1792      * necessarily still valid, and so is recomputed.  Note that if the locale
1793      * changes enough times, the index could wrap (a U32), and it is possible
1794      * that a transformation would improperly be considered valid, leading to
1795      * an unlikely bug */
1796 
1797     if (! newcoll) {
1798         if (PL_collation_name) {
1799             ++PL_collation_ix;
1800             Safefree(PL_collation_name);
1801             PL_collation_name = NULL;
1802         }
1803         PL_collation_standard = TRUE;
1804       is_standard_collation:
1805         PL_collxfrm_base = 0;
1806         PL_collxfrm_mult = 2;
1807         PL_in_utf8_COLLATE_locale = FALSE;
1808         PL_strxfrm_NUL_replacement = '\0';
1809         PL_strxfrm_max_cp = 0;
1810         return;
1811     }
1812 
1813     /* If this is not the same locale as currently, set the new one up */
1814     if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
1815         ++PL_collation_ix;
1816         Safefree(PL_collation_name);
1817         PL_collation_name = stdize_locale(savepv(newcoll));
1818         PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
1819         if (PL_collation_standard) {
1820             goto is_standard_collation;
1821         }
1822 
1823         PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
1824         PL_strxfrm_NUL_replacement = '\0';
1825         PL_strxfrm_max_cp = 0;
1826 
1827         /* A locale collation definition includes primary, secondary, tertiary,
1828          * etc. weights for each character.  To sort, the primary weights are
1829          * used, and only if they compare equal, then the secondary weights are
1830          * used, and only if they compare equal, then the tertiary, etc.
1831          *
1832          * strxfrm() works by taking the input string, say ABC, and creating an
1833          * output transformed string consisting of first the primary weights,
1834          * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
1835          * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters
1836          * may not have weights at every level.  In our example, let's say B
1837          * doesn't have a tertiary weight, and A doesn't have a secondary
1838          * weight.  The constructed string is then going to be
1839          *  A¹B¹C¹ B²C² A³C³ ....
1840          * This has the desired effect that strcmp() will look at the secondary
1841          * or tertiary weights only if the strings compare equal at all higher
1842          * priority weights.  The spaces shown here, like in
1843          *  "A¹B¹C¹ A²B²C² "
1844          * are not just for readability.  In the general case, these must
1845          * actually be bytes, which we will call here 'separator weights'; and
1846          * they must be smaller than any other weight value, but since these
1847          * are C strings, only the terminating one can be a NUL (some
1848          * implementations may include a non-NUL separator weight just before
1849          * the NUL).  Implementations tend to reserve 01 for the separator
1850          * weights.  They are needed so that a shorter string's secondary
1851          * weights won't be misconstrued as primary weights of a longer string,
1852          * etc.  By making them smaller than any other weight, the shorter
1853          * string will sort first.  (Actually, if all secondary weights are
1854          * smaller than all primary ones, there is no need for a separator
1855          * weight between those two levels, etc.)
1856          *
1857          * The length of the transformed string is roughly a linear function of
1858          * the input string.  It's not exactly linear because some characters
1859          * don't have weights at all levels.  When we call strxfrm() we have to
1860          * allocate some memory to hold the transformed string.  The
1861          * calculations below try to find coefficients 'm' and 'b' for this
1862          * locale so that m*x + b equals how much space we need, given the size
1863          * of the input string in 'x'.  If we calculate too small, we increase
1864          * the size as needed, and call strxfrm() again, but it is better to
1865          * get it right the first time to avoid wasted expensive string
1866          * transformations. */
1867 
1868         {
1869             /* We use the string below to find how long the tranformation of it
1870              * is.  Almost all locales are supersets of ASCII, or at least the
1871              * ASCII letters.  We use all of them, half upper half lower,
1872              * because if we used fewer, we might hit just the ones that are
1873              * outliers in a particular locale.  Most of the strings being
1874              * collated will contain a preponderance of letters, and even if
1875              * they are above-ASCII, they are likely to have the same number of
1876              * weight levels as the ASCII ones.  It turns out that digits tend
1877              * to have fewer levels, and some punctuation has more, but those
1878              * are relatively sparse in text, and khw believes this gives a
1879              * reasonable result, but it could be changed if experience so
1880              * dictates. */
1881             const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
1882             char * x_longer;        /* Transformed 'longer' */
1883             Size_t x_len_longer;    /* Length of 'x_longer' */
1884 
1885             char * x_shorter;   /* We also transform a substring of 'longer' */
1886             Size_t x_len_shorter;
1887 
1888             /* _mem_collxfrm() is used get the transformation (though here we
1889              * are interested only in its length).  It is used because it has
1890              * the intelligence to handle all cases, but to work, it needs some
1891              * values of 'm' and 'b' to get it started.  For the purposes of
1892              * this calculation we use a very conservative estimate of 'm' and
1893              * 'b'.  This assumes a weight can be multiple bytes, enough to
1894              * hold any UV on the platform, and there are 5 levels, 4 weight
1895              * bytes, and a trailing NUL.  */
1896             PL_collxfrm_base = 5;
1897             PL_collxfrm_mult = 5 * sizeof(UV);
1898 
1899             /* Find out how long the transformation really is */
1900             x_longer = _mem_collxfrm(longer,
1901                                      sizeof(longer) - 1,
1902                                      &x_len_longer,
1903 
1904                                      /* We avoid converting to UTF-8 in the
1905                                       * called function by telling it the
1906                                       * string is in UTF-8 if the locale is a
1907                                       * UTF-8 one.  Since the string passed
1908                                       * here is invariant under UTF-8, we can
1909                                       * claim it's UTF-8 even though it isn't.
1910                                       * */
1911                                      PL_in_utf8_COLLATE_locale);
1912             Safefree(x_longer);
1913 
1914             /* Find out how long the transformation of a substring of 'longer'
1915              * is.  Together the lengths of these transformations are
1916              * sufficient to calculate 'm' and 'b'.  The substring is all of
1917              * 'longer' except the first character.  This minimizes the chances
1918              * of being swayed by outliers */
1919             x_shorter = _mem_collxfrm(longer + 1,
1920                                       sizeof(longer) - 2,
1921                                       &x_len_shorter,
1922                                       PL_in_utf8_COLLATE_locale);
1923             Safefree(x_shorter);
1924 
1925             /* If the results are nonsensical for this simple test, the whole
1926              * locale definition is suspect.  Mark it so that locale collation
1927              * is not active at all for it.  XXX Should we warn? */
1928             if (   x_len_shorter == 0
1929                 || x_len_longer == 0
1930                 || x_len_shorter >= x_len_longer)
1931             {
1932                 PL_collxfrm_mult = 0;
1933                 PL_collxfrm_base = 0;
1934             }
1935             else {
1936                 SSize_t base;       /* Temporary */
1937 
1938                 /* We have both:    m * strlen(longer)  + b = x_len_longer
1939                  *                  m * strlen(shorter) + b = x_len_shorter;
1940                  * subtracting yields:
1941                  *          m * (strlen(longer) - strlen(shorter))
1942                  *                             = x_len_longer - x_len_shorter
1943                  * But we have set things up so that 'shorter' is 1 byte smaller
1944                  * than 'longer'.  Hence:
1945                  *          m = x_len_longer - x_len_shorter
1946                  *
1947                  * But if something went wrong, make sure the multiplier is at
1948                  * least 1.
1949                  */
1950                 if (x_len_longer > x_len_shorter) {
1951                     PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
1952                 }
1953                 else {
1954                     PL_collxfrm_mult = 1;
1955                 }
1956 
1957                 /*     mx + b = len
1958                  * so:      b = len - mx
1959                  * but in case something has gone wrong, make sure it is
1960                  * non-negative */
1961                 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
1962                 if (base < 0) {
1963                     base = 0;
1964                 }
1965 
1966                 /* Add 1 for the trailing NUL */
1967                 PL_collxfrm_base = base + 1;
1968             }
1969 
1970 #  ifdef DEBUGGING
1971 
1972             if (DEBUG_L_TEST || debug_initialization) {
1973                 PerlIO_printf(Perl_debug_log,
1974                     "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
1975                     "x_len_longer=%zu,"
1976                     " collate multipler=%zu, collate base=%zu\n",
1977                     __FILE__, __LINE__,
1978                     PL_in_utf8_COLLATE_locale,
1979                     x_len_shorter, x_len_longer,
1980                     PL_collxfrm_mult, PL_collxfrm_base);
1981             }
1982 #  endif
1983 
1984         }
1985     }
1986 
1987 #endif /* USE_LOCALE_COLLATE */
1988 
1989 }
1990 
1991 #endif
1992 
1993 #ifdef WIN32
1994 
1995 #define USE_WSETLOCALE
1996 
1997 #ifdef USE_WSETLOCALE
1998 
1999 STATIC char *
2000 S_wrap_wsetlocale(pTHX_ int category, const char *locale) {
2001     wchar_t *wlocale;
2002     wchar_t *wresult;
2003     char *result;
2004 
2005     if (locale) {
2006         int req_size =
2007             MultiByteToWideChar(CP_UTF8, 0, locale, -1, NULL, 0);
2008 
2009         if (!req_size) {
2010             errno = EINVAL;
2011             return NULL;
2012         }
2013 
2014         Newx(wlocale, req_size, wchar_t);
2015         if (!MultiByteToWideChar(CP_UTF8, 0, locale, -1, wlocale, req_size)) {
2016             Safefree(wlocale);
2017             errno = EINVAL;
2018             return NULL;
2019         }
2020     }
2021     else {
2022         wlocale = NULL;
2023     }
2024     wresult = _wsetlocale(category, wlocale);
2025     Safefree(wlocale);
2026     if (wresult) {
2027         int req_size =
2028             WideCharToMultiByte(CP_UTF8, 0, wresult, -1, NULL, 0, NULL, NULL);
2029         Newx(result, req_size, char);
2030         SAVEFREEPV(result); /* is there something better we can do here? */
2031         if (!WideCharToMultiByte(CP_UTF8, 0, wresult, -1,
2032                                  result, req_size, NULL, NULL)) {
2033             errno = EINVAL;
2034             return NULL;
2035         }
2036     }
2037     else {
2038         result = NULL;
2039     }
2040 
2041     return result;
2042 }
2043 
2044 #endif
2045 
2046 STATIC char *
2047 S_win32_setlocale(pTHX_ int category, const char* locale)
2048 {
2049     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
2050      * difference between the two unless the input locale is "", which normally
2051      * means on Windows to get the machine default, which is set via the
2052      * computer's "Regional and Language Options" (or its current equivalent).
2053      * In POSIX, it instead means to find the locale from the user's
2054      * environment.  This routine changes the Windows behavior to first look in
2055      * the environment, and, if anything is found, use that instead of going to
2056      * the machine default.  If there is no environment override, the machine
2057      * default is used, by calling the real setlocale() with "".
2058      *
2059      * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2060      * use the particular category's variable if set; otherwise to use the LANG
2061      * variable. */
2062 
2063     bool override_LC_ALL = FALSE;
2064     char * result;
2065     unsigned int i;
2066 
2067     if (locale && strEQ(locale, "")) {
2068 
2069 #  ifdef LC_ALL
2070 
2071         locale = PerlEnv_getenv("LC_ALL");
2072         if (! locale) {
2073             if (category ==  LC_ALL) {
2074                 override_LC_ALL = TRUE;
2075             }
2076             else {
2077 
2078 #  endif
2079 
2080                 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2081                     if (category == categories[i]) {
2082                         locale = PerlEnv_getenv(category_names[i]);
2083                         goto found_locale;
2084                     }
2085                 }
2086 
2087                 locale = PerlEnv_getenv("LANG");
2088                 if (! locale) {
2089                     locale = "";
2090                 }
2091 
2092               found_locale: ;
2093 
2094 #  ifdef LC_ALL
2095 
2096             }
2097         }
2098 
2099 #  endif
2100 
2101     }
2102 
2103 #ifdef USE_WSETLOCALE
2104     result = S_wrap_wsetlocale(aTHX_ category, locale);
2105 #else
2106     result = setlocale(category, locale);
2107 #endif
2108     DEBUG_L(STMT_START {
2109                 dSAVE_ERRNO;
2110                 PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
2111                             setlocale_debug_string(category, locale, result));
2112                 RESTORE_ERRNO;
2113             } STMT_END);
2114 
2115     if (! override_LC_ALL)  {
2116         return result;
2117     }
2118 
2119     /* Here the input category was LC_ALL, and we have set it to what is in the
2120      * LANG variable or the system default if there is no LANG.  But these have
2121      * lower priority than the other LC_foo variables, so override it for each
2122      * one that is set.  (If they are set to "", it means to use the same thing
2123      * we just set LC_ALL to, so can skip) */
2124 
2125     for (i = 0; i < LC_ALL_INDEX; i++) {
2126         result = PerlEnv_getenv(category_names[i]);
2127         if (result && strNE(result, "")) {
2128 #ifdef USE_WSETLOCALE
2129             S_wrap_wsetlocale(aTHX_ categories[i], result);
2130 #else
2131             setlocale(categories[i], result);
2132 #endif
2133             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
2134                 __FILE__, __LINE__,
2135                 setlocale_debug_string(categories[i], result, "not captured")));
2136         }
2137     }
2138 
2139     result = setlocale(LC_ALL, NULL);
2140     DEBUG_L(STMT_START {
2141                 dSAVE_ERRNO;
2142                 PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
2143                                __FILE__, __LINE__,
2144                                setlocale_debug_string(LC_ALL, NULL, result));
2145                 RESTORE_ERRNO;
2146             } STMT_END);
2147 
2148     return result;
2149 }
2150 
2151 #endif
2152 
2153 /*
2154 =for apidoc Perl_setlocale
2155 
2156 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2157 taking the same parameters, and returning the same information, except that it
2158 returns the correct underlying C<LC_NUMERIC> locale.  Regular C<setlocale> will
2159 instead return C<C> if the underlying locale has a non-dot decimal point
2160 character, or a non-empty thousands separator for displaying floating point
2161 numbers.  This is because perl keeps that locale category such that it has a
2162 dot and empty separator, changing the locale briefly during the operations
2163 where the underlying one is required. C<Perl_setlocale> knows about this, and
2164 compensates; regular C<setlocale> doesn't.
2165 
2166 Another reason it isn't completely a drop-in replacement is that it is
2167 declared to return S<C<const char *>>, whereas the system setlocale omits the
2168 C<const> (presumably because its API was specified long ago, and can't be
2169 updated; it is illegal to change the information C<setlocale> returns; doing
2170 so leads to segfaults.)
2171 
2172 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2173 C<setlocale> can be completely ineffective on some platforms under some
2174 configurations.
2175 
2176 C<Perl_setlocale> should not be used to change the locale except on systems
2177 where the predefined variable C<${^SAFE_LOCALES}> is 1.  On some such systems,
2178 the system C<setlocale()> is ineffective, returning the wrong information, and
2179 failing to actually change the locale.  C<Perl_setlocale>, however works
2180 properly in all circumstances.
2181 
2182 The return points to a per-thread static buffer, which is overwritten the next
2183 time C<Perl_setlocale> is called from the same thread.
2184 
2185 =cut
2186 
2187 */
2188 
2189 const char *
2190 Perl_setlocale(const int category, const char * locale)
2191 {
2192     /* This wraps POSIX::setlocale() */
2193 
2194 #ifndef USE_LOCALE
2195 
2196     PERL_UNUSED_ARG(category);
2197     PERL_UNUSED_ARG(locale);
2198 
2199     return "C";
2200 
2201 #else
2202 
2203     const char * retval;
2204     const char * newlocale;
2205     dSAVEDERRNO;
2206     dTHX;
2207     DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
2208 
2209 #ifdef USE_LOCALE_NUMERIC
2210 
2211     /* A NULL locale means only query what the current one is.  We have the
2212      * LC_NUMERIC name saved, because we are normally switched into the C
2213      * (or equivalent) locale for it.  For an LC_ALL query, switch back to get
2214      * the correct results.  All other categories don't require special
2215      * handling */
2216     if (locale == NULL) {
2217         if (category == LC_NUMERIC) {
2218 
2219             /* We don't have to copy this return value, as it is a per-thread
2220              * variable, and won't change until a future setlocale */
2221             return PL_numeric_name;
2222         }
2223 
2224 #  ifdef LC_ALL
2225 
2226         else if (category == LC_ALL) {
2227             STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
2228         }
2229 
2230 #  endif
2231 
2232     }
2233 
2234 #endif
2235 
2236     retval = save_to_buffer(do_setlocale_r(category, locale),
2237                             &PL_setlocale_buf, &PL_setlocale_bufsize, 0);
2238     SAVE_ERRNO;
2239 
2240 #if defined(USE_LOCALE_NUMERIC) && defined(LC_ALL)
2241 
2242     if (locale == NULL && category == LC_ALL) {
2243         RESTORE_LC_NUMERIC();
2244     }
2245 
2246 #endif
2247 
2248     DEBUG_L(PerlIO_printf(Perl_debug_log,
2249         "%s:%d: %s\n", __FILE__, __LINE__,
2250             setlocale_debug_string(category, locale, retval)));
2251 
2252     RESTORE_ERRNO;
2253 
2254     if (! retval) {
2255         return NULL;
2256     }
2257 
2258     /* If locale == NULL, we are just querying the state */
2259     if (locale == NULL) {
2260         return retval;
2261     }
2262 
2263     /* Now that have switched locales, we have to update our records to
2264      * correspond. */
2265 
2266     switch (category) {
2267 
2268 #ifdef USE_LOCALE_CTYPE
2269 
2270         case LC_CTYPE:
2271             new_ctype(retval);
2272             break;
2273 
2274 #endif
2275 #ifdef USE_LOCALE_COLLATE
2276 
2277         case LC_COLLATE:
2278             new_collate(retval);
2279             break;
2280 
2281 #endif
2282 #ifdef USE_LOCALE_NUMERIC
2283 
2284         case LC_NUMERIC:
2285             new_numeric(retval);
2286             break;
2287 
2288 #endif
2289 #ifdef LC_ALL
2290 
2291         case LC_ALL:
2292 
2293             /* LC_ALL updates all the things we care about.  The values may not
2294              * be the same as 'retval', as the locale "" may have set things
2295              * individually */
2296 
2297 #  ifdef USE_LOCALE_CTYPE
2298 
2299             newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL));
2300             new_ctype(newlocale);
2301             Safefree(newlocale);
2302 
2303 #  endif /* USE_LOCALE_CTYPE */
2304 #  ifdef USE_LOCALE_COLLATE
2305 
2306             newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL));
2307             new_collate(newlocale);
2308             Safefree(newlocale);
2309 
2310 #  endif
2311 #  ifdef USE_LOCALE_NUMERIC
2312 
2313             newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL));
2314             new_numeric(newlocale);
2315             Safefree(newlocale);
2316 
2317 #  endif /* USE_LOCALE_NUMERIC */
2318 #endif /* LC_ALL */
2319 
2320         default:
2321             break;
2322     }
2323 
2324     return retval;
2325 
2326 #endif
2327 
2328 }
2329 
2330 PERL_STATIC_INLINE const char *
2331 S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
2332 {
2333     /* Copy the NUL-terminated 'string' to 'buf' + 'offset'.  'buf' has size 'buf_size',
2334      * growing it if necessary */
2335 
2336     Size_t string_size;
2337 
2338     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
2339 
2340     if (! string) {
2341         return NULL;
2342     }
2343 
2344     string_size = strlen(string) + offset + 1;
2345 
2346     if (*buf_size == 0) {
2347         Newx(*buf, string_size, char);
2348         *buf_size = string_size;
2349     }
2350     else if (string_size > *buf_size) {
2351         Renew(*buf, string_size, char);
2352         *buf_size = string_size;
2353     }
2354 
2355     Copy(string, *buf + offset, string_size - offset, char);
2356     return *buf;
2357 }
2358 
2359 /*
2360 
2361 =for apidoc Perl_langinfo
2362 
2363 This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>,
2364 taking the same C<item> parameter values, and returning the same information.
2365 But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
2366 of Perl's locale handling from your code, and can be used on systems that lack
2367 a native C<nl_langinfo>.
2368 
2369 Expanding on these:
2370 
2371 =over
2372 
2373 =item *
2374 
2375 The reason it isn't quite a drop-in replacement is actually an advantage.  The
2376 only difference is that it returns S<C<const char *>>, whereas plain
2377 C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation)
2378 forbidden to write into the buffer.  By declaring this C<const>, the compiler
2379 enforces this restriction, so if it is violated, you know at compilation time,
2380 rather than getting segfaults at runtime.
2381 
2382 =item *
2383 
2384 It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
2385 without you having to write extra code.  The reason for the extra code would be
2386 because these are from the C<LC_NUMERIC> locale category, which is normally
2387 kept set by Perl so that the radix is a dot, and the separator is the empty
2388 string, no matter what the underlying locale is supposed to be, and so to get
2389 the expected results, you have to temporarily toggle into the underlying
2390 locale, and later toggle back.  (You could use plain C<nl_langinfo> and
2391 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
2392 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
2393 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
2394 (decimal point) character to be a dot.)
2395 
2396 =item *
2397 
2398 The system function it replaces can have its static return buffer trashed,
2399 not only by a subsequent call to that function, but by a C<freelocale>,
2400 C<setlocale>, or other locale change.  The returned buffer of this function is
2401 not changed until the next call to it, so the buffer is never in a trashed
2402 state.
2403 
2404 =item *
2405 
2406 Its return buffer is per-thread, so it also is never overwritten by a call to
2407 this function from another thread;  unlike the function it replaces.
2408 
2409 =item *
2410 
2411 But most importantly, it works on systems that don't have C<nl_langinfo>, such
2412 as Windows, hence makes your code more portable.  Of the fifty-some possible
2413 items specified by the POSIX 2008 standard,
2414 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
2415 only one is completely unimplemented, though on non-Windows platforms, another
2416 significant one is also not implemented).  It uses various techniques to
2417 recover the other items, including calling C<L<localeconv(3)>>, and
2418 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
2419 available.  Later C<strftime()> versions have additional capabilities; C<""> is
2420 returned for those not available on your system.
2421 
2422 It is important to note that when called with an item that is recovered by
2423 using C<localeconv>, the buffer from any previous explicit call to
2424 C<localeconv> will be overwritten.  This means you must save that buffer's
2425 contents if you need to access them after a call to this function.  (But note
2426 that you might not want to be using C<localeconv()> directly anyway, because of
2427 issues like the ones listed in the second item of this list (above) for
2428 C<RADIXCHAR> and C<THOUSEP>.  You can use the methods given in L<perlcall> to
2429 call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to
2430 unpack).
2431 
2432 The details for those items which may deviate from what this emulation returns
2433 and what a native C<nl_langinfo()> would return are specified in
2434 L<I18N::Langinfo>.
2435 
2436 =back
2437 
2438 When using C<Perl_langinfo> on systems that don't have a native
2439 C<nl_langinfo()>, you must
2440 
2441  #include "perl_langinfo.h"
2442 
2443 before the C<perl.h> C<#include>.  You can replace your C<langinfo.h>
2444 C<#include> with this one.  (Doing it this way keeps out the symbols that plain
2445 C<langinfo.h> would try to import into the namespace for code that doesn't need
2446 it.)
2447 
2448 The original impetus for C<Perl_langinfo()> was so that code that needs to
2449 find out the current currency symbol, floating point radix character, or digit
2450 grouping separator can use, on all systems, the simpler and more
2451 thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
2452 pain to make thread-friendly.  For other fields returned by C<localeconv>, it
2453 is better to use the methods given in L<perlcall> to call
2454 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
2455 
2456 =cut
2457 
2458 */
2459 
2460 const char *
2461 #ifdef HAS_NL_LANGINFO
2462 Perl_langinfo(const nl_item item)
2463 #else
2464 Perl_langinfo(const int item)
2465 #endif
2466 {
2467     return my_nl_langinfo(item, TRUE);
2468 }
2469 
2470 STATIC const char *
2471 #ifdef HAS_NL_LANGINFO
2472 S_my_nl_langinfo(const nl_item item, bool toggle)
2473 #else
2474 S_my_nl_langinfo(const int item, bool toggle)
2475 #endif
2476 {
2477     dTHX;
2478     const char * retval;
2479 
2480 #ifdef USE_LOCALE_NUMERIC
2481 
2482     /* We only need to toggle into the underlying LC_NUMERIC locale for these
2483      * two items, and only if not already there */
2484     if (toggle && ((   item != RADIXCHAR && item != THOUSEP)
2485                     || PL_numeric_underlying))
2486 
2487 #endif  /* No toggling needed if not using LC_NUMERIC */
2488 
2489         toggle = FALSE;
2490 
2491 #if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available.  */
2492 #  if   ! defined(HAS_THREAD_SAFE_NL_LANGINFO_L)      \
2493      || ! defined(HAS_POSIX_2008_LOCALE)
2494 
2495     /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
2496      * for those items dependent on it.  This must be copied to a buffer before
2497      * switching back, as some systems destroy the buffer when setlocale() is
2498      * called */
2499 
2500     {
2501         DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
2502 
2503         if (toggle) {
2504             STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
2505         }
2506 
2507         /* Prevent interference from another thread executing this code
2508          * section. */
2509         NL_LANGINFO_LOCK;
2510 
2511         /* Copy to a per-thread buffer, which is also one that won't be
2512          * destroyed by a subsequent setlocale(), such as the
2513          * RESTORE_LC_NUMERIC may do just below. */
2514         retval = save_to_buffer(nl_langinfo(item),
2515                                 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
2516         NL_LANGINFO_UNLOCK;
2517 
2518         if (toggle) {
2519             RESTORE_LC_NUMERIC();
2520         }
2521     }
2522 
2523 #  else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
2524 
2525     {
2526         bool do_free = FALSE;
2527         locale_t cur = uselocale((locale_t) 0);
2528 
2529         if (cur == LC_GLOBAL_LOCALE) {
2530             cur = duplocale(LC_GLOBAL_LOCALE);
2531             do_free = TRUE;
2532         }
2533 
2534 #    ifdef USE_LOCALE_NUMERIC
2535 
2536         if (toggle) {
2537             if (PL_underlying_numeric_obj) {
2538                 cur = PL_underlying_numeric_obj;
2539             }
2540             else {
2541                 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
2542                 do_free = TRUE;
2543             }
2544         }
2545 
2546 #    endif
2547 
2548         /* We have to save it to a buffer, because the freelocale() just below
2549          * can invalidate the internal one */
2550         retval = save_to_buffer(nl_langinfo_l(item, cur),
2551                                 &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
2552 
2553         if (do_free) {
2554             freelocale(cur);
2555         }
2556     }
2557 
2558 #  endif
2559 
2560     if (strEQ(retval, "")) {
2561         if (item == YESSTR) {
2562             return "yes";
2563         }
2564         if (item == NOSTR) {
2565             return "no";
2566         }
2567     }
2568 
2569     return retval;
2570 
2571 #else   /* Below, emulate nl_langinfo as best we can */
2572 
2573     {
2574 
2575 #  ifdef HAS_LOCALECONV
2576 
2577         const struct lconv* lc;
2578         const char * temp;
2579         DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
2580 
2581 #    ifdef TS_W32_BROKEN_LOCALECONV
2582 
2583         const char * save_global;
2584         const char * save_thread;
2585         int needed_size;
2586         char * ptr;
2587         char * e;
2588         char * item_start;
2589 
2590 #    endif
2591 #  endif
2592 #  ifdef HAS_STRFTIME
2593 
2594         struct tm tm;
2595         bool return_format = FALSE; /* Return the %format, not the value */
2596         const char * format;
2597 
2598 #  endif
2599 
2600         /* We copy the results to a per-thread buffer, even if not
2601          * multi-threaded.  This is in part to simplify this code, and partly
2602          * because we need a buffer anyway for strftime(), and partly because a
2603          * call of localeconv() could otherwise wipe out the buffer, and the
2604          * programmer would not be expecting this, as this is a nl_langinfo()
2605          * substitute after all, so s/he might be thinking their localeconv()
2606          * is safe until another localeconv() call. */
2607 
2608         switch (item) {
2609             Size_t len;
2610 
2611             /* This is unimplemented */
2612             case ERA:      /* For use with strftime() %E modifier */
2613 
2614             default:
2615                 return "";
2616 
2617             /* We use only an English set, since we don't know any more */
2618             case YESEXPR:   return "^[+1yY]";
2619             case YESSTR:    return "yes";
2620             case NOEXPR:    return "^[-0nN]";
2621             case NOSTR:     return "no";
2622 
2623             case CODESET:
2624 
2625 #  ifndef WIN32
2626 
2627                 /* On non-windows, this is unimplemented, in part because of
2628                  * inconsistencies between vendors.  The Darwin native
2629                  * nl_langinfo() implementation simply looks at everything past
2630                  * any dot in the name, but that doesn't work for other
2631                  * vendors.  Many Linux locales that don't have UTF-8 in their
2632                  * names really are UTF-8, for example; z/OS locales that do
2633                  * have UTF-8 in their names, aren't really UTF-8 */
2634                 return "";
2635 
2636 #  else
2637 
2638                 {   /* But on Windows, the name does seem to be consistent, so
2639                        use that. */
2640                     const char * p;
2641                     const char * first;
2642                     Size_t offset = 0;
2643                     const char * name = my_setlocale(LC_CTYPE, NULL);
2644 
2645                     if (isNAME_C_OR_POSIX(name)) {
2646                         return "ANSI_X3.4-1968";
2647                     }
2648 
2649                     /* Find the dot in the locale name */
2650                     first = (const char *) strchr(name, '.');
2651                     if (! first) {
2652                         first = name;
2653                         goto has_nondigit;
2654                     }
2655 
2656                     /* Look at everything past the dot */
2657                     first++;
2658                     p = first;
2659 
2660                     while (*p) {
2661                         if (! isDIGIT(*p)) {
2662                             goto has_nondigit;
2663                         }
2664 
2665                         p++;
2666                     }
2667 
2668                     /* Here everything past the dot is a digit.  Treat it as a
2669                      * code page */
2670                     retval = save_to_buffer("CP", &PL_langinfo_buf,
2671                                                 &PL_langinfo_bufsize, 0);
2672                     offset = STRLENs("CP");
2673 
2674                   has_nondigit:
2675 
2676                     retval = save_to_buffer(first, &PL_langinfo_buf,
2677                                             &PL_langinfo_bufsize, offset);
2678                 }
2679 
2680                 break;
2681 
2682 #  endif
2683 #  ifdef HAS_LOCALECONV
2684 
2685             case CRNCYSTR:
2686 
2687                 /* We don't bother with localeconv_l() because any system that
2688                  * has it is likely to also have nl_langinfo() */
2689 
2690                 LOCALECONV_LOCK;    /* Prevent interference with other threads
2691                                        using localeconv() */
2692 
2693 #    ifdef TS_W32_BROKEN_LOCALECONV
2694 
2695                 /* This is a workaround for a Windows bug prior to VS 15.
2696                  * What we do here is, while locked, switch to the global
2697                  * locale so localeconv() works; then switch back just before
2698                  * the unlock.  This can screw things up if some thread is
2699                  * already using the global locale while assuming no other is.
2700                  * A different workaround would be to call GetCurrencyFormat on
2701                  * a known value, and parse it; patches welcome
2702                  *
2703                  * We have to use LC_ALL instead of LC_MONETARY because of
2704                  * another bug in Windows */
2705 
2706                 save_thread = savepv(my_setlocale(LC_ALL, NULL));
2707                 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
2708                 save_global= savepv(my_setlocale(LC_ALL, NULL));
2709                 my_setlocale(LC_ALL, save_thread);
2710 
2711 #    endif
2712 
2713                 lc = localeconv();
2714                 if (   ! lc
2715                     || ! lc->currency_symbol
2716                     || strEQ("", lc->currency_symbol))
2717                 {
2718                     LOCALECONV_UNLOCK;
2719                     return "";
2720                 }
2721 
2722                 /* Leave the first spot empty to be filled in below */
2723                 retval = save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
2724                                         &PL_langinfo_bufsize, 1);
2725                 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
2726                 { /*  khw couldn't figure out how the localedef specifications
2727                       would show that the $ should replace the radix; this is
2728                       just a guess as to how it might work.*/
2729                     PL_langinfo_buf[0] = '.';
2730                 }
2731                 else if (lc->p_cs_precedes) {
2732                     PL_langinfo_buf[0] = '-';
2733                 }
2734                 else {
2735                     PL_langinfo_buf[0] = '+';
2736                 }
2737 
2738 #    ifdef TS_W32_BROKEN_LOCALECONV
2739 
2740                 my_setlocale(LC_ALL, save_global);
2741                 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
2742                 my_setlocale(LC_ALL, save_thread);
2743                 Safefree(save_global);
2744                 Safefree(save_thread);
2745 
2746 #    endif
2747 
2748                 LOCALECONV_UNLOCK;
2749                 break;
2750 
2751 #    ifdef TS_W32_BROKEN_LOCALECONV
2752 
2753             case RADIXCHAR:
2754 
2755                 /* For this, we output a known simple floating point number to
2756                  * a buffer, and parse it, looking for the radix */
2757 
2758                 if (toggle) {
2759                     STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
2760                 }
2761 
2762                 if (PL_langinfo_bufsize < 10) {
2763                     PL_langinfo_bufsize = 10;
2764                     Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
2765                 }
2766 
2767                 needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize,
2768                                           "%.1f", 1.5);
2769                 if (needed_size >= (int) PL_langinfo_bufsize) {
2770                     PL_langinfo_bufsize = needed_size + 1;
2771                     Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
2772                     needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize,
2773                                              "%.1f", 1.5);
2774                     assert(needed_size < (int) PL_langinfo_bufsize);
2775                 }
2776 
2777                 ptr = PL_langinfo_buf;
2778                 e = PL_langinfo_buf + PL_langinfo_bufsize;
2779 
2780                 /* Find the '1' */
2781                 while (ptr < e && *ptr != '1') {
2782                     ptr++;
2783                 }
2784                 ptr++;
2785 
2786                 /* Find the '5' */
2787                 item_start = ptr;
2788                 while (ptr < e && *ptr != '5') {
2789                     ptr++;
2790                 }
2791 
2792                 /* Everything in between is the radix string */
2793                 if (ptr >= e) {
2794                     PL_langinfo_buf[0] = '?';
2795                     PL_langinfo_buf[1] = '\0';
2796                 }
2797                 else {
2798                     *ptr = '\0';
2799                     Move(item_start, PL_langinfo_buf, ptr - PL_langinfo_buf, char);
2800                 }
2801 
2802                 if (toggle) {
2803                     RESTORE_LC_NUMERIC();
2804                 }
2805 
2806                 retval = PL_langinfo_buf;
2807                 break;
2808 
2809 #    else
2810 
2811             case RADIXCHAR:     /* No special handling needed */
2812 
2813 #    endif
2814 
2815             case THOUSEP:
2816 
2817                 if (toggle) {
2818                     STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
2819                 }
2820 
2821                 LOCALECONV_LOCK;    /* Prevent interference with other threads
2822                                        using localeconv() */
2823 
2824 #    ifdef TS_W32_BROKEN_LOCALECONV
2825 
2826                 /* This should only be for the thousands separator.  A
2827                  * different work around would be to use GetNumberFormat on a
2828                  * known value and parse the result to find the separator */
2829                 save_thread = savepv(my_setlocale(LC_ALL, NULL));
2830                 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
2831                 save_global = savepv(my_setlocale(LC_ALL, NULL));
2832                 my_setlocale(LC_ALL, save_thread);
2833 #      if 0
2834                 /* This is the start of code that for broken Windows replaces
2835                  * the above and below code, and instead calls
2836                  * GetNumberFormat() and then would parse that to find the
2837                  * thousands separator.  It needs to handle UTF-16 vs -8
2838                  * issues. */
2839 
2840                 needed_size = GetNumberFormatEx(PL_numeric_name, 0, "1234.5", NULL, PL_langinfo_buf, PL_langinfo_bufsize);
2841                 DEBUG_L(PerlIO_printf(Perl_debug_log,
2842                     "%s: %d: return from GetNumber, count=%d, val=%s\n",
2843                     __FILE__, __LINE__, needed_size, PL_langinfo_buf));
2844 
2845 #      endif
2846 #    endif
2847 
2848                 lc = localeconv();
2849                 if (! lc) {
2850                     temp = "";
2851                 }
2852                 else {
2853                     temp = (item == RADIXCHAR)
2854                              ? lc->decimal_point
2855                              : lc->thousands_sep;
2856                     if (! temp) {
2857                         temp = "";
2858                     }
2859                 }
2860 
2861                 retval = save_to_buffer(temp, &PL_langinfo_buf,
2862                                         &PL_langinfo_bufsize, 0);
2863 
2864 #    ifdef TS_W32_BROKEN_LOCALECONV
2865 
2866                 my_setlocale(LC_ALL, save_global);
2867                 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
2868                 my_setlocale(LC_ALL, save_thread);
2869                 Safefree(save_global);
2870                 Safefree(save_thread);
2871 
2872 #    endif
2873 
2874                 LOCALECONV_UNLOCK;
2875 
2876                 if (toggle) {
2877                     RESTORE_LC_NUMERIC();
2878                 }
2879 
2880                 break;
2881 
2882 #  endif
2883 #  ifdef HAS_STRFTIME
2884 
2885             /* These are defined by C89, so we assume that strftime supports
2886              * them, and so are returned unconditionally; they may not be what
2887              * the locale actually says, but should give good enough results
2888              * for someone using them as formats (as opposed to trying to parse
2889              * them to figure out what the locale says).  The other format
2890              * items are actually tested to verify they work on the platform */
2891             case D_FMT:         return "%x";
2892             case T_FMT:         return "%X";
2893             case D_T_FMT:       return "%c";
2894 
2895             /* These formats are only available in later strfmtime's */
2896             case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
2897 
2898             /* The rest can be gotten from most versions of strftime(). */
2899             case ABDAY_1: case ABDAY_2: case ABDAY_3:
2900             case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
2901             case ALT_DIGITS:
2902             case AM_STR: case PM_STR:
2903             case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
2904             case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
2905             case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
2906             case DAY_1: case DAY_2: case DAY_3: case DAY_4:
2907             case DAY_5: case DAY_6: case DAY_7:
2908             case MON_1: case MON_2: case MON_3: case MON_4:
2909             case MON_5: case MON_6: case MON_7: case MON_8:
2910             case MON_9: case MON_10: case MON_11: case MON_12:
2911 
2912                 init_tm(&tm);   /* Precaution against core dumps */
2913                 tm.tm_sec = 30;
2914                 tm.tm_min = 30;
2915                 tm.tm_hour = 6;
2916                 tm.tm_year = 2017 - 1900;
2917                 tm.tm_wday = 0;
2918                 tm.tm_mon = 0;
2919 
2920                 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
2921 
2922                 switch (item) {
2923                     default:
2924                         Perl_croak(aTHX_
2925                                     "panic: %s: %d: switch case: %d problem",
2926                                        __FILE__, __LINE__, item);
2927                         NOT_REACHED; /* NOTREACHED */
2928 
2929                     case PM_STR: tm.tm_hour = 18;
2930                     case AM_STR:
2931                         format = "%p";
2932                         break;
2933 
2934                     case ABDAY_7: tm.tm_wday++;
2935                     case ABDAY_6: tm.tm_wday++;
2936                     case ABDAY_5: tm.tm_wday++;
2937                     case ABDAY_4: tm.tm_wday++;
2938                     case ABDAY_3: tm.tm_wday++;
2939                     case ABDAY_2: tm.tm_wday++;
2940                     case ABDAY_1:
2941                         format = "%a";
2942                         break;
2943 
2944                     case DAY_7: tm.tm_wday++;
2945                     case DAY_6: tm.tm_wday++;
2946                     case DAY_5: tm.tm_wday++;
2947                     case DAY_4: tm.tm_wday++;
2948                     case DAY_3: tm.tm_wday++;
2949                     case DAY_2: tm.tm_wday++;
2950                     case DAY_1:
2951                         format = "%A";
2952                         break;
2953 
2954                     case ABMON_12: tm.tm_mon++;
2955                     case ABMON_11: tm.tm_mon++;
2956                     case ABMON_10: tm.tm_mon++;
2957                     case ABMON_9: tm.tm_mon++;
2958                     case ABMON_8: tm.tm_mon++;
2959                     case ABMON_7: tm.tm_mon++;
2960                     case ABMON_6: tm.tm_mon++;
2961                     case ABMON_5: tm.tm_mon++;
2962                     case ABMON_4: tm.tm_mon++;
2963                     case ABMON_3: tm.tm_mon++;
2964                     case ABMON_2: tm.tm_mon++;
2965                     case ABMON_1:
2966                         format = "%b";
2967                         break;
2968 
2969                     case MON_12: tm.tm_mon++;
2970                     case MON_11: tm.tm_mon++;
2971                     case MON_10: tm.tm_mon++;
2972                     case MON_9: tm.tm_mon++;
2973                     case MON_8: tm.tm_mon++;
2974                     case MON_7: tm.tm_mon++;
2975                     case MON_6: tm.tm_mon++;
2976                     case MON_5: tm.tm_mon++;
2977                     case MON_4: tm.tm_mon++;
2978                     case MON_3: tm.tm_mon++;
2979                     case MON_2: tm.tm_mon++;
2980                     case MON_1:
2981                         format = "%B";
2982                         break;
2983 
2984                     case T_FMT_AMPM:
2985                         format = "%r";
2986                         return_format = TRUE;
2987                         break;
2988 
2989                     case ERA_D_FMT:
2990                         format = "%Ex";
2991                         return_format = TRUE;
2992                         break;
2993 
2994                     case ERA_T_FMT:
2995                         format = "%EX";
2996                         return_format = TRUE;
2997                         break;
2998 
2999                     case ERA_D_T_FMT:
3000                         format = "%Ec";
3001                         return_format = TRUE;
3002                         break;
3003 
3004                     case ALT_DIGITS:
3005                         tm.tm_wday = 0;
3006                         format = "%Ow";	/* Find the alternate digit for 0 */
3007                         break;
3008                 }
3009 
3010                 GCC_DIAG_RESTORE_STMT;
3011 
3012                 /* We can't use my_strftime() because it doesn't look at
3013                  * tm_wday  */
3014                 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
3015                                      format, &tm))
3016                 {
3017                     /* A zero return means one of:
3018                      *  a)  there wasn't enough space in PL_langinfo_buf
3019                      *  b)  the format, like a plain %p, returns empty
3020                      *  c)  it was an illegal format, though some
3021                      *      implementations of strftime will just return the
3022                      *      illegal format as a plain character sequence.
3023                      *
3024                      *  To quickly test for case 'b)', try again but precede
3025                      *  the format with a plain character.  If that result is
3026                      *  still empty, the problem is either 'a)' or 'c)' */
3027 
3028                     Size_t format_size = strlen(format) + 1;
3029                     Size_t mod_size = format_size + 1;
3030                     char * mod_format;
3031                     char * temp_result;
3032 
3033                     Newx(mod_format, mod_size, char);
3034                     Newx(temp_result, PL_langinfo_bufsize, char);
3035                     *mod_format = ' ';
3036                     my_strlcpy(mod_format + 1, format, mod_size);
3037                     len = strftime(temp_result,
3038                                    PL_langinfo_bufsize,
3039                                    mod_format, &tm);
3040                     Safefree(mod_format);
3041                     Safefree(temp_result);
3042 
3043                     /* If 'len' is non-zero, it means that we had a case like
3044                      * %p which means the current locale doesn't use a.m. or
3045                      * p.m., and that is valid */
3046                     if (len == 0) {
3047 
3048                         /* Here, still didn't work.  If we get well beyond a
3049                          * reasonable size, bail out to prevent an infinite
3050                          * loop. */
3051 
3052                         if (PL_langinfo_bufsize > 100 * format_size) {
3053                             *PL_langinfo_buf = '\0';
3054                         }
3055                         else {
3056                             /* Double the buffer size to retry;  Add 1 in case
3057                              * original was 0, so we aren't stuck at 0.  */
3058                             PL_langinfo_bufsize *= 2;
3059                             PL_langinfo_bufsize++;
3060                             Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
3061                             continue;
3062                         }
3063                     }
3064 
3065                     break;
3066                 }
3067 
3068                 /* Here, we got a result.
3069                  *
3070                  * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
3071                  * alternate format for wday 0.  If the value is the same as
3072                  * the normal 0, there isn't an alternate, so clear the buffer.
3073                  * */
3074                 if (   item == ALT_DIGITS
3075                     && strEQ(PL_langinfo_buf, "0"))
3076                 {
3077                     *PL_langinfo_buf = '\0';
3078                 }
3079 
3080                 /* ALT_DIGITS is problematic.  Experiments on it showed that
3081                  * strftime() did not always work properly when going from
3082                  * alt-9 to alt-10.  Only a few locales have this item defined,
3083                  * and in all of them on Linux that khw was able to find,
3084                  * nl_langinfo() merely returned the alt-0 character, possibly
3085                  * doubled.  Most Unicode digits are in blocks of 10
3086                  * consecutive code points, so that is sufficient information
3087                  * for those scripts, as we can infer alt-1, alt-2, ....  But
3088                  * for a Japanese locale, a CJK ideographic 0 is returned, and
3089                  * the CJK digits are not in code point order, so you can't
3090                  * really infer anything.  The localedef for this locale did
3091                  * specify the succeeding digits, so that strftime() works
3092                  * properly on them, without needing to infer anything.  But
3093                  * the nl_langinfo() return did not give sufficient information
3094                  * for the caller to understand what's going on.  So until
3095                  * there is evidence that it should work differently, this
3096                  * returns the alt-0 string for ALT_DIGITS.
3097                  *
3098                  * wday was chosen because its range is all a single digit.
3099                  * Things like tm_sec have two digits as the minimum: '00' */
3100 
3101                 retval = PL_langinfo_buf;
3102 
3103                 /* If to return the format, not the value, overwrite the buffer
3104                  * with it.  But some strftime()s will keep the original format
3105                  * if illegal, so change those to "" */
3106                 if (return_format) {
3107                     if (strEQ(PL_langinfo_buf, format)) {
3108                         *PL_langinfo_buf = '\0';
3109                     }
3110                     else {
3111                         retval = save_to_buffer(format, &PL_langinfo_buf,
3112                                                 &PL_langinfo_bufsize, 0);
3113                     }
3114                 }
3115 
3116                 break;
3117 
3118 #  endif
3119 
3120         }
3121     }
3122 
3123     return retval;
3124 
3125 #endif
3126 
3127 }
3128 
3129 /*
3130  * Initialize locale awareness.
3131  */
3132 int
3133 Perl_init_i18nl10n(pTHX_ int printwarn)
3134 {
3135     /* printwarn is
3136      *
3137      *    0 if not to output warning when setup locale is bad
3138      *    1 if to output warning based on value of PERL_BADLANG
3139      *    >1 if to output regardless of PERL_BADLANG
3140      *
3141      * returns
3142      *    1 = set ok or not applicable,
3143      *    0 = fallback to a locale of lower priority
3144      *   -1 = fallback to all locales failed, not even to the C locale
3145      *
3146      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
3147      * set, debugging information is output.
3148      *
3149      * This looks more complicated than it is, mainly due to the #ifdefs and
3150      * error handling.
3151      *
3152      * Besides some asserts, data structure initialization, and specific
3153      * platform complications, this routine is effectively just two things.
3154      *
3155      *    a)    setlocale(LC_ALL, "");
3156      *
3157      * which sets LC_ALL to the values in the current environment.
3158      *
3159      * And for each individual category 'foo' whose value we care about:
3160      *
3161      *    b)    save_foo = setlocale(LC_foo, NULL); handle_foo(save_foo);
3162      *
3163      * (We don't tend to care about categories like LC_PAPER, for example.)
3164      *
3165      * But there are complications.  On systems without LC_ALL, it emulates
3166      * step a) by looping through all the categories, and doing
3167      *
3168      *    setlocale(LC_foo, "");
3169      *
3170      * on each.
3171      *
3172      * And it has to deal with if this is an embedded perl, whose locale
3173      * doesn't come from the environment, but has been set up by the caller.
3174      * This is pretty simply handled: the "" in the setlocale calls is not a
3175      * string constant, but a variable which is set to NULL in the embedded
3176      * case.
3177      *
3178      * But the major complication is handling failure and doing fallback.
3179      * There is an array, trial_locales, the elements of which are looped over
3180      * until the locale is successfully set.  The array is initialized with
3181      * just one element, for
3182      *      setlocale(LC_ALL, $NULL_or_empty)
3183      * If that works, as it almost always does, there's no more elements and
3184      * the loop iterates just the once.  Otherwise elements are added for each
3185      * of the environment variables that POSIX dictates should control the
3186      * program, in priority order, with a final one being "C".  The loop is
3187      * repeated until the first one succeeds.  If all fail, we limp along with
3188      * whatever state we got to.  If there is no LC_ALL, an inner loop is run
3189      * through all categories (making things look complex).
3190      *
3191      * A further complication is that Windows has an additional fallback, the
3192      * user-default ANSI code page obtained from the operating system.  This is
3193      * added as yet another loop iteration, just before the final "C"
3194      *
3195      * On Ultrix, the locale MUST come from the environment, so there is
3196      * preliminary code to set it.  I (khw) am not sure that it is necessary,
3197      * and that this couldn't be folded into the loop, but barring any real
3198      * platforms to test on, it's staying as-is
3199      */
3200 
3201     int ok = 1;
3202 
3203 #ifndef USE_LOCALE
3204 
3205     PERL_UNUSED_ARG(printwarn);
3206 
3207 #else  /* USE_LOCALE */
3208 #  ifdef __GLIBC__
3209 
3210     const char * const language = PerlEnv_getenv("LANGUAGE");
3211 
3212 #  endif
3213 
3214     /* NULL uses the existing already set up locale */
3215     const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
3216                                         ? NULL
3217                                         : "";
3218     const char* trial_locales[5];   /* 5 = 1 each for "", LC_ALL, LANG, "", C */
3219     unsigned int trial_locales_count;
3220     const char * const lc_all     = PerlEnv_getenv("LC_ALL");
3221     const char * const lang       = PerlEnv_getenv("LANG");
3222     bool setlocale_failure = FALSE;
3223     unsigned int i;
3224 
3225     /* A later getenv() could zap this, so only use here */
3226     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
3227 
3228     const bool locwarn = (printwarn > 1
3229                           || (          printwarn
3230                               && (    ! bad_lang_use_once
3231                                   || (
3232                                          /* disallow with "" or "0" */
3233                                          *bad_lang_use_once
3234                                        && strNE("0", bad_lang_use_once)))));
3235 
3236     /* setlocale() return vals; not copied so must be looked at immediately */
3237     const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
3238 
3239     /* current locale for given category; should have been copied so aren't
3240      * volatile */
3241     const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
3242 
3243 #  ifdef WIN32
3244 
3245     /* In some systems you can find out the system default locale
3246      * and use that as the fallback locale. */
3247 #    define SYSTEM_DEFAULT_LOCALE
3248 #  endif
3249 #  ifdef SYSTEM_DEFAULT_LOCALE
3250 
3251     const char *system_default_locale = NULL;
3252 
3253 #  endif
3254 
3255 #  ifndef DEBUGGING
3256 #    define DEBUG_LOCALE_INIT(a,b,c)
3257 #  else
3258 
3259     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
3260 
3261 #    define DEBUG_LOCALE_INIT(category, locale, result)                     \
3262         STMT_START {                                                        \
3263                 if (debug_initialization) {                                 \
3264                     PerlIO_printf(Perl_debug_log,                           \
3265                                   "%s:%d: %s\n",                            \
3266                                   __FILE__, __LINE__,                       \
3267                                   setlocale_debug_string(category,          \
3268                                                           locale,           \
3269                                                           result));         \
3270                 }                                                           \
3271         } STMT_END
3272 
3273 /* Make sure the parallel arrays are properly set up */
3274 #    ifdef USE_LOCALE_NUMERIC
3275     assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC);
3276     assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC"));
3277 #      ifdef USE_POSIX_2008_LOCALE
3278     assert(category_masks[LC_NUMERIC_INDEX] == LC_NUMERIC_MASK);
3279 #      endif
3280 #    endif
3281 #    ifdef USE_LOCALE_CTYPE
3282     assert(categories[LC_CTYPE_INDEX] == LC_CTYPE);
3283     assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE"));
3284 #      ifdef USE_POSIX_2008_LOCALE
3285     assert(category_masks[LC_CTYPE_INDEX] == LC_CTYPE_MASK);
3286 #      endif
3287 #    endif
3288 #    ifdef USE_LOCALE_COLLATE
3289     assert(categories[LC_COLLATE_INDEX] == LC_COLLATE);
3290     assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE"));
3291 #      ifdef USE_POSIX_2008_LOCALE
3292     assert(category_masks[LC_COLLATE_INDEX] == LC_COLLATE_MASK);
3293 #      endif
3294 #    endif
3295 #    ifdef USE_LOCALE_TIME
3296     assert(categories[LC_TIME_INDEX] == LC_TIME);
3297     assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME"));
3298 #      ifdef USE_POSIX_2008_LOCALE
3299     assert(category_masks[LC_TIME_INDEX] == LC_TIME_MASK);
3300 #      endif
3301 #    endif
3302 #    ifdef USE_LOCALE_MESSAGES
3303     assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES);
3304     assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES"));
3305 #      ifdef USE_POSIX_2008_LOCALE
3306     assert(category_masks[LC_MESSAGES_INDEX] == LC_MESSAGES_MASK);
3307 #      endif
3308 #    endif
3309 #    ifdef USE_LOCALE_MONETARY
3310     assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
3311     assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
3312 #      ifdef USE_POSIX_2008_LOCALE
3313     assert(category_masks[LC_MONETARY_INDEX] == LC_MONETARY_MASK);
3314 #      endif
3315 #    endif
3316 #    ifdef USE_LOCALE_ADDRESS
3317     assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS);
3318     assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS"));
3319 #      ifdef USE_POSIX_2008_LOCALE
3320     assert(category_masks[LC_ADDRESS_INDEX] == LC_ADDRESS_MASK);
3321 #      endif
3322 #    endif
3323 #    ifdef USE_LOCALE_IDENTIFICATION
3324     assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION);
3325     assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION"));
3326 #      ifdef USE_POSIX_2008_LOCALE
3327     assert(category_masks[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION_MASK);
3328 #      endif
3329 #    endif
3330 #    ifdef USE_LOCALE_MEASUREMENT
3331     assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT);
3332     assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT"));
3333 #      ifdef USE_POSIX_2008_LOCALE
3334     assert(category_masks[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT_MASK);
3335 #      endif
3336 #    endif
3337 #    ifdef USE_LOCALE_PAPER
3338     assert(categories[LC_PAPER_INDEX] == LC_PAPER);
3339     assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER"));
3340 #      ifdef USE_POSIX_2008_LOCALE
3341     assert(category_masks[LC_PAPER_INDEX] == LC_PAPER_MASK);
3342 #      endif
3343 #    endif
3344 #    ifdef USE_LOCALE_TELEPHONE
3345     assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE);
3346     assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE"));
3347 #      ifdef USE_POSIX_2008_LOCALE
3348     assert(category_masks[LC_TELEPHONE_INDEX] == LC_TELEPHONE_MASK);
3349 #      endif
3350 #    endif
3351 #    ifdef USE_LOCALE_SYNTAX
3352     assert(categories[LC_SYNTAX_INDEX] == LC_SYNTAX);
3353     assert(strEQ(category_names[LC_SYNTAX_INDEX], "LC_SYNTAX"));
3354 #      ifdef USE_POSIX_2008_LOCALE
3355     assert(category_masks[LC_SYNTAX_INDEX] == LC_SYNTAX_MASK);
3356 #      endif
3357 #    endif
3358 #    ifdef USE_LOCALE_TOD
3359     assert(categories[LC_TOD_INDEX] == LC_TOD);
3360     assert(strEQ(category_names[LC_TOD_INDEX], "LC_TOD"));
3361 #      ifdef USE_POSIX_2008_LOCALE
3362     assert(category_masks[LC_TOD_INDEX] == LC_TOD_MASK);
3363 #      endif
3364 #    endif
3365 #    ifdef LC_ALL
3366     assert(categories[LC_ALL_INDEX] == LC_ALL);
3367     assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
3368     assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
3369 #      ifdef USE_POSIX_2008_LOCALE
3370     assert(category_masks[LC_ALL_INDEX] == LC_ALL_MASK);
3371 #      endif
3372 #    endif
3373 #  endif    /* DEBUGGING */
3374 
3375     /* Initialize the per-thread mbrFOO() state variables.  See POSIX.xs for
3376      * why these particular incantations are used. */
3377 #ifdef HAS_MBRLEN
3378     memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
3379 #endif
3380 #ifdef HAS_MBRTOWC
3381     memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
3382 #endif
3383 #ifdef HAS_WCTOMBR
3384     wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
3385 #endif
3386 
3387     /* Initialize the cache of the program's UTF-8ness for the always known
3388      * locales C and POSIX */
3389     my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
3390                sizeof(PL_locale_utf8ness));
3391 
3392     /* See https://github.com/Perl/perl5/issues/17824 */
3393     Zero(curlocales, NOMINAL_LC_ALL_INDEX, char *);
3394 
3395 #  ifdef USE_THREAD_SAFE_LOCALE
3396 #    ifdef WIN32
3397 
3398     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3399 
3400 #    endif
3401 #  endif
3402 #  ifdef USE_POSIX_2008_LOCALE
3403 
3404     PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
3405     if (! PL_C_locale_obj) {
3406         Perl_croak_nocontext(
3407             "panic: Cannot create POSIX 2008 C locale object; errno=%d", errno);
3408     }
3409     if (DEBUG_Lv_TEST || debug_initialization) {
3410         PerlIO_printf(Perl_debug_log, "%s:%d: created C object %p\n", __FILE__, __LINE__, PL_C_locale_obj);
3411     }
3412 
3413 #  endif
3414 
3415 #  ifdef USE_LOCALE_NUMERIC
3416 
3417     PL_numeric_radix_sv = newSVpvs(".");
3418 
3419 #  endif
3420 
3421 #  if defined(USE_POSIX_2008_LOCALE) && ! defined(HAS_QUERYLOCALE)
3422 
3423     /* Initialize our records.  If we have POSIX 2008, we have LC_ALL */
3424     do_setlocale_c(LC_ALL, my_setlocale(LC_ALL, NULL));
3425 
3426 #  endif
3427 #  ifdef LOCALE_ENVIRON_REQUIRED
3428 
3429     /*
3430      * Ultrix setlocale(..., "") fails if there are no environment
3431      * variables from which to get a locale name.
3432      */
3433 
3434 #    ifndef LC_ALL
3435 #      error Ultrix without LC_ALL not implemented
3436 #    else
3437 
3438     {
3439         bool done = FALSE;
3440         if (lang) {
3441             sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
3442             DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
3443             if (sl_result[LC_ALL_INDEX])
3444                 done = TRUE;
3445             else
3446                 setlocale_failure = TRUE;
3447         }
3448         if (! setlocale_failure) {
3449             const char * locale_param;
3450             for (i = 0; i < LC_ALL_INDEX; i++) {
3451                 locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
3452                             ? setlocale_init
3453                             : NULL;
3454                 sl_result[i] = do_setlocale_r(categories[i], locale_param);
3455                 if (! sl_result[i]) {
3456                     setlocale_failure = TRUE;
3457                 }
3458                 DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
3459             }
3460         }
3461     }
3462 
3463 #    endif /* LC_ALL */
3464 #  endif /* LOCALE_ENVIRON_REQUIRED */
3465 
3466     /* We try each locale in the list until we get one that works, or exhaust
3467      * the list.  Normally the loop is executed just once.  But if setting the
3468      * locale fails, inside the loop we add fallback trials to the array and so
3469      * will execute the loop multiple times */
3470     trial_locales[0] = setlocale_init;
3471     trial_locales_count = 1;
3472 
3473     for (i= 0; i < trial_locales_count; i++) {
3474         const char * trial_locale = trial_locales[i];
3475 
3476         if (i > 0) {
3477 
3478             /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
3479              * when i==0, but I (khw) don't think that behavior makes much
3480              * sense */
3481             setlocale_failure = FALSE;
3482 
3483 #  ifdef SYSTEM_DEFAULT_LOCALE
3484 #    ifdef WIN32    /* Note that assumes Win32 has LC_ALL */
3485 
3486             /* On Windows machines, an entry of "" after the 0th means to use
3487              * the system default locale, which we now proceed to get. */
3488             if (strEQ(trial_locale, "")) {
3489                 unsigned int j;
3490 
3491                 /* Note that this may change the locale, but we are going to do
3492                  * that anyway just below */
3493                 system_default_locale = do_setlocale_c(LC_ALL, "");
3494                 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
3495 
3496                 /* Skip if invalid or if it's already on the list of locales to
3497                  * try */
3498                 if (! system_default_locale) {
3499                     goto next_iteration;
3500                 }
3501                 for (j = 0; j < trial_locales_count; j++) {
3502                     if (strEQ(system_default_locale, trial_locales[j])) {
3503                         goto next_iteration;
3504                     }
3505                 }
3506 
3507                 trial_locale = system_default_locale;
3508             }
3509 #    else
3510 #      error SYSTEM_DEFAULT_LOCALE only implemented for Win32
3511 #    endif
3512 #  endif /* SYSTEM_DEFAULT_LOCALE */
3513 
3514         }   /* For i > 0 */
3515 
3516 #  ifdef LC_ALL
3517 
3518         sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
3519         DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]);
3520         if (! sl_result[LC_ALL_INDEX]) {
3521             setlocale_failure = TRUE;
3522         }
3523         else {
3524             /* Since LC_ALL succeeded, it should have changed all the other
3525              * categories it can to its value; so we massage things so that the
3526              * setlocales below just return their category's current values.
3527              * This adequately handles the case in NetBSD where LC_COLLATE may
3528              * not be defined for a locale, and setting it individually will
3529              * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
3530              * the POSIX locale. */
3531             trial_locale = NULL;
3532         }
3533 
3534 #  endif /* LC_ALL */
3535 
3536         if (! setlocale_failure) {
3537             unsigned int j;
3538             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
3539                 curlocales[j]
3540                         = savepv(do_setlocale_r(categories[j], trial_locale));
3541                 if (! curlocales[j]) {
3542                     setlocale_failure = TRUE;
3543                 }
3544                 DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
3545             }
3546 
3547             if (LIKELY(! setlocale_failure)) {  /* All succeeded */
3548                 break;  /* Exit trial_locales loop */
3549             }
3550         }
3551 
3552         /* Here, something failed; will need to try a fallback. */
3553         ok = 0;
3554 
3555         if (i == 0) {
3556             unsigned int j;
3557 
3558             if (locwarn) { /* Output failure info only on the first one */
3559 
3560 #  ifdef LC_ALL
3561 
3562                 PerlIO_printf(Perl_error_log,
3563                 "perl: warning: Setting locale failed.\n");
3564 
3565 #  else /* !LC_ALL */
3566 
3567                 PerlIO_printf(Perl_error_log,
3568                 "perl: warning: Setting locale failed for the categories:\n\t");
3569 
3570                 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
3571                     if (! curlocales[j]) {
3572                         PerlIO_printf(Perl_error_log, category_names[j]);
3573                     }
3574                     else {
3575                         Safefree(curlocales[j]);
3576                     }
3577                 }
3578 
3579 #  endif /* LC_ALL */
3580 
3581                 PerlIO_printf(Perl_error_log,
3582                     "perl: warning: Please check that your locale settings:\n");
3583 
3584 #  ifdef __GLIBC__
3585 
3586                 PerlIO_printf(Perl_error_log,
3587                             "\tLANGUAGE = %c%s%c,\n",
3588                             language ? '"' : '(',
3589                             language ? language : "unset",
3590                             language ? '"' : ')');
3591 #  endif
3592 
3593                 PerlIO_printf(Perl_error_log,
3594                             "\tLC_ALL = %c%s%c,\n",
3595                             lc_all ? '"' : '(',
3596                             lc_all ? lc_all : "unset",
3597                             lc_all ? '"' : ')');
3598 
3599 #  if defined(USE_ENVIRON_ARRAY)
3600 
3601                 {
3602                     char **e;
3603 
3604                     /* Look through the environment for any variables of the
3605                      * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
3606                      * already handled above.  These are assumed to be locale
3607                      * settings.  Output them and their values. */
3608                     for (e = environ; *e; e++) {
3609                         const STRLEN prefix_len = sizeof("LC_") - 1;
3610                         STRLEN uppers_len;
3611 
3612                         if (     strBEGINs(*e, "LC_")
3613                             && ! strBEGINs(*e, "LC_ALL=")
3614                             && (uppers_len = strspn(*e + prefix_len,
3615                                              "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
3616                             && ((*e)[prefix_len + uppers_len] == '='))
3617                         {
3618                             PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
3619                                 (int) (prefix_len + uppers_len), *e,
3620                                 *e + prefix_len + uppers_len + 1);
3621                         }
3622                     }
3623                 }
3624 
3625 #  else
3626 
3627                 PerlIO_printf(Perl_error_log,
3628                             "\t(possibly more locale environment variables)\n");
3629 
3630 #  endif
3631 
3632                 PerlIO_printf(Perl_error_log,
3633                             "\tLANG = %c%s%c\n",
3634                             lang ? '"' : '(',
3635                             lang ? lang : "unset",
3636                             lang ? '"' : ')');
3637 
3638                 PerlIO_printf(Perl_error_log,
3639                             "    are supported and installed on your system.\n");
3640             }
3641 
3642             /* Calculate what fallback locales to try.  We have avoided this
3643              * until we have to, because failure is quite unlikely.  This will
3644              * usually change the upper bound of the loop we are in.
3645              *
3646              * Since the system's default way of setting the locale has not
3647              * found one that works, We use Perl's defined ordering: LC_ALL,
3648              * LANG, and the C locale.  We don't try the same locale twice, so
3649              * don't add to the list if already there.  (On POSIX systems, the
3650              * LC_ALL element will likely be a repeat of the 0th element "",
3651              * but there's no harm done by doing it explicitly.
3652              *
3653              * Note that this tries the LC_ALL environment variable even on
3654              * systems which have no LC_ALL locale setting.  This may or may
3655              * not have been originally intentional, but there's no real need
3656              * to change the behavior. */
3657             if (lc_all) {
3658                 for (j = 0; j < trial_locales_count; j++) {
3659                     if (strEQ(lc_all, trial_locales[j])) {
3660                         goto done_lc_all;
3661                     }
3662                 }
3663                 trial_locales[trial_locales_count++] = lc_all;
3664             }
3665           done_lc_all:
3666 
3667             if (lang) {
3668                 for (j = 0; j < trial_locales_count; j++) {
3669                     if (strEQ(lang, trial_locales[j])) {
3670                         goto done_lang;
3671                     }
3672                 }
3673                 trial_locales[trial_locales_count++] = lang;
3674             }
3675           done_lang:
3676 
3677 #  if defined(WIN32) && defined(LC_ALL)
3678 
3679             /* For Windows, we also try the system default locale before "C".
3680              * (If there exists a Windows without LC_ALL we skip this because
3681              * it gets too complicated.  For those, the "C" is the next
3682              * fallback possibility).  The "" is the same as the 0th element of
3683              * the array, but the code at the loop above knows to treat it
3684              * differently when not the 0th */
3685             trial_locales[trial_locales_count++] = "";
3686 
3687 #  endif
3688 
3689             for (j = 0; j < trial_locales_count; j++) {
3690                 if (strEQ("C", trial_locales[j])) {
3691                     goto done_C;
3692                 }
3693             }
3694             trial_locales[trial_locales_count++] = "C";
3695 
3696           done_C: ;
3697         }   /* end of first time through the loop */
3698 
3699 #  ifdef WIN32
3700 
3701       next_iteration: ;
3702 
3703 #  endif
3704 
3705     }   /* end of looping through the trial locales */
3706 
3707     if (ok < 1) {   /* If we tried to fallback */
3708         const char* msg;
3709         if (! setlocale_failure) {  /* fallback succeeded */
3710            msg = "Falling back to";
3711         }
3712         else {  /* fallback failed */
3713             unsigned int j;
3714 
3715             /* We dropped off the end of the loop, so have to decrement i to
3716              * get back to the value the last time through */
3717             i--;
3718 
3719             ok = -1;
3720             msg = "Failed to fall back to";
3721 
3722             /* To continue, we should use whatever values we've got */
3723 
3724             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
3725                 Safefree(curlocales[j]);
3726                 curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
3727                 DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
3728             }
3729         }
3730 
3731         if (locwarn) {
3732             const char * description;
3733             const char * name = "";
3734             if (strEQ(trial_locales[i], "C")) {
3735                 description = "the standard locale";
3736                 name = "C";
3737             }
3738 
3739 #  ifdef SYSTEM_DEFAULT_LOCALE
3740 
3741             else if (strEQ(trial_locales[i], "")) {
3742                 description = "the system default locale";
3743                 if (system_default_locale) {
3744                     name = system_default_locale;
3745                 }
3746             }
3747 
3748 #  endif /* SYSTEM_DEFAULT_LOCALE */
3749 
3750             else {
3751                 description = "a fallback locale";
3752                 name = trial_locales[i];
3753             }
3754             if (name && strNE(name, "")) {
3755                 PerlIO_printf(Perl_error_log,
3756                     "perl: warning: %s %s (\"%s\").\n", msg, description, name);
3757             }
3758             else {
3759                 PerlIO_printf(Perl_error_log,
3760                                    "perl: warning: %s %s.\n", msg, description);
3761             }
3762         }
3763     } /* End of tried to fallback */
3764 
3765     /* Done with finding the locales; update our records */
3766 
3767 #  ifdef USE_LOCALE_CTYPE
3768 
3769     new_ctype(curlocales[LC_CTYPE_INDEX]);
3770 
3771 #  endif
3772 #  ifdef USE_LOCALE_COLLATE
3773 
3774     new_collate(curlocales[LC_COLLATE_INDEX]);
3775 
3776 #  endif
3777 #  ifdef USE_LOCALE_NUMERIC
3778 
3779     new_numeric(curlocales[LC_NUMERIC_INDEX]);
3780 
3781 #  endif
3782 
3783     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
3784 
3785 #  if defined(USE_ITHREADS) && ! defined(USE_THREAD_SAFE_LOCALE)
3786 
3787         /* This caches whether each category's locale is UTF-8 or not.  This
3788          * may involve changing the locale.  It is ok to do this at
3789          * initialization time before any threads have started, but not later
3790          * unless thread-safe operations are used.
3791          * Caching means that if the program heeds our dictate not to change
3792          * locales in threaded applications, this data will remain valid, and
3793          * it may get queried without having to change locales.  If the
3794          * environment is such that all categories have the same locale, this
3795          * isn't needed, as the code will not change the locale; but this
3796          * handles the uncommon case where the environment has disparate
3797          * locales for the categories */
3798         (void) _is_cur_LC_category_utf8(categories[i]);
3799 
3800 #  endif
3801 
3802         Safefree(curlocales[i]);
3803     }
3804 
3805 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
3806 
3807     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
3808      * locale is UTF-8.  The call to new_ctype() just above has already
3809      * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
3810      * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
3811      * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
3812      * STDIN, STDOUT, STDERR, _and_ the default open discipline.  */
3813     PL_utf8locale = PL_in_utf8_CTYPE_locale;
3814 
3815     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
3816        This is an alternative to using the -C command line switch
3817        (the -C if present will override this). */
3818     {
3819          const char *p = PerlEnv_getenv("PERL_UNICODE");
3820          PL_unicode = p ? parse_unicode_opts(&p) : 0;
3821          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
3822              PL_utf8cache = -1;
3823     }
3824 
3825 #  endif
3826 #endif /* USE_LOCALE */
3827 #ifdef DEBUGGING
3828 
3829     /* So won't continue to output stuff */
3830     DEBUG_INITIALIZATION_set(FALSE);
3831 
3832 #endif
3833 
3834     return ok;
3835 }
3836 
3837 #ifdef USE_LOCALE_COLLATE
3838 
3839 char *
3840 Perl__mem_collxfrm(pTHX_ const char *input_string,
3841                          STRLEN len,    /* Length of 'input_string' */
3842                          STRLEN *xlen,  /* Set to length of returned string
3843                                            (not including the collation index
3844                                            prefix) */
3845                          bool utf8      /* Is the input in UTF-8? */
3846                    )
3847 {
3848 
3849     /* _mem_collxfrm() is a bit like strxfrm() but with two important
3850      * differences. First, it handles embedded NULs. Second, it allocates a bit
3851      * more memory than needed for the transformed data itself.  The real
3852      * transformed data begins at offset COLLXFRM_HDR_LEN.  *xlen is set to
3853      * the length of that, and doesn't include the collation index size.
3854      * Please see sv_collxfrm() to see how this is used. */
3855 
3856 #define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
3857 
3858     char * s = (char *) input_string;
3859     STRLEN s_strlen = strlen(input_string);
3860     char *xbuf = NULL;
3861     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
3862     STRLEN length_in_chars;
3863     bool first_time = TRUE; /* Cleared after first loop iteration */
3864 
3865     PERL_ARGS_ASSERT__MEM_COLLXFRM;
3866 
3867     /* Must be NUL-terminated */
3868     assert(*(input_string + len) == '\0');
3869 
3870     /* If this locale has defective collation, skip */
3871     if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
3872         DEBUG_L(PerlIO_printf(Perl_debug_log,
3873                       "_mem_collxfrm: locale's collation is defective\n"));
3874         goto bad;
3875     }
3876 
3877     /* Replace any embedded NULs with the control that sorts before any others.
3878      * This will give as good as possible results on strings that don't
3879      * otherwise contain that character, but otherwise there may be
3880      * less-than-perfect results with that character and NUL.  This is
3881      * unavoidable unless we replace strxfrm with our own implementation. */
3882     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
3883                                          NUL */
3884         char * e = s + len;
3885         char * sans_nuls;
3886         STRLEN sans_nuls_len;
3887         int try_non_controls;
3888         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
3889                                                    making sure 2nd byte is NUL.
3890                                                  */
3891         STRLEN this_replacement_len;
3892 
3893         /* If we don't know what non-NUL control character sorts lowest for
3894          * this locale, find it */
3895         if (PL_strxfrm_NUL_replacement == '\0') {
3896             int j;
3897             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
3898                                            includes the collation index
3899                                            prefixed. */
3900 
3901             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
3902 
3903             /* Unlikely, but it may be that no control will work to replace
3904              * NUL, in which case we instead look for any character.  Controls
3905              * are preferred because collation order is, in general, context
3906              * sensitive, with adjoining characters affecting the order, and
3907              * controls are less likely to have such interactions, allowing the
3908              * NUL-replacement to stand on its own.  (Another way to look at it
3909              * is to imagine what would happen if the NUL were replaced by a
3910              * combining character; it wouldn't work out all that well.) */
3911             for (try_non_controls = 0;
3912                  try_non_controls < 2;
3913                  try_non_controls++)
3914             {
3915                 /* Look through all legal code points (NUL isn't) */
3916                 for (j = 1; j < 256; j++) {
3917                     char * x;       /* j's xfrm plus collation index */
3918                     STRLEN x_len;   /* length of 'x' */
3919                     STRLEN trial_len = 1;
3920                     char cur_source[] = { '\0', '\0' };
3921 
3922                     /* Skip non-controls the first time through the loop.  The
3923                      * controls in a UTF-8 locale are the L1 ones */
3924                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
3925                                                ? ! isCNTRL_L1(j)
3926                                                : ! isCNTRL_LC(j))
3927                     {
3928                         continue;
3929                     }
3930 
3931                     /* Create a 1-char string of the current code point */
3932                     cur_source[0] = (char) j;
3933 
3934                     /* Then transform it */
3935                     x = _mem_collxfrm(cur_source, trial_len, &x_len,
3936                                       0 /* The string is not in UTF-8 */);
3937 
3938                     /* Ignore any character that didn't successfully transform.
3939                      * */
3940                     if (! x) {
3941                         continue;
3942                     }
3943 
3944                     /* If this character's transformation is lower than
3945                      * the current lowest, this one becomes the lowest */
3946                     if (   cur_min_x == NULL
3947                         || strLT(x         + COLLXFRM_HDR_LEN,
3948                                  cur_min_x + COLLXFRM_HDR_LEN))
3949                     {
3950                         PL_strxfrm_NUL_replacement = j;
3951                         Safefree(cur_min_x);
3952                         cur_min_x = x;
3953                     }
3954                     else {
3955                         Safefree(x);
3956                     }
3957                 } /* end of loop through all 255 characters */
3958 
3959                 /* Stop looking if found */
3960                 if (cur_min_x) {
3961                     break;
3962                 }
3963 
3964                 /* Unlikely, but possible, if there aren't any controls that
3965                  * work in the locale, repeat the loop, looking for any
3966                  * character that works */
3967                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3968                 "_mem_collxfrm: No control worked.  Trying non-controls\n"));
3969             } /* End of loop to try first the controls, then any char */
3970 
3971             if (! cur_min_x) {
3972                 DEBUG_L(PerlIO_printf(Perl_debug_log,
3973                     "_mem_collxfrm: Couldn't find any character to replace"
3974                     " embedded NULs in locale %s with", PL_collation_name));
3975                 goto bad;
3976             }
3977 
3978             DEBUG_L(PerlIO_printf(Perl_debug_log,
3979                     "_mem_collxfrm: Replacing embedded NULs in locale %s with "
3980                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
3981 
3982             Safefree(cur_min_x);
3983         } /* End of determining the character that is to replace NULs */
3984 
3985         /* If the replacement is variant under UTF-8, it must match the
3986          * UTF8-ness of the original */
3987         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
3988             this_replacement_char[0] =
3989                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
3990             this_replacement_char[1] =
3991                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
3992             this_replacement_len = 2;
3993         }
3994         else {
3995             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
3996             /* this_replacement_char[1] = '\0' was done at initialization */
3997             this_replacement_len = 1;
3998         }
3999 
4000         /* The worst case length for the replaced string would be if every
4001          * character in it is NUL.  Multiply that by the length of each
4002          * replacement, and allow for a trailing NUL */
4003         sans_nuls_len = (len * this_replacement_len) + 1;
4004         Newx(sans_nuls, sans_nuls_len, char);
4005         *sans_nuls = '\0';
4006 
4007         /* Replace each NUL with the lowest collating control.  Loop until have
4008          * exhausted all the NULs */
4009         while (s + s_strlen < e) {
4010             my_strlcat(sans_nuls, s, sans_nuls_len);
4011 
4012             /* Do the actual replacement */
4013             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
4014 
4015             /* Move past the input NUL */
4016             s += s_strlen + 1;
4017             s_strlen = strlen(s);
4018         }
4019 
4020         /* And add anything that trails the final NUL */
4021         my_strlcat(sans_nuls, s, sans_nuls_len);
4022 
4023         /* Switch so below we transform this modified string */
4024         s = sans_nuls;
4025         len = strlen(s);
4026     } /* End of replacing NULs */
4027 
4028     /* Make sure the UTF8ness of the string and locale match */
4029     if (utf8 != PL_in_utf8_COLLATE_locale) {
4030         /* XXX convert above Unicode to 10FFFF? */
4031         const char * const t = s;   /* Temporary so we can later find where the
4032                                        input was */
4033 
4034         /* Here they don't match.  Change the string's to be what the locale is
4035          * expecting */
4036 
4037         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
4038             s = (char *) bytes_to_utf8((const U8 *) s, &len);
4039             utf8 = TRUE;
4040         }
4041         else {   /* locale is not UTF-8; but input is; downgrade the input */
4042 
4043             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
4044 
4045             /* If the downgrade was successful we are done, but if the input
4046              * contains things that require UTF-8 to represent, have to do
4047              * damage control ... */
4048             if (UNLIKELY(utf8)) {
4049 
4050                 /* What we do is construct a non-UTF-8 string with
4051                  *  1) the characters representable by a single byte converted
4052                  *     to be so (if necessary);
4053                  *  2) and the rest converted to collate the same as the
4054                  *     highest collating representable character.  That makes
4055                  *     them collate at the end.  This is similar to how we
4056                  *     handle embedded NULs, but we use the highest collating
4057                  *     code point instead of the smallest.  Like the NUL case,
4058                  *     this isn't perfect, but is the best we can reasonably
4059                  *     do.  Every above-255 code point will sort the same as
4060                  *     the highest-sorting 0-255 code point.  If that code
4061                  *     point can combine in a sequence with some other code
4062                  *     points for weight calculations, us changing something to
4063                  *     be it can adversely affect the results.  But in most
4064                  *     cases, it should work reasonably.  And note that this is
4065                  *     really an illegal situation: using code points above 255
4066                  *     on a locale where only 0-255 are valid.  If two strings
4067                  *     sort entirely equal, then the sort order for the
4068                  *     above-255 code points will be in code point order. */
4069 
4070                 utf8 = FALSE;
4071 
4072                 /* If we haven't calculated the code point with the maximum
4073                  * collating order for this locale, do so now */
4074                 if (! PL_strxfrm_max_cp) {
4075                     int j;
4076 
4077                     /* The current transformed string that collates the
4078                      * highest (except it also includes the prefixed collation
4079                      * index. */
4080                     char * cur_max_x = NULL;
4081 
4082                     /* Look through all legal code points (NUL isn't) */
4083                     for (j = 1; j < 256; j++) {
4084                         char * x;
4085                         STRLEN x_len;
4086                         char cur_source[] = { '\0', '\0' };
4087 
4088                         /* Create a 1-char string of the current code point */
4089                         cur_source[0] = (char) j;
4090 
4091                         /* Then transform it */
4092                         x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
4093 
4094                         /* If something went wrong (which it shouldn't), just
4095                          * ignore this code point */
4096                         if (! x) {
4097                             continue;
4098                         }
4099 
4100                         /* If this character's transformation is higher than
4101                          * the current highest, this one becomes the highest */
4102                         if (   cur_max_x == NULL
4103                             || strGT(x         + COLLXFRM_HDR_LEN,
4104                                      cur_max_x + COLLXFRM_HDR_LEN))
4105                         {
4106                             PL_strxfrm_max_cp = j;
4107                             Safefree(cur_max_x);
4108                             cur_max_x = x;
4109                         }
4110                         else {
4111                             Safefree(x);
4112                         }
4113                     }
4114 
4115                     if (! cur_max_x) {
4116                         DEBUG_L(PerlIO_printf(Perl_debug_log,
4117                             "_mem_collxfrm: Couldn't find any character to"
4118                             " replace above-Latin1 chars in locale %s with",
4119                             PL_collation_name));
4120                         goto bad;
4121                     }
4122 
4123                     DEBUG_L(PerlIO_printf(Perl_debug_log,
4124                             "_mem_collxfrm: highest 1-byte collating character"
4125                             " in locale %s is 0x%02X\n",
4126                             PL_collation_name,
4127                             PL_strxfrm_max_cp));
4128 
4129                     Safefree(cur_max_x);
4130                 }
4131 
4132                 /* Here we know which legal code point collates the highest.
4133                  * We are ready to construct the non-UTF-8 string.  The length
4134                  * will be at least 1 byte smaller than the input string
4135                  * (because we changed at least one 2-byte character into a
4136                  * single byte), but that is eaten up by the trailing NUL */
4137                 Newx(s, len, char);
4138 
4139                 {
4140                     STRLEN i;
4141                     STRLEN d= 0;
4142                     char * e = (char *) t + len;
4143 
4144                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
4145                         U8 cur_char = t[i];
4146                         if (UTF8_IS_INVARIANT(cur_char)) {
4147                             s[d++] = cur_char;
4148                         }
4149                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
4150                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
4151                         }
4152                         else {  /* Replace illegal cp with highest collating
4153                                    one */
4154                             s[d++] = PL_strxfrm_max_cp;
4155                         }
4156                     }
4157                     s[d++] = '\0';
4158                     Renew(s, d, char);   /* Free up unused space */
4159                 }
4160             }
4161         }
4162 
4163         /* Here, we have constructed a modified version of the input.  It could
4164          * be that we already had a modified copy before we did this version.
4165          * If so, that copy is no longer needed */
4166         if (t != input_string) {
4167             Safefree(t);
4168         }
4169     }
4170 
4171     length_in_chars = (utf8)
4172                       ? utf8_length((U8 *) s, (U8 *) s + len)
4173                       : len;
4174 
4175     /* The first element in the output is the collation id, used by
4176      * sv_collxfrm(); then comes the space for the transformed string.  The
4177      * equation should give us a good estimate as to how much is needed */
4178     xAlloc = COLLXFRM_HDR_LEN
4179            + PL_collxfrm_base
4180            + (PL_collxfrm_mult * length_in_chars);
4181     Newx(xbuf, xAlloc, char);
4182     if (UNLIKELY(! xbuf)) {
4183         DEBUG_L(PerlIO_printf(Perl_debug_log,
4184                       "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
4185         goto bad;
4186     }
4187 
4188     /* Store the collation id */
4189     *(U32*)xbuf = PL_collation_ix;
4190 
4191     /* Then the transformation of the input.  We loop until successful, or we
4192      * give up */
4193     for (;;) {
4194 
4195         *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
4196 
4197         /* If the transformed string occupies less space than we told strxfrm()
4198          * was available, it means it successfully transformed the whole
4199          * string. */
4200         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
4201 
4202             /* Some systems include a trailing NUL in the returned length.
4203              * Ignore it, using a loop in case multiple trailing NULs are
4204              * returned. */
4205             while (   (*xlen) > 0
4206                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
4207             {
4208                 (*xlen)--;
4209             }
4210 
4211             /* If the first try didn't get it, it means our prediction was low.
4212              * Modify the coefficients so that we predict a larger value in any
4213              * future transformations */
4214             if (! first_time) {
4215                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
4216                 STRLEN computed_guess = PL_collxfrm_base
4217                                       + (PL_collxfrm_mult * length_in_chars);
4218 
4219                 /* On zero-length input, just keep current slope instead of
4220                  * dividing by 0 */
4221                 const STRLEN new_m = (length_in_chars != 0)
4222                                      ? needed / length_in_chars
4223                                      : PL_collxfrm_mult;
4224 
4225                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4226                     "%s: %d: initial size of %zu bytes for a length "
4227                     "%zu string was insufficient, %zu needed\n",
4228                     __FILE__, __LINE__,
4229                     computed_guess, length_in_chars, needed));
4230 
4231                 /* If slope increased, use it, but discard this result for
4232                  * length 1 strings, as we can't be sure that it's a real slope
4233                  * change */
4234                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
4235 
4236 #  ifdef DEBUGGING
4237 
4238                     STRLEN old_m = PL_collxfrm_mult;
4239                     STRLEN old_b = PL_collxfrm_base;
4240 
4241 #  endif
4242 
4243                     PL_collxfrm_mult = new_m;
4244                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
4245                     computed_guess = PL_collxfrm_base
4246                                     + (PL_collxfrm_mult * length_in_chars);
4247                     if (computed_guess < needed) {
4248                         PL_collxfrm_base += needed - computed_guess;
4249                     }
4250 
4251                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4252                         "%s: %d: slope is now %zu; was %zu, base "
4253                         "is now %zu; was %zu\n",
4254                         __FILE__, __LINE__,
4255                         PL_collxfrm_mult, old_m,
4256                         PL_collxfrm_base, old_b));
4257                 }
4258                 else {  /* Slope didn't change, but 'b' did */
4259                     const STRLEN new_b = needed
4260                                         - computed_guess
4261                                         + PL_collxfrm_base;
4262                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4263                         "%s: %d: base is now %zu; was %zu\n",
4264                         __FILE__, __LINE__,
4265                         new_b, PL_collxfrm_base));
4266                     PL_collxfrm_base = new_b;
4267                 }
4268             }
4269 
4270             break;
4271         }
4272 
4273         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
4274             DEBUG_L(PerlIO_printf(Perl_debug_log,
4275                   "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
4276                   *xlen, PERL_INT_MAX));
4277             goto bad;
4278         }
4279 
4280         /* A well-behaved strxfrm() returns exactly how much space it needs
4281          * (usually not including the trailing NUL) when it fails due to not
4282          * enough space being provided.  Assume that this is the case unless
4283          * it's been proven otherwise */
4284         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
4285             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
4286         }
4287         else { /* Here, either:
4288                 *  1)  The strxfrm() has previously shown bad behavior; or
4289                 *  2)  It isn't the first time through the loop, which means
4290                 *      that the strxfrm() is now showing bad behavior, because
4291                 *      we gave it what it said was needed in the previous
4292                 *      iteration, and it came back saying it needed still more.
4293                 *      (Many versions of cygwin fit this.  When the buffer size
4294                 *      isn't sufficient, they return the input size instead of
4295                 *      how much is needed.)
4296                 * Increase the buffer size by a fixed percentage and try again.
4297                 * */
4298             xAlloc += (xAlloc / 4) + 1;
4299             PL_strxfrm_is_behaved = FALSE;
4300 
4301 #  ifdef DEBUGGING
4302 
4303             if (DEBUG_Lv_TEST || debug_initialization) {
4304                 PerlIO_printf(Perl_debug_log,
4305                 "_mem_collxfrm required more space than previously calculated"
4306                 " for locale %s, trying again with new guess=%zu+%zu\n",
4307                 PL_collation_name,  COLLXFRM_HDR_LEN,
4308                 xAlloc - COLLXFRM_HDR_LEN);
4309             }
4310 
4311 #  endif
4312 
4313         }
4314 
4315         Renew(xbuf, xAlloc, char);
4316         if (UNLIKELY(! xbuf)) {
4317             DEBUG_L(PerlIO_printf(Perl_debug_log,
4318                       "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
4319             goto bad;
4320         }
4321 
4322         first_time = FALSE;
4323     }
4324 
4325 
4326 #  ifdef DEBUGGING
4327 
4328     if (DEBUG_Lv_TEST || debug_initialization) {
4329 
4330         print_collxfrm_input_and_return(s, s + len, xlen, utf8);
4331         PerlIO_printf(Perl_debug_log, "Its xfrm is:");
4332         PerlIO_printf(Perl_debug_log, "%s\n",
4333                       _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
4334                        *xlen, 1));
4335     }
4336 
4337 #  endif
4338 
4339     /* Free up unneeded space; retain ehough for trailing NUL */
4340     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
4341 
4342     if (s != input_string) {
4343         Safefree(s);
4344     }
4345 
4346     return xbuf;
4347 
4348   bad:
4349 
4350 #  ifdef DEBUGGING
4351 
4352     if (DEBUG_Lv_TEST || debug_initialization) {
4353         print_collxfrm_input_and_return(s, s + len, NULL, utf8);
4354     }
4355 
4356 #  endif
4357 
4358     Safefree(xbuf);
4359     if (s != input_string) {
4360         Safefree(s);
4361     }
4362     *xlen = 0;
4363 
4364     return NULL;
4365 }
4366 
4367 #  ifdef DEBUGGING
4368 
4369 STATIC void
4370 S_print_collxfrm_input_and_return(pTHX_
4371                                   const char * const s,
4372                                   const char * const e,
4373                                   const STRLEN * const xlen,
4374                                   const bool is_utf8)
4375 {
4376 
4377     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
4378 
4379     PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
4380                                                         (UV)PL_collation_ix);
4381     if (xlen) {
4382         PerlIO_printf(Perl_debug_log, "%zu", *xlen);
4383     }
4384     else {
4385         PerlIO_printf(Perl_debug_log, "NULL");
4386     }
4387     PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
4388                                                             PL_collation_name);
4389     print_bytes_for_locale(s, e, is_utf8);
4390 
4391     PerlIO_printf(Perl_debug_log, "'\n");
4392 }
4393 
4394 #  endif    /* DEBUGGING */
4395 #endif /* USE_LOCALE_COLLATE */
4396 #ifdef USE_LOCALE
4397 #  ifdef DEBUGGING
4398 
4399 STATIC void
4400 S_print_bytes_for_locale(pTHX_
4401                     const char * const s,
4402                     const char * const e,
4403                     const bool is_utf8)
4404 {
4405     const char * t = s;
4406     bool prev_was_printable = TRUE;
4407     bool first_time = TRUE;
4408 
4409     PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
4410 
4411     while (t < e) {
4412         UV cp = (is_utf8)
4413                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
4414                 : * (U8 *) t;
4415         if (isPRINT(cp)) {
4416             if (! prev_was_printable) {
4417                 PerlIO_printf(Perl_debug_log, " ");
4418             }
4419             PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
4420             prev_was_printable = TRUE;
4421         }
4422         else {
4423             if (! first_time) {
4424                 PerlIO_printf(Perl_debug_log, " ");
4425             }
4426             PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
4427             prev_was_printable = FALSE;
4428         }
4429         t += (is_utf8) ? UTF8SKIP(t) : 1;
4430         first_time = FALSE;
4431     }
4432 }
4433 
4434 #  endif   /* #ifdef DEBUGGING */
4435 
4436 STATIC const char *
4437 S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale)
4438 {
4439     /* Changes the locale for LC_'switch_category" to that of
4440      * LC_'template_category', if they aren't already the same.  If not NULL,
4441      * 'template_locale' is the locale that 'template_category' is in.
4442      *
4443      * Returns a copy of the name of the original locale for 'switch_category'
4444      * so can be switched back to with the companion function
4445      * restore_switched_locale(),  (NULL if no restoral is necessary.) */
4446 
4447     char * restore_to_locale = NULL;
4448 
4449     if (switch_category == template_category) { /* No changes needed */
4450         return NULL;
4451     }
4452 
4453     /* Find the original locale of the category we may need to change, so that
4454      * it can be restored to later */
4455     restore_to_locale = stdize_locale(savepv(do_setlocale_r(switch_category,
4456                                                             NULL)));
4457     if (! restore_to_locale) {
4458         Perl_croak(aTHX_
4459              "panic: %s: %d: Could not find current %s locale, errno=%d\n",
4460                 __FILE__, __LINE__, category_name(switch_category), errno);
4461     }
4462 
4463     /* If the locale of the template category wasn't passed in, find it now */
4464     if (template_locale == NULL) {
4465         template_locale = do_setlocale_r(template_category, NULL);
4466         if (! template_locale) {
4467             Perl_croak(aTHX_
4468              "panic: %s: %d: Could not find current %s locale, errno=%d\n",
4469                    __FILE__, __LINE__, category_name(template_category), errno);
4470         }
4471     }
4472 
4473     /* It the locales are the same, there's nothing to do */
4474     if (strEQ(restore_to_locale, template_locale)) {
4475         Safefree(restore_to_locale);
4476 
4477         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
4478                             category_name(switch_category), template_locale));
4479 
4480         return NULL;
4481     }
4482 
4483     /* Finally, change the locale to the template one */
4484     if (! do_setlocale_r(switch_category, template_locale)) {
4485         Perl_croak(aTHX_
4486          "panic: %s: %d: Could not change %s locale to %s, errno=%d\n",
4487                             __FILE__, __LINE__, category_name(switch_category),
4488                                                        template_locale, errno);
4489     }
4490 
4491     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
4492                             category_name(switch_category), template_locale));
4493 
4494     return restore_to_locale;
4495 }
4496 
4497 STATIC void
4498 S_restore_switched_locale(pTHX_ const int category, const char * const original_locale)
4499 {
4500     /* Restores the locale for LC_'category' to 'original_locale' (which is a
4501      * copy that will be freed by this function), or do nothing if the latter
4502      * parameter is NULL */
4503 
4504     if (original_locale == NULL) {
4505         return;
4506     }
4507 
4508     if (! do_setlocale_r(category, original_locale)) {
4509         Perl_croak(aTHX_
4510              "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n",
4511                  __FILE__, __LINE__,
4512                              category_name(category), original_locale, errno);
4513     }
4514 
4515     Safefree(original_locale);
4516 }
4517 
4518 /* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */
4519 #define CUR_LC_BUFFER_SIZE  64
4520 
4521 bool
4522 Perl__is_cur_LC_category_utf8(pTHX_ int category)
4523 {
4524     /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
4525      * otherwise. 'category' may not be LC_ALL.  If the platform doesn't have
4526      * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
4527      * could give the wrong result.  The result will very likely be correct for
4528      * languages that have commonly used non-ASCII characters, but for notably
4529      * English, it comes down to if the locale's name ends in something like
4530      * "UTF-8".  It errs on the side of not being a UTF-8 locale.
4531      *
4532      * If the platform is early C89, not containing mbtowc(), or we are
4533      * compiled to not pay attention to LC_CTYPE, this employs heuristics.
4534      * These work very well for non-Latin locales or those whose currency
4535      * symbol isn't a '$' nor plain ASCII text.  But without LC_CTYPE and at
4536      * least MB_CUR_MAX, English locales with an ASCII currency symbol depend
4537      * on the name containing UTF-8 or not. */
4538 
4539     /* Name of current locale corresponding to the input category */
4540     const char *save_input_locale = NULL;
4541 
4542     bool is_utf8 = FALSE;                /* The return value */
4543 
4544     /* The variables below are for the cache of previous lookups using this
4545      * function.  The cache is a C string, described at the definition for
4546      * 'C_and_POSIX_utf8ness'.
4547      *
4548      * The first part of the cache is fixed, for the C and POSIX locales.  The
4549      * varying part starts just after them. */
4550     char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
4551 
4552     Size_t utf8ness_cache_size; /* Size of the varying portion */
4553     Size_t input_name_len;      /* Length in bytes of save_input_locale */
4554     Size_t input_name_len_with_overhead;    /* plus extra chars used to store
4555                                                the name in the cache */
4556     char * delimited;           /* The name plus the delimiters used to store
4557                                    it in the cache */
4558     char buffer[CUR_LC_BUFFER_SIZE];        /* small buffer */
4559     char * name_pos;            /* position of 'delimited' in the cache, or 0
4560                                    if not there */
4561 
4562 
4563 #  ifdef LC_ALL
4564 
4565     assert(category != LC_ALL);
4566 
4567 #  endif
4568 
4569     /* Get the desired category's locale */
4570     save_input_locale = stdize_locale(savepv(do_setlocale_r(category, NULL)));
4571     if (! save_input_locale) {
4572         Perl_croak(aTHX_
4573              "panic: %s: %d: Could not find current %s locale, errno=%d\n",
4574                      __FILE__, __LINE__, category_name(category), errno);
4575     }
4576 
4577     DEBUG_L(PerlIO_printf(Perl_debug_log,
4578                           "Current locale for %s is %s\n",
4579                           category_name(category), save_input_locale));
4580 
4581     input_name_len = strlen(save_input_locale);
4582 
4583     /* In our cache, each name is accompanied by two delimiters and a single
4584      * utf8ness digit */
4585     input_name_len_with_overhead = input_name_len + 3;
4586 
4587     if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) {
4588         /* we can use the buffer, avoid a malloc */
4589         delimited = buffer;
4590     } else { /* need a malloc */
4591         /* Allocate and populate space for a copy of the name surrounded by the
4592          * delimiters */
4593         Newx(delimited, input_name_len_with_overhead, char);
4594     }
4595 
4596     delimited[0] = UTF8NESS_SEP[0];
4597     Copy(save_input_locale, delimited + 1, input_name_len, char);
4598     delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
4599     delimited[input_name_len+2] = '\0';
4600 
4601     /* And see if that is in the cache */
4602     name_pos = instr(PL_locale_utf8ness, delimited);
4603     if (name_pos) {
4604         is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
4605 
4606 #  ifdef DEBUGGING
4607 
4608         if (DEBUG_Lv_TEST || debug_initialization) {
4609             PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n",
4610                                           save_input_locale, is_utf8);
4611         }
4612 
4613 #  endif
4614 
4615         /* And, if not already in that position, move it to the beginning of
4616          * the non-constant portion of the list, since it is the most recently
4617          * used.  (We don't have to worry about overflow, since just moving
4618          * existing names around) */
4619         if (name_pos > utf8ness_cache) {
4620             Move(utf8ness_cache,
4621                  utf8ness_cache + input_name_len_with_overhead,
4622                  name_pos - utf8ness_cache, char);
4623             Copy(delimited,
4624                  utf8ness_cache,
4625                  input_name_len_with_overhead - 1, char);
4626             utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
4627         }
4628 
4629         /* free only when not using the buffer */
4630         if ( delimited != buffer ) Safefree(delimited);
4631         Safefree(save_input_locale);
4632         return is_utf8;
4633     }
4634 
4635     /* Here we don't have stored the utf8ness for the input locale.  We have to
4636      * calculate it */
4637 
4638 #  if        defined(USE_LOCALE_CTYPE)                                  \
4639      && (    defined(HAS_NL_LANGINFO)                                   \
4640          || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
4641 
4642     {
4643         const char *original_ctype_locale
4644                         = switch_category_locale_to_template(LC_CTYPE,
4645                                                              category,
4646                                                              save_input_locale);
4647 
4648         /* Here the current LC_CTYPE is set to the locale of the category whose
4649          * information is desired.  This means that nl_langinfo() and mbtowc()
4650          * should give the correct results */
4651 
4652 #    ifdef MB_CUR_MAX  /* But we can potentially rule out UTF-8ness, avoiding
4653                           calling the functions if we have this */
4654 
4655             /* Standard UTF-8 needs at least 4 bytes to represent the maximum
4656              * Unicode code point. */
4657 
4658             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s: %d: MB_CUR_MAX=%d\n",
4659                                        __FILE__, __LINE__, (int) MB_CUR_MAX));
4660             if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
4661                 is_utf8 = FALSE;
4662                 restore_switched_locale(LC_CTYPE, original_ctype_locale);
4663                 goto finish_and_return;
4664             }
4665 
4666 #    endif
4667 #    if defined(HAS_NL_LANGINFO)
4668 
4669         { /* The task is easiest if the platform has this POSIX 2001 function.
4670              Except on some platforms it can wrongly return "", so have to have
4671              a fallback.  And it can return that it's UTF-8, even if there are
4672              variances from that.  For example, Turkish locales may use the
4673              alternate dotted I rules, and sometimes it appears to be a
4674              defective locale definition.  XXX We should probably check for
4675              these in the Latin1 range and warn (but on glibc, requires
4676              iswalnum() etc. due to their not handling 80-FF correctly */
4677             const char *codeset = my_nl_langinfo(CODESET, FALSE);
4678                                           /* FALSE => already in dest locale */
4679 
4680             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4681                             "\tnllanginfo returned CODESET '%s'\n", codeset));
4682 
4683             if (codeset && strNE(codeset, "")) {
4684 
4685                               /* If the implementation of foldEQ() somehow were
4686                                * to change to not go byte-by-byte, this could
4687                                * read past end of string, as only one length is
4688                                * checked.  But currently, a premature NUL will
4689                                * compare false, and it will stop there */
4690                 is_utf8 = cBOOL(   foldEQ(codeset, STR_WITH_LEN("UTF-8"))
4691                                 || foldEQ(codeset, STR_WITH_LEN("UTF8")));
4692 
4693                 DEBUG_L(PerlIO_printf(Perl_debug_log,
4694                        "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
4695                                                      codeset,         is_utf8));
4696                 restore_switched_locale(LC_CTYPE, original_ctype_locale);
4697                 goto finish_and_return;
4698             }
4699         }
4700 
4701 #    endif
4702 #    if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4703      /* We can see if this is a UTF-8-like locale if have mbtowc().  It was a
4704       * late adder to C89, so very likely to have it.  However, testing has
4705       * shown that, like nl_langinfo() above, there are locales that are not
4706       * strictly UTF-8 that this will return that they are */
4707 
4708         {
4709             wchar_t wc;
4710             int len;
4711             dSAVEDERRNO;
4712 
4713 #      if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
4714 
4715             mbstate_t ps;
4716 
4717 #      endif
4718 
4719             /* mbrtowc() and mbtowc() convert a byte string to a wide
4720              * character.  Feed a byte string to one of them and check that the
4721              * result is the expected Unicode code point */
4722 
4723 #      if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
4724             /* Prefer this function if available, as it's reentrant */
4725 
4726             memzero(&ps, sizeof(ps));;
4727             PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift
4728                                                                state */
4729             SETERRNO(0, 0);
4730             len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps);
4731             SAVE_ERRNO;
4732 
4733 #      else
4734 
4735             MBTOWC_LOCK;
4736             PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
4737             SETERRNO(0, 0);
4738             len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4739             SAVE_ERRNO;
4740             MBTOWC_UNLOCK;
4741 
4742 #      endif
4743 
4744             RESTORE_ERRNO;
4745             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4746                     "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
4747                                    len,      (unsigned int) wc, GET_ERRNO));
4748 
4749             is_utf8 = cBOOL(   len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
4750                             && wc == (wchar_t) UNICODE_REPLACEMENT);
4751         }
4752 
4753 #    endif
4754 
4755         restore_switched_locale(LC_CTYPE, original_ctype_locale);
4756         goto finish_and_return;
4757     }
4758 
4759 #  else
4760 
4761         /* Here, we must have a C89 compiler that doesn't have mbtowc().  Next
4762          * try looking at the currency symbol to see if it disambiguates
4763          * things.  Often that will be in the native script, and if the symbol
4764          * isn't in UTF-8, we know that the locale isn't.  If it is non-ASCII
4765          * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
4766          * string being valid UTF-8 are quite small */
4767 
4768 #    ifdef USE_LOCALE_MONETARY
4769 
4770         /* If have LC_MONETARY, we can look at the currency symbol.  Often that
4771          * will be in the native script.  We do this one first because there is
4772          * just one string to examine, so potentially avoids work */
4773 
4774         {
4775             const char *original_monetary_locale
4776                         = switch_category_locale_to_template(LC_MONETARY,
4777                                                              category,
4778                                                              save_input_locale);
4779             bool only_ascii = FALSE;
4780             const U8 * currency_string
4781                             = (const U8 *) my_nl_langinfo(CRNCYSTR, FALSE);
4782                                       /* 2nd param not relevant for this item */
4783             const U8 * first_variant;
4784 
4785             assert(   *currency_string == '-'
4786                    || *currency_string == '+'
4787                    || *currency_string == '.');
4788 
4789             currency_string++;
4790 
4791             if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
4792             {
4793                 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
4794                 only_ascii = TRUE;
4795             }
4796             else {
4797                 is_utf8 = is_strict_utf8_string(first_variant, 0);
4798             }
4799 
4800             restore_switched_locale(LC_MONETARY, original_monetary_locale);
4801 
4802             if (! only_ascii) {
4803 
4804                 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
4805                  * otherwise assume the locale is UTF-8 if and only if the symbol
4806                  * is non-ascii UTF-8. */
4807                 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
4808                                         save_input_locale, is_utf8));
4809                 goto finish_and_return;
4810             }
4811         }
4812 
4813 #    endif /* USE_LOCALE_MONETARY */
4814 #    if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
4815 
4816     /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not.  Try
4817      * the names of the months and weekdays, timezone, and am/pm indicator */
4818         {
4819             const char *original_time_locale
4820                             = switch_category_locale_to_template(LC_TIME,
4821                                                                  category,
4822                                                                  save_input_locale);
4823             int hour = 10;
4824             bool is_dst = FALSE;
4825             int dom = 1;
4826             int month = 0;
4827             int i;
4828             char * formatted_time;
4829 
4830             /* Here the current LC_TIME is set to the locale of the category
4831              * whose information is desired.  Look at all the days of the week and
4832              * month names, and the timezone and am/pm indicator for UTF-8 variant
4833              * characters.  The first such a one found will tell us if the locale
4834              * is UTF-8 or not */
4835 
4836             for (i = 0; i < 7 + 12; i++) {  /* 7 days; 12 months */
4837                 formatted_time = my_strftime("%A %B %Z %p",
4838                                 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
4839                 if ( ! formatted_time
4840                     || is_utf8_invariant_string((U8 *) formatted_time, 0))
4841                 {
4842 
4843                     /* Here, we didn't find a non-ASCII.  Try the next time through
4844                      * with the complemented dst and am/pm, and try with the next
4845                      * weekday.  After we have gotten all weekdays, try the next
4846                      * month */
4847                     is_dst = ! is_dst;
4848                     hour = (hour + 12) % 24;
4849                     dom++;
4850                     if (i > 6) {
4851                         month++;
4852                     }
4853                     continue;
4854                 }
4855 
4856                 /* Here, we have a non-ASCII.  Return TRUE is it is valid UTF8;
4857                  * false otherwise.  But first, restore LC_TIME to its original
4858                  * locale if we changed it */
4859                 restore_switched_locale(LC_TIME, original_time_locale);
4860 
4861                 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
4862                                     save_input_locale,
4863                                     is_utf8_string((U8 *) formatted_time, 0)));
4864                 is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
4865                 goto finish_and_return;
4866             }
4867 
4868             /* Falling off the end of the loop indicates all the names were just
4869              * ASCII.  Go on to the next test.  If we changed it, restore LC_TIME
4870              * to its original locale */
4871             restore_switched_locale(LC_TIME, original_time_locale);
4872             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
4873         }
4874 
4875 #    endif
4876 
4877 #    if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
4878 
4879     /* This code is ifdefd out because it was found to not be necessary in testing
4880      * on our dromedary test machine, which has over 700 locales.  There, this
4881      * added no value to looking at the currency symbol and the time strings.  I
4882      * left it in so as to avoid rewriting it if real-world experience indicates
4883      * that dromedary is an outlier.  Essentially, instead of returning abpve if we
4884      * haven't found illegal utf8, we continue on and examine all the strerror()
4885      * messages on the platform for utf8ness.  If all are ASCII, we still don't
4886      * know the answer; but otherwise we have a pretty good indication of the
4887      * utf8ness.  The reason this doesn't help much is that the messages may not
4888      * have been translated into the locale.  The currency symbol and time strings
4889      * are much more likely to have been translated.  */
4890         {
4891             int e;
4892             bool non_ascii = FALSE;
4893             const char *original_messages_locale
4894                             = switch_category_locale_to_template(LC_MESSAGES,
4895                                                                  category,
4896                                                                  save_input_locale);
4897             const char * errmsg = NULL;
4898 
4899             /* Here the current LC_MESSAGES is set to the locale of the category
4900              * whose information is desired.  Look through all the messages.  We
4901              * can't use Strerror() here because it may expand to code that
4902              * segfaults in miniperl */
4903 
4904             for (e = 0; e <= sys_nerr; e++) {
4905                 errno = 0;
4906                 errmsg = sys_errlist[e];
4907                 if (errno || !errmsg) {
4908                     break;
4909                 }
4910                 errmsg = savepv(errmsg);
4911                 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
4912                     non_ascii = TRUE;
4913                     is_utf8 = is_utf8_string((U8 *) errmsg, 0);
4914                     break;
4915                 }
4916             }
4917             Safefree(errmsg);
4918 
4919             restore_switched_locale(LC_MESSAGES, original_messages_locale);
4920 
4921             if (non_ascii) {
4922 
4923                 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
4924                  * any non-ascii means it is one; otherwise we assume it isn't */
4925                 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
4926                                     save_input_locale,
4927                                     is_utf8));
4928                 goto finish_and_return;
4929             }
4930 
4931             DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
4932         }
4933 
4934 #    endif
4935 #    ifndef EBCDIC  /* On os390, even if the name ends with "UTF-8', it isn't a
4936                    UTF-8 locale */
4937 
4938     /* As a last resort, look at the locale name to see if it matches
4939      * qr/UTF -?  * 8 /ix, or some other common locale names.  This "name", the
4940      * return of setlocale(), is actually defined to be opaque, so we can't
4941      * really rely on the absence of various substrings in the name to indicate
4942      * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
4943      * be a UTF-8 locale.  Similarly for the other common names */
4944 
4945     {
4946         const Size_t final_pos = strlen(save_input_locale) - 1;
4947 
4948         if (final_pos >= 3) {
4949             const char *name = save_input_locale;
4950 
4951             /* Find next 'U' or 'u' and look from there */
4952             while ((name += strcspn(name, "Uu") + 1)
4953                                         <= save_input_locale + final_pos - 2)
4954             {
4955                 if (   isALPHA_FOLD_NE(*name, 't')
4956                     || isALPHA_FOLD_NE(*(name + 1), 'f'))
4957                 {
4958                     continue;
4959                 }
4960                 name += 2;
4961                 if (*(name) == '-') {
4962                     if ((name > save_input_locale + final_pos - 1)) {
4963                         break;
4964                     }
4965                     name++;
4966                 }
4967                 if (*(name) == '8') {
4968                     DEBUG_L(PerlIO_printf(Perl_debug_log,
4969                                         "Locale %s ends with UTF-8 in name\n",
4970                                         save_input_locale));
4971                     is_utf8 = TRUE;
4972                     goto finish_and_return;
4973                 }
4974             }
4975             DEBUG_L(PerlIO_printf(Perl_debug_log,
4976                                 "Locale %s doesn't end with UTF-8 in name\n",
4977                                     save_input_locale));
4978         }
4979 
4980 #      ifdef WIN32
4981 
4982         /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
4983         if (memENDs(save_input_locale, final_pos, "65001")) {
4984             DEBUG_L(PerlIO_printf(Perl_debug_log,
4985                         "Locale %s ends with 65001 in name, is UTF-8 locale\n",
4986                         save_input_locale));
4987             is_utf8 = TRUE;
4988             goto finish_and_return;
4989         }
4990 
4991 #      endif
4992     }
4993 #    endif
4994 
4995     /* Other common encodings are the ISO 8859 series, which aren't UTF-8.  But
4996      * since we are about to return FALSE anyway, there is no point in doing
4997      * this extra work */
4998 
4999 #    if 0
5000     if (instr(save_input_locale, "8859")) {
5001         DEBUG_L(PerlIO_printf(Perl_debug_log,
5002                              "Locale %s has 8859 in name, not UTF-8 locale\n",
5003                              save_input_locale));
5004         is_utf8 = FALSE;
5005         goto finish_and_return;
5006     }
5007 #    endif
5008 
5009     DEBUG_L(PerlIO_printf(Perl_debug_log,
5010                           "Assuming locale %s is not a UTF-8 locale\n",
5011                                     save_input_locale));
5012     is_utf8 = FALSE;
5013 
5014 #  endif /* the code that is compiled when no modern LC_CTYPE */
5015 
5016   finish_and_return:
5017 
5018     /* Cache this result so we don't have to go through all this next time. */
5019     utf8ness_cache_size = sizeof(PL_locale_utf8ness)
5020                        - (utf8ness_cache - PL_locale_utf8ness);
5021 
5022     /* But we can't save it if it is too large for the total space available */
5023     if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
5024         Size_t utf8ness_cache_len = strlen(utf8ness_cache);
5025 
5026         /* Here it can fit, but we may need to clear out the oldest cached
5027          * result(s) to do so.  Check */
5028         if (utf8ness_cache_len + input_name_len_with_overhead
5029                                                         >= utf8ness_cache_size)
5030         {
5031             /* Here we have to clear something out to make room for this.
5032              * Start looking at the rightmost place where it could fit and find
5033              * the beginning of the entry that extends past that. */
5034             char * cutoff = (char *) my_memrchr(utf8ness_cache,
5035                                                 UTF8NESS_SEP[0],
5036                                                 utf8ness_cache_size
5037                                               - input_name_len_with_overhead);
5038 
5039             assert(cutoff);
5040             assert(cutoff >= utf8ness_cache);
5041 
5042             /* This and all subsequent entries must be removed */
5043             *cutoff = '\0';
5044             utf8ness_cache_len = strlen(utf8ness_cache);
5045         }
5046 
5047         /* Make space for the new entry */
5048         Move(utf8ness_cache,
5049              utf8ness_cache + input_name_len_with_overhead,
5050              utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
5051 
5052         /* And insert it */
5053         Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
5054         utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
5055 
5056         if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') {
5057             Perl_croak(aTHX_
5058              "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu,"
5059              " inserted_name=%s, its_len=%zu\n",
5060                 __FILE__, __LINE__,
5061                 PL_locale_utf8ness, strlen(PL_locale_utf8ness),
5062                 delimited, input_name_len_with_overhead);
5063         }
5064     }
5065 
5066 #  ifdef DEBUGGING
5067 
5068     if (DEBUG_Lv_TEST) {
5069         const char * s = PL_locale_utf8ness;
5070 
5071         /* Audit the structure */
5072         while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) {
5073             const char *e;
5074 
5075             if (*s != UTF8NESS_SEP[0]) {
5076                 Perl_croak(aTHX_
5077                            "panic: %s: %d: Corrupt utf8ness_cache: missing"
5078                            " separator %.*s<-- HERE %s\n",
5079                            __FILE__, __LINE__,
5080                            (int) (s - PL_locale_utf8ness), PL_locale_utf8ness,
5081                            s);
5082             }
5083             s++;
5084             e = strchr(s, UTF8NESS_PREFIX[0]);
5085             if (! e) {
5086                 e = PL_locale_utf8ness + strlen(PL_locale_utf8ness);
5087                 Perl_croak(aTHX_
5088                            "panic: %s: %d: Corrupt utf8ness_cache: missing"
5089                            " separator %.*s<-- HERE %s\n",
5090                            __FILE__, __LINE__,
5091                            (int) (e - PL_locale_utf8ness), PL_locale_utf8ness,
5092                            e);
5093             }
5094             e++;
5095             if (*e != '0' && *e != '1') {
5096                 Perl_croak(aTHX_
5097                            "panic: %s: %d: Corrupt utf8ness_cache: utf8ness"
5098                            " must be [01] %.*s<-- HERE %s\n",
5099                            __FILE__, __LINE__,
5100                            (int) (e + 1 - PL_locale_utf8ness),
5101                            PL_locale_utf8ness, e + 1);
5102             }
5103             if (ninstr(PL_locale_utf8ness, s, s-1, e)) {
5104                 Perl_croak(aTHX_
5105                            "panic: %s: %d: Corrupt utf8ness_cache: entry"
5106                            " has duplicate %.*s<-- HERE %s\n",
5107                            __FILE__, __LINE__,
5108                            (int) (e - PL_locale_utf8ness), PL_locale_utf8ness,
5109                            e);
5110             }
5111             s = e + 1;
5112         }
5113     }
5114 
5115     if (DEBUG_Lv_TEST || debug_initialization) {
5116 
5117         PerlIO_printf(Perl_debug_log,
5118                 "PL_locale_utf8ness is now %s; returning %d\n",
5119                                      PL_locale_utf8ness, is_utf8);
5120     }
5121 
5122 #  endif
5123 
5124     /* free only when not using the buffer */
5125     if ( delimited != buffer ) Safefree(delimited);
5126     Safefree(save_input_locale);
5127     return is_utf8;
5128 }
5129 
5130 #endif
5131 
5132 bool
5133 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
5134 {
5135     /* Internal function which returns if we are in the scope of a pragma that
5136      * enables the locale category 'category'.  'compiling' should indicate if
5137      * this is during the compilation phase (TRUE) or not (FALSE). */
5138 
5139     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
5140 
5141     SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
5142     if (! these_categories || these_categories == &PL_sv_placeholder) {
5143         return FALSE;
5144     }
5145 
5146     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
5147      * a valid unsigned */
5148     assert(category >= -1);
5149     return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
5150 }
5151 
5152 char *
5153 Perl_my_strerror(pTHX_ const int errnum)
5154 {
5155     /* Returns a mortalized copy of the text of the error message associated
5156      * with 'errnum'.  It uses the current locale's text unless the platform
5157      * doesn't have the LC_MESSAGES category or we are not being called from
5158      * within the scope of 'use locale'.  In the former case, it uses whatever
5159      * strerror returns; in the latter case it uses the text from the C locale.
5160      *
5161      * The function just calls strerror(), but temporarily switches, if needed,
5162      * to the C locale */
5163 
5164     char *errstr;
5165 
5166 #ifndef USE_LOCALE_MESSAGES
5167 
5168     /* If platform doesn't have messages category, we don't do any switching to
5169      * the C locale; we just use whatever strerror() returns */
5170 
5171     errstr = savepv(Strerror(errnum));
5172 
5173 #else   /* Has locale messages */
5174 
5175     const bool within_locale_scope = IN_LC(LC_MESSAGES);
5176 
5177 #  ifndef USE_ITHREADS
5178 
5179     /* This function is trivial without threads. */
5180     if (within_locale_scope) {
5181         errstr = savepv(strerror(errnum));
5182     }
5183     else {
5184         const char * save_locale = savepv(do_setlocale_c(LC_MESSAGES, NULL));
5185 
5186         do_setlocale_c(LC_MESSAGES, "C");
5187         errstr = savepv(strerror(errnum));
5188         do_setlocale_c(LC_MESSAGES, save_locale);
5189         Safefree(save_locale);
5190     }
5191 
5192 #  elif   defined(USE_POSIX_2008_LOCALE)                      \
5193      &&   defined(HAS_STRERROR_L)
5194 
5195     /* This function is also trivial if we don't have to worry about thread
5196      * safety and have strerror_l(), as it handles the switch of locales so we
5197      * don't have to deal with that.  We don't have to worry about thread
5198      * safety if strerror_r() is also available.  Both it and strerror_l() are
5199      * thread-safe.  Plain strerror() isn't thread safe.  But on threaded
5200      * builds when strerror_r() is available, the apparent call to strerror()
5201      * below is actually a macro that behind-the-scenes calls strerror_r(). */
5202 
5203 #    ifdef HAS_STRERROR_R
5204 
5205     if (within_locale_scope) {
5206         errstr = savepv(strerror(errnum));
5207     }
5208     else {
5209         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
5210     }
5211 
5212 #    else
5213 
5214     /* Here we have strerror_l(), but not strerror_r() and we are on a
5215      * threaded-build.  We use strerror_l() for everything, constructing a
5216      * locale to pass to it if necessary */
5217 
5218     bool do_free = FALSE;
5219     locale_t locale_to_use;
5220 
5221     if (within_locale_scope) {
5222         locale_to_use = uselocale((locale_t) 0);
5223         if (locale_to_use == LC_GLOBAL_LOCALE) {
5224             locale_to_use = duplocale(LC_GLOBAL_LOCALE);
5225             do_free = TRUE;
5226         }
5227     }
5228     else {  /* Use C locale if not within 'use locale' scope */
5229         locale_to_use = PL_C_locale_obj;
5230     }
5231 
5232     errstr = savepv(strerror_l(errnum, locale_to_use));
5233 
5234     if (do_free) {
5235         freelocale(locale_to_use);
5236     }
5237 
5238 #    endif
5239 #  else /* Doesn't have strerror_l() */
5240 
5241     const char * save_locale = NULL;
5242     bool locale_is_C = FALSE;
5243 
5244     /* We have a critical section to prevent another thread from executing this
5245      * same code at the same time.  (On thread-safe perls, the LOCK is a
5246      * no-op.)  Since this is the only place in core that changes LC_MESSAGES
5247      * (unless the user has called setlocale(), this works to prevent races. */
5248     SETLOCALE_LOCK;
5249 
5250     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5251                             "my_strerror called with errnum %d\n", errnum));
5252     if (! within_locale_scope) {
5253         save_locale = do_setlocale_c(LC_MESSAGES, NULL);
5254         if (! save_locale) {
5255             SETLOCALE_UNLOCK;
5256             Perl_croak(aTHX_
5257                  "panic: %s: %d: Could not find current LC_MESSAGES locale,"
5258                  " errno=%d\n", __FILE__, __LINE__, errno);
5259         }
5260         else {
5261             locale_is_C = isNAME_C_OR_POSIX(save_locale);
5262 
5263             /* Switch to the C locale if not already in it */
5264             if (! locale_is_C) {
5265 
5266                 /* The setlocale() just below likely will zap 'save_locale', so
5267                  * create a copy.  */
5268                 save_locale = savepv(save_locale);
5269                 if (! do_setlocale_c(LC_MESSAGES, "C")) {
5270 
5271                     /* If, for some reason, the locale change failed, we
5272                      * soldier on as best as possible under the circumstances,
5273                      * using the current locale, and clear save_locale, so we
5274                      * don't try to change back.  On z/0S, all setlocale()
5275                      * calls fail after you've created a thread.  This is their
5276                      * way of making sure the entire process is always a single
5277                      * locale.  This means that 'use locale' is always in place
5278                      * for messages under these circumstances. */
5279                     Safefree(save_locale);
5280                     save_locale = NULL;
5281                 }
5282             }
5283         }
5284     }   /* end of ! within_locale_scope */
5285     else {
5286         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
5287                                                __FILE__, __LINE__));
5288     }
5289 
5290     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5291              "Any locale change has been done; about to call Strerror\n"));
5292     errstr = savepv(Strerror(errnum));
5293 
5294     if (! within_locale_scope) {
5295         if (save_locale && ! locale_is_C) {
5296             if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
5297                 SETLOCALE_UNLOCK;
5298                 Perl_croak(aTHX_
5299                      "panic: %s: %d: setlocale restore to '%s' failed, errno=%d\n",
5300                              __FILE__, __LINE__, save_locale, errno);
5301             }
5302             Safefree(save_locale);
5303         }
5304     }
5305 
5306     SETLOCALE_UNLOCK;
5307 
5308 #  endif /* End of doesn't have strerror_l */
5309 #  ifdef DEBUGGING
5310 
5311     if (DEBUG_Lv_TEST) {
5312         PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
5313         print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
5314         PerlIO_printf(Perl_debug_log, "'\n");
5315     }
5316 
5317 #  endif
5318 #endif   /* End of does have locale messages */
5319 
5320     SAVEFREEPV(errstr);
5321     return errstr;
5322 }
5323 
5324 /*
5325 
5326 =for apidoc switch_to_global_locale
5327 
5328 On systems without locale support, or on typical single-threaded builds, or on
5329 platforms that do not support per-thread locale operations, this function does
5330 nothing.  On such systems that do have locale support, only a locale global to
5331 the whole program is available.
5332 
5333 On multi-threaded builds on systems that do have per-thread locale operations,
5334 this function converts the thread it is running in to use the global locale.
5335 This is for code that has not yet or cannot be updated to handle multi-threaded
5336 locale operation.  As long as only a single thread is so-converted, everything
5337 works fine, as all the other threads continue to ignore the global one, so only
5338 this thread looks at it.
5339 
5340 However, on Windows systems this isn't quite true prior to Visual Studio 15,
5341 at which point Microsoft fixed a bug.  A race can occur if you use the
5342 following operations on earlier Windows platforms:
5343 
5344 =over
5345 
5346 =item L<POSIX::localeconv|POSIX/localeconv>
5347 
5348 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
5349 
5350 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
5351 
5352 =back
5353 
5354 The first item is not fixable (except by upgrading to a later Visual Studio
5355 release), but it would be possible to work around the latter two items by using
5356 the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches
5357 welcome.
5358 
5359 Without this function call, threads that use the L<C<setlocale(3)>> system
5360 function will not work properly, as all the locale-sensitive functions will
5361 look at the per-thread locale, and C<setlocale> will have no effect on this
5362 thread.
5363 
5364 Perl code should convert to either call
5365 L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system
5366 C<setlocale>) or use the methods given in L<perlcall> to call
5367 L<C<POSIX::setlocale>|POSIX/setlocale>.  Either one will transparently properly
5368 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
5369 
5370 Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can
5371 continue to work if this function is called before transferring control to the
5372 library.
5373 
5374 Upon return from the code that needs to use the global locale,
5375 L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe
5376 multi-thread operation.
5377 
5378 =cut
5379 */
5380 
5381 void
5382 Perl_switch_to_global_locale()
5383 {
5384 
5385 #ifdef USE_THREAD_SAFE_LOCALE
5386 #  ifdef WIN32
5387 
5388     _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
5389 
5390 #  else
5391 #    ifdef HAS_QUERYLOCALE
5392 
5393     setlocale(LC_ALL, querylocale(LC_ALL_MASK, uselocale((locale_t) 0)));
5394 
5395 #    else
5396 
5397     {
5398         unsigned int i;
5399 
5400         for (i = 0; i < LC_ALL_INDEX; i++) {
5401             setlocale(categories[i], do_setlocale_r(categories[i], NULL));
5402         }
5403     }
5404 
5405 #    endif
5406 
5407     uselocale(LC_GLOBAL_LOCALE);
5408 
5409 #  endif
5410 #endif
5411 
5412 }
5413 
5414 /*
5415 
5416 =for apidoc sync_locale
5417 
5418 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
5419 change the locale (though changing the locale is antisocial and dangerous on
5420 multi-threaded systems that don't have multi-thread safe locale operations.
5421 (See L<perllocale/Multi-threaded operation>).  Using the system
5422 L<C<setlocale(3)>> should be avoided.  Nevertheless, certain non-Perl libraries
5423 called from XS, such as C<Gtk> do so, and this can't be changed.  When the
5424 locale is changed by XS code that didn't use
5425 L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the
5426 locale has changed.  Use this function to do so, before returning to Perl.
5427 
5428 The return value is a boolean: TRUE if the global locale at the time of call
5429 was in effect; and FALSE if a per-thread locale was in effect.  This can be
5430 used by the caller that needs to restore things as-they-were to decide whether
5431 or not to call
5432 L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>.
5433 
5434 =cut
5435 */
5436 
5437 bool
5438 Perl_sync_locale()
5439 {
5440 
5441 #ifndef USE_LOCALE
5442 
5443     return TRUE;
5444 
5445 #else
5446 
5447     const char * newlocale;
5448     dTHX;
5449 
5450 #  ifdef USE_POSIX_2008_LOCALE
5451 
5452     bool was_in_global_locale = FALSE;
5453     locale_t cur_obj = uselocale((locale_t) 0);
5454 
5455     /* On Windows, unless the foreign code has turned off the thread-safe
5456      * locale setting, any plain setlocale() will have affected what we see, so
5457      * no need to worry.  Otherwise, If the foreign code has done a plain
5458      * setlocale(), it will only affect the global locale on POSIX systems, but
5459      * will affect the */
5460     if (cur_obj == LC_GLOBAL_LOCALE) {
5461 
5462 #    ifdef HAS_QUERY_LOCALE
5463 
5464         do_setlocale_c(LC_ALL, setlocale(LC_ALL, NULL));
5465 
5466 #    else
5467 
5468         unsigned int i;
5469 
5470         /* We can't trust that we can read the LC_ALL format on the
5471          * platform, so do them individually */
5472         for (i = 0; i < LC_ALL_INDEX; i++) {
5473             do_setlocale_r(categories[i], setlocale(categories[i], NULL));
5474         }
5475 
5476 #    endif
5477 
5478         was_in_global_locale = TRUE;
5479     }
5480 
5481 #  else
5482 
5483     bool was_in_global_locale = TRUE;
5484 
5485 #  endif
5486 #  ifdef USE_LOCALE_CTYPE
5487 
5488     newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL));
5489     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5490         "%s:%d: %s\n", __FILE__, __LINE__,
5491         setlocale_debug_string(LC_CTYPE, NULL, newlocale)));
5492     new_ctype(newlocale);
5493     Safefree(newlocale);
5494 
5495 #  endif /* USE_LOCALE_CTYPE */
5496 #  ifdef USE_LOCALE_COLLATE
5497 
5498     newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL));
5499     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5500         "%s:%d: %s\n", __FILE__, __LINE__,
5501         setlocale_debug_string(LC_COLLATE, NULL, newlocale)));
5502     new_collate(newlocale);
5503     Safefree(newlocale);
5504 
5505 #  endif
5506 #  ifdef USE_LOCALE_NUMERIC
5507 
5508     newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL));
5509     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5510         "%s:%d: %s\n", __FILE__, __LINE__,
5511         setlocale_debug_string(LC_NUMERIC, NULL, newlocale)));
5512     new_numeric(newlocale);
5513     Safefree(newlocale);
5514 
5515 #  endif /* USE_LOCALE_NUMERIC */
5516 
5517     return was_in_global_locale;
5518 
5519 #endif
5520 
5521 }
5522 
5523 #if defined(DEBUGGING) && defined(USE_LOCALE)
5524 
5525 STATIC char *
5526 S_setlocale_debug_string(const int category,        /* category number,
5527                                                            like LC_ALL */
5528                             const char* const locale,   /* locale name */
5529 
5530                             /* return value from setlocale() when attempting to
5531                              * set 'category' to 'locale' */
5532                             const char* const retval)
5533 {
5534     /* Returns a pointer to a NUL-terminated string in static storage with
5535      * added text about the info passed in.  This is not thread safe and will
5536      * be overwritten by the next call, so this should be used just to
5537      * formulate a string to immediately print or savepv() on. */
5538 
5539     static char ret[256];
5540 
5541     my_strlcpy(ret, "setlocale(", sizeof(ret));
5542     my_strlcat(ret, category_name(category), sizeof(ret));
5543     my_strlcat(ret, ", ", sizeof(ret));
5544 
5545     if (locale) {
5546         my_strlcat(ret, "\"", sizeof(ret));
5547         my_strlcat(ret, locale, sizeof(ret));
5548         my_strlcat(ret, "\"", sizeof(ret));
5549     }
5550     else {
5551         my_strlcat(ret, "NULL", sizeof(ret));
5552     }
5553 
5554     my_strlcat(ret, ") returned ", sizeof(ret));
5555 
5556     if (retval) {
5557         my_strlcat(ret, "\"", sizeof(ret));
5558         my_strlcat(ret, retval, sizeof(ret));
5559         my_strlcat(ret, "\"", sizeof(ret));
5560     }
5561     else {
5562         my_strlcat(ret, "NULL", sizeof(ret));
5563     }
5564 
5565     assert(strlen(ret) < sizeof(ret));
5566 
5567     return ret;
5568 }
5569 
5570 #endif
5571 
5572 void
5573 Perl_thread_locale_init()
5574 {
5575     /* Called from a thread on startup*/
5576 
5577 #ifdef USE_THREAD_SAFE_LOCALE
5578 
5579     dTHX_DEBUGGING;
5580 
5581     /* C starts the new thread in the global C locale.  If we are thread-safe,
5582      * we want to not be in the global locale */
5583 
5584      DEBUG_L(PerlIO_printf(Perl_debug_log,
5585             "%s:%d: new thread, initial locale is %s; calling setlocale\n",
5586             __FILE__, __LINE__, setlocale(LC_ALL, NULL)));
5587 
5588 #  ifdef WIN32
5589 
5590     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
5591 
5592 #  else
5593 
5594     Perl_setlocale(LC_ALL, "C");
5595 
5596 #  endif
5597 #endif
5598 
5599 }
5600 
5601 void
5602 Perl_thread_locale_term()
5603 {
5604     /* Called from a thread as it gets ready to terminate */
5605 
5606 #ifdef USE_THREAD_SAFE_LOCALE
5607 
5608     /* C starts the new thread in the global C locale.  If we are thread-safe,
5609      * we want to not be in the global locale */
5610 
5611 #  ifndef WIN32
5612 
5613     {   /* Free up */
5614         locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE);
5615         if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) {
5616             freelocale(cur_obj);
5617         }
5618     }
5619 
5620 #  endif
5621 #endif
5622 
5623 }
5624 
5625 /*
5626  * ex: set ts=8 sts=4 sw=4 et:
5627  */
5628