xref: /openbsd/gnu/usr.bin/perl/locale.c (revision 3d61058a)
1 /*    locale.c
2  *
3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  *      A Elbereth Gilthoniel,
13  *      silivren penna míriel
14  *      o menel aglar elenath!
15  *      Na-chaered palan-díriel
16  *      o galadhremmin ennorath,
17  *      Fanuilos, le linnathon
18  *      nef aear, si nef aearon!
19  *
20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
21  */
22 
23 /* utility functions for handling locale-specific stuff like what
24  * character represents the decimal point.
25  *
26  * All C programs have an underlying locale.  Perl code generally doesn't pay
27  * any attention to it except within the scope of a 'use locale'.  For most
28  * categories, it accomplishes this by just using different operations if it is
29  * in such scope than if not.  However, various libc functions called by Perl
30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31  * are used to toggle between the current locale and the C locale depending on
32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
33  * switched to the C locale for outputting the message unless within the scope
34  * of 'use locale'.
35  *
36  * There is more than the typical amount of variation between platforms with
37  * regard to locale handling.  At the end of these introductory comments, are
38  * listed various relevent Configuration options, including some that can be
39  * used to pretend to some extent that this is being developed on a different
40  * platform than it actually is.  This allows you to make changes and catch
41  * some errors without having access to those other platforms.
42  *
43  * This code now has multi-thread-safe locale handling on systems that support
44  * that.  This is completely transparent to most XS code.  On earlier systems,
45  * it would be possible to emulate thread-safe locales, but this likely would
46  * involve a lot of locale switching, and would require XS code changes.
47  * Macros could be written so that the code wouldn't have to know which type of
48  * system is being used.
49  *
50  * Table-driven code is used for simplicity and clarity, as many operations
51  * differ only in which category is being worked on.  However the system
52  * categories need not be small contiguous integers, so do not lend themselves
53  * to table lookup.  Instead we have created our own equivalent values which
54  * are all small contiguous non-negative integers, and translation functions
55  * between the two sets.  For category 'LC_foo', the name of our index is
56  * LC_foo_INDEX_.  Various parallel tables, indexed by these, are used for the
57  * translation.  The tables are generated at compile-time based on platform
58  * characteristics and Configure options.  They hide from the code many of the
59  * vagaries of the different locale implementations out there.
60  *
61  * On unthreaded perls, most operations expand out to just the basic
62  * setlocale() calls.  That sort of is true on threaded perls on modern Windows
63  * systems where the same API, after set up, is used for thread-safe locale
64  * handling.  (But there are complications on Windows due to internal character
65  * set issues.)  On other systems, there is a completely different API,
66  * specified in POSIX 2008, to do thread-safe locales.  On these systems, our
67  * bool_setlocale_2008_i() function is used to hide the different API from the
68  * outside.  This makes it completely transparent to most XS code.
69  *
70  * A huge complicating factor is that the LC_NUMERIC category is normally held
71  * in the C locale, except during those relatively rare times when it needs to
72  * be in the underlying locale.  There is a bunch of code to accomplish this,
73  * and to allow easy switches from one state to the other.
74  *
75  * In addition, the setlocale equivalents have versions for the return context,
76  * 'void' and 'bool', besides the full return value.  This can present
77  * opportunities for avoiding work.  We don't have to necessarily create a safe
78  * copy to return if no return is desired.
79  *
80  * There are 3.5 major implementations here; which one chosen depends on what
81  * the platform has available, and Configuration options.
82  *
83  * 1) Raw posix_setlocale().  This implementation is basically the libc
84  *    setlocale(), with possibly minor tweaks.  This is used for startup, and
85  *    always for unthreaded perls, and when the API for safe locale threading
86  *    is identical to the unsafe API (Windows, currently).
87  *
88  *    This implementation is composed of two layers:
89  *      a)  posix_setlocale() implements the libc setlocale().  In most cases,
90  *          it is just an alias for the libc version.  But Windows doesn't
91  *          fully conform to the POSIX standard, and this is a layer on top of
92  *          libc to bring it more into conformance.  And in Configurations
93  *          where perl is to ignore some locale categories that the libc
94  *          setlocale() knows about, there is a layer to cope with that.
95  *      b)  stdized_setlocale() is a layer above a) that fixes some vagaries in
96  *          the return value of the libc setlocale().  On most platforms this
97  *          layer is empty; in order to be activated, it requires perl to be
98  *          Configured with a parameter indicating the platform's defect.  The
99  *          current ones are listed at the definition of the macro.
100  *
101  * 2) An implementation that adds a minimal layer above implementation 1),
102  *    making that implementation uninterruptible and returning a
103  *    per-thread/per-category value.
104  *
105  * 3a and 3b) An implementation of POSIX 2008 thread-safe locale handling,
106  *    hiding from the programmer the completely different API for this.
107  *    This automatically makes almost all code thread-safe without need for
108  *    changes.  This implementation is chosen on threaded perls when the
109  *    platform properly supports the POSIX 2008 functions, and when there is no
110  *    manual override to the contrary passed to Configure.
111  *
112  *    3a) is when the platform has a documented reliable querylocale() function
113  *        or equivalent that is selected to be used.
114  *    3b) is when we have to emulate that functionality.
115  *
116  *    Unfortunately, it seems that some platforms that claim to support these
117  *    are buggy, in one way or another.  There are workarounds encoded here,
118  *    where feasible, for platforms where the bugs are amenable to that
119  *    (glibc, for example).  But other platforms instead don't use this
120  *    implementation.
121  *
122  * z/OS (os390) is an outlier.  Locales really don't work under threads when
123  * either the radix character isn't a dot, or attempts are made to change
124  * locales after the first thread is created.  The reason is that IBM has made
125  * it thread-safe by refusing to change locales (returning failure if
126  * attempted) any time after an application has called pthread_create() to
127  * create another thread.  The expectation is that an application will set up
128  * its locale information before the first fork, and be stable thereafter.  But
129  * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
130  * the other toggles, which are less common.
131  *
132  * Associated with each implementation are three sets of macros that translate
133  * a consistent API into what that implementation needs.  Each set consists of
134  * three macros with the suffixes:
135  *  _c  Means the argument is a locale category number known at compile time.
136  *          An example would be LC_TIME.  This token is a compile-time constant
137  *          and can be passed to a '_c' macro.
138  *  _r  Means the argument is a locale category number whose value might not be
139  *      known until runtime
140  *  _i  Means the argument is our internal index of a locale category
141  *
142  * The three sets are:    ('_X'  means one of '_c', '_r', '_i')
143  * 1) bool_setlocale_X()
144  *      This calls the appropriate setlocale()-equivalent for the
145  *      implementation, with the category and new locale.  The input locale is
146  *      not necessarily valid, so the return is true or false depending on
147  *      whether or not the setlocale() succeeded.  This is not used for
148  *      querying the locale, so the input locale must not be NULL.
149  *
150  *      This macro is suitable for toggling the locale back and forth during an
151  *      operation.  For example, the names of days and months under LC_TIME are
152  *      strings that are also subject to LC_CTYPE.  If the locales of these two
153  *      categories differ, mojibake can result on many platforms.  The code
154  *      here will toggle LC_CTYPE into the locale of LC_TIME temporarily to
155  *      avoid this.
156  *
157  *      Several categories require extra work when their locale is changed.
158  *      LC_CTYPE, for example, requires the calculation of the table of which
159  *      characters fold to which others under /i pattern matching or fc(), as
160  *      folding is not a concept in POSIX.  This table isn't needed when the
161  *      LC_CTYPE locale gets toggled during an operation, and will be toggled
162  *      back before return to the caller.  To save work that would be
163  *      discarded, the bool_setlocale_X() implementations don't do this extra
164  *      work.  Instead, there is a separate function for just this purpose to
165  *      be done before control is transferred back to the external caller.  All
166  *      categories that have such requirements have such a function.  The
167  *      update_functions[] array contains pointers to them (or NULL for
168  *      categories which don't need a function).
169  *
170  *      Care must be taken to remember to call the separate function before
171  *      returning to an external caller, and to not use things it updates
172  *      before its call.  An alternative approach would be to have
173  *      bool_setlocale_X() always call the update, which would return
174  *      immediately if a flag wasn't set indicating it was time to actually
175  *      perform it.
176  *
177  * 2) void_setlocale_X()
178  *      This is like bool_setlocale_X(), but it is used only when it is
179  *      expected that the call must succeed, or something is seriously wrong.
180  *      A panic is issued if it fails.  The caller uses this form when it just
181  *      wants to assume things worked.
182  *
183  * 3) querylocale_X()
184  *      This returns a string that specifies the current locale for the given
185  *      category given by the input argument.  The string is safe from other
186  *      threads zapping it, and the caller need not worry about freeing it, but
187  *      it may be mortalized, so must be copied if you need to preserve it
188  *      across calls, or long term.  This returns the actual current locale,
189  *      not the nominal.  These differ, for example, when LC_NUMERIC is
190  *      supposed to be a locale whose decimal radix character is a comma.  As
191  *      mentioned above, Perl actually keeps this category set to C in such
192  *      circumstances so that XS code can just assume a dot radix character.
193  *      querylocale_X() returns the locale that libc has stored at this moment,
194  *      so most of the time will return a locale whose radix character is a
195  *      dot.  The macro query_nominal_locale_i() can be used to get the nominal
196  *      locale that an external caller would expect, for all categories except
197  *      LC_ALL.  For that, you can use the function
198  *      S_calculate_LC_ALL_string().  Or S_native_querylocale_i() will operate
199  *      on any category.
200  *
201  * The underlying C API that this implements uses category numbers, hence the
202  * code is structured to use '_r' at the API level to convert to indexes, which
203  * are then used internally with the '_i' forms.
204  *
205  * The splitting apart into setting vs querying means that the return value of
206  * the bool macros is not subject to potential clashes with other threads,
207  * eliminating any need for the calling code to worry about that and get it
208  * wrong.  Whereas, you do have to think about thread interactions when using a
209  * query.
210  *
211  * Additionally, for the implementations where there aren't any complications,
212  * a setlocale_i() is defined that is like plain setlocale(), returning the new
213  * locale.  Thus it combines a bool_setlocale_X() with a querylocale_X().  It
214  * is used only for performance on implementations that allow it, such as
215  * non-threaded perls.
216  *
217  * There are also a few other macros herein that use this naming convention to
218  * describe their category parameter.
219  *
220  * Relevant Configure options
221  *
222  *      -Accflags=-DNO_LOCALE
223  *          This compiles perl to always use the C locale, ignoring any
224  *          attempts to change it.  This could be useful on platforms with a
225  *          crippled locale implementation.
226  *
227  *      -Accflags=-DNO_THREAD_SAFE_LOCALE
228  *          Even if thread-safe operations are available on this platform and
229  *          would otherwise be used (because this is a perl with multiplicity),
230  *          perl is compiled to not use them.  This could be useful on
231  *          platforms where the libc is buggy.
232  *
233  *      -Accflags=-DNO_POSIX_2008_LOCALE
234  *          Even if the libc locale operations specified by the Posix 2008
235  *          Standard are available on this platform and would otherwise be used
236  *          (because this is a threaded perl), perl is compiled to not use
237  *          them.  This could be useful on platforms where the libc is buggy.
238  *          This is like NO_THREAD_SAFE_LOCALE, but has no effect on platforms
239  *          that don't have these functions.
240  *
241  *      -Accflags=-DUSE_POSIX_2008_LOCALE
242  *          Normally, setlocale() is used for locale operations on perls
243  *          compiled without threads.  This option causes the locale operations
244  *          defined by the Posix 2008 Standard to always be used instead.  This
245  *          could be useful on platforms where the libc setlocale() is buggy.
246  *
247  *      -Accflags=-DNO_THREAD_SAFE_QUERYLOCALE
248  *          This applies only to platforms that have a querylocale() libc
249  *          function.  perl assumes that that function is thread-safe, unless
250  *          overridden by this, typically in a hints file.  When overridden,
251  *          querylocale() is called only while the locale mutex is locked, and
252  *          the result is copied to a per-thread place before unlocking.
253  *
254  *      -Accflags=-DNO_USE_NL_LOCALE_NAME
255  *          glibc has an undocumented equivalent function to querylocale(),
256  *          which our experience indicates is reliable.  But you can forbid its
257  *          use by specifying this Configure option (with no effect on systems
258  *          lacking it).  When this is function is enabled, it removes the need
259  *          for perl to keep its own records, hence is more efficient and
260  *          guaranteed to be accurate.
261  *
262  *      -Accflags=-DNO_LOCALE_CTYPE
263  *      -Accflags=-DNO_LOCALE_NUMERIC
264  *          etc.
265  *
266  *          If the named category(ies) does(do) not exist on this platform,
267  *          these have no effect.  Otherwise they cause perl to be compiled to
268  *          always keep the named category(ies) in the C locale.
269  *
270  *      -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
271  *          This would be set in a hints file to tell perl that doing a libc
272  *              setlocale(LC_ALL, NULL)
273  *          can give erroneous results, and perl will compensate to get the
274  *          correct results.  This is known to be a problem in earlier AIX
275  *          versions
276  *
277  *      -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
278  *          This would be set in a hints file to tell perl that a libc
279  *          setlocale() can return results containing \n characters that need
280  *          to be stripped off.  khw believes there aren't any such platforms
281  *          still in existence.
282  *
283  *      -Accflags=-DLIBC_HANDLES_MISMATCHED_CTYPE
284  *          Consider the name of a month in some language, Chinese for example.
285  *          If LC_TIME has been set to a Chinese locale, strftime() can be used
286  *          to generate the Chinese month name for any given date, by using the
287  *          %B format.  But also suppose that LC_CTYPE is set to, say, "C".
288  *          The return from strftime() on many platforms will be mojibake given
289  *          that no Chinese month name is composed of just ASCII characters.
290  *          Perl handles this for you by automatically toggling LC_CTYPE to
291  *          whatever LC_TIME is during the execution of strftime(), and
292  *          afterwards restoring it to its prior value.  But the strftime()
293  *          (and similar functions) in some libc implementations already do
294  *          this toggle, meaning perl's action is redundant.  You can tell perl
295  *          that a libc does this by setting this Configure option, and it will
296  *          skip its syncing LC_CTYPE and whatever the other locale is.
297  *          Currently, perl ignores this Configuration option and  syncs anyway
298  *          for LC_COLLATE-related operations, due to perl's internal needs.
299  *
300  *      -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
301  *          This is used when developing Perl on a platform that uses
302  *          'name=value;' notation to represent LC_ALL when not all categories
303  *          are the same.  When so compiled, much of the code gets compiled
304  *          and exercised that applies to platforms that instead use positional
305  *          notation.  This allows for finding many bugs in that portion of the
306  *          implementation, without having to access such a platform.
307  *
308  *      -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES
309  *          This is used when developing Perl on a non-Windows platform to
310  *          compile and exercise much of the locale-related code that instead
311  *          applies to MingW platforms that don't use the more modern UCRT
312  *          library.  This allows for finding many bugs in that portion of the
313  *          implementation, without having to access such a platform.
314  */
315 
316 /* If the environment says to, we can output debugging information during
317  * initialization.  This is done before option parsing, and before any thread
318  * creation, so can be a file-level static.  (Must come before #including
319  * perl.h) */
320 #include "config.h"
321 
322 /* Returns the Unix errno portion; ignoring any others.  This is a macro here
323  * instead of putting it into perl.h, because unclear to khw what should be
324  * done generally. */
325 #define GET_ERRNO   saved_errno
326 
327 #ifdef DEBUGGING
328 static int debug_initialization = 0;
329 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
330 #  define DEBUG_LOCALE_INITIALIZATION_  debug_initialization
331 
332 #  ifdef HAS_EXTENDED_OS_ERRNO
333      /* Output the non-zero errno and/or the non-zero extended errno */
334 #    define DEBUG_ERRNO                                                     \
335         dSAVE_ERRNO; dTHX;                                                  \
336         int extended = get_extended_os_errno();                             \
337         const char * errno_string;                                          \
338         if (GET_ERRNO == 0) { /* Skip output if both errno types are 0 */   \
339             if (LIKELY(extended == 0)) errno_string = "";                   \
340             else errno_string = Perl_form(aTHX_ "; $^E=%d", extended);      \
341         }                                                                   \
342         else if (LIKELY(extended == GET_ERRNO))                             \
343             errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO);           \
344         else errno_string = Perl_form(aTHX_ "; $!=%d, $^E=%d",              \
345                                                     GET_ERRNO, extended);
346 #  else
347      /* Output the errno, if non-zero */
348 #    define DEBUG_ERRNO                                                     \
349         dSAVE_ERRNO;                                                        \
350         const char * errno_string = "";                                     \
351         if (GET_ERRNO != 0) {                                               \
352             dTHX;                                                           \
353             errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO);           \
354         }
355 #  endif
356 
357 /* Automatically include the caller's file, and line number in debugging output;
358  * and the errno (and/or extended errno) if non-zero.  On threaded perls add
359  * the aTHX too. */
360 #  if defined(MULTIPLICITY) && ! defined(NO_LOCALE_THREADS)
361 #    define DEBUG_PRE_STMTS                                                 \
362         DEBUG_ERRNO;                                                        \
363         PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": 0x%p%s: ",       \
364                                       __FILE__, (line_t)__LINE__, aTHX_     \
365                                       errno_string);
366 #  else
367 #    define DEBUG_PRE_STMTS                                                 \
368         DEBUG_ERRNO;                                                        \
369         PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf "%s: ",             \
370                                       __FILE__, (line_t)__LINE__,           \
371                                       errno_string);
372 #  endif
373 #  define DEBUG_POST_STMTS  RESTORE_ERRNO;
374 #else
375 #  define debug_initialization 0
376 #  define DEBUG_INITIALIZATION_set(v)
377 #  define DEBUG_PRE_STMTS
378 #  define DEBUG_POST_STMTS
379 #endif
380 
381 #include "EXTERN.h"
382 #define PERL_IN_LOCALE_C
383 #include "perl.h"
384 
385 /* Some platforms require LC_CTYPE to be congruent with the category we are
386  * looking for.  XXX This still presumes that we have to match COLLATE and
387  * CTYPE even on platforms that apparently handle this. */
388 #if defined(USE_LOCALE_CTYPE) && ! defined(LIBC_HANDLES_MISMATCHED_CTYPE)
389 #  define WE_MUST_DEAL_WITH_MISMATCHED_CTYPE    /* no longer used; kept for
390                                                    possible future use */
391 #  define start_DEALING_WITH_MISMATCHED_CTYPE(locale)                          \
392         const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale)
393 #  define end_DEALING_WITH_MISMATCHED_CTYPE(locale)                            \
394         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
395 #else
396 #  define start_DEALING_WITH_MISMATCHED_CTYPE(locale)
397 #  define end_DEALING_WITH_MISMATCHED_CTYPE(locale)
398 #endif
399 
400 #ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
401 
402    /* Use -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES on a POSIX or *nix box
403     * to get a semblance of pretending the locale handling is that of a MingW
404     * that doesn't use UCRT (hence 'OLD' in the name).  This exercizes code
405     * paths that are not compiled on non-Windows boxes, and allows for ASAN
406     * and PERL_MEMLOG.  This is thus a way to see if locale.c on Windows is
407     * likely going to compile, without having to use a real Win32 box.  And
408     * running the test suite will verify to a large extent our logic and memory
409     * allocation handling for such boxes.  Of course the underlying calls are
410     * to the POSIX libc, so any differences in implementation between those and
411     * the Windows versions will not be caught by this. */
412 
413 #  define WIN32
414 #  undef P_CS_PRECEDES
415 #  undef CURRENCY_SYMBOL
416 #  define CP_UTF8 -1
417 #  undef _configthreadlocale
418 #  define _configthreadlocale(arg) NOOP
419 
420 #  define MultiByteToWideChar(cp, flags, byte_string, m1, wstring, req_size) \
421                     (PERL_UNUSED_ARG(cp),                                    \
422                      mbsrtowcs(wstring, &(byte_string), req_size, NULL) + 1)
423 #  define WideCharToMultiByte(cp, flags, wstring, m1, byte_string,          \
424                               req_size, default_char, found_default_char)   \
425                     (PERL_UNUSED_ARG(cp),                                   \
426                      wcsrtombs(byte_string, &(wstring), req_size, NULL) + 1)
427 
428 #  ifdef USE_LOCALE
429 
430 static const wchar_t * wsetlocale_buf = NULL;
431 static Size_t wsetlocale_buf_size = 0;
432 
433 #    ifdef MULTIPLICITY
434 
435 static PerlInterpreter * wsetlocale_buf_aTHX = NULL;
436 
437 #    endif
438 
439 STATIC
440 const wchar_t *
S_wsetlocale(const int category,const wchar_t * wlocale)441 S_wsetlocale(const int category, const wchar_t * wlocale)
442 {
443     /* Windows uses a setlocale that takes a wchar_t* locale.  Other boxes
444      * don't have this, so this Windows replacement converts the wchar_t input
445      * to plain 'char*', calls plain setlocale(), and converts the result back
446      * to 'wchar_t*' */
447 
448     const char * byte_locale = NULL;
449     if (wlocale) {
450         byte_locale = Win_wstring_to_byte_string(CP_UTF8, wlocale);
451     }
452 
453     const char * byte_result = setlocale(category, byte_locale);
454     Safefree(byte_locale);
455     if (byte_result == NULL) {
456         return NULL;
457     }
458 
459     const wchar_t * wresult = Win_byte_string_to_wstring(CP_UTF8, byte_result);
460 
461     if (! wresult) {
462         return NULL;
463     }
464 
465     /* Emulate a global static memory return from wsetlocale().  This currently
466      * leaks at process end; would require changing LOCALE_TERM to fix that */
467     Size_t string_size = wcslen(wresult) + 1;
468 
469     if (wsetlocale_buf_size == 0) {
470         Newx(wsetlocale_buf, string_size, wchar_t);
471         wsetlocale_buf_size = string_size;
472 
473 #  ifdef MULTIPLICITY
474 
475         dTHX;
476         wsetlocale_buf_aTHX = aTHX;
477 
478 #  endif
479 
480     }
481     else if (string_size > wsetlocale_buf_size) {
482         Renew(wsetlocale_buf, string_size, wchar_t);
483         wsetlocale_buf_size = string_size;
484     }
485 
486     Copy(wresult, wsetlocale_buf, string_size, wchar_t);
487     Safefree(wresult);
488 
489     return wsetlocale_buf;
490 }
491 
492 #  define _wsetlocale(category, wlocale)  S_wsetlocale(category, wlocale)
493 #  endif
494 #endif  /* WIN32_USE_FAKE_OLD_MINGW_LOCALES */
495 
496 /* 'for' loop headers to hide the necessary casts */
497 #define for_category_indexes_between(i, m, n)                               \
498     for (locale_category_index i = (locale_category_index) (m);             \
499          i <= (locale_category_index) (n);                                  \
500          i = (locale_category_index) ((int) i + 1))
501 #define for_all_individual_category_indexes(i)                              \
502         for_category_indexes_between(i, 0, LC_ALL_INDEX_ - 1)
503 #define for_all_but_0th_individual_category_indexes(i)                      \
504         for_category_indexes_between(i, 1, LC_ALL_INDEX_ - 1)
505 #define for_all_category_indexes(i)                                         \
506         for_category_indexes_between(i, 0, LC_ALL_INDEX_)
507 
508 #ifdef USE_LOCALE
509 #  if defined(USE_FAKE_LC_ALL_POSITIONAL_NOTATION) && defined(LC_ALL)
510 
511 /* This simulates an underlying positional notation for LC_ALL when compiled on
512  * a system that uses name=value notation.  Use this to develop on Linux and
513  * make a quick check that things have some chance of working on a positional
514  * box.  Enable by adding to the Congfigure parameters:
515  *      -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
516  *
517  * NOTE it redefines setlocale() and usequerylocale()
518  * */
519 
520 STATIC const char *
S_positional_name_value_xlation(const char * locale,bool direction)521 S_positional_name_value_xlation(const char * locale, bool direction)
522 {   /* direction == 1 is from name=value to positional
523        direction == 0 is from positional to name=value */
524     assert(locale);
525 
526     dTHX;
527     const char * individ_locales[LC_ALL_INDEX_] = { NULL };
528 
529     /* This parses either notation */
530     switch (parse_LC_ALL_string(locale,
531                                 (const char **) &individ_locales,
532                                 no_override,  /* Handled by other code */
533                                 false,      /* Return only [0] if suffices */
534                                 false,      /* Don't panic on error */
535                                 __LINE__))
536     {
537       default:      /* Some compilers don't realize that below is the complete
538                        list of the available enum values */
539       case invalid:
540         return NULL;
541 
542       case no_array:
543         return locale;
544       case only_element_0:
545         SAVEFREEPV(individ_locales[0]);
546         return individ_locales[0];
547       case full_array:
548        {
549         calc_LC_ALL_format  format = (direction)
550                                      ? EXTERNAL_FORMAT_FOR_SET
551                                      : INTERNAL_FORMAT;
552         const char * retval = calculate_LC_ALL_string(individ_locales,
553                                                       format,
554                                                       WANT_TEMP_PV,
555                                                       __LINE__);
556 
557         for_all_individual_category_indexes(i) {
558             Safefree(individ_locales[i]);
559         }
560 
561         return retval;
562        }
563     }
564 }
565 
566 STATIC const char *
S_positional_setlocale(int cat,const char * locale)567 S_positional_setlocale(int cat, const char * locale)
568 {
569     if (cat != LC_ALL) return setlocale(cat, locale);
570 
571     if (locale && strNE(locale, "")) {
572         locale = S_positional_name_value_xlation(locale, 0);
573         if (! locale) return NULL;
574     }
575 
576     locale = setlocale(cat, locale);
577     if (locale == NULL) return NULL;
578     return S_positional_name_value_xlation(locale, 1);
579 }
580 
581 #    undef setlocale
582 #    define setlocale(a,b)  S_positional_setlocale(a,b)
583 #    ifdef USE_POSIX_2008_LOCALE
584 
585 STATIC locale_t
S_positional_newlocale(int mask,const char * locale,locale_t base)586 S_positional_newlocale(int mask, const char * locale, locale_t base)
587 {
588     assert(locale);
589 
590     if (mask != LC_ALL_MASK) return newlocale(mask, locale, base);
591 
592     if (strNE(locale, "")) locale = S_positional_name_value_xlation(locale, 0);
593     if (locale == NULL) return NULL;
594     return newlocale(LC_ALL_MASK, locale, base);
595 }
596 
597 #    undef newlocale
598 #    define newlocale(a,b,c)  S_positional_newlocale(a,b,c)
599 #    endif
600 #  endif
601 #endif  /* End of fake positional notation */
602 
603 #include "reentr.h"
604 
605 #ifdef I_WCHAR
606 #  include <wchar.h>
607 #endif
608 #ifdef I_WCTYPE
609 #  include <wctype.h>
610 #endif
611 
612  /* The main errno that gets used is this one, on platforms that support it */
613 #ifdef EINVAL
614 #  define SET_EINVAL  SETERRNO(EINVAL, LIB_INVARG)
615 #else
616 #  define SET_EINVAL
617 #endif
618 
619 /* This is a starting guess as to when this is true.  It definititely isn't
620  * true on *BSD where positional LC_ALL notation is used.  Likely this will end
621  * up being defined in hints files. */
622 #ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
623 #  define NEWLOCALE_HANDLES_DISPARATE_LC_ALL
624 #endif
625 
626 /* But regardless, we have to look at individual categories if some are
627  * ignored.  */
628 #ifdef HAS_IGNORED_LOCALE_CATEGORIES_
629 #  undef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
630 #endif
631 #ifdef USE_LOCALE
632 
633 /* Not all categories need be set to the same locale.  This macro determines if
634  * 'name' which represents LC_ALL is uniform or disparate.  There are two
635  * situations: 1) the platform uses unordered name=value pairs; 2) the platform
636  * uses ordered positional values, with a separator string between them */
637 #  ifdef PERL_LC_ALL_SEPARATOR   /* positional */
638 #    define is_disparate_LC_ALL(name)  cBOOL(instr(name, PERL_LC_ALL_SEPARATOR))
639 #  else     /* name=value */
640 
641     /* In the, hopefully never occurring, event that the platform doesn't use
642      * either mechanism for disparate LC_ALL's, assume the name=value pairs
643      * form, rather than taking the extreme step of refusing to compile.  Many
644      * programs won't have disparate locales, so will generally work */
645 #    define PERL_LC_ALL_SEPARATOR  ";"
646 #    define is_disparate_LC_ALL(name)  cBOOL(   strchr(name, ';')   \
647                                              && strchr(name, '='))
648 #  endif
649 
650 /* It is possible to compile perl to always keep any individual category in the
651  * C locale.  This would be done where the implementation on a platform is
652  * flawed or incomplete.  At the time of this writing, for example, OpenBSD has
653  * not implemented LC_COLLATE beyond the C locale.  The 'category_available[]'
654  * table is a bool that says whether a category is changeable, or must be kept
655  * in C.  This macro substitutes C for the locale appropriately, expanding to
656  * nothing on the more typical case where all possible categories present on
657  * the platform are handled. */
658 #  if defined(HAS_IGNORED_LOCALE_CATEGORIES_)       \
659    || defined(HAS_MISSING_LANGINFO_ITEM_)
660 #    define need_to_override_category(i)  (! category_available[i])
661 #    define override_ignored_category(i, new_locale)                        \
662                     ((need_to_override_category(i)) ? "C" : (new_locale))
663 #  else
664 #    define need_to_override_category(i)  0
665 #    define override_ignored_category(i, new_locale)  (new_locale)
666 #  endif
667 
668 PERL_STATIC_INLINE const char *
S_mortalized_pv_copy(pTHX_ const char * const pv)669 S_mortalized_pv_copy(pTHX_ const char * const pv)
670 {
671     PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
672 
673     /* Copies the input pv, and arranges for it to be freed at an unspecified
674      * later time. */
675 
676     if (pv == NULL) {
677         return NULL;
678     }
679 
680     const char * copy = savepv(pv);
681     SAVEFREEPV(copy);
682     return copy;
683 }
684 
685 #endif
686 
687 /* Default values come from the C locale */
688 #define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
689                                       a single instance, so is a #define */
690 static const char C_decimal_point[] = ".";
691 
692 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
693 #  define HAS_SOME_LANGINFO
694 #endif
695 
696 #if (defined(USE_LOCALE_NUMERIC) && ! defined(TS_W32_BROKEN_LOCALECONV))    \
697  || ! (   defined(USE_LOCALE_NUMERIC)                                       \
698        && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)))
699 static const char C_thousands_sep[] = "";
700 #endif
701 
702 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
703  * return of setlocale(), then this is extremely likely to be the C or POSIX
704  * locale.  However, the output of setlocale() is documented to be opaque, but
705  * the odds are extremely small that it would return these two strings for some
706  * other locale.  Note that VMS includes many non-ASCII characters in these two
707  * locales as controls and punctuation (below are hex bytes):
708  *   cntrl:  84-97 9B-9F
709  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
710  * Oddly, none there are listed as alphas, though some represent alphabetics
711  * https://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
712 #define isNAME_C_OR_POSIX(name)                                              \
713                              (   (name) != NULL                              \
714                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
715                                    || strEQ((name), "POSIX")))
716 
717 /* If this interface to nl_langinfo() isn't defined by embed.fnc, it means it
718  * isn't available on this platform, so instead emulate it */
719 #ifndef langinfo_sv_i
720 #  define langinfo_sv_i(i, c, l, s, u)                                      \
721                         (PERL_UNUSED_VAR(c), emulate_langinfo(i, l, s, u))
722 #endif
723 
724 /* In either case, create a version that takes things like 'LC_NUMERIC' as a
725  * parameter */
726 #define langinfo_sv_c(item, category, locale, sv, utf8ness)                 \
727         langinfo_sv_i(item, category##_INDEX_, locale, sv, utf8ness)
728 
729 /* The normal method for interfacing with nl_langinfo() in this file is to use
730  * a scratch buffer (whose existence is hidden from the caller by these
731  * macros). */
732 #define langinfo_i(item, index, locale, utf8ness)                           \
733         langinfo_sv_i(item, index, locale, PL_scratch_langinfo, utf8ness)
734 
735 #define langinfo_c(item, category, locale, utf8ness)                        \
736         langinfo_i(item, category##_INDEX_, locale, utf8ness)
737 
738 #ifndef USE_LOCALE  /* A no-op unless locales are enabled */
739 #  define toggle_locale_i(index, locale)                                    \
740     ((const char *) (PERL_UNUSED_VAR(locale), NULL))
741 #  define restore_toggled_locale_i(index, locale)  PERL_UNUSED_VAR(locale)
742 #else
743 #  define toggle_locale_i(index, locale)                                    \
744                  S_toggle_locale_i(aTHX_ index, locale, __LINE__)
745 #  define restore_toggled_locale_i(index, locale)                           \
746                  S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
747 #endif
748 
749 #  define toggle_locale_c(cat, locale)  toggle_locale_i(cat##_INDEX_, locale)
750 #  define restore_toggled_locale_c(cat, locale)                             \
751                              restore_toggled_locale_i(cat##_INDEX_, locale)
752 #ifdef USE_LOCALE
753 #  ifdef DEBUGGING
754 #    define setlocale_debug_string_i(index, locale, result)                 \
755             my_setlocale_debug_string_i(index, locale, result, __LINE__)
756 #    define setlocale_debug_string_c(category, locale, result)              \
757                 setlocale_debug_string_i(category##_INDEX_, locale, result)
758 #    define setlocale_debug_string_r(category, locale, result)              \
759              setlocale_debug_string_i(get_category_index(category),         \
760                                       locale, result)
761 #  endif
762 
763 /* On systems without LC_ALL, pretending it exists anyway simplifies things.
764  * Choose a value for it that is very unlikely to clash with any actual
765  * category */
766 #  define FAKE_LC_ALL  PERL_INT_MIN
767 
768 /* Below are parallel arrays for locale information indexed by our mapping of
769  * category numbers into small non-negative indexes.  locale_table.h contains
770  * an entry like this for each individual category used on this system:
771  *      PERL_LOCALE_TABLE_ENTRY(CTYPE, S_new_ctype)
772  *
773  * Each array redefines PERL_LOCALE_TABLE_ENTRY to generate the information
774  * needed for that array, and #includes locale_table.h to get the valid
775  * categories.
776  *
777  * An entry for the conglomerate category LC_ALL is added here, immediately
778  * following the individual categories.  (The treatment for it varies, so can't
779  * be in locale_table.h.)
780  *
781  * Following this, each array ends with an entry for illegal categories.  All
782  * category numbers unknown to perl get mapped to this entry.  This is likely
783  * to be a parameter error from the calling program; but it could be that this
784  * platform has a category we don't know about, in which case it needs to be
785  * added, using the paradigm of one of the existing categories. */
786 
787 /* The first array is the locale categories perl uses on this system, used to
788  * map our index back to the system's category number. */
789 STATIC const int categories[] = {
790 
791 #  undef PERL_LOCALE_TABLE_ENTRY
792 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name,
793 #  include "locale_table.h"
794 
795 #  ifdef LC_ALL
796     LC_ALL,
797 #  else
798     FAKE_LC_ALL,
799 #  endif
800 
801    (FAKE_LC_ALL + 1)    /* Entry for unknown category; this number is unlikely
802                            to clash with a real category */
803 };
804 
805 # define GET_NAME_AS_STRING(token)  # token
806 # define GET_LC_NAME_AS_STRING(token) GET_NAME_AS_STRING(LC_ ## token)
807 
808 /* The second array is the category names. */
809 STATIC const char * const category_names[] = {
810 
811 #  undef PERL_LOCALE_TABLE_ENTRY
812 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  GET_LC_NAME_AS_STRING(name),
813 #  include "locale_table.h"
814 
815 #  ifdef LC_ALL
816 #    define LC_ALL_STRING  "LC_ALL"
817 #  else
818 #    define LC_ALL_STRING  "If you see this, it is a bug in perl;"      \
819                            " please report it via perlbug"
820 #  endif
821 
822     LC_ALL_STRING,
823 
824 #  define LC_UNKNOWN_STRING  "Locale category unknown to Perl; if you see"  \
825                              " this, it is a bug in perl; please report it" \
826                              " via perlbug"
827     LC_UNKNOWN_STRING
828 };
829 
830 STATIC const Size_t category_name_lengths[] = {
831 
832 #  undef PERL_LOCALE_TABLE_ENTRY
833 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
834                                     STRLENs(GET_LC_NAME_AS_STRING(name)),
835 #  include "locale_table.h"
836 
837     STRLENs(LC_ALL_STRING),
838     STRLENs(LC_UNKNOWN_STRING)
839 };
840 
841 /* Each entry includes space for the '=' and ';' */
842 #  undef PERL_LOCALE_TABLE_ENTRY
843 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
844                                 + STRLENs(GET_LC_NAME_AS_STRING(name)) + 2
845 
846 STATIC const Size_t lc_all_boiler_plate_length = 1  /* space for trailing NUL */
847 #  include "locale_table.h"
848 ;
849 
850 /* A few categories require additional setup when they are changed.  This table
851  * points to the functions that do that setup */
852 STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = {
853 
854 #  undef PERL_LOCALE_TABLE_ENTRY
855 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  call_back,
856 #  include "locale_table.h"
857 
858     S_new_LC_ALL,
859     NULL,   /* No update for unknown category */
860 };
861 
862 #  if defined(HAS_IGNORED_LOCALE_CATEGORIES_)       \
863    || defined(HAS_MISSING_LANGINFO_ITEM_)
864 
865 /* Indicates if each category on this platform is available to use not in
866  * the C locale */
867 STATIC const bool category_available[] = {
868 
869 #  undef PERL_LOCALE_TABLE_ENTRY
870 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name ## _AVAIL_,
871 #  include "locale_table.h"
872 
873 #  ifdef LC_ALL
874     true,
875 #  else
876     false,
877 #  endif
878 
879     false   /* LC_UNKNOWN_AVAIL_ */
880 };
881 
882 #  endif
883 #  if defined(USE_POSIX_2008_LOCALE)
884 
885 STATIC const int category_masks[] = {
886 
887 #    undef PERL_LOCALE_TABLE_ENTRY
888 #    define PERL_LOCALE_TABLE_ENTRY(name, call_back)  LC_ ## name ## _MASK,
889 #    include "locale_table.h"
890 
891     LC_ALL_MASK,    /* Will rightly refuse to compile unless this is defined */
892     0               /* Empty mask for unknown category */
893 };
894 
895 #  endif
896 #  if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS)
897 
898 /* On platforms that use positional notation for expressing LC_ALL, this maps
899  * the position of each category to our corresponding internal index for it.
900  * This is initialized at run time if needed.  LC_ALL_INDEX_ is not legal for
901  * an individual locale, hence marks the elements here as not actually
902  * initialized. */
903 STATIC
904 unsigned int
905 map_LC_ALL_position_to_index[LC_ALL_INDEX_] = { LC_ALL_INDEX_ };
906 
907 #  endif
908 #endif
909 #if defined(USE_LOCALE) || defined(DEBUGGING)
910 
911 STATIC const char *
S_get_displayable_string(pTHX_ const char * const s,const char * const e,const bool is_utf8)912 S_get_displayable_string(pTHX_
913                          const char * const s,
914                          const char * const e,
915                          const bool is_utf8)
916 {
917     PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING;
918 
919     if (e <= s) {
920         return "";
921     }
922 
923     const char * t = s;
924     bool prev_was_printable = TRUE;
925     bool first_time = TRUE;
926     char * ret;
927 
928     /* Worst case scenario: All are non-printable so have a blank between each.
929      * If UTF-8, all are the largest possible code point; otherwise all are a
930      * single byte.  '(2 + 1)'  is from each byte takes 2 characters to
931      * display, and a blank (or NUL for the final one) after it */
932     const Size_t size = (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1);
933     Newxz(ret, size, char);
934     SAVEFREEPV(ret);
935 
936     while (t < e) {
937         UV cp = (is_utf8)
938                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
939                 : * (U8 *) t;
940         if (isPRINT(cp)) {
941             if (! prev_was_printable) {
942                 my_strlcat(ret, " ", size);
943             }
944 
945             /* Escape these to avoid any ambiguity */
946             if (cp == ' ' || cp == '\\') {
947                 my_strlcat(ret, "\\", size);
948             }
949             my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), size);
950             prev_was_printable = TRUE;
951         }
952         else {
953             if (! first_time) {
954                 my_strlcat(ret, " ", size);
955             }
956             my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), size);
957             prev_was_printable = FALSE;
958         }
959         t += (is_utf8) ? UTF8SKIP(t) : 1;
960         first_time = FALSE;
961     }
962 
963     return ret;
964 }
965 
966 #endif
967 #ifdef USE_LOCALE
968 
969 # define get_category_index(cat) get_category_index_helper(cat, NULL, __LINE__)
970 
971 STATIC locale_category_index
S_get_category_index_helper(pTHX_ const int category,bool * succeeded,const line_t caller_line)972 S_get_category_index_helper(pTHX_ const int category, bool * succeeded,
973                                   const line_t caller_line)
974 {
975     PERL_ARGS_ASSERT_GET_CATEGORY_INDEX_HELPER;
976 
977     /* Given a category, return the equivalent internal index we generally use
978      * instead, warn or panic if not found. */
979 
980     locale_category_index i;
981 
982 #  undef PERL_LOCALE_TABLE_ENTRY
983 #  define PERL_LOCALE_TABLE_ENTRY(name, call_back)                          \
984                     case LC_ ## name: i =  LC_ ## name ## _INDEX_; break;
985 
986     switch (category) {
987 
988 #  include "locale_table.h"
989 #  ifdef LC_ALL
990       case LC_ALL: i =  LC_ALL_INDEX_; break;
991 #  endif
992 
993       default: goto unknown_locale;
994     }
995 
996     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
997                            "index of category %d (%s) is %d;"
998                            " called from %" LINE_Tf "\n",
999                            category, category_names[i], i, caller_line));
1000 
1001     if (succeeded) {
1002         *succeeded = true;
1003     }
1004 
1005     return i;
1006 
1007   unknown_locale:
1008 
1009     if (succeeded) {
1010         *succeeded = false;
1011         return LC_ALL_INDEX_;   /* Arbitrary */
1012     }
1013 
1014     locale_panic_via_(Perl_form(aTHX_ "Unknown locale category %d", category),
1015                       __FILE__, caller_line);
1016     NOT_REACHED; /* NOTREACHED */
1017 }
1018 
1019 #endif /* ifdef USE_LOCALE */
1020 
1021 void
Perl_force_locale_unlock(pTHX)1022 Perl_force_locale_unlock(pTHX)
1023 {
1024     /* Remove any locale mutex, in preperation for an inglorious termination,
1025      * typically a  panic */
1026 
1027 #if defined(USE_LOCALE_THREADS)
1028 
1029     /* If recursively locked, clear all at once */
1030     if (PL_locale_mutex_depth > 1) {
1031         PL_locale_mutex_depth = 1;
1032     }
1033 
1034     if (PL_locale_mutex_depth > 0) {
1035         LOCALE_UNLOCK_;
1036     }
1037 
1038 #endif
1039 
1040 }
1041 
1042 #ifdef USE_POSIX_2008_LOCALE
1043 
1044 STATIC locale_t
S_use_curlocale_scratch(pTHX)1045 S_use_curlocale_scratch(pTHX)
1046 {
1047     /* This function is used to hide from the caller the case where the current
1048      * locale_t object in POSIX 2008 is the global one, which is illegal in
1049      * many of the P2008 API calls.  This checks for that and, if necessary
1050      * creates a proper P2008 object.  Any prior object is deleted, as is any
1051      * remaining object during global destruction. */
1052 
1053     locale_t cur = uselocale((locale_t) 0);
1054 
1055     if (cur != LC_GLOBAL_LOCALE) {
1056         return cur;
1057     }
1058 
1059     if (PL_scratch_locale_obj) {
1060         freelocale(PL_scratch_locale_obj);
1061     }
1062 
1063     PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
1064     return PL_scratch_locale_obj;
1065 }
1066 
1067 #endif
1068 
1069 void
Perl_locale_panic(const char * msg,const line_t immediate_caller_line,const char * const higher_caller_file,const line_t higher_caller_line)1070 Perl_locale_panic(const char * msg,
1071                   const line_t immediate_caller_line,
1072                   const char * const higher_caller_file,
1073                   const line_t higher_caller_line)
1074 {
1075     PERL_ARGS_ASSERT_LOCALE_PANIC;
1076     dTHX;
1077     dSAVE_ERRNO;
1078 
1079     force_locale_unlock();
1080 
1081 #ifdef USE_C_BACKTRACE
1082     dump_c_backtrace(Perl_debug_log, 20, 1);
1083 #endif
1084 
1085     const char * called_by = "";
1086     if (   strNE(__FILE__, higher_caller_file)
1087         || immediate_caller_line != higher_caller_line)
1088     {
1089         called_by = Perl_form(aTHX_ "\nCalled by %s: %" LINE_Tf "\n",
1090                                     higher_caller_file, higher_caller_line);
1091     }
1092 
1093     RESTORE_ERRNO;
1094 
1095     const char * errno_text;
1096 
1097 #ifdef HAS_EXTENDED_OS_ERRNO
1098 
1099     const int extended_errnum = get_extended_os_errno();
1100     if (errno != extended_errnum) {
1101         errno_text = Perl_form(aTHX_ "; errno=%d, $^E=%d",
1102                                      errno, extended_errnum);
1103     }
1104     else
1105 
1106 #endif
1107 
1108     {
1109         errno_text = Perl_form(aTHX_ "; errno=%d", errno);
1110     }
1111 
1112     /* diag_listed_as: panic: %s */
1113     Perl_croak(aTHX_ "%s: %" LINE_Tf ": panic: %s%s%s\n",
1114                      __FILE__, immediate_caller_line,
1115                      msg, errno_text, called_by);
1116 }
1117 
1118 /* Macros to report and croak on an unexpected failure to set the locale.  The
1119  * via version has more stack trace information */
1120 #define setlocale_failure_panic_i(i, cur, fail, line, higher_line)          \
1121     setlocale_failure_panic_via_i(i, cur, fail, __LINE__, line,             \
1122                                   __FILE__, higher_line)
1123 
1124 #define setlocale_failure_panic_c(cat, cur, fail, line, higher_line)        \
1125    setlocale_failure_panic_i(cat##_INDEX_, cur, fail, line, higher_line)
1126 
1127 #if defined(USE_LOCALE)
1128 
1129 /* Expands to the code to
1130  *      result = savepvn(s, len)
1131  * if the category whose internal index is 'i' doesn't need to be kept in the C
1132  * locale on this system, or if 'action is 'no_override'.  Otherwise it expands
1133  * to
1134  *      result = savepv("C")
1135  * unless 'action' isn't 'check_that_overridden', in which case if the string
1136  * 's' isn't already "C" it panics */
1137 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
1138 #    define OVERRIDE_AND_SAVEPV(s, len, result, i, action)                  \
1139                                                   result = savepvn(s, len)
1140 #  else
1141 #    define OVERRIDE_AND_SAVEPV(s, len, result, i, action)                  \
1142         STMT_START {                                                        \
1143             if (LIKELY(   ! need_to_override_category(i)                    \
1144                        || action == no_override)) {                         \
1145                 result = savepvn(s, len);                                   \
1146             }                                                               \
1147             else {                                                          \
1148                 const char * temp = savepvn(s, len);                        \
1149                 result = savepv(override_ignored_category(i, temp));        \
1150                 if (action == check_that_overridden && strNE(result, temp)) { \
1151                     locale_panic_(Perl_form(aTHX_                           \
1152                                 "%s expected to be '%s', instead is '%s'",  \
1153                                 category_names[i], result, temp));          \
1154                 }                                                           \
1155                 Safefree(temp);                                             \
1156             }                                                               \
1157         } STMT_END
1158 #  endif
1159 
1160 STATIC parse_LC_ALL_string_return
S_parse_LC_ALL_string(pTHX_ const char * string,const char ** output,const parse_LC_ALL_STRING_action override,bool always_use_full_array,const bool panic_on_error,const line_t caller_line)1161 S_parse_LC_ALL_string(pTHX_ const char * string,
1162                             const char ** output,
1163                             const parse_LC_ALL_STRING_action  override,
1164                             bool always_use_full_array,
1165                             const bool panic_on_error,
1166                             const line_t caller_line)
1167 {
1168     /* This function parses the value of the input 'string' which is expected
1169      * to be the representation of an LC_ALL locale, and splits the result into
1170      * the values for the individual component categories, returning those in
1171      * the 'output' array.  Each array value will be a savepv() copy that is
1172      * the responsibility of the caller to make sure gets freed
1173      *
1174      * The locale for each category is independent of the other categories.
1175      * Often, they are all the same, but certainly not always.  Perl, in fact,
1176      * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
1177      * locale.  LC_ALL has to be able to represent the case of when not all
1178      * categories have the same locale.  Platforms have differing ways of
1179      * representing this.  Internally, this file uses the 'name=value;'
1180      * representation found on some platforms, so this function always looks
1181      * for and parses that.  Other platforms use a positional notation.  On
1182      * those platforms, this function also parses that form.  It examines the
1183      * input to see which form is being parsed.
1184      *
1185      * Often, all categories will have the same locale.  This is special cased
1186      * if 'always_use_full_array' is false on input:
1187      *      1) If the input 'string' is a single value, this function doesn't
1188      *         store anything into 'output', and returns 'no_array'
1189      *      2) Some platforms will return multiple occurrences of the same
1190      *         value rather than coalescing them down to a single one.  HP-UX
1191      *         is such a one.  This function will do that collapsing for you,
1192      *         returning 'only_element_0' and saving the single value in
1193      *         output[0], which the caller will need to arrange to be freed.
1194      *         The rest of output[] is undefined, and does not need to be
1195      *         freed.
1196      *
1197      * Otherwise, the input 'string' may not be valid.  This function looks
1198      * mainly for syntactic errors, and if found, returns 'invalid'.  'output'
1199      * will not be filled in in that case, but the input state of it isn't
1200      * necessarily preserved.  Turning on -DL debugging will give details as to
1201      * the error.  If 'panic_on_error' is 'true', the function panics instead
1202      * of returning on error, with a message giving the details.
1203      *
1204      * Otherwise, output[] will be filled with the individual locale names for
1205      * all categories on the system, 'full_array' will be returned, and the
1206      * caller needs to arrange for each to be freed.  This means that either at
1207      * least one category differed from the others, or 'always_use_full_array' was
1208      * true on input.
1209      *
1210      * perl may be configured to ignore changes to a category's locale to
1211      * non-C.  The parameter 'override' tells this function what to do when
1212      * encountering such an illegal combination:
1213      *
1214      *      no_override             indicates to take no special action
1215      *      override_if_ignored,    indicates to return 'C' instead of what the
1216      *                              input string actually says.
1217      *      check_that_overridden   indicates to panic if the string says the
1218      *                              category is not 'C'.  This is used when
1219      *                              non-C is very unexpected behavior.
1220      * */
1221 
1222     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1223                            "Entering parse_LC_ALL_string; called from %"    \
1224                            LINE_Tf "\nnew='%s'\n", caller_line, string));
1225 
1226 #  ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1227 
1228     const char separator[] = ";";
1229     const Size_t separator_len = 1;
1230     const bool single_component = (strchr(string, ';') == NULL);
1231 
1232 #  else
1233 
1234     /* It's possible (but quite unlikely) that the separator string is an '='
1235      * or a ';'.  Requiring both to be present for using the 'name=value;' form
1236      * properly handles those possibilities */
1237     const bool name_value = strchr(string, '=') && strchr(string, ';');
1238     const char * separator;
1239     Size_t separator_len;
1240     bool single_component;
1241     if (name_value) {
1242         separator = ";";
1243         separator_len = 1;
1244         single_component = false;   /* Since has both [;=], must be multi */
1245     }
1246     else {
1247         separator = PERL_LC_ALL_SEPARATOR;
1248         separator_len = STRLENs(PERL_LC_ALL_SEPARATOR);
1249         single_component = instr(string, separator) == NULL;
1250     }
1251 
1252     Size_t component_number = 0;    /* Position in the parsing loop below */
1253 
1254 #  endif
1255 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
1256     PERL_UNUSED_ARG(override);
1257 #  else
1258 
1259     /* Any ignored categories are to be set to "C", so if this single-component
1260      * LC_ALL isn't to C, it has both "C" and non-C, so isn't really a single
1261      * component.  All the non-ignored categories are set to the input
1262      * component, but the ignored ones are overridden to be C.
1263      *
1264      * This incidentally handles the case where the string is "".  The return
1265      * will be C for each ignored category and "" for the others.  Then the
1266      * caller can individually set each category, and get the right answer. */
1267     if (single_component && ! isNAME_C_OR_POSIX(string)) {
1268         for_all_individual_category_indexes(i) {
1269            OVERRIDE_AND_SAVEPV(string, strlen(string), output[i], i, override);
1270         }
1271 
1272         return full_array;
1273     }
1274 
1275 #  endif
1276 
1277     if (single_component) {
1278         if (! always_use_full_array) {
1279             return no_array;
1280         }
1281 
1282         for_all_individual_category_indexes(i) {
1283             output[i] = savepv(string);
1284         }
1285 
1286         return full_array;
1287     }
1288 
1289     /* Here the input is multiple components.  Parse through them.  (It is
1290      * possible that these components are all the same, so we check, and if so,
1291      * return just the 0th component (unless 'always_use_full_array' is true)
1292      *
1293      * This enum notes the possible errors findable in parsing */
1294     enum {
1295             incomplete,
1296             no_equals,
1297             unknown_category,
1298             contains_LC_ALL_element
1299     } error;
1300 
1301     /* Keep track of the categories we have encountered so far */
1302     bool seen[LC_ALL_INDEX_] = { false };
1303 
1304     Size_t index;           /* Our internal index for the current category */
1305     const char * s = string;
1306     const char * e = s + strlen(string);
1307     const char * category_end = NULL;
1308     const char * saved_first = NULL;
1309 
1310     /* Parse the input locale string */
1311     while (s < e) {
1312 
1313         /* 'separator' has been set up to delimit the components */
1314         const char * next_sep = instr(s, separator);
1315         if (! next_sep) {   /* At the end of the input */
1316             next_sep = e;
1317         }
1318 
1319 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1320 
1321         if (! name_value) {
1322             /* Get the index of the category in this position */
1323             index = map_LC_ALL_position_to_index[component_number++];
1324         }
1325         else
1326 
1327 #  endif
1328 
1329         {   /* Get the category part when each component is the
1330              * 'category=locale' form */
1331 
1332             category_end = strchr(s, '=');
1333 
1334             /* The '=' terminates the category name.  If no '=', is improper
1335              * form */
1336             if (! category_end) {
1337                 error = no_equals;
1338                 goto failure;
1339             }
1340 
1341             /* Find our internal index of the category name; uses a linear
1342              * search.  (XXX This could be avoided by various means, but the
1343              * maximum likely search is 6 items, and khw doesn't think the
1344              * added complexity would save very much at all.) */
1345             const unsigned int name_len = (unsigned int) (category_end - s);
1346             for (index = 0; index < C_ARRAY_LENGTH(category_names); index++) {
1347                 if (   name_len == category_name_lengths[index]
1348                     && memEQ(s, category_names[index], name_len))
1349                 {
1350                     goto found_category;
1351                 }
1352             }
1353 
1354             /* Here, the category is not in our list. */
1355             error = unknown_category;
1356             goto failure;
1357 
1358           found_category:   /* The system knows about this category. */
1359 
1360             if (index == LC_ALL_INDEX_) {
1361                 error = contains_LC_ALL_element;
1362                 goto failure;
1363             }
1364 
1365             /* The locale name starts just beyond the '=' */
1366             s = category_end + 1;
1367 
1368             /* Linux (and maybe others) doesn't treat a duplicate category in
1369              * the string as an error.  Instead it uses the final occurrence as
1370              * the intended value.  So if this is a duplicate, free the former
1371              * value before setting the new one */
1372             if (seen[index]) {
1373                 Safefree(output[index]);
1374             }
1375             else {
1376                 seen[index] = true;
1377             }
1378         }
1379 
1380         /* Here, 'index' contains our internal index number for the current
1381          * category, and 's' points to the beginning of the locale name for
1382          * that category. */
1383         OVERRIDE_AND_SAVEPV(s, next_sep - s, output[index], index, override);
1384 
1385         if (! always_use_full_array) {
1386             if (! saved_first) {
1387                 saved_first = output[index];
1388             }
1389             else {
1390                 if (strNE(saved_first, output[index])) {
1391                     always_use_full_array = true;
1392                 }
1393             }
1394         }
1395 
1396         /* Next time start from the new position */
1397         s = next_sep + separator_len;
1398     }
1399 
1400     /* Finished looping through all the categories
1401      *
1402      * Check if the input was incomplete. */
1403 
1404 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1405 
1406     if (! name_value) {     /* Positional notation */
1407         if (component_number != LC_ALL_INDEX_) {
1408             error = incomplete;
1409             goto failure;
1410         }
1411     }
1412     else
1413 
1414 #  endif
1415 
1416     {   /* Here is the name=value notation */
1417         for_all_individual_category_indexes(i) {
1418             if (! seen[i]) {
1419                 error = incomplete;
1420                 goto failure;
1421             }
1422         }
1423     }
1424 
1425     /* In the loop above, we changed 'always_use_full_array' to true iff not all
1426      * categories have the same locale.  Hence, if it is still 'false', all of
1427      * them are the same. */
1428     if (always_use_full_array) {
1429         return full_array;
1430     }
1431 
1432     /* Free the dangling ones */
1433     for_all_but_0th_individual_category_indexes(i) {
1434         Safefree(output[i]);
1435         output[i] = NULL;
1436     }
1437 
1438     return only_element_0;
1439 
1440   failure:
1441 
1442     /* Don't leave memory dangling that we allocated before the failure */
1443     for_all_individual_category_indexes(i) {
1444         if (seen[i]) {
1445             Safefree(output[i]);
1446             output[i] = NULL;
1447         }
1448     }
1449 
1450     const char * msg;
1451     const char * display_start = s;
1452     const char * display_end = e;
1453 
1454     switch (error) {
1455         case incomplete:
1456             msg = "doesn't list every locale category";
1457             display_start = string;
1458             break;
1459         case no_equals:
1460             msg = "needs an '=' to split name=value";
1461             break;
1462         case unknown_category:
1463             msg = "is an unknown category";
1464             display_end = (category_end && category_end > display_start)
1465                           ? category_end
1466                           : e;
1467             break;
1468         case contains_LC_ALL_element:
1469             msg = "has LC_ALL, which is illegal here";
1470             break;
1471     }
1472 
1473     msg = Perl_form(aTHX_ "'%.*s' %s\n",
1474                           (int) (display_end - display_start),
1475                           display_start, msg);
1476 
1477     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s", msg));
1478 
1479     if (panic_on_error) {
1480         locale_panic_via_(msg, __FILE__, caller_line);
1481     }
1482 
1483     return invalid;
1484 }
1485 
1486 #  undef OVERRIDE_AND_SAVEPV
1487 #endif
1488 
1489 /*==========================================================================
1490  * Here starts the code that gives a uniform interface to its callers, hiding
1491  * the differences between platforms.
1492  *
1493  * base_posix_setlocale_() presents a consistent POSIX-compliant interface to
1494  * setlocale().   Windows requres a customized base-level setlocale().  This
1495  * layer should only be used by the next level up: the plain posix_setlocale
1496  * layer.  Any necessary mutex locking needs to be done at a higher level.  The
1497  * return may be overwritten by the next call to this function */
1498 #ifdef WIN32
1499 #  define base_posix_setlocale_(cat, locale) win32_setlocale(cat, locale)
1500 #else
1501 #  define base_posix_setlocale_(cat, locale)                                \
1502                                     ((const char *) setlocale(cat, locale))
1503 #endif
1504 
1505 /*==========================================================================
1506  * Here is the main posix layer.  It is the same as the base one unless the
1507  * system is lacking LC_ALL, or there are categories that we ignore, but that
1508  * the system libc knows about */
1509 
1510 #if ! defined(USE_LOCALE)                                                   \
1511  ||  (defined(LC_ALL) && ! defined(HAS_IGNORED_LOCALE_CATEGORIES_))
1512 #  define posix_setlocale(cat, locale) base_posix_setlocale_(cat, locale)
1513 #else
1514 #  define posix_setlocale(cat, locale)                                      \
1515         S_posix_setlocale_with_complications(aTHX_ cat, locale, __LINE__)
1516 
1517 STATIC const char *
S_posix_setlocale_with_complications(pTHX_ const int cat,const char * new_locale,const line_t caller_line)1518 S_posix_setlocale_with_complications(pTHX_ const int cat,
1519                                            const char * new_locale,
1520                                            const line_t caller_line)
1521 {
1522     /* This implements the posix layer above the base posix layer.
1523      * It is needed to reconcile our internal records that reflect only a
1524      * proper subset of the categories known by the system. */
1525 
1526     /* Querying the current locale returns the real value */
1527     if (new_locale == NULL) {
1528         new_locale = base_posix_setlocale_(cat, NULL);
1529         assert(new_locale);
1530         return new_locale;
1531     }
1532 
1533     const char * locale_on_entry = NULL;
1534 
1535     /* If setting from the environment, actually do the set to get the system's
1536      * idea of what that means; we may have to override later. */
1537     if (strEQ(new_locale, "")) {
1538         locale_on_entry = base_posix_setlocale_(cat, NULL);
1539         assert(locale_on_entry);
1540         new_locale = base_posix_setlocale_(cat, "");
1541         if (! new_locale) {
1542             SET_EINVAL;
1543             return NULL;
1544         }
1545     }
1546 
1547 #  ifdef LC_ALL
1548 
1549     const char * new_locales[LC_ALL_INDEX_] = { NULL };
1550 
1551     if (cat == LC_ALL) {
1552         switch (parse_LC_ALL_string(new_locale,
1553                                     (const char **) &new_locales,
1554                                     override_if_ignored,   /* Override any
1555                                                               ignored
1556                                                               categories */
1557                                     false,    /* Return only [0] if suffices */
1558                                     false,    /* Don't panic on error */
1559                                     caller_line))
1560         {
1561           case invalid:
1562             SET_EINVAL;
1563             return NULL;
1564 
1565           case no_array:
1566             break;
1567 
1568           case only_element_0:
1569             new_locale = new_locales[0];
1570             SAVEFREEPV(new_locale);
1571             break;
1572 
1573           case full_array:
1574 
1575             /* Turn the array into a string that the libc setlocale() should
1576              * understand.   (Another option would be to loop, setting the
1577              * individual locales, and then return base(cat, NULL) */
1578             new_locale = calculate_LC_ALL_string(new_locales,
1579                                                  EXTERNAL_FORMAT_FOR_SET,
1580                                                  WANT_TEMP_PV,
1581                                                  caller_line);
1582 
1583             for_all_individual_category_indexes(i) {
1584                 Safefree(new_locales[i]);
1585             }
1586 
1587             /* And call the libc setlocale.  We could avoid this call if
1588              * locale_on_entry is set and eq the new_locale.  But that would be
1589              * only for the relatively rare case of the desired locale being
1590              * "", and the time spent in doing the string compare might be more
1591              * than that of just setting it unconditionally */
1592             new_locale = base_posix_setlocale_(cat, new_locale);
1593             if (! new_locale) {
1594                  goto failure;
1595             }
1596 
1597             return new_locale;
1598         }
1599     }
1600 
1601 #  endif
1602 
1603     /* Here, 'new_locale' is a single value, not an aggregation.  Just set it.
1604      * */
1605     new_locale =
1606         base_posix_setlocale_(cat,
1607                               override_ignored_category(
1608                                           get_category_index(cat), new_locale));
1609     if (! new_locale) {
1610         goto failure;
1611     }
1612 
1613     return new_locale;
1614 
1615  failure:
1616 
1617     /* 'locale_on_entry' being set indicates there has likely been a change in
1618      * locale which needs to be restored */
1619     if (locale_on_entry) {
1620         if (! base_posix_setlocale_(cat, locale_on_entry)) {
1621             setlocale_failure_panic_i(get_category_index(cat),
1622                                       NULL, locale_on_entry,
1623                                       __LINE__, caller_line);
1624         }
1625     }
1626 
1627     SET_EINVAL;
1628     return NULL;
1629 }
1630 
1631 #endif
1632 
1633 /* End of posix layer
1634  *==========================================================================
1635  *
1636  * The next layer up is to catch vagaries and bugs in the libc setlocale return
1637  * value.  The return is not guaranteed to be stable.
1638  *
1639  * Any necessary mutex locking needs to be done at a higher level.
1640  *
1641  * On most platforms this layer is empty, expanding to just the layer
1642  * below.   To enable it, call Configure with either or both:
1643  * -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
1644  *                  to indicate that extraneous \n characters can be returned
1645  *                  by setlocale()
1646  * -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
1647  *                  to indicate that setlocale(LC_ALL, NULL) cannot be relied
1648  *                  on
1649  */
1650 
1651 #define STDIZED_SETLOCALE_LOCK    POSIX_SETLOCALE_LOCK
1652 #define STDIZED_SETLOCALE_UNLOCK  POSIX_SETLOCALE_UNLOCK
1653 #if ! defined(USE_LOCALE)                                                   \
1654  || ! (   defined(HAS_LF_IN_SETLOCALE_RETURN)                               \
1655        || defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL))
1656 #  define stdized_setlocale(cat, locale)  posix_setlocale(cat, locale)
1657 #  define stdize_locale(cat, locale)  (locale)
1658 #else
1659 #  define stdized_setlocale(cat, locale)                                    \
1660         S_stdize_locale(aTHX_ cat, posix_setlocale(cat, locale), __LINE__)
1661 
1662 STATIC const char *
S_stdize_locale(pTHX_ const int category,const char * input_locale,const line_t caller_line)1663 S_stdize_locale(pTHX_ const int category,
1664                       const char *input_locale,
1665                       const line_t caller_line)
1666 {
1667     /* The return value of setlocale() is opaque, but is required to be usable
1668      * as input to a future setlocale() to create the same state.
1669      * Unfortunately not all systems are compliant.  This function brings those
1670      * outliers into conformance.  It is based on what problems have arisen in
1671      * the field.
1672      *
1673      * This has similar constraints as the posix layer.  You need to lock
1674      * around it until its return is safely copied or no longer needed. (The
1675      * return may point to a global static buffer or may be mortalized.)
1676      *
1677      * The current things this corrects are:
1678      * 1) A new-line.  This function chops any \n characters
1679      * 2) A broken 'setlocale(LC_ALL, foo)'  This constructs a proper returned
1680      *                 string from the constituent categories
1681      *
1682      * If no changes were made, the input is returned as-is */
1683 
1684     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1685                           "Entering stdize_locale(%d, '%s');"
1686                           " called from %" LINE_Tf "\n",
1687                           category, input_locale, caller_line));
1688 
1689     if (input_locale == NULL) {
1690         SET_EINVAL;
1691         return NULL;
1692     }
1693 
1694     char * retval = (char *) input_locale;
1695 
1696 #  if defined(LC_ALL) && defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL)
1697 
1698         /* If setlocale(LC_ALL, NULL) is broken, compute what the system
1699          * actually thinks it should be from its individual components */
1700     if (category == LC_ALL) {
1701         retval = (char *) calculate_LC_ALL_string(
1702                                           NULL,  /* query each individ locale */
1703                                           EXTERNAL_FORMAT_FOR_SET,
1704                                           WANT_TEMP_PV,
1705                                           caller_line);
1706     }
1707 
1708 #  endif
1709 #  ifdef HAS_NL_IN_SETLOCALE_RETURN
1710 
1711     char * first_bad = NULL;
1712 
1713 #    ifndef LC_ALL
1714 
1715     PERL_UNUSED_ARG(category);
1716     PERL_UNUSED_ARG(caller_line);
1717 
1718 #      define INPUT_LOCALE  retval
1719 #      define MARK_CHANGED
1720 #    else
1721 
1722     char * individ_locales[LC_ALL_INDEX_] = { NULL };
1723     bool made_changes = false;
1724     Size_t upper;
1725     if (category != LC_ALL) {
1726         individ_locales[0] = retval;
1727         upper = 0;
1728     }
1729     else {
1730 
1731         /* And parse the locale string, splitting into its individual
1732          * components. */
1733         switch (parse_LC_ALL_string(retval,
1734                                     (const char **) &individ_locales,
1735                                     check_that_overridden, /* ignored
1736                                                               categories should
1737                                                               already have been
1738                                                               overridden */
1739                                     false,    /* Return only [0] if suffices */
1740                                     false,    /* Don't panic on error */
1741                                     caller_line))
1742         {
1743           case invalid:
1744             SET_EINVAL;
1745             return NULL;
1746 
1747           case full_array: /* Loop below through all the component categories.
1748                             */
1749             upper = LC_ALL_INDEX_ - 1;
1750             break;
1751 
1752           case no_array:
1753             /* All categories here are set to the same locale, and the parse
1754              * didn't fill in any of 'individ_locales'.  Set the 0th element to
1755              * that locale. */
1756             individ_locales[0] = retval;
1757             /* FALLTHROUGH */
1758 
1759           case only_element_0: /* Element 0 is the only element we need to look
1760                                   at */
1761             upper = 0;
1762             break;
1763         }
1764     }
1765 
1766     for (unsigned int i = 0; i <= upper; i++)
1767 
1768 #      define INPUT_LOCALE  individ_locales[i]
1769 #      define MARK_CHANGED  made_changes = true;
1770 #    endif    /* Has LC_ALL */
1771 
1772     {
1773         first_bad = (char *) strchr(INPUT_LOCALE, '\n');
1774 
1775         /* Most likely, there isn't a problem with the input */
1776         if (UNLIKELY(first_bad)) {
1777 
1778             /* This element will need to be adjusted.  Create a modifiable
1779              * copy. */
1780             MARK_CHANGED
1781             retval = savepv(INPUT_LOCALE);
1782             SAVEFREEPV(retval);
1783 
1784             /* Translate the found position into terms of the copy */
1785             first_bad = retval + (first_bad - INPUT_LOCALE);
1786 
1787             /* Get rid of the \n and what follows.  (Originally, only a
1788              * trailing \n was stripped.  Unsure what to do if not trailing) */
1789             *((char *) first_bad) = '\0';
1790         }   /* End of needs adjusting */
1791     }   /* End of looking for problems */
1792 
1793 #    ifdef LC_ALL
1794 
1795     /* If we had multiple elements, extra work is required */
1796     if (upper != 0) {
1797 
1798         /* If no changes were made to the input, 'retval' already contains it
1799          * */
1800         if (made_changes) {
1801 
1802             /* But if did make changes, need to calculate the new value */
1803             retval = (char *) calculate_LC_ALL_string(
1804                                             (const char **) &individ_locales,
1805                                             EXTERNAL_FORMAT_FOR_SET,
1806                                             WANT_TEMP_PV,
1807                                             caller_line);
1808         }
1809 
1810         /* And free the no-longer needed memory */
1811         for (unsigned int i = 0; i <= upper; i++) {
1812             Safefree(individ_locales[i]);
1813         }
1814     }
1815 
1816 #    endif
1817 #    undef INPUT_LOCALE
1818 #    undef MARK_CHANGED
1819 #  endif    /* HAS_NL_IN_SETLOCALE_RETURN */
1820 
1821     return (const char *) retval;
1822 }
1823 
1824 #endif  /* USE_LOCALE */
1825 
1826 /* End of stdize_locale layer
1827  *
1828  * ==========================================================================
1829  *
1830  * The next many lines form several implementations of a layer above the
1831  * close-to-the-metal 'posix' and 'stdized' macros.  They are used to present a
1832  * uniform API to the rest of the code in this file in spite of the disparate
1833  * underlying implementations.  Which implementation gets compiled depends on
1834  * the platform capabilities (and some user choice) as determined by Configure.
1835  *
1836  * As more fully described in the introductory comments in this file, the
1837  * API of each implementation consists of three sets of macros.  Each set has
1838  * three variants with suffixes '_c', '_r', and '_i'.  In the list below '_X'
1839  * is to be replaced by any of these suffixes.
1840  *
1841  * 1) bool_setlocale_X  attempts to set the given category's locale to the
1842  *                      given value, returning if it worked or not.
1843  * 2) void_setlocale_X  is like the corresponding bool_setlocale, but used when
1844  *                      success is the only sane outcome, so failure causes it
1845  *                      to panic.
1846  * 3) querylocale_X     to see what the given category's locale is
1847  *
1848  * 4) setlocale_i()     is defined only in those implementations where the bool
1849  *                      and query forms are essentially the same, and can be
1850  *                      combined to save CPU time.
1851  *
1852  * Each implementation is fundamentally defined by just two macros: a
1853  * bool_setlocale_X() and a querylocale_X().  The other macros are all
1854  * derivable from these.  Each fundamental macro is either a '_i' suffix one or
1855  * an '_r' suffix one, depending on what is the most efficient in getting to an
1856  * input form that the underlying libc functions want.  The derived macro
1857  * definitions are deferred in this file to after the code for all the
1858  * implementations.  This makes each implementation shorter and clearer, and
1859  * removes duplication.
1860  *
1861  * Each implementation below is separated by ==== lines, and includes bool,
1862  * void, and query macros.  The query macros are first, followed by any
1863  * functions needed to implement them.  Then come the bool, again followed by
1864  * any implementing functions  Then are the void macros; next is setlocale_i if
1865  * present on this implementation.  Finally are any helper functions.  The sets
1866  * in each implementation are separated by ---- lines.
1867  *
1868  * The returned strings from all the querylocale...() forms in all
1869  * implementations are thread-safe, and the caller should not free them,
1870  * but each may be a mortalized copy.  If you need something stable across
1871  * calls, you need to savepv() the result yourself.
1872  *
1873  *===========================================================================*/
1874 
1875 #if    (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE))    \
1876     || (  defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE))
1877 
1878 /* For non-threaded perls, the implementation just expands to the base-level
1879  * functions (except if we are Configured to nonetheless use the POSIX 2008
1880  * interface) This implementation is also used on threaded perls where
1881  * threading is invisible to us.  Currently this is only on later Windows
1882  * versions. */
1883 
1884 #  define querylocale_r(cat)  mortalized_pv_copy(stdized_setlocale(cat, NULL))
1885 #  define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale))
1886 
1887 /*---------------------------------------------------------------------------*/
1888 
1889 /* setlocale_i is only defined for Configurations where the libc setlocale()
1890  * doesn't need any tweaking.  It allows for some shortcuts */
1891 #  ifndef USE_LOCALE_THREADS
1892 #    define setlocale_i(i, locale)   stdized_setlocale(categories[i], locale)
1893 
1894 #  elif defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
1895 
1896 /* On Windows, we don't know at compile time if we are in thread-safe mode or
1897  * not.  If we are, we can just return the result of the layer below us.  If we
1898  * are in unsafe mode, we need to first copy that result to a safe place while
1899  * in a critical section */
1900 
1901 #    define setlocale_i(i, locale)   S_setlocale_i(aTHX_ categories[i], locale)
1902 
1903 STATIC const char *
S_setlocale_i(pTHX_ const int category,const char * locale)1904 S_setlocale_i(pTHX_ const int category, const char * locale)
1905 {
1906     if (LIKELY(_configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE)) {
1907         return stdized_setlocale(category, locale);
1908     }
1909 
1910     gwLOCALE_LOCK;
1911     const char * retval = save_to_buffer(stdized_setlocale(category, locale),
1912                                          &PL_setlocale_buf,
1913                                          &PL_setlocale_bufsize);
1914     gwLOCALE_UNLOCK;
1915 
1916     return retval;
1917 }
1918 
1919 #  endif
1920 
1921 /*===========================================================================*/
1922 #elif   defined(USE_LOCALE_THREADS)                 \
1923    && ! defined(USE_THREAD_SAFE_LOCALE)
1924 
1925    /* Here, there are threads, and there is no support for thread-safe
1926     * operation.  This is a dangerous situation, which perl is documented as
1927     * not supporting, but it arises in practice.  We can do a modicum of
1928     * automatic mitigation by making sure there is a per-thread return from
1929     * setlocale(), and that a mutex protects it from races */
1930 
1931 #  define querylocale_r(cat)                                                \
1932                       mortalized_pv_copy(less_dicey_setlocale_r(cat, NULL))
1933 
1934 STATIC const char *
S_less_dicey_setlocale_r(pTHX_ const int category,const char * locale)1935 S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale)
1936 {
1937     const char * retval;
1938 
1939     PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R;
1940 
1941     STDIZED_SETLOCALE_LOCK;
1942 
1943     retval = save_to_buffer(stdized_setlocale(category, locale),
1944                             &PL_less_dicey_locale_buf,
1945                             &PL_less_dicey_locale_bufsize);
1946 
1947     STDIZED_SETLOCALE_UNLOCK;
1948 
1949     return retval;
1950 }
1951 
1952 /*---------------------------------------------------------------------------*/
1953 
1954 #  define bool_setlocale_r(cat, locale)                                     \
1955                                less_dicey_bool_setlocale_r(cat, locale)
1956 
1957 STATIC bool
S_less_dicey_bool_setlocale_r(pTHX_ const int cat,const char * locale)1958 S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale)
1959 {
1960     bool retval;
1961 
1962     PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R;
1963 
1964     /* Unlikely, but potentially possible that another thread could zap the
1965      * buffer from true to false or vice-versa, so need to lock here */
1966     POSIX_SETLOCALE_LOCK;
1967     retval = cBOOL(posix_setlocale(cat, locale));
1968     POSIX_SETLOCALE_UNLOCK;
1969 
1970     return retval;
1971 }
1972 
1973 /*---------------------------------------------------------------------------*/
1974 
1975 /* setlocale_i is only defined for Configurations where the libc setlocale()
1976  * suffices for both querying and setting the locale.  It allows for some
1977  * shortcuts */
1978 #  define setlocale_i(i, locale)  less_dicey_setlocale_r(categories[i], locale)
1979 
1980 /* The code in this file may change the locale briefly during certain
1981  * operations.  This should be a critical section when that could interfere
1982  * with other instances executing at the same time. */
1983 #  define TOGGLE_LOCK(i)    POSIX_SETLOCALE_LOCK
1984 #  define TOGGLE_UNLOCK(i)  POSIX_SETLOCALE_UNLOCK
1985 
1986 /*===========================================================================*/
1987 
1988 #elif defined(USE_POSIX_2008_LOCALE)
1989 #  ifndef LC_ALL
1990 #    error This code assumes that LC_ALL is available on a system modern enough to have POSIX 2008
1991 #  endif
1992 
1993 /* Here, there is a completely different API to get thread-safe locales.  We
1994  * emulate the setlocale() API with our own function(s).  setlocale categories,
1995  * like LC_NUMERIC, are not valid here for the POSIX 2008 API.  Instead, there
1996  * are equivalents, like LC_NUMERIC_MASK, which we use instead, which we find
1997  * by table lookup. */
1998 
1999 #  if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
2000             /* https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */
2001 #    define HAS_GLIBC_LC_MESSAGES_BUG
2002 #    include <libintl.h>
2003 #  endif
2004 
2005 #  define querylocale_i(i)    querylocale_2008_i(i, __LINE__)
2006 
2007     /* We need to define this derivative macro here, as it is needed in
2008      * the implementing function (for recursive calls).  It also gets defined
2009      * where all the other derivative macros are defined, and the compiler
2010      * will complain if the definition gets out of sync */
2011 #  define querylocale_c(cat)      querylocale_i(cat##_INDEX_)
2012 
2013 STATIC const char *
S_querylocale_2008_i(pTHX_ const locale_category_index index,const line_t caller_line)2014 S_querylocale_2008_i(pTHX_ const locale_category_index index,
2015                            const line_t caller_line)
2016 {
2017     PERL_ARGS_ASSERT_QUERYLOCALE_2008_I;
2018 
2019     /* This function returns the name of the locale category given by the input
2020      * 'index' into our parallel tables of them.
2021      *
2022      * POSIX 2008, for some sick reason, chose not to provide a method to find
2023      * the category name of a locale, disregarding a basic linguistic tenet
2024      * that for any object, people will create a name for it.  (The next
2025      * version of the POSIX standard is proposed to fix this.)  Some vendors
2026      * have created a querylocale() function to do this in the meantime.  On
2027      * systems without querylocale(), we have to keep track of what the locale
2028      * has been set to, so that we can return its name so as to emulate
2029      * setlocale().  There are potential problems with this:
2030      *
2031      *  1)  We don't know what calling newlocale() with the locale argument ""
2032      *      actually does.  It gets its values from the program's environment.
2033      *      find_locale_from_environment() is used to work around this.  But it
2034      *      isn't fool-proof.  See the comments for that function for details.
2035      *  2)  It's possible for C code in some library to change the locale
2036      *      without us knowing it, and thus our records become wrong;
2037      *      querylocale() would catch this.  But as of September 2017, there
2038      *      are no occurrences in CPAN of uselocale().  Some libraries do use
2039      *      setlocale(), but that changes the global locale, and threads using
2040      *      per-thread locales will just ignore those changes.
2041      *  3)  Many systems have multiple names for the same locale.  Generally,
2042      *      there is an underlying base name, with aliases that evaluate to it.
2043      *      On some systems, if you set the locale to an alias, and then
2044      *      retrieve the name, you get the alias as expected; but on others you
2045      *      get the base name, not the alias you used.  And sometimes the
2046      *      charade is incomplete.  See
2047      *      https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375.
2048      *
2049      *      The code is structured so that the returned locale name when the
2050      *      locale is changed is whatever the result of querylocale() on the
2051      *      new locale is.  This effectively gives the result the system
2052      *      expects.  Without querylocale, the name returned is always the
2053      *      input name.  Theoretically this could cause problems, but khw knows
2054      *      of none so far, but mentions it here in case you are trying to
2055      *      debug something.  (This could be worked around by messing with the
2056      *      global locale temporarily, using setlocale() to get the base name;
2057      *      but that could cause a race.  The comments for
2058      *      find_locale_from_environment() give details on the potential race.)
2059      */
2060 
2061     const locale_t cur_obj = uselocale((locale_t) 0);
2062     const char * retval;
2063 
2064     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "querylocale_2008_i(%s) on %p;"
2065                                            " called from %" LINE_Tf "\n",
2066                                            category_names[index], cur_obj,
2067                                            caller_line));
2068 
2069     if (UNLIKELY(cur_obj == LC_GLOBAL_LOCALE)) {
2070 
2071         /* Even on platforms that have querylocale(), it is unclear if they
2072          * work in the global locale, and we have the means to get the correct
2073          * answer anyway.  khw is unsure this situation even comes up these
2074          * days, hence the branch prediction */
2075         POSIX_SETLOCALE_LOCK;
2076         retval = mortalized_pv_copy(posix_setlocale(categories[index], NULL));
2077         POSIX_SETLOCALE_UNLOCK;
2078     }
2079 
2080     /* Here we have handled the case of the current locale being the global
2081      * one.  Below is the 'else' case of that.  There are two different
2082      * implementations, depending on USE_PL_CURLOCALES */
2083 
2084 #  ifdef USE_PL_CURLOCALES
2085 
2086     else {
2087 
2088         /* PL_curlocales[] is kept up-to-date for all categories except LC_ALL,
2089          * which may have been invalidated by setting it to NULL, and if so,
2090          * should now be calculated.  (The called function updates that
2091          * element.) */
2092         if (index == LC_ALL_INDEX_ && PL_curlocales[LC_ALL_INDEX_] == NULL) {
2093             calculate_LC_ALL_string((const char **) &PL_curlocales,
2094                                     INTERNAL_FORMAT,
2095                                     WANT_VOID,
2096                                     caller_line);
2097         }
2098 
2099         if (cur_obj == PL_C_locale_obj) {
2100 
2101             /* If the current locale object is the C object, then the answer is
2102              * "C" or POSIX, regardless of the category.  Handling this
2103              * reasonably likely case specially shortcuts extra effort, and
2104              * hides some bugs from us in OS's that alias other locales to C,
2105              * but do so incompletely.  If our records say it is POSIX, use
2106              * that; otherwise use C.  See
2107              * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375 */
2108             retval = (strEQ(PL_curlocales[index], "POSIX"))
2109                      ? "POSIX"
2110                      : "C";
2111         }
2112         else {
2113             retval = mortalized_pv_copy(PL_curlocales[index]);
2114         }
2115     }
2116 
2117 #  else
2118 
2119     /* Below is the implementation of the 'else' clause which handles the case
2120      * of the current locale not being the global one on platforms where
2121      * USE_PL_CURLOCALES is NOT in effect.  That means the system must have
2122      * some form of querylocale.  But these have varying characteristics, so
2123      * first create some #defines to make the actual 'else' clause uniform.
2124      *
2125      * First, glibc has a function that implements querylocale(), but is called
2126      * something else, and takes the category number; the others take the mask.
2127      * */
2128 #    if defined(USE_QUERYLOCALE) && (   defined(_NL_LOCALE_NAME)            \
2129                                      && defined(HAS_NL_LANGINFO_L))
2130 #      define my_querylocale(index, cur_obj)                                \
2131                 nl_langinfo_l(_NL_LOCALE_NAME(categories[index]), cur_obj)
2132 
2133        /* Experience so far shows it is thread-safe, as well as glibc's
2134         * nl_langinfo_l(), so unless overridden, mark it so */
2135 #      ifdef NO_THREAD_SAFE_QUERYLOCALE
2136 #        undef HAS_THREAD_SAFE_QUERYLOCALE
2137 #      else
2138 #        define HAS_THREAD_SAFE_QUERYLOCALE
2139 #      endif
2140 #    else   /* below, ! glibc */
2141 
2142        /* Otherwise, use the system's querylocale(). */
2143 #      define my_querylocale(index, cur_obj)                                \
2144                                querylocale(category_masks[index], cur_obj)
2145 
2146        /* There is no standard for this function, and khw has never seen
2147         * anything beyond minimal vendor documentation, lacking important
2148         * details.  Experience has shown that some implementations have race
2149         * condiions, and their returns may not be thread safe.  It would be
2150         * unreliable to test for complete thread safety in Configure.  What we
2151         * do instead is to assume that it is thread-safe, unless overriden by,
2152         * say, a hints file specifying
2153         * -Accflags='-DNO_THREAD_SAFE_QUERYLOCALE */
2154 #      ifdef NO_THREAD_SAFE_QUERYLOCALE
2155 #        undef HAS_THREAD_SAFE_QUERYLOCALE
2156 #      else
2157 #        define HAS_THREAD_SAFE_QUERYLOCALE
2158 #      endif
2159 #    endif
2160 
2161      /* Here, we have set up enough information to know if this querylocale()
2162       * is thread-safe, or needs to use a mutex */
2163 #    ifdef HAS_THREAD_SAFE_QUERYLOCALE
2164 #      define QUERYLOCALE_LOCK
2165 #      define QUERYLOCALE_UNLOCK
2166 #    else
2167 #      define QUERYLOCALE_LOCK    gwLOCALE_LOCK
2168 #      define QUERYLOCALE_UNLOCK  gwLOCALE_UNLOCK
2169 #    endif
2170 
2171     /* Finally, everything is ready, so here is the 'else' clause to implement
2172      * the case of the current locale not being the global one on systems that
2173      * have some form of querylocale().  (POSIX will presumably eventually
2174      * publish their next version in their pipeline, which will define a
2175      * precisely specified querylocale equivalent, and there can be a new
2176      * #ifdef to use it without having to guess at its characteristics) */
2177 
2178     else {
2179         /* We don't keep records when there is querylocale(), so as to avoid the
2180          * pitfalls mentioned at the beginning of this function.
2181          *
2182          * That means LC_ALL has to be calculated from all its constituent
2183          * categories each time, since the querylocale() forms on many (if not
2184          * all) platforms only work on individual categories */
2185         if (index == LC_ALL_INDEX_) {
2186             retval = calculate_LC_ALL_string(NULL, INTERNAL_FORMAT,
2187                                              WANT_TEMP_PV,
2188                                              caller_line);
2189         }
2190         else {
2191 
2192             QUERYLOCALE_LOCK;
2193             retval = my_querylocale(index, cur_obj);
2194 
2195             /* querylocale() may conflate the C locale with something that
2196              * isn't exactly the same.  See for example
2197              * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375
2198              * We know that if the locale object is the C one, we
2199              * are in the C locale, which may go by the name POSIX, as both, by
2200              * definition, are equivalent.  But we consider any other name
2201              * spurious, so override with "C".  As in the PL_CURLOCALES case
2202              * above, this hides those glitches, for the most part, from the
2203              * rest of our code.  (The code is ordered this way so that if the
2204              * system distinugishes "C" from "POSIX", we do too.) */
2205             if (cur_obj == PL_C_locale_obj && ! isNAME_C_OR_POSIX(retval)) {
2206                 QUERYLOCALE_UNLOCK;
2207                 retval = "C";
2208             }
2209             else {
2210                 retval = savepv(retval);
2211                 QUERYLOCALE_UNLOCK;
2212                 SAVEFREEPV(retval);
2213             }
2214         }
2215     }
2216 
2217 #    undef QUERYLOCALE_LOCK
2218 #    undef QUERYLOCALE_UNLOCK
2219 #  endif
2220 
2221     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2222                            "querylocale_2008_i(%s) returning '%s'\n",
2223                            category_names[index], retval));
2224     assert(strNE(retval, ""));
2225     return retval;
2226 }
2227 
2228 /*---------------------------------------------------------------------------*/
2229 
2230 #  define bool_setlocale_i(i, locale)                                       \
2231                               bool_setlocale_2008_i(i, locale, __LINE__)
2232 
2233 /* If this doesn't exist on this platform, make it a no-op (to save #ifdefs) */
2234 #  ifndef update_PL_curlocales_i
2235 #    define update_PL_curlocales_i(index, new_locale, caller_line)
2236 #  endif
2237 
2238 STATIC bool
S_bool_setlocale_2008_i(pTHX_ const locale_category_index index,const char * new_locale,const line_t caller_line)2239 S_bool_setlocale_2008_i(pTHX_
2240 
2241         /* Our internal index of the 'category' setlocale is called with */
2242         const locale_category_index  index,
2243         const char * new_locale,    /* The locale to set the category to */
2244         const line_t caller_line    /* Called from this line number */
2245        )
2246 {
2247     PERL_ARGS_ASSERT_BOOL_SETLOCALE_2008_I;
2248 
2249     /* This function effectively performs a setlocale() on just the current
2250      * thread; thus it is thread-safe.  It does this by using the POSIX 2008
2251      * locale functions to emulate the behavior of setlocale().  Similar to
2252      * regular setlocale(), the return from this function points to memory that
2253      * can be overwritten by other system calls, so needs to be copied
2254      * immediately if you need to retain it.  The difference here is that
2255      * system calls besides another setlocale() can overwrite it.
2256      *
2257      * By doing this, most locale-sensitive functions become thread-safe.  The
2258      * exceptions are mostly those that return a pointer to static memory.
2259      */
2260 
2261     int mask = category_masks[index];
2262     const locale_t entry_obj = uselocale((locale_t) 0);
2263     const char * locale_on_entry = querylocale_i(index);
2264 
2265     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2266                            "bool_setlocale_2008_i: input=%d (%s), mask=0x%x,"
2267                            " new locale=\"%s\", current locale=\"%s\","
2268                            " index=%d, entry object=%p;"
2269                            " called from %" LINE_Tf "\n",
2270                            categories[index], category_names[index], mask,
2271                            ((new_locale == NULL) ? "(nil)" : new_locale),
2272                            locale_on_entry, index, entry_obj, caller_line));
2273 
2274     /* Here, trying to change the locale, but it is a no-op if the new boss is
2275      * the same as the old boss.  Except this routine is called when converting
2276      * from the global locale, so in that case we will create a per-thread
2277      * locale below (with the current values).  It also seemed that newlocale()
2278      * could free up the basis locale memory if we called it with the new and
2279      * old being the same, but khw now thinks that this was due to some other
2280      * bug, since fixed, as there are other places where newlocale() gets
2281      * similarly called without problems. */
2282     if (   entry_obj != LC_GLOBAL_LOCALE
2283         && locale_on_entry
2284         && strEQ(new_locale, locale_on_entry))
2285     {
2286         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2287                                "bool_setlocale_2008_i: no-op to change to"
2288                                " what it already was\n"));
2289         return true;
2290     }
2291 
2292 #  ifndef USE_QUERYLOCALE
2293 
2294     /* Without a querylocale() mechanism, we have to figure out ourselves what
2295      * happens with setting a locale to "" */
2296 
2297     if (strEQ(new_locale, "")) {
2298         new_locale = find_locale_from_environment(index);
2299         if (! new_locale) {
2300             SET_EINVAL;
2301             return false;
2302         }
2303     }
2304 
2305 #  endif
2306 #  ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2307 
2308     const bool need_loop = false;
2309 
2310 #  else
2311 
2312     bool need_loop = false;
2313     const char * new_locales[LC_ALL_INDEX_] = { NULL };
2314 
2315     /* If we're going to have to parse the LC_ALL string, might as well do it
2316      * now before we have made changes that we would have to back out of if the
2317      * parse fails */
2318     if (index == LC_ALL_INDEX_) {
2319         switch (parse_LC_ALL_string(new_locale,
2320                                     (const char **) &new_locales,
2321                                     override_if_ignored,
2322                                     false,    /* Return only [0] if suffices */
2323                                     false,    /* Don't panic on error */
2324                                     caller_line))
2325         {
2326           case invalid:
2327             SET_EINVAL;
2328             return false;
2329 
2330           case no_array:
2331             need_loop = false;
2332             break;
2333 
2334           case only_element_0:
2335             SAVEFREEPV(new_locales[0]);
2336             new_locale = new_locales[0];
2337             need_loop = false;
2338             break;
2339 
2340           case full_array:
2341             need_loop = true;
2342             break;
2343         }
2344     }
2345 
2346 #  endif
2347 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
2348 
2349     /* For this bug, if the LC_MESSAGES locale changes, we have to do an
2350      * expensive workaround.  Save the current value so we can later determine
2351      * if it changed. */
2352     const char * old_messages_locale = NULL;
2353     if (   (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
2354         &&  LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
2355     {
2356         old_messages_locale = querylocale_c(LC_MESSAGES);
2357     }
2358 
2359 #  endif
2360 
2361     assert(PL_C_locale_obj);
2362 
2363     /* Now ready to switch to the input 'new_locale' */
2364 
2365     /* Switching locales generally entails freeing the current one's space (at
2366      * the C library's discretion), hence we can't be using that locale at the
2367      * time of the switch (this wasn't obvious to khw from the man pages).  So
2368      * switch to a known locale object that we don't otherwise mess with. */
2369     if (! uselocale(PL_C_locale_obj)) {
2370 
2371         /* Not being able to change to the C locale is severe; don't keep
2372          * going.  */
2373         setlocale_failure_panic_i(index, locale_on_entry, "C",
2374                                   __LINE__, caller_line);
2375         NOT_REACHED; /* NOTREACHED */
2376     }
2377 
2378     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2379                            "bool_setlocale_2008_i: now using C"
2380                            " object=%p\n", PL_C_locale_obj));
2381 
2382     /* These two objects are special:
2383      *  LC_GLOBAL_LOCALE    because it is undefined behavior to call
2384      *                      newlocale() with it as a parameter.
2385      *  PL_C_locale_obj     because newlocale() generally destroys its locale
2386      *                      object parameter when it succeeds; and we don't
2387      *                      want that happening to this immutable object.
2388      * Copies will be made for them to use instead if we get so far as to call
2389      * newlocale(). */
2390     bool entry_obj_is_special = (   entry_obj == LC_GLOBAL_LOCALE
2391                                  || entry_obj == PL_C_locale_obj);
2392     locale_t new_obj;
2393 
2394     /* PL_C_locale_obj is LC_ALL set to the C locale.  If this call is to
2395      * switch to LC_ALL => C, simply use that object.  But in fact, we already
2396      * have switched to it just above, in preparation for the general case.
2397      * Since we're already there, no need to do further switching. */
2398     if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
2399         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2400                                "bool_setlocale_2008_i: will stay in C"
2401                                " object\n"));
2402         new_obj = PL_C_locale_obj;
2403 
2404         /* 'entry_obj' is now dangling, of no further use to anyone (unless it
2405          * is one of the special ones).  Free it to avoid a leak */
2406         if (! entry_obj_is_special) {
2407             freelocale(entry_obj);
2408         }
2409 
2410         update_PL_curlocales_i(index, new_locale, caller_line);
2411     }
2412     else {  /* Here is the general case, not to LC_ALL => C */
2413 
2414         /* The newlocale() call(s) below take a basis object to build upon to
2415          * create the changed locale, trashing it iff successful.
2416          *
2417          * For the objects that are not to be modified by this function, we
2418          * create a duplicate that gets trashed instead.
2419          *
2420          * Also if we will have to loop doing multiple newlocale()s, there is a
2421          * chance we will succeed for the first few, and then fail, having to
2422          * back out.  We need to duplicate 'entry_obj' in this case as well, so
2423          * it remains valid as something to back out to. */
2424         locale_t basis_obj = entry_obj;
2425 
2426         if (entry_obj_is_special || need_loop) {
2427             basis_obj = duplocale(basis_obj);
2428             if (! basis_obj) {
2429                 locale_panic_via_("duplocale failed", __FILE__, caller_line);
2430                 NOT_REACHED; /* NOTREACHED */
2431             }
2432 
2433             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2434                                    "bool_setlocale_2008_i created %p by"
2435                                    " duping the input\n", basis_obj));
2436         }
2437 
2438 #  define DEBUG_NEW_OBJECT_CREATED(category, locale, new, old, caller_line) \
2439       DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                \
2440                              "bool_setlocale_2008_i(%s, %s): created %p"    \
2441                              " while freeing %p; called from %" LINE_Tf     \
2442                              " via %" LINE_Tf "\n",                         \
2443                              category, locale, new, old,                    \
2444                              caller_line, (line_t)__LINE__))
2445 #  define DEBUG_NEW_OBJECT_FAILED(category, locale, basis_obj)              \
2446       DEBUG_L(PerlIO_printf(Perl_debug_log,                                 \
2447                             "bool_setlocale_2008_i: creating new object"    \
2448                             " for (%s '%s') from %p failed; called from %"  \
2449                             LINE_Tf " via %" LINE_Tf "\n",                  \
2450                             category, locale, basis_obj,                    \
2451                             caller_line, (line_t)__LINE__));
2452 
2453         /* Ready to create a new locale by modification of the existing one.
2454          *
2455          * NOTE: This code may incorrectly show up as a leak under the address
2456          * sanitizer. We do not free this object under normal teardown, however
2457          * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
2458          */
2459 
2460 #  ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2461 
2462         /* Some platforms have a newlocale() that can handle disparate LC_ALL
2463          * input, so on these a single call to newlocale() always works */
2464 #  else
2465 
2466         /* If a single call to newlocale() will do */
2467         if (! need_loop)
2468 
2469 #  endif
2470 
2471         {
2472             new_obj = newlocale(mask,
2473                                 override_ignored_category(index, new_locale),
2474                                 basis_obj);
2475             if (! new_obj) {
2476                 DEBUG_NEW_OBJECT_FAILED(category_names[index], new_locale,
2477                                         basis_obj);
2478 
2479                 /* Since the call failed, it didn't trash 'basis_obj', which is
2480                  * a dup for these objects, and hence would leak if we don't
2481                  * free it.  XXX However, something is seriously wrong if we
2482                  * can't switch to C or the global locale, so maybe should
2483                  * panic instead */
2484                 if (entry_obj_is_special) {
2485                     freelocale(basis_obj);
2486                 }
2487 
2488                 goto must_restore_state;
2489             }
2490 
2491             DEBUG_NEW_OBJECT_CREATED(category_names[index], new_locale,
2492                                      new_obj, basis_obj, caller_line);
2493 
2494             update_PL_curlocales_i(index, new_locale, caller_line);
2495         }
2496 
2497 #  ifndef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2498 
2499         else {  /* Need multiple newlocale() calls */
2500 
2501             /* Loop through the individual categories, setting the locale of
2502              * each to the corresponding name previously populated into
2503              * newlocales[].  Each iteration builds on the previous one, adding
2504              * its category to what's already been calculated, and taking as a
2505              * basis for what's been calculated 'basis_obj', which is updated
2506              * each iteration to be the result of the previous one.  Upon
2507              * success, newlocale() trashes the 'basis_obj' parameter to it.
2508              * If any iteration fails, we immediately give up, restore the
2509              * locale to what it was at the time this function was called
2510              * (saved in 'entry_obj'), and return failure. */
2511 
2512             /* Loop, using the previous iteration's result as the basis for the
2513              * next one.  (The first time we effectively use the locale in
2514              * force upon entry to this function.) */
2515             for_all_individual_category_indexes(i) {
2516                 new_obj = newlocale(category_masks[i],
2517                                     new_locales[i],
2518                                     basis_obj);
2519                 if (new_obj) {
2520                     DEBUG_NEW_OBJECT_CREATED(category_names[i],
2521                                              new_locales[i],
2522                                              new_obj, basis_obj,
2523                                              caller_line);
2524                     basis_obj = new_obj;
2525                     continue;
2526                 }
2527 
2528                 /* Failed.  Likely this is because the proposed new locale
2529                  * isn't valid on this system. */
2530 
2531                 DEBUG_NEW_OBJECT_FAILED(category_names[i],
2532                                         new_locales[i],
2533                                         basis_obj);
2534 
2535                 /* newlocale() didn't trash this, since the function call
2536                  * failed */
2537                 freelocale(basis_obj);
2538 
2539                 for_all_individual_category_indexes(j) {
2540                     Safefree(new_locales[j]);
2541                 }
2542 
2543                 goto must_restore_state;
2544             }
2545 
2546             /* Success for all categories. */
2547             for_all_individual_category_indexes(i) {
2548                 update_PL_curlocales_i(i, new_locales[i], caller_line);
2549                 Safefree(new_locales[i]);
2550             }
2551 
2552             /* We dup'd entry_obj in case we had to fall back to it.  The
2553              * newlocale() above destroyed the dup when it first succeeded, but
2554              * entry_obj itself is left dangling, so free it */
2555             if (! entry_obj_is_special) {
2556                 freelocale(entry_obj);
2557             }
2558         }
2559 
2560 #  endif    /* End of newlocale can't handle disparate LC_ALL input */
2561 
2562     }
2563 
2564 #  undef DEBUG_NEW_OBJECT_CREATED
2565 #  undef DEBUG_NEW_OBJECT_FAILED
2566 
2567     /* Here, successfully created an object representing the desired locale;
2568      * now switch into it */
2569     if (! uselocale(new_obj)) {
2570         freelocale(new_obj);
2571         locale_panic_(Perl_form(aTHX_ "(called from %" LINE_Tf "):"
2572                                       " bool_setlocale_2008_i: switching"
2573                                       " into new locale failed",
2574                                       caller_line));
2575     }
2576 
2577     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2578                            "bool_setlocale_2008_i: now using %p\n", new_obj));
2579 
2580 #  ifdef MULTIPLICITY   /* Unlikely, but POSIX 2008 functions could be
2581                            Configured to be used on unthreaded perls, in which
2582                            case this object doesn't exist */
2583 
2584     if (DEBUG_Lv_TEST) {
2585         if (PL_cur_locale_obj != new_obj) {
2586             PerlIO_printf(Perl_debug_log,
2587                           "bool_setlocale_2008_i: PL_cur_locale_obj"
2588                           " was %p, now is %p\n",
2589                           PL_cur_locale_obj, new_obj);
2590         }
2591     }
2592 
2593     /* Update the current object */
2594     PL_cur_locale_obj = new_obj;
2595 
2596 #  endif
2597 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
2598 
2599     /* Invalidate the glibc cache of loaded translations if the locale has
2600      * changed, see [perl #134264] and
2601      * https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */
2602     if (old_messages_locale) {
2603         if (strNE(old_messages_locale, querylocale_c(LC_MESSAGES))) {
2604             textdomain(textdomain(NULL));
2605         }
2606     }
2607 
2608 #  endif
2609 
2610     return true;
2611 
2612   must_restore_state:
2613 
2614     /* We earlier switched to the LC_ALL => C locale in anticipation of it
2615      * succeeding,  Now have to switch back to the state upon entry.  */
2616     if (! uselocale(entry_obj)) {
2617         setlocale_failure_panic_i(index, "switching back to",
2618                                   locale_on_entry, __LINE__, caller_line);
2619     }
2620 
2621     return false;
2622 }
2623 
2624 /*===========================================================================*/
2625 
2626 #else
2627 #  error Unexpected Configuration
2628 #endif   /* End of the various implementations of the setlocale and
2629             querylocale macros used in the remainder of this program */
2630 
2631 /*===========================================================================*/
2632 
2633 /* Each implementation above is based on two fundamental macros #defined above:
2634  *  1) either a querylocale_r or a querylocale_i
2635  *  2) either a bool_setlocale_r or a bool_setlocale_i
2636  *
2637  * (Which one of each got #defined is based on which is most efficient in
2638  * interacting with the underlying libc functions called.)
2639  *
2640  * To complete the implementation, macros for the missing two suffixes must be
2641  * #defined, as well as all the void_setlocale_X() forms.  These all can be
2642  * mechanically derived from the fundamental ones. */
2643 
2644 #ifdef querylocale_r
2645 #  define querylocale_c(cat)    querylocale_r(cat)
2646 #  define querylocale_i(i)      querylocale_r(categories[i])
2647 #elif defined(querylocale_i)
2648 #  define querylocale_c(cat)    querylocale_i(cat##_INDEX_)
2649 #  define querylocale_r(cat)    querylocale_i(get_category_index(cat))
2650 #else
2651 #  error No querylocale() form defined
2652 #endif
2653 
2654 #ifdef bool_setlocale_r
2655 #  define bool_setlocale_i(i, l)    bool_setlocale_r(categories[i], l)
2656 #  define bool_setlocale_c(cat, l)  bool_setlocale_r(cat, l)
2657 
2658 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
2659      STMT_START {                                                           \
2660         if (! bool_setlocale_r(cat, locale))                                \
2661             setlocale_failure_panic_via_i(get_category_index(cat),          \
2662                                           NULL, locale, __LINE__, 0,        \
2663                                           file, line);                      \
2664      } STMT_END
2665 
2666 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
2667           void_setlocale_r_with_caller(cat, locale, file, line)
2668 
2669 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
2670           void_setlocale_r_with_caller(categories[i], locale, file, line)
2671 
2672 #  define void_setlocale_r(cat, locale)                                     \
2673           void_setlocale_r_with_caller(cat, locale, __FILE__, __LINE__)
2674 #  define void_setlocale_c(cat, locale)                                     \
2675           void_setlocale_r(cat, locale)
2676 #  define void_setlocale_i(i, locale)                                       \
2677           void_setlocale_r(categories[i], locale)
2678 
2679 #elif defined(bool_setlocale_i)
2680 #  define bool_setlocale_c(cat, loc) bool_setlocale_i(cat##_INDEX_, loc)
2681 #  define bool_setlocale_r(c, loc)   bool_setlocale_i(get_category_index(c), l)
2682 
2683 #  define void_setlocale_i_with_caller(i, locale, file, line)               \
2684      STMT_START {                                                           \
2685         if (! bool_setlocale_i(i, locale))                                  \
2686             setlocale_failure_panic_via_i(i, NULL, locale, __LINE__, 0,     \
2687                                           file, line);                      \
2688      } STMT_END
2689 
2690 #  define void_setlocale_r_with_caller(cat, locale, file, line)             \
2691           void_setlocale_i_with_caller(get_category_index(cat), locale,     \
2692                                        file, line)
2693 
2694 #  define void_setlocale_c_with_caller(cat, locale, file, line)             \
2695           void_setlocale_i_with_caller(cat##_INDEX_, locale, file, line)
2696 
2697 #  define void_setlocale_i(i, locale)                                       \
2698           void_setlocale_i_with_caller(i, locale, __FILE__, __LINE__)
2699 #  define void_setlocale_c(cat, locale)                                     \
2700           void_setlocale_i(cat##_INDEX_, locale)
2701 #  define void_setlocale_r(cat, locale)                                     \
2702           void_setlocale_i(get_category_index(cat), locale)
2703 
2704 #else
2705 #  error No bool_setlocale() form defined
2706 #endif
2707 
2708 /*===========================================================================*/
2709 
2710 /* Most of the cases in this file just toggle the locale briefly; but there are
2711  * a few instances where a longer toggled interval, over multiple operations,
2712  * is desirable, since toggling and untoggling have a cost.  But on platforms
2713  * where toggling must be done in a critical section, it is even more desirable
2714  * to minimize the length of time in an uninterruptable state.
2715  *
2716  * The macros below try to balance these competing interests.  When the
2717  * toggling is to be brief, simply use the plain "toggle_locale" macros.  But
2718  * in addition, in the places where an over-arching toggle would be nice, add
2719  * calls to the macros below that have the "_locking" suffix.  These are no-ops
2720  * except on systems where the toggling doesn't force a critical section.  But
2721  * otherwise these toggle to the over-arching locale.  When the individual
2722  * toggles are executed, they will check and find that the locale is already in
2723  * the right state, and return without doing anything. */
2724 #if TOGGLING_LOCKS
2725 #  define toggle_locale_c_unless_locking(cat,          locale)  NULL
2726 #  define toggle_locale_c_if_locking(    cat,          locale)              \
2727                          toggle_locale_i(cat##_INDEX_, locale)
2728 
2729 #  define restore_toggled_locale_c_unless_locking(cat,          locale)     \
2730                          PERL_UNUSED_ARG(locale)
2731 #  define restore_toggled_locale_c_if_locking(    cat,          locale)     \
2732                 restore_toggled_locale_i(         cat##_INDEX_, locale)
2733 #else
2734 #  define toggle_locale_c_unless_locking(cat,          locale)              \
2735                          toggle_locale_i(cat##_INDEX_, locale)
2736 #  define toggle_locale_c_if_locking(    cat,          locale)  NULL
2737 
2738 #  define restore_toggled_locale_c_unless_locking(cat,          locale)     \
2739                          restore_toggled_locale_i(cat##_INDEX_, locale)
2740 #  define restore_toggled_locale_c_if_locking(    cat,          locale)     \
2741                          PERL_UNUSED_ARG(locale)
2742 #endif
2743 
2744 /* query_nominal_locale_i() is used when the caller needs the locale that an
2745  * external caller would be expecting, and not what we're secretly using
2746  * behind the scenes.  It deliberately doesn't handle LC_ALL; use
2747  * calculate_LC_ALL_string() for that. */
2748 #ifdef USE_LOCALE_NUMERIC
2749 #  define query_nominal_locale_i(i)                                         \
2750       (__ASSERT_(i != LC_ALL_INDEX_)                                        \
2751        ((i == LC_NUMERIC_INDEX_) ? PL_numeric_name : querylocale_i(i)))
2752 #elif defined(USE_LOCALE)
2753 #  define query_nominal_locale_i(i)                                         \
2754       (__ASSERT_(i != LC_ALL_INDEX_) querylocale_i(i))
2755 #else
2756 #  define query_nominal_locale_i(i)  "C"
2757 #endif
2758 
2759 #ifdef USE_PL_CURLOCALES
2760 
2761 STATIC void
S_update_PL_curlocales_i(pTHX_ const locale_category_index index,const char * new_locale,const line_t caller_line)2762 S_update_PL_curlocales_i(pTHX_
2763                          const locale_category_index index,
2764                          const char * new_locale,
2765                          const line_t caller_line)
2766 {
2767     /* Update PL_curlocales[], which is parallel to the other ones indexed by
2768      * our mapping of libc category number to our internal equivalents. */
2769 
2770     PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
2771 
2772     if (index == LC_ALL_INDEX_) {
2773 
2774         /* For LC_ALL, we change all individual categories to correspond,
2775          * including the LC_ALL element */
2776         for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) {
2777             Safefree(PL_curlocales[i]);
2778             PL_curlocales[i] = NULL;
2779         }
2780 
2781         switch (parse_LC_ALL_string(new_locale,
2782                                     (const char **) &PL_curlocales,
2783                                     check_that_overridden,  /* things should
2784                                                                have already
2785                                                                been overridden
2786                                                                */
2787                                     true,   /* Always fill array */
2788                                     true,   /* Panic if fails, as to get here
2789                                                it earlier had to have succeeded
2790                                                */
2791                                    caller_line))
2792         {
2793           case invalid:
2794           case no_array:
2795           case only_element_0:
2796             locale_panic_via_("Unexpected return from parse_LC_ALL_string",
2797                               __FILE__, caller_line);
2798 
2799           case full_array:
2800             /* parse_LC_ALL_string() has already filled PL_curlocales properly,
2801              * except for the LC_ALL element, which should be set to
2802              * 'new_locale'. */
2803             PL_curlocales[LC_ALL_INDEX_] = savepv(new_locale);
2804         }
2805     }
2806     else {  /* Not LC_ALL */
2807 
2808         /* Update the single category's record */
2809         Safefree(PL_curlocales[index]);
2810         PL_curlocales[index] = savepv(new_locale);
2811 
2812         /* Invalidate LC_ALL */
2813         Safefree(PL_curlocales[LC_ALL_INDEX_]);
2814         PL_curlocales[LC_ALL_INDEX_] = NULL;
2815     }
2816 }
2817 
2818 #  endif  /* Need PL_curlocales[] */
2819 
2820 /*===========================================================================*/
2821 
2822 #if defined(USE_LOCALE)
2823 
2824 /* This paradigm is needed in several places in the function below.  We have to
2825  * substitute the nominal locale for LC_NUMERIC when returning a value for
2826  * external consumption */
2827 #  ifndef USE_LOCALE_NUMERIC
2828 #    define ENTRY(i, array, format)  array[i]
2829 #  else
2830 #    define ENTRY(i, array, format)                         \
2831        (UNLIKELY(   format == EXTERNAL_FORMAT_FOR_QUERY     \
2832                  && i == LC_NUMERIC_INDEX_)                 \
2833         ? PL_numeric_name                                   \
2834         : array[i])
2835 #  endif
2836 
2837 STATIC
2838 const char *
S_calculate_LC_ALL_string(pTHX_ const char ** category_locales_list,const calc_LC_ALL_format format,const calc_LC_ALL_return returning,const line_t caller_line)2839 S_calculate_LC_ALL_string(pTHX_ const char ** category_locales_list,
2840                                 const calc_LC_ALL_format format,
2841                                 const calc_LC_ALL_return returning,
2842                                 const line_t caller_line)
2843 {
2844     PERL_ARGS_ASSERT_CALCULATE_LC_ALL_STRING;
2845 
2846     /* NOTE: On Configurations that have PL_curlocales[], this function has the
2847      * side effect of updating the LC_ALL_INDEX_ element with its result.
2848      *
2849      * This function calculates a string that defines the locale(s) LC_ALL is
2850      * set to, in either:
2851      *  1)  Our internal format if 'format' is set to INTERNAL_FORMAT.
2852      *  2)  The external format returned by Perl_setlocale() if 'format' is set
2853      *      to EXTERNAL_FORMAT_FOR_QUERY or EXTERNAL_FORMAT_FOR_SET.
2854      *
2855      *      These two are distinguished by:
2856      *       a) EXTERNAL_FORMAT_FOR_SET returns the actual locale currently in
2857      *          effect.
2858      *       b) EXTERNAL_FORMAT_FOR_QUERY returns the nominal locale.
2859      *          Currently this can differ only from the actual locale in the
2860      *          LC_NUMERIC category when it is set to a locale whose radix is
2861      *          not a dot.  (The actual locale is kept as a dot to accommodate
2862      *          the large corpus of XS code that expects it to be that;
2863      *          switched to a non-dot temporarily during certain operations
2864      *          that require the actual radix.)
2865      *
2866      * In both 1) and 2), LC_ALL's values are passed to this function by
2867      * 'category_locales_list' which is either:
2868      *  1) a pointer to an array of strings with up-to-date values of all the
2869      *     individual categories; or
2870      *  2) NULL, to indicate to use querylocale_i() to get each individual
2871      *     value.
2872      *
2873      * The caller sets 'returning' to
2874      *      WANT_TEMP_PV        the function returns the calculated string
2875      *                              as a mortalized temporary, so the caller
2876      *                              doesn't have to worry about it being
2877      *                              per-thread, nor needs to arrange for its
2878      *                              clean-up.
2879      *      WANT_PL_setlocale_buf  the function stores the calculated string
2880      *                              into the per-thread buffer PL_setlocale_buf
2881      *                              and returns a pointer to that.  The buffer
2882      *                              is cleaned up automatically in process
2883      *                              destruction.  This return method avoids
2884      *                              extra copies in some circumstances.
2885      *      WANT_VOID           NULL is returned.  This is used when the
2886      *                              function is being called only for its side
2887      *                              effect of updating
2888      *                              PL_curlocales[LC_ALL_INDEX_]
2889      *
2890      * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
2891      * So we have to construct the answer ourselves based on the passed in
2892      * data.
2893      *
2894      * If all individual categories are the same locale, we can just set LC_ALL
2895      * to that locale.  But if not, we have to create an aggregation of all the
2896      * categories on the system.  Platforms differ as to the syntax they use
2897      * for these non-uniform locales for LC_ALL.  Some, like glibc and Windows,
2898      * use an unordered series of name=value pairs, like
2899      *      LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
2900      * to specify LC_ALL; others, like *BSD, use a positional notation with a
2901      * delimitter, typically a single '/' character:
2902      *      C/en_UK.UTF-8/...
2903      *
2904      * When the external format is desired, this function returns whatever the
2905      * system expects.  The internal format is always name=value pairs.
2906      *
2907      * For systems that have categories we don't know about, the algorithm
2908      * below won't know about those missing categories, leading to potential
2909      * bugs for code that looks at them.  If there is an environment variable
2910      * that sets that category, we won't know to look for it, and so our use of
2911      * LANG or "C" improperly overrides it.  On the other hand, if we don't do
2912      * what is done here, and there is no environment variable, the category's
2913      * locale should be set to LANG or "C".  So there is no good solution.  khw
2914      * thinks the best is to make sure we have a complete list of possible
2915      * categories, adding new ones as they show up on obscure platforms.
2916      */
2917 
2918     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2919                            "Entering calculate_LC_ALL_string(%s);"
2920                            " called from %" LINE_Tf "\n",
2921                            ((format == EXTERNAL_FORMAT_FOR_QUERY)
2922                             ? "EXTERNAL_FORMAT_FOR_QUERY"
2923                             : ((format == EXTERNAL_FORMAT_FOR_SET)
2924                                ? "EXTERNAL_FORMAT_FOR_SET"
2925                                : "INTERNAL_FORMAT")),
2926                            caller_line));
2927 
2928     bool input_list_was_NULL = (category_locales_list == NULL);
2929 
2930     /* If there was no input category list, construct a temporary one
2931      * ourselves. */
2932     const char * my_category_locales_list[LC_ALL_INDEX_];
2933     const char ** locales_list = category_locales_list;
2934     if (locales_list == NULL) {
2935         locales_list = my_category_locales_list;
2936 
2937         if (format == EXTERNAL_FORMAT_FOR_QUERY) {
2938             for_all_individual_category_indexes(i) {
2939                 locales_list[i] = query_nominal_locale_i(i);
2940             }
2941         }
2942         else {
2943             for_all_individual_category_indexes(i) {
2944                 locales_list[i] = querylocale_i(i);
2945             }
2946         }
2947     }
2948 
2949     /* While we are calculating LC_ALL, we see if every category's locale is
2950      * the same as every other's or not. */
2951 #  ifndef HAS_IGNORED_LOCALE_CATEGORIES_
2952 
2953     /* When we pay attention to all categories, we assume they are all the same
2954      * until proven different */
2955     bool disparate = false;
2956 
2957 #  else
2958 
2959     /* But if there are ignored categories, those will be set to "C", so try an
2960      * arbitrary category, and if it isn't C, we know immediately that the
2961      * locales are disparate.  (The #if conditionals are to handle the case
2962      * where LC_NUMERIC_INDEX_ is 0.  We don't want to use LC_NUMERIC to
2963      * compare, as that may be different between external and internal forms.)
2964      * */
2965 #    if ! defined(USE_LOCALE_NUMERIC)
2966 
2967     bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]);
2968 
2969 #    elif LC_NUMERIC_INDEX_ != 0
2970 
2971     bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]);
2972 
2973 #    else
2974 
2975     /* Would need revision to handle the very unlikely case where only a single
2976      * category, LC_NUMERIC, is defined */
2977     assert(LOCALE_CATEGORIES_COUNT_ > 0);
2978 
2979     bool disparate = ! isNAME_C_OR_POSIX(locales_list[1]);
2980 
2981 #    endif
2982 #  endif
2983 
2984     /* Calculate the needed size for the string listing the individual locales.
2985      * Initialize with values known at compile time. */
2986     Size_t total_len;
2987     const char *separator;
2988 
2989 #  ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS  /* Positional formatted LC_ALL */
2990     PERL_UNUSED_ARG(format);
2991 #  else
2992 
2993     if (format != INTERNAL_FORMAT) {
2994 
2995         /* Here, we will be using positional notation.  it includes n-1
2996          * separators */
2997         total_len = (  LOCALE_CATEGORIES_COUNT_ - 1)
2998                      * STRLENs(PERL_LC_ALL_SEPARATOR)
2999                   + 1;   /* And a trailing NUL */
3000         separator = PERL_LC_ALL_SEPARATOR;
3001     }
3002     else
3003 
3004 #  endif
3005 
3006     {
3007         /* name=value output is always used in internal format, and when
3008          * positional isn't available on the platform. */
3009         total_len = lc_all_boiler_plate_length;
3010         separator = ";";
3011     }
3012 
3013     /* The total length then is just the sum of the above boiler-plate plus the
3014      * total strlen()s of the locale name of each individual category. */
3015     for_all_individual_category_indexes(i) {
3016         const char * entry = ENTRY(i, locales_list, format);
3017 
3018         total_len += strlen(entry);
3019         if (! disparate && strNE(entry, locales_list[0])) {
3020             disparate = true;
3021         }
3022     }
3023 
3024     bool free_if_void_return = false;
3025     const char * retval;
3026 
3027     /* If all categories have the same locale, we already know the answer */
3028     if (! disparate) {
3029         if (returning == WANT_PL_setlocale_buf) {
3030             save_to_buffer(locales_list[0],
3031                            &PL_setlocale_buf,
3032                            &PL_setlocale_bufsize);
3033             retval = PL_setlocale_buf;
3034         }
3035         else {
3036 
3037             retval = locales_list[0];
3038 
3039             /* If a temporary is wanted for the return, and we had to create
3040              * the input list ourselves, we created it into such a temporary,
3041              * so no further work is needed; but otherwise, make a mortal copy
3042              * of this passed-in list element */
3043             if (returning == WANT_TEMP_PV && ! input_list_was_NULL) {
3044                 retval = savepv(retval);
3045                 SAVEFREEPV(retval);
3046             }
3047 
3048             /* In all cases here, there's nothing we create that needs to be
3049              * freed, so leave 'free_if_void_return' set to the default
3050              * 'false'. */
3051         }
3052     }
3053     else {  /* Here, not all categories have the same locale */
3054 
3055         char * constructed;
3056 
3057         /* If returning to PL_setlocale_buf, set up to write directly to it,
3058          * being sure it is resized to be large enough */
3059         if (returning == WANT_PL_setlocale_buf) {
3060             set_save_buffer_min_size(total_len,
3061                                      &PL_setlocale_buf,
3062                                      &PL_setlocale_bufsize);
3063             constructed = PL_setlocale_buf;
3064         }
3065         else {  /* Otherwise we need new memory to hold the calculated value. */
3066 
3067             Newx(constructed, total_len, char);
3068 
3069             /* If returning the new memory, it must be set up to be freed
3070              * later; otherwise at the end of this function */
3071             if (returning == WANT_TEMP_PV) {
3072                 SAVEFREEPV(constructed);
3073             }
3074             else {
3075                 free_if_void_return = true;
3076             }
3077         }
3078 
3079         constructed[0] = '\0';
3080 
3081         /* Loop through all the categories */
3082         for_all_individual_category_indexes(j) {
3083 
3084             /* Add a separator, except before the first one */
3085             if (j != 0) {
3086                 my_strlcat(constructed, separator, total_len);
3087             }
3088 
3089             const char * entry;
3090             Size_t needed_len;
3091             unsigned int i = j;
3092 
3093 #  ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
3094 
3095             if (UNLIKELY(format != INTERNAL_FORMAT)) {
3096 
3097                 /* In positional notation 'j' means the position, and we have
3098                  * to convert to the index 'i' */
3099                 i = map_LC_ALL_position_to_index[j];
3100 
3101                 entry = ENTRY(i, locales_list, format);
3102                 needed_len = my_strlcat(constructed, entry, total_len);
3103             }
3104             else
3105 
3106 #  endif
3107             {
3108                 /* Below, we are to use name=value notation, either because
3109                  * that's what the platform uses, or because this is the
3110                  * internal format, which uses that notation regardless of the
3111                  * external form */
3112 
3113                 entry = ENTRY(i, locales_list, format);
3114 
3115                 /* "name=locale;" */
3116                 my_strlcat(constructed, category_names[i], total_len);
3117                 my_strlcat(constructed, "=", total_len);
3118                 needed_len = my_strlcat(constructed, entry, total_len);
3119             }
3120 
3121             if (LIKELY(needed_len <= total_len)) {
3122                 continue;
3123             }
3124 
3125             /* If would have overflowed, panic */
3126             locale_panic_via_(Perl_form(aTHX_
3127                                         "Internal length calculation wrong.\n"
3128                                         "\"%s\" was not entirely added to"
3129                                         " \"%.*s\"; needed=%zu, had=%zu",
3130                                         entry, (int) total_len,
3131                                         constructed,
3132                                         needed_len, total_len),
3133                                 __FILE__,
3134                                 caller_line);
3135         } /* End of loop through the categories */
3136 
3137         retval = constructed;
3138     } /* End of the categories' locales are displarate */
3139 
3140 #  if defined(USE_PL_CURLOCALES) && defined(LC_ALL)
3141 
3142     if (format == INTERNAL_FORMAT) {
3143 
3144         /* PL_curlocales[LC_ALL_INDEX_] is updated as a side-effect of this
3145          * function for internal format. */
3146         Safefree(PL_curlocales[LC_ALL_INDEX_]);
3147         PL_curlocales[LC_ALL_INDEX_] = savepv(retval);
3148     }
3149 
3150 #  endif
3151 
3152     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3153                            "calculate_LC_ALL_string calculated '%s'\n",
3154                            retval));
3155 
3156     if (returning == WANT_VOID) {
3157         if (free_if_void_return) {
3158             Safefree(retval);
3159         }
3160 
3161         return NULL;
3162     }
3163 
3164     return retval;
3165 }
3166 
3167 #  if defined(WIN32) || (     defined(USE_POSIX_2008_LOCALE)        \
3168                          && ! defined(USE_QUERYLOCALE))
3169 
3170 STATIC const char *
S_find_locale_from_environment(pTHX_ const locale_category_index index)3171 S_find_locale_from_environment(pTHX_ const locale_category_index index)
3172 {
3173     /* NB: This function may actually change the locale on Windows.  It
3174      * currently is designed to be called only from setting the locale on
3175      * Windows, and POSIX 2008
3176      *
3177      * This function returns the locale specified by the program's environment
3178      * for the category specified by our internal index number 'index'.  It
3179      * therefore simulates:
3180      *      setlocale(cat, "")
3181      * but, except for some cases in Windows, doesn't actually change the
3182      * locale; merely returns it.
3183      *
3184      * The return need not be freed by the caller.  This
3185      * promise relies on PerlEnv_getenv() returning a mortalized copy to us.
3186      *
3187      * The simulation is needed only on certain platforms; otherwise, libc is
3188      * called with "" to get the actual value(s).  The simulation is needed
3189      * for:
3190      *
3191      *  1)  On Windows systems, the concept of the POSIX ordering of
3192      *      environment variables is missing.  To increase portability of
3193      *      programs across platforms, the POSIX ordering is emulated on
3194      *      Windows.
3195      *
3196      *  2)  On POSIX 2008 systems without querylocale(), it is problematic
3197      *      getting the results of the POSIX 2008 equivalent of
3198      *
3199      *          setlocale(category, "")
3200      *
3201      *      To ensure that we know exactly what those values are, we do the
3202      *      setting ourselves, using the documented algorithm specified by the
3203      *      POSIX standard (assuming the platform follows the Standard) rather
3204      *      than use "" as the locale.  This will lead to results that differ
3205      *      from native behavior if the native behavior differs from the
3206      *      Standard's documented value, but khw believes it is better to know
3207      *      what's going on, even if different from native, than to just guess.
3208      *
3209      *      glibc systems differ from this standard in having a LANGUAGE
3210      *      environment variable used for just LC_MESSAGES.  This function does
3211      *      NOT handle that.
3212      *
3213      *      Another option for the POSIX 2008 case would be, in a critical
3214      *      section, to save the global locale's current value, and do a
3215      *      straight setlocale(LC_ALL, "").  That would return our desired
3216      *      values, destroying the global locale's, which we would then
3217      *      restore.  But that could cause races with any other thread that is
3218      *      using the global locale and isn't using the mutex.  And, the only
3219      *      reason someone would have done that is because they are calling a
3220      *      library function, like in gtk, that calls setlocale(), and which
3221      *      can't be changed to use the mutex.  That wouldn't be a problem if
3222      *      this were to be done before any threads had switched, say during
3223      *      perl construction time.  But this code would still be needed for
3224      *      the general case.
3225      *
3226      * The Windows and POSIX 2008 differ in that the ultimate fallback is "C"
3227      * in POSIX, and is the system default locale in Windows.  To get that
3228      * system default value, we actually have to call setlocale() on Windows.
3229      */
3230 
3231     const char * const lc_all = PerlEnv_getenv("LC_ALL");
3232     const char * locale_names[LC_ALL_INDEX_] = { NULL };
3233 
3234     /* Use any "LC_ALL" environment variable, as it overrides everything else.
3235      * */
3236     if (lc_all && strNE(lc_all, "")) {
3237         return lc_all;
3238     }
3239 
3240     /* Here, no usable LC_ALL environment variable.  We have to handle each
3241      * category separately.  If all categories are desired, we loop through
3242      * them all.  If only an individual category is desired, to avoid
3243      * duplicating logic, we use the same loop, but set up the limits so it is
3244      * only executed once, for that particular category. */
3245     locale_category_index lower, upper, offset;
3246     if (index == LC_ALL_INDEX_) {
3247         lower = (locale_category_index) 0;
3248         upper = (locale_category_index) ((int) LC_ALL_INDEX_ - 1);
3249         offset = (locale_category_index) 0;
3250     }
3251     else {
3252         lower = index;
3253         upper = index;
3254 
3255         /* 'offset' is used so that the result of the single loop iteration is
3256          * stored into output[0] */
3257         offset = lower;
3258     }
3259 
3260     /* When no LC_ALL environment variable, LANG is used as a default, but
3261      * overridden for individual categories that have corresponding environment
3262      * variables.  If no LANG exists, the default is "C" on POSIX 2008, or the
3263      * system default for the category on Windows. */
3264     const char * env_lang = NULL;
3265 
3266     /* For each desired category, use any corresponding environment variable;
3267      * or the default if none such exists. */
3268     bool is_disparate = false;  /* Assume is uniform until proven otherwise */
3269     for_category_indexes_between(i, lower, upper) {
3270         const char * const env_override = PerlEnv_getenv(category_names[i]);
3271         locale_category_index j = (locale_category_index) (i - offset);
3272 
3273         if (env_override && strNE(env_override, "")) {
3274             locale_names[j] = env_override;
3275         }
3276         else { /* Here, no corresponding environment variable, see if LANG
3277                   exists and is usable.  Done this way to avoid fetching LANG
3278                   unless it is actually needed */
3279             if (env_lang == NULL) {
3280                 env_lang = PerlEnv_getenv("LANG");
3281 
3282                 /* If not usable, set it to a non-NULL illegal value so won't
3283                  * try to use it below */
3284                 if (env_lang == NULL || strEQ(env_lang, "")) {
3285                     env_lang = (const char *) 1;
3286                 }
3287             }
3288 
3289             /* If a usable LANG exists, use it. */
3290             if (env_lang != NULL && env_lang != (const char *) 1) {
3291                 locale_names[j] = env_lang;
3292             }
3293             else {
3294 
3295 #    ifdef WIN32
3296                 /* If no LANG, use the system default on Windows. */
3297                 locale_names[j] = wrap_wsetlocale(categories[i], ".ACP");
3298                 if (locale_names[j]) {
3299                     SAVEFREEPV(locale_names[j]);
3300                 }
3301                 else
3302 #    endif
3303                 {   /* If nothing was found or worked, use C */
3304                     locale_names[j] = "C";
3305                 }
3306             }
3307         }
3308 
3309         if (j > 0 && ! is_disparate && strNE(locale_names[0], locale_names[j]))
3310         {
3311             is_disparate = true;
3312         }
3313 
3314         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3315                  "find_locale_from_environment i=%u, j=%u, name=%s,"
3316                  " locale=%s, locale of 0th category=%s, disparate=%d\n",
3317                  i, j, category_names[i],
3318                  locale_names[j], locale_names[0], is_disparate));
3319     }
3320 
3321     if (! is_disparate) {
3322         return locale_names[0];
3323     }
3324 
3325     return calculate_LC_ALL_string(locale_names, INTERNAL_FORMAT,
3326                                    WANT_TEMP_PV,
3327                                    __LINE__);
3328 }
3329 
3330 #  endif
3331 #  if defined(DEBUGGING) || defined(USE_PERL_SWITCH_LOCALE_CONTEXT)
3332 
3333 STATIC const char *
S_get_LC_ALL_display(pTHX)3334 S_get_LC_ALL_display(pTHX)
3335 {
3336     return calculate_LC_ALL_string(NULL, INTERNAL_FORMAT,
3337                                    WANT_TEMP_PV,
3338                                    __LINE__);
3339 }
3340 
3341 #  endif
3342 
3343 STATIC void
S_setlocale_failure_panic_via_i(pTHX_ const locale_category_index cat_index,const char * current,const char * failed,const line_t proxy_caller_line,const line_t immediate_caller_line,const char * const higher_caller_file,const line_t higher_caller_line)3344 S_setlocale_failure_panic_via_i(pTHX_
3345                                 const locale_category_index cat_index,
3346                                 const char * current,
3347                                 const char * failed,
3348                                 const line_t proxy_caller_line,
3349                                 const line_t immediate_caller_line,
3350                                 const char * const higher_caller_file,
3351                                 const line_t higher_caller_line)
3352 {
3353     PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_VIA_I;
3354 
3355     /* Called to panic when a setlocale form unexpectedly failed for the
3356      * category determined by 'cat_index', and the locale that was in effect
3357      * (and likely still is) is 'current'.  'current' may be NULL, which causes
3358      * this function to query what it is.
3359      *
3360      * The extra caller information is used for when a function acts as a
3361      * stand-in for another function, which a typical reader would more likely
3362      * think would be the caller
3363      *
3364      * If a line number is 0, its stack (sort-of) frame is omitted; same if
3365      * it's the same line number as the next higher caller. */
3366 
3367     const int cat = categories[cat_index];
3368     const char * name = category_names[cat_index];
3369 
3370     dSAVE_ERRNO;
3371 
3372     if (current == NULL) {
3373         current = querylocale_i(cat_index);
3374     }
3375 
3376     const char * proxy_text = "";
3377     if (proxy_caller_line != 0 && proxy_caller_line != immediate_caller_line)
3378     {
3379         proxy_text = Perl_form(aTHX_ "\nCalled via %s: %" LINE_Tf,
3380                                       __FILE__, proxy_caller_line);
3381     }
3382     if (   strNE(__FILE__, higher_caller_file)
3383         || (   immediate_caller_line != 0
3384             && immediate_caller_line != higher_caller_line))
3385     {
3386         proxy_text = Perl_form(aTHX_ "%s\nCalled via %s: %" LINE_Tf,
3387                                       proxy_text, __FILE__,
3388                                       immediate_caller_line);
3389     }
3390 
3391     /* 'false' in the get_displayable_string() calls makes it not think the
3392      * locale is UTF-8, so just dumps bytes.  Actually figuring it out can be
3393      * too complicated for a panic situation. */
3394     const char * msg = Perl_form(aTHX_
3395                             "Can't change locale for %s (%d) from '%s' to '%s'"
3396                             " %s",
3397                             name, cat,
3398                             get_displayable_string(current,
3399                                                    current + strlen(current),
3400                                                    false),
3401                             get_displayable_string(failed,
3402                                                    failed + strlen(failed),
3403                                                    false),
3404                             proxy_text);
3405     RESTORE_ERRNO;
3406 
3407     Perl_locale_panic(msg, __LINE__, higher_caller_file, higher_caller_line);
3408     NOT_REACHED; /* NOTREACHED */
3409 }
3410 
3411 #  ifdef USE_LOCALE_NUMERIC
3412 
3413 STATIC void
S_new_numeric(pTHX_ const char * newnum,bool force)3414 S_new_numeric(pTHX_ const char *newnum, bool force)
3415 {
3416     PERL_ARGS_ASSERT_NEW_NUMERIC;
3417 
3418     /* Called after each libc setlocale() or uselocale() call affecting
3419      * LC_NUMERIC, to tell core Perl this and that 'newnum' is the name of the
3420      * new locale, and we are switched into it.  It installs this locale as the
3421      * current underlying default, and then switches to the C locale, if
3422      * necessary, so that the code that has traditionally expected the radix
3423      * character to be a dot may continue to do so.
3424      *
3425      * The default locale and the C locale can be toggled between by use of the
3426      * set_numeric_underlying() and set_numeric_standard() functions, which
3427      * should probably not be called directly, but only via macros like
3428      * SET_NUMERIC_STANDARD() in perl.h.
3429      *
3430      * The toggling is necessary mainly so that a non-dot radix decimal point
3431      * character can be input and output, while allowing internal calculations
3432      * to use a dot.
3433      *
3434      * This sets several interpreter-level variables:
3435      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
3436      * PL_numeric_underlying   A boolean indicating if the toggled state is
3437      *                  such that the current locale is the program's
3438      *                  underlying locale
3439      * PL_numeric_standard   An int indicating if the toggled state is such
3440      *                  that the current locale is the C locale or
3441      *                  indistinguishable from the C locale.  If non-zero, it
3442      *                  is in C; if > 1, it means it may not be toggled away
3443      *                  from C.
3444      * PL_numeric_underlying_is_standard   A bool kept by this function
3445      *                  indicating that the underlying locale and the standard
3446      *                  C locale are indistinguishable for the purposes of
3447      *                  LC_NUMERIC.  This happens when both of the above two
3448      *                  variables are true at the same time.  (Toggling is a
3449      *                  no-op under these circumstances.)  This variable is
3450      *                  used to avoid having to recalculate.
3451      * PL_numeric_radix_sv   Contains the string that code should use for the
3452      *                  decimal point.  It is set to either a dot or the
3453      *                  program's underlying locale's radix character string,
3454      *                  depending on the situation.
3455      * PL_underlying_radix_sv   Contains the program's underlying locale's
3456      *                  radix character string.  This is copied into
3457      *                  PL_numeric_radix_sv when the situation warrants.  It
3458      *                  exists to avoid having to recalculate it when toggling.
3459      */
3460 
3461     DEBUG_L( PerlIO_printf(Perl_debug_log,
3462                            "Called new_numeric with %s, PL_numeric_name=%s\n",
3463                            newnum, PL_numeric_name));
3464 
3465     /* We keep records comparing the characteristics of the LC_NUMERIC catetory
3466      * of the current locale vs the standard C locale.  If the new locale that
3467      * has just been changed to is the same as the one our records are for,
3468      * they are still valid, and we don't have to recalculate them.  'force' is
3469      * true if the caller suspects that the records are out-of-date, so do go
3470      * ahead and recalculate them.  (This can happen when an external library
3471      * has had control and now perl is reestablishing control; we have to
3472      * assume that that library changed the locale in unknown ways.)
3473      *
3474      * Even if our records are valid, the new locale will likely have been
3475      * switched to before this function gets called, and we must toggle into
3476      * one indistinguishable from the C locale with regards to LC_NUMERIC
3477      * handling, so that all the libc functions that are affected by LC_NUMERIC
3478      * will work as expected.  This can be skipped if we already know that the
3479      * locale is indistinguishable from the C locale. */
3480     if (! force && strEQ(PL_numeric_name, newnum)) {
3481         if (! PL_numeric_underlying_is_standard) {
3482             set_numeric_standard(__FILE__, __LINE__);
3483         }
3484 
3485         return;
3486     }
3487 
3488     Safefree(PL_numeric_name);
3489     PL_numeric_name = savepv(newnum);
3490 
3491     /* Handle the trivial case.  Since this is called at process
3492      * initialization, be aware that this bit can't rely on much being
3493      * available. */
3494     if (isNAME_C_OR_POSIX(PL_numeric_name)) {
3495         PL_numeric_standard = TRUE;
3496         PL_numeric_underlying_is_standard = TRUE;
3497         PL_numeric_underlying = TRUE;
3498         sv_setpv(PL_numeric_radix_sv, C_decimal_point);
3499         SvUTF8_off(PL_numeric_radix_sv);
3500         sv_setpv(PL_underlying_radix_sv, C_decimal_point);
3501         SvUTF8_off(PL_underlying_radix_sv);
3502         return;
3503     }
3504 
3505     /* We are in the underlying locale until changed at the end of this
3506      * function */
3507     PL_numeric_underlying = TRUE;
3508 
3509     /* Passing a non-NULL causes the function call just below to
3510        automatically set the UTF-8 flag on PL_underlying_radix_sv */
3511     utf8ness_t dummy;
3512 
3513     /* Find and save this locale's radix character. */
3514     langinfo_sv_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
3515                   PL_underlying_radix_sv, &dummy);
3516     DEBUG_L(PerlIO_printf(Perl_debug_log,
3517                           "Locale radix is '%s', ?UTF-8=%d\n",
3518                           SvPVX(PL_underlying_radix_sv),
3519                           cBOOL(SvUTF8(PL_underlying_radix_sv))));
3520 
3521     /* This locale is indistinguishable from C (for numeric purposes) if both
3522      * the radix character and the thousands separator are the same as C's.
3523      * Start with the radix. */
3524     PL_numeric_underlying_is_standard = strEQ(C_decimal_point,
3525                                               SvPVX(PL_underlying_radix_sv));
3526 
3527 #    ifndef TS_W32_BROKEN_LOCALECONV
3528 
3529     /* If the radix isn't the same as C's, we know it is distinguishable from
3530      * C; otherwise check the thousands separator too.  Only if both are the
3531      * same as C's is the locale indistinguishable from C.
3532      *
3533      * But on earlier Windows versions, there is a potential race.  This code
3534      * knows that localeconv() (elsewhere in this file) will be used to extract
3535      * the needed value, and localeconv() was buggy for quite a while, and that
3536      * code in this file hence uses a workaround.  And that workaround may have
3537      * an (unlikely) race.  Gathering the radix uses a different workaround on
3538      * Windows that doesn't involve a race.  It might be possible to do the
3539      * same for this (patches welcome).
3540      *
3541      * Until then khw doesn't think it's worth even the small risk of a race to
3542      * get this value, which doesn't appear to be used in any of the Microsoft
3543      * library routines anyway. */
3544 
3545     if (PL_numeric_underlying_is_standard) {
3546         PL_numeric_underlying_is_standard = strEQ(C_thousands_sep,
3547                                                   langinfo_c(THOUSEP,
3548                                                              LC_NUMERIC,
3549                                                              PL_numeric_name,
3550                                                              NULL));
3551     }
3552 
3553 #    endif
3554 
3555     PL_numeric_standard = PL_numeric_underlying_is_standard;
3556 
3557     /* Keep LC_NUMERIC so that it has the C locale radix and thousands
3558      * separator.  This is for XS modules, so they don't have to worry about
3559      * the radix being a non-dot.  (Core operations that need the underlying
3560      * locale change to it temporarily). */
3561     if (! PL_numeric_standard) {
3562         set_numeric_standard(__FILE__, __LINE__);
3563     }
3564 }
3565 
3566 #  endif
3567 
3568 void
Perl_set_numeric_standard(pTHX_ const char * const file,const line_t line)3569 Perl_set_numeric_standard(pTHX_ const char * const file, const line_t line)
3570 {
3571     PERL_ARGS_ASSERT_SET_NUMERIC_STANDARD;
3572     PERL_UNUSED_ARG(file);      /* Some Configurations ignore these */
3573     PERL_UNUSED_ARG(line);
3574 
3575 #  ifdef USE_LOCALE_NUMERIC
3576 
3577     /* Unconditionally toggle the LC_NUMERIC locale to the C locale
3578      *
3579      * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
3580      * instead of calling this directly.  The macro avoids calling this routine
3581      * if toggling isn't necessary according to our records (which could be
3582      * wrong if some XS code has changed the locale behind our back) */
3583 
3584     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to"
3585                                           " standard C; called from %s: %"
3586                                           LINE_Tf "\n", file, line));
3587 
3588     void_setlocale_c_with_caller(LC_NUMERIC, "C", file, line);
3589     PL_numeric_standard = TRUE;
3590     sv_setpv(PL_numeric_radix_sv, C_decimal_point);
3591     SvUTF8_off(PL_numeric_radix_sv);
3592 
3593     PL_numeric_underlying = PL_numeric_underlying_is_standard;
3594 
3595 #  endif /* USE_LOCALE_NUMERIC */
3596 
3597 }
3598 
3599 void
Perl_set_numeric_underlying(pTHX_ const char * const file,const line_t line)3600 Perl_set_numeric_underlying(pTHX_ const char * const file, const line_t line)
3601 {
3602     PERL_ARGS_ASSERT_SET_NUMERIC_UNDERLYING;
3603     PERL_UNUSED_ARG(file);      /* Some Configurations ignore these */
3604     PERL_UNUSED_ARG(line);
3605 
3606 #  ifdef USE_LOCALE_NUMERIC
3607 
3608     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
3609      * default.
3610      *
3611      * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
3612      * instead of calling this directly.  The macro avoids calling this routine
3613      * if toggling isn't necessary according to our records (which could be
3614      * wrong if some XS code has changed the locale behind our back) */
3615 
3616     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s;"
3617                                           " called from %s: %" LINE_Tf "\n",
3618                                           PL_numeric_name, file, line));
3619     /* Maybe not in init? assert(PL_locale_mutex_depth > 0);*/
3620 
3621     void_setlocale_c_with_caller(LC_NUMERIC, PL_numeric_name, file, line);
3622     PL_numeric_underlying = TRUE;
3623     sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
3624 
3625     PL_numeric_standard = PL_numeric_underlying_is_standard;
3626 
3627 #  endif /* USE_LOCALE_NUMERIC */
3628 
3629 }
3630 
3631 #  ifdef USE_LOCALE_CTYPE
3632 
3633 STATIC void
S_new_ctype(pTHX_ const char * newctype,bool force)3634 S_new_ctype(pTHX_ const char *newctype, bool force)
3635 {
3636     PERL_ARGS_ASSERT_NEW_CTYPE;
3637     PERL_UNUSED_ARG(force);
3638 
3639     /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
3640      * core Perl this and that 'newctype' is the name of the new locale.
3641      *
3642      * This function sets up the folding arrays for all 256 bytes, assuming
3643      * that tofold() is tolc() since fold case is not a concept in POSIX,
3644      */
3645 
3646     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n",
3647                                           newctype));
3648 
3649     /* No change means no-op */
3650     if (strEQ(PL_ctype_name, newctype)) {
3651         return;
3652     }
3653 
3654     /* We will replace any bad locale warning with
3655      *  1)  nothing if the new one is ok; or
3656      *  2)  a new warning for the bad new locale */
3657     if (PL_warn_locale) {
3658         SvREFCNT_dec_NN(PL_warn_locale);
3659         PL_warn_locale = NULL;
3660     }
3661 
3662     /* Clear cache */
3663     Safefree(PL_ctype_name);
3664     PL_ctype_name = "";
3665 
3666     PL_in_utf8_turkic_locale = FALSE;
3667 
3668     /* For the C locale, just use the standard folds, and we know there are no
3669      * glitches possible, so return early.  Since this is called at process
3670      * initialization, be aware that this bit can't rely on much being
3671      * available. */
3672     if (isNAME_C_OR_POSIX(newctype)) {
3673         Copy(PL_fold, PL_fold_locale, 256, U8);
3674         PL_ctype_name = savepv(newctype);
3675         PL_in_utf8_CTYPE_locale = FALSE;
3676         return;
3677     }
3678 
3679     /* The cache being cleared signals the called function to compute a new
3680      * value */
3681     PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
3682 
3683     PL_ctype_name = savepv(newctype);
3684     bool maybe_utf8_turkic = FALSE;
3685 
3686     /* Don't check for problems if we are suppressing the warnings */
3687     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
3688 
3689     if (PL_in_utf8_CTYPE_locale) {
3690 
3691         /* A UTF-8 locale gets standard rules.  But note that code still has to
3692          * handle this specially because of the three problematic code points
3693          * */
3694         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
3695 
3696         /* UTF-8 locales can have special handling for 'I' and 'i' if they are
3697          * Turkic.  Make sure these two are the only anomalies.  (We don't
3698          * require towupper and towlower because they aren't in C89.) */
3699 
3700 #    if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
3701 
3702         if (towupper('i') == 0x130 && towlower('I') == 0x131)
3703 
3704 #    else
3705 
3706         if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
3707 
3708 #    endif
3709 
3710         {
3711             /* This is how we determine it really is Turkic */
3712             check_for_problems = TRUE;
3713             maybe_utf8_turkic = TRUE;
3714         }
3715     }
3716     else {  /* Not a canned locale we know the values for.  Compute them */
3717 
3718 #    ifdef DEBUGGING
3719 
3720         bool has_non_ascii_fold = FALSE;
3721         bool found_unexpected = FALSE;
3722 
3723         /* Under -DLv, see if there are any folds outside the ASCII range.
3724          * This factoid is used below */
3725         if (DEBUG_Lv_TEST) {
3726             for (unsigned i = 128; i < 256; i++) {
3727                 int j = LATIN1_TO_NATIVE(i);
3728                 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
3729                     has_non_ascii_fold = TRUE;
3730                     break;
3731                 }
3732             }
3733         }
3734 
3735 #    endif
3736 
3737         for (unsigned i = 0; i < 256; i++) {
3738             if (isU8_UPPER_LC(i))
3739                 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
3740             else if (isU8_LOWER_LC(i))
3741                 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
3742             else
3743                 PL_fold_locale[i] = (U8) i;
3744 
3745 #    ifdef DEBUGGING
3746 
3747             /* Most locales these days are supersets of ASCII.  When debugging,
3748              * it is helpful to know what the exceptions to that are in this
3749              * locale */
3750             if (DEBUG_L_TEST) {
3751                 bool unexpected = FALSE;
3752 
3753                 if (isUPPER_L1(i)) {
3754                     if (isUPPER_A(i)) {
3755                         if (PL_fold_locale[i] != toLOWER_A(i)) {
3756                             unexpected = TRUE;
3757                         }
3758                     }
3759                     else if (has_non_ascii_fold) {
3760                         if (PL_fold_locale[i] != toLOWER_L1(i)) {
3761                             unexpected = TRUE;
3762                         }
3763                     }
3764                     else if (PL_fold_locale[i] != i) {
3765                         unexpected = TRUE;
3766                     }
3767                 }
3768                 else if (   isLOWER_L1(i)
3769                          && i != LATIN_SMALL_LETTER_SHARP_S
3770                          && i != MICRO_SIGN)
3771                 {
3772                     if (isLOWER_A(i)) {
3773                         if (PL_fold_locale[i] != toUPPER_A(i)) {
3774                             unexpected = TRUE;
3775                         }
3776                     }
3777                     else if (has_non_ascii_fold) {
3778                         if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
3779                             unexpected = TRUE;
3780                         }
3781                     }
3782                     else if (PL_fold_locale[i] != i) {
3783                         unexpected = TRUE;
3784                     }
3785                 }
3786                 else if (PL_fold_locale[i] != i) {
3787                     unexpected = TRUE;
3788                 }
3789 
3790                 if (unexpected) {
3791                     found_unexpected = TRUE;
3792                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3793                                            "For %s, fold of %02x is %02x\n",
3794                                            newctype, i, PL_fold_locale[i]));
3795                 }
3796             }
3797         }
3798 
3799         if (found_unexpected) {
3800             DEBUG_L(PerlIO_printf(Perl_debug_log,
3801                                "All bytes not mentioned above either fold to"
3802                                " themselves or are the expected ASCII or"
3803                                " Latin1 ones\n"));
3804         }
3805         else {
3806             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3807                                    "No nonstandard folds were found\n"));
3808 #    endif
3809 
3810         }
3811     }
3812 
3813 #    ifdef MB_CUR_MAX
3814 
3815     /* We only handle single-byte locales (outside of UTF-8 ones); so if this
3816      * locale requires more than one byte, there are going to be BIG problems.
3817      * */
3818 
3819     const int mb_cur_max = MB_CUR_MAX;
3820 
3821     if (mb_cur_max > 1 && ! PL_in_utf8_CTYPE_locale
3822 
3823             /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
3824              * Just assume that the implementation for them (plus for POSIX) is
3825              * correct and the > 1 value is spurious.  (Since these are
3826              * specially handled to never be considered UTF-8 locales, as long
3827              * as this is the only problem, everything should work fine */
3828         && ! isNAME_C_OR_POSIX(newctype))
3829     {
3830         DEBUG_L(PerlIO_printf(Perl_debug_log,
3831                               "Unsupported, MB_CUR_MAX=%d\n", mb_cur_max));
3832 
3833         if (! IN_LC(LC_CTYPE) || ckWARN_d(WARN_LOCALE)) {
3834             char * msg = Perl_form(aTHX_
3835                                    "Locale '%s' is unsupported, and may hang"
3836                                    " or crash the interpreter",
3837                                      newctype);
3838             if (IN_LC(LC_CTYPE)) {
3839                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), "%s", msg);
3840             }
3841             else {
3842                 PL_warn_locale = newSV(0);
3843                 sv_setpvn(PL_warn_locale, msg, strlen(msg));
3844             }
3845         }
3846     }
3847 
3848 #    endif
3849 
3850     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
3851                                            check_for_problems));
3852 
3853     /* We don't populate the other lists if a UTF-8 locale, but do check that
3854      * everything works as expected, unless checking turned off */
3855     if (check_for_problems) {
3856         /* Assume enough space for every character being bad.  4 spaces each
3857          * for the 94 printable characters that are output like "'x' "; and 5
3858          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
3859          * NUL */
3860         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
3861         unsigned int bad_count = 0;         /* Count of bad characters */
3862 
3863         for (unsigned i = 0; i < 256; i++) {
3864 
3865             /* If checking for locale problems, see if the native ASCII-range
3866              * printables plus \n and \t are in their expected categories in
3867              * the new locale.  If not, this could mean big trouble, upending
3868              * Perl's and most programs' assumptions, like having a
3869              * metacharacter with special meaning become a \w.  Fortunately,
3870              * it's very rare to find locales that aren't supersets of ASCII
3871              * nowadays.  It isn't a problem for most controls to be changed
3872              * into something else; we check only \n and \t, though perhaps \r
3873              * could be an issue as well. */
3874             if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
3875                 bool is_bad = FALSE;
3876                 char name[4] = { '\0' };
3877 
3878                 /* Convert the name into a string */
3879                 if (isGRAPH_A(i)) {
3880                     name[0] = i;
3881                     name[1] = '\0';
3882                 }
3883                 else if (i == '\n') {
3884                     my_strlcpy(name, "\\n", sizeof(name));
3885                 }
3886                 else if (i == '\t') {
3887                     my_strlcpy(name, "\\t", sizeof(name));
3888                 }
3889                 else {
3890                     assert(i == ' ');
3891                     my_strlcpy(name, "' '", sizeof(name));
3892                 }
3893 
3894                 /* Check each possibe class */
3895                 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) !=
3896                                                     cBOOL(isALPHANUMERIC_A(i))))
3897                 {
3898                     is_bad = TRUE;
3899                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3900                                         "isalnum('%s') unexpectedly is %x\n",
3901                                         name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
3902                 }
3903                 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i))))  {
3904                     is_bad = TRUE;
3905                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3906                                           "isalpha('%s') unexpectedly is %x\n",
3907                                           name, cBOOL(isU8_ALPHA_LC(i))));
3908                 }
3909                 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i))))  {
3910                     is_bad = TRUE;
3911                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3912                                           "isdigit('%s') unexpectedly is %x\n",
3913                                           name, cBOOL(isU8_DIGIT_LC(i))));
3914                 }
3915                 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i))))  {
3916                     is_bad = TRUE;
3917                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3918                                           "isgraph('%s') unexpectedly is %x\n",
3919                                           name, cBOOL(isU8_GRAPH_LC(i))));
3920                 }
3921                 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i))))  {
3922                     is_bad = TRUE;
3923                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3924                                           "islower('%s') unexpectedly is %x\n",
3925                                           name, cBOOL(isU8_LOWER_LC(i))));
3926                 }
3927                 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i))))  {
3928                     is_bad = TRUE;
3929                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3930                                           "isprint('%s') unexpectedly is %x\n",
3931                                           name, cBOOL(isU8_PRINT_LC(i))));
3932                 }
3933                 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i))))  {
3934                     is_bad = TRUE;
3935                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3936                                           "ispunct('%s') unexpectedly is %x\n",
3937                                           name, cBOOL(isU8_PUNCT_LC(i))));
3938                 }
3939                 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i))))  {
3940                     is_bad = TRUE;
3941                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3942                                           "isspace('%s') unexpectedly is %x\n",
3943                                           name, cBOOL(isU8_SPACE_LC(i))));
3944                 }
3945                 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i))))  {
3946                     is_bad = TRUE;
3947                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3948                                           "isupper('%s') unexpectedly is %x\n",
3949                                           name, cBOOL(isU8_UPPER_LC(i))));
3950                 }
3951                 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) {
3952                     is_bad = TRUE;
3953                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3954                                           "isxdigit('%s') unexpectedly is %x\n",
3955                                           name, cBOOL(isU8_XDIGIT_LC(i))));
3956                 }
3957                 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
3958                     is_bad = TRUE;
3959                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3960                             "tolower('%s')=0x%x instead of the expected 0x%x\n",
3961                             name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
3962                 }
3963                 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
3964                     is_bad = TRUE;
3965                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3966                             "toupper('%s')=0x%x instead of the expected 0x%x\n",
3967                             name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
3968                 }
3969                 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i))))  {
3970                     is_bad = TRUE;
3971                     DEBUG_L(PerlIO_printf(Perl_debug_log,
3972                                 "'\\n' (=%02X) is not a control\n", (int) i));
3973                 }
3974 
3975                 /* Add to the list;  Separate multiple entries with a blank */
3976                 if (is_bad) {
3977                     if (bad_count) {
3978                         my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
3979                     }
3980                     my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
3981                     bad_count++;
3982                 }
3983             }
3984         }
3985 
3986         if (bad_count == 2 && maybe_utf8_turkic) {
3987             bad_count = 0;
3988             *bad_chars_list = '\0';
3989 
3990             /* The casts are because otherwise some compilers warn:
3991                 gcc.gnu.org/bugzilla/show_bug.cgi?id=99950
3992                 gcc.gnu.org/bugzilla/show_bug.cgi?id=94182
3993              */
3994             PL_fold_locale[ (U8) 'I' ] = 'I';
3995             PL_fold_locale[ (U8) 'i' ] = 'i';
3996             PL_in_utf8_turkic_locale = TRUE;
3997             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
3998         }
3999 
4000         /* If we found problems and we want them output, do so */
4001         if (   (UNLIKELY(bad_count))
4002             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
4003         {
4004             /* WARNING.  If you change the wording of these; be sure to update
4005              * t/loc_tools.pl correspondingly */
4006 
4007             if (PL_warn_locale) {
4008                 sv_catpvs(PL_warn_locale, "\n");
4009             }
4010             else {
4011                 PL_warn_locale = newSVpvs("");
4012             }
4013 
4014             if (PL_in_utf8_CTYPE_locale) {
4015                 Perl_sv_catpvf(aTHX_ PL_warn_locale,
4016                      "Locale '%s' contains (at least) the following characters"
4017                      " which have\nunexpected meanings: %s\nThe Perl program"
4018                      " will use the expected meanings",
4019                       newctype, bad_chars_list);
4020             }
4021             else {
4022                 Perl_sv_catpvf(aTHX_ PL_warn_locale,
4023                                   "\nThe following characters (and maybe"
4024                                   " others) may not have the same meaning as"
4025                                   " the Perl program expects: %s\n",
4026                                   bad_chars_list
4027                             );
4028             }
4029 
4030 #    if defined(HAS_SOME_LANGINFO) || defined(WIN32)
4031 
4032             Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
4033                                  langinfo_c(CODESET, LC_CTYPE, newctype, NULL));
4034 
4035 #    endif
4036 
4037             Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
4038 
4039             /* If we are actually in the scope of the locale or are debugging,
4040              * output the message now.  If not in that scope, we save the
4041              * message to be output at the first operation using this locale,
4042              * if that actually happens.  Most programs don't use locales, so
4043              * they are immune to bad ones.  */
4044             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
4045 
4046                 /* The '0' below suppresses a bogus gcc compiler warning */
4047                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
4048                                                                             0);
4049                 if (IN_LC(LC_CTYPE)) {
4050                     SvREFCNT_dec_NN(PL_warn_locale);
4051                     PL_warn_locale = NULL;
4052                 }
4053             }
4054         }
4055     }
4056 }
4057 
4058 void
Perl_warn_problematic_locale()4059 Perl_warn_problematic_locale()
4060 {
4061     dTHX;
4062 
4063     /* Core-only function that outputs the message in PL_warn_locale,
4064      * and then NULLS it.  Should be called only through the macro
4065      * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
4066 
4067     if (PL_warn_locale) {
4068         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
4069                              SvPVX(PL_warn_locale),
4070                              0 /* dummy to avoid compiler warning */ );
4071         SvREFCNT_dec_NN(PL_warn_locale);
4072         PL_warn_locale = NULL;
4073     }
4074 }
4075 
4076 #  endif /* USE_LOCALE_CTYPE */
4077 
4078 STATIC void
S_new_LC_ALL(pTHX_ const char * lc_all,bool force)4079 S_new_LC_ALL(pTHX_ const char *lc_all, bool force)
4080 {
4081     PERL_ARGS_ASSERT_NEW_LC_ALL;
4082 
4083     /* new_LC_ALL() updates all the things we care about.  Note that this is
4084      * called just after a change, so uses the actual underlying locale just
4085      * set, and not the nominal one (should they differ, as they may in
4086      * LC_NUMERIC). */
4087 
4088     const char * individ_locales[LC_ALL_INDEX_] = { NULL };
4089 
4090     switch (parse_LC_ALL_string(lc_all,
4091                                 individ_locales,
4092                                 override_if_ignored,   /* Override any ignored
4093                                                           categories */
4094                                 true,   /* Always fill array */
4095                                 true,   /* Panic if fails, as to get here it
4096                                            earlier had to have succeeded */
4097                                 __LINE__))
4098     {
4099       case invalid:
4100       case no_array:
4101       case only_element_0:
4102         locale_panic_("Unexpected return from parse_LC_ALL_string");
4103 
4104       case full_array:
4105         break;
4106     }
4107 
4108     for_all_individual_category_indexes(i) {
4109         if (update_functions[i]) {
4110             const char * this_locale = individ_locales[i];
4111             update_functions[i](aTHX_ this_locale, force);
4112         }
4113 
4114         Safefree(individ_locales[i]);
4115     }
4116 }
4117 
4118 #  ifdef USE_LOCALE_COLLATE
4119 
4120 STATIC void
S_new_collate(pTHX_ const char * newcoll,bool force)4121 S_new_collate(pTHX_ const char *newcoll, bool force)
4122 {
4123     PERL_ARGS_ASSERT_NEW_COLLATE;
4124     PERL_UNUSED_ARG(force);
4125 
4126     /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
4127      * core Perl this and that 'newcoll' is the name of the new locale.
4128      *
4129      * The design of locale collation is that every locale change is given an
4130      * index 'PL_collation_ix'.  The first time a string participates in an
4131      * operation that requires collation while locale collation is active, it
4132      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
4133      * magic includes the collation index, and the transformation of the string
4134      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
4135      * instead of the string itself.  If a string changes, the magic is
4136      * cleared.  The next time the locale changes, the index is incremented,
4137      * and so we know during a comparison that the transformation is not
4138      * necessarily still valid, and so is recomputed.  Note that if the locale
4139      * changes enough times, the index could wrap, and it is possible that a
4140      * transformation would improperly be considered valid, leading to an
4141      * unlikely bug.  The value is declared to the widest possible type on this
4142      * platform. */
4143 
4144     /* Return if the locale isn't changing */
4145     if (strEQ(PL_collation_name, newcoll)) {
4146         return;
4147     }
4148 
4149     Safefree(PL_collation_name);
4150     PL_collation_name = savepv(newcoll);
4151     ++PL_collation_ix;
4152 
4153     /* Set the new one up if trivial.  Since this is called at process
4154      * initialization, be aware that this bit can't rely on much being
4155      * available. */
4156     PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
4157     if (PL_collation_standard) {
4158         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4159                                "Setting PL_collation name='%s'\n",
4160                                PL_collation_name));
4161         PL_collxfrm_base = 0;
4162         PL_collxfrm_mult = 2;
4163         PL_in_utf8_COLLATE_locale = FALSE;
4164         PL_strxfrm_NUL_replacement = '\0';
4165         PL_strxfrm_max_cp = 0;
4166         return;
4167     }
4168 
4169     /* Flag that the remainder of the set up is being deferred until first
4170      * need. */
4171     PL_collxfrm_mult = 0;
4172     PL_collxfrm_base = 0;
4173 
4174 }
4175 
4176 #  endif /* USE_LOCALE_COLLATE */
4177 
4178 #  ifdef WIN32
4179 
4180 STATIC wchar_t *
S_Win_byte_string_to_wstring(const UINT code_page,const char * byte_string)4181 S_Win_byte_string_to_wstring(const UINT code_page, const char * byte_string)
4182 {
4183     /* Caller must arrange to free the returned string */
4184 
4185     int req_size = MultiByteToWideChar(code_page, 0, byte_string, -1, NULL, 0);
4186     if (! req_size) {
4187         SET_EINVAL;
4188         return NULL;
4189     }
4190 
4191     wchar_t *wstring;
4192     Newx(wstring, req_size, wchar_t);
4193 
4194     if (! MultiByteToWideChar(code_page, 0, byte_string, -1, wstring, req_size))
4195     {
4196         Safefree(wstring);
4197         SET_EINVAL;
4198         return NULL;
4199     }
4200 
4201     return wstring;
4202 }
4203 
4204 #    define Win_utf8_string_to_wstring(s)                                   \
4205                                     Win_byte_string_to_wstring(CP_UTF8, (s))
4206 
4207 STATIC char *
S_Win_wstring_to_byte_string(const UINT code_page,const wchar_t * wstring)4208 S_Win_wstring_to_byte_string(const UINT code_page, const wchar_t * wstring)
4209 {
4210     /* Caller must arrange to free the returned string */
4211 
4212     int req_size =
4213             WideCharToMultiByte(code_page, 0, wstring, -1, NULL, 0, NULL, NULL);
4214 
4215     char *byte_string;
4216     Newx(byte_string, req_size, char);
4217 
4218     if (! WideCharToMultiByte(code_page, 0, wstring, -1, byte_string,
4219                                                          req_size, NULL, NULL))
4220     {
4221         Safefree(byte_string);
4222         SET_EINVAL;
4223         return NULL;
4224     }
4225 
4226     return byte_string;
4227 }
4228 
4229 #    define Win_wstring_to_utf8_string(ws)                                  \
4230                                    Win_wstring_to_byte_string(CP_UTF8, (ws))
4231 
4232 STATIC const char *
S_wrap_wsetlocale(pTHX_ const int category,const char * locale)4233 S_wrap_wsetlocale(pTHX_ const int category, const char *locale)
4234 {
4235     PERL_ARGS_ASSERT_WRAP_WSETLOCALE;
4236 
4237     /* Calls _wsetlocale(), converting the parameters/return to/from
4238      * Perl-expected forms as if plain setlocale() were being called instead.
4239      *
4240      * Caller must arrange for the returned PV to be freed.
4241      */
4242 
4243     const wchar_t * wlocale = NULL;
4244 
4245     if (locale) {
4246         wlocale = Win_utf8_string_to_wstring(locale);
4247         if (! wlocale) {
4248             return NULL;
4249         }
4250     }
4251 
4252     WSETLOCALE_LOCK;
4253     const wchar_t * wresult = _wsetlocale(category, wlocale);
4254 
4255     if (! wresult) {
4256         WSETLOCALE_UNLOCK;
4257         Safefree(wlocale);
4258         return NULL;
4259     }
4260 
4261     const char * result = Win_wstring_to_utf8_string(wresult);
4262     WSETLOCALE_UNLOCK;
4263 
4264     Safefree(wlocale);
4265     return result;
4266 }
4267 
4268 STATIC const char *
S_win32_setlocale(pTHX_ int category,const char * locale)4269 S_win32_setlocale(pTHX_ int category, const char* locale)
4270 {
4271     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
4272      * difference between the two unless the input locale is "", which normally
4273      * means on Windows to get the machine default, which is set via the
4274      * computer's "Regional and Language Options" (or its current equivalent).
4275      * In POSIX, it instead means to find the locale from the user's
4276      * environment.  This routine changes the Windows behavior to try the POSIX
4277      * behavior first.  Further details are in the called function
4278      * find_locale_from_environment().
4279      */
4280 
4281     if (locale != NULL && strEQ(locale, "")) {
4282         /* Note this function may change the locale, but that's ok because we
4283          * are about to change it anyway */
4284         locale = find_locale_from_environment(get_category_index(category));
4285         if (locale == NULL) {
4286             SET_EINVAL;
4287             return NULL;
4288         }
4289     }
4290 
4291     const char * result = wrap_wsetlocale(category, locale);
4292     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
4293                           setlocale_debug_string_r(category, locale, result)));
4294 
4295     if (! result) {
4296         SET_EINVAL;
4297         return NULL;
4298     }
4299 
4300     save_to_buffer(result, &PL_setlocale_buf, &PL_setlocale_bufsize);
4301 
4302 #    ifndef USE_PL_CUR_LC_ALL
4303 
4304     Safefree(result);
4305 
4306 #  else
4307 
4308     /* Here, we need to keep track of LC_ALL, so store the new value.  but if
4309      * the input locale is NULL, we were just querying, so the original value
4310      * hasn't changed */
4311     if (locale == NULL) {
4312         Safefree(result);
4313     }
4314     else {
4315 
4316         /* If we set LC_ALL directly above, we already know its new value; but
4317          * if we changed just an individual category, find the new LC_ALL */
4318         if (category != LC_ALL) {
4319             Safefree(result);
4320             result = wrap_wsetlocale(LC_ALL, NULL);
4321         }
4322 
4323         Safefree(PL_cur_LC_ALL);
4324         PL_cur_LC_ALL = result;
4325     }
4326 
4327     DEBUG_L(PerlIO_printf(Perl_debug_log, "new PL_cur_LC_ALL=%s\n",
4328                                           PL_cur_LC_ALL));
4329 #    endif
4330 
4331     return PL_setlocale_buf;
4332 }
4333 
4334 #  endif
4335 
4336 STATIC const char *
S_native_querylocale_i(pTHX_ const locale_category_index cat_index)4337 S_native_querylocale_i(pTHX_ const locale_category_index cat_index)
4338 {
4339     /* Determine the current locale and return it in the form the platform's
4340      * native locale handling understands.  This is different only from our
4341      * internal form for the LC_ALL category, as platforms differ in how they
4342      * represent that.
4343      *
4344      * This is only called from Perl_setlocale().  As such it returns in
4345      * PL_setlocale_buf */
4346 
4347 #  ifdef USE_LOCALE_NUMERIC
4348 
4349     /* We have the LC_NUMERIC name saved, because we are normally switched into
4350      * the C locale (or equivalent) for it. */
4351     if (cat_index == LC_NUMERIC_INDEX_) {
4352 
4353         /* We don't have to copy this return value, as it is a per-thread
4354          * variable, and won't change until a future setlocale */
4355         return PL_numeric_name;
4356     }
4357 
4358 #  endif
4359 #  ifdef LC_ALL
4360 
4361     if (cat_index != LC_ALL_INDEX_)
4362 
4363 #  endif
4364 
4365     {
4366         /* Here, not LC_ALL, and not LC_NUMERIC: the actual and native values
4367          * match */
4368 
4369 #  ifdef setlocale_i    /* Can shortcut if this is defined */
4370 
4371         return setlocale_i(cat_index, NULL);
4372 
4373 #  else
4374 
4375         return save_to_buffer(querylocale_i(cat_index),
4376                               &PL_setlocale_buf, &PL_setlocale_bufsize);
4377 #  endif
4378 
4379     }
4380 
4381     /* Below, querying LC_ALL */
4382 
4383 #  ifdef LC_ALL
4384 #    ifdef USE_PL_CURLOCALES
4385 #      define LC_ALL_ARG  PL_curlocales
4386 #    else
4387 #      define LC_ALL_ARG  NULL  /* Causes calculate_LC_ALL_string() to find the
4388                                    locale using a querylocale function */
4389 #    endif
4390 
4391     return calculate_LC_ALL_string(LC_ALL_ARG, EXTERNAL_FORMAT_FOR_QUERY,
4392                                    WANT_PL_setlocale_buf,
4393                                    __LINE__);
4394 #    undef LC_ALL_ARG
4395 #  endif    /* has LC_ALL */
4396 
4397 }
4398 
4399 #endif      /* USE_LOCALE */
4400 
4401 /*
4402 =for apidoc Perl_setlocale
4403 
4404 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
4405 taking the same parameters, and returning the same information, except that it
4406 returns the correct underlying C<LC_NUMERIC> locale.  Regular C<setlocale> will
4407 instead return C<C> if the underlying locale has a non-dot decimal point
4408 character, or a non-empty thousands separator for displaying floating point
4409 numbers.  This is because perl keeps that locale category such that it has a
4410 dot and empty separator, changing the locale briefly during the operations
4411 where the underlying one is required. C<Perl_setlocale> knows about this, and
4412 compensates; regular C<setlocale> doesn't.
4413 
4414 Another reason it isn't completely a drop-in replacement is that it is
4415 declared to return S<C<const char *>>, whereas the system setlocale omits the
4416 C<const> (presumably because its API was specified long ago, and can't be
4417 updated; it is illegal to change the information C<setlocale> returns; doing
4418 so leads to segfaults.)
4419 
4420 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
4421 C<setlocale> can be completely ineffective on some platforms under some
4422 configurations.
4423 
4424 Changing the locale is not a good idea when more than one thread is running,
4425 except on systems where the predefined variable C<${^SAFE_LOCALES}> is
4426 non-zero.  This is because on such systems the locale is global to the whole
4427 process and not local to just the thread calling the function.  So changing it
4428 in one thread instantaneously changes it in all.  On some such systems, the
4429 system C<setlocale()> is ineffective, returning the wrong information, and
4430 failing to actually change the locale.  z/OS refuses to try to change the
4431 locale once a second thread is created.  C<Perl_setlocale>, should give you
4432 accurate results of what actually happened on these problematic platforms,
4433 returning NULL if the system forbade the locale change.
4434 
4435 The return points to a per-thread static buffer, which is overwritten the next
4436 time C<Perl_setlocale> is called from the same thread.
4437 
4438 =cut
4439 
4440 */
4441 
4442 const char *
Perl_setlocale(const int category,const char * locale)4443 Perl_setlocale(const int category, const char * locale)
4444 {
4445     /* This wraps POSIX::setlocale() */
4446 
4447 #ifndef USE_LOCALE
4448 
4449     PERL_UNUSED_ARG(category);
4450     PERL_UNUSED_ARG(locale);
4451 
4452     return "C";
4453 
4454 #else
4455 
4456     dTHX;
4457 
4458     DEBUG_L(PerlIO_printf(Perl_debug_log,
4459                           "Entering Perl_setlocale(%d, \"%s\")\n",
4460                           category, locale));
4461 
4462     bool valid_category;
4463     locale_category_index cat_index = get_category_index_helper(category,
4464                                                                 &valid_category,
4465                                                                 __LINE__);
4466     if (! valid_category) {
4467         if (ckWARN(WARN_LOCALE)) {
4468             const char * conditional_warn_text;
4469             if (locale == NULL) {
4470                 conditional_warn_text = "";
4471                 locale = "";
4472             }
4473             else {
4474                 conditional_warn_text = "; can't set it to ";
4475             }
4476 
4477             /* diag_listed_as: Unknown locale category %d; can't set it to %s */
4478             Perl_warner(aTHX_
4479                            packWARN(WARN_LOCALE),
4480                            "Unknown locale category %d%s%s",
4481                            category, conditional_warn_text, locale);
4482         }
4483 
4484         SET_EINVAL;
4485         return NULL;
4486     }
4487 
4488 #  ifdef setlocale_i
4489 
4490     /* setlocale_i() gets defined only on Configurations that use setlocale()
4491      * in a simple manner that adequately handles all cases.  If this category
4492      * doesn't have any perl complications, just do that. */
4493     if (! update_functions[cat_index]) {
4494         return setlocale_i(cat_index, locale);
4495     }
4496 
4497 #  endif
4498 
4499     /* Get current locale */
4500     const char * current_locale = native_querylocale_i(cat_index);
4501 
4502     /* A NULL locale means only query what the current one is. */
4503     if (locale == NULL) {
4504         return current_locale;
4505     }
4506 
4507     if (strEQ(current_locale, locale)) {
4508         DEBUG_L(PerlIO_printf(Perl_debug_log,
4509                              "Already in requested locale: no action taken\n"));
4510         return current_locale;
4511     }
4512 
4513     /* Here, an actual change is being requested.  Do it */
4514     if (! bool_setlocale_i(cat_index, locale)) {
4515         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
4516                           setlocale_debug_string_i(cat_index, locale, "NULL")));
4517         return NULL;
4518     }
4519 
4520     /* At this point, the locale has been changed based on the requested value,
4521      * and the querylocale_i() will return the actual new value that the system
4522      * has for the category.  That may not be the same as the input, as libc
4523      * may have returned a synonymous locale name instead of the input one; or,
4524      * if there are locale categories that we are compiled to ignore, any
4525      * attempt to change them away from "C" is overruled */
4526     current_locale = querylocale_i(cat_index);
4527 
4528     /* But certain categories need further work.  For example we may need to
4529      * calculate new folding or collation rules.  And for LC_NUMERIC, we have
4530      * to switch into a locale that has a dot radix. */
4531     if (update_functions[cat_index]) {
4532         update_functions[cat_index](aTHX_ current_locale,
4533                                           /* No need to force recalculation, as
4534                                            * aren't coming from a situation
4535                                            * where Perl hasn't been controlling
4536                                            * the locale, so has accurate
4537                                            * records. */
4538                                           false);
4539     }
4540 
4541     /* Make sure the result is in a stable buffer for the caller's use, and is
4542      * in the expected format */
4543     current_locale = native_querylocale_i(cat_index);
4544 
4545     DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", current_locale));
4546 
4547     return current_locale;
4548 
4549 #endif
4550 
4551 }
4552 
4553 #ifdef USE_LOCALE
4554 #  ifdef DEBUGGING
4555 
4556 STATIC char *
S_my_setlocale_debug_string_i(pTHX_ const locale_category_index cat_index,const char * locale,const char * retval,const line_t line)4557 S_my_setlocale_debug_string_i(pTHX_
4558                               const locale_category_index cat_index,
4559                               const char* locale, /* Optional locale name */
4560 
4561                               /* return value from setlocale() when attempting
4562                                * to set 'category' to 'locale' */
4563                               const char* retval,
4564 
4565                               const line_t line)
4566 {
4567     /* Returns a pointer to a NUL-terminated string in static storage with
4568      * added text about the info passed in.  This is not thread safe and will
4569      * be overwritten by the next call, so this should be used just to
4570      * formulate a string to immediately print or savepv() on. */
4571 
4572     const char * locale_quote;
4573     const char * retval_quote;
4574 
4575     if (locale == NULL) {
4576         locale_quote = "";
4577         locale = "NULL";
4578     }
4579     else {
4580         locale_quote = "\"";
4581     }
4582 
4583     if (retval == NULL) {
4584         retval_quote = "";
4585         retval = "NULL";
4586     }
4587     else {
4588         retval_quote = "\"";
4589     }
4590 
4591 #  ifdef MULTIPLICITY
4592 #    define THREAD_FORMAT "%p:"
4593 #    define THREAD_ARGUMENT aTHX_
4594 #  else
4595 #    define THREAD_FORMAT
4596 #    define THREAD_ARGUMENT
4597 #  endif
4598 
4599     return Perl_form(aTHX_
4600                      "%s:%" LINE_Tf ": " THREAD_FORMAT
4601                      " setlocale(%s[%d], %s%s%s) returned %s%s%s\n",
4602 
4603                      __FILE__, line, THREAD_ARGUMENT
4604                      category_names[cat_index], categories[cat_index],
4605                      locale_quote, locale, locale_quote,
4606                      retval_quote, retval, retval_quote);
4607 }
4608 
4609 #  endif
4610 
4611 /* If this implementation hasn't defined these macros, they aren't needed */
4612 #  ifndef TOGGLE_LOCK
4613 #    define TOGGLE_LOCK(i)
4614 #    define TOGGLE_UNLOCK(i)
4615 #  endif
4616 
4617 STATIC const char *
S_toggle_locale_i(pTHX_ const locale_category_index cat_index,const char * new_locale,const line_t caller_line)4618 S_toggle_locale_i(pTHX_ const locale_category_index cat_index,
4619                         const char * new_locale,
4620                         const line_t caller_line)
4621 {
4622     PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
4623 
4624     /* Changes the locale for the category specified by 'index' to 'new_locale,
4625      * if they aren't already the same.  EVERY CALL to this function MUST HAVE
4626      * a corresponding call to restore_toggled_locale_i()
4627      *
4628      * Returns a copy of the name of the original locale for 'cat_index'
4629      * so can be switched back to with the companion function
4630      * restore_toggled_locale_i(),  (NULL if no restoral is necessary.) */
4631 
4632     /* Find the original locale of the category we may need to change, so that
4633      * it can be restored to later */
4634     const char * locale_to_restore_to = querylocale_i(cat_index);
4635 
4636     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4637                            "Entering toggle_locale_i: index=%d(%s),"        \
4638                            " wanted=%s, actual=%s; called from %" LINE_Tf   \
4639                            "\n", cat_index, category_names[cat_index],
4640                            new_locale, locale_to_restore_to ? locale_to_restore_to : "(null)",
4641                            caller_line));
4642 
4643     if (! locale_to_restore_to) {
4644         locale_panic_via_(Perl_form(aTHX_
4645                                     "Could not find current %s locale",
4646                                     category_names[cat_index]),
4647                          __FILE__, caller_line);
4648     }
4649 
4650     /* Begin a critical section on platforms that need it.  We do this even if
4651      * we don't have to change here, so as to prevent other instances from
4652      * changing the locale out from under us. */
4653     TOGGLE_LOCK(cat_index);
4654 
4655     /* If the locales are the same, there's nothing to do */
4656     if (strEQ(locale_to_restore_to, new_locale)) {
4657         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
4658                                                category_names[cat_index],
4659                                                new_locale));
4660         return NULL;
4661     }
4662 
4663     /* Finally, change the locale to the new one */
4664     void_setlocale_i_with_caller(cat_index, new_locale, __FILE__, caller_line);
4665 
4666     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4667                            "%s locale switched to %s\n",
4668                            category_names[cat_index], new_locale));
4669 
4670     return locale_to_restore_to;
4671 
4672 #  ifndef DEBUGGING
4673     PERL_UNUSED_ARG(caller_line);
4674 #  endif
4675 
4676 }
4677 
4678 STATIC void
S_restore_toggled_locale_i(pTHX_ const locale_category_index cat_index,const char * restore_locale,const line_t caller_line)4679 S_restore_toggled_locale_i(pTHX_ const locale_category_index cat_index,
4680                                  const char * restore_locale,
4681                                  const line_t caller_line)
4682 {
4683     /* Restores the locale for LC_category corresponding to cat_index to
4684      * 'restore_locale' (which is a copy that will be freed by this function),
4685      * or do nothing if the latter parameter is NULL */
4686 
4687     PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
4688 
4689     if (restore_locale == NULL) {
4690         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4691                                "restore_toggled_locale_i: No need to"       \
4692                                " restore %s; called from %" LINE_Tf "\n",   \
4693                                category_names[cat_index], caller_line));
4694         TOGGLE_UNLOCK(cat_index);
4695         return;
4696     }
4697 
4698     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4699                            "restore_toggled_locale_i: restoring locale for" \
4700                            " %s to  %s; called from %" LINE_Tf "\n",        \
4701                            category_names[cat_index], restore_locale,
4702                            caller_line));
4703 
4704     void_setlocale_i_with_caller(cat_index, restore_locale,
4705                                   __FILE__, caller_line);
4706     TOGGLE_UNLOCK(cat_index);
4707 
4708 #  ifndef DEBUGGING
4709     PERL_UNUSED_ARG(caller_line);
4710 #  endif
4711 
4712 }
4713 
4714 #endif
4715 #if defined(USE_LOCALE) || defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)
4716 
4717 STATIC utf8ness_t
S_get_locale_string_utf8ness_i(pTHX_ const char * string,const locale_utf8ness_t known_utf8,const char * locale,const locale_category_index cat_index)4718 S_get_locale_string_utf8ness_i(pTHX_ const char * string,
4719                                      const locale_utf8ness_t known_utf8,
4720                                      const char * locale,
4721                                      const locale_category_index cat_index)
4722 {
4723     PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
4724 
4725 #  ifndef USE_LOCALE
4726 
4727     return UTF8NESS_NO;
4728     PERL_UNUSED_ARG(string);
4729     PERL_UNUSED_ARG(known_utf8);
4730     PERL_UNUSED_ARG(locale);
4731     PERL_UNUSED_ARG(cat_index);
4732 
4733 #  else
4734 
4735     /* Return to indicate if 'string' in the locale given by the input
4736      * arguments should be considered UTF-8 or not.
4737      *
4738      * If the input 'locale' is not NULL, use that for the locale; otherwise
4739      * use the current locale for the category specified by 'cat_index'.
4740      */
4741 
4742     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4743                            "Entering get_locale_string_utf8ness_i; locale=%s,"
4744                            " index=%u(%s), string=%s, known_utf8=%d\n",
4745                            locale, cat_index, category_names[cat_index],
4746                            ((string)
4747                             ?  _byte_dump_string((U8 *) string,
4748                                                  strlen(string),
4749                                                  0)
4750                             : "nil"),
4751                            known_utf8));
4752     if (string == NULL) {
4753         return UTF8NESS_IMMATERIAL;
4754     }
4755 
4756     if (IN_BYTES) { /* respect 'use bytes' */
4757         return UTF8NESS_NO;
4758     }
4759 
4760     Size_t len = strlen(string);
4761 
4762     /* UTF8ness is immaterial if the representation doesn't vary */
4763     const U8 * first_variant = NULL;
4764     if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
4765         return UTF8NESS_IMMATERIAL;
4766     }
4767 
4768     /* Can't be UTF-8 if invalid */
4769     if (! is_strict_utf8_string((U8 *) first_variant,
4770                                 len - ((char *) first_variant - string)))
4771     {
4772         return UTF8NESS_NO;
4773     }
4774 
4775     /* Here and below, we know the string is legal UTF-8, containing at least
4776      * one character requiring a sequence of two or more bytes.  It is quite
4777      * likely to be UTF-8.  But it pays to be paranoid and do further checking.
4778      *
4779      * If we already know the UTF-8ness of the locale, then we immediately know
4780      * what the string is */
4781     if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
4782         return (known_utf8 == LOCALE_IS_UTF8) ? UTF8NESS_YES : UTF8NESS_NO;
4783     }
4784 
4785     if (locale == NULL) {
4786         locale = querylocale_i(cat_index);
4787     }
4788 
4789     /* If the locale is UTF-8, the string is UTF-8;  otherwise it was
4790      * coincidental that the string is legal UTF-8
4791      *
4792      * However, if the perl is compiled to not pay attention to the category
4793      * being passed in, you might think that that locale is essentially always
4794      * the C locale, so it would make sense to say it isn't UTF-8.  But to get
4795      * here, the string has to contain characters unknown in the C locale.  And
4796      * in fact, Windows boxes are compiled without LC_MESSAGES, as their
4797      * message catalog isn't really a part of the locale system.  But those
4798      * messages really could be UTF-8, and given that the odds are rather small
4799      * of something not being UTF-8 but being syntactically valid UTF-8, khw
4800      * has decided to call such strings as UTF-8. */
4801     return (is_locale_utf8(locale)) ? UTF8NESS_YES : UTF8NESS_NO;
4802 
4803 #  endif
4804 
4805 }
4806 
4807 STATIC bool
S_is_locale_utf8(pTHX_ const char * locale)4808 S_is_locale_utf8(pTHX_ const char * locale)
4809 {
4810     PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
4811 
4812     /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. */
4813 
4814 #  if ! defined(USE_LOCALE)                                                   \
4815    || ! defined(USE_LOCALE_CTYPE)                                             \
4816    ||   defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
4817 
4818     PERL_UNUSED_ARG(locale);
4819 
4820     return FALSE;
4821 
4822      /* Definitively, can't be UTF-8 */
4823 #    define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4824 #  else
4825 
4826     /* If the input happens to be the same locale as we are currently setup
4827      * for, the answer has already been cached. */
4828     if (strEQ(locale, PL_ctype_name)) {
4829         return PL_in_utf8_CTYPE_locale;
4830     }
4831 
4832     if (isNAME_C_OR_POSIX(locale)) {
4833         return false;
4834     }
4835 
4836 #    if ! defined(HAS_SOME_LANGINFO) && ! defined(WIN32)
4837 
4838     /* On non-Windows without nl_langinfo(), we have to do some digging to get
4839      * the answer.  First, toggle to the desired locale so can query its state
4840      * */
4841     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
4842 
4843 #      define TEARDOWN_FOR_IS_LOCALE_UTF8                                   \
4844                       restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
4845 
4846 #      ifdef MB_CUR_MAX
4847 
4848     /* If there are fewer bytes available in this locale than are required
4849      * to represent the largest legal UTF-8 code point, this isn't a UTF-8
4850      * locale. */
4851     const int mb_cur_max = MB_CUR_MAX;
4852     if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
4853         TEARDOWN_FOR_IS_LOCALE_UTF8;
4854         return false;
4855     }
4856 
4857 #      endif
4858 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4859 
4860          /* With these functions, we can definitively determine a locale's
4861           * UTF-8ness */
4862 #        define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4863 
4864     /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT CHARACTER
4865      * as that Unicode code point, this has to be a UTF-8 locale; otherwise it
4866      * can't be  */
4867     wchar_t wc = 0;
4868     (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4869     int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4870                                   STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4871     TEARDOWN_FOR_IS_LOCALE_UTF8;
4872     return (   mbtowc_ret == STRLENs(REPLACEMENT_CHARACTER_UTF8)
4873             && wc == UNICODE_REPLACEMENT);
4874 
4875 #      else
4876 
4877         /* If the above two C99 functions aren't working, you could try some
4878          * different methods.  It seems likely that the obvious choices,
4879          * wctomb() and wcrtomb(), wouldn't be working either.  But you could
4880          * choose one of the dozen-ish Unicode titlecase triples and verify
4881          * that towupper/towlower work as expected.
4882          *
4883          * But, our emulation of nl_langinfo() works quite well, so avoid the
4884          * extra code until forced to by some weird non-conforming platform. */
4885 #        define USE_LANGINFO_FOR_UTF8NESS
4886 #        undef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4887 #      endif
4888 #    else
4889 
4890      /* On Windows or on platforms with nl_langinfo(), there is a direct way to
4891       * get the locale's codeset, which will be some form of 'UTF-8' for a
4892       * UTF-8 locale.  langinfo_c() handles this, and we will call that
4893       * below */
4894 #      define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4895 #      define USE_LANGINFO_FOR_UTF8NESS
4896 #      define TEARDOWN_FOR_IS_LOCALE_UTF8
4897 #    endif  /* USE_LANGINFO_FOR_UTF8NESS */
4898 
4899      /* If the above compiled into code, it found the locale's UTF-8ness,
4900       * nothing more to do; if it didn't get compiled,
4901       * USE_LANGINFO_FOR_UTF8NESS is defined.  There are two possible reasons:
4902       *   1)  it is the preferred method because it knows directly for sure
4903       *       what the codeset is because the platform has libc functions that
4904       *       return this; or
4905       *   2)  the functions the above code section would compile to use don't
4906       *       exist or are unreliable on this platform; we are less sure of the
4907       *       langinfo_c() result, though it is very unlikely to be wrong
4908       *       about if it is UTF-8 or not */
4909 #    ifdef USE_LANGINFO_FOR_UTF8NESS
4910 
4911     const char * codeset = langinfo_c(CODESET, LC_CTYPE, locale, NULL);
4912     bool retval = is_codeset_name_UTF8(codeset);
4913 
4914     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4915                            "found codeset=%s, is_utf8=%d\n", codeset, retval));
4916     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "is_locale_utf8(%s) returning %d\n",
4917                                                             locale, retval));
4918     TEARDOWN_FOR_IS_LOCALE_UTF8;
4919     return retval;
4920 
4921 #    endif
4922 #  endif      /* End of the #else clause, for the non-trivial case */
4923 
4924 }
4925 
4926 #endif
4927 
4928 #ifdef USE_LOCALE
4929 #  ifdef USE_LOCALE_CTYPE
4930 
4931 STATIC bool
S_is_codeset_name_UTF8(const char * name)4932 S_is_codeset_name_UTF8(const char * name)
4933 {
4934     /* Return a boolean as to if the passed-in name indicates it is a UTF-8
4935      * code set.  Several variants are possible */
4936     const Size_t len = strlen(name);
4937 
4938     PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
4939 
4940 #    ifdef WIN32
4941 
4942     /* https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers */
4943     if (memENDs(name, len, "65001")) {
4944         return TRUE;
4945     }
4946 
4947 #    endif
4948                /* 'UTF8' or 'UTF-8' */
4949     return (    inRANGE(len, 4, 5)
4950             &&  name[len-1] == '8'
4951             && (   memBEGINs(name, len, "UTF")
4952                 || memBEGINs(name, len, "utf"))
4953             && (len == 4 || name[3] == '-'));
4954 }
4955 
4956 #  endif
4957 #  ifdef WIN32
4958 
4959 bool
Perl_get_win32_message_utf8ness(pTHX_ const char * string)4960 Perl_get_win32_message_utf8ness(pTHX_ const char * string)
4961 {
4962     /* This is because Windows doesn't have LC_MESSAGES. */
4963 
4964 #    ifdef USE_LOCALE_CTYPE
4965 
4966     /* We don't know the locale utf8ness here, and not even the locale itself.
4967      * Since Windows uses a different mechanism to specify message language
4968      * output than the locale system, it is going to be problematic deciding
4969      * if we are to store it as UTF-8 or not.  By specifying LOCALE_IS_UTF8, we
4970      * are telling the called function to return true iff the string has
4971      * non-ASCII characters in it that are all syntactically UTF-8.  We are
4972      * thus relying on the fact that a string that is syntactically valid UTF-8
4973      * is likely to be UTF-8.  Should this ever cause problems, this function
4974      * could be replaced by something more Windows-specific */
4975     return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8,
4976                                         NULL, LC_CTYPE_INDEX_);
4977 #    else
4978 
4979     PERL_UNUSED_ARG(string);
4980     return false;
4981 
4982 #    endif
4983 
4984 }
4985 
4986 #  endif
4987 
4988 STATIC void
S_set_save_buffer_min_size(pTHX_ Size_t min_len,char ** buf,Size_t * buf_cursize)4989 S_set_save_buffer_min_size(pTHX_ Size_t min_len,
4990                                  char **buf,
4991                                  Size_t * buf_cursize)
4992 {
4993     /* Make sure the buffer pointed to by *buf is at least as large 'min_len';
4994      * *buf_cursize is the size of 'buf' upon entry; it will be updated to the
4995      * new size on exit.  'buf_cursize' being NULL is to be used when this is a
4996      * single use buffer, which will shortly be freed by the caller. */
4997 
4998     if (buf_cursize == NULL) {
4999         Newx(*buf, min_len, char);
5000     }
5001     else if (*buf_cursize == 0) {
5002         Newx(*buf, min_len, char);
5003         *buf_cursize = min_len;
5004     }
5005     else if (min_len > *buf_cursize) {
5006         Renew(*buf, min_len, char);
5007         *buf_cursize = min_len;
5008     }
5009 }
5010 
5011 STATIC const char *
S_save_to_buffer(pTHX_ const char * string,char ** buf,Size_t * buf_size)5012 S_save_to_buffer(pTHX_ const char * string, char **buf, Size_t *buf_size)
5013 {
5014     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
5015 
5016     /* Copy the NUL-terminated 'string' to a buffer whose address before this
5017      * call began at *buf, and whose available length before this call was
5018      * *buf_size.
5019      *
5020      * If the length of 'string' is greater than the space available, the
5021      * buffer is grown accordingly, which may mean that it gets relocated.
5022      * *buf and *buf_size will be updated to reflect this.
5023      *
5024      * Regardless, the function returns a pointer to where 'string' is now
5025      * stored.
5026      *
5027      * 'string' may be NULL, which means no action gets taken, and NULL is
5028      * returned.
5029      *
5030      * 'buf_size' being NULL is to be used when this is a single use buffer,
5031      * which will shortly be freed by the caller.
5032      *
5033      * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
5034      * empty, and memory is malloc'd.
5035      */
5036 
5037     if (! string) {
5038         return NULL;
5039     }
5040 
5041     /* No-op to copy over oneself */
5042     if (string == *buf) {
5043         return string;
5044     }
5045 
5046     Size_t string_size = strlen(string) + 1;
5047     set_save_buffer_min_size(string_size, buf, buf_size);
5048 
5049 #  ifdef DEBUGGING
5050 
5051     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5052                          "Copying '%s' to %p\n",
5053                          ((is_strict_utf8_string((U8 *) string, 0))
5054                           ? string
5055                           :_byte_dump_string((U8 *) string, strlen(string), 0)),
5056                           *buf));
5057 
5058 #    ifdef USE_LOCALE_CTYPE
5059 
5060     /* Catch glitches.  Usually this is because LC_CTYPE needs to be the same
5061      * locale as whatever is being worked on */
5062     if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) {
5063         locale_panic_(Perl_form(aTHX_
5064                                 "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s",
5065                                 string, get_LC_ALL_display()));
5066     }
5067 
5068 #    endif
5069 #  endif
5070 
5071     Copy(string, *buf, string_size, char);
5072     return *buf;
5073 }
5074 
5075 #endif
5076 
5077 int
Perl_mbtowc_(pTHX_ const wchar_t * pwc,const char * s,const Size_t len)5078 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
5079 {
5080 
5081 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
5082 
5083     PERL_UNUSED_ARG(pwc);
5084     PERL_UNUSED_ARG(s);
5085     PERL_UNUSED_ARG(len);
5086     return -1;
5087 
5088 #else   /* Below we have some form of mbtowc() */
5089 #  if defined(HAS_MBRTOWC)                                      \
5090    && (defined(MULTIPLICITY) || ! defined(HAS_MBTOWC))
5091 #    define USE_MBRTOWC
5092 #  else
5093 #    undef USE_MBRTOWC
5094 #  endif
5095 
5096     CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
5097     int retval = -1;
5098 
5099     if (s == NULL) { /* Initialize the shift state to all zeros in
5100                         PL_mbrtowc_ps. */
5101 
5102 #  if defined(USE_MBRTOWC)
5103 
5104         memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
5105         return 0;
5106 
5107 #  else
5108 
5109         SETERRNO(0, 0);
5110         MBTOWC_LOCK_;
5111         retval = mbtowc(NULL, NULL, 0);
5112         MBTOWC_UNLOCK_;
5113         return retval;
5114 
5115 #  endif
5116 
5117     }
5118 
5119 #  if defined(USE_MBRTOWC)
5120 
5121     SETERRNO(0, 0);
5122     MBRTOWC_LOCK_;
5123     retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
5124     MBRTOWC_UNLOCK_;
5125 
5126 #  else
5127 
5128     /* Locking prevents races, but locales can be switched out without locking,
5129      * so this isn't a cure all */
5130     SETERRNO(0, 0);
5131     MBTOWC_LOCK_;
5132     retval = mbtowc((wchar_t *) pwc, s, len);
5133     MBTOWC_UNLOCK_;
5134 
5135 #  endif
5136 
5137     return retval;
5138 
5139 #endif
5140 
5141 }
5142 
5143 /*
5144 =for apidoc Perl_localeconv
5145 
5146 This is a thread-safe version of the libc L<localeconv(3)>.  It is the same as
5147 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
5148 fields), but directly callable from XS code.  The hash is mortalized, so must
5149 be dealt with immediately.
5150 
5151 =cut
5152 */
5153 
5154 /* All Wndows versions we support, except possibly MingW, have general
5155  * thread-safety, and even localeconv() is thread safe, returning into a
5156  * per-thread buffer.  MingW when built with a modern MS C runtime (UCRT as of
5157  * this writing), also has those things.
5158  *
5159  * FreeBSD's localeconv() when used with uselocale() is supposed to be
5160  * thread-safe (as is their localeconv_l()), but we currently don't use
5161  * thread-safe locales there because of bugs. There may be other thread-safe
5162  * localeconv() implementations, especially on *BSD derivatives, but khw knows
5163  * of none, and hasn't really investigated, in part because of the past
5164  * unreliability of vendor thread-safety claims */
5165 #if defined(WIN32) && (defined(_MSC_VER) || (defined(_UCRT)))
5166 #  define LOCALECONV_IS_THREAD_SAFE
5167 #endif
5168 
5169 /* When multiple threads can be going at once, we need a critical section
5170  * around doing the localeconv() and saving its return, unless localeconv() is
5171  * thread-safe, and we are using it in a thread-safe manner, which we are only
5172  * doing if safe threads are available and we don't have a broken localeconv()
5173  * */
5174 #if  defined(USE_THREADS)                               \
5175  && (   ! defined(LOCALECONV_IS_THREAD_SAFE)            \
5176      || ! defined(USE_THREAD_SAFE_LOCALE)               \
5177      ||   defined(TS_W32_BROKEN_LOCALECONV))
5178 #  define LOCALECONV_NEEDS_CRITICAL_SECTION
5179 #endif
5180 
5181 HV *
Perl_localeconv(pTHX)5182 Perl_localeconv(pTHX)
5183 {
5184     return (HV *) sv_2mortal((SV *) my_localeconv(0));
5185 }
5186 
5187 HV *
S_my_localeconv(pTHX_ const int item)5188 S_my_localeconv(pTHX_ const int item)
5189 {
5190     PERL_ARGS_ASSERT_MY_LOCALECONV;
5191 
5192     /* This returns a mortalized hash containing all or certain elements
5193      * returned by localeconv(). */
5194     HV * hv = newHV();      /* The returned hash, initially empty */
5195 
5196     /* The function is used by Perl_localeconv() and POSIX::localeconv(), or
5197      * internally from this file, and is thread-safe.
5198      *
5199      * localeconv() returns items from two different locale categories,
5200      * LC_MONETARY and LC_NUMERIC.  Various data structures in this function
5201      * are arrays with two elements, one for each category, and these indexes
5202      * indicate which array element applies to which category */
5203 #define NUMERIC_OFFSET   0
5204 #define MONETARY_OFFSET  1
5205 
5206     /* Some operations apply to one or the other category, or both.  A mask
5207      * is used to specify all the possibilities.  This macro converts from the
5208      * category offset to its bit position in the mask. */
5209 #define OFFSET_TO_BIT(i)  (1 << (i))
5210 
5211     /* There are two use cases for this function:
5212      * 1) Called as Perl_localeconv(), or from POSIX::locale_conv().  This
5213      *    returns the lconv structure copied to a hash, based on the current
5214      *    underlying locales for LC_NUMERIC and LC_MONETARY. An input item==0
5215      *    signifies this case, or on many platforms it is the only use case
5216      *    compiled.
5217      * 2) Certain items that nl_langinfo() provides are also derivable from
5218      *    the return of localeconv().  Windows notably doesn't have
5219      *    nl_langinfo(), so on that, and actually any platform lacking it,
5220      *    my_localeconv() is used also to emulate it for those particular
5221      *    items.  The code to do this is compiled only on such platforms.
5222      *    Rather than going to the expense of creating a full hash when only
5223      *    one item is needed, the returned hash has just the desired item in
5224      *    it.
5225      *
5226      * To access all the localeconv() struct lconv fields, there is a data
5227      * structure that contains every commonly documented field in it.  (Maybe
5228      * some minority platforms have extra fields.  Those could be added here
5229      * without harm; they would just be ignored on platforms lacking them.)
5230      *
5231      * Our structure is compiled to make looping through the fields easier by
5232      * pointing each name to its value's offset within lconv, e.g.,
5233         { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) }
5234      */
5235 #define LCONV_ENTRY(name) {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
5236 
5237     /* These synonyms are just for clarity, and to make it easier in case
5238      * something needs to change in the future */
5239 #define LCONV_NUMERIC_ENTRY(name)  LCONV_ENTRY(name)
5240 #define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name)
5241 
5242     /* There are just a few fields for NUMERIC strings */
5243     const lconv_offset_t lconv_numeric_strings[] = {
5244 #ifndef NO_LOCALECONV_GROUPING
5245         LCONV_NUMERIC_ENTRY(grouping),
5246 # endif
5247         LCONV_NUMERIC_ENTRY(thousands_sep),
5248 # define THOUSANDS_SEP_LITERAL  "thousands_sep"
5249         LCONV_NUMERIC_ENTRY(decimal_point),
5250 # define DECIMAL_POINT_LITERAL "decimal_point"
5251         {NULL, 0}
5252     };
5253 
5254     /* When used to implement nl_langinfo(), we save time by only populating
5255      * the hash with the field(s) needed.  Thus we would need a data structure
5256      * of just:
5257      *  LCONV_NUMERIC_ENTRY(decimal_point),
5258      *  {NULL, 0}
5259      *
5260      * By placing the decimal_point field last in the full structure, we can
5261      * use just the tail for this bit of it, saving space.  This macro yields
5262      * the address of the sub structure. */
5263 #define DECIMAL_POINT_ADDRESS                                             \
5264         &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)]
5265 
5266     /* And the MONETARY string fields */
5267     const lconv_offset_t lconv_monetary_strings[] = {
5268         LCONV_MONETARY_ENTRY(int_curr_symbol),
5269         LCONV_MONETARY_ENTRY(mon_decimal_point),
5270 #ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
5271         LCONV_MONETARY_ENTRY(mon_thousands_sep),
5272 #endif
5273 #ifndef NO_LOCALECONV_MON_GROUPING
5274         LCONV_MONETARY_ENTRY(mon_grouping),
5275 #endif
5276         LCONV_MONETARY_ENTRY(positive_sign),
5277         LCONV_MONETARY_ENTRY(negative_sign),
5278         LCONV_MONETARY_ENTRY(currency_symbol),
5279 #define CURRENCY_SYMBOL_LITERAL  "currency_symbol"
5280         {NULL, 0}
5281     };
5282 
5283     /* Like above, this field being last can be used as a sub structure */
5284 #define CURRENCY_SYMBOL_ADDRESS                                            \
5285       &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)]
5286 
5287     /* Finally there are integer fields, all are for monetary purposes */
5288     const lconv_offset_t lconv_integers[] = {
5289         LCONV_ENTRY(int_frac_digits),
5290         LCONV_ENTRY(frac_digits),
5291         LCONV_ENTRY(p_sep_by_space),
5292         LCONV_ENTRY(n_cs_precedes),
5293         LCONV_ENTRY(n_sep_by_space),
5294         LCONV_ENTRY(p_sign_posn),
5295         LCONV_ENTRY(n_sign_posn),
5296 #ifdef HAS_LC_MONETARY_2008
5297         LCONV_ENTRY(int_p_cs_precedes),
5298         LCONV_ENTRY(int_p_sep_by_space),
5299         LCONV_ENTRY(int_n_cs_precedes),
5300         LCONV_ENTRY(int_n_sep_by_space),
5301         LCONV_ENTRY(int_p_sign_posn),
5302         LCONV_ENTRY(int_n_sign_posn),
5303 #endif
5304 #    define P_CS_PRECEDES_LITERAL    "p_cs_precedes"
5305         LCONV_ENTRY(p_cs_precedes),
5306         {NULL, 0}
5307     };
5308 
5309     /* Like above, this field being last can be used as a sub structure */
5310 #define P_CS_PRECEDES_ADDRESS                                       \
5311       &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)]
5312 
5313     /* The actual populating of the hash is done by two sub functions that get
5314      * passed an array of length two containing the data structure they are
5315      * supposed to use to get the key names to fill the hash with.  One element
5316      * is always for the NUMERIC strings (or NULL if none to use), and the
5317      * other element similarly for the MONETARY ones. */
5318     const lconv_offset_t * strings[2] = { lconv_numeric_strings,
5319                                           lconv_monetary_strings
5320                                         };
5321 
5322     /* The LC_MONETARY category also has some integer-valued fields, whose
5323      * information is kept in a separate parallel array to 'strings' */
5324     const lconv_offset_t * integers[2] = {
5325                                            NULL,
5326                                            lconv_integers
5327                                          };
5328 
5329 #if  ! defined(HAS_LOCALECONV)                                          \
5330  || (! defined(USE_LOCALE_NUMERIC) && ! defined(USE_LOCALE_MONETARY))
5331 
5332     /* If both NUMERIC and MONETARY must be the "C" locale, simply populate the
5333      * hash using the function that works on just that locale. */
5334     populate_hash_from_C_localeconv(hv,
5335                                     "C",
5336                                     (  OFFSET_TO_BIT(NUMERIC_OFFSET)
5337                                      | OFFSET_TO_BIT(MONETARY_OFFSET)),
5338                                      strings, integers);
5339 
5340     /* We shouldn't get to here for the case of an individual item, as
5341      * preprocessor directives elsewhere in this file should have filled in the
5342      * correct values at a higher level */
5343     assert(item == 0);
5344     PERL_UNUSED_ARG(item);
5345 
5346     return hv;
5347 
5348 #else
5349 
5350     /* From here to the end of this function, at least one of NUMERIC or
5351      * MONETARY can be non-C */
5352 
5353     /* This is a mask, with one bit to tell the populate functions to populate
5354      * the NUMERIC items; another bit for the MONETARY ones.  This way they can
5355      * choose which (or both) to populate from */
5356     U32 index_bits = 0;
5357 
5358     /* Some platforms, for correct non-mojibake results, require LC_CTYPE's
5359      * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's
5360      * for the monetary ones.  What happens if LC_NUMERIC and LC_MONETARY
5361      * aren't compatible?  Wrong results.  To avoid that, we call localeconv()
5362      * twice, once for each locale, setting LC_CTYPE to match the category.
5363      * But if the locales of both categories are the same, there is no need for
5364      * a second call.  Assume this is the case unless overridden below */
5365     bool requires_2nd_localeconv = false;
5366 
5367     /* The actual hash populating is done by one of the two populate functions.
5368      * Which one is appropriate for either the MONETARY_OFFSET or the
5369      * NUMERIC_OFFSET is calculated and then stored in this table */
5370     void (*populate[2]) (pTHX_
5371                          HV * ,
5372                          const char *,
5373                          const U32,
5374                          const lconv_offset_t **,
5375                          const lconv_offset_t **);
5376 
5377     /* This gives the locale to use for the corresponding OFFSET, like the
5378      * 'populate' array above */
5379     const char * locales[2];
5380 
5381 #  ifdef HAS_SOME_LANGINFO
5382 
5383     /* If the only use-case for this is the full localeconv(), the 'item'
5384      * parameter is ignored. */
5385     PERL_UNUSED_ARG(item);
5386 
5387 #  else     /* This only gets compiled for the use-case of using localeconv()
5388                to emulate nl_langinfo() when missing from the platform. */
5389 
5390 #    ifdef USE_LOCALE_NUMERIC
5391 
5392     /* We need this substructure to only return this field for the THOUSEP
5393      * item.  The other items also need substructures, but they were handled
5394      * above by placing the substructure's item at the end of the full one, so
5395      * the data structure could do double duty.  However, both this and
5396      * RADIXCHAR would need to be in the final position of the same full
5397      * structure; an impossibility.  So make this into a separate structure */
5398     const lconv_offset_t  thousands_sep_string[] = {
5399         LCONV_NUMERIC_ENTRY(thousands_sep),
5400         {NULL, 0}
5401     };
5402 
5403 #    endif
5404 
5405     /* End of all the initialization of data structures.  Now for actual code.
5406      *
5407      * Without nl_langinfo(), the call to my_localeconv() could be for all of
5408      * the localeconv() items or for just one of the following 3 items to
5409      * emulate nl_langinfo().
5410      *
5411      * This is compiled only when using perl_langinfo.h, which we control, and
5412      * it has been constructed so that no item is numbered 0.
5413      *
5414      * For each individual item, either return the known value if the current
5415      * locale is "C", or set up the appropriate parameters for the call below
5416      * to the populate function */
5417     if (item != 0) {
5418         const char *locale;
5419 
5420         switch (item) {
5421           default:
5422             locale_panic_(Perl_form(aTHX_
5423                           "Unexpected item passed to my_localeconv: %d", item));
5424             break;
5425 
5426 #    ifdef USE_LOCALE_NUMERIC
5427 
5428           case RADIXCHAR:
5429             if (isNAME_C_OR_POSIX(PL_numeric_name)) {
5430                 (void) hv_stores(hv, DECIMAL_POINT_LITERAL, newSVpvs("."));
5431                 return hv;
5432             }
5433 
5434             strings[NUMERIC_OFFSET] = DECIMAL_POINT_ADDRESS;
5435             goto numeric_common;
5436 
5437           case THOUSEP:
5438             if (isNAME_C_OR_POSIX(PL_numeric_name)) {
5439                 (void) hv_stores(hv, THOUSANDS_SEP_LITERAL, newSVpvs(""));
5440                 return hv;
5441             }
5442 
5443             strings[NUMERIC_OFFSET] = thousands_sep_string;
5444 
5445           numeric_common:
5446             index_bits = OFFSET_TO_BIT(NUMERIC_OFFSET);
5447             locale = PL_numeric_name;
5448             break;
5449 
5450 #    endif
5451 #    ifdef USE_LOCALE_MONETARY
5452 
5453           case CRNCYSTR:    /* This item needs the values for both the currency
5454                                symbol, and another one used to construct the
5455                                nl_langino()-compatible return. */
5456 
5457             locale = querylocale_c(LC_MONETARY);
5458             if (isNAME_C_OR_POSIX(locale)) {
5459                 (void) hv_stores(hv, CURRENCY_SYMBOL_LITERAL, newSVpvs(""));
5460                 (void) hv_stores(hv, P_CS_PRECEDES_LITERAL, newSViv(-1));
5461                 return hv;
5462             }
5463 
5464             strings[MONETARY_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
5465             integers[MONETARY_OFFSET] = P_CS_PRECEDES_ADDRESS;
5466 
5467             index_bits = OFFSET_TO_BIT(MONETARY_OFFSET);
5468             break;
5469 
5470 #    endif
5471 
5472         } /* End of switch() */
5473 
5474         /* There's only one item, so only one of each of these will get used,
5475          * but cheap to initialize both */
5476         populate[MONETARY_OFFSET] =
5477         populate[NUMERIC_OFFSET]  = S_populate_hash_from_localeconv;
5478         locales[MONETARY_OFFSET] = locales[NUMERIC_OFFSET]  = locale;
5479     }
5480     else   /* End of for just one item to emulate nl_langinfo() */
5481 
5482 #  endif
5483 
5484     {
5485         /* Here, the call is for all of localeconv().  It has a bunch of
5486          * items.  The first function call always gets the MONETARY values */
5487         index_bits = OFFSET_TO_BIT(MONETARY_OFFSET);
5488 
5489 #  ifdef USE_LOCALE_MONETARY
5490 
5491         locales[MONETARY_OFFSET] = querylocale_c(LC_MONETARY);
5492         populate[MONETARY_OFFSET] =
5493                                 (isNAME_C_OR_POSIX(locales[MONETARY_OFFSET]))
5494                                 ?  S_populate_hash_from_C_localeconv
5495                                 :  S_populate_hash_from_localeconv;
5496 
5497 #  else
5498 
5499         locales[MONETARY_OFFSET] = "C";
5500         populate[MONETARY_OFFSET] = S_populate_hash_from_C_localeconv;
5501 
5502 #  endif
5503 #  ifdef USE_LOCALE_NUMERIC
5504 
5505         /* And if the locales for the two categories are the same, we can also
5506          * do the NUMERIC values in the same call */
5507         if (strEQ(PL_numeric_name, locales[MONETARY_OFFSET])) {
5508             index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET);
5509             locales[NUMERIC_OFFSET] = locales[MONETARY_OFFSET];
5510             populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET];
5511         }
5512         else {
5513             requires_2nd_localeconv = true;
5514             locales[NUMERIC_OFFSET] = PL_numeric_name;
5515             populate[NUMERIC_OFFSET] = (isNAME_C_OR_POSIX(PL_numeric_name))
5516                                        ?  S_populate_hash_from_C_localeconv
5517                                        :  S_populate_hash_from_localeconv;
5518         }
5519 
5520 #  else
5521 
5522         /* When LC_NUMERIC is confined to "C", the two locales are the same
5523            iff LC_MONETARY in this case is also "C".  We set up the function
5524            for that case above, so fastest to test just its address */
5525         locales[NUMERIC_OFFSET] = "C";
5526         if (populate[MONETARY_OFFSET] == S_populate_hash_from_C_localeconv) {
5527             index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET);
5528             populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET];
5529         }
5530         else {
5531             requires_2nd_localeconv = true;
5532             populate[NUMERIC_OFFSET] = S_populate_hash_from_C_localeconv;
5533         }
5534 
5535 #  endif
5536 
5537     }   /* End of call is for localeconv() */
5538 
5539     /* Call the proper populate function (which may call localeconv()) and copy
5540      * its results into the hash.  All the parameters have been initialized
5541      * above */
5542     (*populate[MONETARY_OFFSET])(aTHX_
5543                                  hv, locales[MONETARY_OFFSET],
5544                                  index_bits, strings, integers);
5545 
5546 #  ifndef HAS_SOME_LANGINFO  /* Could be using this function to emulate
5547                                 nl_langinfo() */
5548 
5549     /* We are done when called with an individual item.  There are no integer
5550      * items to adjust, and it's best for the caller to determine if this
5551      * string item is UTF-8 or not.  This is because the locale's UTF-8ness is
5552      * calculated below, and in some Configurations, that can lead to a
5553      * recursive call to here, which could recurse infinitely. */
5554     if (item != 0) {
5555         return hv;
5556     }
5557 
5558 #  endif
5559 
5560     /* The above call may have done all the hash fields, but not always, as
5561      * already explained.  If we need a second call it is always for the
5562      * NUMERIC fields */
5563     if (requires_2nd_localeconv) {
5564         (*populate[NUMERIC_OFFSET])(aTHX_
5565                                     hv,
5566                                     locales[NUMERIC_OFFSET],
5567                                     OFFSET_TO_BIT(NUMERIC_OFFSET),
5568                                     strings, integers);
5569     }
5570 
5571     /* Here, the hash has been completely populated. */
5572 
5573 #  ifdef LOCALECONV_NEEDS_CRITICAL_SECTION
5574 
5575     /* When the hash was populated during a critical section, the determination
5576      * of whether or not a string element should be marked as UTF-8 was
5577      * deferred, so as to minimize the amount of time in the critical section.
5578      * But now we have the hash specific to this thread, and can do the
5579      * adjusting without worrying about delaying other threads. */
5580     for (unsigned int i = 0; i < 2; i++) {  /* Try both types of strings */
5581 
5582         /* The return from this function is already adjusted */
5583         if (populate[i] == S_populate_hash_from_C_localeconv) {
5584             continue;
5585         }
5586 
5587         /* Examine each string */
5588         for (const lconv_offset_t *strp = strings[i]; strp->name; strp++) {
5589             const char * name = strp->name;
5590 
5591             /* 'value' will contain the string that may need to be marked as
5592              * UTF-8 */
5593             SV ** value = hv_fetch(hv, name, strlen(name), true);
5594             if (value == NULL) {
5595                 continue;
5596             }
5597 
5598             /* Determine if the string should be marked as UTF-8. */
5599             if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value),
5600                                                   LOCALE_UTF8NESS_UNKNOWN,
5601                                                   locales[i],
5602                                                   LC_ALL_INDEX_ /* OOB */)))
5603             {
5604                 SvUTF8_on(*value);
5605             }
5606         }
5607     }
5608 
5609 #  endif
5610 
5611     return hv;
5612 
5613 #endif    /* End of must have one or both USE_MONETARY, USE_NUMERIC */
5614 
5615 }
5616 
5617 STATIC void
S_populate_hash_from_C_localeconv(pTHX_ HV * hv,const char * locale,const U32 which_mask,const lconv_offset_t * strings[2],const lconv_offset_t * integers[2])5618 S_populate_hash_from_C_localeconv(pTHX_ HV * hv,
5619                                         const char * locale,  /* Unused */
5620 
5621                                         /* bit mask of which categories to
5622                                          * populate */
5623                                         const U32 which_mask,
5624 
5625                                         /* The string type values to return;
5626                                          * one element for numeric; the other
5627                                          * for monetary */
5628                                         const lconv_offset_t * strings[2],
5629 
5630                                         /* And the integer fields */
5631                                         const lconv_offset_t * integers[2])
5632 {
5633     PERL_ARGS_ASSERT_POPULATE_HASH_FROM_C_LOCALECONV;
5634     PERL_UNUSED_ARG(locale);
5635     assert(isNAME_C_OR_POSIX(locale));
5636 
5637     /* Fill hv with the values that localeconv() is supposed to return for
5638      * the C locale */
5639 
5640     U32 working_mask = which_mask;
5641     while (working_mask) {
5642 
5643         /* Get the bit position of the next lowest set bit.  That is the
5644          * index into the 'strings' array of the category we use in this loop
5645          * iteration.  Turn the bit off so we don't work on this category
5646          * again in this function call. */
5647         const PERL_UINT_FAST8_T i = lsbit_pos(working_mask);
5648         working_mask &= ~ (1 << i);
5649 
5650         /* This category's string fields */
5651         const lconv_offset_t * category_strings = strings[i];
5652 
5653 #ifndef HAS_SOME_LANGINFO /* This doesn't work properly if called on a single
5654                              item, which could only happen when there isn't
5655                              nl_langinfo on the platform */
5656         assert(category_strings[1].name != NULL);
5657 #endif
5658 
5659         /* All string fields are empty except for one NUMERIC one.  That one
5660          * has been initialized to be the final one in the NUMERIC strings, so
5661          * stop the loop early in that case.  Otherwise, we would store an
5662          * empty string to the hash, and immediately overwrite it with the
5663          * correct value */
5664         const unsigned int stop_early = (i == NUMERIC_OFFSET) ? 1 : 0;
5665 
5666         /* A NULL element terminates the list */
5667         while ((category_strings + stop_early)->name) {
5668             (void) hv_store(hv,
5669                             category_strings->name,
5670                             strlen(category_strings->name),
5671                             newSVpvs(""),
5672                             0);
5673 
5674             category_strings++;
5675         }
5676 
5677         /* And fill in the NUMERIC exception */
5678         if (i == NUMERIC_OFFSET) {
5679             (void) hv_stores(hv, "decimal_point", newSVpvs("."));
5680             category_strings++;
5681         }
5682 
5683         /* Add any int fields.  In the C locale, all are -1 */
5684         if (integers[i]) {
5685             const lconv_offset_t * current = integers[i];
5686             while (current->name) {
5687                 (void) hv_store(hv,
5688                                 current->name, strlen(current->name),
5689                                 newSViv(-1),
5690                                 0);
5691                 current++;
5692             }
5693         }
5694     }
5695 }
5696 
5697 #if defined(HAS_LOCALECONV) && (   defined(USE_LOCALE_NUMERIC)      \
5698                                 || defined(USE_LOCALE_MONETARY))
5699 
5700 STATIC void
S_populate_hash_from_localeconv(pTHX_ HV * hv,const char * locale,const U32 which_mask,const lconv_offset_t * strings[2],const lconv_offset_t * integers[2])5701 S_populate_hash_from_localeconv(pTHX_ HV * hv,
5702 
5703                                       /* Switch to this locale to run
5704                                        * localeconv() from */
5705                                       const char * locale,
5706 
5707                                       /* bit mask of which categories to
5708                                        * populate */
5709                                       const U32 which_mask,
5710 
5711                                       /* The string type values to return; one
5712                                        * element for numeric; the other for
5713                                        * monetary */
5714                                       const lconv_offset_t * strings[2],
5715 
5716                                       /* And similarly the integer fields */
5717                                       const lconv_offset_t * integers[2])
5718 {
5719     PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV;
5720 
5721     /* Run localeconv() and copy some or all of its results to the input 'hv'
5722      * hash.  Most localeconv() implementations return the values in a global
5723      * static buffer, so for them, the operation must be performed in a
5724      * critical section, ending only after the copy is completed.  There are so
5725      * many locks because localeconv() deals with two categories, and returns
5726      * in a single global static buffer.  Some locks might be no-ops on this
5727      * platform, but not others.  We need to lock if any one isn't a no-op. */
5728 
5729     /* If the call could be for either or both of the two categories, we need
5730      * to test which one; but if the Configuration is such that we will never
5731      * be called with one of them, the code for that one will be #ifdef'd out
5732      * below, leaving code for just the other category.  That code will always
5733      * want to be executed, no conditional required.  Create a macro that
5734      * replaces the condition with an always-true value so the compiler will
5735      * omit the conditional */
5736 #  if defined(USE_LOCALE_NUMERIC) && defined(USE_LOCALE_MONETARY)
5737 #    define CALL_IS_FOR(x)  (which_mask & OFFSET_TO_BIT(x ## _OFFSET))
5738 #  else
5739 #    define CALL_IS_FOR(x) 1
5740 #  endif
5741 
5742     /* This function is unfortunately full of #ifdefs.  It consists of three
5743      * sections:
5744      *  1)  Setup:
5745      *        a)  On platforms where it matters, toggle LC_CTYPE to the same
5746      *            locale that LC_NUMERIC and LC_MONETARY will be toggled to
5747      *        b)  On calls that process LC_NUMERIC, toggle to the desired locale
5748      *        c)  On calls that process LC_MONETARY, toggle to the desired
5749      *            locale
5750      *        d)  Do any necessary mutex locking not (automatically) done by
5751      *            the toggling
5752      *        e)  Work around some Windows-only issues and bugs
5753      *  2)  Do the localeconv(), copying the results.
5754      *  3)  Teardown, which is the inverse of setup.
5755      *
5756      * The setup and teardown are highly variable due to the variance in the
5757      * possible Configurations.  What is done here to make it slightly more
5758      * understandable is each setup section creates the details of its
5759      * corresponding teardown section, and macroizes them.  So that the
5760      * finished teardown product is just a linear series of macros.  You can
5761      * thus easily see the logic there. */
5762 
5763     /* Setup any LC_CTYPE handling */
5764     start_DEALING_WITH_MISMATCHED_CTYPE(locale);
5765 #  define CTYPE_TEARDOWN  end_DEALING_WITH_MISMATCHED_CTYPE(locale)
5766 
5767    /* Setup any LC_NUMERIC handling */
5768 #  ifndef USE_LOCALE_NUMERIC
5769 #    define NUMERIC_TEARDOWN
5770 #  else
5771 
5772     /* We need to toggle the NUMERIC locale to the desired one if we are
5773      * getting NUMERIC strings */
5774     const char * orig_NUMERIC_locale = NULL;
5775     if (CALL_IS_FOR(NUMERIC)) {
5776 
5777 #    ifdef WIN32
5778 
5779         /* There is a bug in Windows in which setting LC_CTYPE after the others
5780          * doesn't actually take effect for localeconv().  See commit
5781          * 418efacd1950763f74ed3cc22f8cf9206661b892 for details.  Thus we have
5782          * to make sure that the locale we want is set after LC_CTYPE.  We
5783          * unconditionally toggle away from and back to the current locale
5784          * prior to calling localeconv(). */
5785         orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, "C");
5786         (void) toggle_locale_c(LC_NUMERIC, locale);
5787 
5788 #      define NUMERIC_TEARDOWN                                              \
5789           STMT_START {                                                      \
5790             if (CALL_IS_FOR(NUMERIC)) {                                     \
5791                 restore_toggled_locale_c(LC_NUMERIC, "C");                  \
5792                 restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale);  \
5793             }                                                               \
5794           } STMT_END
5795 
5796 #    else
5797 
5798         /* No need for the extra toggle when not on Windows */
5799         orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, locale);
5800 
5801 #      define NUMERIC_TEARDOWN                                              \
5802          STMT_START {                                                       \
5803             if (CALL_IS_FOR(NUMERIC)) {                                     \
5804                 restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale);  \
5805             }                                                               \
5806          } STMT_END
5807 #    endif
5808 
5809     }
5810 
5811 #  endif  /* End of LC_NUMERIC setup */
5812 
5813    /* Setup any LC_MONETARY handling, using the same logic as for
5814     * USE_LOCALE_NUMERIC just above */
5815 #  ifndef USE_LOCALE_MONETARY
5816 #    define MONETARY_TEARDOWN
5817 #  else
5818 
5819     /* Same logic as LC_NUMERIC, and same Windows bug */
5820     const char * orig_MONETARY_locale = NULL;
5821     if (CALL_IS_FOR(MONETARY)) {
5822 
5823 #    ifdef WIN32
5824 
5825         orig_MONETARY_locale = toggle_locale_c(LC_MONETARY, "C");
5826         (void) toggle_locale_c(LC_MONETARY, locale);
5827 
5828 #      define MONETARY_TEARDOWN                                             \
5829          STMT_START {                                                       \
5830             if (CALL_IS_FOR(MONETARY)) {                                    \
5831                 restore_toggled_locale_c(LC_MONETARY, "C");                 \
5832                 restore_toggled_locale_c(LC_MONETARY, orig_MONETARY_locale);\
5833             }                                                               \
5834          } STMT_END
5835 
5836 #    else
5837 
5838         /* No need for the extra toggle when not on Windows */
5839         orig_MONETARY_locale = toggle_locale_c(LC_MONETARY, locale);
5840 
5841 #      define MONETARY_TEARDOWN                                             \
5842          STMT_START {                                                       \
5843             if (CALL_IS_FOR(MONETARY)) {                                    \
5844                 restore_toggled_locale_c(LC_MONETARY, orig_MONETARY_locale);\
5845             }                                                               \
5846          } STMT_END
5847 
5848 #    endif
5849 
5850     }
5851 
5852 #  endif  /* End of LC_MONETARY setup */
5853 
5854     /* Here, have toggled to the correct locale.
5855      *
5856      * We don't need to worry about locking at all if localeconv() is
5857      * thread-safe, regardless of if using threads or not. */
5858 #  ifdef LOCALECONV_IS_THREAD_SAFE
5859 #    define LOCALECONV_UNLOCK
5860 #  else
5861 
5862      /* Otherwise, the gwLOCALE_LOCK macro expands to whatever locking is
5863       * needed (none if there is only a single perl instance) */
5864     gwLOCALE_LOCK;
5865 
5866 #    define LOCALECONV_UNLOCK  gwLOCALE_UNLOCK
5867 #  endif
5868 #  if ! defined(TS_W32_BROKEN_LOCALECONV) || ! defined(USE_THREAD_SAFE_LOCALE)
5869 #    define WIN32_TEARDOWN
5870 #  else
5871 
5872     /* This is a workaround for another bug in Windows.  localeconv() was
5873      * broken with thread-safe locales prior to VS 15.  It looks at the global
5874      * locale instead of the thread one.  As a work-around, we toggle to the
5875      * global locale; populate the return; then toggle back.  We have to use
5876      * LC_ALL instead of the individual categories because of yet another bug
5877      * in Windows.  And this all has to be done in a critical section.
5878      *
5879      * This introduces a potential race with any other thread that has also
5880      * converted to use the global locale, and doesn't protect its locale calls
5881      * with mutexes.  khw can't think of any reason for a thread to do so on
5882      * Windows, as the locale API is the same regardless of thread-safety,
5883      * except if the code is ported from working on another platform where
5884      * there might be some reason to do this.  But this is typically due to
5885      * some alien-to-Perl library that thinks it owns locale setting.  Such a
5886      * library isn't likely to exist on Windows, so such an application is
5887      * unlikely to be run on Windows
5888      */
5889     bool restore_per_thread = FALSE;
5890 
5891     /* Save the per-thread locale state */
5892     const char * save_thread = querylocale_c(LC_ALL);
5893 
5894     /* Change to the global locale, and note if we already were there */
5895     int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
5896     if (config_return != _DISABLE_PER_THREAD_LOCALE) {
5897         if (config_return == -1) {
5898             locale_panic_("_configthreadlocale returned an error");
5899         }
5900 
5901         restore_per_thread = TRUE;
5902     }
5903 
5904     /* Save the state of the global locale; then convert to our desired
5905      * state.  */
5906     const char * save_global = querylocale_c(LC_ALL);
5907     void_setlocale_c(LC_ALL, save_thread);
5908 
5909 #   define WIN32_TEARDOWN                                                   \
5910          STMT_START {                                                       \
5911             /* Restore the global locale's prior state */                   \
5912             void_setlocale_c(LC_ALL, save_global);                          \
5913                                                                             \
5914             /* And back to per-thread locales */                            \
5915             if (restore_per_thread) {                                       \
5916                 if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) { \
5917                     locale_panic_("_configthreadlocale returned an error"); \
5918                 }                                                           \
5919             }                                                               \
5920                                                                             \
5921             /* Restore the per-thread locale state */                       \
5922             void_setlocale_c(LC_ALL, save_thread);                          \
5923         } STMT_END
5924 #  endif  /* TS_W32_BROKEN_LOCALECONV */
5925 
5926 
5927     /* Finally, everything is locked and loaded; do the actual call to
5928      * localeconv() */
5929     const char *lcbuf_as_string = (const char *) localeconv();
5930 
5931     /* Copy its results for each desired category as determined by
5932      * 'which_mask' */
5933     U32 working_mask = which_mask;
5934     while (working_mask) {
5935 
5936         /* Get the bit position of the next lowest set bit.  That is the
5937          * index into the 'strings' array of the category we use in this loop
5938          * iteration.  Turn the bit off so we don't work on this category
5939          * again in this function call. */
5940         const PERL_UINT_FAST8_T i = lsbit_pos32(working_mask);
5941         working_mask &= ~ (1 << i);
5942 
5943         /* Point to the string field list for the given category ... */
5944         const lconv_offset_t * category_strings = strings[i];
5945 
5946         /* The string fields returned by localeconv() are stored as SVs in the
5947          * hash.  Their utf8ness needs to be calculated at some point, and the
5948          * SV flagged accordingly.  It is easier to do that now as we go
5949          * through them, but strongly countering this is the need to minimize
5950          * the length of time spent in a critical section with other threads
5951          * locked out.  Therefore, when this is being executed in a critical
5952          * section, the strings are stored as-is, and the utf8ness calculation
5953          * is done by our caller, outside the critical section, in an extra
5954          * pass through the hash.  But when this code is not being executed in
5955          * a critical section, that extra pass would be extra work, so the
5956          * calculation is done here.  We have #defined a symbol that indicates
5957          * whether or not this is being done in a critical section.  But there
5958          * is a complication.  When this is being called with just a single
5959          * string to populate the hash with, there may be extra adjustments
5960          * needed, and the ultimate caller is expecting to do all adjustments,
5961          * so the adjustment is deferred in this case even if there is no
5962          * critical section.  (This case is indicated by element [1] being a
5963          * NULL marker, hence having only one real element.) */
5964 #  ifndef LOCALECONV_NEEDS_CRITICAL_SECTION
5965         const bool calculate_utf8ness_here = category_strings[1].name;
5966 #  endif
5967         bool utf8ness = false;
5968 
5969         /* For each string field */
5970         while (category_strings->name) {
5971 
5972             /* We have set things up so that we know where in the returned
5973              * structure, when viewed as a string, the corresponding value is.
5974              * */
5975             char *value = *((char **)(  lcbuf_as_string
5976                                       + category_strings->offset));
5977             if (value) {    /* Copy to the hash */
5978 
5979 #  ifndef LOCALECONV_NEEDS_CRITICAL_SECTION
5980 
5981                 if (calculate_utf8ness_here) {
5982                     utf8ness =
5983                       (   UTF8NESS_YES
5984                        == get_locale_string_utf8ness_i(value,
5985                                                       LOCALE_UTF8NESS_UNKNOWN,
5986                                                       locale,
5987                                                       LC_ALL_INDEX_ /* OOB */));
5988                 }
5989 #  endif
5990                 (void) hv_store(hv,
5991                                 category_strings->name,
5992                                 strlen(category_strings->name),
5993                                 newSVpvn_utf8(value, strlen(value), utf8ness),
5994                                 0);
5995             }
5996 
5997             category_strings++;
5998         }
5999 
6000         /* Add any int fields to the HV*. */
6001         if (integers[i]) {
6002             const lconv_offset_t * current = integers[i];
6003             while (current->name) {
6004                 int value = *((const char *)(  lcbuf_as_string
6005                                              + current->offset));
6006                 if (value == CHAR_MAX) { /* Change CHAR_MAX to -1 */
6007                     value = -1;
6008                 }
6009 
6010                 (void) hv_store(hv,
6011                                 current->name, strlen(current->name),
6012                                 newSViv(value),
6013                                 0);
6014                 current++;
6015             }
6016         }
6017     }   /* End of loop through the fields */
6018 
6019     /* Done with copying to the hash.  Can unwind the critical section locks */
6020 
6021     /* Back out of what we set up */
6022     WIN32_TEARDOWN;
6023     LOCALECONV_UNLOCK;
6024     MONETARY_TEARDOWN;
6025     NUMERIC_TEARDOWN;
6026     CTYPE_TEARDOWN;
6027 }
6028 
6029 #endif    /* defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY) */
6030 
6031 /*
6032 
6033 =for apidoc      Perl_langinfo
6034 =for apidoc_item Perl_langinfo8
6035 
6036 C<Perl_langinfo> is an (almost) drop-in replacement for the system
6037 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
6038 the same information.  But it is more thread-safe than regular
6039 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
6040 code, and can be used on systems that lack a native C<nl_langinfo>.
6041 
6042 However, you should instead use either the improved version of this,
6043 L</Perl_langinfo8>, or even better, L</sv_langinfo>.  The latter returns an SV,
6044 handling all the possible non-standard returns of C<nl_langinfo()>, including
6045 the UTF8ness of any returned string.
6046 
6047 C<Perl_langinfo8> is identical to C<Perl_langinfo> except for an additional
6048 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
6049 returns to you how you should treat the returned string with regards to it
6050 being encoded in UTF-8 or not.
6051 
6052 These two functions share private per-thread memory that will be changed the
6053 next time either one of them is called with any input, but not before.
6054 
6055 Concerning the differences between these and plain C<nl_langinfo()>:
6056 
6057 =over
6058 
6059 =item a.
6060 
6061 C<Perl_langinfo8> has an extra parameter, described above.  Besides this, the
6062 other reason they aren't quite a drop-in replacement is actually an advantage.
6063 The C<const>ness of the return allows the compiler to catch attempts to write
6064 into the returned buffer, which is illegal and could cause run-time crashes.
6065 
6066 =item b.
6067 
6068 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
6069 without you having to write extra code.  The reason for the extra code would be
6070 because these are from the C<LC_NUMERIC> locale category, which is normally
6071 kept set by Perl so that the radix is a dot, and the separator is the empty
6072 string, no matter what the underlying locale is supposed to be, and so to get
6073 the expected results, you have to temporarily toggle into the underlying
6074 locale, and later toggle back.  (You could use plain C<nl_langinfo> and
6075 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
6076 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
6077 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
6078 (decimal point) character to be a dot.)
6079 
6080 =item c.
6081 
6082 The system function they replace can have its static return buffer trashed,
6083 not only by a subsequent call to that function, but by a C<freelocale>,
6084 C<setlocale>, or other locale change.  The returned buffer of these functions
6085 is not changed until the next call to one or the other, so the buffer is never
6086 in a trashed state.
6087 
6088 =item d.
6089 
6090 The return buffer is per-thread, so it also is never overwritten by a call to
6091 these functions from another thread;  unlike the function it replaces.
6092 
6093 =item e.
6094 
6095 But most importantly, they work on systems that don't have C<nl_langinfo>, such
6096 as Windows, hence making your code more portable.  Of the fifty-some possible
6097 items specified by the POSIX 2008 standard,
6098 L<https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
6099 only one is completely unimplemented, though on non-Windows platforms, another
6100 significant one is not fully implemented).  They use various techniques to
6101 recover the other items, including calling C<L<localeconv(3)>>, and
6102 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
6103 available.  Later C<strftime()> versions have additional capabilities.
6104 If an item is not available on your system, this returns either the value
6105 associated with the C locale, or simply C<"">, whichever is more appropriate.
6106 
6107 It is important to note that, when called with an item that is recovered by
6108 using C<localeconv>, the buffer from any previous explicit call to
6109 C<L<localeconv(3)>> will be overwritten.  But you shouldn't be using
6110 C<localeconv> anyway because it is is very much not thread-safe, and suffers
6111 from the same problems outlined in item 'b.' above for the fields it returns
6112 that are controlled by the LC_NUMERIC locale category.  Instead, avoid all of
6113 those problems by calling L</Perl_localeconv>, which is thread-safe; or by
6114 using the methods given in L<perlcall>  to call
6115 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
6116 
6117 =back
6118 
6119 The details for those items which may deviate from what this emulation returns
6120 and what a native C<nl_langinfo()> would return are specified in
6121 L<I18N::Langinfo>.
6122 
6123 =for apidoc  sv_langinfo
6124 
6125 This is the preferred interface for accessing the data that L<nl_langinfo(3)>
6126 provides (or Perl's emulation of it on platforms lacking it), returning an SV.
6127 Unlike, the earlier-defined interfaces to this (L</Perl_langinfo> and
6128 L</Perl_langinfo8>), which return strings, the UTF8ness of the result is
6129 automatically handled for you.  And like them, it is thread-safe and
6130 automatically handles getting the proper values for the C<RADIXCHAR> and
6131 C<THOUSEP> items (that calling the plain libc C<nl_langinfo()> could give the
6132 wrong results for).  Like them, this also doesn't play well with the libc
6133 C<localeconv()>; use L<C<POSIX::localeconv()>|POSIX/localeconv> instead.
6134 
6135 There are a few deviations from what a native C<nl_langinfo()> would return and
6136 what this returns on platforms that don't implement that function.  These are
6137 detailed in L<I18N::Langinfo>.
6138 
6139 =cut
6140 
6141 */
6142 
6143 /* external_call_langinfo() is an interface to callers from outside this file to
6144  * langinfo_sv_i(), calculating a necessary value for it.  If those functions
6145  * aren't defined, the fallback function is emulate_langinfo(), which doesn't
6146  * use that value (as everything in this situation takes place in the "C"
6147  * locale), and so we define this macro to transparently hide the absence of
6148  * the missing functions */
6149 #ifndef external_call_langinfo
6150 #  define external_call_langinfo(item, sv, utf8p)                           \
6151                                     emulate_langinfo(item, "C", sv, utf8p)
6152 #endif
6153 
6154 SV *
Perl_sv_langinfo(pTHX_ const nl_item item)6155 Perl_sv_langinfo(pTHX_ const nl_item  item) {
6156     utf8ness_t dummy;   /* Having this tells the layers below that we want the
6157                            UTF-8 flag in 'sv' to be set properly. */
6158 
6159     SV * sv = newSV_type(SVt_PV);
6160     (void) external_call_langinfo(item, sv, &dummy);
6161 
6162     return sv;
6163 }
6164 
6165 const char *
Perl_langinfo(const nl_item item)6166 Perl_langinfo(const nl_item item)
6167 {
6168     dTHX;
6169 
6170     (void) external_call_langinfo(item, PL_langinfo_sv, NULL);
6171     return SvPV_nolen(PL_langinfo_sv);
6172 }
6173 
6174 const char *
Perl_langinfo8(const nl_item item,utf8ness_t * utf8ness)6175 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
6176 {
6177     PERL_ARGS_ASSERT_PERL_LANGINFO8;
6178     dTHX;
6179 
6180     (void) external_call_langinfo(item, PL_langinfo_sv, utf8ness);
6181     return SvPV_nolen(PL_langinfo_sv);
6182 }
6183 
6184 #ifdef USE_LOCALE
6185 
6186 const char *
S_external_call_langinfo(pTHX_ const nl_item item,SV * sv,utf8ness_t * utf8ness)6187 S_external_call_langinfo(pTHX_ const nl_item item,
6188                                SV * sv,
6189                                utf8ness_t * utf8ness)
6190 {
6191     PERL_ARGS_ASSERT_EXTERNAL_CALL_LANGINFO;
6192 
6193     /* Find the locale category that controls the input 'item', and call
6194      * langinfo_sv_i() including that value.
6195      *
6196      * If we are not paying attention to that category, instead call
6197      * emulate_langinfo(), which knows how to handle this situation. */
6198     locale_category_index  cat_index = LC_ALL_INDEX_;  /* Out-of-bounds */
6199 
6200     switch (item) {
6201       case CODESET:
6202 
6203 #  ifdef USE_LOCALE_CTYPE
6204         cat_index = LC_CTYPE_INDEX_;
6205 #  endif
6206         break;
6207 
6208 
6209       case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
6210 
6211 #  ifdef USE_LOCALE_MESSAGES
6212         cat_index = LC_MESSAGES_INDEX_;
6213 #  endif
6214         break;
6215 
6216 
6217       case CRNCYSTR:
6218 
6219 #  ifdef USE_LOCALE_MONETARY
6220         cat_index = LC_MONETARY_INDEX_;
6221 #  endif
6222         break;
6223 
6224 
6225       case RADIXCHAR: case THOUSEP:
6226 
6227 #  ifdef USE_LOCALE_NUMERIC
6228         cat_index = LC_NUMERIC_INDEX_;
6229 #  endif
6230         break;
6231 
6232 
6233       case _NL_ADDRESS_POSTAL_FMT:
6234       case _NL_ADDRESS_COUNTRY_NAME:
6235       case _NL_ADDRESS_COUNTRY_POST:
6236       case _NL_ADDRESS_COUNTRY_AB2:
6237       case _NL_ADDRESS_COUNTRY_AB3:
6238       case _NL_ADDRESS_COUNTRY_CAR:
6239       case _NL_ADDRESS_COUNTRY_NUM:
6240       case _NL_ADDRESS_COUNTRY_ISBN:
6241       case _NL_ADDRESS_LANG_NAME:
6242       case _NL_ADDRESS_LANG_AB:
6243       case _NL_ADDRESS_LANG_TERM:
6244       case _NL_ADDRESS_LANG_LIB:
6245 #  ifdef USE_LOCALE_ADDRESS
6246         cat_index = LC_ADDRESS_INDEX_;
6247 #  endif
6248         break;
6249 
6250 
6251       case _NL_IDENTIFICATION_TITLE:
6252       case _NL_IDENTIFICATION_SOURCE:
6253       case _NL_IDENTIFICATION_ADDRESS:
6254       case _NL_IDENTIFICATION_CONTACT:
6255       case _NL_IDENTIFICATION_EMAIL:
6256       case _NL_IDENTIFICATION_TEL:
6257       case _NL_IDENTIFICATION_FAX:
6258       case _NL_IDENTIFICATION_LANGUAGE:
6259       case _NL_IDENTIFICATION_TERRITORY:
6260       case _NL_IDENTIFICATION_AUDIENCE:
6261       case _NL_IDENTIFICATION_APPLICATION:
6262       case _NL_IDENTIFICATION_ABBREVIATION:
6263       case _NL_IDENTIFICATION_REVISION:
6264       case _NL_IDENTIFICATION_DATE:
6265       case _NL_IDENTIFICATION_CATEGORY:
6266 #  ifdef USE_LOCALE_IDENTIFICATION
6267         cat_index = LC_IDENTIFICATION_INDEX_;
6268 #  endif
6269         break;
6270 
6271 
6272       case _NL_MEASUREMENT_MEASUREMENT:
6273 #  ifdef USE_LOCALE_MEASUREMENT
6274         cat_index = LC_MEASUREMENT_INDEX_;
6275 #  endif
6276         break;
6277 
6278 
6279       case _NL_NAME_NAME_FMT:
6280       case _NL_NAME_NAME_GEN:
6281       case _NL_NAME_NAME_MR:
6282       case _NL_NAME_NAME_MRS:
6283       case _NL_NAME_NAME_MISS:
6284       case _NL_NAME_NAME_MS:
6285 #  ifdef USE_LOCALE_NAME
6286         cat_index = LC_NAME_INDEX_;
6287 #  endif
6288         break;
6289 
6290 
6291       case _NL_PAPER_HEIGHT:
6292       case _NL_PAPER_WIDTH:
6293 #  ifdef USE_LOCALE_PAPER
6294         cat_index = LC_PAPER_INDEX_;
6295 #  endif
6296         break;
6297 
6298 
6299       case _NL_TELEPHONE_TEL_INT_FMT:
6300       case _NL_TELEPHONE_TEL_DOM_FMT:
6301       case _NL_TELEPHONE_INT_SELECT:
6302       case _NL_TELEPHONE_INT_PREFIX:
6303 #  ifdef USE_LOCALE_TELEPHONE
6304         cat_index = LC_TELEPHONE_INDEX_;
6305 #  endif
6306         break;
6307 
6308 
6309       default:  /* The other possible items are all in LC_TIME. */
6310 #  ifdef USE_LOCALE_TIME
6311         cat_index = LC_TIME_INDEX_;
6312 #  endif
6313         break;
6314 
6315     } /* End of switch on item */
6316 
6317 #  if defined(HAS_MISSING_LANGINFO_ITEM_)
6318 
6319     /* If the above didn't find the category's index, it has to be because the
6320      * item is unknown to us (and the callee will handle that), or the category
6321      * is confined to the "C" locale on this platform, which the callee also
6322      * handles.  (LC_MESSAGES is not required by the C Standard (the others
6323      * above are), so we have to emulate it on platforms lacking it (such as
6324      * Windows).) */
6325     if (cat_index == LC_ALL_INDEX_) {
6326         return emulate_langinfo(item, "C", sv, utf8ness);
6327     }
6328 
6329 #  endif
6330 
6331     /* And get the value for this 'item', whose category has now been
6332      * calculated.  We need to find the current corresponding locale, and pass
6333      * that as well. */
6334     return langinfo_sv_i(item, cat_index,
6335                          query_nominal_locale_i(cat_index),
6336                          sv, utf8ness);
6337 }
6338 
6339 #endif
6340 #if defined(USE_LOCALE) && defined(HAS_NL_LANGINFO)
6341 
6342 STATIC const char *
S_langinfo_sv_i(pTHX_ const nl_item item,locale_category_index cat_index,const char * locale,SV * sv,utf8ness_t * utf8ness)6343 S_langinfo_sv_i(pTHX_
6344                 const nl_item item,           /* The item to look up */
6345 
6346                 /* The locale category that controls it */
6347                 locale_category_index cat_index,
6348 
6349                 /* The locale to look up 'item' in. */
6350                 const char * locale,
6351 
6352                 /* The SV to store the result in; see below */
6353                 SV * sv,
6354 
6355                 /* If not NULL, the location to store the UTF8-ness of 'item's
6356                  * value, as documented */
6357                 utf8ness_t * utf8ness)
6358 {
6359     PERL_ARGS_ASSERT_LANGINFO_SV_I;
6360     assert(cat_index < LC_ALL_INDEX_);
6361 
6362     /* This function is the interface to nl_langinfo(), returning a thread-safe
6363      * result, valid until its next call that uses the same 'sv'.  Similarly,
6364      * the S_emulate_langinfo() function below does the same, when
6365      * nl_langinfo() isn't available for the desired locale, or is completely
6366      * absent from the system.  It is hopefully invisible to an outside caller
6367      * as to which one of the two actually ends up processing the request.
6368      * This comment block hence generally describes the two functions as a
6369      * unit.
6370      *
6371      * The two functions both return values (using 'return' statements) and
6372      * potentially change the contents of the passed in SV 'sv'.  However, in
6373      * any given call, only one of the return types is reliable.
6374      *
6375      * When the passed in SV is 'PL_scratch_langinfo', the functions make sure
6376      * that the 'return' statements return the correct value, but whatever
6377      * value is in 'PL_scratch_langinfo' should be considered garbage.  When it
6378      * is any other SV, that SV will get the correct result, and the value
6379      * returned by a 'return' statement should be considered garbage.
6380      *
6381      * The reason for this is twofold:
6382      *
6383      *  1) These functions serve two masters.  For most purposes when called
6384      *     from within this file, the desired value is used immediately, and
6385      *     then no longer required.  For these, the 'return' statement values
6386      *     are most convenient.
6387      *
6388      *     But when the call is initiated from an external XS source, like
6389      *     I18N::Langinfo, the value needs to be able to be stable for a longer
6390      *     time and likely returned to Perl space.  An SV return is most
6391      *     convenient for these
6392      *
6393      *     Further, some Configurations use these functions reentrantly.  For
6394      *     those, an SV must be passed.
6395      *
6396      *  2) In S_emulate_langinfo(), most langinfo items are easy or even
6397      *     trivial to get.  These are amenable to being returned by 'return'
6398      *     statements.  But others are more complex, and use the infrastructure
6399      *     provided by perl's SV functions to help out.
6400      *
6401      * So for some items, it is most convenient to 'return' a simple value; for
6402      * others an SV is most convenient.  And some callers want a simple value;
6403      * others want or need an SV.  It would be wasteful to have an SV, convert
6404      * it to a simple value, discarding the SV, then create a new SV.
6405      *
6406      * The solution adopted here is to always pass an SV, and have a reserved
6407      * one, PL_scratch_langinfo, indicate that a 'return' is desired.  That SV
6408      * is then used as scratch for the items that it is most convenient to use
6409      * an SV in calculating.  Besides these two functions and initialization,
6410      * the only mention of PL_scratch_langinfo is in the expansion of a single
6411      * macro that is used by the code in this file that desires a non-SV return
6412      * value.
6413      *
6414      * A wart of this interface is that to get the UTF-8 flag of the passed-in
6415      * SV set, you have to also pass a non-null 'utf8ness' parameter.  This is
6416      * entirely to prevent the extra expense of calculating UTF-8ness when the
6417      * caller is plain Perl_langinfo(), which doesn't care about this.  If that
6418      * seems too kludgy, other mechanisms could be devised.  But be aware that
6419      * the SV interface has to have a way to not calculate UTF-8ness, or else
6420      * the reentrant uses could infinitely recurse */
6421 
6422     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6423                            "Entering langinfo_sv_i item=%jd, using locale %s\n",
6424                            (PERL_INTMAX_T) item, locale));
6425 
6426 #  ifdef HAS_MISSING_LANGINFO_ITEM_
6427 
6428     if (! category_available[cat_index]) {
6429         return emulate_langinfo(item, locale, sv, utf8ness);
6430     }
6431 
6432 #  endif
6433 
6434     /* One might be tempted to avoid any toggling by instead using
6435      * nl_langinfo_l() on platforms that have it.  This would entail creating a
6436      * locale object with newlocale() and freeing it afterwards.  But doing so
6437      * runs significantly slower than just doing the toggle ourselves.
6438      * lib/locale_threads.t was slowed down by 25% on Ubuntu 22.04 */
6439 
6440     start_DEALING_WITH_MISMATCHED_CTYPE(locale);
6441 
6442     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
6443 
6444 /* nl_langinfo() is supposedly thread-safe except for its return value.  The
6445  * POSIX 2017 Standard states:
6446  *
6447  *    "The pointer returned by nl_langinfo() might be invalidated or the string
6448  *    content might be overwritten by a subsequent call to nl_langinfo() in any
6449  *    thread or to nl_langinfo_l() in the same thread or the initial thread, by
6450  *    subsequent calls to setlocale() with a category corresponding to the
6451  *    category of item (see <langinfo.h>) or the category LC_ALL, or by
6452  *    subsequent calls to uselocale() which change the category corresponding
6453  *    to the category of item."
6454  *
6455  * The implications of this are:
6456  *  a) Threaded:    nl_langinfo()'s return must be saved in a critical section
6457  *                  to avoid having another thread's call to it destroying the
6458  *                  result.  That means that the whole call to nl_langinfo()
6459  *                  plus the save must be done in a critical section.
6460  *  b) Unthreaded:  No critical section is needed (accomplished by having the
6461  *                  locks below be no-ops in this case).  But any subsequent
6462  *                  setlocale() or uselocale() could still destroy it.
6463  *                  Note that before returning, this function restores any
6464  *                  toggled locale categories.  These could easily end up
6465  *                  calling uselocale() or setlocale(), destroying our
6466  *                  result.  (And in some Configurations, this file currently
6467  *                  calls nl_langinfo_l() to determine if a uselocale() is
6468  *                  needed.)  So, a copy of the result is made in this case as
6469  *                  well.
6470  */
6471     const char * retval = NULL;
6472     utf8ness_t is_utf8 = UTF8NESS_UNKNOWN;
6473 
6474     /* Do a bit of extra work so avoid
6475      *  switch() { default: ... }
6476      * where the only case in it is the default: */
6477 #  if defined(USE_LOCALE_PAPER)                 \
6478    || defined(USE_LOCALE_MEASUREMENT)           \
6479    || defined(USE_LOCALE_ADDRESS)
6480 #    define IS_SWITCH  1
6481 #    define MAYBE_SWITCH(n)  switch(n)
6482 #  else
6483 #    define IS_SWITCH  0
6484 #    define MAYBE_SWITCH(n)
6485 #  endif
6486 
6487     GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
6488 
6489     MAYBE_SWITCH(item) {
6490 
6491 #  if defined(USE_LOCALE_MEASUREMENT)
6492 
6493       case _NL_MEASUREMENT_MEASUREMENT:
6494        {
6495         /* An ugly API; only the first byte of the returned char* address means
6496          * anything */
6497         gwLOCALE_LOCK;
6498         char char_value = nl_langinfo(item)[0];
6499         gwLOCALE_UNLOCK;
6500 
6501         sv_setuv(sv, char_value);
6502        }
6503 
6504         goto non_string_common;
6505 
6506 #  endif
6507 #  if defined(USE_LOCALE_ADDRESS) || defined(USE_LOCALE_PAPER)
6508 #    if defined(USE_LOCALE_ADDRESS)
6509 
6510       case _NL_ADDRESS_COUNTRY_NUM:
6511 
6512         /* Some glibc's return random values for this item and locale;
6513          * workaround by special casing it. */
6514         if (isNAME_C_OR_POSIX(locale)) {
6515             sv_setuv(sv, 0);
6516             goto non_string_common;
6517         }
6518 
6519         /* FALLTHROUGH */
6520 
6521 #    endif
6522 #    if defined(USE_LOCALE_PAPER)
6523 
6524       case _NL_PAPER_HEIGHT: case _NL_PAPER_WIDTH:
6525 
6526 #    endif
6527 
6528        {    /* A slightly less ugly API; the int portion of the returned char*
6529              * address is an integer. */
6530         gwLOCALE_LOCK;
6531         int int_value = (int) PTR2UV(nl_langinfo(item));
6532         gwLOCALE_UNLOCK;
6533 
6534         sv_setuv(sv, int_value);
6535        }
6536 
6537 #  endif
6538 #  if IS_SWITCH
6539 #    if defined(USE_LOCALE_MEASUREMENT)
6540 
6541        non_string_common:
6542 
6543 #    endif
6544 
6545         /* In all cases that get here, the char* instead delivers a numeric
6546          * value, so its UTF-8ness is meaningless */
6547         is_utf8 = UTF8NESS_IMMATERIAL;
6548 
6549         if (sv == PL_scratch_langinfo) {
6550             retval = SvPV_nomg_const_nolen(sv);
6551         }
6552 
6553         break;
6554 
6555       default:
6556 
6557 #  endif
6558 
6559         /* The rest of the possibilities deliver a true char* pointer to a
6560          * string (or sequence of strings in the case of ALT_DIGITS) */
6561         gwLOCALE_LOCK;
6562 
6563         retval = nl_langinfo(item);
6564         Size_t total_len = strlen(retval);
6565 
6566         /* Initialized only to silence some dumber compilers warning that
6567          * might be uninitialized */
6568         char separator = ';';
6569 
6570         if (UNLIKELY(item == ALT_DIGITS) && total_len > 0) {
6571 
6572             /* The return from nl_langinfo(ALT_DIGITS) is specified by the
6573              * 2017 POSIX Standard as a string consisting of "semicolon-
6574              * separated symbols. The first is the alternative symbol
6575              * corresponding to zero, the second is the symbol corresponding to
6576              * one, and so on.  Up to 100 alternative symbols may be
6577              * specified".  Infuriatingly, Linux does not follow this, and uses
6578              * the least C-language-friendly separator possible, the NUL.  In
6579              * case other platforms also violate the standard, the code below
6580              * looks for NUL and any graphic \W character as a potential
6581              * separator. */
6582             const char * sep_pos = strchr(retval, ';');
6583             if (! sep_pos) {
6584                 sep_pos = strpbrk(retval, " !\"#$%&'()*+,-./:<=>?@[\\]^_`{|}~");
6585             }
6586             if (sep_pos) {
6587                 separator = *sep_pos;
6588             }
6589             else if (strpbrk(retval, "123456789")) {
6590 
6591                 /* Alternate digits, with the possible exception of 0,
6592                  * shouldn't be standard digits, so if we get any back, return
6593                  * that there aren't alternate digits.  0 is an exception
6594                  * because there may be locales that do not have a zero, such
6595                  * as Roman numerals.  It could therefore be that alt-0 is 0,
6596                  * but alt-1 better be some multi-byte Unicode character(s)
6597                  * like U+2160, ROMAN NUMERAL ONE.  This clause is necessary
6598                  * because the total length of the ASCII digits won't trigger
6599                  * the conditional in the next clause that protects against
6600                  * non-Standard libc returns, such as in Alpine platforms, but
6601                  * multi-byte returns will trigger it */
6602                 retval = "";
6603                 total_len = 0;
6604             }
6605             else if (UNLIKELY(total_len >
6606                                         2 * UVCHR_SKIP(PERL_UNICODE_MAX) * 4))
6607             {   /* But as a check against the possibility that the separator is
6608                  * some other character, look at the length of the returned
6609                  * string.  If the separator is a NUL, the length will be just
6610                  * for the first NUL-terminated segment; if it is some other
6611                  * character, there is only a single segment with all returned
6612                  * alternate digits, which will be quite a bit longer than just
6613                  * the first one.  Many locales will always have a leading zero
6614                  * to represent 0-9 (hence the 2* in the conditional above).
6615                  * The conditional uses the worst case value of the most number
6616                  * of byte possible for a Unicode character, and it is possible
6617                  * that it requires several characters to represent a single
6618                  * value; hence the final multiplier.  This length represents a
6619                  * conservative upper limit of the number of bytes for the
6620                  * alternative representation of 00, but if the string
6621                  * represents even only the first 10 alternative digits, it
6622                  * will be much longer than that.  So to reach here, the
6623                  * separator must be some other byte. */
6624                 locale_panic_(Perl_form(aTHX_
6625                                         "Can't find separator in ALT_DIGITS"
6626                                         " representation '%s' for locale '%s'",
6627                                         _byte_dump_string((U8 *) retval,
6628                                                           total_len, 0),
6629                                         locale));
6630             }
6631             else {
6632                 separator = '\0';
6633 
6634                 /* Must be using NUL to separate the digits.  There are up to
6635                  * 100 of them.  Find the length of the entire sequence.
6636                  *
6637                  * The only way it could work if fewer is if it ends in two
6638                  * NULs.  khw has seen cases where there is no 2nd NUL on a 100
6639                  * digit return. */
6640                 const char * s = retval + total_len + 1;
6641 
6642                 for (unsigned int i = 1; i <= 99; i++) {
6643                     Size_t len = strlen(s) + 1;
6644                     total_len += len;
6645 
6646                     if (len == 1) {     /* Only a NUL */
6647                         break;
6648                     }
6649 
6650                     s += len;
6651                 }
6652             }
6653         }
6654 
6655         sv_setpvn(sv, retval, total_len);
6656 
6657         gwLOCALE_UNLOCK;
6658 
6659         /* Convert the ALT_DIGITS separator to a semi-colon if not already */
6660         if (UNLIKELY(item == ALT_DIGITS) && total_len > 0 && separator != ';') {
6661 
6662             /* Operate directly on the string in the SV */
6663             char * digit_string = SvPVX(sv);
6664             char * s = digit_string;
6665             char * e = s + total_len;
6666 
6667             do {
6668                 char * this_end = (char *) memchr(s, separator, total_len);
6669                 if (! this_end || this_end >= e) {
6670                     break;
6671                 }
6672 
6673                 *this_end = ';';
6674                 s = this_end;
6675             } while (1);
6676         }
6677 
6678         SvUTF8_off(sv);
6679         retval = SvPV_nomg_const_nolen(sv);
6680     }
6681 
6682     GCC_DIAG_RESTORE_STMT;
6683 
6684     restore_toggled_locale_i(cat_index, orig_switched_locale);
6685     end_DEALING_WITH_MISMATCHED_CTYPE(locale);
6686 
6687     if (utf8ness) {
6688         if (LIKELY(is_utf8 == UTF8NESS_UNKNOWN)) {  /* default: case above */
6689             is_utf8 = get_locale_string_utf8ness_i(retval,
6690                                                    LOCALE_UTF8NESS_UNKNOWN,
6691                                                    locale, cat_index);
6692         }
6693 
6694         *utf8ness = is_utf8;
6695 
6696         if (*utf8ness == UTF8NESS_YES) {
6697             SvUTF8_on(sv);
6698         }
6699     }
6700 
6701     return retval;
6702 }
6703 
6704 #  undef IS_SWITCH
6705 #  undef MAYBE_SWITCH
6706 #endif
6707 #ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
6708 
6709 /* Forward declaration of function that we don't put into embed.fnc so as to
6710  * make its removal easier, as there may not be any extant platforms that need
6711  * it; and the function is located after emulate_langinfo() because it's easier
6712  * to understand when placed in the context of that code */
6713 STATIC bool
6714 S_maybe_override_codeset(pTHX_ const char * codeset,
6715                                const char * locale,
6716                                const char ** new_codeset);
6717 #endif
6718 #if ! defined(HAS_NL_LANGINFO) || defined(HAS_MISSING_LANGINFO_ITEM_)
6719 
6720 STATIC const char *
S_emulate_langinfo(pTHX_ const PERL_INTMAX_T item,const char * locale,SV * sv,utf8ness_t * utf8ness)6721 S_emulate_langinfo(pTHX_ const PERL_INTMAX_T item,
6722                          const char * locale,
6723                          SV * sv,
6724                          utf8ness_t * utf8ness)
6725 {
6726     PERL_ARGS_ASSERT_EMULATE_LANGINFO;
6727     PERL_UNUSED_ARG(locale);    /* Too complicated to specify which
6728                                    Configurations use this vs which don't */
6729 
6730     /* This emulates nl_langinfo() on platforms:
6731      *   1) where it doesn't exist; or
6732      *   2) where it does exist, but there are categories that it shouldn't be
6733      *      called on because they don't exist on the platform or we are
6734      *      supposed to always stay in the C locale for them.  This function
6735      *      has hard-coded in the results for those for the C locale.
6736      *
6737      * This function returns a thread-safe result, valid until its next call
6738      * that uses the same 'sv'.  Similarly, the S_langinfo_sv_i() function
6739      * above does the same when nl_langinfo() is available.  Its comments
6740      * include a general description of the interface for both it and this
6741      * function.  That function should be the one called by code outside this
6742      * little group.  If it can't handle the request, it gets handed off to
6743      * this function.
6744      *
6745      * The major platform lacking nl_langinfo() is Windows.  It does have
6746      * GetLocaleInfoEx() that could be used to get most of the items, but it
6747      * (and other similar Windows API functions) use what MS calls "locale
6748      * names", whereas the C functions use what MS calls "locale strings".  The
6749      * locale string "English_United_States.1252" is equivalent to the locale
6750      * name "en_US".  There are tables inside Windows that translate between
6751      * the two forms, but they are not exposed.  Also calling setlocale(), then
6752      * calling GetThreadLocale() doesn't work, as the former doesn't change the
6753      * latter's return.  Therefore we are stuck using the mechanisms below. */
6754 
6755     /* Almost all the items will have ASCII return values.  Set that here, and
6756      * override if necessary */
6757     utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
6758     const char * retval = NULL;
6759 
6760     /* This function returns its result either by returning the calculated
6761      * value 'retval' if the 'sv' argument is PL_scratch_langinfo; or for any
6762      * other value of 'sv', it places the result into that 'sv'.  For some
6763      * paths through the code, it is more convenient, in the moment, to use one
6764      * or the other to hold the calculated result.  And, the calculation could
6765      * end up with the value in both places.  At the end, if the caller
6766      * wants the convenient result, we are done; but if it wants the opposite
6767      * type of value, it must be converted.  These macros are used to tell the
6768      * code at the end where the value got placed. */
6769 #  define RETVAL_IN_retval -1
6770 #  define RETVAL_IN_BOTH    0
6771 #  define RETVAL_IN_sv      1
6772 #  define isRETVAL_IN_sv(type)      ((type) >= RETVAL_IN_BOTH)
6773 #  define isRETVAL_IN_retval(type)  ((type) <= RETVAL_IN_BOTH)
6774 
6775     /* Most calculations place the result in 'retval', so initialize to that,
6776      * and override if necessary */
6777     int retval_type = RETVAL_IN_retval;
6778 
6779     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6780                         "Entering emulate_langinfo item=%jd, using locale %s\n",
6781                         item, locale));
6782 
6783 #  if   defined(HAS_LOCALECONV)                                         \
6784    && ! defined(HAS_SOME_LANGINFO)                                      \
6785    &&  (defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY))
6786 
6787     locale_category_index  cat_index;
6788     const char * localeconv_key;
6789     I32 localeconv_klen;
6790 
6791 #  endif
6792 
6793     GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
6794 
6795     switch (item) {
6796 
6797 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_MESSAGES_AVAIL_
6798 
6799       /* The following items have no way khw could figure out how to get except
6800        * via nl_langinfo() */
6801       case YESEXPR:   retval = "^[+1yY]"; break;
6802       case YESSTR:    retval = "yes";     break;
6803       case NOEXPR:    retval = "^[-0nN]"; break;
6804       case NOSTR:     retval = "no";      break;
6805 
6806 #  endif
6807 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_MONETARY_AVAIL_
6808 #    if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
6809 #      define NEED_USE_LOCALECONV
6810 
6811       case CRNCYSTR:
6812         cat_index = LC_MONETARY_INDEX_;
6813         localeconv_key = CURRENCY_SYMBOL_LITERAL;
6814         localeconv_klen = STRLENs(CURRENCY_SYMBOL_LITERAL);
6815         goto use_localeconv;
6816 
6817 #    else
6818 
6819       case CRNCYSTR:
6820 
6821         /* The locale's currency symbol may be empty.  But if not, the return
6822          * from nl_langinfo() prefixes it with a character that indicates where
6823          * in the monetary value the symbol is to be placed
6824          *  a) before, like $9.99
6825          *  b) middle, rare, but would like be 9$99
6826          *  c) after,  like 9.99USD
6827          *
6828          * The POSIX Standard permits an implementation to choose whether or
6829          * not to omit the prefix character if the symbol is empty (the
6830          * placement position is meaningless if there is nothing to place).
6831          * glibc has chosen to always prefix an empty symbol by a minus (which
6832          * is the prefix for 'before' positioning).  FreeBSD has chosen to
6833          * return an empty string for an empty symbol.  Perl has always
6834          * emulated the glibc way (probably with little thought). */
6835         retval = "-";
6836         break;
6837 
6838 #    endif
6839 #  endif
6840 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_NUMERIC_AVAIL_
6841 #    if defined(USE_LOCALE_NUMERIC) && defined(HAS_LOCALECONV)
6842 #      define NEED_USE_LOCALECONV
6843 
6844       case THOUSEP:
6845         cat_index = LC_NUMERIC_INDEX_;
6846         localeconv_key = THOUSANDS_SEP_LITERAL;
6847         localeconv_klen = STRLENs(THOUSANDS_SEP_LITERAL);
6848         goto use_localeconv;
6849 
6850 #    else
6851 
6852       case THOUSEP:
6853         retval = C_thousands_sep;
6854         break;
6855 
6856 #    endif
6857 
6858       case RADIXCHAR:
6859 
6860 #    if defined(USE_LOCALE_NUMERIC) && defined(HAS_STRTOD)
6861 
6862        {
6863         /* khw knows of only three possible radix characters used in the world.
6864          * By far the two most common are comma and dot.  We can use strtod()
6865          * to quickly check for those without without much fuss.  If it is
6866          * something other than those two, the code drops down and lets
6867          * localeconv() find it.
6868          *
6869          * We don't have to toggle LC_CTYPE here because all locales Perl
6870          * supports are compatible with ASCII, which the two possibilities are.
6871          * */
6872         const char * orig_switched_locale = toggle_locale_c(LC_NUMERIC, locale);
6873 
6874         /* Comma tried first in case strtod() always accepts dot regardless of
6875          * the locale */
6876         if (strtod("1,5", NULL) > 1.4) {
6877             retval = ",";
6878         }
6879         else if (strtod("1.5", NULL) > 1.4) {
6880             retval = ".";
6881         }
6882         else {
6883             retval = NULL;
6884         }
6885 
6886         restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
6887 
6888         if (retval) {
6889             break;
6890         }
6891        }
6892 
6893 #    endif  /* Trying strtod() */
6894 
6895         /* If gets to here, the strtod() method wasn't compiled, or it failed;
6896          * drop down.
6897          *
6898          * (snprintf() used to be used instead of strtod(), but it was removed
6899          * as being somewhat more clumsy, and maybe non-conforming on some
6900          * platforms.  But before resorting to localeconv(), the code that was
6901          * removed by the strtod commit could be inserted here.  This seems
6902          * unlikely to be wanted unless some really broken localeconv() shows
6903          * up) */
6904 
6905 #    if ! defined(USE_LOCALE_NUMERIC) || ! defined(HAS_LOCALECONV)
6906 
6907         retval = C_decimal_point;
6908         break;
6909 
6910 #    else
6911 #      define NEED_USE_LOCALECONV
6912 
6913         cat_index = LC_NUMERIC_INDEX_;
6914         localeconv_key = DECIMAL_POINT_LITERAL;
6915         localeconv_klen = STRLENs(DECIMAL_POINT_LITERAL);
6916 
6917 #    endif
6918 #  endif
6919 #  ifdef NEED_USE_LOCALECONV
6920 
6921     /* These items are available from localeconv(). */
6922 
6923    /* case RADIXCHAR:   // May drop down to here in some configurations
6924       case THOUSEP:     // Jumps to here
6925       case CRNCYSTR:    // Jumps to here */
6926       use_localeconv:
6927        {
6928 
6929         /* The hash gets populated with just the field(s) related to 'item'. */
6930         HV * result_hv = my_localeconv(item);
6931         SV* string = hv_delete(result_hv, localeconv_key, localeconv_klen, 0);
6932 
6933 #  ifdef USE_LOCALE_MONETARY
6934 
6935         if (item == CRNCYSTR) {
6936 
6937             /* CRNCYSTR localeconv() returns a slightly different value
6938              * than the nl_langinfo() API calls for, so have to modify this one
6939              * to conform.  We need another value from localeconv() to know
6940              * what to change it to.  my_localeconv() has populated the hash
6941              * with exactly both fields. */
6942             SV* precedes = hv_deletes(result_hv, P_CS_PRECEDES_LITERAL, 0);
6943             if (! precedes) {
6944                 locale_panic_("my_localeconv() unexpectedly didn't return"
6945                               " a value for " P_CS_PRECEDES_LITERAL);
6946             }
6947 
6948             /* The modification is to prefix the localeconv() return with a
6949              * single byte, calculated as follows: */
6950             const char * prefix = (LIKELY(SvIV(precedes) != -1))
6951                                    ? ((precedes != 0) ?  "-" : "+")
6952                                    : ".";
6953             /* (khw couldn't find any documentation that the dot is signalled
6954              * by CHAR_MAX (which we modify to -1), but cygwin uses it thusly,
6955              * and it makes sense given that CHAR_MAX indicates the value isn't
6956              * used, so it neither precedes nor succeeds) */
6957 
6958             /* Perform the modification */
6959             sv_insert(string, 0, 0, prefix, 1);
6960         }
6961 
6962 #  endif
6963 
6964         /* Here, 'string' contains the value we want to return, and the
6965          * hv_delete() has left it mortalized so its PV may be reused instead of
6966          * copied */
6967         sv_setsv_nomg(sv, string);
6968         retval_type = RETVAL_IN_sv;
6969 
6970         if (utf8ness) {
6971             is_utf8 = get_locale_string_utf8ness_i(SvPVX(sv),
6972                                                    LOCALE_UTF8NESS_UNKNOWN,
6973                                                    locale,
6974                                                    cat_index);
6975         }
6976 
6977         SvREFCNT_dec_NN(result_hv);
6978         break;
6979        }
6980 
6981 #  endif  /* Using localeconv() for something or other */
6982 #  undef NEED_USE_LOCALECONV
6983 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_CTYPE_AVAIL_
6984 #    ifndef USE_LOCALE_CTYPE
6985 
6986       case CODESET:
6987         retval = C_codeset;
6988         break;
6989 
6990 #    else
6991 
6992       case CODESET:
6993 
6994         /* The trivial case */
6995         if (isNAME_C_OR_POSIX(locale)) {
6996             retval = C_codeset;
6997             break;
6998         }
6999 
7000         /* If this happens to match our cached value */
7001         if (PL_in_utf8_CTYPE_locale && strEQ(locale, PL_ctype_name)) {
7002             retval = "UTF-8";
7003             break;
7004         }
7005 
7006 #      ifdef WIN32
7007 #        ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
7008 #          define CODE_PAGE_FORMAT  "%s"
7009 #          define CODE_PAGE_FUNCTION  nl_langinfo(CODESET)
7010 #        else
7011 #          define CODE_PAGE_FORMAT  "%d"
7012 
7013          /* This Windows function retrieves the code page.  It is subject to
7014           * change, but is documented, and has been stable for many releases */
7015 #          define CODE_PAGE_FUNCTION  ___lc_codepage_func()
7016 #        endif
7017 
7018         const char * orig_CTYPE_locale;
7019         orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
7020         Perl_sv_setpvf(aTHX_ sv, CODE_PAGE_FORMAT, CODE_PAGE_FUNCTION);
7021         retval_type = RETVAL_IN_sv;
7022 
7023         /* We just assume the codeset is ASCII; no need to check for it being
7024          * UTF-8 */
7025         SvUTF8_off(sv);
7026 
7027         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7028 
7029         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
7030                                                locale, SvPVX(sv)));
7031         break;
7032 
7033 #      else   /* Below is ! Win32 */
7034 
7035         /* The codeset is important, but khw did not figure out a way for it to
7036          * be retrieved on non-Windows boxes without nl_langinfo().  But even
7037          * if we can't get it directly, we can usually determine if it is a
7038          * UTF-8 locale or not.  If it is UTF-8, we (correctly) use that for
7039          * the code set. */
7040 
7041 #        ifdef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
7042 
7043         if (is_locale_utf8(locale)) {
7044             retval = "UTF-8";
7045             break;
7046         }
7047 
7048 #        endif
7049 
7050         /* Here, the code set has not been found.  The only other option khw
7051          * could think of is to see if the codeset is part of the locale name.
7052          * This is very less than ideal; often there is no code set in the
7053          * name; and at other times they even lie.
7054          *
7055          * But there is an XPG standard syntax, which many locales follow:
7056          *
7057          *    language[_territory[.codeset]][@modifier]
7058          *
7059          * So we take the part between the dot and any '@' */
7060         const char * name;
7061         name = strchr(locale, '.');
7062         if (! name) {
7063             retval = "";  /* Alas, no dot */
7064         }
7065         else {
7066 
7067             /* Don't include the dot */
7068             name++;
7069 
7070             /* The code set name is considered to be everything between the dot
7071              * and any '@', so stop before any '@' */
7072             const char * modifier = strchr(name, '@');
7073             if (modifier) {
7074                 sv_setpvn(sv, name, modifier - name);
7075             }
7076             else {
7077                 sv_setpv(sv, name);
7078             }
7079             SvUTF8_off(sv);
7080 
7081             retval_type = RETVAL_IN_sv;
7082         }
7083 
7084 #        ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
7085 
7086         /* Here, 'retval' contains any codeset name derived from the locale
7087          * name.  That derived name may be empty or not necessarily indicative
7088          * of the real codeset.  But we can often determine if it should be
7089          * UTF-8, regardless of what the name is.  On most platforms, that
7090          * determination is definitive, and was already done.  But for this
7091          * code to be compiled, this platform is not one of them.  However,
7092          * there are typically tools available to make a very good guess, and
7093          * knowing the derived codeset name improves the quality of that guess.
7094          * The following function overrides the derived codeset name when it
7095          * guesses that it actually should be UTF-8.  It could be inlined here,
7096          * but was moved out of this switch() so as to make the switch()
7097          * control flow easier to follow */
7098         if (isRETVAL_IN_sv(retval_type)) {
7099             retval = SvPVX_const(sv);
7100             retval_type = RETVAL_IN_BOTH;
7101         }
7102 
7103         if (S_maybe_override_codeset(aTHX_ retval, locale, &retval)) {
7104             retval_type = RETVAL_IN_retval;
7105         }
7106 
7107 #        endif
7108 
7109         break;
7110 
7111 #      endif    /* ! WIN32 */
7112 #    endif      /* USE_LOCALE_CTYPE */
7113 #  endif
7114 
7115    /* The _NL_foo items are mostly empty; the rest are copied from Ubuntu C
7116     * locale values.  khw fairly arbitrarily decided which of its non-empty
7117     * values to copy and which to change to empty.  All the numeric ones needed
7118     * some value */
7119 
7120 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_ADDRESS_AVAIL_
7121 
7122       case _NL_ADDRESS_POSTAL_FMT:
7123       case _NL_ADDRESS_COUNTRY_NAME:
7124       case _NL_ADDRESS_COUNTRY_POST:
7125       case _NL_ADDRESS_COUNTRY_AB2:
7126       case _NL_ADDRESS_COUNTRY_AB3:
7127       case _NL_ADDRESS_COUNTRY_CAR:
7128       case _NL_ADDRESS_COUNTRY_ISBN:
7129       case _NL_ADDRESS_LANG_NAME:
7130       case _NL_ADDRESS_LANG_AB:
7131       case _NL_ADDRESS_LANG_TERM:
7132       case _NL_ADDRESS_LANG_LIB:
7133         retval = "";
7134         break;
7135 
7136       case _NL_ADDRESS_COUNTRY_NUM:
7137         sv_setuv(sv, 0);
7138         retval_type = RETVAL_IN_sv;
7139         break;
7140 
7141 #  endif
7142 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_IDENTIFICATION_AVAIL_
7143 
7144       case _NL_IDENTIFICATION_ADDRESS:
7145       case _NL_IDENTIFICATION_CONTACT:
7146       case _NL_IDENTIFICATION_EMAIL:
7147       case _NL_IDENTIFICATION_TEL:
7148       case _NL_IDENTIFICATION_FAX:
7149       case _NL_IDENTIFICATION_LANGUAGE:
7150       case _NL_IDENTIFICATION_AUDIENCE:
7151       case _NL_IDENTIFICATION_APPLICATION:
7152       case _NL_IDENTIFICATION_ABBREVIATION:
7153         retval = "";
7154         break;
7155 
7156       case _NL_IDENTIFICATION_DATE:     retval = "1997-12-20"; break;
7157       case _NL_IDENTIFICATION_REVISION: retval = "1.0"; break;
7158       case _NL_IDENTIFICATION_CATEGORY: retval = "i18n:1999"; break;
7159       case _NL_IDENTIFICATION_TERRITORY:retval = "ISO"; break;
7160 
7161       case _NL_IDENTIFICATION_TITLE:
7162         retval = "ISO/IEC 14652 i18n FDCC-set";
7163         break;
7164 
7165       case _NL_IDENTIFICATION_SOURCE:
7166         retval = "ISO/IEC JTC1/SC22/WG20 - internationalization";
7167         break;
7168 
7169 #  endif
7170 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_MEASUREMENT_AVAIL_
7171 
7172       case _NL_MEASUREMENT_MEASUREMENT:
7173         sv_setuv(sv, 1);
7174         retval_type = RETVAL_IN_sv;
7175         break;
7176 
7177 #  endif
7178 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_NAME_AVAIL_
7179 
7180       case _NL_NAME_NAME_FMT:
7181       case _NL_NAME_NAME_GEN:
7182       case _NL_NAME_NAME_MR:
7183       case _NL_NAME_NAME_MRS:
7184       case _NL_NAME_NAME_MISS:
7185       case _NL_NAME_NAME_MS:
7186         retval = "";
7187         break;
7188 
7189 #  endif
7190 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_PAPER_AVAIL_
7191 
7192       case _NL_PAPER_HEIGHT:
7193         sv_setuv(sv, 297);
7194         retval_type = RETVAL_IN_sv;
7195         break;
7196 
7197       case _NL_PAPER_WIDTH:
7198         sv_setuv(sv, 210);
7199         retval_type = RETVAL_IN_sv;
7200         break;
7201 
7202 #  endif
7203 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_TELEPHONE_AVAIL_
7204 
7205       case _NL_TELEPHONE_INT_SELECT:
7206       case _NL_TELEPHONE_INT_PREFIX:
7207       case _NL_TELEPHONE_TEL_DOM_FMT:
7208         retval = "";
7209         break;
7210 
7211       case _NL_TELEPHONE_TEL_INT_FMT:
7212         retval = "+%c %a %l";
7213         break;
7214 
7215 #  endif
7216 
7217    /* When we have to emulate TIME-related items, this bit of code is compiled
7218     * to have the default: case be a nested switch() which distinguishes
7219     * between legal inputs and unknown ones.  This bit does initialization and
7220     * then at the end calls switch().  But when we aren't emulating TIME, by
7221     * the time we get to here all legal inputs have been handled above, and it
7222     * is cleaner to not have a nested switch().  So this bit of code is skipped
7223     * and the other-wise nested default: case is compiled as part of the outer
7224     * (and actually only) switch() */
7225 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_TIME_AVAIL_
7226 
7227       default:  /* Anything else that is legal is LC_TIME-related */
7228        {
7229 
7230         const char * format = NULL;
7231         retval = NULL;
7232 
7233 #    ifdef HAS_STRFTIME
7234 
7235         bool return_format = FALSE;
7236 
7237         /* Without strftime(), default compiled-in values are returned.
7238          * Otherwise, we generally compute a date as explained below.
7239          * Initialize default values for that computation */
7240         int mon = 0;
7241         int mday = 1;
7242         int hour = 6;
7243 
7244 #    endif
7245 
7246         /* Nested switch for LC_TIME items, plus the default: case is for
7247          * unknown items */
7248         switch (item) {
7249 
7250 #  endif    /* ! defined(HAS_SOME_LANGINFO) || ! LC_TIME_AVAIL_ */
7251 
7252           default:
7253 
7254             /* On systems with langinfo.h, 'item' is an enum.  If we don't
7255              * handle one of those, the code needs to change to be able to do
7256              * so.  But otherwise, the parameter can be any int, and so could
7257              * be a garbage value and all we can do is to return that it is
7258              * invalid. */;
7259 #  if defined(I_LANGINFO)
7260 
7261             Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %jd",
7262                                  item);
7263 
7264 #  else
7265             assert(item < 0);   /* Make sure using perl_langinfo.h */
7266             SET_EINVAL;
7267             retval = "";
7268             break;
7269 #  endif
7270 
7271    /* Back to the nested switch() */
7272 #  if ! defined(HAS_SOME_LANGINFO) || ! LC_TIME_AVAIL_
7273 
7274             /* The case: statments in this switch are all for LC_TIME related
7275              * values.  There are four types of values returned.  One type is
7276              * "Give me the name in this locale of the 3rd month of the year"
7277              * (March in an English locale).  The second main type is "Give me
7278              * the best format string understood by strftime(), like '%c', for
7279              * formatting the date and time in this locale."  The other two
7280              * types are for ERA and ALT_DIGITS, and are explained at the case
7281              * statements for them.
7282              *
7283              * For the first type, suppose we want to find the name of the 3rd
7284              * month of the year.  We pass a date/time to strftime() that is
7285              * known to evaluate to sometime in March, along with a format that
7286              * tells strftime() to return the month's name.  We then return
7287              * that to our caller.  Similarly for the names of the days of the
7288              * week, like "Tuesday".  There are also abbreviated versions for
7289              * each of these.
7290              *
7291              * To implement the second type (returning to the caller a string
7292              * containing a format suitable for passing to strftime() ) we
7293              * guess a format, pass that to strftime, and examine its return to
7294              * see if that format is known on this platform.  If so, we return
7295              * that guess.  Otherwise we return the empty string "".  There are
7296              * no second guesses, as there don't seem to be alternatives
7297              * lurking out there.  For some formats that are supposed to be
7298              * known to all strftime()s since C89, we just assume that they are
7299              * valid, not bothering to check.  The guesses may not be the best
7300              * available for this locale on this platform, but should be good
7301              * enough, so that a native speaker would find them understandable.
7302              * */
7303 
7304             /* Unimplemented by perl; for use with strftime() %E modifier */
7305           case ERA: retval = ""; break;
7306 
7307 #    if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7308 
7309           case AM_STR: retval = "AM"; break;
7310           case PM_STR: retval = "PM"; break;
7311 #    else
7312           case PM_STR: hour = 18;
7313           case AM_STR:
7314             format = "%p";
7315             break;
7316 #    endif
7317 #    if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7318 
7319           case ABDAY_1: retval = "Sun"; break;
7320           case ABDAY_2: retval = "Mon"; break;
7321           case ABDAY_3: retval = "Tue"; break;
7322           case ABDAY_4: retval = "Wed"; break;
7323           case ABDAY_5: retval = "Thu"; break;
7324           case ABDAY_6: retval = "Fri"; break;
7325           case ABDAY_7: retval = "Sat"; break;
7326 #    else
7327           case ABDAY_7: mday++;
7328           case ABDAY_6: mday++;
7329           case ABDAY_5: mday++;
7330           case ABDAY_4: mday++;
7331           case ABDAY_3: mday++;
7332           case ABDAY_2: mday++;
7333           case ABDAY_1:
7334             format = "%a";
7335             break;
7336 #    endif
7337 #    if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7338 
7339           case DAY_1: retval = "Sunday";    break;
7340           case DAY_2: retval = "Monday";    break;
7341           case DAY_3: retval = "Tuesday";   break;
7342           case DAY_4: retval = "Wednesday"; break;
7343           case DAY_5: retval = "Thursday";  break;
7344           case DAY_6: retval = "Friday";    break;
7345           case DAY_7: retval = "Saturday";  break;
7346 #    else
7347           case DAY_7: mday++;
7348           case DAY_6: mday++;
7349           case DAY_5: mday++;
7350           case DAY_4: mday++;
7351           case DAY_3: mday++;
7352           case DAY_2: mday++;
7353           case DAY_1:
7354             format = "%A";
7355             break;
7356 #    endif
7357 #    if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7358           case ABMON_1:  retval = "Jan"; break;
7359           case ABMON_2:  retval = "Feb"; break;
7360           case ABMON_3:  retval = "Mar"; break;
7361           case ABMON_4:  retval = "Apr"; break;
7362           case ABMON_5:  retval = "May"; break;
7363           case ABMON_6:  retval = "Jun"; break;
7364           case ABMON_7:  retval = "Jul"; break;
7365           case ABMON_8:  retval = "Aug"; break;
7366           case ABMON_9:  retval = "Sep"; break;
7367           case ABMON_10: retval = "Oct"; break;
7368           case ABMON_11: retval = "Nov"; break;
7369           case ABMON_12: retval = "Dec"; break;
7370 #    else
7371           case ABMON_12: mon++;
7372           case ABMON_11: mon++;
7373           case ABMON_10: mon++;
7374           case ABMON_9:  mon++;
7375           case ABMON_8:  mon++;
7376           case ABMON_7:  mon++;
7377           case ABMON_6:  mon++;
7378           case ABMON_5:  mon++;
7379           case ABMON_4:  mon++;
7380           case ABMON_3:  mon++;
7381           case ABMON_2:  mon++;
7382           case ABMON_1:
7383             format = "%b";
7384             break;
7385 #    endif
7386 #    if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7387 
7388           case MON_1:  retval = "January";  break;
7389           case MON_2:  retval = "February"; break;
7390           case MON_3:  retval = "March";    break;
7391           case MON_4:  retval = "April";    break;
7392           case MON_5:  retval = "May";      break;
7393           case MON_6:  retval = "June";     break;
7394           case MON_7:  retval = "July";     break;
7395           case MON_8:  retval = "August";   break;
7396           case MON_9:  retval = "September";break;
7397           case MON_10: retval = "October";  break;
7398           case MON_11: retval = "November"; break;
7399           case MON_12: retval = "December"; break;
7400 #    else
7401           case MON_12: mon++;
7402           case MON_11: mon++;
7403           case MON_10: mon++;
7404           case MON_9:  mon++;
7405           case MON_8:  mon++;
7406           case MON_7:  mon++;
7407           case MON_6:  mon++;
7408           case MON_5:  mon++;
7409           case MON_4:  mon++;
7410           case MON_3:  mon++;
7411           case MON_2:  mon++;
7412           case MON_1:
7413             format = "%B";
7414             break;
7415 #    endif
7416 #    ifndef HAS_STRFTIME
7417 
7418           /* If no strftime() on this system, no format will be recognized, so
7419            * return empty */
7420           case D_FMT:  case T_FMT:  case D_T_FMT:
7421           case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT:
7422           case T_FMT_AMPM:
7423             retval = "";
7424             break;
7425 #    else
7426           /* These strftime formats are defined by C89, so we assume that
7427            * strftime supports them, and so are returned unconditionally; they
7428            * may not be what the locale actually says, but should give good
7429            * enough results for someone using them as formats (as opposed to
7430            * trying to parse them to figure out what the locale says).  The
7431            * other format items are actually tested to verify they work on the
7432            * platform */
7433           case D_FMT:   retval = "%x"; break;
7434           case T_FMT:   retval = "%X"; break;
7435           case D_T_FMT: retval = "%c"; break;
7436 
7437           /* This format isn't in C89; test that it actually works on the
7438            * platform */
7439           case T_FMT_AMPM:
7440             format = "%r";
7441             return_format = TRUE;
7442             break;
7443 
7444 #      if defined(WIN32) || ! defined(USE_LOCALE_TIME)
7445 
7446           /* strftime() on Windows doesn't have the POSIX (beyond C89)
7447            * extensions that would allow it to recover these, so use the plain
7448            * non-ERA formats.  Also, when LC_TIME is constrained to the C
7449            * locale, the %E modifier is useless, so don't return it. */
7450           case ERA_D_FMT:   retval = "%x"; break;
7451           case ERA_T_FMT:   retval = "%X"; break;
7452           case ERA_D_T_FMT: retval = "%c"; break;
7453 #      else
7454           case ERA_D_FMT:
7455             format = "%Ex";
7456             return_format = TRUE;   /* Test that this works on the platform */
7457             break;
7458 
7459           case ERA_T_FMT:
7460             format = "%EX";
7461             return_format = TRUE;
7462             break;
7463 
7464           case ERA_D_T_FMT:
7465             format = "%Ec";
7466             return_format = TRUE;
7467             break;
7468 #      endif
7469 #    endif
7470 #    if defined(WIN32) || ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7471 
7472           case ALT_DIGITS: retval = ""; break;
7473 #    else
7474 #      define CAN_BE_ALT_DIGITS
7475 
7476           case ALT_DIGITS:
7477             format = "%Ow"; /* Find the alternate digit for 0 */
7478             break;
7479 #    endif
7480 
7481         } /* End of inner switch() */
7482 
7483         /* The inner switch() above has set 'retval' iff that is the final
7484          * answer */
7485         if (retval) {
7486             break;
7487         }
7488 
7489         /* And it hasn't set 'format' iff it can't figure out a good value on
7490          * this platform. */
7491         if (! format) {
7492             retval = "";
7493             break;
7494         }
7495 
7496 #    ifdef HAS_STRFTIME
7497 
7498         /* Here we have figured out what to call strftime() with */
7499 
7500         struct tm  mytm;
7501         const char * orig_TIME_locale
7502                             = toggle_locale_c_unless_locking(LC_TIME, locale);
7503 
7504         /* The year was deliberately chosen so that January 1 is on the
7505         * first day of the week.  Since we're only getting one thing at a
7506         * time, it all works */
7507         ints_to_tm(&mytm, locale, 30, 30, hour, mday, mon, 2011, 0, 0, 0);
7508         bool succeeded;
7509         if (utf8ness) {
7510             succeeded = strftime8(format,
7511                                   sv,
7512                                   locale,
7513                                   &mytm,
7514 
7515                                   /* All possible formats specified above are
7516                                    * entirely ASCII */
7517                                   UTF8NESS_IMMATERIAL,
7518 
7519                                   &is_utf8,
7520                                   false    /* not calling from sv_strftime */
7521                               );
7522         }
7523         else {
7524             succeeded = strftime_tm(format, sv, locale, &mytm);
7525         }
7526 
7527         restore_toggled_locale_c_unless_locking(LC_TIME, orig_TIME_locale);
7528 
7529         if (UNLIKELY(! succeeded)) {
7530             retval = "";
7531             break;
7532         }
7533 
7534 #      ifdef CAN_BE_ALT_DIGITS
7535 
7536         if (LIKELY(item != ALT_DIGITS))
7537 
7538 #      endif
7539 
7540         {
7541 
7542             /* If to return what strftime() returns, are done */
7543             if (! return_format) {
7544                 retval_type = RETVAL_IN_sv;
7545                 break;
7546             }
7547 
7548             /* Here are to return the format, not the value.  This is used when
7549              * we are testing if the format we expect to return is legal on
7550              * this platform.  We have passed the format, say "%r, to
7551              * strftime(), and now have in 'sv' what strftime processed it
7552              * to be.  But the caller doesnt't want that; it wants the actual
7553              * %r, if it is understood on this platform, and "" if it isn't.
7554              * Some strftime()s return "" for an unknown format.  (None of the
7555              * formats exposed by langinfo can have "" be a legal result.)
7556              * Other strftime()s return the format unchanged if not understood.
7557              * So if we pass "%r" to strftime(), and it's illegal, we will get
7558              * back either "" or "%r", and we return "" to our caller.  If the
7559              * strftime() return is anything else, we conclude that "%r" is
7560              * understood by the platform, and return "%r". */
7561             if (strEQ(SvPVX(sv), format)) {
7562                 retval = "";
7563             }
7564             else {
7565                 retval = format;
7566             }
7567 
7568             /* A format is always in ASCII */
7569             is_utf8 = UTF8NESS_IMMATERIAL;
7570 
7571             break;
7572         }
7573 
7574 #      ifdef CAN_BE_ALT_DIGITS
7575 
7576         /* Here, the item is 'ALT_DIGITS' and 'sv' contains the zeroth
7577          * alternate digit.  If empty, return that there aren't alternate
7578          * digits */
7579         Size_t alt0_len = SvCUR(sv);
7580         if (alt0_len == 0) {
7581             retval_type = RETVAL_IN_sv;
7582             break;
7583         }
7584 
7585         /* ALT_DIGITS requires special handling because it requires up to 100
7586          * values.  Below we generate those by using the %O modifier to
7587          * strftime() formats.
7588          *
7589          * We already have the alternate digit for zero in 'sv', generated
7590          * using the %Ow format, which was used because it seems least likely
7591          * to have a leading zero.  But some locales return the equivalent of
7592          * 00 anyway.  If the first half of 'sv' is identical to the second
7593          * half, assume that is the case, and use just the first half */
7594         if ((alt0_len & 1) == 0) {
7595             Size_t half_alt0_len = alt0_len / 2;
7596             if (strnEQ(SvPVX(sv), SvPVX(sv) + half_alt0_len, half_alt0_len)) {
7597                 alt0_len = half_alt0_len;
7598                 SvCUR_set(sv, alt0_len);
7599             }
7600         }
7601 
7602         sv_catpvn_nomg (sv, ";", 1);
7603 
7604         /* Many of the remaining digits have representations that include at
7605          * least two 0-sized strings */
7606         SV* alt_dig_sv = newSV(2 * alt0_len);
7607 
7608         /* Various %O formats can be used to derive the alternate digits.  Only
7609          * %Oy can go up to the full 100 values.  If it doesn't work, we try
7610          * various fallbacks in decreasing order of how many values they can
7611          * deliver.  maxes[] tells the highest value that the format applies
7612          * to; offsets[] compensates for 0-based vs 1-based indices; and vars[]
7613          * holds what field in the 'struct tm' to applies to the corresponding
7614          * format */
7615         int year, min, sec;
7616       const char  * fmts[] = {"%Oy", "%OM", "%OS", "%Od", "%OH", "%Om", "%Ow" };
7617       const Size_t maxes[] = {  99,    59,    59,    31,    23,    11,    6   };
7618       const int  offsets[] = {   0,     0,     0,     1,     0,     1,    0   };
7619       int         * vars[] = {&year,  &min,  &sec,  &mday, &hour, &mon, &mday };
7620         Size_t j = 0;   /* Current index into the above tables */
7621 
7622         orig_TIME_locale = toggle_locale_c_unless_locking(LC_TIME, locale);
7623 
7624         for (unsigned int i = 1; i <= 99; i++) {
7625             struct tm  mytm;
7626 
7627           redo:
7628             if (j >= C_ARRAY_LENGTH(fmts)) {
7629                 break;  /* Exhausted formats early; can't continue */
7630             }
7631 
7632             if (i > maxes[j]) {
7633                 j++;    /* Exhausted this format; try next one */
7634                 goto redo;
7635             }
7636 
7637             year = (strchr(fmts[j], 'y')) ? 1900 : 2011;
7638             hour = 0;
7639             min = 0;
7640             sec = 0;
7641             mday = 1;
7642             mon = 0;
7643 
7644             /* Change the variable corresponding to this format to the
7645             * current time being run in 'i' */
7646             *(vars[j]) += i - offsets[j];
7647 
7648             /* Do the strftime.  Once we have determined the UTF8ness (if
7649             * we want it), assume the rest will be the same, and use
7650             * strftime_tm(), which doesn't recalculate UTF8ness */
7651             ints_to_tm(&mytm, locale, sec, min, hour, mday, mon, year, 0, 0, 0);
7652             if (utf8ness && is_utf8 != UTF8NESS_NO && is_utf8 != UTF8NESS_YES) {
7653                 succeeded = strftime8(fmts[j],
7654                                       alt_dig_sv,
7655                                       locale,
7656                                       &mytm,
7657                                       UTF8NESS_IMMATERIAL,
7658                                       &is_utf8,
7659                                       false   /* not calling from sv_strftime */
7660                                      );
7661             }
7662             else {
7663                 succeeded = strftime_tm(fmts[j], alt_dig_sv, locale, &mytm);
7664             }
7665 
7666             /* If didn't recognize this format, try the next */
7667             if (UNLIKELY(! succeeded)) {
7668                 j++;
7669                 goto redo;
7670             }
7671 
7672             const char * current = SvPVX(alt_dig_sv);
7673 
7674             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7675                                 "i=%d, format=%s, alt='%s'\n",
7676                                 i, fmts[j], current));
7677 
7678 
7679             /* If it returned regular digits, give up on this format, to try
7680              * the next candidate one */
7681             if (strpbrk(current, "0123456789")) {
7682                 j++;
7683                 goto redo;
7684             }
7685 
7686             /* If there is a leading alternate zero, skip past it, to get the
7687              * second one in the string.  The first 'alt0_len' bytes in 'sv'
7688              * will be the alternate-zero representation */
7689             if (strnEQ(current, SvPVX(sv), alt0_len)) {
7690                 current += alt0_len;
7691             }
7692 
7693             /* Append this number to the ongoing list, including the separator.
7694              * */
7695             sv_catpv_nomg (sv, current);
7696             sv_catpvn_nomg (sv, ";", 1);
7697         } /* End of loop generating ALT_DIGIT strings */
7698 
7699         /* Above we accepted 0 for alt-0 in case the locale doesn't have a
7700          * zero, but we rejected any other ASCII digits.  Now that we have
7701          * processed everything, if that 0 is the only thing we found, it was a
7702          * false positive, and the locale doesn't have alternate digits */
7703         if (SvCUR(sv) == alt0_len + 1) {
7704             SvCUR_set(sv, 0);
7705         }
7706 
7707         SvREFCNT_dec_NN(alt_dig_sv);
7708 
7709         restore_toggled_locale_c_unless_locking(LC_TIME, orig_TIME_locale);
7710 
7711         retval_type = RETVAL_IN_sv;
7712         break;
7713 
7714 #      endif    /* End of CAN_BE_ALT_DIGITS */
7715 #    endif      /* End of HAS_STRFTIME */
7716 
7717        }    /* End of braced group for outer switch 'default:' case */
7718 
7719 #  endif
7720 
7721     } /* Giant switch() of nl_langinfo() items */
7722 
7723     GCC_DIAG_RESTORE_STMT;
7724 
7725     if (sv != PL_scratch_langinfo) {    /* Caller wants return in 'sv' */
7726         if (! isRETVAL_IN_sv(retval_type)) {
7727             sv_setpv(sv, retval);
7728             SvUTF8_off(sv);
7729         }
7730 
7731         if (utf8ness) {
7732             *utf8ness = is_utf8;
7733             if (is_utf8 == UTF8NESS_YES) {
7734                 SvUTF8_on(sv);
7735             }
7736         }
7737 
7738         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7739                          "Leaving emulate_langinfo item=%jd, using locale %s\n",
7740                          item, locale));
7741 
7742         /* The caller shouldn't also be wanting a 'retval'; make sure segfaults
7743          * if they call this wrong */
7744         return NULL;
7745     }
7746 
7747     /* Here, wants a 'retval' return.  Extract that if not already there. */
7748     if (! isRETVAL_IN_retval(retval_type)) {
7749         retval = SvPV_nolen(sv);
7750     }
7751 
7752     /* Here, 'retval' started as a simple value, or has been converted into
7753      * being simple */
7754     if (utf8ness) {
7755         *utf8ness = is_utf8;
7756     }
7757 
7758     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7759                          "Leaving emulate_langinfo item=%jd, using locale %s\n",
7760                          item, locale));
7761     return retval;
7762 
7763 #  undef RETVAL_IN_retval
7764 #  undef RETVAL_IN_BOTH
7765 #  undef RETVAL_IN_sv
7766 #  undef isRETVAL_IN_sv
7767 #  undef isRETVAL_IN_retval
7768 
7769 }
7770 
7771 #endif      /* Needs emulate_langinfo() */
7772 #ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
7773 
7774 STATIC bool
S_maybe_override_codeset(pTHX_ const char * codeset,const char * locale,const char ** new_codeset)7775 S_maybe_override_codeset(pTHX_ const char * codeset,
7776                                const char * locale,
7777                                const char ** new_codeset)
7778 {
7779 #  define NAME_INDICATES_UTF8       0x1
7780 #  define MB_CUR_MAX_SUGGESTS_UTF8  0x2
7781 
7782     /* Override 'codeset' with UTF-8 if this routine guesses that it should be.
7783      * Conversely (but rarely), "UTF-8" in the locale name might be wrong.  We
7784      * return "" as the code set name if we find that to be the case.  */
7785 
7786     unsigned int lean_towards_being_utf8 = 0;
7787     if (is_codeset_name_UTF8(codeset)) {
7788         lean_towards_being_utf8 |= NAME_INDICATES_UTF8;
7789     }
7790 
7791     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
7792 
7793     /* For this portion of the file to compile, some C99 functions aren't
7794      * available to us, even though we now require C99.  So, something must be
7795      * wrong with them.  The code here should be good enough to work around
7796      * this issue, but should the need arise, comments in S_is_locale_utf8()
7797      * list some alternative C99 functions that could be tried.
7798      *
7799      * But MB_CUR_MAX is a C89 construct that helps a lot, is simple for a
7800      * vendor to implement, and our experience with it is that it works well on
7801      * a variety of platforms.  We have found that it returns a too-large
7802      * number on some platforms for the C locale, but for no others.  That
7803      * locale was already ruled out in the code that called this function.  (If
7804      * MB_CUR_MAX returned too small a number, that would break a lot of
7805      * things, and likely would be quickly corrected by the vendor.)  khw has
7806      * some confidence that it doesn't return >1 when 1 is meant, as that would
7807      * trigger a Perl warning, and we've had no reports of invalid occurrences
7808      * of such. */
7809 #  ifdef MB_CUR_MAX
7810 
7811     /* If there are fewer bytes available in this locale than are required to
7812      * represent the largest legal UTF-8 code point, this definitely isn't a
7813      * UTF-8 locale, even if the locale name says it is. */
7814     const int mb_cur_max = MB_CUR_MAX;
7815     if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
7816         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7817 
7818         if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
7819             *new_codeset = "";    /* The name is wrong; override */
7820             return true;
7821         }
7822 
7823         return false;
7824     }
7825 
7826     /* But if the locale could be UTF-8, and also the name corroborates this,
7827      * assume it is so */
7828     if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
7829         restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7830         return false;
7831     }
7832 
7833     restore_toggled_locale_c_if_locking(LC_CTYPE, orig_CTYPE_locale);
7834 
7835     /* Here, the name doesn't indicate UTF-8, but MB_CUR_MAX indicates it could
7836      * be.  khw knows of only two other locales in the world, EUC-TW and GB
7837      * 18030, that legitimately require this many bytes (4).  So, if the name
7838      * is one of those, MB_CUR_MAX has corroborated that. */
7839     bool name_implies_non_utf8 = false;
7840     if (foldEQ(codeset, "GB", 2)) {
7841         const char * s = codeset + 2;
7842         if (*s == '-' || *s == '_') {
7843             s++;
7844         }
7845 
7846         if strEQ(s, "18030") {
7847             name_implies_non_utf8 = true;
7848         }
7849     }
7850     else if (foldEQ(codeset, "EUC", 3)) {
7851         const char * s = codeset + 3;
7852         if (*s == '-' || *s == '_') {
7853             s++;
7854         }
7855 
7856         if (foldEQ(s, "TW", 2)) {
7857             name_implies_non_utf8 = true;
7858         }
7859     }
7860 
7861     /* Otherwise, the locale is likely UTF-8 */
7862     if (! name_implies_non_utf8) {
7863         lean_towards_being_utf8 |= MB_CUR_MAX_SUGGESTS_UTF8;
7864     }
7865 
7866     /* (In both those two other multibyte locales, the single byte characters
7867      * are the same as ASCII.  No multi-byte character in EUC-TW is legal UTF-8
7868      * (since the first byte of each is a continuation).  GB 18030 has no three
7869      * byte sequences, and none of the four byte ones is legal UTF-8 (as the
7870      * second byte for these is a non-continuation).  But every legal UTF-8 two
7871      * byte sequence is also legal in GB 18030, though none have the same
7872      * meaning, and no Han code point expressed in UTF-8 is two byte.  So the
7873      * further tests below which look for native expressions of currency and
7874      * time will not return two byte sequences, hence they will reliably rule
7875      * out such a locale as being UTF-8, even if the code set name checked
7876      * above isn't correct.) */
7877 
7878 #  endif    /* has MB_CUR_MAX */
7879 
7880     /* Here, MB_CUR_MAX is not available, or was inconclusive.  What we do is
7881      * to look at various strings associated with the locale:
7882      *  1)  If any are illegal UTF-8, the locale can't be UTF-8.
7883      *  2)  If all are legal UTF-8, and some non-ASCII characters are present,
7884      *      it is likely to be UTF-8, because of the strictness of UTF-8
7885      *      syntax. So assume it is UTF-8
7886      *  3)  If all are ASCII and the locale name and/or MB_CUR_MAX indicate
7887      *      UTF-8, assume the locale is UTF-8.
7888      *  4)  Otherwise, assume the locale isn't UTF-8
7889      *
7890      * To save cycles, if the locale name indicates it is a UTF-8 locale, we
7891      * stop looking at the first instance with legal non-ASCII UTF-8.  It is
7892      * very unlikely this combination is coincidental. */
7893 
7894     utf8ness_t strings_utf8ness = UTF8NESS_UNKNOWN;
7895 
7896     /* List of strings to look at */
7897     const int trials[] = {
7898 
7899 #  if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
7900 
7901         /* The first string tried is the locale currency name.  Often that will
7902          * be in the native script.
7903          *
7904          * But this is usable only if localeconv() is available, as that's the
7905          * way we find out the currency symbol. */
7906 
7907         CRNCYSTR,
7908 
7909 #  endif
7910 #  ifdef USE_LOCALE_TIME
7911 
7912     /* We can also try various strings associated with LC_TIME, like the names
7913      * of months or days of the week */
7914 
7915         DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
7916         MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
7917                                     MON_9, MON_10, MON_11, MON_12,
7918         ALT_DIGITS, AM_STR, PM_STR,
7919         ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, ABDAY_7,
7920         ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
7921         ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
7922 
7923 #  endif
7924 
7925     };
7926 
7927 #  ifdef USE_LOCALE_TIME
7928 
7929     /* The code in the recursive call below can handle switching the locales,
7930      * but by doing it now here, that code will check and discover that there
7931      * is no need to switch then restore, avoiding those each loop iteration.
7932      *
7933      * But don't do this if toggling actually creates a critical section, so as
7934      * to minimize the amount of time spent in each critical section. */
7935     const char * orig_TIME_locale =
7936                                 toggle_locale_c_unless_locking(LC_TIME, locale);
7937 
7938 #  endif
7939 
7940     /* The trials array may consist of strings from two different locale
7941      * categories.  The call to langinfo_i() below needs to pass the proper
7942      * category for each string.  There is a max of 1 trial for LC_MONETARY;
7943      * the rest are LC_TIME.  So the array is arranged so the LC_MONETARY item
7944      * (if any) is first, and all subsequent iterations will use LC_TIME.
7945      * These #ifdefs set up the values for all possible combinations. */
7946 #  if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
7947 
7948     locale_category_index  cat_index = LC_MONETARY_INDEX_;
7949 
7950 #    ifdef USE_LOCALE_TIME
7951 
7952     const locale_category_index  follow_on_cat_index = LC_TIME_INDEX_;
7953     assert(trials[1] == DAY_1); /* Make sure only a single non-time entry */
7954 
7955 #    else
7956 
7957     /* Effectively out-of-bounds, as there is only the monetary entry */
7958     const locale_category_index  follow_on_cat_index = LC_ALL_INDEX_;
7959 
7960 #    endif
7961 #  elif defined(USE_LOCALE_TIME)
7962 
7963     locale_category_index  cat_index = LC_TIME_INDEX_;
7964     const locale_category_index  follow_on_cat_index = LC_TIME_INDEX_;
7965 
7966 #  else
7967 
7968     /* Effectively out-of-bounds, as here there are no trial entries at all.
7969      * This allows this code to compile, but there are no strings to test, and
7970      * so the answer will always be non-UTF-8. */
7971     locale_category_index  cat_index = LC_ALL_INDEX_;
7972     const locale_category_index  follow_on_cat_index = LC_ALL_INDEX_;
7973 
7974 #  endif
7975 
7976     /* We will need to use the reentrant interface. */
7977     SV * sv = newSVpvs("");
7978 
7979     /* Everything set up; look through all the strings */
7980     for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(trials); i++) {
7981 
7982         /* To prevent infinite recursive calls, we don't ask for the UTF-8ness
7983          * of the string.  Instead we examine the result below */
7984         langinfo_sv_i(trials[i], cat_index, locale, sv, NULL);
7985 
7986         cat_index = follow_on_cat_index;
7987 
7988         const char * result = SvPVX(sv);
7989         const Size_t len = strlen(result);
7990         const U8 * first_variant;
7991 
7992         /* If the string is identical whether or not it is encoded as UTF-8, it
7993          * isn't helpful in determining UTF8ness. */
7994         if (is_utf8_invariant_string_loc((U8 *) result, len, &first_variant))
7995         {
7996             continue;
7997         }
7998 
7999         /* Here, has non-ASCII.  If not legal UTF-8, isn't a UTF-8 locale */
8000         if (! is_strict_utf8_string(first_variant,
8001                                     len - (first_variant - (U8 *) result)))
8002         {
8003             strings_utf8ness = UTF8NESS_NO;
8004             break;
8005         }
8006 
8007         /* Here, is a legal non-ASCII UTF-8 string; tentatively set the return
8008          * to YES; possibly overridden by later iterations */
8009         strings_utf8ness = UTF8NESS_YES;
8010 
8011         /* But if this corroborates our expectation, quit now */
8012         if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
8013             break;
8014         }
8015     }
8016 
8017 #  ifdef USE_LOCALE_TIME
8018 
8019     restore_toggled_locale_c_unless_locking(LC_TIME, orig_TIME_locale);
8020 
8021 #  endif
8022 
8023     restore_toggled_locale_c_unless_locking(LC_CTYPE, orig_CTYPE_locale);
8024 
8025     if (strings_utf8ness == UTF8NESS_NO) {
8026         return false;     /* No override */
8027     }
8028 
8029     /* Here all tested strings are legal UTF-8.
8030      *
8031      * Above we set UTF8NESS_YES if any string wasn't ASCII.  But even if they
8032      * are all ascii, and the locale name indicates it is a UTF-8 locale,
8033      * assume the locale is UTF-8. */
8034     if (lean_towards_being_utf8) {
8035         strings_utf8ness = UTF8NESS_YES;
8036     }
8037 
8038     if (strings_utf8ness == UTF8NESS_YES) {
8039         *new_codeset = "UTF-8";
8040         return true;
8041     }
8042 
8043     /* Here, nothing examined indicates that the codeset is or isn't UTF-8.
8044      * But what is it?  The other locale categories are not likely to be of
8045      * further help:
8046      *
8047      * LC_NUMERIC   Only a few locales in the world have a non-ASCII radix or
8048      *              group separator.
8049      * LC_CTYPE     This code wouldn't be compiled if mbtowc() existed and was
8050      *              reliable.  This is unlikely in C99.  There are other
8051      *              functions that could be used instead, but are they going to
8052      *              exist, and be able to distinguish between UTF-8 and 8859-1?
8053      *              Deal with this only if it becomes necessary.
8054      * LC_MESSAGES  The strings returned from strerror() would seem likely
8055      *              candidates, but experience has shown that many systems
8056      *              don't actually have translations installed for them.  They
8057      *              are instead always in English, so everything in them is
8058      *              ASCII, which is of no help to us.  A Configure probe could
8059      *              possibly be written to see if this platform has non-ASCII
8060      *              error messages.  But again, wait until it turns out to be
8061      *              an actual problem.
8062      *
8063      *              Things like YESSTR, NOSTR, might not be in ASCII, but need
8064      *              nl_langinfo() to access, which we don't have.
8065      */
8066 
8067     /* Otherwise, assume the locale isn't UTF-8.  This can be wrong if we don't
8068      * have MB_CUR_MAX, and the locale is English without UTF-8 in its name,
8069      * and with a dollar currency symbol. */
8070     return false;     /* No override */
8071 }
8072 
8073 #  endif /* ! HAS_DEFINITIVE_UTF8NESS_DETERMINATION */
8074 
8075 /*
8076 =for apidoc_section $time
8077 =for apidoc      sv_strftime_tm
8078 =for apidoc_item my_strftime
8079 
8080 These implement the libc strftime().
8081 
8082 On failure, they return NULL, and set C<errno> to C<EINVAL>.
8083 
8084 C<sv_strftime_tm> is preferred, as it transparently handles the UTF-8ness of
8085 the current locale, the input C<fmt>, and the returned result.  Only if the
8086 current C<LC_TIME> locale is a UTF-8 one (and S<C<use bytes>> is not in effect)
8087 will the result be marked as UTF-8.
8088 
8089 C<sv_strftime_tm> takes a pointer to a filled-in S<C<struct tm>> parameter.  It
8090 ignores the values of the C<wday> and C<yday> fields in it.  The other fields
8091 give enough information to accurately calculate these values, and are used for
8092 that purpose.
8093 
8094 The caller assumes ownership of the returned SV with a reference count of 1.
8095 
8096 C<my_strftime> is kept for backwards compatibility.  Knowing if its result
8097 should be considered UTF-8 or not requires significant extra logic.
8098 
8099 The return value is a pointer to the formatted result (which MUST be arranged
8100 to be FREED BY THE CALLER).  This allows this function to increase the buffer
8101 size as needed, so that the caller doesn't have to worry about that, unlike
8102 libc C<strftime()>.
8103 
8104 The C<wday>, C<yday>, and C<isdst> parameters are ignored by C<my_strftime>.
8105 Daylight savings time is never considered to exist, and the values returned for
8106 the other two fields (if C<fmt> even calls for them) are calculated from the
8107 other parameters, without need for referencing these.
8108 
8109 Note that both functions are always executed in the underlying
8110 C<LC_TIME> locale of the program, giving results based on that locale.
8111 
8112 =cut
8113  */
8114 
8115 char *
Perl_my_strftime(pTHX_ const char * fmt,int sec,int min,int hour,int mday,int mon,int year,int wday,int yday,int isdst)8116 Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour,
8117                        int mday, int mon, int year, int wday, int yday,
8118                        int isdst)
8119 {   /* Documented above */
8120     PERL_ARGS_ASSERT_MY_STRFTIME;
8121 
8122 #ifdef USE_LOCALE_TIME
8123     const char * locale = querylocale_c(LC_TIME);
8124 #else
8125     const char * locale = "C";
8126 #endif
8127 
8128     struct tm  mytm;
8129     ints_to_tm(&mytm, locale, sec, min, hour, mday, mon, year, wday, yday,
8130                isdst);
8131     if (! strftime_tm(fmt, PL_scratch_langinfo, locale, &mytm)) {
8132         return NULL;
8133     }
8134 
8135     return savepv(SvPVX(PL_scratch_langinfo));
8136 }
8137 
8138 SV *
Perl_sv_strftime_ints(pTHX_ SV * fmt,int sec,int min,int hour,int mday,int mon,int year,int wday,int yday,int isdst)8139 Perl_sv_strftime_ints(pTHX_ SV * fmt, int sec, int min, int hour,
8140                             int mday, int mon, int year, int wday,
8141                             int yday, int isdst)
8142 {   /* Documented above */
8143     PERL_ARGS_ASSERT_SV_STRFTIME_INTS;
8144 
8145 #ifdef USE_LOCALE_TIME
8146     const char * locale = querylocale_c(LC_TIME);
8147 #else
8148     const char * locale = "C";
8149 #endif
8150 
8151     struct tm  mytm;
8152     ints_to_tm(&mytm, locale, sec, min, hour, mday, mon, year, wday, yday,
8153                isdst);
8154     return sv_strftime_common(fmt, locale, &mytm);
8155 }
8156 
8157 SV *
Perl_sv_strftime_tm(pTHX_ SV * fmt,const struct tm * mytm)8158 Perl_sv_strftime_tm(pTHX_ SV * fmt, const struct tm * mytm)
8159 {   /* Documented above */
8160     PERL_ARGS_ASSERT_SV_STRFTIME_TM;
8161 
8162 #ifdef USE_LOCALE_TIME
8163 
8164     return sv_strftime_common(fmt, querylocale_c(LC_TIME), mytm);
8165 
8166 #else
8167 
8168     return sv_strftime_common(fmt, "C", mytm);
8169 
8170 #endif
8171 
8172 }
8173 
8174 SV *
S_sv_strftime_common(pTHX_ SV * fmt,const char * locale,const struct tm * mytm)8175 S_sv_strftime_common(pTHX_ SV * fmt,
8176                            const char * locale,
8177                            const struct tm * mytm)
8178 {   /* Documented above */
8179     PERL_ARGS_ASSERT_SV_STRFTIME_COMMON;
8180 
8181     STRLEN fmt_cur;
8182     const char *fmt_str = SvPV_const(fmt, fmt_cur);
8183 
8184     utf8ness_t fmt_utf8ness = (SvUTF8(fmt) && LIKELY(! IN_BYTES))
8185                               ? UTF8NESS_YES
8186                               : UTF8NESS_UNKNOWN;
8187 
8188     utf8ness_t result_utf8ness;
8189 
8190     /* Use a fairly generous guess as to how big the buffer needs to be, so as
8191      * to get almost all the typical returns to fit without the called function
8192      * having to realloc; this is a somewhat educated guess, but feel free to
8193      * tweak it. */
8194     SV* sv = newSV(MAX(fmt_cur * 2, 64));
8195     if (! strftime8(fmt_str,
8196                     sv,
8197                     locale,
8198                     mytm,
8199                     fmt_utf8ness,
8200                     &result_utf8ness,
8201                     true  /* calling from sv_strftime */ ))
8202     {
8203         return NULL;
8204     }
8205 
8206 
8207     if (result_utf8ness == UTF8NESS_YES) {
8208         SvUTF8_on(sv);
8209     }
8210 
8211     return sv;
8212 }
8213 
8214 STATIC void
S_ints_to_tm(pTHX_ struct tm * mytm,const char * locale,int sec,int min,int hour,int mday,int mon,int year,int wday,int yday,int isdst)8215 S_ints_to_tm(pTHX_ struct tm * mytm,
8216                    const char * locale,
8217                    int sec, int min, int hour, int mday, int mon, int year,
8218                    int wday, int yday, int isdst)
8219 {
8220     /* Create a struct tm structure from the input time-related integer
8221      * variables for 'locale' */
8222 
8223     /* Override with the passed-in values */
8224     Zero(mytm, 1, struct tm);
8225     mytm->tm_sec = sec;
8226     mytm->tm_min = min;
8227     mytm->tm_hour = hour;
8228     mytm->tm_mday = mday;
8229     mytm->tm_mon = mon;
8230     mytm->tm_year = year;
8231     mytm->tm_wday = wday;
8232     mytm->tm_yday = yday;
8233     mytm->tm_isdst = isdst;
8234 
8235     /* Long-standing behavior is to ignore the effects of locale (in
8236      * particular, daylight savings time) on the input, so we use mini_mktime.
8237      * See GH #22062. */
8238     mini_mktime(mytm);
8239 
8240     /* But some of those effect are deemed desirable, so use libc to get the
8241      * values for tm_gmtoff and tm_zone on platforms that have them [perl
8242      * #18238] */
8243 #if  defined(HAS_MKTIME)                                      \
8244  && (defined(HAS_TM_TM_GMTOFF) || defined(HAS_TM_TM_ZONE))
8245 
8246     const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
8247     struct tm mytm2 = *mytm;
8248     MKTIME_LOCK;
8249     mktime(&mytm2);
8250     MKTIME_UNLOCK;
8251     restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
8252 
8253 #  ifdef HAS_TM_TM_GMTOFF
8254     mytm->tm_gmtoff = mytm2.tm_gmtoff;
8255 #  endif
8256 #  ifdef HAS_TM_TM_ZONE
8257     mytm->tm_zone = mytm2.tm_zone;
8258 #  endif
8259 #endif
8260 
8261     return;
8262 }
8263 
8264 STATIC bool
S_strftime_tm(pTHX_ const char * fmt,SV * sv,const char * locale,const struct tm * mytm)8265 S_strftime_tm(pTHX_ const char *fmt,
8266                     SV * sv,
8267                     const char *locale,
8268                     const struct tm *mytm)
8269 {
8270     PERL_ARGS_ASSERT_STRFTIME_TM;
8271 
8272     /* Execute strftime() based on the input struct tm, and the current LC_TIME
8273      * locale.
8274      *
8275      * Returns 'true' if succeeded, with the PV pointer in 'sv' filled with the
8276      * result, and all other C<OK> bits disabled, and not marked as UTF-8.
8277      * Determining the UTF-8ness must be done at a higher level.
8278      *
8279      * 'false' is returned if if fails; the state of 'sv' is unspecified. */
8280 
8281     /* An empty format yields an empty result */
8282     const Size_t fmtlen = strlen(fmt);
8283     if (fmtlen == 0) {
8284         sv_setpvs(sv, "");
8285         SvUTF8_off(sv);
8286         return true;
8287     }
8288 
8289     bool succeeded = false;
8290 
8291 #ifndef HAS_STRFTIME
8292     Perl_croak(aTHX_ "panic: no strftime");
8293 #endif
8294 
8295     start_DEALING_WITH_MISMATCHED_CTYPE(locale);
8296 
8297 #if defined(USE_LOCALE_TIME)
8298 
8299     const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
8300 
8301 #  define LC_TIME_TEARDOWN                                                  \
8302                         restore_toggled_locale_c(LC_TIME, orig_TIME_locale)
8303 #else
8304    PERL_UNUSED_ARG(locale);
8305 #  define LC_TIME_TEARDOWN
8306 #endif
8307 
8308     /* Assume the caller has furnished a reasonable sized guess, but guard
8309      * against one that won't work */
8310     Size_t bufsize = MAX(2, SvLEN(sv));
8311     SvUPGRADE(sv, SVt_PV);
8312     SvPOK_only(sv);
8313 
8314     do {
8315         char * buf = SvGROW(sv, bufsize);
8316 
8317         /* allowing user-supplied (rather than literal) formats is normally
8318          * frowned upon as a potential security risk; but this is part of the
8319          * API so we have to allow it (and the available formats have a much
8320          * lower chance of doing something bad than the ones for printf etc. */
8321         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
8322 
8323 #ifdef WIN32    /* Windows will tell you if the input is invalid */
8324 
8325         /* Needed because the LOCK might (or might not) save/restore errno */
8326         bool strftime_failed = false;
8327 
8328         STRFTIME_LOCK;
8329         dSAVE_ERRNO;
8330         errno = 0;
8331 
8332         Size_t len = strftime(buf, bufsize, fmt, mytm);
8333         if (errno == EINVAL) {
8334             strftime_failed = true;
8335         }
8336 
8337         RESTORE_ERRNO;
8338         STRFTIME_UNLOCK;
8339 
8340         if (strftime_failed) {
8341             goto strftime_failed;
8342         }
8343 
8344 #else
8345         STRFTIME_LOCK;
8346         Size_t len = strftime(buf, bufsize, fmt, mytm);
8347         STRFTIME_UNLOCK;
8348 #endif
8349 
8350         GCC_DIAG_RESTORE_STMT;
8351 
8352         /* A non-zero return indicates success.  But to make sure we're not
8353          * dealing with some rogue strftime that returns how much space it
8354          * needs instead of 0 when there isn't enough, check that the return
8355          * indicates we have at least one byte of spare space (which will be
8356          * used for the terminating NUL). */
8357         if (inRANGE(len, 1, bufsize - 1)) {
8358             succeeded = true;
8359             SvCUR_set(sv, len);
8360             goto strftime_return;
8361         }
8362 
8363         /* There are several possible reasons for a 0 return code for a
8364          * non-empty format, and they are not trivial to tease apart.  This
8365          * issue is a known bug in the strftime() API.  What we do to cope is
8366          * to assume that the reason is not enough space in the buffer, so
8367          * increase it and try again. */
8368         bufsize *= 2;
8369 
8370         /* But don't just keep increasing the size indefinitely.  Stop when it
8371          * becomes obvious that the reason for failure is something besides not
8372          * enough space.  The most likely largest expanding format is %c.  On
8373          * khw's Linux box, the maximum result of this is 67 characters, in the
8374          * km_KH locale.  If a new script comes along that uses 4 UTF-8 bytes
8375          * per character, and with a similar expansion factor, that would be a
8376          * 268:2 byte ratio, or a bit more than 128:1 = 2**7:1.  Some strftime
8377          * implementations allow you to say %1000c to pad to 1000 bytes.  This
8378          * shows that it is impossible to implement this without a heuristic
8379          * (which can fail).  But it indicates we need to be generous in the
8380          * upper limit before failing.  The previous heuristic used was too
8381          * stingy.  Since the size doubles per iteration, it doesn't take many
8382          * to reach the limit */
8383     } while (bufsize < ((1 << 11) + 1) * fmtlen);
8384 
8385     /* Here, strftime() returned 0, and it likely wasn't for lack of space.
8386      * There are two possible reasons:
8387      *
8388      * First is that the result is legitimately 0 length.  This can happen
8389      * when the format is precisely "%p".  That is the only documented format
8390      * that can have an empty result. */
8391     if (strEQ(fmt, "%p")) {
8392         sv_setpvs(sv, "");
8393         SvUTF8_off(sv);
8394         succeeded = true;
8395         goto strftime_return;
8396     }
8397 
8398     /* The other reason is that the format string is malformed.  Probably it is
8399      * that the string is syntactically invalid for the locale.  On some
8400      * platforms an invalid conversion specifier '%?' (for all illegal '?') is
8401      * treated as a literal, but others may fail when '?' is illegal */
8402 
8403 #ifdef WIN32
8404   strftime_failed:
8405 #endif
8406 
8407     SET_EINVAL;
8408     succeeded = false;
8409 
8410   strftime_return:
8411 
8412     LC_TIME_TEARDOWN;
8413     end_DEALING_WITH_MISMATCHED_CTYPE(locale);
8414 
8415     return succeeded;
8416 }
8417 
8418 STATIC bool
S_strftime8(pTHX_ const char * fmt,SV * sv,const char * locale,const struct tm * mytm,const utf8ness_t fmt_utf8ness,utf8ness_t * result_utf8ness,const bool called_externally)8419 S_strftime8(pTHX_ const char * fmt,
8420                   SV * sv,
8421                   const char * locale,
8422                   const struct tm * mytm,
8423                   const utf8ness_t fmt_utf8ness,
8424                   utf8ness_t * result_utf8ness,
8425                   const bool called_externally)
8426 {
8427     PERL_ARGS_ASSERT_STRFTIME8;
8428 
8429     /* Wrap strftime_tm, taking into account the input and output UTF-8ness */
8430 
8431 #ifdef USE_LOCALE_TIME
8432 #  define INDEX_TO_USE  LC_TIME_INDEX_
8433 
8434     locale_utf8ness_t locale_utf8ness = LOCALE_UTF8NESS_UNKNOWN;
8435 
8436 #else
8437 #  define INDEX_TO_USE  LC_ALL_INDEX_   /* Effectively out of bounds */
8438 
8439     locale_utf8ness_t locale_utf8ness = LOCALE_NOT_UTF8;
8440 
8441 #endif
8442 
8443     switch (fmt_utf8ness) {
8444       case UTF8NESS_IMMATERIAL:
8445         break;
8446 
8447       case UTF8NESS_NO: /* Known not to be UTF-8; must not be UTF-8 locale */
8448         if (is_locale_utf8(locale)) {
8449             SET_EINVAL;
8450             return false;
8451         }
8452 
8453         locale_utf8ness = LOCALE_NOT_UTF8;
8454         break;
8455 
8456       case UTF8NESS_YES:    /* Known to be UTF-8; must be UTF-8 locale if can't
8457                                downgrade. */
8458         if (! is_locale_utf8(locale)) {
8459             locale_utf8ness = LOCALE_NOT_UTF8;
8460 
8461             bool is_utf8 = true;
8462             Size_t fmt_len = strlen(fmt);
8463             fmt = (char *) bytes_from_utf8((U8 *) fmt, &fmt_len, &is_utf8);
8464             if (is_utf8) {
8465                 SET_EINVAL;
8466                 return false;
8467             }
8468 
8469             SAVEFREEPV(fmt);
8470         }
8471         else {
8472             locale_utf8ness = LOCALE_IS_UTF8;
8473         }
8474 
8475         break;
8476 
8477       case UTF8NESS_UNKNOWN:
8478         if (! is_locale_utf8(locale)) {
8479             locale_utf8ness = LOCALE_NOT_UTF8;
8480         }
8481         else {
8482             locale_utf8ness = LOCALE_IS_UTF8;
8483             if (called_externally) {
8484 
8485                 /* All internal calls from this file use ASCII-only formats;
8486                  * but otherwise the format could be anything, so make sure to
8487                  * upgrade it to UTF-8 for a UTF-8 locale.  Otherwise the
8488                  * locale would find any UTF-8 variant characters to be
8489                  * malformed */
8490                 Size_t fmt_len = strlen(fmt);
8491                 fmt = (char *) bytes_to_utf8((U8 *) fmt, &fmt_len);
8492                 SAVEFREEPV(fmt);
8493             }
8494         }
8495 
8496         break;
8497     }
8498 
8499     if (! strftime_tm(fmt, sv, locale, mytm)) {
8500         return false;
8501     }
8502 
8503     *result_utf8ness = get_locale_string_utf8ness_i(SvPVX(sv),
8504                                                     locale_utf8ness,
8505                                                     locale,
8506                                                     INDEX_TO_USE);
8507     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8508                           "fmt=%s, retval=%s; utf8ness=%d",
8509                           fmt,
8510                           ((is_strict_utf8_string((U8 *) SvPVX(sv), 0))
8511                            ? SvPVX(sv)
8512                            :_byte_dump_string((U8 *) SvPVX(sv), SvCUR(sv) ,0)),
8513                           *result_utf8ness));
8514     return true;
8515 
8516 #undef INDEX_TO_USE
8517 
8518 }
8519 
8520 #ifdef USE_LOCALE
8521 
8522 STATIC void
S_give_perl_locale_control(pTHX_ const char * lc_all_string,const line_t caller_line)8523 S_give_perl_locale_control(pTHX_
8524 #  ifdef LC_ALL
8525                            const char * lc_all_string,
8526 #  else
8527                            const char ** locales,
8528 #  endif
8529                            const line_t caller_line)
8530 {
8531     PERL_UNUSED_ARG(caller_line);
8532 
8533     /* This is called when the program is in the global locale and are
8534      * switching to per-thread (if available).  And it is called at
8535      * initialization time to do the same.
8536      */
8537 
8538 #  if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
8539 
8540     /* On Windows, convert to per-thread behavior.  This isn't necessary in
8541      * POSIX 2008, as the conversion gets done automatically in the
8542      * void_setlocale_i() calls below. */
8543     if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
8544         locale_panic_("_configthreadlocale returned an error");
8545     }
8546 
8547 #  endif
8548 #  if ! defined(USE_THREAD_SAFE_LOCALE)                               \
8549    && ! defined(USE_POSIX_2008_LOCALE)
8550 #    if defined(LC_ALL)
8551     PERL_UNUSED_ARG(lc_all_string);
8552 #    else
8553     PERL_UNUSED_ARG(locales);
8554 #    endif
8555 #  else
8556 
8557     /* This platform has per-thread locale handling.  Do the conversion. */
8558 
8559 #    if defined(LC_ALL)
8560 
8561     void_setlocale_c_with_caller(LC_ALL, lc_all_string, __FILE__, caller_line);
8562 
8563 #    else
8564 
8565     for_all_individual_category_indexes(i) {
8566         void_setlocale_i_with_caller(i, locales[i], __FILE__, caller_line);
8567     }
8568 
8569 #    endif
8570 #  endif
8571 
8572     /* Finally, update our remaining records.  'true' => force recalculation.
8573      * This is needed because we don't know what's happened while Perl hasn't
8574      * had control, so we need to figure out the current state */
8575 
8576 #  if defined(LC_ALL)
8577 
8578     new_LC_ALL(lc_all_string, true);
8579 
8580 #    else
8581 
8582     new_LC_ALL(calculate_LC_ALL_string(locales,
8583                                        INTERNAL_FORMAT,
8584                                        WANT_TEMP_PV,
8585                                        caller_line),
8586                true);
8587 #    endif
8588 
8589 }
8590 
8591 STATIC void
S_output_check_environment_warning(pTHX_ const char * const language,const char * const lc_all,const char * const lang)8592 S_output_check_environment_warning(pTHX_ const char * const language,
8593                                          const char * const lc_all,
8594                                          const char * const lang)
8595 {
8596     PerlIO_printf(Perl_error_log,
8597                   "perl: warning: Please check that your locale settings:\n");
8598 
8599 #  ifdef __GLIBC__
8600 
8601     PerlIO_printf(Perl_error_log, "\tLANGUAGE = %c%s%c,\n",
8602                                   language ? '"' : '(',
8603                                   language ? language : "unset",
8604                                   language ? '"' : ')');
8605 #  else
8606     PERL_UNUSED_ARG(language);
8607 #  endif
8608 
8609     PerlIO_printf(Perl_error_log, "\tLC_ALL = %c%s%c,\n",
8610                                   lc_all ? '"' : '(',
8611                                   lc_all ? lc_all : "unset",
8612                                   lc_all ? '"' : ')');
8613 
8614     for_all_individual_category_indexes(i) {
8615         const char * value = PerlEnv_getenv(category_names[i]);
8616         PerlIO_printf(Perl_error_log,
8617                       "\t%s = %c%s%c,\n",
8618                       category_names[i],
8619                       value ? '"' : '(',
8620                       value ? value : "unset",
8621                       value ? '"' : ')');
8622     }
8623 
8624     PerlIO_printf(Perl_error_log, "\tLANG = %c%s%c\n",
8625                                   lang ? '"' : '(',
8626                                   lang ? lang : "unset",
8627                                   lang ? '"' : ')');
8628     PerlIO_printf(Perl_error_log,
8629                   "    are supported and installed on your system.\n");
8630 }
8631 
8632 #endif
8633 
8634 /* A helper macro for the next function.  Needed because would be called in two
8635  * places.  Knows about the internal workings of the function */
8636 #define GET_DESCRIPTION(trial, name)                                    \
8637     ((isNAME_C_OR_POSIX(name))                                          \
8638      ? "the standard locale"                                            \
8639      : ((trial == (system_default_trial)                                \
8640                   ? "the system default locale"                         \
8641                   : "a fallback locale")))
8642 
8643 /*
8644  * Initialize locale awareness.
8645  */
8646 int
Perl_init_i18nl10n(pTHX_ int printwarn)8647 Perl_init_i18nl10n(pTHX_ int printwarn)
8648 {
8649     /* printwarn is:
8650      *    0 if not to output warning when setup locale is bad
8651      *    1 if to output warning based on value of PERL_BADLANG
8652      *    >1 if to output regardless of PERL_BADLANG
8653      *
8654      * returns
8655      *    1 = set ok or not applicable,
8656      *    0 = fallback to a locale of lower priority
8657      *   -1 = fallback to all locales failed, not even to the C locale
8658      *
8659      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
8660      * set, debugging information is output.
8661      *
8662      * This routine effectively does the following in most cases:
8663      *
8664      *      basic initialization;
8665      *      asserts that the compiled tables are consistent;
8666      *      initialize data structures;
8667      *      make sure we are in the global locale;
8668      *      setlocale(LC_ALL, "");
8669      *      switch to per-thread locale if applicable;
8670      *
8671      * The "" causes the locale to be set to what the environment variables at
8672      * the time say it should be.
8673      *
8674      * To handle possible failures, the setlocale is expanded to be like:
8675      *
8676      *      trial_locale = pre-first-trial;
8677      *      while (has_another_trial()) {
8678      *          trial_locale = next_trial();
8679      *          if setlocale(LC_ALL, trial_locale) {
8680      *              ok = true;
8681      *              break;
8682      *          }
8683      *
8684      *          had_failure = true;
8685      *          warn();
8686      *      }
8687      *
8688      *      if (had_failure) {
8689      *          warn_even_more();
8690      *          if (! ok) warn_still_more();
8691      *      }
8692      *
8693      * The first trial is either:
8694      *      ""      to examine the environment variables for the locale
8695      *      NULL    to use the values already set for the locale by the program
8696      *              embedding this perl instantiation.
8697      *
8698      * Something is wrong if this trial fails, but there is a sequence of
8699      * fallbacks to try should that happen.  They are given in the enum below.
8700 
8701      * If there is no LC_ALL defined on the system, the setlocale() above is
8702      * replaced by a loop setting each individual category separately.
8703      *
8704      * In a non-embeded environment, this code is executed exactly once.  It
8705      * sets up the global locale environment.  At the end, if some sort of
8706      * thread-safety is in effect, it will turn thread 0 into using that, with
8707      * the same locale as the global initially.  thread 0 can then change its
8708      * locale at will without affecting the global one.
8709      *
8710      * At destruction time, thread 0 will revert to the global locale as the
8711      * other threads die.
8712      *
8713      * Care must be taken in an embedded environment.  This code will be
8714      * executed for each instantiation.  Since it changes the global locale, it
8715      * could clash with another running instantiation that isn't using
8716      * per-thread locales.  perlembed suggests having the controlling program
8717      * set each instantiation's locale and set PERL_SKIP_LOCALE_INIT so this
8718      * code uses that without actually changing anything.  Then the onus is on
8719      * the controlling program to prevent any races.  The code below does
8720      * enough locking so as to prevent system calls from overwriting data
8721      * before it is safely copied here, but that isn't a general solution.
8722      */
8723 
8724     if (PL_langinfo_sv == NULL) {
8725          PL_langinfo_sv = newSVpvs("");
8726     }
8727     if (PL_scratch_langinfo == NULL) {
8728          PL_scratch_langinfo = newSVpvs("");
8729     }
8730 
8731 #ifndef USE_LOCALE
8732 
8733     PERL_UNUSED_ARG(printwarn);
8734     const int ok = 1;
8735 
8736 #else  /* USE_LOCALE to near the end of the routine */
8737 
8738     int ok = 0;
8739 
8740 #  ifdef __GLIBC__
8741 
8742     const char * const language = PerlEnv_getenv("LANGUAGE");
8743 
8744 #  else
8745     const char * const language = NULL;     /* Unused placeholder */
8746 #  endif
8747 
8748     /* A later getenv() could zap this, so only use here */
8749     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
8750 
8751     const bool locwarn = (printwarn > 1
8752                           || (          printwarn
8753                               && (    ! bad_lang_use_once
8754                                   || (
8755                                          /* disallow with "" or "0" */
8756                                          *bad_lang_use_once
8757                                        && strNE("0", bad_lang_use_once)))));
8758 
8759 #  ifndef DEBUGGING
8760 #    define DEBUG_LOCALE_INIT(a,b,c)
8761 #  else
8762 
8763     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
8764 
8765 #    define DEBUG_LOCALE_INIT(cat_index, locale, result)                    \
8766         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",                       \
8767                     setlocale_debug_string_i(cat_index, locale, result)));
8768 
8769 #    ifdef LC_ALL
8770     assert(categories[LC_ALL_INDEX_] == LC_ALL);
8771     assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
8772 #      ifdef USE_POSIX_2008_LOCALE
8773     assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
8774 #      endif
8775 #    endif
8776 
8777     for_all_individual_category_indexes(i) {
8778         assert(category_name_lengths[i] == strlen(category_names[i]));
8779     }
8780 
8781 #  endif    /* DEBUGGING */
8782 
8783     /* Initialize the per-thread mbrFOO() state variables.  See POSIX.xs for
8784      * why these particular incantations are used. */
8785 #  ifdef HAS_MBRLEN
8786     memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
8787 #  endif
8788 #  ifdef HAS_MBRTOWC
8789     memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
8790 #  endif
8791 #  ifdef HAS_WCTOMBR
8792     wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
8793 #  endif
8794 #  ifdef USE_PL_CURLOCALES
8795 
8796     for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) {
8797         PL_curlocales[i] = savepv("C");
8798     }
8799 
8800 #  endif
8801 #  ifdef USE_PL_CUR_LC_ALL
8802 
8803     PL_cur_LC_ALL = savepv("C");
8804 
8805 #  endif
8806 #  if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS) && defined(LC_ALL)
8807 
8808     LOCALE_LOCK;
8809 
8810     /* If we haven't done so already, translate the LC_ALL positions of
8811      * categories into our internal indices. */
8812     if (map_LC_ALL_position_to_index[0] == LC_ALL_INDEX_) {
8813 
8814 #    ifdef PERL_LC_ALL_CATEGORY_POSITIONS_INIT
8815         /* Use this array, initialized by a config.h constant */
8816         int lc_all_category_positions[] = PERL_LC_ALL_CATEGORY_POSITIONS_INIT;
8817         STATIC_ASSERT_STMT(   C_ARRAY_LENGTH(lc_all_category_positions)
8818                            == LC_ALL_INDEX_);
8819 
8820         for (unsigned int i = 0;
8821              i < C_ARRAY_LENGTH(lc_all_category_positions);
8822              i++)
8823         {
8824             map_LC_ALL_position_to_index[i] =
8825                               get_category_index(lc_all_category_positions[i]);
8826         }
8827 #    else
8828         /* It is possible for both PERL_LC_ALL_USES_NAME_VALUE_PAIRS and
8829          * PERL_LC_ALL_CATEGORY_POSITIONS_INIT not to be defined, e.g. on
8830          * systems with only a C locale during ./Configure.  Assume that this
8831          * can only happen as part of some sort of bootstrapping so allow
8832          * compilation to succeed by ignoring correctness.
8833          */
8834         for (unsigned int i = 0;
8835              i < C_ARRAY_LENGTH(map_LC_ALL_position_to_index);
8836              i++)
8837         {
8838             map_LC_ALL_position_to_index[i] = 0;
8839         }
8840 #    endif
8841 
8842     }
8843 
8844     LOCALE_UNLOCK;
8845 
8846 #  endif
8847 #  ifdef USE_POSIX_2008_LOCALE
8848 
8849     /* This is a global, so be sure to keep another instance from zapping it */
8850     LOCALE_LOCK;
8851     if (PL_C_locale_obj) {
8852         LOCALE_UNLOCK;
8853     }
8854     else {
8855         PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
8856         if (! PL_C_locale_obj) {
8857             LOCALE_UNLOCK;
8858             locale_panic_("Cannot create POSIX 2008 C locale object");
8859         }
8860         LOCALE_UNLOCK;
8861 
8862         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
8863                                                PL_C_locale_obj));
8864     }
8865 
8866     /* Switch to using the POSIX 2008 interface now.  This would happen below
8867      * anyway, but deferring it can lead to leaks of memory that would also get
8868      * malloc'd in the interim.  We arbitrarily switch to the C locale,
8869      * overridden below  */
8870     if (! uselocale(PL_C_locale_obj)) {
8871         locale_panic_(Perl_form(aTHX_
8872                                 "Can't uselocale(%p), LC_ALL supposed to"
8873                                 " be 'C'",
8874                                 PL_C_locale_obj));
8875     }
8876 
8877 #    ifdef MULTIPLICITY
8878 
8879     PL_cur_locale_obj = PL_C_locale_obj;
8880 
8881 #    endif
8882 #  endif
8883 
8884     /* Now initialize some data structures.  This is entirely so that
8885      * later-executed code doesn't have to concern itself with things not being
8886      * initialized.  Arbitrarily use the C locale (which we know has to exist
8887      * on the system). */
8888 
8889 #  ifdef USE_LOCALE_NUMERIC
8890 
8891     PL_numeric_radix_sv    = newSV(1);
8892     PL_underlying_radix_sv = newSV(1);
8893     Newxz(PL_numeric_name, 1, char);    /* Single NUL character */
8894 
8895 #  endif
8896 #  ifdef USE_LOCALE_COLLATE
8897 
8898     Newxz(PL_collation_name, 1, char);
8899 
8900 #  endif
8901 #  ifdef USE_LOCALE_CTYPE
8902 
8903     Newxz(PL_ctype_name, 1, char);
8904 
8905 #  endif
8906 
8907     new_LC_ALL("C", true /* Don't shortcut */);
8908 
8909 /*===========================================================================*/
8910 
8911     /* Now ready to override the initialization with the values that the user
8912      * wants.  This is done in the global locale as explained in the
8913      * introductory comments to this function */
8914     switch_to_global_locale();
8915 
8916     const char * const lc_all     = PerlEnv_getenv("LC_ALL");
8917     const char * const lang       = PerlEnv_getenv("LANG");
8918 
8919     /* We try each locale in the enum, in order, until we get one that works,
8920      * or exhaust the list.  Normally the loop is executed just once.
8921      *
8922      * Each enum value is +1 from the previous */
8923     typedef enum {
8924             dummy_trial       = -1,
8925             environment_trial =  0,     /* "" or NULL; code below assumes value
8926                                            0 is the first real trial */
8927             LC_ALL_trial,               /* ENV{LC_ALL} */
8928             LANG_trial,                 /* ENV{LANG} */
8929             system_default_trial,       /* Windows .ACP */
8930             C_trial,                    /* C locale */
8931             beyond_final_trial,
8932     } trials;
8933 
8934     trials trial;
8935     unsigned int already_checked = 0;
8936     const char * checked[C_trial];
8937 
8938 #  ifdef LC_ALL
8939     const char * lc_all_string;
8940 #  else
8941     const char * curlocales[LC_ALL_INDEX_];
8942 #  endif
8943 
8944     /* Loop through the initial setting and all the possible fallbacks,
8945      * breaking out of the loop on success */
8946     trial = dummy_trial;
8947     while (trial != beyond_final_trial) {
8948 
8949         /* Each time through compute the next trial to use based on the one in
8950          * the previous iteration and switch to the new one.  This enforces the
8951          * order in which the fallbacks are applied */
8952       next_trial:
8953         trial = (trials) ((int) trial + 1);     /* Casts are needed for g++ */
8954 
8955         const char * locale = NULL;
8956 
8957         /* Set up the parameters for this trial */
8958         switch (trial) {
8959           case dummy_trial:
8960             locale_panic_("Unexpectedly got 'dummy_trial");
8961             break;
8962 
8963           case environment_trial:
8964             /* This is either "" to get the values from the environment, or
8965              * NULL if the calling program has initialized the values already.
8966              * */
8967             locale = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
8968                      ? NULL
8969                      : "";
8970             break;
8971 
8972           case LC_ALL_trial:
8973             if (! lc_all || strEQ(lc_all, "")) {
8974                 continue;   /* No-op */
8975             }
8976 
8977             locale = lc_all;
8978             break;
8979 
8980           case LANG_trial:
8981             if (! lang || strEQ(lang, "")) {
8982                 continue;   /* No-op */
8983             }
8984 
8985             locale = lang;
8986             break;
8987 
8988           case system_default_trial:
8989 
8990 #  if ! defined(WIN32) || ! defined(LC_ALL)
8991 
8992             continue;   /* No-op */
8993 
8994 #  else
8995             /* For Windows, we also try the system default locale before "C".
8996              * (If there exists a Windows without LC_ALL we skip this because
8997              * it gets too complicated.  For those, "C" is the next fallback
8998              * possibility). */
8999             locale = ".ACP";
9000 #  endif
9001             break;
9002 
9003           case C_trial:
9004             locale = "C";
9005             break;
9006 
9007           case beyond_final_trial:
9008             continue;     /* No-op, causes loop to exit */
9009         }
9010 
9011         /* If the locale is a substantive name, don't try the same locale
9012          * twice. */
9013         if (locale && strNE(locale, "")) {
9014             for (unsigned int i = 0; i < already_checked; i++) {
9015                 if (strEQ(checked[i], locale)) {
9016                     goto next_trial;
9017                 }
9018             }
9019 
9020             /* And, for future iterations, indicate we've tried this locale */
9021             assert(already_checked < C_ARRAY_LENGTH(checked));
9022             checked[already_checked] = savepv(locale);
9023             SAVEFREEPV(checked[already_checked]);
9024             already_checked++;
9025         }
9026 
9027 #  ifdef LC_ALL
9028 
9029         STDIZED_SETLOCALE_LOCK;
9030         lc_all_string = savepv(stdized_setlocale(LC_ALL, locale));
9031         STDIZED_SETLOCALE_UNLOCK;
9032 
9033         DEBUG_LOCALE_INIT(LC_ALL_INDEX_, locale, lc_all_string);
9034 
9035         if (LIKELY(lc_all_string)) {     /* Succeeded */
9036             ok = 1;
9037             break;
9038         }
9039 
9040         if (trial == 0 && locwarn) {
9041             PerlIO_printf(Perl_error_log,
9042                                   "perl: warning: Setting locale failed.\n");
9043             output_check_environment_warning(language, lc_all, lang);
9044         }
9045 
9046 #  else /* Below is ! LC_ALL */
9047 
9048         bool setlocale_failure = FALSE;  /* This trial hasn't failed so far */
9049         bool dowarn = trial == 0 && locwarn;
9050 
9051         for_all_individual_category_indexes(j) {
9052             STDIZED_SETLOCALE_LOCK;
9053             curlocales[j] = savepv(stdized_setlocale(categories[j], locale));
9054             STDIZED_SETLOCALE_UNLOCK;
9055 
9056             DEBUG_LOCALE_INIT(j, locale, curlocales[j]);
9057 
9058             if (UNLIKELY(! curlocales[j])) {
9059                 setlocale_failure = TRUE;
9060 
9061                 /* If are going to warn below, continue to loop so all failures
9062                  * are included in the message */
9063                 if (! dowarn) {
9064                     break;
9065                 }
9066             }
9067         }
9068 
9069         if (LIKELY(! setlocale_failure)) {  /* All succeeded */
9070             ok = 1;
9071             break;  /* Exit trial_locales loop */
9072         }
9073 
9074         /* Here, this trial failed */
9075 
9076         if (dowarn) {
9077             PerlIO_printf(Perl_error_log,
9078                 "perl: warning: Setting locale failed for the categories:\n");
9079 
9080             for_all_individual_category_indexes(j) {
9081                 if (! curlocales[j]) {
9082                     PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
9083                 }
9084             }
9085 
9086             output_check_environment_warning(language, lc_all, lang);
9087         }   /* end of warning on first failure */
9088 
9089 #  endif /* LC_ALL */
9090 
9091     }   /* end of looping through the trial locales */
9092 
9093     /* If we had to do more than the first trial, it means that one failed, and
9094      * we may need to output a warning, and, if none worked, do more */
9095     if (UNLIKELY(trial != 0)) {
9096         if (locwarn) {
9097             const char * description = "a fallback locale";
9098             const char * name = NULL;;
9099 
9100             /* If we didn't find a good fallback, list all we tried */
9101             if (! ok && already_checked > 0) {
9102                 PerlIO_printf(Perl_error_log, "perl: warning: Failed to fall"
9103                                               " back to ");
9104                 if (already_checked > 1) {  /* more than one was tried */
9105                     PerlIO_printf(Perl_error_log, "any of:\n");
9106                 }
9107 
9108                 while (already_checked > 0) {
9109                     name = checked[--already_checked];
9110                     description = GET_DESCRIPTION(trial, name);
9111                     PerlIO_printf(Perl_error_log, "%s (\"%s\")\n",
9112                                                   description, name);
9113                 }
9114             }
9115 
9116             if (ok) {
9117 
9118                 /* Here, a fallback worked.  So we have saved its name, and the
9119                  * trial that succeeded is still valid */
9120 #  ifdef LC_ALL
9121                 const char * individ_locales[LC_ALL_INDEX_] = { NULL };
9122 
9123                 /* Even though we know the valid string for LC_ALL that worked,
9124                  * translate it into our internal format, which is the
9125                  * name=value pairs notation.  This is easier for a human to
9126                  * decipher than the positional notation.  Some platforms
9127                  * can return "C C C C C C" for LC_ALL.  This code also
9128                  * standardizes that result into plain "C". */
9129                 switch (parse_LC_ALL_string(lc_all_string,
9130                                             (const char **) &individ_locales,
9131                                             no_override,
9132                                             false,   /* Return only [0] if
9133                                                         suffices */
9134                                             false,   /* Don't panic on error */
9135                                             __LINE__))
9136                 {
9137                   case invalid:
9138 
9139                     /* Here, the parse failed, which shouldn't happen, but if
9140                      * it does, we have an easy fallback that allows us to keep
9141                      * going. */
9142                     name = lc_all_string;
9143                     break;
9144 
9145                   case no_array:    /* The original is a single locale */
9146                     name = lc_all_string;
9147                     break;
9148 
9149                   case only_element_0:  /* element[0] is a single locale valid
9150                                            for all categories */
9151                     SAVEFREEPV(individ_locales[0]);
9152                     name = individ_locales[0];
9153                     break;
9154 
9155                   case full_array:
9156                     name = calculate_LC_ALL_string(individ_locales,
9157                                                    INTERNAL_FORMAT,
9158                                                    WANT_TEMP_PV,
9159                                                    __LINE__);
9160                     for_all_individual_category_indexes(j) {
9161                         Safefree(individ_locales[j]);
9162                     }
9163                 }
9164 #  else
9165                 name = calculate_LC_ALL_string(curlocales,
9166                                                INTERNAL_FORMAT,
9167                                                WANT_TEMP_PV,
9168                                                __LINE__);
9169 #  endif
9170                 description = GET_DESCRIPTION(trial, name);
9171             }
9172             else {
9173 
9174                 /* Nothing seems to be working, yet we want to continue
9175                  * executing.  It may well be that locales are mostly
9176                  * irrelevant to this particular program, and there must be
9177                  * some locale underlying the program.  Figure it out as best
9178                  * we can, by querying the system's current locale */
9179 
9180 #  ifdef LC_ALL
9181 
9182                 STDIZED_SETLOCALE_LOCK;
9183                 name = stdized_setlocale(LC_ALL, NULL);
9184                 STDIZED_SETLOCALE_UNLOCK;
9185 
9186                 if (UNLIKELY(! name)) {
9187                     name = "locale name not determinable";
9188                 }
9189 
9190 #  else /* Below is ! LC_ALL */
9191 
9192                 const char * system_locales[LC_ALL_INDEX_] = { NULL };
9193 
9194                 for_all_individual_category_indexes(j) {
9195                     STDIZED_SETLOCALE_LOCK;
9196                     system_locales[j] = savepv(stdized_setlocale(categories[j],
9197                                                                  NULL));
9198                     STDIZED_SETLOCALE_UNLOCK;
9199 
9200                     if (UNLIKELY(! system_locales[j])) {
9201                         system_locales[j] = "not determinable";
9202                     }
9203                 }
9204 
9205                 /* We use the name=value form for the string, as that is more
9206                  * human readable than the positional notation */
9207                 name = calculate_LC_ALL_string(system_locales,
9208                                                INTERNAL_FORMAT,
9209                                                WANT_TEMP_PV,
9210                                                __LINE__);
9211                 description = "what the system says";
9212 
9213                 for_all_individual_category_indexes(j) {
9214                     Safefree(system_locales[j]);
9215                 }
9216 #  endif
9217             }
9218 
9219             PerlIO_printf(Perl_error_log,
9220                           "perl: warning: Falling back to %s (\"%s\").\n",
9221                           description, name);
9222 
9223             /* Here, ok being true indicates that the first attempt failed, but
9224              * a fallback succeeded; false => nothing working.  Translate to
9225              * API return values. */
9226             ok = (ok) ? 0 : -1;
9227         }
9228     }
9229 
9230 #  ifdef LC_ALL
9231 
9232     give_perl_locale_control(lc_all_string, __LINE__);
9233     Safefree(lc_all_string);
9234 
9235 #  else
9236 
9237     give_perl_locale_control((const char **) &curlocales, __LINE__);
9238 
9239     for_all_individual_category_indexes(j) {
9240         Safefree(curlocales[j]);
9241     }
9242 
9243 #  endif
9244 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
9245 
9246     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
9247      * locale is UTF-8.  give_perl_locale_control() just above has already
9248      * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
9249      * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
9250      * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
9251      * STDIN, STDOUT, STDERR, _and_ the default open discipline.  */
9252     PL_utf8locale = PL_in_utf8_CTYPE_locale;
9253 
9254     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
9255        This is an alternative to using the -C command line switch
9256        (the -C if present will override this). */
9257     {
9258          const char *p = PerlEnv_getenv("PERL_UNICODE");
9259          PL_unicode = p ? parse_unicode_opts(&p) : 0;
9260          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
9261              PL_utf8cache = -1;
9262     }
9263 
9264 #  endif
9265 #  if defined(USE_POSIX_2008_LOCALE) && defined(MULTIPLICITY)
9266     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
9267                            "finished Perl_init_i18nl10n; actual obj=%p,"
9268                            " expected obj=%p, initial=%s\n",
9269                            uselocale(0), PL_cur_locale_obj,
9270                            get_LC_ALL_display()));
9271 #  endif
9272 
9273     /* So won't continue to output stuff */
9274     DEBUG_INITIALIZATION_set(FALSE);
9275 
9276 #endif /* USE_LOCALE */
9277 
9278     return ok;
9279 }
9280 
9281 #undef GET_DESCRIPTION
9282 #ifdef USE_LOCALE_COLLATE
9283 
9284 STATIC void
S_compute_collxfrm_coefficients(pTHX)9285 S_compute_collxfrm_coefficients(pTHX)
9286 {
9287 
9288     /* A locale collation definition includes primary, secondary, tertiary,
9289      * etc. weights for each character.  To sort, the primary weights are used,
9290      * and only if they compare equal, then the secondary weights are used, and
9291      * only if they compare equal, then the tertiary, etc.
9292      *
9293      * strxfrm() works by taking the input string, say ABC, and creating an
9294      * output transformed string consisting of first the primary weights,
9295      * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the tertiary,
9296      * etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters may not have
9297      * weights at every level.  In our example, let's say B doesn't have a
9298      * tertiary weight, and A doesn't have a secondary weight.  The constructed
9299      * string is then going to be
9300      *  A¹B¹C¹ B²C² A³C³ ....
9301      * This has the desired effect that strcmp() will look at the secondary or
9302      * tertiary weights only if the strings compare equal at all higher
9303      * priority weights.  The spaces shown here, like in
9304      *  "A¹B¹C¹ A²B²C² "
9305      * are not just for readability.  In the general case, these must actually
9306      * be bytes, which we will call here 'separator weights'; and they must be
9307      * smaller than any other weight value, but since these are C strings, only
9308      * the terminating one can be a NUL (some implementations may include a
9309      * non-NUL separator weight just before the NUL).  Implementations tend to
9310      * reserve 01 for the separator weights.  They are needed so that a shorter
9311      * string's secondary weights won't be misconstrued as primary weights of a
9312      * longer string, etc.  By making them smaller than any other weight, the
9313      * shorter string will sort first.  (Actually, if all secondary weights are
9314      * smaller than all primary ones, there is no need for a separator weight
9315      * between those two levels, etc.)
9316      *
9317      * The length of the transformed string is roughly a linear function of the
9318      * input string.  It's not exactly linear because some characters don't
9319      * have weights at all levels.  When we call strxfrm() we have to allocate
9320      * some memory to hold the transformed string.  The calculations below try
9321      * to find coefficients 'm' and 'b' for this locale so that m*x + b equals
9322      * how much space we need, given the size of the input string in 'x'.  If
9323      * we calculate too small, we increase the size as needed, and call
9324      * strxfrm() again, but it is better to get it right the first time to
9325      * avoid wasted expensive string transformations.
9326      *
9327      * We use the string below to find how long the transformation of it is.
9328      * Almost all locales are supersets of ASCII, or at least the ASCII
9329      * letters.  We use all of them, half upper half lower, because if we used
9330      * fewer, we might hit just the ones that are outliers in a particular
9331      * locale.  Most of the strings being collated will contain a preponderance
9332      * of letters, and even if they are above-ASCII, they are likely to have
9333      * the same number of weight levels as the ASCII ones.  It turns out that
9334      * digits tend to have fewer levels, and some punctuation has more, but
9335      * those are relatively sparse in text, and khw believes this gives a
9336      * reasonable result, but it could be changed if experience so dictates. */
9337     const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
9338     char * x_longer;        /* Transformed 'longer' */
9339     Size_t x_len_longer;    /* Length of 'x_longer' */
9340 
9341     char * x_shorter;   /* We also transform a substring of 'longer' */
9342     Size_t x_len_shorter;
9343 
9344     PL_in_utf8_COLLATE_locale = (PL_collation_standard)
9345                                 ? 0
9346                                 : is_locale_utf8(PL_collation_name);
9347     PL_strxfrm_NUL_replacement = '\0';
9348     PL_strxfrm_max_cp = 0;
9349 
9350     /* mem_collxfrm_() is used get the transformation (though here we are
9351      * interested only in its length).  It is used because it has the
9352      * intelligence to handle all cases, but to work, it needs some values of
9353      * 'm' and 'b' to get it started.  For the purposes of this calculation we
9354      * use a very conservative estimate of 'm' and 'b'.  This assumes a weight
9355      * can be multiple bytes, enough to hold any UV on the platform, and there
9356      * are 5 levels, 4 weight bytes, and a trailing NUL.  */
9357     PL_collxfrm_base = 5;
9358     PL_collxfrm_mult = 5 * sizeof(UV);
9359 
9360     /* Find out how long the transformation really is */
9361     x_longer = mem_collxfrm_(longer,
9362                              sizeof(longer) - 1,
9363                              &x_len_longer,
9364 
9365                              /* We avoid converting to UTF-8 in the called
9366                               * function by telling it the string is in UTF-8
9367                               * if the locale is a UTF-8 one.  Since the string
9368                               * passed here is invariant under UTF-8, we can
9369                               * claim it's UTF-8 even if it isn't.  */
9370                               PL_in_utf8_COLLATE_locale);
9371     Safefree(x_longer);
9372 
9373     /* Find out how long the transformation of a substring of 'longer' is.
9374      * Together the lengths of these transformations are sufficient to
9375      * calculate 'm' and 'b'.  The substring is all of 'longer' except the
9376      * first character.  This minimizes the chances of being swayed by outliers
9377      * */
9378     x_shorter = mem_collxfrm_(longer + 1,
9379                               sizeof(longer) - 2,
9380                               &x_len_shorter,
9381                               PL_in_utf8_COLLATE_locale);
9382     Safefree(x_shorter);
9383 
9384     /* If the results are nonsensical for this simple test, the whole locale
9385      * definition is suspect.  Mark it so that locale collation is not active
9386      * at all for it.  XXX Should we warn? */
9387     if (   x_len_shorter == 0
9388         || x_len_longer == 0
9389         || x_len_shorter >= x_len_longer)
9390     {
9391         PL_collxfrm_mult = 0;
9392         PL_collxfrm_base = 1;
9393         DEBUG_L(PerlIO_printf(Perl_debug_log,
9394                 "Disabling locale collation for LC_COLLATE='%s';"
9395                 " length for shorter sample=%zu; longer=%zu\n",
9396                 PL_collation_name, x_len_shorter, x_len_longer));
9397     }
9398     else {
9399         SSize_t base;       /* Temporary */
9400 
9401         /* We have both: m * strlen(longer)  + b = x_len_longer
9402          *               m * strlen(shorter) + b = x_len_shorter;
9403          * subtracting yields:
9404          *          m * (strlen(longer) - strlen(shorter))
9405          *                             = x_len_longer - x_len_shorter
9406          * But we have set things up so that 'shorter' is 1 byte smaller than
9407          * 'longer'.  Hence:
9408          *          m = x_len_longer - x_len_shorter
9409          *
9410          * But if something went wrong, make sure the multiplier is at least 1.
9411          */
9412         if (x_len_longer > x_len_shorter) {
9413             PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
9414         }
9415         else {
9416             PL_collxfrm_mult = 1;
9417         }
9418 
9419         /*     mx + b = len
9420          * so:      b = len - mx
9421          * but in case something has gone wrong, make sure it is non-negative
9422          * */
9423         base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
9424         if (base < 0) {
9425             base = 0;
9426         }
9427 
9428         /* Add 1 for the trailing NUL */
9429         PL_collxfrm_base = base + 1;
9430     }
9431 
9432     DEBUG_L(PerlIO_printf(Perl_debug_log,
9433                           "?UTF-8 locale=%d; x_len_shorter=%zu, "
9434                           "x_len_longer=%zu,"
9435                           " collate multipler=%zu, collate base=%zu\n",
9436                           PL_in_utf8_COLLATE_locale,
9437                           x_len_shorter, x_len_longer,
9438                           PL_collxfrm_mult, PL_collxfrm_base));
9439 }
9440 
9441 char *
Perl_mem_collxfrm_(pTHX_ const char * input_string,STRLEN len,STRLEN * xlen,bool utf8)9442 Perl_mem_collxfrm_(pTHX_ const char *input_string,
9443                          STRLEN len,    /* Length of 'input_string' */
9444                          STRLEN *xlen,  /* Set to length of returned string
9445                                            (not including the collation index
9446                                            prefix) */
9447                          bool utf8      /* Is the input in UTF-8? */
9448                    )
9449 {
9450     /* mem_collxfrm_() is like strxfrm() but with two important differences.
9451      * First, it handles embedded NULs. Second, it allocates a bit more memory
9452      * than needed for the transformed data itself.  The real transformed data
9453      * begins at offset COLLXFRM_HDR_LEN.  *xlen is set to the length of that,
9454      * and doesn't include the collation index size.
9455      *
9456      * It is the caller's responsibility to eventually free the memory returned
9457      * by this function.
9458      *
9459      * Please see sv_collxfrm() to see how this is used. */
9460 
9461 #  define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
9462 
9463     char * s = (char *) input_string;
9464     STRLEN s_strlen = strlen(input_string);
9465     char *xbuf = NULL;
9466     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
9467     STRLEN length_in_chars;
9468     bool first_time = TRUE; /* Cleared after first loop iteration */
9469 
9470 #  ifdef USE_LOCALE_CTYPE
9471         const char * orig_CTYPE_locale = NULL;
9472 #  endif
9473 
9474 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
9475     locale_t constructed_locale = (locale_t) 0;
9476 #  endif
9477 
9478     PERL_ARGS_ASSERT_MEM_COLLXFRM_;
9479 
9480     /* Must be NUL-terminated */
9481     assert(*(input_string + len) == '\0');
9482 
9483     if (PL_collxfrm_mult == 0) {     /* unknown or bad */
9484         if (PL_collxfrm_base != 0) { /* bad collation => skip */
9485             DEBUG_L(PerlIO_printf(Perl_debug_log,
9486                           "mem_collxfrm_: locale's collation is defective\n"));
9487             goto bad;
9488         }
9489 
9490         /* (mult, base) == (0,0) means we need to calculate mult and base
9491          * before proceeding */
9492         S_compute_collxfrm_coefficients(aTHX);
9493     }
9494 
9495     /* Replace any embedded NULs with the control that sorts before any others.
9496      * This will give as good as possible results on strings that don't
9497      * otherwise contain that character, but otherwise there may be
9498      * less-than-perfect results with that character and NUL.  This is
9499      * unavoidable unless we replace strxfrm with our own implementation. */
9500     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
9501                                          NUL */
9502         char * e = s + len;
9503         char * sans_nuls;
9504         STRLEN sans_nuls_len;
9505         int try_non_controls;
9506         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
9507                                                    making sure 2nd byte is NUL.
9508                                                  */
9509         STRLEN this_replacement_len;
9510 
9511         /* If we don't know what non-NUL control character sorts lowest for
9512          * this locale, find it */
9513         if (PL_strxfrm_NUL_replacement == '\0') {
9514             int j;
9515             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
9516                                            includes the collation index
9517                                            prefixed. */
9518 
9519             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
9520 
9521             /* Unlikely, but it may be that no control will work to replace
9522              * NUL, in which case we instead look for any character.  Controls
9523              * are preferred because collation order is, in general, context
9524              * sensitive, with adjoining characters affecting the order, and
9525              * controls are less likely to have such interactions, allowing the
9526              * NUL-replacement to stand on its own.  (Another way to look at it
9527              * is to imagine what would happen if the NUL were replaced by a
9528              * combining character; it wouldn't work out all that well.) */
9529             for (try_non_controls = 0;
9530                  try_non_controls < 2;
9531                  try_non_controls++)
9532             {
9533 
9534 #  ifdef USE_LOCALE_CTYPE
9535 
9536                 /* In this case we use isCNTRL_LC() below, which relies on
9537                  * LC_CTYPE, so that must be switched to correspond with the
9538                  * LC_COLLATE locale */
9539                 const bool need_to_toggle = (   ! try_non_controls
9540                                              && ! PL_in_utf8_COLLATE_locale);
9541                 if (need_to_toggle) {
9542                     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE,
9543                                                         PL_collation_name);
9544                 }
9545 #  endif
9546                 /* Look through all legal code points (NUL isn't) */
9547                 for (j = 1; j < 256; j++) {
9548                     char * x;       /* j's xfrm plus collation index */
9549                     STRLEN x_len;   /* length of 'x' */
9550                     STRLEN trial_len = 1;
9551                     char cur_source[] = { '\0', '\0' };
9552 
9553                     /* Skip non-controls the first time through the loop.  The
9554                      * controls in a UTF-8 locale are the L1 ones */
9555                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
9556                                                ? ! isCNTRL_L1(j)
9557                                                : ! isCNTRL_LC(j))
9558                     {
9559                         continue;
9560                     }
9561 
9562                     /* Create a 1-char string of the current code point */
9563                     cur_source[0] = (char) j;
9564 
9565                     /* Then transform it */
9566                     x = mem_collxfrm_(cur_source, trial_len, &x_len,
9567                                       0 /* The string is not in UTF-8 */);
9568 
9569                     /* Ignore any character that didn't successfully transform.
9570                      * */
9571                     if (! x) {
9572                         continue;
9573                     }
9574 
9575                     /* If this character's transformation is lower than
9576                      * the current lowest, this one becomes the lowest */
9577                     if (   cur_min_x == NULL
9578                         || strLT(x         + COLLXFRM_HDR_LEN,
9579                                  cur_min_x + COLLXFRM_HDR_LEN))
9580                     {
9581                         PL_strxfrm_NUL_replacement = j;
9582                         Safefree(cur_min_x);
9583                         cur_min_x = x;
9584                     }
9585                     else {
9586                         Safefree(x);
9587                     }
9588                 } /* end of loop through all 255 characters */
9589 
9590 #  ifdef USE_LOCALE_CTYPE
9591 
9592                 if (need_to_toggle) {
9593                     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
9594                 }
9595 #  endif
9596 
9597                 /* Stop looking if found */
9598                 if (cur_min_x) {
9599                     break;
9600                 }
9601 
9602                 /* Unlikely, but possible, if there aren't any controls that
9603                  * work in the locale, repeat the loop, looking for any
9604                  * character that works */
9605                 DEBUG_L(PerlIO_printf(Perl_debug_log,
9606                 "mem_collxfrm_: No control worked.  Trying non-controls\n"));
9607             } /* End of loop to try first the controls, then any char */
9608 
9609             if (! cur_min_x) {
9610                 DEBUG_L(PerlIO_printf(Perl_debug_log,
9611                     "mem_collxfrm_: Couldn't find any character to replace"
9612                     " embedded NULs in locale %s with", PL_collation_name));
9613                 goto bad;
9614             }
9615 
9616             DEBUG_L(PerlIO_printf(Perl_debug_log,
9617                     "mem_collxfrm_: Replacing embedded NULs in locale %s with "
9618                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
9619 
9620             Safefree(cur_min_x);
9621         } /* End of determining the character that is to replace NULs */
9622 
9623         /* If the replacement is variant under UTF-8, it must match the
9624          * UTF8-ness of the original */
9625         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
9626             this_replacement_char[0] =
9627                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
9628             this_replacement_char[1] =
9629                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
9630             this_replacement_len = 2;
9631         }
9632         else {
9633             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
9634             /* this_replacement_char[1] = '\0' was done at initialization */
9635             this_replacement_len = 1;
9636         }
9637 
9638         /* The worst case length for the replaced string would be if every
9639          * character in it is NUL.  Multiply that by the length of each
9640          * replacement, and allow for a trailing NUL */
9641         sans_nuls_len = (len * this_replacement_len) + 1;
9642         Newx(sans_nuls, sans_nuls_len, char);
9643         *sans_nuls = '\0';
9644 
9645         /* Replace each NUL with the lowest collating control.  Loop until have
9646          * exhausted all the NULs */
9647         while (s + s_strlen < e) {
9648             my_strlcat(sans_nuls, s, sans_nuls_len);
9649 
9650             /* Do the actual replacement */
9651             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
9652 
9653             /* Move past the input NUL */
9654             s += s_strlen + 1;
9655             s_strlen = strlen(s);
9656         }
9657 
9658         /* And add anything that trails the final NUL */
9659         my_strlcat(sans_nuls, s, sans_nuls_len);
9660 
9661         /* Switch so below we transform this modified string */
9662         s = sans_nuls;
9663         len = strlen(s);
9664     } /* End of replacing NULs */
9665 
9666     /* Make sure the UTF8ness of the string and locale match */
9667     if (utf8 != PL_in_utf8_COLLATE_locale) {
9668         /* XXX convert above Unicode to 10FFFF? */
9669         const char * const t = s;   /* Temporary so we can later find where the
9670                                        input was */
9671 
9672         /* Here they don't match.  Change the string's to be what the locale is
9673          * expecting */
9674 
9675         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
9676             s = (char *) bytes_to_utf8((const U8 *) s, &len);
9677             utf8 = TRUE;
9678         }
9679         else {   /* locale is not UTF-8; but input is; downgrade the input */
9680 
9681             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
9682 
9683             /* If the downgrade was successful we are done, but if the input
9684              * contains things that require UTF-8 to represent, have to do
9685              * damage control ... */
9686             if (UNLIKELY(utf8)) {
9687 
9688                 /* What we do is construct a non-UTF-8 string with
9689                  *  1) the characters representable by a single byte converted
9690                  *     to be so (if necessary);
9691                  *  2) and the rest converted to collate the same as the
9692                  *     highest collating representable character.  That makes
9693                  *     them collate at the end.  This is similar to how we
9694                  *     handle embedded NULs, but we use the highest collating
9695                  *     code point instead of the smallest.  Like the NUL case,
9696                  *     this isn't perfect, but is the best we can reasonably
9697                  *     do.  Every above-255 code point will sort the same as
9698                  *     the highest-sorting 0-255 code point.  If that code
9699                  *     point can combine in a sequence with some other code
9700                  *     points for weight calculations, us changing something to
9701                  *     be it can adversely affect the results.  But in most
9702                  *     cases, it should work reasonably.  And note that this is
9703                  *     really an illegal situation: using code points above 255
9704                  *     on a locale where only 0-255 are valid.  If two strings
9705                  *     sort entirely equal, then the sort order for the
9706                  *     above-255 code points will be in code point order. */
9707 
9708                 utf8 = FALSE;
9709 
9710                 /* If we haven't calculated the code point with the maximum
9711                  * collating order for this locale, do so now */
9712                 if (! PL_strxfrm_max_cp) {
9713                     int j;
9714 
9715                     /* The current transformed string that collates the
9716                      * highest (except it also includes the prefixed collation
9717                      * index. */
9718                     char * cur_max_x = NULL;
9719 
9720                     /* Look through all legal code points (NUL isn't) */
9721                     for (j = 1; j < 256; j++) {
9722                         char * x;
9723                         STRLEN x_len;
9724                         char cur_source[] = { '\0', '\0' };
9725 
9726                         /* Create a 1-char string of the current code point */
9727                         cur_source[0] = (char) j;
9728 
9729                         /* Then transform it */
9730                         x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
9731 
9732                         /* If something went wrong (which it shouldn't), just
9733                          * ignore this code point */
9734                         if (! x) {
9735                             continue;
9736                         }
9737 
9738                         /* If this character's transformation is higher than
9739                          * the current highest, this one becomes the highest */
9740                         if (   cur_max_x == NULL
9741                             || strGT(x         + COLLXFRM_HDR_LEN,
9742                                      cur_max_x + COLLXFRM_HDR_LEN))
9743                         {
9744                             PL_strxfrm_max_cp = j;
9745                             Safefree(cur_max_x);
9746                             cur_max_x = x;
9747                         }
9748                         else {
9749                             Safefree(x);
9750                         }
9751                     }
9752 
9753                     if (! cur_max_x) {
9754                         DEBUG_L(PerlIO_printf(Perl_debug_log,
9755                             "mem_collxfrm_: Couldn't find any character to"
9756                             " replace above-Latin1 chars in locale %s with",
9757                             PL_collation_name));
9758                         goto bad;
9759                     }
9760 
9761                     DEBUG_L(PerlIO_printf(Perl_debug_log,
9762                             "mem_collxfrm_: highest 1-byte collating character"
9763                             " in locale %s is 0x%02X\n",
9764                             PL_collation_name,
9765                             PL_strxfrm_max_cp));
9766 
9767                     Safefree(cur_max_x);
9768                 }
9769 
9770                 /* Here we know which legal code point collates the highest.
9771                  * We are ready to construct the non-UTF-8 string.  The length
9772                  * will be at least 1 byte smaller than the input string
9773                  * (because we changed at least one 2-byte character into a
9774                  * single byte), but that is eaten up by the trailing NUL */
9775                 Newx(s, len, char);
9776 
9777                 {
9778                     STRLEN i;
9779                     STRLEN d= 0;
9780                     char * e = (char *) t + len;
9781 
9782                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
9783                         U8 cur_char = t[i];
9784                         if (UTF8_IS_INVARIANT(cur_char)) {
9785                             s[d++] = cur_char;
9786                         }
9787                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
9788                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
9789                         }
9790                         else {  /* Replace illegal cp with highest collating
9791                                    one */
9792                             s[d++] = PL_strxfrm_max_cp;
9793                         }
9794                     }
9795                     s[d++] = '\0';
9796                     Renew(s, d, char);   /* Free up unused space */
9797                 }
9798             }
9799         }
9800 
9801         /* Here, we have constructed a modified version of the input.  It could
9802          * be that we already had a modified copy before we did this version.
9803          * If so, that copy is no longer needed */
9804         if (t != input_string) {
9805             Safefree(t);
9806         }
9807     }
9808 
9809     length_in_chars = (utf8)
9810                       ? utf8_length((U8 *) s, (U8 *) s + len)
9811                       : len;
9812 
9813     /* The first element in the output is the collation id, used by
9814      * sv_collxfrm(); then comes the space for the transformed string.  The
9815      * equation should give us a good estimate as to how much is needed */
9816     xAlloc = COLLXFRM_HDR_LEN
9817            + PL_collxfrm_base
9818            + (PL_collxfrm_mult * length_in_chars);
9819     Newx(xbuf, xAlloc, char);
9820     if (UNLIKELY(! xbuf)) {
9821         DEBUG_L(PerlIO_printf(Perl_debug_log,
9822                       "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
9823         goto bad;
9824     }
9825 
9826     /* Store the collation id */
9827     *(PERL_UINTMAX_T *)xbuf = PL_collation_ix;
9828 
9829 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
9830 #    ifdef USE_LOCALE_CTYPE
9831 
9832     constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
9833                                    duplocale(use_curlocale_scratch()));
9834 #    else
9835 
9836     constructed_locale = duplocale(use_curlocale_scratch());
9837 
9838 #    endif
9839 #    define my_strxfrm(dest, src, n)  strxfrm_l(dest, src, n,           \
9840                                                 constructed_locale)
9841 #    define CLEANUP_STRXFRM                                             \
9842         STMT_START {                                                    \
9843             if (constructed_locale != (locale_t) 0)                     \
9844                 freelocale(constructed_locale);                         \
9845         } STMT_END
9846 #  else
9847 #    define my_strxfrm(dest, src, n)  strxfrm(dest, src, n)
9848 #    ifdef USE_LOCALE_CTYPE
9849 
9850     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
9851 
9852 #      define CLEANUP_STRXFRM                                           \
9853                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
9854 #    else
9855 #      define CLEANUP_STRXFRM  NOOP
9856 #    endif
9857 #  endif
9858 
9859     /* Then the transformation of the input.  We loop until successful, or we
9860      * give up */
9861     for (;;) {
9862 
9863         errno = 0;
9864         *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN,
9865                            s,
9866                            xAlloc - COLLXFRM_HDR_LEN);
9867 
9868 
9869         /* If the transformed string occupies less space than we told strxfrm()
9870          * was available, it means it transformed the whole string. */
9871         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
9872 
9873             /* But there still could have been a problem */
9874             if (errno != 0) {
9875                 DEBUG_L(PerlIO_printf(Perl_debug_log,
9876                        "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
9877                        PL_collation_name, errno,
9878                        _byte_dump_string((U8 *) s, len, 0)));
9879                 goto bad;
9880             }
9881 
9882             /* Here, the transformation was successful.  Some systems include a
9883              * trailing NUL in the returned length.  Ignore it, using a loop in
9884              * case multiple trailing NULs are returned. */
9885             while (   (*xlen) > 0
9886                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
9887             {
9888                 (*xlen)--;
9889             }
9890 
9891             /* If the first try didn't get it, it means our prediction was low.
9892              * Modify the coefficients so that we predict a larger value in any
9893              * future transformations */
9894             if (! first_time) {
9895                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
9896                 STRLEN computed_guess = PL_collxfrm_base
9897                                       + (PL_collxfrm_mult * length_in_chars);
9898 
9899                 /* On zero-length input, just keep current slope instead of
9900                  * dividing by 0 */
9901                 const STRLEN new_m = (length_in_chars != 0)
9902                                      ? needed / length_in_chars
9903                                      : PL_collxfrm_mult;
9904 
9905                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
9906                     "initial size of %zu bytes for a length "
9907                     "%zu string was insufficient, %zu needed\n",
9908                     computed_guess, length_in_chars, needed));
9909 
9910                 /* If slope increased, use it, but discard this result for
9911                  * length 1 strings, as we can't be sure that it's a real slope
9912                  * change */
9913                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
9914 
9915 #  ifdef DEBUGGING
9916 
9917                     STRLEN old_m = PL_collxfrm_mult;
9918                     STRLEN old_b = PL_collxfrm_base;
9919 
9920 #  endif
9921 
9922                     PL_collxfrm_mult = new_m;
9923                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
9924                     computed_guess = PL_collxfrm_base
9925                                     + (PL_collxfrm_mult * length_in_chars);
9926                     if (computed_guess < needed) {
9927                         PL_collxfrm_base += needed - computed_guess;
9928                     }
9929 
9930                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
9931                                     "slope is now %zu; was %zu, base "
9932                         "is now %zu; was %zu\n",
9933                         PL_collxfrm_mult, old_m,
9934                         PL_collxfrm_base, old_b));
9935                 }
9936                 else {  /* Slope didn't change, but 'b' did */
9937                     const STRLEN new_b = needed
9938                                         - computed_guess
9939                                         + PL_collxfrm_base;
9940                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
9941                         "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
9942                     PL_collxfrm_base = new_b;
9943                 }
9944             }
9945 
9946             break;
9947         }
9948 
9949         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
9950             DEBUG_L(PerlIO_printf(Perl_debug_log,
9951                   "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
9952                   *xlen, PERL_INT_MAX));
9953             goto bad;
9954         }
9955 
9956         /* A well-behaved strxfrm() returns exactly how much space it needs
9957          * (usually not including the trailing NUL) when it fails due to not
9958          * enough space being provided.  Assume that this is the case unless
9959          * it's been proven otherwise */
9960         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
9961             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
9962         }
9963         else { /* Here, either:
9964                 *  1)  The strxfrm() has previously shown bad behavior; or
9965                 *  2)  It isn't the first time through the loop, which means
9966                 *      that the strxfrm() is now showing bad behavior, because
9967                 *      we gave it what it said was needed in the previous
9968                 *      iteration, and it came back saying it needed still more.
9969                 *      (Many versions of cygwin fit this.  When the buffer size
9970                 *      isn't sufficient, they return the input size instead of
9971                 *      how much is needed.)
9972                 * Increase the buffer size by a fixed percentage and try again.
9973                 * */
9974             xAlloc += (xAlloc / 4) + 1;
9975             PL_strxfrm_is_behaved = FALSE;
9976 
9977             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
9978                      "mem_collxfrm_ required more space than previously"
9979                      " calculated for locale %s, trying again with new"
9980                      " guess=%zu+%zu\n",
9981                      PL_collation_name,  COLLXFRM_HDR_LEN,
9982                      xAlloc - COLLXFRM_HDR_LEN));
9983         }
9984 
9985         Renew(xbuf, xAlloc, char);
9986         if (UNLIKELY(! xbuf)) {
9987             DEBUG_L(PerlIO_printf(Perl_debug_log,
9988                       "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
9989             goto bad;
9990         }
9991 
9992         first_time = FALSE;
9993     }
9994 
9995     CLEANUP_STRXFRM;
9996 
9997     DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
9998 
9999     /* Free up unneeded space; retain enough for trailing NUL */
10000     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
10001 
10002     if (s != input_string) {
10003         Safefree(s);
10004     }
10005 
10006     return xbuf;
10007 
10008   bad:
10009 
10010     CLEANUP_STRXFRM;
10011     DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
10012 
10013     Safefree(xbuf);
10014     if (s != input_string) {
10015         Safefree(s);
10016     }
10017     *xlen = 0;
10018 
10019     return NULL;
10020 }
10021 
10022 #  ifdef DEBUGGING
10023 
10024 STATIC void
S_print_collxfrm_input_and_return(pTHX_ const char * s,const char * e,const char * xbuf,const STRLEN xlen,const bool is_utf8)10025 S_print_collxfrm_input_and_return(pTHX_
10026                                   const char * s,
10027                                   const char * e,
10028                                   const char * xbuf,
10029                                   const STRLEN xlen,
10030                                   const bool is_utf8)
10031 {
10032 
10033     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
10034 
10035     PerlIO_printf(Perl_debug_log,
10036                   "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n"
10037                   "     input=%s\n    return=%s\n    return len=%zu\n",
10038                   (UV) PL_collation_ix, PL_collation_name,
10039                   get_displayable_string(s, e, is_utf8),
10040                   ((xbuf == NULL)
10041                    ? "(null)"
10042                    : ((xlen == 0)
10043                       ? "(empty)"
10044                       : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
10045                                           xlen, 0))),
10046                   xlen);
10047 }
10048 
10049 #  endif    /* DEBUGGING */
10050 
10051 SV *
Perl_strxfrm(pTHX_ SV * src)10052 Perl_strxfrm(pTHX_ SV * src)
10053 {
10054     PERL_ARGS_ASSERT_STRXFRM;
10055 
10056     /* For use by POSIX::strxfrm().  If they differ, toggle LC_CTYPE to
10057      * LC_COLLATE to avoid potential mojibake.
10058      *
10059      * If we can't calculate a collation, 'src' is instead returned, so that
10060      * future comparisons will be by code point order */
10061 
10062 #  ifdef USE_LOCALE_CTYPE
10063 
10064     const char * orig_ctype = toggle_locale_c(LC_CTYPE,
10065                                               querylocale_c(LC_COLLATE));
10066 #  endif
10067 
10068     SV * dst = src;
10069     STRLEN dstlen;
10070     STRLEN srclen;
10071     const char *p = SvPV_const(src, srclen);
10072     const U32 utf8_flag = SvUTF8(src);
10073     char *d = mem_collxfrm_(p, srclen, &dstlen, cBOOL(utf8_flag));
10074 
10075     assert(utf8_flag == 0 || utf8_flag == SVf_UTF8);
10076 
10077     if (d != NULL) {
10078         assert(dstlen > 0);
10079         dst =newSVpvn_flags(d + COLLXFRM_HDR_LEN,
10080                             dstlen, SVs_TEMP|utf8_flag);
10081         Safefree(d);
10082     }
10083 
10084 #  ifdef USE_LOCALE_CTYPE
10085 
10086     restore_toggled_locale_c(LC_CTYPE, orig_ctype);
10087 
10088 #  endif
10089 
10090     return dst;
10091 }
10092 
10093 #endif /* USE_LOCALE_COLLATE */
10094 
10095 /* my_strerror() returns a mortalized copy of the text of the error message
10096  * associated with 'errnum'.
10097  *
10098  * If not called from within the scope of 'use locale', it uses the text from
10099  * the C locale.  If Perl is compiled to not pay attention to LC_CTYPE nor
10100  * LC_MESSAGES, it uses whatever strerror() returns.  Otherwise the text is
10101  * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
10102  *
10103  * It returns in *utf8ness the result's UTF-8ness
10104  *
10105  * The function just calls strerror(), but temporarily switches locales, if
10106  * needed.  Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
10107  * CODESET in order for the return from strerror() to not contain '?' symbols,
10108  * or worse, mojibaked.  It's cheaper to just use the stricter criteria of
10109  * being in the same locale.  So the code below uses a common locale for both
10110  * categories.  Again, that is C if not within 'use locale' scope; or the
10111  * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
10112  * don't have LC_MESSAGES; and whatever strerror returns if we don't have
10113  * either category.
10114  *
10115  * There are two sets of implementations.  The first below is if we have
10116  * strerror_l().  This is the simpler.  We just use the already-built C locale
10117  * object if not in locale scope, or build up a custom one otherwise.
10118  *
10119  * When strerror_l() is not available, we may have to swap locales temporarily
10120  * to bring the two categories into sync with each other, and possibly to the C
10121  * locale.
10122  *
10123  * Because the prepropessing directives to conditionally compile this function
10124  * would greatly obscure the logic of the various implementations, the whole
10125  * function is repeated for each configuration, with some common macros. */
10126 
10127 /* Used to shorten the definitions of the following implementations of
10128  * my_strerror() */
10129 #define DEBUG_STRERROR_ENTER(errnum, in_locale)                             \
10130     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
10131                            "my_strerror called with errnum %d;"             \
10132                            " Within locale scope=%d\n",                     \
10133                            errnum, in_locale))
10134 
10135 #define DEBUG_STRERROR_RETURN(errstr, utf8ness)                             \
10136     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
10137                            "Strerror returned; saving a copy: '%s';"        \
10138                            " utf8ness=%d\n",                                \
10139                            get_displayable_string(errstr,                   \
10140                                                   errstr + strlen(errstr),  \
10141                                                   *utf8ness),               \
10142                            (int) *utf8ness))
10143 
10144 /* On platforms that have precisely one of these categories (Windows
10145  * qualifies), these yield the correct one */
10146 #if defined(USE_LOCALE_CTYPE)
10147 #  define WHICH_LC_INDEX LC_CTYPE_INDEX_
10148 #elif defined(USE_LOCALE_MESSAGES)
10149 #  define WHICH_LC_INDEX LC_MESSAGES_INDEX_
10150 #endif
10151 
10152 /*===========================================================================*/
10153 /* First set of implementations, when have strerror_l() */
10154 
10155 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
10156 
10157 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
10158 
10159 /* Here, neither category is defined: use the C locale */
10160 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10161 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10162 {
10163     PERL_ARGS_ASSERT_MY_STRERROR;
10164 
10165     DEBUG_STRERROR_ENTER(errnum, 0);
10166 
10167     const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
10168     *utf8ness = UTF8NESS_IMMATERIAL;
10169 
10170     DEBUG_STRERROR_RETURN(errstr, utf8ness);
10171 
10172     SAVEFREEPV(errstr);
10173     return errstr;
10174 }
10175 
10176 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
10177 
10178 /*--------------------------------------------------------------------------*/
10179 
10180 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
10181  * are not within 'use locale' scope of the only one defined, we use the C
10182  * locale; otherwise use the current locale object */
10183 
10184 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10185 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10186 {
10187     PERL_ARGS_ASSERT_MY_STRERROR;
10188 
10189     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
10190 
10191     /* Use C if not within locale scope;  Otherwise, use current locale */
10192     const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
10193                                ? PL_C_locale_obj
10194                                : use_curlocale_scratch();
10195 
10196     const char *errstr = savepv(strerror_l(errnum, which_obj));
10197     *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
10198                                              NULL, WHICH_LC_INDEX);
10199     DEBUG_STRERROR_RETURN(errstr, utf8ness);
10200 
10201     SAVEFREEPV(errstr);
10202     return errstr;
10203 }
10204 
10205 /*--------------------------------------------------------------------------*/
10206 #  else     /* Are using both categories.  Place them in the same CODESET,
10207              * either C or the LC_MESSAGES locale */
10208 
10209 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10210 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10211 {
10212     PERL_ARGS_ASSERT_MY_STRERROR;
10213 
10214     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
10215 
10216     const char *errstr;
10217     if (! IN_LC(LC_MESSAGES)) {    /* Use C if not within locale scope */
10218         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
10219         *utf8ness = UTF8NESS_IMMATERIAL;
10220     }
10221     else {  /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
10222                matches */
10223         locale_t cur = duplocale(use_curlocale_scratch());
10224 
10225         const char * locale = querylocale_c(LC_MESSAGES);
10226         cur = newlocale(LC_CTYPE_MASK, locale, cur);
10227         errstr = savepv(strerror_l(errnum, cur));
10228         *utf8ness = get_locale_string_utf8ness_i(errstr,
10229                                                  LOCALE_UTF8NESS_UNKNOWN,
10230                                                  locale,
10231                                                  LC_MESSAGES_INDEX_);
10232         freelocale(cur);
10233     }
10234 
10235     DEBUG_STRERROR_RETURN(errstr, utf8ness);
10236 
10237     SAVEFREEPV(errstr);
10238     return errstr;
10239 }
10240 #  endif    /* Above is using strerror_l */
10241 /*===========================================================================*/
10242 #else       /* Below is not using strerror_l */
10243 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
10244 
10245 /* If not using using either of the categories, return plain, unadorned
10246  * strerror */
10247 
10248 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10249 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10250 {
10251     PERL_ARGS_ASSERT_MY_STRERROR;
10252 
10253     DEBUG_STRERROR_ENTER(errnum, 0);
10254 
10255     const char *errstr = savepv(Strerror(errnum));
10256     *utf8ness = UTF8NESS_IMMATERIAL;
10257 
10258     DEBUG_STRERROR_RETURN(errstr, utf8ness);
10259 
10260     SAVEFREEPV(errstr);
10261     return errstr;
10262 }
10263 
10264 /*--------------------------------------------------------------------------*/
10265 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
10266 
10267 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
10268  * are not within 'use locale' scope of the only one defined, we use the C
10269  * locale; otherwise use the current locale */
10270 
10271 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10272 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10273 {
10274     PERL_ARGS_ASSERT_MY_STRERROR;
10275 
10276     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
10277 
10278     const char *errstr;
10279     if (IN_LC(categories[WHICH_LC_INDEX])) {
10280         errstr = savepv(Strerror(errnum));
10281         *utf8ness = get_locale_string_utf8ness_i(errstr,
10282                                                  LOCALE_UTF8NESS_UNKNOWN,
10283                                                  NULL, WHICH_LC_INDEX);
10284     }
10285     else {
10286 
10287         LOCALE_LOCK;
10288 
10289         const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
10290 
10291         errstr = savepv(Strerror(errnum));
10292 
10293         restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
10294 
10295         LOCALE_UNLOCK;
10296 
10297         *utf8ness = UTF8NESS_IMMATERIAL;
10298     }
10299 
10300     DEBUG_STRERROR_RETURN(errstr, utf8ness);
10301 
10302     SAVEFREEPV(errstr);
10303     return errstr;
10304 }
10305 
10306 /*--------------------------------------------------------------------------*/
10307 #  else
10308 
10309 /* Below, have both LC_CTYPE and LC_MESSAGES.  Place them in the same CODESET,
10310  * either C or the LC_MESSAGES locale */
10311 
10312 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10313 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10314 {
10315     PERL_ARGS_ASSERT_MY_STRERROR;
10316 
10317     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
10318 
10319     const char * desired_locale = (IN_LC(LC_MESSAGES))
10320                                   ? querylocale_c(LC_MESSAGES)
10321                                   : "C";
10322     /* XXX Can fail on z/OS */
10323 
10324     LOCALE_LOCK;
10325 
10326     const char* orig_CTYPE_locale    = toggle_locale_c(LC_CTYPE,
10327                                                        desired_locale);
10328     const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
10329                                                        desired_locale);
10330     const char *errstr = savepv(Strerror(errnum));
10331 
10332     restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
10333     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
10334 
10335     LOCALE_UNLOCK;
10336 
10337     *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
10338                                              desired_locale,
10339                                              LC_MESSAGES_INDEX_);
10340     DEBUG_STRERROR_RETURN(errstr, utf8ness);
10341 
10342     SAVEFREEPV(errstr);
10343     return errstr;
10344 }
10345 
10346 /*--------------------------------------------------------------------------*/
10347 #  endif /* end of not using strerror_l() */
10348 #endif   /* end of all the my_strerror() implementations */
10349 
10350 bool
Perl__is_in_locale_category(pTHX_ const bool compiling,const int category)10351 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
10352 {
10353     /* Internal function which returns if we are in the scope of a pragma that
10354      * enables the locale category 'category'.  'compiling' should indicate if
10355      * this is during the compilation phase (TRUE) or not (FALSE). */
10356 
10357     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
10358 
10359     SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
10360     if (! these_categories || these_categories == &PL_sv_placeholder) {
10361         return FALSE;
10362     }
10363 
10364     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
10365      * a valid unsigned */
10366     assert(category >= -1);
10367     return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
10368 }
10369 
10370 /*
10371 
10372 =for apidoc_section $locale
10373 =for apidoc switch_to_global_locale
10374 
10375 This function copies the locale state of the calling thread into the program's
10376 global locale, and converts the thread to use that global locale.
10377 
10378 It is intended so that Perl can safely be used with C libraries that access the
10379 global locale and which can't be converted to not access it.  Effectively, this
10380 means libraries that call C<L<setlocale(3)>> on non-Windows systems.  (For
10381 portability, it is a good idea to use it on Windows as well.)
10382 
10383 A downside of using it is that it disables the services that Perl provides to
10384 hide locale gotchas from your code.  The service you most likely will miss
10385 regards the radix character (decimal point) in floating point numbers.  Code
10386 executed after this function is called can no longer just assume that this
10387 character is correct for the current circumstances.
10388 
10389 To return to Perl control, and restart the gotcha prevention services, call
10390 C<L</sync_locale>>.  Behavior is undefined for any pure Perl code that executes
10391 while the switch is in effect.
10392 
10393 The global locale and the per-thread locales are independent.  As long as just
10394 one thread converts to the global locale, everything works smoothly.  But if
10395 more than one does, they can easily interfere with each other, and races are
10396 likely.  On Windows systems prior to Visual Studio 15 (at which point Microsoft
10397 fixed a bug), races can occur (even if only one thread has been converted to
10398 the global locale), but only if you use the following operations:
10399 
10400 =over
10401 
10402 =item L<POSIX::localeconv|POSIX/localeconv>
10403 
10404 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
10405 
10406 =item L<perlapi/sv_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
10407 
10408 =back
10409 
10410 The first item is not fixable (except by upgrading to a later Visual Studio
10411 release), but it would be possible to work around the latter two items by
10412 having Perl change its algorithm for calculating these to use Windows API
10413 functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches
10414 welcome.
10415 
10416 XS code should never call plain C<setlocale>, but should instead be converted
10417 to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in
10418 for the system C<setlocale>) or use the methods given in L<perlcall> to call
10419 L<C<POSIX::setlocale>|POSIX/setlocale>.  Either one will transparently properly
10420 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
10421 
10422 =cut
10423 */
10424 
10425 #if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
10426 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL                                \
10427     STMT_START {                                                        \
10428         if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE) == -1) {    \
10429             locale_panic_("_configthreadlocale returned an error");     \
10430         }                                                               \
10431     } STMT_END
10432 #elif defined(USE_POSIX_2008_LOCALE)
10433 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL                                \
10434     STMT_START {                                                        \
10435         locale_t old_locale = uselocale(LC_GLOBAL_LOCALE);              \
10436         if (! old_locale) {                                             \
10437             locale_panic_("Could not change to global locale");         \
10438         }                                                               \
10439                                                                         \
10440         /* Free the per-thread memory */                                \
10441         if (   old_locale != LC_GLOBAL_LOCALE                           \
10442             && old_locale != PL_C_locale_obj)                           \
10443         {                                                               \
10444             freelocale(old_locale);                                     \
10445         }                                                               \
10446     } STMT_END
10447 #else
10448 #  define CHANGE_SYSTEM_LOCALE_TO_GLOBAL
10449 #endif
10450 
10451 void
Perl_switch_to_global_locale(pTHX)10452 Perl_switch_to_global_locale(pTHX)
10453 {
10454 
10455 #ifdef USE_LOCALE
10456 
10457     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n",
10458                                           get_LC_ALL_display()));
10459 
10460    /* In these cases, we use the system state to determine if we are in the
10461     * global locale or not. */
10462 #  ifdef USE_POSIX_2008_LOCALE
10463 
10464     const bool perl_controls = (LC_GLOBAL_LOCALE != uselocale((locale_t) 0));
10465 
10466 #  elif defined(USE_THREAD_SAFE_LOCALE) && defined(WIN32)
10467 
10468     int config_return = _configthreadlocale(0);
10469     if (config_return == -1) {
10470         locale_panic_("_configthreadlocale returned an error");
10471     }
10472     const bool perl_controls = (config_return == _ENABLE_PER_THREAD_LOCALE);
10473 
10474 #  else
10475 
10476     const bool perl_controls = false;
10477 
10478 #  endif
10479 
10480     /* No-op if already in global */
10481     if (! perl_controls) {
10482         return;
10483     }
10484 
10485 #  ifdef LC_ALL
10486 
10487     const char * thread_locale = calculate_LC_ALL_string(NULL,
10488                                                          EXTERNAL_FORMAT_FOR_SET,
10489                                                          WANT_TEMP_PV,
10490                                                          __LINE__);
10491     CHANGE_SYSTEM_LOCALE_TO_GLOBAL;
10492     posix_setlocale(LC_ALL, thread_locale);
10493 
10494 #  else   /* Must be USE_POSIX_2008_LOCALE) */
10495 
10496     const char * cur_thread_locales[LC_ALL_INDEX_];
10497 
10498     /* Save each category's current per-thread state */
10499     for_all_individual_category_indexes(i) {
10500         cur_thread_locales[i] = querylocale_i(i);
10501     }
10502 
10503     CHANGE_SYSTEM_LOCALE_TO_GLOBAL;
10504 
10505     /* Set the global to what was our per-thread state */
10506     POSIX_SETLOCALE_LOCK;
10507     for_all_individual_category_indexes(i) {
10508         posix_setlocale(categories[i], cur_thread_locales[i]);
10509     }
10510     POSIX_SETLOCALE_UNLOCK;
10511 
10512 #  endif
10513 #  ifdef USE_LOCALE_NUMERIC
10514 
10515     /* Switch to the underlying C numeric locale; the application is on its
10516      * own. */
10517     POSIX_SETLOCALE_LOCK;
10518     posix_setlocale(LC_NUMERIC, PL_numeric_name);
10519     POSIX_SETLOCALE_UNLOCK;
10520 
10521 #  endif
10522 #endif
10523 
10524 }
10525 
10526 /*
10527 
10528 =for apidoc sync_locale
10529 
10530 This function copies the state of the program global locale into the calling
10531 thread, and converts that thread to using per-thread locales, if it wasn't
10532 already, and the platform supports them.  The LC_NUMERIC locale is toggled into
10533 the standard state (using the C locale's conventions), if not within the
10534 lexical scope of S<C<use locale>>.
10535 
10536 Perl will now consider itself to have control of the locale.
10537 
10538 Since unthreaded perls have only a global locale, this function is a no-op
10539 without threads.
10540 
10541 This function is intended for use with C libraries that do locale manipulation.
10542 It allows Perl to accommodate the use of them.  Call this function before
10543 transferring back to Perl space so that it knows what state the C code has left
10544 things in.
10545 
10546 XS code should not manipulate the locale on its own.  Instead,
10547 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
10548 change the locale (though changing the locale is antisocial and dangerous on
10549 multi-threaded systems that don't have multi-thread safe locale operations.
10550 (See L<perllocale/Multi-threaded operation>).
10551 
10552 Using the libc L<C<setlocale(3)>> function should be avoided.  Nevertheless,
10553 certain non-Perl libraries called from XS, do call it, and their behavior may
10554 not be able to be changed.  This function, along with
10555 C<L</switch_to_global_locale>>, can be used to get seamless behavior in these
10556 circumstances, as long as only one thread is involved.
10557 
10558 If the library has an option to turn off its locale manipulation, doing that is
10559 preferable to using this mechanism.  C<Gtk> is such a library.
10560 
10561 The return value is a boolean: TRUE if the global locale at the time of call
10562 was in effect for the caller; and FALSE if a per-thread locale was in effect.
10563 
10564 =cut
10565 */
10566 
10567 bool
Perl_sync_locale(pTHX)10568 Perl_sync_locale(pTHX)
10569 {
10570 
10571 #ifndef USE_LOCALE
10572 
10573     return TRUE;
10574 
10575 #else
10576 
10577     bool was_in_global = TRUE;
10578 
10579 #  ifdef USE_THREAD_SAFE_LOCALE
10580 #    if defined(WIN32)
10581 
10582     int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
10583     if (config_return == -1) {
10584         locale_panic_("_configthreadlocale returned an error");
10585     }
10586     was_in_global = (config_return == _DISABLE_PER_THREAD_LOCALE);
10587 
10588 #    elif defined(USE_POSIX_2008_LOCALE)
10589 
10590     was_in_global = (LC_GLOBAL_LOCALE == uselocale(LC_GLOBAL_LOCALE));
10591 
10592 #    else
10593 #      error Unexpected Configuration
10594 #    endif
10595 #  endif    /* USE_THREAD_SAFE_LOCALE */
10596 
10597     /* Here, we are in the global locale.  Get and save the values for each
10598      * category, and convert the current thread to use them */
10599 
10600 #  ifdef LC_ALL
10601 
10602     STDIZED_SETLOCALE_LOCK;
10603     const char * lc_all_string = savepv(stdized_setlocale(LC_ALL, NULL));
10604     STDIZED_SETLOCALE_UNLOCK;
10605 
10606     give_perl_locale_control(lc_all_string, __LINE__);
10607     Safefree(lc_all_string);
10608 
10609 #  else
10610 
10611     const char * current_globals[LC_ALL_INDEX_];
10612     for_all_individual_category_indexes(i) {
10613         STDIZED_SETLOCALE_LOCK;
10614         current_globals[i] = savepv(stdized_setlocale(categories[i], NULL));
10615         STDIZED_SETLOCALE_UNLOCK;
10616     }
10617 
10618     give_perl_locale_control((const char **) &current_globals, __LINE__);
10619 
10620     for_all_individual_category_indexes(i) {
10621         Safefree(current_globals[i]);
10622     }
10623 
10624 #  endif
10625 
10626     return was_in_global;
10627 
10628 #endif
10629 
10630 }
10631 
10632 #ifdef USE_PERL_SWITCH_LOCALE_CONTEXT
10633 
10634 void
Perl_switch_locale_context(pTHX)10635 Perl_switch_locale_context(pTHX)
10636 {
10637     /* libc keeps per-thread locale status information in some configurations.
10638      * So, we can't just switch out aTHX to switch to a new thread.  libc has
10639      * to follow along.  This routine does that based on per-interpreter
10640      * variables we keep just for this purpose.
10641      *
10642      * There are two implementations where this is an issue.  For the other
10643      * implementations, it doesn't matter because libc is using global values
10644      * that all threads know about.
10645      *
10646      * The two implementations are where libc keeps thread-specific information
10647      * on its own.  These are
10648      *
10649      * POSIX 2008:  The current locale is kept by libc as an object.  We save
10650      *              a copy of that in the per-thread PL_cur_locale_obj, and so
10651      *              this routine uses that copy to tell the thread it should be
10652      *              operating with that object
10653      * Windows thread-safe locales:  A given thread in Windows can be being run
10654      *              with per-thread locales, or not.  When the thread context
10655      *              changes, libc doesn't automatically know if the thread is
10656      *              using per-thread locales, nor does it know what the new
10657      *              thread's locale is.  We keep that information in the
10658      *              per-thread variables:
10659      *                  PL_controls_locale  indicates if this thread is using
10660      *                                      per-thread locales or not
10661      *                  PL_cur_LC_ALL       indicates what the locale should be
10662      *                                      if it is a per-thread locale.
10663      */
10664 
10665     if (UNLIKELY(   PL_veto_switch_non_tTHX_context
10666                  || PL_phase == PERL_PHASE_CONSTRUCT))
10667     {
10668         return;
10669     }
10670 
10671 #  ifdef USE_POSIX_2008_LOCALE
10672 
10673     if (! uselocale(PL_cur_locale_obj)) {
10674         locale_panic_(Perl_form(aTHX_
10675                                 "Can't uselocale(%p), LC_ALL supposed to"
10676                                 " be '%s'",
10677                                 PL_cur_locale_obj, get_LC_ALL_display()));
10678     }
10679 
10680 #  elif defined(WIN32)
10681 
10682     if (! bool_setlocale_c(LC_ALL, PL_cur_LC_ALL)) {
10683         locale_panic_(Perl_form(aTHX_ "Can't setlocale(%s)", PL_cur_LC_ALL));
10684     }
10685 
10686 #  endif
10687 
10688 }
10689 
10690 #endif
10691 #ifdef USE_THREADS
10692 
10693 void
Perl_thread_locale_init(pTHX)10694 Perl_thread_locale_init(pTHX)
10695 {
10696 
10697 #  ifdef USE_THREAD_SAFE_LOCALE
10698 #    ifdef USE_POSIX_2008_LOCALE
10699 
10700     /* Called from a thread on startup.
10701      *
10702      * The operations here have to be done from within the calling thread, as
10703      * they affect libc's knowledge of the thread; libc has no knowledge of
10704      * aTHX */
10705 
10706      DEBUG_L(PerlIO_printf(Perl_debug_log,
10707                            "new thread, initial locale is %s;"
10708                            " calling setlocale(LC_ALL, \"C\")\n",
10709                            get_LC_ALL_display()));
10710 
10711     if (! uselocale(PL_C_locale_obj)) {
10712 
10713         /* Not being able to change to the C locale is severe; don't keep
10714          * going.  */
10715         locale_panic_(Perl_form(aTHX_
10716                                 "Can't uselocale(%p), 'C'", PL_C_locale_obj));
10717         NOT_REACHED; /* NOTREACHED */
10718     }
10719 
10720     PL_cur_locale_obj = PL_C_locale_obj;
10721 
10722 #    elif defined(WIN32)
10723 
10724     /* On Windows, make sure new thread has per-thread locales enabled */
10725     if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
10726         locale_panic_("_configthreadlocale returned an error");
10727     }
10728     void_setlocale_c(LC_ALL, "C");
10729 
10730 #    endif
10731 #  endif
10732 
10733 }
10734 
10735 void
Perl_thread_locale_term(pTHX)10736 Perl_thread_locale_term(pTHX)
10737 {
10738     /* Called from a thread as it gets ready to terminate.
10739      *
10740      * The operations here have to be done from within the calling thread, as
10741      * they affect libc's knowledge of the thread; libc has no knowledge of
10742      * aTHX */
10743 
10744 #  if defined(USE_POSIX_2008_LOCALE)
10745 
10746     /* Switch to the global locale, so can free up the per-thread object */
10747     locale_t actual_obj = uselocale(LC_GLOBAL_LOCALE);
10748     if (actual_obj != LC_GLOBAL_LOCALE && actual_obj != PL_C_locale_obj) {
10749         freelocale(actual_obj);
10750     }
10751 
10752     /* Prevent leaks even if something has gone wrong */
10753     locale_t expected_obj = PL_cur_locale_obj;
10754     if (UNLIKELY(   expected_obj != actual_obj
10755                  && expected_obj != LC_GLOBAL_LOCALE
10756                  && expected_obj != PL_C_locale_obj))
10757     {
10758         freelocale(expected_obj);
10759     }
10760 
10761     PL_cur_locale_obj = LC_GLOBAL_LOCALE;
10762 
10763 #  endif
10764 #  ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
10765 
10766     /* When faking the mingw implementation, we coerce this function into doing
10767      * something completely different from its intent -- namely to free up our
10768      * static buffer to avoid a leak.  This function gets called for each
10769      * thread that is terminating, so will give us a chance to free the buffer
10770      * from the appropriate pool.  On unthreaded systems, it gets called by the
10771      * mutex termination code. */
10772 
10773     if (aTHX != wsetlocale_buf_aTHX) {
10774         return;
10775     }
10776 
10777     if (wsetlocale_buf_size > 0) {
10778         Safefree(wsetlocale_buf);
10779         wsetlocale_buf_size = 0;
10780     }
10781 
10782 #  endif
10783 
10784 }
10785 
10786 #endif
10787 
10788 /*
10789  * ex: set ts=8 sts=4 sw=4 et:
10790  */
10791