xref: /openbsd/gnu/usr.bin/perl/locale.c (revision d635388b)
1 /*    locale.c
2  *
3  *    Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4  *    2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  *      A Elbereth Gilthoniel,
13  *      silivren penna míriel
14  *      o menel aglar elenath!
15  *      Na-chaered palan-díriel
16  *      o galadhremmin ennorath,
17  *      Fanuilos, le linnathon
18  *      nef aear, si nef aearon!
19  *
20  *     [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
21  */
22 
23 /* utility functions for handling locale-specific stuff like what
24  * character represents the decimal point.
25  *
26  * All C programs have an underlying locale.  Perl code generally doesn't pay
27  * any attention to it except within the scope of a 'use locale'.  For most
28  * categories, it accomplishes this by just using different operations if it is
29  * in such scope than if not.  However, various libc functions called by Perl
30  * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31  * are used to toggle between the current locale and the C locale depending on
32  * the desired behavior of those functions at the moment.  And, LC_MESSAGES is
33  * switched to the C locale for outputting the message unless within the scope
34  * of 'use locale'.
35  *
36  * This code now has multi-thread-safe locale handling on systems that support
37  * that.  This is completely transparent to most XS code.  On earlier systems,
38  * it would be possible to emulate thread-safe locales, but this likely would
39  * involve a lot of locale switching, and would require XS code changes.
40  * Macros could be written so that the code wouldn't have to know which type of
41  * system is being used.
42  *
43  * Table-driven code is used for simplicity and clarity, as many operations
44  * differ only in which category is being worked on.  However the system
45  * categories need not be small contiguous integers, so do not lend themselves
46  * to table lookup.  Instead we have created our own equivalent values which
47  * are all small contiguous non-negative integers, and translation functions
48  * between the two sets.  For category 'LC_foo', the name of our index is
49  * LC_foo_INDEX_.  Various parallel tables, indexed by these, are used.
50  *
51  * Many of the macros and functions in this file have one of the suffixes '_c',
52  * '_r', or '_i'.  khw found these useful in remembering what type of locale
53  * category to use as their parameter.  '_r' takes an int category number as
54  * passed to setlocale(), like LC_ALL, LC_CTYPE, etc.  The 'r' indicates that
55  * the value isn't known until runtime.  '_c' also indicates such a category
56  * number, but its value is known at compile time.  These are both converted
57  * into unsigned indexes into various tables of category information, where the
58  * real work is generally done.  The tables are generated at compile-time based
59  * on platform characteristics and Configure options.  They hide from the code
60  * many of the vagaries of the different locale implementations out there.  You
61  * may have already guessed that '_i' indicates the parameter is such an
62  * unsigned index.  Converting from '_r' to '_i' requires run-time lookup.
63  * '_c' is used to get cpp to do this at compile time.  To avoid the runtime
64  * expense, the code is structured to use '_r' at the API level, and once
65  * converted, everything possible is done using the table indexes.
66  *
67  * On unthreaded perls, most operations expand out to just the basic
68  * setlocale() calls.  The same is true on threaded perls on modern Windows
69  * systems where the same API, after set up, is used for thread-safe locale
70  * handling.  On other systems, there is a completely different API, specified
71  * in POSIX 2008, to do thread-safe locales.  On these systems, our
72  * emulate_setlocale_i() function is used to hide the different API from the
73  * outside.  This makes it completely transparent to most XS code.
74  *
75  * A huge complicating factor is that the LC_NUMERIC category is normally held
76  * in the C locale, except during those relatively rare times when it needs to
77  * be in the underlying locale.  There is a bunch of code to accomplish this,
78  * and to allow easy switches from one state to the other.
79  *
80  * In addition, the setlocale equivalents have versions for the return context,
81  * 'void' and 'bool', besides the full return value.  This can present
82  * opportunities for avoiding work.  We don't have to necessarily create a safe
83  * copy to return if no return is desired.
84  *
85  * There are 3.5 major implementations here; which one chosen depends on what
86  * the platform has available, and Configuration options.
87  *
88  * 1) Raw my_setlocale().  Here the layer adds nothing.  This is used for
89  *    unthreaded perls, and when the API for safe locale threading is identical
90  *    to the unsafe API (Windows, currently).
91  *
92  * 2) A minimal layer that makes my_setlocale() uninterruptible and returns a
93  *    per-thread/per-category value.
94  *
95  * 3a and 3b) A layer that implements POSIX 2008 thread-safe locale handling,
96  *    mapping the setlocale() API to them.  This automatically makes almost all
97  *    code thread-safe without need for changes.  This layer is chosen on
98  *    threaded perls when the platform supports the POSIX 2008 functions, and
99  *    when there is no manual override in Configure.
100  *
101  *    3a) is when the platform has a reliable querylocale() function or
102  *        equivalent that is selected to be used.
103  *    3b) is when we have to emulate that functionality.
104  *
105  * z/OS (os390) is an outlier.  Locales really don't work under threads when
106  * either the radix character isn't a dot, or attempts are made to change
107  * locales after the first thread is created.  The reason is that IBM has made
108  * it thread-safe by refusing to change locales (returning failure if
109  * attempted) any time after an application has called pthread_create() to
110  * create another thread.  The expectation is that an application will set up
111  * its locale information before the first fork, and be stable thereafter.  But
112  * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
113  * the other toggles, which are less common.
114  */
115 
116 /* If the environment says to, we can output debugging information during
117  * initialization.  This is done before option parsing, and before any thread
118  * creation, so can be a file-level static.  (Must come before #including
119  * perl.h) */
120 #ifdef DEBUGGING
121 static int debug_initialization = 0;
122 #  define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
123 #  define DEBUG_LOCALE_INITIALIZATION_  debug_initialization
124 /* C standards seem to say that __LINE__ is merely "an integer constant",
125  * which means it might be either int, long (with L suffix), or long long
126  * (or their corresponding unsigned type).  So, we have to explicitly cast
127  * __LINE__ to a particular integer type to pass it reliably to variadic
128  * functions like (PerlIO_)printf, as below: */
129 #  ifdef USE_LOCALE_THREADS
130 #    define DEBUG_PRE_STMTS                                                     \
131      dSAVE_ERRNO; dTHX; PerlIO_printf(Perl_debug_log,"\n%s: %" LINE_Tf ": %p: ",\
132                                       __FILE__, (line_t)__LINE__, aTHX);
133 #  else
134 #    define DEBUG_PRE_STMTS                                                     \
135      dSAVE_ERRNO; dTHX; PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": ",   \
136                                       __FILE__, (line_t)__LINE__);
137 #  endif
138 #  define DEBUG_POST_STMTS  RESTORE_ERRNO;
139 #else
140 #  define debug_initialization 0
141 #  define DEBUG_INITIALIZATION_set(v)
142 #  define DEBUG_PRE_STMTS
143 #  define DEBUG_POST_STMTS
144 #endif
145 
146 #include "EXTERN.h"
147 #define PERL_IN_LOCALE_C
148 #include "perl.h"
149 
150 #include "reentr.h"
151 
152 #ifdef I_WCHAR
153 #  include <wchar.h>
154 #endif
155 #ifdef I_WCTYPE
156 #  include <wctype.h>
157 #endif
158 
159  /* The main errno that gets used is this one, on platforms that support it */
160 #ifdef EINVAL
161 #  define SET_EINVAL  SETERRNO(EINVAL, LIB_INVARG)
162 #else
163 #  define SET_EINVAL
164 #endif
165 
166 /* If we have any of these library functions, we can reliably determine is a
167  * locale is a UTF-8 one or not.  And if we aren't using locales at all, we act
168  * as if everything is the C locale, so the answer there is always "No, it
169  * isn't UTF-8"; this too is reliably accurate */
170 #if   defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) \
171  || ! defined(USE_LOCALE)
172 #  define HAS_RELIABLE_UTF8NESS_DETERMINATION
173 #endif
174 
175 #ifdef USE_LOCALE
176 
177 PERL_STATIC_INLINE const char *
S_mortalized_pv_copy(pTHX_ const char * const pv)178 S_mortalized_pv_copy(pTHX_ const char * const pv)
179 {
180     PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
181 
182     /* Copies the input pv, and arranges for it to be freed at an unspecified
183      * later time. */
184 
185     if (pv == NULL) {
186         return NULL;
187     }
188 
189     const char * copy = savepv(pv);
190     SAVEFREEPV(copy);
191     return copy;
192 }
193 
194 #endif
195 
196 /* Returns the Unix errno portion; ignoring any others.  This is a macro here
197  * instead of putting it into perl.h, because unclear to khw what should be
198  * done generally. */
199 #define GET_ERRNO   saved_errno
200 
201 /* Default values come from the C locale */
202 #define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
203                                       a single instance, so is a #define */
204 static const char C_decimal_point[] = ".";
205 static const char C_thousands_sep[] = "";
206 
207 /* Is the C string input 'name' "C" or "POSIX"?  If so, and 'name' is the
208  * return of setlocale(), then this is extremely likely to be the C or POSIX
209  * locale.  However, the output of setlocale() is documented to be opaque, but
210  * the odds are extremely small that it would return these two strings for some
211  * other locale.  Note that VMS in these two locales includes many non-ASCII
212  * characters as controls and punctuation (below are hex bytes):
213  *   cntrl:  84-97 9B-9F
214  *   punct:  A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
215  * Oddly, none there are listed as alphas, though some represent alphabetics
216  * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
217 #define isNAME_C_OR_POSIX(name)                                              \
218                              (   (name) != NULL                              \
219                               && (( *(name) == 'C' && (*(name + 1)) == '\0') \
220                                    || strEQ((name), "POSIX")))
221 
222 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
223 #  define HAS_SOME_LANGINFO
224 #endif
225 
226 #define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
227             my_langinfo_i(item, category##_INDEX_, locale, retbufp,            \
228                                                       retbuf_sizep,  utf8ness)
229 
230 #ifdef USE_LOCALE
231 
232 #  ifdef DEBUGGING
233 #    define setlocale_debug_string_i(index, locale, result)                 \
234             my_setlocale_debug_string_i(index, locale, result, __LINE__)
235 #    define setlocale_debug_string_c(category, locale, result)              \
236                 setlocale_debug_string_i(category##_INDEX_, locale, result)
237 #    define setlocale_debug_string_r(category, locale, result)              \
238              setlocale_debug_string_i(get_category_index(category, locale), \
239                                       locale, result)
240 #  endif
241 
242 #  define toggle_locale_i(index, locale)                                    \
243                  S_toggle_locale_i(aTHX_ index, locale, __LINE__)
244 #  define toggle_locale_c(cat, locale)  toggle_locale_i(cat##_INDEX_, locale)
245 #  define restore_toggled_locale_i(index, locale)                           \
246                  S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
247 #  define restore_toggled_locale_c(cat, locale)                             \
248                              restore_toggled_locale_i(cat##_INDEX_, locale)
249 
250 /* Two parallel arrays indexed by our mapping of category numbers into small
251  * non-negative indexes; first the locale categories Perl uses on this system,
252  * used to do the inverse mapping.  The second array is their names.  These
253  * arrays are in mostly arbitrary order. */
254 
255 STATIC const int categories[] = {
256 
257 #    ifdef USE_LOCALE_CTYPE
258                              LC_CTYPE,
259 #    endif
260 #  ifdef USE_LOCALE_NUMERIC
261                              LC_NUMERIC,
262 #  endif
263 #    ifdef USE_LOCALE_COLLATE
264                              LC_COLLATE,
265 #    endif
266 #    ifdef USE_LOCALE_TIME
267                              LC_TIME,
268 #    endif
269 #    ifdef USE_LOCALE_MESSAGES
270                              LC_MESSAGES,
271 #    endif
272 #    ifdef USE_LOCALE_MONETARY
273                              LC_MONETARY,
274 #    endif
275 #    ifdef USE_LOCALE_ADDRESS
276                              LC_ADDRESS,
277 #    endif
278 #    ifdef USE_LOCALE_IDENTIFICATION
279                              LC_IDENTIFICATION,
280 #    endif
281 #    ifdef USE_LOCALE_MEASUREMENT
282                              LC_MEASUREMENT,
283 #    endif
284 #    ifdef USE_LOCALE_PAPER
285                              LC_PAPER,
286 #    endif
287 #    ifdef USE_LOCALE_TELEPHONE
288                              LC_TELEPHONE,
289 #    endif
290 #    ifdef USE_LOCALE_NAME
291                              LC_NAME,
292 #    endif
293 #    ifdef USE_LOCALE_SYNTAX
294                              LC_SYNTAX,
295 #    endif
296 #    ifdef USE_LOCALE_TOD
297                              LC_TOD,
298 #    endif
299 #    ifdef LC_ALL
300                              LC_ALL,
301 #    endif
302 
303    /* Placeholder as a precaution if code fails to check the return of
304     * get_category_index(), which returns this element to indicate an error */
305                             -1
306 };
307 
308 /* The top-most real element is LC_ALL */
309 
310 STATIC const char * const category_names[] = {
311 
312 #    ifdef USE_LOCALE_CTYPE
313                                  "LC_CTYPE",
314 #    endif
315 #  ifdef USE_LOCALE_NUMERIC
316                                  "LC_NUMERIC",
317 #  endif
318 #    ifdef USE_LOCALE_COLLATE
319                                  "LC_COLLATE",
320 #    endif
321 #    ifdef USE_LOCALE_TIME
322                                  "LC_TIME",
323 #    endif
324 #    ifdef USE_LOCALE_MESSAGES
325                                  "LC_MESSAGES",
326 #    endif
327 #    ifdef USE_LOCALE_MONETARY
328                                  "LC_MONETARY",
329 #    endif
330 #    ifdef USE_LOCALE_ADDRESS
331                                  "LC_ADDRESS",
332 #    endif
333 #    ifdef USE_LOCALE_IDENTIFICATION
334                                  "LC_IDENTIFICATION",
335 #    endif
336 #    ifdef USE_LOCALE_MEASUREMENT
337                                  "LC_MEASUREMENT",
338 #    endif
339 #    ifdef USE_LOCALE_PAPER
340                                  "LC_PAPER",
341 #    endif
342 #    ifdef USE_LOCALE_TELEPHONE
343                                  "LC_TELEPHONE",
344 #    endif
345 #    ifdef USE_LOCALE_NAME
346                                  "LC_NAME",
347 #    endif
348 #    ifdef USE_LOCALE_SYNTAX
349                                  "LC_SYNTAX",
350 #    endif
351 #    ifdef USE_LOCALE_TOD
352                                  "LC_TOD",
353 #    endif
354 #    ifdef LC_ALL
355                                  "LC_ALL",
356 #    endif
357 
358    /* Placeholder as a precaution if code fails to check the return of
359     * get_category_index(), which returns this element to indicate an error */
360                                  NULL
361 };
362 
363 /* A few categories require additional setup when they are changed.  This table
364  * points to the functions that do that setup */
365 STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = {
366 #  ifdef USE_LOCALE_CTYPE
367                                 S_new_ctype,
368 #  endif
369 #  ifdef USE_LOCALE_NUMERIC
370                                 S_new_numeric,
371 #  endif
372 #  ifdef USE_LOCALE_COLLATE
373                                 S_new_collate,
374 #  endif
375 #  ifdef USE_LOCALE_TIME
376                                 NULL,
377 #  endif
378 #  ifdef USE_LOCALE_MESSAGES
379                                 NULL,
380 #  endif
381 #  ifdef USE_LOCALE_MONETARY
382                                 NULL,
383 #  endif
384 #  ifdef USE_LOCALE_ADDRESS
385                                 NULL,
386 #  endif
387 #  ifdef USE_LOCALE_IDENTIFICATION
388                                 NULL,
389 #  endif
390 #  ifdef USE_LOCALE_MEASUREMENT
391                                 NULL,
392 #  endif
393 #  ifdef USE_LOCALE_PAPER
394                                 NULL,
395 #  endif
396 #  ifdef USE_LOCALE_TELEPHONE
397                                 NULL,
398 #  endif
399 #  ifdef USE_LOCALE_NAME
400                                 NULL,
401 #  endif
402 #  ifdef USE_LOCALE_SYNTAX
403                                 NULL,
404 #  endif
405 #  ifdef USE_LOCALE_TOD
406                                 NULL,
407 #  endif
408     /* No harm done to have this even without an LC_ALL */
409                                 S_new_LC_ALL,
410 
411    /* Placeholder as a precaution if code fails to check the return of
412     * get_category_index(), which returns this element to indicate an error */
413                                 NULL
414 };
415 
416 #  ifdef LC_ALL
417 
418     /* On systems with LC_ALL, it is kept in the highest index position.  (-2
419      * to account for the final unused placeholder element.) */
420 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
421 #  else
422 
423     /* On systems without LC_ALL, we pretend it is there, one beyond the real
424      * top element, hence in the unused placeholder element. */
425 #    define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
426 #  endif
427 
428 /* Pretending there is an LC_ALL element just above allows us to avoid most
429  * special cases.  Most loops through these arrays in the code below are
430  * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'.  They will work
431  * on either type of system.  But the code must be written to not access the
432  * element at 'LC_ALL_INDEX_' except on platforms that have it.  This can be
433  * checked for at compile time by using the #define LC_ALL_INDEX_ which is only
434  * defined if we do have LC_ALL. */
435 
436 STATIC int
S_get_category_index_nowarn(const int category)437 S_get_category_index_nowarn(const int category)
438 {
439     /* Given a category, return the equivalent internal index we generally use
440      * instead, or negative if not found.
441      *
442      * Some sort of hash could be used instead of this loop, but the number of
443      * elements is so far at most 12 */
444 
445     unsigned int i;
446 
447     PERL_ARGS_ASSERT_GET_CATEGORY_INDEX;
448 
449 #  ifdef LC_ALL
450     for (i = 0; i <=         LC_ALL_INDEX_; i++)
451 #  else
452     for (i = 0; i <  NOMINAL_LC_ALL_INDEX;  i++)
453 #  endif
454     {
455         if (category == categories[i]) {
456             dTHX_DEBUGGING;
457             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
458                                    "index of category %d (%s) is %d\n",
459                                    category, category_names[i], i));
460             return i;
461         }
462     }
463 
464     return -1;
465 }
466 
467 STATIC unsigned int
S_get_category_index(const int category,const char * locale)468 S_get_category_index(const int category, const char * locale)
469 {
470     /* Given a category, return the equivalent internal index we generally use
471      * instead.
472      *
473      * 'locale' is for use in any generated diagnostics, and may be NULL
474      */
475 
476     const char * conditional_warn_text = "; can't set it to ";
477     const int index = get_category_index_nowarn(category);
478 
479     if (index >= 0) {
480         return index;
481     }
482 
483     /* Here, we don't know about this category, so can't handle it. */
484 
485     if (! locale) {
486         locale = "";
487         conditional_warn_text = "";
488     }
489 
490     /* diag_listed_as: Unknown locale category %d; can't set it to %s */
491     Perl_warner_nocontext(packWARN(WARN_LOCALE),
492                           "Unknown locale category %d%s%s",
493                           category, conditional_warn_text, locale);
494 
495     SET_EINVAL;
496 
497     /* Return an out-of-bounds value */
498     return NOMINAL_LC_ALL_INDEX + 1;
499 }
500 
501 #endif /* ifdef USE_LOCALE */
502 
503 void
Perl_force_locale_unlock()504 Perl_force_locale_unlock()
505 {
506 
507 #if defined(USE_LOCALE_THREADS)
508 
509     dTHX;
510 
511     /* If recursively locked, clear all at once */
512     if (PL_locale_mutex_depth > 1) {
513         PL_locale_mutex_depth = 1;
514     }
515 
516     if (PL_locale_mutex_depth > 0) {
517         LOCALE_UNLOCK_;
518     }
519 
520 #endif
521 
522 }
523 
524 #ifdef USE_POSIX_2008_LOCALE
525 
526 STATIC locale_t
S_use_curlocale_scratch(pTHX)527 S_use_curlocale_scratch(pTHX)
528 {
529     /* This function is used to hide from the caller the case where the current
530      * locale_t object in POSIX 2008 is the global one, which is illegal in
531      * many of the P2008 API calls.  This checks for that and, if necessary
532      * creates a proper P2008 object.  Any prior object is deleted, as is any
533      * remaining object during global destruction. */
534 
535     locale_t cur = uselocale((locale_t) 0);
536 
537     if (cur != LC_GLOBAL_LOCALE) {
538         return cur;
539     }
540 
541     if (PL_scratch_locale_obj) {
542         freelocale(PL_scratch_locale_obj);
543     }
544 
545     PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
546     return PL_scratch_locale_obj;
547 }
548 
549 #endif
550 
551 void
Perl_locale_panic(const char * msg,const char * file_name,const line_t line,const int errnum)552 Perl_locale_panic(const char * msg,
553                   const char * file_name,
554                   const line_t line,
555                   const int errnum)
556 {
557     dTHX;
558 
559     PERL_ARGS_ASSERT_LOCALE_PANIC;
560 
561     force_locale_unlock();
562 
563 #ifdef USE_C_BACKTRACE
564     dump_c_backtrace(Perl_debug_log, 20, 1);
565 #endif
566 
567     /* diag_listed_as: panic: %s */
568     Perl_croak(aTHX_ "%s: %" LINE_Tf ": panic: %s; errno=%d\n",
569                      file_name, line, msg, errnum);
570 }
571 
572 #define setlocale_failure_panic_c(                                          \
573                         cat, current, failed, caller_0_line, caller_1_line) \
574         setlocale_failure_panic_i(cat##_INDEX_, current, failed,            \
575                         caller_0_line, caller_1_line)
576 
577 /* posix_setlocale() presents a consistent POSIX-compliant interface to
578  * setlocale().   Windows requres a customized base-level setlocale().  Any
579  * necessary mutex locking needs to be done at a higher level */
580 #ifdef WIN32
581 #  define posix_setlocale(cat, locale) win32_setlocale(cat, locale)
582 #else
583 #  define posix_setlocale(cat, locale) ((const char *) setlocale(cat, locale))
584 #endif
585 
586 /* The next layer up is to catch vagaries and bugs in the libc setlocale return
587  * value.  Again, any necessary mutex locking needs to be done at a higher
588  * level */
589 #ifdef stdize_locale
590 #  define stdized_setlocale(cat, locale)                                       \
591      stdize_locale(cat, posix_setlocale(cat, locale),                          \
592                    &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__)
593 #else
594 #  define stdized_setlocale(cat, locale)  posix_setlocale(cat, locale)
595 #endif
596 
597 /* The next many lines form a layer above the close-to-the-metal 'posix'
598  * and 'stdized' macros.  They are used to present a uniform API to the rest of
599  * the code in this file in spite of the disparate underlying implementations.
600  * */
601 
602 #if    (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE))    \
603     || (  defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE))
604 
605 /* For non-threaded perls, the added layer just expands to the base-level
606  * functions, except if we are supposed to use the POSIX 2008 interface anyway.
607  * On perls where threading is invisible to us, the base-level functions are
608  * used regardless of threading.  Currently this is only on later Windows
609  * versions.
610  *
611  * See the introductory comments in this file for the meaning of the suffixes
612  * '_c', '_r', '_i'. */
613 
614 #  define setlocale_r(cat, locale)        stdized_setlocale(cat, locale)
615 #  define setlocale_i(i, locale)      setlocale_r(categories[i], locale)
616 #  define setlocale_c(cat, locale)              setlocale_r(cat, locale)
617 
618 #  define void_setlocale_i(i, locale)                                       \
619     STMT_START {                                                            \
620         if (! posix_setlocale(categories[i], locale)) {                     \
621             setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0);        \
622             NOT_REACHED; /* NOTREACHED */                                   \
623         }                                                                   \
624     } STMT_END
625 #  define void_setlocale_c(cat, locale)                                     \
626                                   void_setlocale_i(cat##_INDEX_, locale)
627 #  define void_setlocale_r(cat, locale)                                     \
628                void_setlocale_i(get_category_index(cat, locale), locale)
629 
630 #  define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale))
631 #  define bool_setlocale_i(i, locale)                                       \
632                                  bool_setlocale_c(categories[i], locale)
633 #  define bool_setlocale_c(cat, locale)    bool_setlocale_r(cat, locale)
634 
635 /* All the querylocale...() forms return a mortalized copy.  If you need
636  * something stable across calls, you need to savepv() the result yourself */
637 
638 #  define querylocale_r(cat)        mortalized_pv_copy(setlocale_r(cat, NULL))
639 #  define querylocale_c(cat)        querylocale_r(cat)
640 #  define querylocale_i(i)          querylocale_c(categories[i])
641 
642 #elif   defined(USE_LOCALE_THREADS)                 \
643    && ! defined(USE_THREAD_SAFE_LOCALE)
644 
645    /* Here, there are threads, and there is no support for thread-safe
646     * operation.  This is a dangerous situation, which perl is documented as
647     * not supporting, but it arises in practice.  We can do a modicum of
648     * automatic mitigation by making sure there is a per-thread return from
649     * setlocale(), and that a mutex protects it from races */
650 STATIC const char *
S_less_dicey_setlocale_r(pTHX_ const int category,const char * locale)651 S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale)
652 {
653     const char * retval;
654 
655     PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R;
656 
657     POSIX_SETLOCALE_LOCK;
658 
659     retval = stdized_setlocale(category, locale);
660 
661     /* We reuse PL_stdize_locale_buf as it doesn't conflict, but the call may
662      * already have used it, in which case we don't have to do anything further
663      * */
664     retval = save_to_buffer(retval,
665                             &PL_stdize_locale_buf, &PL_stdize_locale_bufsize);
666 
667     POSIX_SETLOCALE_UNLOCK;
668 
669     return retval;
670 }
671 
672 #  define setlocale_r(cat, locale)  less_dicey_setlocale_r(cat, locale)
673 #  define setlocale_c(cat, locale)             setlocale_r(cat, locale)
674 #  define setlocale_i(i, locale)     setlocale_r(categories[i], locale)
675 
676 #  define querylocale_r(cat)  mortalized_pv_copy(setlocale_r(cat, NULL))
677 #  define querylocale_c(cat)                   querylocale_r(cat)
678 #  define querylocale_i(i)                     querylocale_r(categories[i])
679 
680 STATIC void
S_less_dicey_void_setlocale_i(pTHX_ const unsigned cat_index,const char * locale,const line_t line)681 S_less_dicey_void_setlocale_i(pTHX_ const unsigned cat_index,
682                                     const char * locale,
683                                     const line_t line)
684 {
685     PERL_ARGS_ASSERT_LESS_DICEY_VOID_SETLOCALE_I;
686 
687     POSIX_SETLOCALE_LOCK;
688     if (! posix_setlocale(categories[cat_index], locale)) {
689         POSIX_SETLOCALE_UNLOCK;
690         setlocale_failure_panic_i(cat_index, NULL, locale, __LINE__, line);
691     }
692     POSIX_SETLOCALE_UNLOCK;
693 }
694 
695 #  define void_setlocale_i(i, locale)                                       \
696                           less_dicey_void_setlocale_i(i, locale, __LINE__)
697 #  define void_setlocale_c(cat, locale)                                     \
698                           void_setlocale_i(cat##_INDEX_, locale)
699 #  define void_setlocale_r(cat, locale)                                     \
700        void_setlocale_i(get_category_index(cat, locale), locale)
701 
702 #  if 0     /* Not currently used */
703 
704 STATIC bool
705 S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale)
706 {
707     bool retval;
708 
709     PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R;
710 
711     POSIX_SETLOCALE_LOCK;
712     retval = cBOOL(posix_setlocale(cat, locale));
713     POSIX_SETLOCALE_UNLOCK;
714 
715     return retval;
716 }
717 
718 #  endif
719 #  define bool_setlocale_r(cat, locale)                                 \
720                                less_dicey_bool_setlocale_r(cat, locale)
721 #  define bool_setlocale_i(i, locale)                                   \
722                                 bool_setlocale_r(categories[i], locale)
723 #  define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
724 #else
725 
726 /* Here, there is a completely different API to get thread-safe locales.  We
727  * emulate the setlocale() API with our own function(s).  setlocale categories,
728  * like LC_NUMERIC, are not valid here for the POSIX 2008 API.  Instead, there
729  * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to
730  * by using get_category_index() followed by table lookup. */
731 
732 #  define emulate_setlocale_c(cat, locale, recalc_LC_ALL, line)             \
733            emulate_setlocale_i(cat##_INDEX_, locale, recalc_LC_ALL, line)
734 
735      /* A wrapper for the macros below. */
736 #  define common_emulate_setlocale(i, locale)                               \
737                  emulate_setlocale_i(i, locale, YES_RECALC_LC_ALL, __LINE__)
738 
739 #  define setlocale_i(i, locale)                                            \
740      save_to_buffer(common_emulate_setlocale(i, locale),                    \
741                                              &PL_stdize_locale_buf,         \
742                                              &PL_stdize_locale_bufsize)
743 #  define setlocale_c(cat, locale)     setlocale_i(cat##_INDEX_, locale)
744 #  define setlocale_r(cat, locale)                                          \
745                     setlocale_i(get_category_index(cat, locale), locale)
746 
747 #  define void_setlocale_i(i, locale)                                       \
748                              ((void) common_emulate_setlocale(i, locale))
749 #  define void_setlocale_c(cat, locale)                                     \
750                                   void_setlocale_i(cat##_INDEX_, locale)
751 #  define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale))
752 
753 #  define bool_setlocale_i(i, locale)                                       \
754                                cBOOL(common_emulate_setlocale(i, locale))
755 #  define bool_setlocale_c(cat, locale)                                     \
756                                   bool_setlocale_i(cat##_INDEX_, locale)
757 #  define bool_setlocale_r(cat, locale)   cBOOL(setlocale_r(cat, locale))
758 
759 #  define querylocale_i(i)      mortalized_pv_copy(my_querylocale_i(i))
760 #  define querylocale_c(cat)    querylocale_i(cat##_INDEX_)
761 #  define querylocale_r(cat)    querylocale_i(get_category_index(cat,NULL))
762 
763 #  ifdef USE_QUERYLOCALE
764 #    define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask)
765 
766      /* This code used to think querylocale() was valid on LC_ALL.  Make sure
767       * all instances of that have been removed */
768 #    define QUERYLOCALE_ASSERT(index)                                       \
769                         __ASSERT_(isSINGLE_BIT_SET(category_masks[index]))
770 #    if ! defined(HAS_QUERYLOCALE) && (   defined(_NL_LOCALE_NAME)          \
771                                        && defined(HAS_NL_LANGINFO_L))
772 #      define querylocale_l(index, locale_obj)                              \
773             (QUERYLOCALE_ASSERT(index)                                      \
774              mortalized_pv_copy(nl_langinfo_l(                              \
775                          _NL_LOCALE_NAME(categories[index]), locale_obj)))
776 #    else
777 #      define querylocale_l(index, locale_obj)                              \
778         (QUERYLOCALE_ASSERT(index)                                          \
779          mortalized_pv_copy(querylocale(category_masks[index], locale_obj)))
780 #    endif
781 #  endif
782 #  if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
783 #    define HAS_GLIBC_LC_MESSAGES_BUG
784 #    include <libintl.h>
785 #  endif
786 
787 /* A fourth array, parallel to the ones above to map from category to its
788  * equivalent mask */
789 STATIC const int category_masks[] = {
790 #  ifdef USE_LOCALE_CTYPE
791                                 LC_CTYPE_MASK,
792 #  endif
793 #  ifdef USE_LOCALE_NUMERIC
794                                 LC_NUMERIC_MASK,
795 #  endif
796 #  ifdef USE_LOCALE_COLLATE
797                                 LC_COLLATE_MASK,
798 #  endif
799 #  ifdef USE_LOCALE_TIME
800                                 LC_TIME_MASK,
801 #  endif
802 #  ifdef USE_LOCALE_MESSAGES
803                                 LC_MESSAGES_MASK,
804 #  endif
805 #  ifdef USE_LOCALE_MONETARY
806                                 LC_MONETARY_MASK,
807 #  endif
808 #  ifdef USE_LOCALE_ADDRESS
809                                 LC_ADDRESS_MASK,
810 #  endif
811 #  ifdef USE_LOCALE_IDENTIFICATION
812                                 LC_IDENTIFICATION_MASK,
813 #  endif
814 #  ifdef USE_LOCALE_MEASUREMENT
815                                 LC_MEASUREMENT_MASK,
816 #  endif
817 #  ifdef USE_LOCALE_PAPER
818                                 LC_PAPER_MASK,
819 #  endif
820 #  ifdef USE_LOCALE_TELEPHONE
821                                 LC_TELEPHONE_MASK,
822 #  endif
823 #  ifdef USE_LOCALE_NAME
824                                 LC_NAME_MASK,
825 #  endif
826 #  ifdef USE_LOCALE_SYNTAX
827                                 LC_SYNTAX_MASK,
828 #  endif
829 #  ifdef USE_LOCALE_TOD
830                                 LC_TOD_MASK,
831 #  endif
832                                 /* LC_ALL can't be turned off by a Configure
833                                  * option, and in Posix 2008, should always be
834                                  * here, so compile it in unconditionally.
835                                  * This could catch some glitches at compile
836                                  * time */
837                                 LC_ALL_MASK,
838 
839    /* Placeholder as a precaution if code fails to check the return of
840     * get_category_index(), which returns this element to indicate an error */
841                                 0
842 };
843 
844 #  define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_)
845 
846 STATIC const char *
S_my_querylocale_i(pTHX_ const unsigned int index)847 S_my_querylocale_i(pTHX_ const unsigned int index)
848 {
849     /* This function returns the name of the locale category given by the input
850      * index into our parallel tables of them.
851      *
852      * POSIX 2008, for some sick reason, chose not to provide a method to find
853      * the category name of a locale, discarding a basic linguistic tenet that
854      * for any object, people will create a name for it.  Some vendors have
855      * created a querylocale() function to do just that.  This function is a
856      * lot simpler to implement on systems that have this.  Otherwise, we have
857      * to keep track of what the locale has been set to, so that we can return
858      * its name so as to emulate setlocale().  It's also possible for C code in
859      * some library to change the locale without us knowing it, though as of
860      * September 2017, there are no occurrences in CPAN of uselocale().  Some
861      * libraries do use setlocale(), but that changes the global locale, and
862      * threads using per-thread locales will just ignore those changes. */
863 
864     int category;
865     const locale_t cur_obj = uselocale((locale_t) 0);
866     const char * retval;
867 
868     PERL_ARGS_ASSERT_MY_QUERYLOCALE_I;
869     assert(index <= NOMINAL_LC_ALL_INDEX);
870 
871     category = categories[index];
872 
873     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_querylocale_i(%s) on %p\n",
874                                            category_names[index], cur_obj));
875     if (cur_obj == LC_GLOBAL_LOCALE) {
876         POSIX_SETLOCALE_LOCK;
877         retval = posix_setlocale(category, NULL);
878         POSIX_SETLOCALE_UNLOCK;
879     }
880     else {
881 
882 #  ifdef USE_QUERYLOCALE
883 
884         /* We don't currently keep records when there is querylocale(), so have
885          * to get it anew each time */
886         retval = (index == LC_ALL_INDEX_)
887                  ? calculate_LC_ALL(cur_obj)
888                  : querylocale_l(index, cur_obj);
889 
890 #  else
891 
892         /* But we do have up-to-date values when we keep our own records
893          * (except some times in initialization, where we get the value from
894          * the system. */
895         const char ** which  = (index == LC_ALL_INDEX_)
896                                ? &PL_cur_LC_ALL
897                                : &PL_curlocales[index];
898         if (*which == NULL) {
899             retval = stdized_setlocale(category, NULL);
900             *which = savepv(retval);
901         }
902         else {
903             retval = *which;
904         }
905 
906 #  endif
907 
908     }
909 
910     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
911                            "my_querylocale_i(%s) returning '%s'\n",
912                            category_names[index], retval));
913     assert(strNE(retval, ""));
914     return retval;
915 }
916 
917 #  ifdef USE_PL_CURLOCALES
918 
919 STATIC const char *
S_update_PL_curlocales_i(pTHX_ const unsigned int index,const char * new_locale,recalc_lc_all_t recalc_LC_ALL)920 S_update_PL_curlocales_i(pTHX_
921                          const unsigned int index,
922                          const char * new_locale,
923                          recalc_lc_all_t recalc_LC_ALL)
924 {
925     /* This is a helper function for emulate_setlocale_i(), mostly used to
926      * make that function easier to read. */
927 
928     PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
929     assert(index <= NOMINAL_LC_ALL_INDEX);
930 
931     if (index == LC_ALL_INDEX_) {
932         unsigned int i;
933 
934         /* For LC_ALL, we change all individual categories to correspond */
935                          /* PL_curlocales is a parallel array, so has same
936                           * length as 'categories' */
937         for (i = 0; i < LC_ALL_INDEX_; i++) {
938             Safefree(PL_curlocales[i]);
939             PL_curlocales[i] = savepv(new_locale);
940         }
941 
942         Safefree(PL_cur_LC_ALL);
943         PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales));
944         return PL_cur_LC_ALL;
945     }
946 
947     /* Update the single category's record */
948     Safefree(PL_curlocales[index]);
949     PL_curlocales[index] = savepv(new_locale);
950 
951     /* And also LC_ALL if the input says to, including if this is the final
952      * iteration of a loop updating all sub-categories */
953     if (   recalc_LC_ALL == YES_RECALC_LC_ALL
954         || (   recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
955             && index == NOMINAL_LC_ALL_INDEX - 1))
956     {
957         Safefree(PL_cur_LC_ALL);
958         PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales));
959     }
960 
961     return PL_curlocales[index];
962 }
963 
964 #  endif  /* Need PL_curlocales[] */
965 
966 STATIC const char *
S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale,const line_t line)967 S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale, const line_t line)
968 {
969     /* This function parses the value of the LC_ALL locale, assuming glibc
970      * syntax, and sets each individual category on the system to the proper
971      * value.
972      *
973      * This is likely to only ever be called from one place, so exists to make
974      * the calling function easier to read by moving this ancillary code out of
975      * the main line.
976      *
977      * The locale for each category is independent of the other categories.
978      * Often, they are all the same, but certainly not always.  Perl, in fact,
979      * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
980      * locale.  LC_ALL has to be able to represent the case of when there are
981      * varying locales.  Platforms have differing ways of representing this.
982      * Because of this, the code in this file goes to lengths to avoid the
983      * issue, generally looping over the component categories instead of
984      * referring to them in the aggregate, wherever possible.  However, there
985      * are cases where we have to parse our own constructed aggregates, which use
986      * the glibc syntax. */
987 
988     const char * locale_on_entry = querylocale_c(LC_ALL);
989 
990     PERL_ARGS_ASSERT_SETLOCALE_FROM_AGGREGATE_LC_ALL;
991 
992     /* If the string that gives what to set doesn't include all categories,
993      * the omitted ones get set to "C".  To get this behavior, first set
994      * all the individual categories to "C", and override the furnished
995      * ones below.  FALSE => No need to recalculate LC_ALL, as this is a
996      * temporary state */
997     if (! emulate_setlocale_c(LC_ALL, "C", DONT_RECALC_LC_ALL, line)) {
998         setlocale_failure_panic_c(LC_ALL, locale_on_entry,
999                                   "C", __LINE__, line);
1000         NOT_REACHED; /* NOTREACHED */
1001     }
1002 
1003     const char * s = locale;
1004     const char * e = locale + strlen(locale);
1005     while (s < e) {
1006         const char * p = s;
1007 
1008         /* Parse through the category */
1009         while (isWORDCHAR(*p)) {
1010             p++;
1011         }
1012 
1013         const char * category_end = p;
1014 
1015         if (*p++ != '=') {
1016             locale_panic_(Perl_form(aTHX_
1017                           "Unexpected character in locale category name '%s"
1018                           "<-- HERE",
1019                           get_displayable_string(s, p - 1, 0)));
1020         }
1021 
1022         /* Parse through the locale name */
1023         const char * name_start = p;
1024         while (p < e && *p != ';') {
1025             p++;
1026         }
1027         if (UNLIKELY( p < e && *p != ';')) {
1028             locale_panic_(Perl_form(aTHX_
1029                           "Unexpected character in locale name '%s<-- HERE",
1030                           get_displayable_string(s, p, 0)));
1031         }
1032 
1033         const char * name_end = p;
1034 
1035         /* Space past the semi-colon */
1036         if (p < e) {
1037             p++;
1038         }
1039 
1040         /* Find the index of the category name in our lists */
1041         for (PERL_UINT_FAST8_T i = 0; i < LC_ALL_INDEX_; i++) {
1042 
1043             /* Keep going if this index doesn't point to the category being
1044              * parsed.  The strnNE() avoids a Perl_form(), but would fail if
1045              * ever a category name could be a substring of another one, e.g.,
1046              * if there were a "LC_TIME_DATE" */
1047             if strnNE(s, category_names[i], category_end - s) {
1048                 continue;
1049             }
1050 
1051             /* Here i points to the category being parsed.  Now isolate the
1052              * locale it is being changed to */
1053             const char * individ_locale = Perl_form(aTHX_ "%.*s",
1054                                 (int) (name_end - name_start), name_start);
1055 
1056             /* And do the change.  Don't recalculate LC_ALL; we'll do it
1057              * ourselves after the loop */
1058             if (! emulate_setlocale_i(i, individ_locale,
1059                                       DONT_RECALC_LC_ALL, line))
1060             {
1061 
1062                 /* But if we have to back out, do fix up LC_ALL */
1063                 if (! emulate_setlocale_c(LC_ALL, locale_on_entry,
1064                                           YES_RECALC_LC_ALL, line))
1065                 {
1066                     setlocale_failure_panic_i(i, individ_locale,
1067                                               locale, __LINE__, line);
1068                     NOT_REACHED; /* NOTREACHED */
1069                 }
1070 
1071                 /* Reverting to the entry value succeeded, but the operation
1072                  * failed to go to the requested locale. */
1073                 return NULL;
1074             }
1075 
1076             /* Found and handled the desired category.  Quit the inner loop to
1077              * try the next category */
1078             break;
1079         }
1080 
1081         /* Finished with this category; iterate to the next one in the input */
1082         s = p;
1083     }
1084 
1085 #    ifdef USE_PL_CURLOCALES
1086 
1087     /* Here we have set all the individual categories.  Update the LC_ALL entry
1088      * as well.  We can't just use the input 'locale' as the value may omit
1089      * categories whose locale is 'C'.  khw thinks it's better to store a
1090      * complete LC_ALL.  So calculate it. */
1091     const char * retval = savepv(calculate_LC_ALL(PL_curlocales));
1092     Safefree(PL_cur_LC_ALL);
1093     PL_cur_LC_ALL = retval;
1094 
1095 #    else
1096 
1097     const char * retval = querylocale_c(LC_ALL);
1098 
1099 #    endif
1100 
1101     return retval;
1102 }
1103 
1104 STATIC const char *
S_emulate_setlocale_i(pTHX_ const unsigned int index,const char * new_locale,const recalc_lc_all_t recalc_LC_ALL,const line_t line)1105 S_emulate_setlocale_i(pTHX_
1106 
1107         /* Our internal index of the 'category' setlocale is called with */
1108         const unsigned int index,
1109 
1110         const char * new_locale, /* The locale to set the category to */
1111         const recalc_lc_all_t recalc_LC_ALL,  /* Explained below */
1112         const line_t line     /* Called from this line number */
1113        )
1114 {
1115     PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I;
1116     assert(index <= NOMINAL_LC_ALL_INDEX);
1117 
1118     /* Otherwise could have undefined behavior, as the return of this function
1119      * may be copied to this buffer, which this function could change in the
1120      * middle of its work */
1121     assert(new_locale != PL_stdize_locale_buf);
1122 
1123     /* This function effectively performs a setlocale() on just the current
1124      * thread; thus it is thread-safe.  It does this by using the POSIX 2008
1125      * locale functions to emulate the behavior of setlocale().  Similar to
1126      * regular setlocale(), the return from this function points to memory that
1127      * can be overwritten by other system calls, so needs to be copied
1128      * immediately if you need to retain it.  The difference here is that
1129      * system calls besides another setlocale() can overwrite it.
1130      *
1131      * By doing this, most locale-sensitive functions become thread-safe.  The
1132      * exceptions are mostly those that return a pointer to static memory.
1133      *
1134      * This function may be called in a tight loop that iterates over all
1135      * categories.  Because LC_ALL is not a "real" category, but merely the sum
1136      * of all the other ones, such loops don't include LC_ALL.  On systems that
1137      * have querylocale() or similar, the current LC_ALL value is immediately
1138      * retrievable; on systems lacking that feature, we have to keep track of
1139      * LC_ALL ourselves.  We could do that on each iteration, only to throw it
1140      * away on the next, but the calculation is more than a trivial amount of
1141      * work.  Instead, the 'recalc_LC_ALL' parameter is set to
1142      * RECALCULATE_LC_ALL_ON_FINAL_INTERATION to only do the calculation once.
1143      * This function calls itself recursively in such a loop.
1144      *
1145      * When not in such a loop, the parameter is set to the other enum values
1146      * DONT_RECALC_LC_ALL or YES_RECALC_LC_ALL. */
1147 
1148     int mask = category_masks[index];
1149     const locale_t entry_obj = uselocale((locale_t) 0);
1150     const char * locale_on_entry = querylocale_i(index);
1151 
1152     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1153              "emulate_setlocale_i input=%d (%s), mask=0x%x,"
1154              " new locale=\"%s\", current locale=\"%s\","
1155              "index=%d, object=%p\n",
1156              categories[index], category_names[index], mask,
1157              ((new_locale == NULL) ? "(nil)" : new_locale),
1158              locale_on_entry, index, entry_obj));
1159 
1160     /* Return the already-calculated info if just querying what the existing
1161      * locale is */
1162     if (new_locale == NULL) {
1163         return locale_on_entry;
1164     }
1165 
1166     /* Here, trying to change the locale, but it is a no-op if the new boss is
1167      * the same as the old boss.  Except this routine is called when converting
1168      * from the global locale, so in that case we will create a per-thread
1169      * locale below (with the current values).  It also seemed that newlocale()
1170      * could free up the basis locale memory if we called it with the new and
1171      * old being the same, but khw now thinks that this was due to some other
1172      * bug, since fixed, as there are other places where newlocale() gets
1173      * similarly called without problems. */
1174     if (   entry_obj != LC_GLOBAL_LOCALE
1175         && locale_on_entry
1176         && strEQ(new_locale, locale_on_entry))
1177     {
1178         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1179                  "(%" LINE_Tf "): emulate_setlocale_i"
1180                  " no-op to change to what it already was\n",
1181                  line));
1182 
1183 #  ifdef USE_PL_CURLOCALES
1184 
1185        /* On the final iteration of a loop that needs to recalculate LC_ALL, do
1186         * so.  If no iteration changed anything, LC_ALL also doesn't change,
1187         * but khw believes the complexity needed to keep track of that isn't
1188         * worth it. */
1189         if (UNLIKELY(   recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
1190                      && index == NOMINAL_LC_ALL_INDEX - 1))
1191         {
1192             Safefree(PL_cur_LC_ALL);
1193             PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales));
1194         }
1195 
1196 #  endif
1197 
1198         return locale_on_entry;
1199     }
1200 
1201 #  ifndef USE_QUERYLOCALE
1202 
1203     /* Without a querylocale() mechanism, we have to figure out ourselves what
1204      * happens with setting a locale to "" */
1205     if (strEQ(new_locale, "")) {
1206         new_locale = find_locale_from_environment(index);
1207     }
1208 
1209 #  endif
1210 
1211     /* So far, it has worked that a semi-colon in the locale name means that
1212      * the category is LC_ALL and it subsumes categories which don't all have
1213      * the same locale.  This is the glibc syntax. */
1214     if (strchr(new_locale, ';')) {
1215         assert(index == LC_ALL_INDEX_);
1216         return setlocale_from_aggregate_LC_ALL(new_locale, line);
1217     }
1218 
1219 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
1220 
1221     /* For this bug, if the LC_MESSAGES locale changes, we have to do an
1222      * expensive workaround.  Save the current value so we can later determine
1223      * if it changed. */
1224     const char * old_messages_locale = NULL;
1225     if (   (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
1226         &&  LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
1227     {
1228         old_messages_locale = querylocale_c(LC_MESSAGES);
1229     }
1230 
1231 #  endif
1232 
1233     assert(PL_C_locale_obj);
1234 
1235     /* Now ready to switch to the input 'new_locale' */
1236 
1237     /* Switching locales generally entails freeing the current one's space (at
1238      * the C library's discretion), hence we can't be using that locale at the
1239      * time of the switch (this wasn't obvious to khw from the man pages).  So
1240      * switch to a known locale object that we don't otherwise mess with. */
1241     if (! uselocale(PL_C_locale_obj)) {
1242 
1243         /* Not being able to change to the C locale is severe; don't keep
1244          * going.  */
1245         setlocale_failure_panic_i(index, locale_on_entry, "C", __LINE__, line);
1246         NOT_REACHED; /* NOTREACHED */
1247     }
1248 
1249     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1250              "(%" LINE_Tf "): emulate_setlocale_i now using C"
1251              " object=%p\n", line, PL_C_locale_obj));
1252 
1253     locale_t new_obj;
1254 
1255     /* We created a (never changing) object at start-up for LC_ALL being in the
1256      * C locale.  If this call is to switch to LC_ALL=>C, simply use that
1257      * object.  But in fact, we already have switched to it just above, in
1258      * preparation for the general case.  Since we're already there, no need to
1259      * do further switching. */
1260     if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
1261         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "):"
1262                                                " emulate_setlocale_i will stay"
1263                                                " in C object\n", line));
1264         new_obj = PL_C_locale_obj;
1265 
1266         /* And free the old object if it isn't a special one */
1267         if (entry_obj != LC_GLOBAL_LOCALE && entry_obj != PL_C_locale_obj) {
1268             freelocale(entry_obj);
1269         }
1270     }
1271     else {  /* Here is the general case, not to LC_ALL=>C */
1272         locale_t basis_obj = entry_obj;
1273 
1274         /* Specially handle two objects */
1275         if (basis_obj == LC_GLOBAL_LOCALE || basis_obj == PL_C_locale_obj) {
1276 
1277             /* For these two objects, we make duplicates to hand to newlocale()
1278              * below.  For LC_GLOBAL_LOCALE, this is because newlocale()
1279              * doesn't necessarily accept it as input (the results are
1280              * undefined).  For PL_C_locale_obj, it is so that it never gets
1281              * modified, as otherwise newlocale() is free to do so */
1282             basis_obj = duplocale(basis_obj);
1283             if (! basis_obj) {
1284                 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): duplocale failed",
1285                                               line));
1286                 NOT_REACHED; /* NOTREACHED */
1287             }
1288 
1289             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1290                                    "(%" LINE_Tf "): emulate_setlocale_i"
1291                                    " created %p by duping the input\n",
1292                                    line, basis_obj));
1293         }
1294 
1295         /* Ready to create a new locale by modification of the existing one.
1296          *
1297          * NOTE: This code may incorrectly show up as a leak under the address
1298          * sanitizer. We do not free this object under normal teardown, however
1299          * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
1300          */
1301         new_obj = newlocale(mask, new_locale, basis_obj);
1302 
1303         if (! new_obj) {
1304             DEBUG_L(PerlIO_printf(Perl_debug_log,
1305                                   " (%" LINE_Tf "): emulate_setlocale_i"
1306                                   " creating new object from %p failed:"
1307                                   " errno=%d\n",
1308                                   line, basis_obj, GET_ERRNO));
1309 
1310             /* Failed.  Likely this is because the proposed new locale isn't
1311              * valid on this system.  But we earlier switched to the LC_ALL=>C
1312              * locale in anticipation of it succeeding,  Now have to switch
1313              * back to the state upon entry */
1314             if (! uselocale(entry_obj)) {
1315                 setlocale_failure_panic_i(index, "switching back to",
1316                                           locale_on_entry, __LINE__, line);
1317                 NOT_REACHED; /* NOTREACHED */
1318             }
1319 
1320 #    ifdef USE_PL_CURLOCALES
1321 
1322             if (entry_obj == LC_GLOBAL_LOCALE) {
1323 
1324                 /* Here, we are back in the global locale.  We may never have
1325                  * set PL_curlocales.  If the locale change had succeeded, the
1326                  * code would have then set them up, but since it didn't, do so
1327                  * here.  khw isn't sure if this prevents some issues or not,
1328                  * This will calculate LC_ALL's entry only on the final
1329                  * iteration */
1330                 POSIX_SETLOCALE_LOCK;
1331                 for (PERL_UINT_FAST8_T i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1332                     update_PL_curlocales_i(i,
1333                                        posix_setlocale(categories[i], NULL),
1334                                        RECALCULATE_LC_ALL_ON_FINAL_INTERATION);
1335                 }
1336                 POSIX_SETLOCALE_UNLOCK;
1337             }
1338 #    endif
1339 
1340             return NULL;
1341         }
1342 
1343         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1344                                "(%" LINE_Tf "): emulate_setlocale_i created %p"
1345                                " while freeing %p\n", line, new_obj, basis_obj));
1346 
1347         /* Here, successfully created an object representing the desired
1348          * locale; now switch into it */
1349         if (! uselocale(new_obj)) {
1350             freelocale(new_obj);
1351             locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): emulate_setlocale_i"
1352                                           " switching into new locale failed",
1353                                           line));
1354         }
1355     }
1356 
1357     /* Here, we are using 'new_obj' which matches the input 'new_locale'. */
1358     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1359              "(%" LINE_Tf "): emulate_setlocale_i now using %p\n",
1360              line, new_obj));
1361 
1362 #ifdef MULTIPLICITY
1363     PL_cur_locale_obj = new_obj;
1364 #endif
1365 
1366     /* We are done, except for updating our records (if the system doesn't keep
1367      * them) and in the case of locale "", we don't actually know what the
1368      * locale that got switched to is, as it came from the environment.  So
1369      * have to find it */
1370 
1371 #  ifdef USE_QUERYLOCALE
1372 
1373     if (strEQ(new_locale, "")) {
1374         new_locale = querylocale_i(index);
1375     }
1376 
1377     PERL_UNUSED_ARG(recalc_LC_ALL);
1378 
1379 #  else
1380 
1381     new_locale = update_PL_curlocales_i(index, new_locale, recalc_LC_ALL);
1382 
1383 #  endif
1384 #  ifdef HAS_GLIBC_LC_MESSAGES_BUG
1385 
1386     /* Invalidate the glibc cache of loaded translations if the locale has
1387      * changed, see [perl #134264] */
1388     if (old_messages_locale) {
1389         if (strNE(old_messages_locale, my_querylocale_c(LC_MESSAGES))) {
1390             textdomain(textdomain(NULL));
1391         }
1392     }
1393 
1394 #  endif
1395 
1396     return new_locale;
1397 }
1398 
1399 #endif   /* End of the various implementations of the setlocale and
1400             querylocale macros used in the remainder of this program */
1401 
1402 #ifdef USE_LOCALE
1403 
1404 /* So far, the locale strings returned by modern 2008-compliant systems have
1405  * been fine */
1406 
1407 STATIC const char *
S_stdize_locale(pTHX_ const int category,const char * input_locale,const char ** buf,Size_t * buf_size,const line_t caller_line)1408 S_stdize_locale(pTHX_ const int category,
1409                       const char *input_locale,
1410                       const char **buf,
1411                       Size_t *buf_size,
1412                       const line_t caller_line)
1413 {
1414     /* The return value of setlocale() is opaque, but is required to be usable
1415      * as input to a future setlocale() to create the same state.
1416      * Unfortunately not all systems are compliant.  But most often they are of
1417      * a very restricted set of forms that this file has been coded to expect.
1418      *
1419      * There are some outliers, though, that this function tries to tame:
1420      *
1421      * 1) A new-line.  This function chomps any \n characters
1422      * 2) foo=bar.     'bar' is what is generally meant, and the foo= part is
1423      *                 stripped.  This form is legal for LC_ALL.  When found in
1424      *                 that category group, the function calls itself
1425      *                 recursively on each possible component category to make
1426      *                 sure the individual categories are ok.
1427      *
1428      * If no changes to the input were made, it is returned; otherwise the
1429      * changed version is stored into memory at *buf, with *buf_size set to its
1430      * new value, and *buf is returned.
1431      */
1432 
1433     const char * first_bad;
1434     const char * retval;
1435 
1436     PERL_ARGS_ASSERT_STDIZE_LOCALE;
1437 
1438     if (input_locale == NULL) {
1439         return NULL;
1440     }
1441 
1442     first_bad = strpbrk(input_locale, "=\n");
1443 
1444     /* Most likely, there isn't a problem with the input */
1445     if (LIKELY(! first_bad)) {
1446         return input_locale;
1447     }
1448 
1449 #    ifdef LC_ALL
1450 
1451     /* But if there is, and the category is LC_ALL, we have to look at each
1452      * component category */
1453     if (category == LC_ALL) {
1454         const char * individ_locales[LC_ALL_INDEX_];
1455         bool made_changes = FALSE;
1456         unsigned int i;
1457 
1458         for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1459             Size_t this_size = 0;
1460             individ_locales[i] = stdize_locale(categories[i],
1461                                                posix_setlocale(categories[i],
1462                                                                NULL),
1463                                                &individ_locales[i],
1464                                                &this_size,
1465                                                caller_line);
1466 
1467             /* If the size didn't change, it means this category did not have
1468              * to be adjusted, and individ_locales[i] points to the buffer
1469              * returned by posix_setlocale(); we have to copy that before
1470              * it's called again in the next iteration */
1471             if (this_size == 0) {
1472                 individ_locales[i] = savepv(individ_locales[i]);
1473             }
1474             else {
1475                 made_changes = TRUE;
1476             }
1477         }
1478 
1479         /* If all the individual categories were ok as-is, this was a false
1480          * alarm.  We must have seen an '=' which was a legal occurrence in
1481          * this combination locale */
1482         if (! made_changes) {
1483             retval = input_locale;  /* The input can be returned unchanged */
1484         }
1485         else {
1486             retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size);
1487         }
1488 
1489         for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1490             Safefree(individ_locales[i]);
1491         }
1492 
1493         return retval;
1494     }
1495 
1496 #    else   /* else no LC_ALL */
1497 
1498     PERL_UNUSED_ARG(category);
1499     PERL_UNUSED_ARG(caller_line);
1500 
1501 #    endif
1502 
1503     /* Here, there was a problem in an individual category.  This means that at
1504      * least one adjustment will be necessary.  Create a modifiable copy */
1505     retval = save_to_buffer(input_locale, buf, buf_size);
1506 
1507     if (*first_bad != '=') {
1508 
1509         /* Translate the found position into terms of the copy */
1510         first_bad = retval + (first_bad - input_locale);
1511     }
1512     else { /* An '=' */
1513 
1514         /* It is unlikely that the return is so screwed-up that it contains
1515          * multiple equals signs, but handle that case by stripping all of
1516          * them.  */
1517         const char * final_equals = strrchr(retval, '=');
1518 
1519         /* The length passed here causes the move to include the terminating
1520          * NUL */
1521         Move(final_equals + 1, retval, strlen(final_equals), char);
1522 
1523         /* See if there are additional problems; if not, we're good to return.
1524          * */
1525         first_bad = strpbrk(retval, "\n");
1526 
1527         if (! first_bad) {
1528             return retval;
1529         }
1530     }
1531 
1532     /* Here, the problem must be a \n.  Get rid of it and what follows.
1533      * (Originally, only a trailing \n was stripped.  Unsure what to do if not
1534      * trailing) */
1535     *((char *) first_bad) = '\0';
1536     return retval;
1537 }
1538 
1539 #if defined(WIN32) || (     defined(USE_POSIX_2008_LOCALE)      \
1540                        && ! defined(USE_QUERYLOCALE))
1541 
1542 STATIC const char *
S_find_locale_from_environment(pTHX_ const unsigned int index)1543 S_find_locale_from_environment(pTHX_ const unsigned int index)
1544 {
1545     /* NB: This function may actually change the locale on Windows.
1546      *
1547      * On Windows systems, the concept of the POSIX ordering of environment
1548      * variables is missing.  To increase portability of programs across
1549      * platforms, the POSIX ordering is emulated on Windows.
1550      *
1551      * And on POSIX 2008 systems without querylocale(), it is problematic
1552      * getting the results of the POSIX 2008 equivalent of
1553      *      setlocale(category,  "")
1554      * (which gets the locale from the environment).
1555      *
1556      * To ensure that we know exactly what those values are, we do the setting
1557      * ourselves, using the documented algorithm (assuming the documentation is
1558      * correct) rather than use "" as the locale.  This will lead to results
1559      * that differ from native behavior if the native behavior differs from the
1560      * standard documented value, but khw believes it is better to know what's
1561      * going on, even if different from native, than to just guess.
1562      *
1563      * Another option for the POSIX 2008 case would be, in a critical section,
1564      * to save the global locale's current value, and do a straight
1565      * setlocale(LC_ALL, "").  That would return our desired values, destroying
1566      * the global locale's, which we would then restore.  But that could cause
1567      * races with any other thread that is using the global locale and isn't
1568      * using the mutex.  And, the only reason someone would have done that is
1569      * because they are calling a library function, like in gtk, that calls
1570      * setlocale(), and which can't be changed to use the mutex.  That wouldn't
1571      * be a problem if this were to be done before any threads had switched,
1572      * say during perl construction time.  But this code would still be needed
1573      * for the general case.
1574      *
1575      * The Windows and POSIX 2008 differ in that the ultimate fallback is "C"
1576      * in POSIX, and is the system default locale in Windows.  To get that
1577      * system default value, we actually have to call setlocale() on Windows.
1578      */
1579 
1580     /* We rely on PerlEnv_getenv() returning a mortalized copy */
1581     const char * const lc_all = PerlEnv_getenv("LC_ALL");
1582 
1583     /* Use any "LC_ALL" environment variable, as it overrides everything
1584      * else. */
1585     if (lc_all && strNE(lc_all, "")) {
1586         return lc_all;
1587     }
1588 
1589     /* If setting an individual category, use its corresponding value found in
1590      * the environment, if any */
1591     if (index != LC_ALL_INDEX_) {
1592         const char * const new_value = PerlEnv_getenv(category_names[index]);
1593 
1594         if (new_value && strNE(new_value, "")) {
1595             return new_value;
1596         }
1597 
1598         /* If no corresponding environment variable, see if LANG exists.  If
1599          * so, use it. */
1600         const char * default_name = PerlEnv_getenv("LANG");
1601         if (default_name && strNE(default_name, "")) {
1602             return default_name;
1603         }
1604 
1605         /* If no LANG, use "C" on POSIX 2008, the system default on Windows */
1606 #  ifndef WIN32
1607         return "C";
1608 #  else
1609         return wrap_wsetlocale(categories[index], "");
1610 #  endif
1611 
1612     }
1613 
1614     /* Here is LC_ALL, and no LC_ALL environment variable.  LANG is used as a
1615      * default, but overridden for individual categories that have
1616      * corresponding environment variables.  If no LANG exists, the default is
1617      * "C" on POSIX 2008, or the system default for the category on Windows. */
1618     const char * default_name = PerlEnv_getenv("LANG");
1619 
1620     /* Convert "" to NULL to save conditionals in the loop below */
1621     if (default_name != NULL && strEQ(default_name, "")) {
1622         default_name = NULL;
1623     }
1624 
1625     /* Loop through all the individual categories, setting each to any
1626      * corresponding environment variable; or to the default if none exists for
1627      * the category */
1628     const char * locale_names[LC_ALL_INDEX_];
1629     for (unsigned i = 0; i < LC_ALL_INDEX_; i++) {
1630         const char * const env_override = PerlEnv_getenv(category_names[i]);
1631 
1632         if (env_override && strNE(env_override, "")) {
1633             locale_names[i] = env_override;
1634         }
1635         else if (default_name) {
1636             locale_names[i] = default_name;
1637         }
1638         else {
1639 
1640 #  ifndef WIN32
1641             locale_names[i] = "C";
1642 #  else
1643             locale_names[i] = wrap_wsetlocale(categories[index], "");
1644 #  endif
1645 
1646         }
1647 
1648         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1649                  "find_locale_from_environment i=%d, name=%s, locale=%s\n",
1650                  i, category_names[i], locale_names[i]));
1651     }
1652 
1653     return calculate_LC_ALL(locale_names);
1654 }
1655 
1656 #endif
1657 #if defined(WIN32) || defined(USE_POSIX_2008_LOCALE) || ! defined(LC_ALL)
1658 
1659 STATIC
1660 const char *
1661 
1662 #  ifdef USE_QUERYLOCALE
S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)1663 S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)
1664 #  else
1665 S_calculate_LC_ALL(pTHX_ const char ** individ_locales)
1666 #  endif
1667 
1668 {
1669     /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed.
1670      * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
1671      * So we have to construct the answer ourselves based on the passed in
1672      * data, which is either a locale_t object, for systems with querylocale(),
1673      * or an array we keep updated to the proper values, otherwise.
1674      *
1675      * For Windows, we also may need to construct an LC_ALL when setting the
1676      * locale to the system default.
1677      *
1678      * This function returns a mortalized string containing the locale name(s)
1679      * of LC_ALL.
1680      *
1681      * If all individual categories are the same locale, we can just set LC_ALL
1682      * to that locale.  But if not, we have to create an aggregation of all the
1683      * categories on the system.  Platforms differ as to the syntax they use
1684      * for these non-uniform locales for LC_ALL.  Some use a '/' or other
1685      * delimiter of the locales with a predetermined order of categories; a
1686      * Configure probe would be needed to tell us how to decipher those.  glibc
1687      * and Windows use a series of name=value pairs, like
1688      *      LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
1689      * This function returns that syntax, which is suitable for input to the
1690      * Windows setlocale().  It could also be suitable for glibc, but because
1691      * the non-Windows code is common to systems that use a different syntax,
1692      * we don't depend on it for glibc.  Instead we take care not to use the
1693      * native setlocale() function on whatever non-Windows style is chosen.
1694      * But, it would be possible for someone to call Perl_setlocale() using a
1695      * native style we don't understand.  So far no one has complained.
1696      *
1697      * For systems that have categories we don't know about, the algorithm
1698      * below won't know about those missing categories, leading to potential
1699      * bugs for code that looks at them.  If there is an environment variable
1700      * that sets that category, we won't know to look for it, and so our use of
1701      * LANG or "C" improperly overrides it.  On the other hand, if we don't do
1702      * what is done here, and there is no environment variable, the category's
1703      * locale should be set to LANG or "C".  So there is no good solution.  khw
1704      * thinks the best is to make sure we have a complete list of possible
1705      * categories, adding new ones as they show up on obscure platforms.
1706      */
1707 
1708     unsigned int i;
1709     Size_t names_len = 0;
1710     bool are_all_categories_the_same_locale = TRUE;
1711     char * aggregate_locale;
1712     char * previous_start = NULL;
1713     char * this_start = NULL;
1714     Size_t entry_len = 0;
1715 
1716     PERL_ARGS_ASSERT_CALCULATE_LC_ALL;
1717 
1718     /* First calculate the needed size for the string listing the categories
1719      * and their locales. */
1720     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1721 
1722 #  ifdef USE_QUERYLOCALE
1723         const char * entry = querylocale_l(i, cur_obj);
1724 #  else
1725         const char * entry = individ_locales[i];
1726 #  endif
1727 
1728         names_len += strlen(category_names[i])
1729                   + 1                           /* '=' */
1730                   + strlen(entry)
1731                   + 1;                          /* ';' */
1732     }
1733 
1734     names_len++;    /* Trailing '\0' */
1735 
1736     /* Allocate enough space for the aggregated string */
1737     Newxz(aggregate_locale, names_len, char);
1738     SAVEFREEPV(aggregate_locale);
1739 
1740     /* Then fill it in */
1741     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1742         Size_t new_len;
1743 
1744 #  ifdef USE_QUERYLOCALE
1745         const char * entry = querylocale_l(i, cur_obj);
1746 #  else
1747         const char * entry = individ_locales[i];
1748 #  endif
1749 
1750         new_len = my_strlcat(aggregate_locale, category_names[i], names_len);
1751         assert(new_len <= names_len);
1752         new_len = my_strlcat(aggregate_locale, "=", names_len);
1753         assert(new_len <= names_len);
1754 
1755         this_start = aggregate_locale + strlen(aggregate_locale);
1756         entry_len = strlen(entry);
1757 
1758         new_len = my_strlcat(aggregate_locale, entry, names_len);
1759         assert(new_len <= names_len);
1760         new_len = my_strlcat(aggregate_locale, ";", names_len);
1761         assert(new_len <= names_len);
1762         PERL_UNUSED_VAR(new_len);   /* Only used in DEBUGGING */
1763 
1764         if (   i > 0
1765             && are_all_categories_the_same_locale
1766             && memNE(previous_start, this_start, entry_len + 1))
1767         {
1768             are_all_categories_the_same_locale = FALSE;
1769         }
1770         else {
1771             previous_start = this_start;
1772         }
1773     }
1774 
1775     /* If they are all the same, just return any one of them */
1776     if (are_all_categories_the_same_locale) {
1777         aggregate_locale = this_start;
1778         aggregate_locale[entry_len] = '\0';
1779     }
1780 
1781     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1782                            "calculate_LC_ALL returning '%s'\n",
1783                            aggregate_locale));
1784 
1785     return aggregate_locale;
1786 }
1787 
1788 #endif
1789 #if defined(USE_LOCALE) && (   defined(DEBUGGING)                       \
1790                             || defined(USE_PERL_SWITCH_LOCALE_CONTEXT))
1791 
1792 STATIC const char *
S_get_LC_ALL_display(pTHX)1793 S_get_LC_ALL_display(pTHX)
1794 {
1795 
1796 #  ifdef LC_ALL
1797 
1798     return querylocale_c(LC_ALL);
1799 
1800 #  else
1801 
1802     const char * curlocales[NOMINAL_LC_ALL_INDEX];
1803 
1804     for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1805         curlocales[i] = querylocale_i(i);
1806     }
1807 
1808     return calculate_LC_ALL(curlocales);
1809 
1810 #  endif
1811 
1812 }
1813 
1814 #endif
1815 
1816 STATIC void
S_setlocale_failure_panic_i(pTHX_ const unsigned int cat_index,const char * current,const char * failed,const line_t caller_0_line,const line_t caller_1_line)1817 S_setlocale_failure_panic_i(pTHX_
1818                             const unsigned int cat_index,
1819                             const char * current,
1820                             const char * failed,
1821                             const line_t caller_0_line,
1822                             const line_t caller_1_line)
1823 {
1824     dSAVE_ERRNO;
1825     const int cat = categories[cat_index];
1826     const char * name = category_names[cat_index];
1827 
1828     PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I;
1829 
1830     if (current == NULL) {
1831         current = querylocale_i(cat_index);
1832     }
1833 
1834     Perl_locale_panic(Perl_form(aTHX_ "(%" LINE_Tf
1835                                       "): Can't change locale for %s(%d)"
1836                                       " from '%s' to '%s'",
1837                                       caller_1_line, name, cat,
1838                                       current, failed),
1839                       __FILE__, caller_0_line, GET_ERRNO);
1840     NOT_REACHED; /* NOTREACHED */
1841 }
1842 
1843 /* Any of these will allow us to find the RADIX */
1844 #  if defined(USE_LOCALE_NUMERIC) && (   defined(HAS_SOME_LANGINFO)         \
1845                                       || defined(HAS_LOCALECONV)            \
1846                                       || defined(HAS_SNPRINTF))
1847 #    define CAN_CALCULATE_RADIX
1848 #  endif
1849 #  ifdef USE_LOCALE_NUMERIC
1850 
1851 STATIC void
S_new_numeric(pTHX_ const char * newnum,bool force)1852 S_new_numeric(pTHX_ const char *newnum, bool force)
1853 {
1854     PERL_ARGS_ASSERT_NEW_NUMERIC;
1855 
1856     /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
1857      * core Perl this and that 'newnum' is the name of the new locale, and we
1858      * are switched into it.  It installs this locale as the current underlying
1859      * default, and then switches to the C locale, if necessary, so that the
1860      * code that has traditionally expected the radix character to be a dot may
1861      * continue to do so.
1862      *
1863      * The default locale and the C locale can be toggled between by use of the
1864      * set_numeric_underlying() and set_numeric_standard() functions, which
1865      * should probably not be called directly, but only via macros like
1866      * SET_NUMERIC_STANDARD() in perl.h.
1867      *
1868      * The toggling is necessary mainly so that a non-dot radix decimal point
1869      * character can be input and output, while allowing internal calculations
1870      * to use a dot.
1871      *
1872      * This sets several interpreter-level variables:
1873      * PL_numeric_name  The underlying locale's name: a copy of 'newnum'
1874      * PL_numeric_underlying  A boolean indicating if the toggled state is such
1875      *                  that the current locale is the program's underlying
1876      *                  locale
1877      * PL_numeric_standard An int indicating if the toggled state is such
1878      *                  that the current locale is the C locale or
1879      *                  indistinguishable from the C locale.  If non-zero, it
1880      *                  is in C; if > 1, it means it may not be toggled away
1881      *                  from C.
1882      * PL_numeric_underlying_is_standard   A bool kept by this function
1883      *                  indicating that the underlying locale and the standard
1884      *                  C locale are indistinguishable for the purposes of
1885      *                  LC_NUMERIC.  This happens when both of the above two
1886      *                  variables are true at the same time.  (Toggling is a
1887      *                  no-op under these circumstances.)  This variable is
1888      *                  used to avoid having to recalculate.
1889      * PL_numeric_radix_sv  Contains the string that code should use for the
1890      *                  decimal point.  It is set to either a dot or the
1891      *                  program's underlying locale's radix character string,
1892      *                  depending on the situation.
1893      * PL_underlying_radix_sv  Contains the program's underlying locale's radix
1894      *                  character string.  This is copied into
1895      *                  PL_numeric_radix_sv when the situation warrants.  It
1896      *                  exists to avoid having to recalculate it when toggling.
1897      * PL_underlying_numeric_obj = (only on POSIX 2008 platforms)  An object
1898      *                  with everything set up properly so as to avoid work on
1899      *                  such platforms.
1900      */
1901 
1902     DEBUG_L( PerlIO_printf(Perl_debug_log,
1903                            "Called new_numeric with %s, PL_numeric_name=%s\n",
1904                            newnum, PL_numeric_name));
1905 
1906     /* If not forcing this procedure, and there isn't actually a change from
1907      * our records, do nothing.  (Our records can be wrong when sync'ing to the
1908      * locale set up by an external library, hence the 'force' parameter) */
1909     if (! force && strEQ(PL_numeric_name, newnum)) {
1910         return;
1911     }
1912 
1913     Safefree(PL_numeric_name);
1914     PL_numeric_name = savepv(newnum);
1915 
1916     /* Handle the trivial case.  Since this is called at process
1917      * initialization, be aware that this bit can't rely on much being
1918      * available. */
1919     if (isNAME_C_OR_POSIX(PL_numeric_name)) {
1920         PL_numeric_standard = TRUE;
1921         PL_numeric_underlying_is_standard = TRUE;
1922         PL_numeric_underlying = TRUE;
1923         sv_setpv(PL_numeric_radix_sv, C_decimal_point);
1924         sv_setpv(PL_underlying_radix_sv, C_decimal_point);
1925         return;
1926     }
1927 
1928     /* We are in the underlying locale until changed at the end of this
1929      * function */
1930     PL_numeric_underlying = TRUE;
1931 
1932 #  ifdef USE_POSIX_2008_LOCALE
1933 
1934     /* We keep a special object for easy switching to.
1935      *
1936      * NOTE: This code may incorrectly show up as a leak under the address
1937      * sanitizer. We do not free this object under normal teardown, however
1938      * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
1939      */
1940     PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1941                                           PL_numeric_name,
1942                                           PL_underlying_numeric_obj);
1943 
1944 #    endif
1945 
1946     const char * radix = NULL;
1947     utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
1948 
1949     /* Find and save this locale's radix character. */
1950     my_langinfo_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
1951                   &radix, NULL, &utf8ness);
1952     sv_setpv(PL_underlying_radix_sv, radix);
1953 
1954     if (utf8ness == UTF8NESS_YES) {
1955         SvUTF8_on(PL_underlying_radix_sv);
1956     }
1957 
1958     DEBUG_L(PerlIO_printf(Perl_debug_log,
1959                           "Locale radix is '%s', ?UTF-8=%d\n",
1960                           SvPVX(PL_underlying_radix_sv),
1961                           cBOOL(SvUTF8(PL_underlying_radix_sv))));
1962 
1963     /* This locale is indistinguishable from C (for numeric purposes) if both
1964      * the radix character and the thousands separator are the same as C's.
1965      * Start with the radix. */
1966     PL_numeric_underlying_is_standard = strEQ(C_decimal_point, radix);
1967     Safefree(radix);
1968 
1969 #    ifndef TS_W32_BROKEN_LOCALECONV
1970 
1971     /* If the radix isn't the same as C's, we know it is distinguishable from
1972      * C; otherwise check the thousands separator too.  Only if both are the
1973      * same as C's is the locale indistinguishable from C.
1974      *
1975      * But on earlier Windows versions, there is a potential race.  This code
1976      * knows that localeconv() (elsewhere in this file) will be used to extract
1977      * the needed value, and localeconv() was buggy for quite a while, and that
1978      * code in this file hence uses a workaround.  And that workaround may have
1979      * an (unlikely) race.  Gathering the radix uses a different workaround on
1980      * Windows that doesn't involve a race.  It might be possible to do the
1981      * same for this (patches welcome).
1982      *
1983      * Until then khw doesn't think it's worth even the small risk of a race to
1984      * get this value, which doesn't appear to be used in any of the Microsoft
1985      * library routines anyway. */
1986 
1987     const char * scratch_buffer = NULL;
1988     if (PL_numeric_underlying_is_standard) {
1989         PL_numeric_underlying_is_standard = strEQ(C_thousands_sep,
1990                                              my_langinfo_c(THOUSEP, LC_NUMERIC,
1991                                                            PL_numeric_name,
1992                                                            &scratch_buffer,
1993                                                            NULL, NULL));
1994     }
1995     Safefree(scratch_buffer);
1996 
1997 #    else
1998     PERL_UNUSED_VAR(C_thousands_sep);
1999 #    endif
2000 
2001     PL_numeric_standard = PL_numeric_underlying_is_standard;
2002 
2003     /* Keep LC_NUMERIC so that it has the C locale radix and thousands
2004      * separator.  This is for XS modules, so they don't have to worry about
2005      * the radix being a non-dot.  (Core operations that need the underlying
2006      * locale change to it temporarily). */
2007     if (! PL_numeric_standard) {
2008         set_numeric_standard();
2009     }
2010 
2011 }
2012 
2013 #  endif
2014 
2015 void
Perl_set_numeric_standard(pTHX)2016 Perl_set_numeric_standard(pTHX)
2017 {
2018 
2019 #  ifdef USE_LOCALE_NUMERIC
2020 
2021     /* Unconditionally toggle the LC_NUMERIC locale to the C locale
2022      *
2023      * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
2024      * instead of calling this directly.  The macro avoids calling this routine
2025      * if toggling isn't necessary according to our records (which could be
2026      * wrong if some XS code has changed the locale behind our back) */
2027 
2028     DEBUG_L(PerlIO_printf(Perl_debug_log,
2029                                   "Setting LC_NUMERIC locale to standard C\n"));
2030 
2031     void_setlocale_c(LC_NUMERIC, "C");
2032     PL_numeric_standard = TRUE;
2033     sv_setpv(PL_numeric_radix_sv, C_decimal_point);
2034 
2035     PL_numeric_underlying = PL_numeric_underlying_is_standard;
2036 
2037 #  endif /* USE_LOCALE_NUMERIC */
2038 
2039 }
2040 
2041 void
Perl_set_numeric_underlying(pTHX)2042 Perl_set_numeric_underlying(pTHX)
2043 {
2044 
2045 #  ifdef USE_LOCALE_NUMERIC
2046 
2047     /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
2048      * default.
2049      *
2050      * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
2051      * instead of calling this directly.  The macro avoids calling this routine
2052      * if toggling isn't necessary according to our records (which could be
2053      * wrong if some XS code has changed the locale behind our back) */
2054 
2055     DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n",
2056                                           PL_numeric_name));
2057 
2058     void_setlocale_c(LC_NUMERIC, PL_numeric_name);
2059     PL_numeric_underlying = TRUE;
2060     sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
2061 
2062     PL_numeric_standard = PL_numeric_underlying_is_standard;
2063 
2064 #  endif /* USE_LOCALE_NUMERIC */
2065 
2066 }
2067 
2068 #  ifdef USE_LOCALE_CTYPE
2069 
2070 STATIC void
S_new_ctype(pTHX_ const char * newctype,bool force)2071 S_new_ctype(pTHX_ const char *newctype, bool force)
2072 {
2073     PERL_ARGS_ASSERT_NEW_CTYPE;
2074     PERL_UNUSED_ARG(force);
2075 
2076     /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
2077      * core Perl this and that 'newctype' is the name of the new locale.
2078      *
2079      * This function sets up the folding arrays for all 256 bytes, assuming
2080      * that tofold() is tolc() since fold case is not a concept in POSIX,
2081      */
2082 
2083     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", newctype));
2084 
2085     /* No change means no-op */
2086     if (strEQ(PL_ctype_name, newctype)) {
2087         return;
2088     }
2089 
2090     /* We will replace any bad locale warning with 1) nothing if the new one is
2091      * ok; or 2) a new warning for the bad new locale */
2092     if (PL_warn_locale) {
2093         SvREFCNT_dec_NN(PL_warn_locale);
2094         PL_warn_locale = NULL;
2095     }
2096 
2097     /* Clear cache */
2098     Safefree(PL_ctype_name);
2099     PL_ctype_name = "";
2100 
2101     PL_in_utf8_turkic_locale = FALSE;
2102 
2103     /* For the C locale, just use the standard folds, and we know there are no
2104      * glitches possible, so return early.  Since this is called at process
2105      * initialization, be aware that this bit can't rely on much being
2106      * available. */
2107     if (isNAME_C_OR_POSIX(newctype)) {
2108         Copy(PL_fold, PL_fold_locale, 256, U8);
2109         PL_ctype_name = savepv(newctype);
2110         PL_in_utf8_CTYPE_locale = FALSE;
2111         return;
2112     }
2113 
2114     /* The cache being cleared signals this function to compute a new value */
2115     PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
2116 
2117     PL_ctype_name = savepv(newctype);
2118     bool maybe_utf8_turkic = FALSE;
2119 
2120     /* Don't check for problems if we are suppressing the warnings */
2121     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
2122 
2123     if (PL_in_utf8_CTYPE_locale) {
2124 
2125         /* A UTF-8 locale gets standard rules.  But note that code still has to
2126          * handle this specially because of the three problematic code points
2127          * */
2128         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
2129 
2130         /* UTF-8 locales can have special handling for 'I' and 'i' if they are
2131          * Turkic.  Make sure these two are the only anomalies.  (We don't
2132          * require towupper and towlower because they aren't in C89.) */
2133 
2134 #    if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
2135 
2136         if (towupper('i') == 0x130 && towlower('I') == 0x131)
2137 
2138 #    else
2139 
2140         if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
2141 
2142 #    endif
2143 
2144         {
2145             /* This is how we determine it really is Turkic */
2146             check_for_problems = TRUE;
2147             maybe_utf8_turkic = TRUE;
2148         }
2149     }
2150     else {  /* Not a canned locale we know the values for.  Compute them */
2151 
2152 #    ifdef DEBUGGING
2153 
2154         bool has_non_ascii_fold = FALSE;
2155         bool found_unexpected = FALSE;
2156 
2157         /* Under -DLv, see if there are any folds outside the ASCII range.
2158          * This factoid is used below */
2159         if (DEBUG_Lv_TEST) {
2160             for (unsigned i = 128; i < 256; i++) {
2161                 int j = LATIN1_TO_NATIVE(i);
2162                 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
2163                     has_non_ascii_fold = TRUE;
2164                     break;
2165                 }
2166             }
2167         }
2168 
2169 #    endif
2170 
2171         for (unsigned i = 0; i < 256; i++) {
2172             if (isU8_UPPER_LC(i))
2173                 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
2174             else if (isU8_LOWER_LC(i))
2175                 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
2176             else
2177                 PL_fold_locale[i] = (U8) i;
2178 
2179 #    ifdef DEBUGGING
2180 
2181             /* Most locales these days are supersets of ASCII.  When debugging
2182              * with -DLv, it is helpful to know what the exceptions to that are
2183              * in this locale */
2184             if (DEBUG_Lv_TEST) {
2185                 bool unexpected = FALSE;
2186 
2187                 if (isUPPER_L1(i)) {
2188                     if (isUPPER_A(i)) {
2189                         if (PL_fold_locale[i] != toLOWER_A(i)) {
2190                             unexpected = TRUE;
2191                         }
2192                     }
2193                     else if (has_non_ascii_fold) {
2194                         if (PL_fold_locale[i] != toLOWER_L1(i)) {
2195                             unexpected = TRUE;
2196                         }
2197                     }
2198                     else if (PL_fold_locale[i] != i) {
2199                         unexpected = TRUE;
2200                     }
2201                 }
2202                 else if (   isLOWER_L1(i)
2203                          && i != LATIN_SMALL_LETTER_SHARP_S
2204                          && i != MICRO_SIGN)
2205                 {
2206                     if (isLOWER_A(i)) {
2207                         if (PL_fold_locale[i] != toUPPER_A(i)) {
2208                             unexpected = TRUE;
2209                         }
2210                     }
2211                     else if (has_non_ascii_fold) {
2212                         if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
2213                             unexpected = TRUE;
2214                         }
2215                     }
2216                     else if (PL_fold_locale[i] != i) {
2217                         unexpected = TRUE;
2218                     }
2219                 }
2220                 else if (PL_fold_locale[i] != i) {
2221                     unexpected = TRUE;
2222                 }
2223 
2224                 if (unexpected) {
2225                     found_unexpected = TRUE;
2226                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2227                                            "For %s, fold of %02x is %02x\n",
2228                                            newctype, i, PL_fold_locale[i]));
2229                 }
2230             }
2231         }
2232 
2233         if (found_unexpected) {
2234             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2235                                "All bytes not mentioned above either fold to"
2236                                " themselves or are the expected ASCII or"
2237                                " Latin1 ones\n"));
2238         }
2239         else {
2240             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2241                                    "No nonstandard folds were found\n"));
2242 #    endif
2243 
2244         }
2245     }
2246 
2247 #    ifdef MB_CUR_MAX
2248 
2249     /* We only handle single-byte locales (outside of UTF-8 ones); so if this
2250      * locale requires more than one byte, there are going to be BIG problems.
2251      * */
2252 
2253     if (MB_CUR_MAX > 1 && ! PL_in_utf8_CTYPE_locale
2254 
2255             /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
2256              * Just assume that the implementation for them (plus for POSIX) is
2257              * correct and the > 1 value is spurious.  (Since these are
2258              * specially handled to never be considered UTF-8 locales, as long
2259              * as this is the only problem, everything should work fine */
2260         && ! isNAME_C_OR_POSIX(newctype))
2261     {
2262         DEBUG_L(PerlIO_printf(Perl_debug_log,
2263                               "Unsupported, MB_CUR_MAX=%d\n", (int) MB_CUR_MAX));
2264 
2265         Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE),
2266                          "Locale '%s' is unsupported, and may crash the"
2267                          " interpreter.\n",
2268                          newctype);
2269     }
2270 
2271 #    endif
2272 
2273     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
2274                                            check_for_problems));
2275 
2276     /* We don't populate the other lists if a UTF-8 locale, but do check that
2277      * everything works as expected, unless checking turned off */
2278     if (check_for_problems) {
2279         /* Assume enough space for every character being bad.  4 spaces each
2280          * for the 94 printable characters that are output like "'x' "; and 5
2281          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
2282          * NUL */
2283         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
2284         unsigned int bad_count = 0;         /* Count of bad characters */
2285 
2286         for (unsigned i = 0; i < 256; i++) {
2287 
2288             /* If checking for locale problems, see if the native ASCII-range
2289              * printables plus \n and \t are in their expected categories in
2290              * the new locale.  If not, this could mean big trouble, upending
2291              * Perl's and most programs' assumptions, like having a
2292              * metacharacter with special meaning become a \w.  Fortunately,
2293              * it's very rare to find locales that aren't supersets of ASCII
2294              * nowadays.  It isn't a problem for most controls to be changed
2295              * into something else; we check only \n and \t, though perhaps \r
2296              * could be an issue as well. */
2297             if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
2298                 bool is_bad = FALSE;
2299                 char name[4] = { '\0' };
2300 
2301                 /* Convert the name into a string */
2302                 if (isGRAPH_A(i)) {
2303                     name[0] = i;
2304                     name[1] = '\0';
2305                 }
2306                 else if (i == '\n') {
2307                     my_strlcpy(name, "\\n", sizeof(name));
2308                 }
2309                 else if (i == '\t') {
2310                     my_strlcpy(name, "\\t", sizeof(name));
2311                 }
2312                 else {
2313                     assert(i == ' ');
2314                     my_strlcpy(name, "' '", sizeof(name));
2315                 }
2316 
2317                 /* Check each possibe class */
2318                 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) !=
2319                                                     cBOOL(isALPHANUMERIC_A(i))))
2320                 {
2321                     is_bad = TRUE;
2322                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2323                                           "isalnum('%s') unexpectedly is %x\n",
2324                                           name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
2325                 }
2326                 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i))))  {
2327                     is_bad = TRUE;
2328                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2329                                           "isalpha('%s') unexpectedly is %x\n",
2330                                           name, cBOOL(isU8_ALPHA_LC(i))));
2331                 }
2332                 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i))))  {
2333                     is_bad = TRUE;
2334                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2335                                           "isdigit('%s') unexpectedly is %x\n",
2336                                           name, cBOOL(isU8_DIGIT_LC(i))));
2337                 }
2338                 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i))))  {
2339                     is_bad = TRUE;
2340                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2341                                           "isgraph('%s') unexpectedly is %x\n",
2342                                           name, cBOOL(isU8_GRAPH_LC(i))));
2343                 }
2344                 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i))))  {
2345                     is_bad = TRUE;
2346                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2347                                           "islower('%s') unexpectedly is %x\n",
2348                                           name, cBOOL(isU8_LOWER_LC(i))));
2349                 }
2350                 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i))))  {
2351                     is_bad = TRUE;
2352                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2353                                           "isprint('%s') unexpectedly is %x\n",
2354                                           name, cBOOL(isU8_PRINT_LC(i))));
2355                 }
2356                 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i))))  {
2357                     is_bad = TRUE;
2358                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2359                                           "ispunct('%s') unexpectedly is %x\n",
2360                                           name, cBOOL(isU8_PUNCT_LC(i))));
2361                 }
2362                 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i))))  {
2363                     is_bad = TRUE;
2364                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2365                                           "isspace('%s') unexpectedly is %x\n",
2366                                           name, cBOOL(isU8_SPACE_LC(i))));
2367                 }
2368                 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i))))  {
2369                     is_bad = TRUE;
2370                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2371                                           "isupper('%s') unexpectedly is %x\n",
2372                                           name, cBOOL(isU8_UPPER_LC(i))));
2373                 }
2374                 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i))))  {
2375                     is_bad = TRUE;
2376                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2377                                           "isxdigit('%s') unexpectedly is %x\n",
2378                                           name, cBOOL(isU8_XDIGIT_LC(i))));
2379                 }
2380                 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
2381                     is_bad = TRUE;
2382                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2383                             "tolower('%s')=0x%x instead of the expected 0x%x\n",
2384                             name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
2385                 }
2386                 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
2387                     is_bad = TRUE;
2388                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2389                             "toupper('%s')=0x%x instead of the expected 0x%x\n",
2390                             name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
2391                 }
2392                 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i))))  {
2393                     is_bad = TRUE;
2394                     DEBUG_L(PerlIO_printf(Perl_debug_log,
2395                                 "'\\n' (=%02X) is not a control\n", (int) i));
2396                 }
2397 
2398                 /* Add to the list;  Separate multiple entries with a blank */
2399                 if (is_bad) {
2400                     if (bad_count) {
2401                         my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
2402                     }
2403                     my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
2404                     bad_count++;
2405                 }
2406             }
2407         }
2408 
2409         if (bad_count == 2 && maybe_utf8_turkic) {
2410             bad_count = 0;
2411             *bad_chars_list = '\0';
2412 
2413             /* The casts are because otherwise some compilers warn:
2414                 gcc.gnu.org/bugzilla/show_bug.cgi?id=99950
2415                 gcc.gnu.org/bugzilla/show_bug.cgi?id=94182
2416              */
2417             PL_fold_locale[ (U8) 'I' ] = 'I';
2418             PL_fold_locale[ (U8) 'i' ] = 'i';
2419             PL_in_utf8_turkic_locale = TRUE;
2420             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
2421         }
2422 
2423         /* If we found problems and we want them output, do so */
2424         if (   (UNLIKELY(bad_count))
2425             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
2426         {
2427             /* WARNING.  If you change the wording of these; be sure to update
2428              * t/loc_tools.pl correspondingly */
2429 
2430             if (PL_in_utf8_CTYPE_locale) {
2431                 PL_warn_locale = Perl_newSVpvf(aTHX_
2432                      "Locale '%s' contains (at least) the following characters"
2433                      " which have\nunexpected meanings: %s\nThe Perl program"
2434                      " will use the expected meanings",
2435                       newctype, bad_chars_list);
2436             }
2437             else {
2438                 PL_warn_locale =
2439                     Perl_newSVpvf(aTHX_
2440                                   "\nThe following characters (and maybe"
2441                                   " others) may not have the same meaning as"
2442                                   " the Perl program expects: %s\n",
2443                                   bad_chars_list
2444                             );
2445             }
2446 
2447 #    ifdef HAS_SOME_LANGINFO
2448 
2449             const char * scratch_buffer = NULL;
2450             Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
2451                                  my_langinfo_c(CODESET, LC_CTYPE,
2452                                                newctype,
2453                                                &scratch_buffer, NULL,
2454                                                NULL));
2455             Safefree(scratch_buffer);
2456 
2457 #  endif
2458 
2459             Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
2460 
2461             /* If we are actually in the scope of the locale or are debugging,
2462              * output the message now.  If not in that scope, we save the
2463              * message to be output at the first operation using this locale,
2464              * if that actually happens.  Most programs don't use locales, so
2465              * they are immune to bad ones.  */
2466             if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
2467 
2468                 /* The '0' below suppresses a bogus gcc compiler warning */
2469                 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
2470                                                                             0);
2471 
2472                 if (IN_LC(LC_CTYPE)) {
2473                     SvREFCNT_dec_NN(PL_warn_locale);
2474                     PL_warn_locale = NULL;
2475                 }
2476             }
2477         }
2478     }
2479 }
2480 
2481 #  endif /* USE_LOCALE_CTYPE */
2482 
2483 void
Perl__warn_problematic_locale()2484 Perl__warn_problematic_locale()
2485 {
2486 
2487 #  ifdef USE_LOCALE_CTYPE
2488 
2489     dTHX;
2490 
2491     /* Internal-to-core function that outputs the message in PL_warn_locale,
2492      * and then NULLS it.  Should be called only through the macro
2493      * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
2494 
2495     if (PL_warn_locale) {
2496         Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2497                              SvPVX(PL_warn_locale),
2498                              0 /* dummy to avoid compiler warning */ );
2499         SvREFCNT_dec_NN(PL_warn_locale);
2500         PL_warn_locale = NULL;
2501     }
2502 
2503 #  endif
2504 
2505 }
2506 
2507 STATIC void
S_new_LC_ALL(pTHX_ const char * unused,bool force)2508 S_new_LC_ALL(pTHX_ const char *unused, bool force)
2509 {
2510     PERL_ARGS_ASSERT_NEW_LC_ALL;
2511     PERL_UNUSED_ARG(unused);
2512 
2513     /* LC_ALL updates all the things we care about. */
2514 
2515     for (unsigned int i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2516         if (update_functions[i]) {
2517             const char * this_locale = querylocale_i(i);
2518             update_functions[i](aTHX_ this_locale, force);
2519         }
2520     }
2521 }
2522 
2523 #  ifdef USE_LOCALE_COLLATE
2524 
2525 STATIC void
S_new_collate(pTHX_ const char * newcoll,bool force)2526 S_new_collate(pTHX_ const char *newcoll, bool force)
2527 {
2528     PERL_ARGS_ASSERT_NEW_COLLATE;
2529     PERL_UNUSED_ARG(force);
2530 
2531     /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
2532      * core Perl this and that 'newcoll' is the name of the new locale.
2533      *
2534      * The design of locale collation is that every locale change is given an
2535      * index 'PL_collation_ix'.  The first time a string participates in an
2536      * operation that requires collation while locale collation is active, it
2537      * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()).  That
2538      * magic includes the collation index, and the transformation of the string
2539      * by strxfrm(), q.v.  That transformation is used when doing comparisons,
2540      * instead of the string itself.  If a string changes, the magic is
2541      * cleared.  The next time the locale changes, the index is incremented,
2542      * and so we know during a comparison that the transformation is not
2543      * necessarily still valid, and so is recomputed.  Note that if the locale
2544      * changes enough times, the index could wrap (a U32), and it is possible
2545      * that a transformation would improperly be considered valid, leading to
2546      * an unlikely bug */
2547 
2548     /* Return if the locale isn't changing */
2549     if (strEQ(PL_collation_name, newcoll)) {
2550         return;
2551     }
2552 
2553     Safefree(PL_collation_name);
2554     PL_collation_name = savepv(newcoll);
2555     ++PL_collation_ix;
2556 
2557     /* Set the new one up if trivial.  Since this is called at process
2558      * initialization, be aware that this bit can't rely on much being
2559      * available. */
2560     PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
2561     if (PL_collation_standard) {
2562         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2563                                "Setting PL_collation name='%s'\n",
2564                                PL_collation_name));
2565         PL_collxfrm_base = 0;
2566         PL_collxfrm_mult = 2;
2567         PL_in_utf8_COLLATE_locale = FALSE;
2568         PL_strxfrm_NUL_replacement = '\0';
2569         PL_strxfrm_max_cp = 0;
2570         return;
2571     }
2572 
2573     /* Flag that the remainder of the set up is being deferred until first
2574      * need. */
2575     PL_collxfrm_mult = 0;
2576     PL_collxfrm_base = 0;
2577 
2578 }
2579 
2580 #  endif /* USE_LOCALE_COLLATE */
2581 #endif  /* USE_LOCALE */
2582 
2583 #ifdef WIN32
2584 
2585 wchar_t *
S_Win_byte_string_to_wstring(const UINT code_page,const char * byte_string)2586 S_Win_byte_string_to_wstring(const UINT code_page, const char * byte_string)
2587 {
2588     /* Caller must arrange to free the returned string */
2589 
2590     int req_size = MultiByteToWideChar(code_page, 0, byte_string, -1, NULL, 0);
2591     if (! req_size) {
2592         SET_EINVAL;
2593         return NULL;
2594     }
2595 
2596     wchar_t *wstring;
2597     Newx(wstring, req_size, wchar_t);
2598 
2599     if (! MultiByteToWideChar(code_page, 0, byte_string, -1, wstring, req_size))
2600     {
2601         Safefree(wstring);
2602         SET_EINVAL;
2603         return NULL;
2604     }
2605 
2606     return wstring;
2607 }
2608 
2609 #define Win_utf8_string_to_wstring(s)  Win_byte_string_to_wstring(CP_UTF8, (s))
2610 
2611 char *
S_Win_wstring_to_byte_string(const UINT code_page,const wchar_t * wstring)2612 S_Win_wstring_to_byte_string(const UINT code_page, const wchar_t * wstring)
2613 {
2614     /* Caller must arrange to free the returned string */
2615 
2616     int req_size =
2617             WideCharToMultiByte(code_page, 0, wstring, -1, NULL, 0, NULL, NULL);
2618 
2619     char *byte_string;
2620     Newx(byte_string, req_size, char);
2621 
2622     if (! WideCharToMultiByte(code_page, 0, wstring, -1, byte_string,
2623                                                          req_size, NULL, NULL))
2624     {
2625         Safefree(byte_string);
2626         SET_EINVAL;
2627         return NULL;
2628     }
2629 
2630     return byte_string;
2631 }
2632 
2633 #define Win_wstring_to_utf8_string(ws) Win_wstring_to_byte_string(CP_UTF8, (ws))
2634 
2635 STATIC const char *
S_wrap_wsetlocale(pTHX_ const int category,const char * locale)2636 S_wrap_wsetlocale(pTHX_ const int category, const char *locale)
2637 {
2638     PERL_ARGS_ASSERT_WRAP_WSETLOCALE;
2639 
2640     /* Calls _wsetlocale(), converting the parameters/return to/from
2641      * Perl-expected forms as if plain setlocale() were being called instead.
2642      */
2643 
2644     const wchar_t * wlocale = NULL;
2645 
2646     if (locale) {
2647         wlocale = Win_utf8_string_to_wstring(locale);
2648         if (! wlocale) {
2649             return NULL;
2650         }
2651     }
2652 
2653     WSETLOCALE_LOCK;
2654     const wchar_t * wresult = _wsetlocale(category, wlocale);
2655     Safefree(wlocale);
2656 
2657     if (! wresult) {
2658         WSETLOCALE_UNLOCK;
2659         return NULL;
2660     }
2661 
2662     const char * result = Win_wstring_to_utf8_string(wresult);
2663     WSETLOCALE_UNLOCK;
2664 
2665     SAVEFREEPV(result); /* is there something better we can do here?  Answer:
2666                            Without restructuring, returning a unique value each
2667                            call is required.  See GH #20434 */
2668     return result;
2669 }
2670 
2671 STATIC const char *
S_win32_setlocale(pTHX_ int category,const char * locale)2672 S_win32_setlocale(pTHX_ int category, const char* locale)
2673 {
2674     /* This, for Windows, emulates POSIX setlocale() behavior.  There is no
2675      * difference between the two unless the input locale is "", which normally
2676      * means on Windows to get the machine default, which is set via the
2677      * computer's "Regional and Language Options" (or its current equivalent).
2678      * In POSIX, it instead means to find the locale from the user's
2679      * environment.  This routine changes the Windows behavior to first look in
2680      * the environment, and, if anything is found, use that instead of going to
2681      * the machine default.  If there is no environment override, the machine
2682      * default is used, by calling the real setlocale() with "".
2683      *
2684      * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2685      * use the particular category's variable if set; otherwise to use the LANG
2686      * variable. */
2687 
2688     if (locale == NULL) {
2689         return wrap_wsetlocale(category, NULL);
2690     }
2691 
2692     if (strEQ(locale, "")) {
2693         /* Note this function may change the locale, but that's ok because we
2694          * are about to change it anyway */
2695         locale = find_locale_from_environment(get_category_index(category, ""));
2696     }
2697 
2698     const char * result = wrap_wsetlocale(category, locale);
2699     DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2700                           setlocale_debug_string_r(category, locale, result)));
2701 
2702 #  ifdef USE_PL_CUR_LC_ALL
2703 
2704     /* If we need to keep track of LC_ALL, update it to the new value.  */
2705     Safefree(PL_cur_LC_ALL);
2706     if (category == LC_ALL) {
2707         PL_cur_LC_ALL = savepv(result);
2708     }
2709     else {
2710         PL_cur_LC_ALL = savepv(wrap_wsetlocale(LC_ALL, NULL));
2711     }
2712 
2713 #  endif
2714 
2715     return result;
2716 }
2717 
2718 #endif
2719 
2720 /*
2721 =for apidoc Perl_setlocale
2722 
2723 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2724 taking the same parameters, and returning the same information, except that it
2725 returns the correct underlying C<LC_NUMERIC> locale.  Regular C<setlocale> will
2726 instead return C<C> if the underlying locale has a non-dot decimal point
2727 character, or a non-empty thousands separator for displaying floating point
2728 numbers.  This is because perl keeps that locale category such that it has a
2729 dot and empty separator, changing the locale briefly during the operations
2730 where the underlying one is required. C<Perl_setlocale> knows about this, and
2731 compensates; regular C<setlocale> doesn't.
2732 
2733 Another reason it isn't completely a drop-in replacement is that it is
2734 declared to return S<C<const char *>>, whereas the system setlocale omits the
2735 C<const> (presumably because its API was specified long ago, and can't be
2736 updated; it is illegal to change the information C<setlocale> returns; doing
2737 so leads to segfaults.)
2738 
2739 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2740 C<setlocale> can be completely ineffective on some platforms under some
2741 configurations.
2742 
2743 Changing the locale is not a good idea when more than one thread is running,
2744 except on systems where the predefined variable C<${^SAFE_LOCALES}> is 1.
2745 This is because on such systems the locale is global to the whole process and
2746 not local to just the thread calling the function.  So changing it in one
2747 thread instantaneously changes it in all.  On some such systems, the system
2748 C<setlocale()> is ineffective, returning the wrong information, and failing to
2749 actually change the locale.  z/OS refuses to try to change the locale once a
2750 second thread is created.  C<Perl_setlocale>, should give you accurate results
2751 of what actually happened on these problematic platforms, returning NULL if the
2752 system forbade the locale change.
2753 
2754 The return points to a per-thread static buffer, which is overwritten the next
2755 time C<Perl_setlocale> is called from the same thread.
2756 
2757 =cut
2758 
2759 */
2760 
2761 #ifndef USE_LOCALE_NUMERIC
2762 #  define affects_LC_NUMERIC(cat) 0
2763 #elif defined(LC_ALL)
2764 #  define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC || cat == LC_ALL)
2765 #else
2766 #  define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC)
2767 #endif
2768 
2769 const char *
Perl_setlocale(const int category,const char * locale)2770 Perl_setlocale(const int category, const char * locale)
2771 {
2772     /* This wraps POSIX::setlocale() */
2773 
2774 #ifndef USE_LOCALE
2775 
2776     PERL_UNUSED_ARG(category);
2777     PERL_UNUSED_ARG(locale);
2778 
2779     return "C";
2780 
2781 #else
2782 
2783     const char * retval;
2784     dTHX;
2785 
2786     DEBUG_L(PerlIO_printf(Perl_debug_log,
2787                           "Entering Perl_setlocale(%d, \"%s\")\n",
2788                           category, locale));
2789 
2790     /* A NULL locale means only query what the current one is. */
2791     if (locale == NULL) {
2792 
2793 #  ifndef USE_LOCALE_NUMERIC
2794 
2795         /* Without LC_NUMERIC, it's trivial; we just return the value */
2796         return save_to_buffer(querylocale_r(category),
2797                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2798 #  else
2799 
2800         /* We have the LC_NUMERIC name saved, because we are normally switched
2801          * into the C locale (or equivalent) for it. */
2802         if (category == LC_NUMERIC) {
2803             DEBUG_L(PerlIO_printf(Perl_debug_log,
2804                     "Perl_setlocale(LC_NUMERIC, NULL) returning stashed '%s'\n",
2805                     PL_numeric_name));
2806 
2807             /* We don't have to copy this return value, as it is a per-thread
2808              * variable, and won't change until a future setlocale */
2809             return PL_numeric_name;
2810         }
2811 
2812 #    ifndef LC_ALL
2813 
2814         /* Without LC_ALL, just return the value */
2815         return save_to_buffer(querylocale_r(category),
2816                               &PL_setlocale_buf, &PL_setlocale_bufsize);
2817 
2818 #    else
2819 
2820         /* Here, LC_ALL is available on this platform.  It's the one
2821          * complicating category (because it can contain a toggled LC_NUMERIC
2822          * value), for all the remaining ones (we took care of LC_NUMERIC
2823          * above), just return the value */
2824         if (category != LC_ALL) {
2825             return save_to_buffer(querylocale_r(category),
2826                                   &PL_setlocale_buf, &PL_setlocale_bufsize);
2827         }
2828 
2829         bool toggled = FALSE;
2830 
2831         /* For an LC_ALL query, switch back to the underlying numeric locale
2832          * (if we aren't there already) so as to get the correct results.  Our
2833          * records for all the other categories are valid without switching */
2834         if (! PL_numeric_underlying) {
2835             set_numeric_underlying();
2836             toggled = TRUE;
2837         }
2838 
2839         retval = querylocale_c(LC_ALL);
2840 
2841         if (toggled) {
2842             set_numeric_standard();
2843         }
2844 
2845         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2846                             setlocale_debug_string_r(category, locale, retval)));
2847 
2848         return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2849 
2850 #    endif      /* Has LC_ALL */
2851 #  endif        /* Has LC_NUMERIC */
2852 
2853     } /* End of querying the current locale */
2854 
2855 
2856     unsigned int cat_index = get_category_index(category, NULL);
2857     retval = querylocale_i(cat_index);
2858 
2859     /* If the new locale is the same as the current one, nothing is actually
2860      * being changed, so do nothing. */
2861     if (      retval != NULL && strEQ(retval, locale)
2862         && (   ! affects_LC_NUMERIC(category)
2863 
2864 #  ifdef USE_LOCALE_NUMERIC
2865 
2866             || strEQ(locale, PL_numeric_name)
2867 
2868 #  endif
2869 
2870     )) {
2871         DEBUG_L(PerlIO_printf(Perl_debug_log,
2872                               "Already in requested locale: no action taken\n"));
2873         return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2874     }
2875 
2876     /* Here, an actual change is being requested.  Do it */
2877     retval = setlocale_i(cat_index, locale);
2878 
2879     if (! retval) {
2880         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2881                           setlocale_debug_string_i(cat_index, locale, "NULL")));
2882         return NULL;
2883     }
2884 
2885     assert(strNE(retval, ""));
2886     retval = save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2887 
2888     /* Now that have changed locales, we have to update our records to
2889      * correspond.  Only certain categories have extra work to update. */
2890     if (update_functions[cat_index]) {
2891         update_functions[cat_index](aTHX_ retval, false);
2892     }
2893 
2894     DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", retval));
2895 
2896     return retval;
2897 
2898 #endif
2899 
2900 }
2901 
2902 STATIC utf8ness_t
S_get_locale_string_utf8ness_i(pTHX_ const char * string,const locale_utf8ness_t known_utf8,const char * locale,const unsigned cat_index)2903 S_get_locale_string_utf8ness_i(pTHX_ const char * string,
2904                                      const locale_utf8ness_t known_utf8,
2905                                      const char * locale,
2906                                      const unsigned cat_index)
2907 {
2908     PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
2909 
2910 #ifndef USE_LOCALE
2911 
2912     return UTF8NESS_NO;
2913     PERL_UNUSED_ARG(string);
2914     PERL_UNUSED_ARG(known_utf8);
2915     PERL_UNUSED_ARG(locale);
2916     PERL_UNUSED_ARG(cat_index);
2917 
2918 #else
2919 
2920     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
2921 
2922     /* Return to indicate if 'string' in the locale given by the input
2923      * arguments should be considered UTF-8 or not.
2924      *
2925      * If the input 'locale' is not NULL, use that for the locale; otherwise
2926      * use the current locale for the category specified by 'cat_index'.
2927      */
2928 
2929     if (string == NULL) {
2930         return UTF8NESS_NO;
2931     }
2932 
2933     if (IN_BYTES) { /* respect 'use bytes' */
2934         return UTF8NESS_NO;
2935     }
2936 
2937     Size_t len = strlen(string);
2938 
2939     /* UTF8ness is immaterial if the representation doesn't vary */
2940     const U8 * first_variant = NULL;
2941     if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
2942         return UTF8NESS_IMMATERIAL;
2943     }
2944 
2945     /* Can't be UTF-8 if invalid */
2946     if (! is_utf8_string((U8 *) first_variant,
2947                          len - ((char *) first_variant - string)))
2948     {
2949         return UTF8NESS_NO;
2950     }
2951 
2952     /* Here and below, we know the string is legal UTF-8, containing at least
2953      * one character requiring a sequence of two or more bytes.  It is quite
2954      * likely to be UTF-8.  But it pays to be paranoid and do further checking.
2955      *
2956      * If we already know the UTF-8ness of the locale, then we immediately know
2957      * what the string is */
2958     if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
2959         if (known_utf8 == LOCALE_IS_UTF8) {
2960             return UTF8NESS_YES;
2961         }
2962         else {
2963             return UTF8NESS_NO;
2964         }
2965     }
2966 
2967 #  ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION
2968 
2969     /* Here, we have available the libc functions that can be used to
2970      * accurately determine the UTF8ness of the underlying locale.  If it is a
2971      * UTF-8 locale, the string is UTF-8;  otherwise it was coincidental that
2972      * the string is legal UTF-8
2973      *
2974      * However, if the perl is compiled to not pay attention to the category
2975      * being passed in, you might think that that locale is essentially always
2976      * the C locale, so it would make sense to say it isn't UTF-8.  But to get
2977      * here, the string has to contain characters unknown in the C locale.  And
2978      * in fact, Windows boxes are compiled without LC_MESSAGES, as their
2979      * message catalog isn't really a part of the locale system.  But those
2980      * messages really could be UTF-8, and given that the odds are rather small
2981      * of something not being UTF-8 but being syntactically valid UTF-8, khw
2982      * has decided to call such strings as UTF-8. */
2983 
2984     if (locale == NULL) {
2985         locale = querylocale_i(cat_index);
2986     }
2987     if (is_locale_utf8(locale)) {
2988         return UTF8NESS_YES;
2989     }
2990 
2991     return UTF8NESS_NO;
2992 
2993 #  else
2994 
2995     /* Here, we have a valid UTF-8 string containing non-ASCII characters, and
2996      * don't have access to functions to check if the locale is UTF-8 or not.
2997      * Assume that it is.  khw tried adding a check that the string is entirely
2998      * in a single Unicode script, but discovered the strftime() timezone is
2999      * user-settable through the environment, which may be in a different
3000      * script than the locale-expected value. */
3001     PERL_UNUSED_ARG(locale);
3002     PERL_UNUSED_ARG(cat_index);
3003 
3004     return UTF8NESS_YES;
3005 
3006 #  endif
3007 #endif
3008 
3009 }
3010 
3011 STATIC bool
S_is_locale_utf8(pTHX_ const char * locale)3012 S_is_locale_utf8(pTHX_ const char * locale)
3013 {
3014     /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise.  It uses
3015      * my_langinfo(), which employs various methods to get this information
3016      * if nl_langinfo() isn't available, using heuristics as a last resort, in
3017      * which case, the result will very likely be correct for locales for
3018      * languages that have commonly used non-ASCII characters, but for notably
3019      * English, it comes down to if the locale's name ends in something like
3020      * "UTF-8".  It errs on the side of not being a UTF-8 locale. */
3021 
3022 #  if ! defined(USE_LOCALE)                                                   \
3023    || ! defined(USE_LOCALE_CTYPE)                                             \
3024    ||   defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
3025 
3026     PERL_UNUSED_ARG(locale);
3027 
3028     return FALSE;
3029 
3030 #  else
3031 
3032     const char * scratch_buffer = NULL;
3033     const char * codeset;
3034     bool retval;
3035 
3036     PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
3037 
3038     if (strEQ(locale, PL_ctype_name)) {
3039         return PL_in_utf8_CTYPE_locale;
3040     }
3041 
3042     codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
3043                             &scratch_buffer, NULL, NULL);
3044     retval = is_codeset_name_UTF8(codeset);
3045 
3046     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3047                            "found codeset=%s, is_utf8=%d\n", codeset, retval));
3048 
3049     Safefree(scratch_buffer);
3050     return retval;
3051 
3052 #  endif
3053 
3054 }
3055 
3056 #ifdef USE_LOCALE
3057 
3058 STATIC const char *
S_save_to_buffer(const char * string,const char ** buf,Size_t * buf_size)3059 S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size)
3060 {
3061     /* Copy the NUL-terminated 'string' to a buffer whose address before this
3062      * call began at *buf, and whose available length before this call was
3063      * *buf_size.
3064      *
3065      * If the length of 'string' is greater than the space available, the
3066      * buffer is grown accordingly, which may mean that it gets relocated.
3067      * *buf and *buf_size will be updated to reflect this.
3068      *
3069      * Regardless, the function returns a pointer to where 'string' is now
3070      * stored.
3071      *
3072      * 'string' may be NULL, which means no action gets taken, and NULL is
3073      * returned.
3074      *
3075      * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
3076      * empty, and memory is malloc'd.   'buf-size' being NULL is to be used
3077      * when this is a single use buffer, which will shortly be freed by the
3078      * caller.
3079      */
3080 
3081     Size_t string_size;
3082 
3083     PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
3084 
3085     if (! string) {
3086         return NULL;
3087     }
3088 
3089     /* No-op to copy over oneself */
3090     if (string == *buf) {
3091         return string;
3092     }
3093 
3094     string_size = strlen(string) + 1;
3095 
3096     if (buf_size == NULL) {
3097         Newx(*buf, string_size, char);
3098     }
3099     else if (*buf_size == 0) {
3100         Newx(*buf, string_size, char);
3101         *buf_size = string_size;
3102     }
3103     else if (string_size > *buf_size) {
3104         Renew(*buf, string_size, char);
3105         *buf_size = string_size;
3106     }
3107 
3108     {
3109         dTHX_DEBUGGING;
3110         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3111                          "Copying '%s' to %p\n",
3112                          ((is_utf8_string((U8 *) string, 0))
3113                           ? string
3114                           :_byte_dump_string((U8 *) string, strlen(string), 0)),
3115                           *buf));
3116     }
3117 
3118 #    ifdef DEBUGGING
3119 
3120     /* Catch glitches.  Usually this is because LC_CTYPE needs to be the same
3121      * locale as whatever is being worked on */
3122     if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) {
3123         dTHX_DEBUGGING;
3124 
3125         locale_panic_(Perl_form(aTHX_
3126                                 "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s",
3127                                 string, get_LC_ALL_display()));
3128     }
3129 
3130 #    endif
3131 
3132     Copy(string, *buf, string_size, char);
3133     return *buf;
3134 }
3135 
3136 #  ifdef WIN32
3137 
3138 bool
Perl_get_win32_message_utf8ness(pTHX_ const char * string)3139 Perl_get_win32_message_utf8ness(pTHX_ const char * string)
3140 {
3141     /* NULL => locale irrelevant, 0 => category irrelevant
3142      * so returns based on the UTF-8 legality of the input string, ignoring the
3143      * locale and category completely.
3144      *
3145      * This is because Windows doesn't have LC_MESSAGES */
3146     return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8, NULL, 0);
3147 }
3148 
3149 #  endif
3150 #endif  /* USE_LOCALE */
3151 
3152 
3153 int
Perl_mbtowc_(pTHX_ const wchar_t * pwc,const char * s,const Size_t len)3154 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
3155 {
3156 
3157 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
3158 
3159     PERL_UNUSED_ARG(pwc);
3160     PERL_UNUSED_ARG(s);
3161     PERL_UNUSED_ARG(len);
3162     return -1;
3163 
3164 #else   /* Below we have some form of mbtowc() */
3165 #   if defined(HAS_MBRTOWC)                                     \
3166    && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
3167 #    define USE_MBRTOWC
3168 #  else
3169 #    undef USE_MBRTOWC
3170 #  endif
3171 
3172     int retval = -1;
3173 
3174     if (s == NULL) { /* Initialize the shift state to all zeros in
3175                         PL_mbrtowc_ps. */
3176 
3177 #  if defined(USE_MBRTOWC)
3178 
3179         memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
3180         return 0;
3181 
3182 #  else
3183 
3184         MBTOWC_LOCK_;
3185         SETERRNO(0, 0);
3186         retval = mbtowc(NULL, NULL, 0);
3187         MBTOWC_UNLOCK_;
3188         return retval;
3189 
3190 #  endif
3191 
3192     }
3193 
3194 #  if defined(USE_MBRTOWC)
3195 
3196     SETERRNO(0, 0);
3197     retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
3198 
3199 #  else
3200 
3201     /* Locking prevents races, but locales can be switched out without locking,
3202      * so this isn't a cure all */
3203     MBTOWC_LOCK_;
3204     SETERRNO(0, 0);
3205     retval = mbtowc((wchar_t *) pwc, s, len);
3206     MBTOWC_UNLOCK_;
3207 
3208 #  endif
3209 
3210     return retval;
3211 
3212 #endif
3213 
3214 }
3215 
3216 /*
3217 =for apidoc Perl_localeconv
3218 
3219 This is a thread-safe version of the libc L<localeconv(3)>.  It is the same as
3220 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
3221 fields), but directly callable from XS code.
3222 
3223 =cut
3224 */
3225 
3226 HV *
Perl_localeconv(pTHX)3227 Perl_localeconv(pTHX)
3228 {
3229 
3230 #if  ! defined(HAS_LOCALECONV)
3231 
3232     return newHV();
3233 
3234 #else
3235 
3236     return my_localeconv(0);
3237 
3238 #endif
3239 
3240 }
3241 
3242 #if  defined(HAS_LOCALECONV)
3243 
3244 HV *
S_my_localeconv(pTHX_ const int item)3245 S_my_localeconv(pTHX_ const int item)
3246 {
3247     PERL_ARGS_ASSERT_MY_LOCALECONV;
3248 
3249     /* This returns a mortalized hash containing all or one of the elements
3250      * returned by localeconv().  It is used by Perl_localeconv() and
3251      * POSIX::localeconv() and is thread-safe.
3252      *
3253      * There are two use cases:
3254      * 1) Called from POSIX::locale_conv().  This returns the lconv structure
3255      *    copied to a hash, based on the current underlying locales for
3256      *    LC_NUMERIC and LC_MONETARY. An input item==0 signifies this case, or
3257      *    on many platforms it is the only use case compiled.
3258      * 2) Certain items that nl_langinfo() provides are also derivable from
3259      *    the return of localeconv().  Windows notably doesn't have
3260      *    nl_langinfo(), so on that, and actually any platform lacking it,
3261      *    my_localeconv() is used also to emulate it for those particular
3262      *    items.  The code to do this is compiled only on such platforms.
3263      *    Rather than going to the expense of creating a full hash when only
3264      *    one item is needed, the returned hash has just the desired item in
3265      *    it.
3266      *
3267      * To access all the localeconv() struct lconv fields, there is a data
3268      * structure that contains every commonly documented field in it.  (Maybe
3269      * some minority platforms have extra fields.  Those could be added here
3270      * without harm; they would just be ignored on platforms lacking them.)
3271      *
3272      * Our structure is compiled to make looping through the fields easier by
3273      * pointing each name to its value's offset within lconv, e.g.,
3274         { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) }
3275      */
3276 #  define LCONV_ENTRY(name)                                           \
3277                 {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
3278 
3279     /* These synonyms are just for clarity, and to make it easier in case
3280      * something needs to change in the future */
3281 #  define LCONV_NUMERIC_ENTRY(name)  LCONV_ENTRY(name)
3282 #  define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name)
3283 
3284     /* There are just a few fields for NUMERIC strings */
3285     const lconv_offset_t lconv_numeric_strings[] = {
3286 #  ifndef NO_LOCALECONV_GROUPING
3287         LCONV_NUMERIC_ENTRY(grouping),
3288 #   endif
3289         LCONV_NUMERIC_ENTRY(thousands_sep),
3290         LCONV_NUMERIC_ENTRY(decimal_point),
3291         {NULL, 0}
3292     };
3293 
3294     /* When used to implement nl_langinfo(), we save time by only populating
3295      * the hash with the field(s) needed.  Thus we would need a data structure
3296      * of just:
3297      *  LCONV_NUMERIC_ENTRY(decimal_point),
3298      *  {NULL, 0}
3299      *
3300      * By placing the decimal_point field last in the full structure, we can
3301      * use just the tail for this bit of it, saving space.  This macro yields
3302      * the address of the sub structure. */
3303 #  define DECIMAL_POINT_ADDRESS                                             \
3304         &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)]
3305 
3306     /* And the MONETARY string fields */
3307     const lconv_offset_t lconv_monetary_strings[] = {
3308         LCONV_MONETARY_ENTRY(int_curr_symbol),
3309         LCONV_MONETARY_ENTRY(mon_decimal_point),
3310 #  ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
3311         LCONV_MONETARY_ENTRY(mon_thousands_sep),
3312 #  endif
3313 #  ifndef NO_LOCALECONV_MON_GROUPING
3314         LCONV_MONETARY_ENTRY(mon_grouping),
3315 #  endif
3316         LCONV_MONETARY_ENTRY(positive_sign),
3317         LCONV_MONETARY_ENTRY(negative_sign),
3318         LCONV_MONETARY_ENTRY(currency_symbol),
3319         {NULL, 0}
3320     };
3321 
3322     /* Like above, this field being last can be used as a sub structure */
3323 #  define CURRENCY_SYMBOL_ADDRESS                                            \
3324       &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)]
3325 
3326     /* Finally there are integer fields, all are for monetary purposes */
3327     const lconv_offset_t lconv_integers[] = {
3328         LCONV_ENTRY(int_frac_digits),
3329         LCONV_ENTRY(frac_digits),
3330         LCONV_ENTRY(p_sep_by_space),
3331         LCONV_ENTRY(n_cs_precedes),
3332         LCONV_ENTRY(n_sep_by_space),
3333         LCONV_ENTRY(p_sign_posn),
3334         LCONV_ENTRY(n_sign_posn),
3335 #  ifdef HAS_LC_MONETARY_2008
3336         LCONV_ENTRY(int_p_cs_precedes),
3337         LCONV_ENTRY(int_p_sep_by_space),
3338         LCONV_ENTRY(int_n_cs_precedes),
3339         LCONV_ENTRY(int_n_sep_by_space),
3340         LCONV_ENTRY(int_p_sign_posn),
3341         LCONV_ENTRY(int_n_sign_posn),
3342 #  endif
3343         LCONV_ENTRY(p_cs_precedes),
3344         {NULL, 0}
3345     };
3346 
3347     /* Like above, this field being last can be used as a sub structure */
3348 #  define P_CS_PRECEDES_ADDRESS                                       \
3349       &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)]
3350 
3351     /* If we aren't paying attention to a given category, use LC_CTYPE instead;
3352      * If not paying attention to that either, the code below should end up not
3353      * using this.  Make sure that things blow up if that avoidance gets lost,
3354      * by setting the category to -1 */
3355     unsigned int numeric_index;
3356     unsigned int monetary_index;
3357 
3358 #  ifdef USE_LOCALE_NUMERIC
3359     numeric_index = LC_NUMERIC_INDEX_;
3360 #  elif defined(USE_LOCALE_CTYPE)
3361     numeric_index = LC_CTYPE_INDEX_;
3362 #  else
3363     numeric_index = (unsigned) -1;
3364 #  endif
3365 #  ifdef USE_LOCALE_MONETARY
3366     monetary_index = LC_MONETARY_INDEX_;
3367 #  elif defined(USE_LOCALE_CTYPE)
3368     monetary_index = LC_CTYPE_INDEX_;
3369 #  else
3370     monetary_index = (unsigned) -1;
3371 #  endif
3372 
3373     /* Some platforms, for correct non-mojibake results, require LC_CTYPE's
3374      * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's
3375      * for the monetary ones.  What happens if LC_NUMERIC and LC_MONETARY
3376      * aren't compatible?  Wrong results.  To avoid that, we call localeconv()
3377      * twice, once for each locale, setting LC_CTYPE to match the category.
3378      * But if the locales of both categories are the same, there is no need for
3379      * a second call.  Assume this is the case unless overridden below */
3380     bool requires_2nd_localeconv = false;
3381 
3382     /* The actual hash populating is done by S_populate_hash_from_localeconv().
3383      * It gets passed an array of length two containing the data structure it
3384      * is supposed to use to get the key names to fill the hash with.  One
3385      * element is alwasy for the NUMERIC strings (or NULL if none to use), and
3386      * the other element similarly for the MONETARY ones. */
3387 #    define NUMERIC_STRING_OFFSET   0
3388 #    define MONETARY_STRING_OFFSET  1
3389     const lconv_offset_t * strings[2] = { NULL, NULL };
3390 
3391     /* This is a mask, with one bit to tell S_populate_hash_from_localeconv to
3392      * populate the NUMERIC items; another bit for the MONETARY ones.  This way
3393      * it can choose which (or both) to populate from */
3394     U32 index_bits = 0;
3395 
3396     /* This converts from a locale index to its bit position in the above mask.
3397      * */
3398 #  define INDEX_TO_BIT(i)  (1 << (i))
3399 
3400     /* The two categories can have disparate locales.  Initialize them to C and
3401      * override later whichever one(s) we pay attention to */
3402     const char * numeric_locale = "C";
3403     const char * monetary_locale = "C";
3404 
3405     /* This will be either 'numeric_locale' or 'monetary_locale' depending on
3406      * what we are working on at the moment */
3407     const char * locale;
3408 
3409     /* The LC_MONETARY category also has some integer-valued fields, whose
3410      * information is kept in a separate list */
3411     const lconv_offset_t * integers;
3412 
3413 #  ifdef HAS_SOME_LANGINFO
3414 
3415     /* If the only use-case for this is the full localeconv(), the 'item'
3416      * parameter is ignored. */
3417     PERL_UNUSED_ARG(item);
3418 
3419 #  else
3420 
3421     /* This only gets compiled for the use-case of using localeconv() to
3422      * emulate an nl_langinfo() missing from the platform.
3423      *
3424      * We need this substructure to only return this field for the THOUSEP
3425      * item.  The other items also need substructures, but they were handled
3426      * above by placing the substructure's item at the end of the full one, so
3427      * the data structure could do double duty.  However, both this and
3428      * RADIXCHAR would need to be in the final position of the same full
3429      * structure; an impossibility.  So make this into a separate structure */
3430     const lconv_offset_t  thousands_sep_string[] = {
3431         LCONV_NUMERIC_ENTRY(thousands_sep),
3432         {NULL, 0}
3433     };
3434 
3435     /* End of all the initialization of datastructures.  Now for actual code.
3436      *
3437      * Without nl_langinfo(), the call to my_localeconv() could be for just one
3438      * of the following 3 items to emulate nl_langinfo().  This is compiled
3439      * only when using perl_langinfo.h, which we control, and it has been
3440      * constructed so that no item is numbered 0.
3441      *
3442      * For each, setup the appropriate parameters for the call below to
3443      * S_populate_hash_from_localeconv() */
3444     if (item != 0) switch (item) {
3445       default:
3446         locale_panic_(Perl_form(aTHX_
3447                     "Unexpected item passed to my_localeconv: %d", item));
3448         break;
3449 
3450 #    ifdef USE_LOCALE_NUMERIC
3451 
3452       case RADIXCHAR:
3453         locale = numeric_locale = PL_numeric_name;
3454         index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
3455         strings[NUMERIC_STRING_OFFSET] = DECIMAL_POINT_ADDRESS;
3456         integers = NULL;
3457         break;
3458 
3459       case THOUSEP:
3460         index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
3461         locale = numeric_locale = PL_numeric_name;
3462         strings[NUMERIC_STRING_OFFSET] = thousands_sep_string;
3463         integers = NULL;
3464         break;
3465 
3466 #    endif
3467 #    ifdef USE_LOCALE_MONETARY
3468 
3469       case CRNCYSTR:
3470         index_bits = INDEX_TO_BIT(LC_MONETARY_INDEX_);
3471         locale = monetary_locale = querylocale_i(LC_MONETARY_INDEX_);
3472 
3473         /* This item needs the values for both the currency symbol, and another
3474          * one used to construct the nl_langino()-compatible return */
3475         strings[MONETARY_STRING_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
3476         integers = P_CS_PRECEDES_ADDRESS;
3477         break;
3478 
3479 #    endif
3480 
3481     } /* End of switch() */
3482 
3483     else    /* End of for just one item to emulate nl_langinfo() */
3484 
3485 #  endif
3486 
3487     {   /* Here, the call is for all of localeconv().  It has a bunch of
3488          * items.  As in the individual item case, set up the parameters for
3489          * S_populate_hash_from_localeconv(); */
3490 
3491 #  ifdef USE_LOCALE_NUMERIC
3492         numeric_locale = PL_numeric_name;
3493 #  elif defined(USE_LOCALE_CTYPE)
3494         numeric_locale = querylocale_i(numeric_index);
3495 #  endif
3496 #  if defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_CTYPE)
3497         monetary_locale = querylocale_i(monetary_index);
3498 #  endif
3499 
3500         /* The first call to S_populate_hash_from_localeconv() will be for the
3501          * MONETARY values */
3502         index_bits = INDEX_TO_BIT(monetary_index);
3503         locale = monetary_locale;
3504 
3505         /* And if the locales for the two categories are the same, we can also
3506          * do the NUMERIC values in the same call */
3507         if (strEQ(numeric_locale, monetary_locale)) {
3508             index_bits |= INDEX_TO_BIT(numeric_index);
3509         }
3510         else {
3511             requires_2nd_localeconv = true;
3512         }
3513 
3514         /* We always pass both sets of strings. 'index_bits' tells
3515          * S_populate_hash_from_localeconv which to actually look at */
3516         strings[NUMERIC_STRING_OFFSET] = lconv_numeric_strings;
3517         strings[MONETARY_STRING_OFFSET] = lconv_monetary_strings;
3518 
3519         /* And pass the integer values to populate; again 'index_bits' will
3520          * say to use them or not */
3521         integers = lconv_integers;
3522 
3523     }   /* End of call is for localeconv() */
3524 
3525     /* The code above has determined the parameters to
3526        S_populate_hash_from_localeconv() for both cases of an individual item
3527        and for the entire structure.  Below is code common to both */
3528 
3529     HV * hv = newHV();      /* The returned hash, initially empty */
3530     sv_2mortal((SV*)hv);
3531 
3532     /* Call localeconv() and copy its results into the hash.  All the
3533      * parameters have been initialized above */
3534     populate_hash_from_localeconv(hv,
3535                                   locale,
3536                                   index_bits,
3537                                   strings,
3538                                   integers
3539                                  );
3540 
3541     /* The above call may have done all the hash fields, but not always, as
3542      * already explained.  If we need a second call it is always for the
3543      * NUMERIC fields */
3544     if (requires_2nd_localeconv) {
3545         populate_hash_from_localeconv(hv,
3546                                       numeric_locale,
3547                                       INDEX_TO_BIT(numeric_index),
3548                                       strings,
3549                                       NULL      /* There are No NUMERIC integer
3550                                                    fields */
3551                                      );
3552     }
3553 
3554     /* Here, the hash has been completely populated.
3555      *
3556      * Now go through all the items and:
3557      *  a) For string items, see if they should be marked as UTF-8 or not.
3558      *     This would have been more convenient and faster to do while
3559      *     populating the hash in the first place, but that operation has to be
3560      *     done within a critical section, keeping other threads from
3561      *     executing, so only the minimal amount of work necessary is done at
3562      *     that time.
3563      *  b) For integer items, convert the C CHAR_MAX value into -1.  Again,
3564      *     this could have been done in the critical section, but was deferred
3565      *     to here to keep to the bare minimum amount the time spent owning the
3566      *     processor. CHAR_MAX is a C concept for an 8-bit character type.
3567      *     Perl has no such type; the closest fit is a -1.
3568      *
3569      * XXX On unthreaded perls, this code could be #ifdef'd out, and the
3570      * corrections determined at hash population time, at an extra maintenance
3571      * cost which khw doesn't think is worth it
3572      */
3573     for (unsigned int i = 0; i < 2; i++) {  /* Try both types of strings */
3574         if (! strings[i]) {     /* Skip if no strings of this type */
3575             continue;
3576         }
3577 
3578         locale = (i == NUMERIC_STRING_OFFSET)
3579                  ? numeric_locale
3580                  : monetary_locale;
3581 
3582         locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3583 
3584 #  ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION
3585 
3586         /* It saves time in the loop below to have predetermined the UTF8ness
3587          * of the locale.  But only do so if the platform reliably has this
3588          * information; otherwise to do it, this could recurse indefinitely.
3589          *
3590          * When we don't do it here, it will be done on a per-element basis in
3591          * the loop.  The per-element check is intelligent enough to not
3592          * recurse */
3593 
3594         locale_is_utf8 = (is_locale_utf8(locale))
3595                          ? LOCALE_IS_UTF8
3596                          : LOCALE_NOT_UTF8;
3597 
3598         if (locale_is_utf8 == LOCALE_NOT_UTF8) {
3599             continue;   /* No string can be UTF-8 if the locale isn't */
3600         }
3601 
3602 #  endif
3603 
3604         /* Examine each string */
3605         while (1) {
3606             const char * name = strings[i]->name;
3607 
3608             if (! name) {   /* Reached the end */
3609                 break;
3610             }
3611 
3612             /* 'value' will contain the string that may need to be marked as
3613              * UTF-8 */
3614             SV ** value = hv_fetch(hv, name, strlen(name), true);
3615             if (! value) {
3616                 continue;
3617             }
3618 
3619             /* Determine if the string should be marked as UTF-8. */
3620             if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value),
3621                                                               locale_is_utf8,
3622                                                               NULL, 0)))
3623             {
3624                 SvUTF8_on(*value);
3625             }
3626 
3627             strings[i]++;   /* Iterate */
3628         }
3629     }   /* End of fixing up UTF8ness */
3630 
3631 
3632     /* Examine each integer */
3633     if (integers) while (1) {
3634         const char * name = integers->name;
3635 
3636         if (! name) {   /* Reached the end */
3637             break;
3638         }
3639 
3640         SV ** value = hv_fetch(hv, name, strlen(name), true);
3641         if (! value) {
3642             continue;
3643         }
3644 
3645         /* Change CHAR_MAX to -1 */
3646         if (SvIV(*value) == CHAR_MAX) {
3647             sv_setiv(*value, -1);
3648         }
3649 
3650         integers++;   /* Iterate */
3651     }
3652 
3653     return hv;
3654 }
3655 
3656 STATIC void
S_populate_hash_from_localeconv(pTHX_ HV * hv,const char * locale,const U32 which_mask,const lconv_offset_t * strings[2],const lconv_offset_t * integers)3657 S_populate_hash_from_localeconv(pTHX_ HV * hv,
3658 
3659                                       /* Switch to this locale to run
3660                                        * localeconv() from */
3661                                       const char * locale,
3662 
3663                                       /* bit mask of which categories to
3664                                        * populate */
3665                                       const U32 which_mask,
3666 
3667                                       /* strings[0] points the numeric
3668                                        * string fields; [1] to the monetary */
3669                                       const lconv_offset_t * strings[2],
3670 
3671                                       /* And to the monetary integer fields */
3672                                       const lconv_offset_t * integers)
3673 {
3674     PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV;
3675     PERL_UNUSED_ARG(which_mask);    /* Some configurations don't use this;
3676                                        complicated to figure out which */
3677 
3678     /* Run localeconv() and copy some or all of its results to the input 'hv'
3679      * hash.  Most localeconv() implementations return the values in a global
3680      * static buffer, so the operation must be performed in a critical section,
3681      * ending only after the copy is completed.  There are so many locks
3682      * because localeconv() deals with two categories, and returns in a single
3683      * global static buffer.  Some locks might be no-ops on this platform, but
3684      * not others.  We need to lock if any one isn't a no-op. */
3685 
3686 #  ifdef USE_LOCALE_CTYPE
3687 
3688     /* Some platforms require LC_CTYPE to be congruent with the category we are
3689      * looking for */
3690     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
3691 
3692 #  endif
3693 #  ifdef USE_LOCALE_NUMERIC
3694 
3695     /* We need to toggle to the underlying NUMERIC locale if we are getting
3696      * NUMERIC strings */
3697     const char * orig_NUMERIC_locale = NULL;
3698     if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
3699         LC_NUMERIC_LOCK(0);
3700         orig_NUMERIC_locale = toggle_locale_i(LC_NUMERIC_INDEX_, locale);
3701     }
3702 
3703 #    endif
3704 
3705     /* Finally ready to do the actual localeconv().  Lock to prevent other
3706      * accesses until we have made a copy of its returned static buffer */
3707     gwLOCALE_LOCK;
3708 
3709 #  ifdef TS_W32_BROKEN_LOCALECONV
3710 
3711     /* This is a workaround for another bug in Windows.  localeconv() was
3712      * broken with thread-safe locales prior to VS 15.  It looks at the global
3713      * locale instead of the thread one.  As a work-around, we toggle to the
3714      * global locale; populate the return; then toggle back.  We have to use
3715      * LC_ALL instead of the individual categories because of yet another bug
3716      * in Windows.  And this all has to be done in a critical section.
3717      *
3718      * This introduces a potential race with any other thread that has also
3719      * converted to use the global locale, and doesn't protect its locale calls
3720      * with mutexes.  khw can't think of any reason for a thread to do so on
3721      * Windows, as the locale API is the same regardless of thread-safety, except
3722      * if the code is ported from working on another platform where there might
3723      * be some reason to do this.  But this is typically due to some
3724      * alien-to-Perl library that thinks it owns locale setting.  Such a
3725      * library isn't likely to exist on Windows, so such an application is
3726      * unlikely to be run on Windows
3727      */
3728     bool restore_per_thread = FALSE;
3729 
3730     /* Save the per-thread locale state */
3731     const char * save_thread = querylocale_c(LC_ALL);
3732 
3733     /* Change to the global locale, and note if we already were there */
3734     if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
3735                          != _DISABLE_PER_THREAD_LOCALE)
3736     {
3737         restore_per_thread = TRUE;
3738     }
3739 
3740     /* Save the state of the global locale; then convert to our desired
3741      * state.  */
3742     const char * save_global = querylocale_c(LC_ALL);
3743     void_setlocale_c(LC_ALL, save_thread);
3744 
3745 #  endif  /* TS_W32_BROKEN_LOCALECONV */
3746 
3747     /* Finally, do the actual localeconv */
3748     const char *lcbuf_as_string = (const char *) localeconv();
3749 
3750     /* Fill in the string fields of the HV* */
3751     for (unsigned int i = 0; i < 2; i++) {
3752 
3753 #  ifdef USE_LOCALE_NUMERIC
3754 
3755         /* One iteration is only for the numeric string fields */
3756         if (   i == NUMERIC_STRING_OFFSET
3757             && (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_))  == 0)
3758         {
3759             continue;
3760         }
3761 
3762 #  endif
3763 #  ifdef USE_LOCALE_MONETARY
3764 
3765         /* The other iteration is only for the monetary string fields */
3766         if (   i == MONETARY_STRING_OFFSET
3767             && (which_mask & INDEX_TO_BIT(LC_MONETARY_INDEX_)) == 0)
3768         {
3769             continue;
3770         }
3771 
3772 #  endif
3773 
3774         /* For each field for the given category ... */
3775         const lconv_offset_t * category_strings = strings[i];
3776         while (1) {
3777             const char * name = category_strings->name;
3778             if (! name) {   /* Quit at the end */
3779                 break;
3780             }
3781 
3782             /* we have set things up so that we know where in the returned
3783              * structure, when viewed as a string, the corresponding value is.
3784              * */
3785             const char *value = *((const char **)(  lcbuf_as_string
3786                                                   + category_strings->offset));
3787 
3788             /* Set to get next string on next iteration */
3789             category_strings++;
3790 
3791             /* Skip if this platform doesn't have this field. */
3792             if (! value) {
3793                 continue;
3794             }
3795 
3796             /* Copy to the hash */
3797             (void) hv_store(hv,
3798                             name, strlen(name),
3799                             newSVpv(value, strlen(value)),
3800                             0);
3801         }
3802 
3803         /* Add any int fields to the HV* */
3804         if (i == MONETARY_STRING_OFFSET && integers) {
3805             while (integers->name) {
3806                 const char value = *((const char *)(  lcbuf_as_string
3807                                                     + integers->offset));
3808                 (void) hv_store(hv, integers->name,
3809                                 strlen(integers->name), newSViv(value), 0);
3810                 integers++;
3811             }
3812         }
3813     }   /* End of loop through the fields */
3814 
3815     /* Done with copying to the hash.  Can unwind the critical section locks */
3816 
3817 #  ifdef TS_W32_BROKEN_LOCALECONV
3818 
3819     /* Restore the global locale's prior state */
3820     void_setlocale_c(LC_ALL, save_global);
3821 
3822     /* And back to per-thread locales */
3823     if (restore_per_thread) {
3824         _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3825     }
3826 
3827     /* Restore the per-thread locale state */
3828     void_setlocale_c(LC_ALL, save_thread);
3829 
3830 #  endif  /* TS_W32_BROKEN_LOCALECONV */
3831 
3832     gwLOCALE_UNLOCK;    /* Finished with the critical section of a
3833                            globally-accessible buffer */
3834 
3835 #  ifdef USE_LOCALE_NUMERIC
3836 
3837     restore_toggled_locale_i(LC_NUMERIC_INDEX_, orig_NUMERIC_locale);
3838     if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
3839         LC_NUMERIC_UNLOCK;
3840     }
3841 
3842 #  endif
3843 #  ifdef USE_LOCALE_CTYPE
3844 
3845     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
3846 
3847 #  endif
3848 
3849 }
3850 
3851 #endif /* defined(HAS_LOCALECONV) */
3852 #ifndef HAS_SOME_LANGINFO
3853 
3854 typedef int nl_item;    /* Substitute 'int' for emulated nl_langinfo() */
3855 
3856 #endif
3857 
3858 /*
3859 
3860 =for apidoc      Perl_langinfo
3861 =for apidoc_item Perl_langinfo8
3862 
3863 C<Perl_langinfo> is an (almost) drop-in replacement for the system
3864 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
3865 the same information.  But it is more thread-safe than regular
3866 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
3867 code, and can be used on systems that lack a native C<nl_langinfo>.
3868 
3869 However, you should instead use the improved version of this:
3870 L</Perl_langinfo8>, which behaves identically except for an additional
3871 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
3872 returns to you how you should treat the returned string with regards to it
3873 being encoded in UTF-8 or not.
3874 
3875 Concerning the differences between these and plain C<nl_langinfo()>:
3876 
3877 =over
3878 
3879 =item a.
3880 
3881 C<Perl_langinfo8> has an extra parameter, described above.  Besides this, the
3882 other reason they aren't quite a drop-in replacement is actually an advantage.
3883 The C<const>ness of the return allows the compiler to catch attempts to write
3884 into the returned buffer, which is illegal and could cause run-time crashes.
3885 
3886 =item b.
3887 
3888 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
3889 without you having to write extra code.  The reason for the extra code would be
3890 because these are from the C<LC_NUMERIC> locale category, which is normally
3891 kept set by Perl so that the radix is a dot, and the separator is the empty
3892 string, no matter what the underlying locale is supposed to be, and so to get
3893 the expected results, you have to temporarily toggle into the underlying
3894 locale, and later toggle back.  (You could use plain C<nl_langinfo> and
3895 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
3896 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
3897 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
3898 (decimal point) character to be a dot.)
3899 
3900 =item c.
3901 
3902 The system function they replace can have its static return buffer trashed,
3903 not only by a subsequent call to that function, but by a C<freelocale>,
3904 C<setlocale>, or other locale change.  The returned buffer of these functions
3905 is not changed until the next call to one or the other, so the buffer is never
3906 in a trashed state.
3907 
3908 =item d.
3909 
3910 The return buffer is per-thread, so it also is never overwritten by a call to
3911 these functions from another thread;  unlike the function it replaces.
3912 
3913 =item e.
3914 
3915 But most importantly, they work on systems that don't have C<nl_langinfo>, such
3916 as Windows, hence making your code more portable.  Of the fifty-some possible
3917 items specified by the POSIX 2008 standard,
3918 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
3919 only one is completely unimplemented, though on non-Windows platforms, another
3920 significant one is not fully implemented).  They use various techniques to
3921 recover the other items, including calling C<L<localeconv(3)>>, and
3922 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
3923 available.  Later C<strftime()> versions have additional capabilities; What the
3924 C locale yields or C<""> is returned for any item not available on your system.
3925 
3926 It is important to note that, when called with an item that is recovered by
3927 using C<localeconv>, the buffer from any previous explicit call to
3928 C<L<localeconv(3)>> will be overwritten.  But you shouldn't be using
3929 C<localeconv> anyway because it is is very much not thread-safe, and suffers
3930 from the same problems outlined in item 'b.' above for the fields it returns that
3931 are controlled by the LC_NUMERIC locale category.  Instead, avoid all of those
3932 problems by calling L</Perl_localeconv>, which is thread-safe; or by using the
3933 methods given in L<perlcall>  to call
3934 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
3935 
3936 =back
3937 
3938 The details for those items which may deviate from what this emulation returns
3939 and what a native C<nl_langinfo()> would return are specified in
3940 L<I18N::Langinfo>.
3941 
3942 When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
3943 have a native C<nl_langinfo()>, you must
3944 
3945  #include "perl_langinfo.h"
3946 
3947 before the C<perl.h> C<#include>.  You can replace your F<langinfo.h>
3948 C<#include> with this one.  (Doing it this way keeps out the symbols that plain
3949 F<langinfo.h> would try to import into the namespace for code that doesn't need
3950 it.)
3951 
3952 =cut
3953 
3954 */
3955 
3956 const char *
Perl_langinfo(const nl_item item)3957 Perl_langinfo(const nl_item item)
3958 {
3959     return Perl_langinfo8(item, NULL);
3960 }
3961 
3962 const char *
Perl_langinfo8(const nl_item item,utf8ness_t * utf8ness)3963 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
3964 {
3965     dTHX;
3966     unsigned cat_index;
3967 
3968     PERL_ARGS_ASSERT_PERL_LANGINFO8;
3969 
3970     if (utf8ness) {     /* Assume for now */
3971         *utf8ness = UTF8NESS_IMMATERIAL;
3972     }
3973 
3974     /* Find the locale category that controls the input 'item'.  If we are not
3975      * paying attention to that category, instead return a default value.  Also
3976      * return the default value if there is no way for us to figure out the
3977      * correct value.  If we have some form of nl_langinfo(), we can always
3978      * figure it out, but lacking that, there may be alternative methods that
3979      * can be used to recover most of the possible items.  Some of those
3980      * methods need libc functions, which may or may not be available.  If
3981      * unavailable, we can't compute the correct value, so must here return the
3982      * default. */
3983     switch (item) {
3984 
3985       case CODESET:
3986 
3987 #ifdef USE_LOCALE_CTYPE
3988 
3989         cat_index = LC_CTYPE_INDEX_;
3990         break;
3991 
3992 #else
3993         return C_codeset;
3994 #endif
3995 #if defined(USE_LOCALE_MESSAGES) && defined(HAS_SOME_LANGINFO)
3996 
3997       case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
3998         cat_index = LC_MESSAGES_INDEX_;
3999         break;
4000 #else
4001       case YESEXPR:   return "^[+1yY]";
4002       case YESSTR:    return "yes";
4003       case NOEXPR:    return "^[-0nN]";
4004       case NOSTR:     return "no";
4005 #endif
4006 
4007       case CRNCYSTR:
4008 
4009 #if  defined(USE_LOCALE_MONETARY)                                   \
4010  && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV))
4011 
4012         cat_index = LC_MONETARY_INDEX_;
4013         break;
4014 #else
4015         return "-";
4016 #endif
4017 
4018       case RADIXCHAR:
4019 
4020 #ifdef CAN_CALCULATE_RADIX
4021 
4022         cat_index = LC_NUMERIC_INDEX_;
4023         break;
4024 #else
4025         return C_decimal_point;
4026 #endif
4027 
4028       case THOUSEP:
4029 
4030 #if  defined(USE_LOCALE_NUMERIC)                                    \
4031  && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV))
4032 
4033         cat_index = LC_NUMERIC_INDEX_;
4034         break;
4035 #else
4036         return C_thousands_sep;
4037 #endif
4038 
4039 /* The other possible items are all in LC_TIME. */
4040 #ifdef USE_LOCALE_TIME
4041 
4042       default:
4043         cat_index = LC_TIME_INDEX_;
4044         break;
4045 
4046 #endif
4047 #if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO)
4048 
4049     /* If not using LC_TIME, hard code the rest.  Or, if there is no
4050      * nl_langinfo(), we use strftime() as an alternative, and it is missing
4051      * functionality to get every single one, so hard-code those */
4052 
4053       case ERA: return "";  /* Unimplemented; for use with strftime() %E
4054                                modifier */
4055 
4056       /* These formats are defined by C89, so we assume that strftime supports
4057        * them, and so are returned unconditionally; they may not be what the
4058        * locale actually says, but should give good enough results for someone
4059        * using them as formats (as opposed to trying to parse them to figure
4060        * out what the locale says).  The other format items are actually tested
4061        * to verify they work on the platform */
4062       case D_FMT:         return "%x";
4063       case T_FMT:         return "%X";
4064       case D_T_FMT:       return "%c";
4065 
4066 #  if defined(WIN32) || ! defined(USE_LOCALE_TIME)
4067 
4068       /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions
4069        * that would allow it to recover these */
4070       case ERA_D_FMT:     return "%x";
4071       case ERA_T_FMT:     return "%X";
4072       case ERA_D_T_FMT:   return "%c";
4073       case ALT_DIGITS:    return "0";
4074 
4075 #  endif
4076 #  ifndef USE_LOCALE_TIME
4077 
4078       case T_FMT_AMPM:    return "%r";
4079       case ABDAY_1:       return "Sun";
4080       case ABDAY_2:       return "Mon";
4081       case ABDAY_3:       return "Tue";
4082       case ABDAY_4:       return "Wed";
4083       case ABDAY_5:       return "Thu";
4084       case ABDAY_6:       return "Fri";
4085       case ABDAY_7:       return "Sat";
4086       case AM_STR:        return "AM";
4087       case PM_STR:        return "PM";
4088       case ABMON_1:       return "Jan";
4089       case ABMON_2:       return "Feb";
4090       case ABMON_3:       return "Mar";
4091       case ABMON_4:       return "Apr";
4092       case ABMON_5:       return "May";
4093       case ABMON_6:       return "Jun";
4094       case ABMON_7:       return "Jul";
4095       case ABMON_8:       return "Aug";
4096       case ABMON_9:       return "Sep";
4097       case ABMON_10:      return "Oct";
4098       case ABMON_11:      return "Nov";
4099       case ABMON_12:      return "Dec";
4100       case DAY_1:         return "Sunday";
4101       case DAY_2:         return "Monday";
4102       case DAY_3:         return "Tuesday";
4103       case DAY_4:         return "Wednesday";
4104       case DAY_5:         return "Thursday";
4105       case DAY_6:         return "Friday";
4106       case DAY_7:         return "Saturday";
4107       case MON_1:         return "January";
4108       case MON_2:         return "February";
4109       case MON_3:         return "March";
4110       case MON_4:         return "April";
4111       case MON_5:         return "May";
4112       case MON_6:         return "June";
4113       case MON_7:         return "July";
4114       case MON_8:         return "August";
4115       case MON_9:         return "September";
4116       case MON_10:        return "October";
4117       case MON_11:        return "November";
4118       case MON_12:        return "December";
4119 
4120 #  endif
4121 #endif
4122 
4123     } /* End of switch on item */
4124 
4125 #ifndef USE_LOCALE
4126 
4127     Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item);
4128     NOT_REACHED; /* NOTREACHED */
4129     PERL_UNUSED_VAR(cat_index);
4130 
4131 #else
4132 #  ifdef USE_LOCALE_NUMERIC
4133 
4134     /* Use either the underlying numeric, or the other underlying categories */
4135     if (cat_index == LC_NUMERIC_INDEX_) {
4136         return my_langinfo_c(item, LC_NUMERIC, PL_numeric_name,
4137                              &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
4138     }
4139     else
4140 
4141 #  endif
4142 
4143     {
4144         return my_langinfo_i(item, cat_index, querylocale_i(cat_index),
4145                              &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
4146     }
4147 
4148 #endif
4149 
4150 }
4151 
4152 #ifdef USE_LOCALE
4153 
4154 /* There are several implementations of my_langinfo, depending on the
4155  * Configuration.  They all share the same beginning of the function */
4156 STATIC const char *
S_my_langinfo_i(pTHX_ const nl_item item,const unsigned int cat_index,const char * locale,const char ** retbufp,Size_t * retbuf_sizep,utf8ness_t * utf8ness)4157 S_my_langinfo_i(pTHX_
4158                 const nl_item item,           /* The item to look up */
4159                 const unsigned int cat_index, /* The locale category that
4160                                                  controls it */
4161                 /* The locale to look up 'item' in. */
4162                 const char * locale,
4163 
4164                 /* Where to store the result, and where the size of that buffer
4165                  * is stored, updated on exit. retbuf_sizep may be NULL for an
4166                  * empty-on-entry, single use buffer whose size we don't need
4167                  * to keep track of */
4168                 const char ** retbufp,
4169                 Size_t * retbuf_sizep,
4170 
4171                 /* If not NULL, the location to store the UTF8-ness of 'item's
4172                  * value, as documented */
4173                 utf8ness_t * utf8ness)
4174 {
4175     const char * retval = NULL;
4176 
4177     PERL_ARGS_ASSERT_MY_LANGINFO_I;
4178     assert(cat_index < NOMINAL_LC_ALL_INDEX);
4179 
4180     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4181                            "Entering my_langinfo item=%ld, using locale %s\n",
4182                            (long) item, locale));
4183 /*--------------------------------------------------------------------------*/
4184 /* Above is the common beginning to all the implementations of my_langinfo().
4185  * Below are the various completions.
4186  *
4187  * Some platforms don't deal well with non-ASCII strings in locale X when
4188  * LC_CTYPE is not in X.  (Actually it is probably when X is UTF-8 and LC_CTYPE
4189  * isn't, or vice versa).  There is explicit code to bring the categories into
4190  * sync.  This doesn't seem to be a problem with nl_langinfo(), so that
4191  * implementation doesn't currently worry about it.  But it is a problem on
4192  * Windows boxes, which don't have nl_langinfo(). */
4193 
4194 /*--------------------------------------------------------------------------*/
4195 #  if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available.  */
4196 #    ifdef USE_LOCALE_CTYPE
4197 
4198     /* Ths function sorts out if things actually have to be switched or not,
4199      * for both calls. */
4200     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
4201 
4202 #    endif
4203 
4204     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
4205 
4206     gwLOCALE_LOCK;
4207     retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
4208     gwLOCALE_UNLOCK;
4209 
4210     if (utf8ness) {
4211         *utf8ness = get_locale_string_utf8ness_i(retval,
4212                                                  LOCALE_UTF8NESS_UNKNOWN,
4213                                                  locale, cat_index);
4214     }
4215 
4216     restore_toggled_locale_i(cat_index, orig_switched_locale);
4217 
4218 #    ifdef USE_LOCALE_CTYPE
4219     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
4220 #    endif
4221 
4222     return retval;
4223 /*--------------------------------------------------------------------------*/
4224 #  else   /* Below, emulate nl_langinfo as best we can */
4225 
4226     /* And the third and final completion is where we have to emulate
4227      * nl_langinfo().  There are various possibilities depending on the
4228      * Configuration */
4229 
4230 #    ifdef USE_LOCALE_CTYPE
4231 
4232     const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
4233 
4234 #    endif
4235 
4236     const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
4237 
4238     /* Here, we are in the locale we want information about */
4239 
4240     /* Almost all the items will have ASCII return values.  Set that here, and
4241      * override if necessary */
4242     utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
4243 
4244     switch (item) {
4245       default:
4246         assert(item < 0);   /* Make sure using perl_langinfo.h */
4247         retval = "";
4248         break;
4249 
4250       case RADIXCHAR:
4251 
4252 #    if      defined(HAS_SNPRINTF)                                              \
4253        && (! defined(HAS_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
4254 
4255         {
4256             /* snprintf() can be used to find the radix character by outputting
4257              * a known simple floating point number to a buffer, and parsing
4258              * it, inferring the radix as the bytes separating the integer and
4259              * fractional parts.  But localeconv() is more direct, not
4260              * requiring inference, so use it instead of the code just below,
4261              * if (likely) it is available and works ok */
4262 
4263             char * floatbuf = NULL;
4264             const Size_t initial_size = 10;
4265 
4266             Newx(floatbuf, initial_size, char);
4267 
4268             /* 1.5 is exactly representable on binary computers */
4269             Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
4270 
4271             /* If our guess wasn't big enough, increase and try again, based on
4272              * the real number that snprintf() is supposed to return */
4273             if (UNLIKELY(needed_size >= initial_size)) {
4274                 needed_size++;  /* insurance */
4275                 Renew(floatbuf, needed_size, char);
4276                 Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f", 1.5);
4277                 assert(new_needed <= needed_size);
4278                 needed_size = new_needed;
4279             }
4280 
4281             char * s = floatbuf;
4282             char * e = floatbuf + needed_size;
4283 
4284             /* Find the '1' */
4285             while (s < e && *s != '1') {
4286                 s++;
4287             }
4288 
4289             if (LIKELY(s < e)) {
4290                 s++;
4291             }
4292 
4293             /* Find the '5' */
4294             char * item_start = s;
4295             while (s < e && *s != '5') {
4296                 s++;
4297             }
4298 
4299             /* Everything in between is the radix string */
4300             if (LIKELY(s < e)) {
4301                 *s = '\0';
4302                 retval = save_to_buffer(item_start, retbufp, retbuf_sizep);
4303                 Safefree(floatbuf);
4304 
4305                 if (utf8ness) {
4306                     is_utf8 = get_locale_string_utf8ness_i(retval,
4307                                                         LOCALE_UTF8NESS_UNKNOWN,
4308                                                         locale, cat_index);
4309                 }
4310 
4311                 break;
4312             }
4313 
4314             Safefree(floatbuf);
4315         }
4316 
4317 #      ifdef HAS_LOCALECONV /* snprintf() failed; drop down to use
4318                                localeconv() */
4319 
4320         /* FALLTHROUGH */
4321 
4322 #      else                      /* snprintf() failed and no localeconv() */
4323 
4324         retval = C_decimal_point;
4325         break;
4326 
4327 #      endif
4328 #    endif
4329 #    ifdef HAS_LOCALECONV
4330 
4331     /* These items are available from localeconv().  (To avoid using
4332      * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
4333      * GetCurrencyFormat; patches welcome) */
4334 
4335 #    define P_CS_PRECEDES    "p_cs_precedes"
4336 #    define CURRENCY_SYMBOL  "currency_symbol"
4337 
4338    /* case RADIXCHAR:   // May drop down to here in some configurations */
4339       case THOUSEP:
4340       case CRNCYSTR:
4341        {
4342 
4343         /* The hash gets populated with just the field(s) related to 'item'. */
4344         HV * result_hv = my_localeconv(item);
4345 
4346         SV* string;
4347         if (item != CRNCYSTR) {
4348 
4349             /* These items have been populated with just one key => value */
4350             (void) hv_iterinit(result_hv);
4351             HE * entry = hv_iternext(result_hv);
4352             string = hv_iterval(result_hv, entry);
4353         }
4354         else {
4355 
4356             /* But CRNCYSTR localeconv() returns a slightly different value
4357              * than the nl_langinfo() API calls for, so have to modify this one
4358              * to conform.  We need another value from localeconv() to know
4359              * what to change it to.  my_localeconv() has populated the hash
4360              * with exactly both fields.  Delete this one, leaving just the
4361              * CRNCYSTR one in the hash */
4362             SV* precedes = hv_delete(result_hv,
4363                                      P_CS_PRECEDES, STRLENs(P_CS_PRECEDES),
4364                                      0);
4365             if (! precedes) {
4366                     locale_panic_("my_localeconv() unexpectedly didn't return"
4367                                   " a value for " P_CS_PRECEDES);
4368             }
4369 
4370             /* The modification is to prefix the localeconv() return with a
4371              * single byte, calculated as follows: */
4372             char prefix = (LIKELY(SvIV(precedes) != -1))
4373                           ? ((precedes != 0) ?  '-' : '+')
4374 
4375                             /* khw couldn't find any documentation that
4376                              * CHAR_MAX (which we modify to -1) is the signal,
4377                              * but cygwin uses it thusly, and it makes sense
4378                              * given that CHAR_MAX indicates the value isn't
4379                              * used, so it neither precedes nor succeeds */
4380                           : '.';
4381 
4382             /* Now get CRNCYSTR */
4383             (void) hv_iterinit(result_hv);
4384             HE * entry = hv_iternext(result_hv);
4385             string = hv_iterval(result_hv, entry);
4386 
4387             /* And perform the modification */
4388             Perl_sv_setpvf(aTHX_ string, "%c%s", prefix, SvPV_nolen(string));
4389         }
4390 
4391         /* Here, 'string' contains the value we want to return */
4392         retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
4393 
4394         if (utf8ness) {
4395             is_utf8 = (SvUTF8(string))
4396                       ? UTF8NESS_YES
4397                       : (is_utf8_invariant_string( (U8 *) retval,
4398                                                   strlen(retval)))
4399                         ? UTF8NESS_IMMATERIAL
4400                         : UTF8NESS_NO;
4401         }
4402 
4403         break;
4404 
4405        }
4406 
4407 #    endif  /* Some form of localeconv */
4408 #    ifdef HAS_STRFTIME
4409 
4410       /* These formats are only available in later strftime's */
4411       case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
4412 
4413       /* The rest can be gotten from most versions of strftime(). */
4414       case ABDAY_1: case ABDAY_2: case ABDAY_3:
4415       case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
4416       case ALT_DIGITS:
4417       case AM_STR: case PM_STR:
4418       case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
4419       case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
4420       case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
4421       case DAY_1: case DAY_2: case DAY_3: case DAY_4:
4422       case DAY_5: case DAY_6: case DAY_7:
4423       case MON_1: case MON_2: case MON_3: case MON_4:
4424       case MON_5: case MON_6: case MON_7: case MON_8:
4425       case MON_9: case MON_10: case MON_11: case MON_12:
4426         {
4427             const char * format;
4428             bool return_format = FALSE;
4429             int mon = 0;
4430             int mday = 1;
4431             int hour = 6;
4432 
4433             GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
4434 
4435             switch (item) {
4436               default:
4437                 locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item));
4438                 NOT_REACHED; /* NOTREACHED */
4439 
4440               case PM_STR: hour = 18;
4441               case AM_STR:
4442                 format = "%p";
4443                 break;
4444               case ABDAY_7: mday++;
4445               case ABDAY_6: mday++;
4446               case ABDAY_5: mday++;
4447               case ABDAY_4: mday++;
4448               case ABDAY_3: mday++;
4449               case ABDAY_2: mday++;
4450               case ABDAY_1:
4451                 format = "%a";
4452                 break;
4453               case DAY_7: mday++;
4454               case DAY_6: mday++;
4455               case DAY_5: mday++;
4456               case DAY_4: mday++;
4457               case DAY_3: mday++;
4458               case DAY_2: mday++;
4459               case DAY_1:
4460                 format = "%A";
4461                 break;
4462               case ABMON_12: mon++;
4463               case ABMON_11: mon++;
4464               case ABMON_10: mon++;
4465               case ABMON_9:  mon++;
4466               case ABMON_8:  mon++;
4467               case ABMON_7:  mon++;
4468               case ABMON_6:  mon++;
4469               case ABMON_5:  mon++;
4470               case ABMON_4:  mon++;
4471               case ABMON_3:  mon++;
4472               case ABMON_2:  mon++;
4473               case ABMON_1:
4474                 format = "%b";
4475                 break;
4476               case MON_12: mon++;
4477               case MON_11: mon++;
4478               case MON_10: mon++;
4479               case MON_9:  mon++;
4480               case MON_8:  mon++;
4481               case MON_7:  mon++;
4482               case MON_6:  mon++;
4483               case MON_5:  mon++;
4484               case MON_4:  mon++;
4485               case MON_3:  mon++;
4486               case MON_2:  mon++;
4487               case MON_1:
4488                 format = "%B";
4489                 break;
4490               case T_FMT_AMPM:
4491                 format = "%r";
4492                 return_format = TRUE;
4493                 break;
4494               case ERA_D_FMT:
4495                 format = "%Ex";
4496                 return_format = TRUE;
4497                 break;
4498               case ERA_T_FMT:
4499                 format = "%EX";
4500                 return_format = TRUE;
4501                 break;
4502               case ERA_D_T_FMT:
4503                 format = "%Ec";
4504                 return_format = TRUE;
4505                 break;
4506               case ALT_DIGITS:
4507                 format = "%Ow";	/* Find the alternate digit for 0 */
4508                 break;
4509             }
4510 
4511             GCC_DIAG_RESTORE_STMT;
4512 
4513             /* The year was deliberately chosen so that January 1 is on the
4514              * first day of the week.  Since we're only getting one thing at a
4515              * time, it all works */
4516             const char * temp = my_strftime8_temp(format, 30, 30, hour, mday, mon,
4517                                              2011, 0, 0, 0, &is_utf8);
4518             retval = save_to_buffer(temp, retbufp, retbuf_sizep);
4519             Safefree(temp);
4520 
4521             /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
4522              * format for wday 0.  If the value is the same as the normal 0,
4523              * there isn't an alternate, so clear the buffer.
4524              *
4525              * (wday was chosen because its range is all a single digit.
4526              * Things like tm_sec have two digits as the minimum: '00'.) */
4527             if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
4528                 retval = "";
4529                 break;
4530             }
4531 
4532             /* ALT_DIGITS is problematic.  Experiments on it showed that
4533              * strftime() did not always work properly when going from alt-9 to
4534              * alt-10.  Only a few locales have this item defined, and in all
4535              * of them on Linux that khw was able to find, nl_langinfo() merely
4536              * returned the alt-0 character, possibly doubled.  Most Unicode
4537              * digits are in blocks of 10 consecutive code points, so that is
4538              * sufficient information for such scripts, as we can infer alt-1,
4539              * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
4540              * returned, and the CJK digits are not in code point order, so you
4541              * can't really infer anything.  The localedef for this locale did
4542              * specify the succeeding digits, so that strftime() works properly
4543              * on them, without needing to infer anything.  But the
4544              * nl_langinfo() return did not give sufficient information for the
4545              * caller to understand what's going on.  So until there is
4546              * evidence that it should work differently, this returns the alt-0
4547              * string for ALT_DIGITS. */
4548 
4549             if (return_format) {
4550 
4551                 /* If to return the format, not the value, overwrite the buffer
4552                  * with it.  But some strftime()s will keep the original format
4553                  * if illegal, so change those to "" */
4554                 if (strEQ(*retbufp, format)) {
4555                     retval = "";
4556                 }
4557                 else {
4558                     retval = format;
4559                 }
4560 
4561                 /* A format is always in ASCII */
4562                 is_utf8 = UTF8NESS_IMMATERIAL;
4563             }
4564 
4565             break;
4566         }
4567 
4568 #    endif
4569 
4570       case CODESET:
4571 
4572         /* The trivial case */
4573         if (isNAME_C_OR_POSIX(locale)) {
4574             retval = C_codeset;
4575             break;
4576         }
4577 
4578 #    ifdef WIN32
4579 
4580         /* This function retrieves the code page.  It is subject to change, but
4581          * is documented and has been stable for many releases */
4582         UINT ___lc_codepage_func(void);
4583 
4584         retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()),
4585                                 retbufp, retbuf_sizep);
4586         DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
4587                                                locale, retval));
4588         break;
4589 
4590 #    else
4591 
4592         /* The codeset is important, but khw did not figure out a way for it to
4593          * be retrieved on non-Windows boxes without nl_langinfo().  But even
4594          * if we can't get it directly, we can usually determine if it is a
4595          * UTF-8 locale or not.  If it is UTF-8, we (correctly) use that for
4596          * the code set. */
4597 
4598 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4599 
4600         /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT
4601          * CHARACTER as that Unicode code point, this has to be a UTF-8 locale.
4602          * */
4603         wchar_t wc = 0;
4604         (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4605         int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4606                               STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4607         if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
4608             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4609                                    "mbtowc returned REPLACEMENT\n"));
4610             retval = "UTF-8";
4611             break;
4612         }
4613 
4614         /* Here, it isn't a UTF-8 locale. */
4615 
4616 #    else   /* mbtowc() is not available. */
4617 
4618         /* Sling together several possibilities, depending on platform
4619          * capabilities and what we found.
4620          *
4621          * For non-English locales or non-dollar currency locales, we likely
4622          * will find out whether a locale is UTF-8 or not */
4623 
4624         utf8ness_t is_utf8 = UTF8NESS_UNKNOWN;
4625         const char * scratch_buf = NULL;
4626 
4627 #      if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
4628 
4629         /* Can't use this method unless localeconv() is available, as that's
4630          * the way we find out the currency symbol. */
4631 
4632         /* First try looking at the currency symbol (via a recursive call) to
4633          * see if it disambiguates things.  Often that will be in the native
4634          * script, and if the symbol isn't legal UTF-8, we know that the locale
4635          * isn't either. */
4636         (void) my_langinfo_c(CRNCYSTR, LC_MONETARY, locale, &scratch_buf, NULL,
4637                              &is_utf8);
4638         Safefree(scratch_buf);
4639 
4640 #      endif
4641 #      ifdef USE_LOCALE_TIME
4642 
4643         /* If we have ruled out being UTF-8, no point in checking further. */
4644         if (is_utf8 != UTF8NESS_NO) {
4645 
4646             /* But otherwise do check more.  This is done even if the currency
4647              * symbol looks to be UTF-8, just in case that's a false positive.
4648              *
4649              * Look at the LC_TIME entries, like the names of the months or
4650              * weekdays.  We quit at the first one that is illegal UTF-8 */
4651 
4652             utf8ness_t this_is_utf8 = UTF8NESS_UNKNOWN;
4653             const int times[] = {
4654                 DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
4655                 MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
4656                                             MON_9, MON_10, MON_11, MON_12,
4657                 ALT_DIGITS, AM_STR, PM_STR,
4658                 ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6,
4659                                                              ABDAY_7,
4660                 ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
4661                 ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
4662             };
4663 
4664             /* The code in the recursive call can handle switching the locales,
4665              * but by doing it here, we avoid switching each iteration of the
4666              * loop */
4667             const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
4668 
4669             for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(times); i++) {
4670                 scratch_buf = NULL;
4671                 (void) my_langinfo_c(times[i], LC_TIME, locale, &scratch_buf,
4672                                      NULL, &this_is_utf8);
4673                 Safefree(scratch_buf);
4674                 if (this_is_utf8 == UTF8NESS_NO) {
4675                     is_utf8 = UTF8NESS_NO;
4676                     break;
4677                 }
4678 
4679                 if (this_is_utf8 == UTF8NESS_YES) {
4680                     is_utf8 = UTF8NESS_YES;
4681                 }
4682             }
4683 
4684             /* Here we have gone through all the LC_TIME elements.  is_utf8 has
4685              * been set as follows:
4686              *      UTF8NESS_NO           If at least one is't legal UTF-8
4687              *      UTF8NESS_IMMMATERIAL  If all are ASCII
4688              *      UTF8NESS_YES          If all are legal UTF-8 (including
4689              *                            ASCIIi), and at least one isn't
4690              *                            ASCII. */
4691 
4692             restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
4693         }
4694 
4695 #      endif    /* LC_TIME */
4696 
4697         /* If nothing examined above rules out it being UTF-8, and at least one
4698          * thing fits as UTF-8 (and not plain ASCII), assume the codeset is
4699          * UTF-8. */
4700         if (is_utf8 == UTF8NESS_YES) {
4701             retval = "UTF-8";
4702             break;
4703         }
4704 
4705         /* Here, nothing examined indicates that the codeset is UTF-8.  But
4706          * what is it?  The other locale categories are not likely to be of
4707          * further help:
4708          *
4709          * LC_NUMERIC   Only a few locales in the world have a non-ASCII radix
4710          *              or group separator.
4711          * LC_CTYPE     This code wouldn't be compiled if mbtowc() existed and
4712          *              was reliable.  This is unlikely in C99.  There are
4713          *              other functions that could be used instead, but are
4714          *              they going to exist, and be able to distinguish between
4715          *              UTF-8 and 8859-1?  Deal with this only if it becomes
4716          *              necessary.
4717          * LC_MESSAGES  The strings returned from strerror() would seem likely
4718          *              candidates, but experience has shown that many systems
4719          *              don't actually have translations installed for them.
4720          *              They are instead always in English, so everything in
4721          *              them is ASCII, which is of no help to us.  A Configure
4722          *              probe could possibly be written to see if this platform
4723          *              has non-ASCII error messages.  But again, wait until it
4724          *              turns out to be an actual problem. */
4725 
4726 #    endif    /* ! mbtowc() */
4727 
4728         /* Rejoin the mbtowc available/not-available cases.
4729          *
4730          * We got here only because we haven't been able to find the codeset.
4731          * The only other option khw could think of is to see if the codeset is
4732          * part of the locale name.  This is very less than ideal; often there
4733          * is no code set in the name; and at other times they even lie.
4734          *
4735          * But there is an XPG standard syntax, which many locales follow:
4736          *
4737          * language[_territory[.codeset]][@modifier]
4738          *
4739          * So we take the part between the dot and any '@' */
4740         retval = (const char *) strchr(locale, '.');
4741         if (! retval) {
4742             retval = "";  /* Alas, no dot */
4743             break;
4744         }
4745 
4746         /* Don't include the dot */
4747         retval++;
4748 
4749         /* And stop before any '@' */
4750         const char * modifier = strchr(retval, '@');
4751         if (modifier) {
4752             char * code_set_name;
4753             const Size_t name_len = modifier - retval;
4754             Newx(code_set_name, name_len + 1, char);         /* +1 for NUL */
4755             my_strlcpy(code_set_name, retval, name_len + 1);
4756             SAVEFREEPV(code_set_name);
4757             retval = code_set_name;
4758         }
4759 
4760 #      if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4761 
4762         /* When these functions, are available, they were tried earlier and
4763          * indicated that the locale did not act like a proper UTF-8 one.  So
4764          * if it claims to be UTF-8, it is a lie */
4765         if (is_codeset_name_UTF8(retval)) {
4766             retval = "";
4767             break;
4768         }
4769 
4770 #      endif
4771 
4772         /* Otherwise the code set name is considered to be everything between
4773          * the dot and the '@' */
4774         retval = save_to_buffer(retval, retbufp, retbuf_sizep);
4775 
4776         break;
4777 
4778 #    endif
4779 
4780     } /* Giant switch() of nl_langinfo() items */
4781 
4782     restore_toggled_locale_i(cat_index, orig_switched_locale);
4783 
4784 #    ifdef USE_LOCALE_CTYPE
4785     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
4786 #    endif
4787 
4788     if (utf8ness) {
4789         *utf8ness = is_utf8;
4790     }
4791 
4792     return retval;
4793 
4794 #  endif    /* All the implementations of my_langinfo() */
4795 
4796 /*--------------------------------------------------------------------------*/
4797 
4798 }   /* my_langinfo() */
4799 
4800 #endif      /* USE_LOCALE */
4801 
4802 char *
Perl_my_strftime(pTHX_ const char * fmt,int sec,int min,int hour,int mday,int mon,int year,int wday,int yday,int isdst)4803 Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour, int mday, int mon, int year, int wday, int yday, int isdst)
4804 {
4805 #ifdef HAS_STRFTIME
4806 
4807 /*
4808 =for apidoc_section $time
4809 =for apidoc      my_strftime
4810 
4811 strftime(), but with a different API so that the return value is a pointer
4812 to the formatted result (which MUST be arranged to be FREED BY THE
4813 CALLER).  This allows this function to increase the buffer size as needed,
4814 so that the caller doesn't have to worry about that.
4815 
4816 On failure it returns NULL.
4817 
4818 Note that yday and wday effectively are ignored by this function, as
4819 mini_mktime() overwrites them.
4820 
4821 Also note that it is always executed in the underlying C<LC_TIME> locale of
4822 the program, giving results based on that locale.
4823 
4824 =cut
4825  */
4826     PERL_ARGS_ASSERT_MY_STRFTIME;
4827 
4828     /* An empty format yields an empty result */
4829     const int fmtlen = strlen(fmt);
4830     if (fmtlen == 0) {
4831         char *ret;
4832         Newxz (ret, 1, char);
4833         return ret;
4834     }
4835 
4836     /* Set mytm to now */
4837     struct tm mytm;
4838     init_tm(&mytm);	/* XXX workaround - see Perl_init_tm() */
4839 
4840     /* Override with the passed-in values */
4841     mytm.tm_sec = sec;
4842     mytm.tm_min = min;
4843     mytm.tm_hour = hour;
4844     mytm.tm_mday = mday;
4845     mytm.tm_mon = mon;
4846     mytm.tm_year = year;
4847     mytm.tm_wday = wday;
4848     mytm.tm_yday = yday;
4849     mytm.tm_isdst = isdst;
4850     mini_mktime(&mytm);
4851 
4852     /* use libc to get the values for tm_gmtoff and tm_zone on platforms that
4853      * have them [perl #18238] */
4854 #if defined(HAS_MKTIME) && (defined(HAS_TM_TM_GMTOFF) || defined(HAS_TM_TM_ZONE))
4855     struct tm mytm2;
4856     mytm2 = mytm;
4857     MKTIME_LOCK;
4858     mktime(&mytm2);
4859     MKTIME_UNLOCK;
4860 #  ifdef HAS_TM_TM_GMTOFF
4861     mytm.tm_gmtoff = mytm2.tm_gmtoff;
4862 #  endif
4863 #  ifdef HAS_TM_TM_ZONE
4864     mytm.tm_zone = mytm2.tm_zone;
4865 #  endif
4866 #endif
4867 #if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME)
4868 
4869     const char * orig_CTYPE_LOCALE = toggle_locale_c(LC_CTYPE,
4870                                                      querylocale_c(LC_TIME));
4871 #endif
4872 
4873     /* Guess an initial size for the returned string based on an expansion
4874      * factor of the input format, but with a minimum that should handle most
4875      * common cases.  If this guess is too small, we will try again with a
4876      * larger one */
4877     int bufsize = MAX(fmtlen * 2, 64);
4878 
4879     char *buf = NULL;   /* Makes Renew() act as Newx() on the first iteration */
4880     do {
4881         Renew(buf, bufsize, char);
4882 
4883         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
4884 
4885         STRFTIME_LOCK;
4886         int len = strftime(buf, bufsize, fmt, &mytm);
4887         STRFTIME_UNLOCK;
4888 
4889         GCC_DIAG_RESTORE_STMT;
4890 
4891         /* A non-zero return indicates success.  But to make sure we're not
4892          * dealing with some rogue strftime that returns how much space it
4893          * needs instead of 0 when there isn't enough, check that the return
4894          * indicates we have at least one byte of spare space (which will be
4895          * used for the terminating NUL). */
4896         if (inRANGE(len, 1, bufsize - 1)) {
4897             goto strftime_success;
4898         }
4899 
4900         /* There are several possible reasons for a 0 return code for a
4901          * non-empty format, and they are not trivial to tease apart.  This
4902          * issue is a known bug in the strftime() API.  What we do to cope is
4903          * to assume that the reason is not enough space in the buffer, so
4904          * increase it and try again. */
4905         bufsize *= 2;
4906 
4907         /* But don't just keep increasing the size indefinitely.  Stop when it
4908          * becomes obvious that the reason for failure is something besides not
4909          * enough space.  The most likely largest expanding format is %c.  On
4910          * khw's Linux box, the maximum result of this is 67 characters, in the
4911          * km_KH locale.  If a new script comes along that uses 4 UTF-8 bytes
4912          * per character, and with a similar expansion factor, that would be a
4913          * 268:2 byte ratio, or a bit more than 128:1 = 2**7:1.  Some strftime
4914          * implementations allow you to say %1000c to pad to 1000 bytes.  This
4915          * shows that it is impossible to implement this without a heuristic
4916          * (that can fail).  But it indicates we need to be generous in the
4917          * upper limit before failing.  The previous heuristic used was too
4918          * stingy.  Since the size doubles per iteration, it doesn't take many
4919          * to reach the limit */
4920     } while (bufsize < ((1 << 11) + 1) * fmtlen);
4921 
4922     /* Here, strftime() returned 0, and it likely wasn't for lack of space.
4923      * There are two possible reasons:
4924      *
4925      * First is that the result is legitimately 0 length.  This can happen
4926      * when the format is precisely "%p".  That is the only documented format
4927      * that can have an empty result. */
4928     if (strEQ(fmt, "%p")) {
4929         Renew(buf, 1, char);
4930         *buf = '\0';
4931         goto strftime_success;
4932     }
4933 
4934     /* The other reason is that the format string is malformed.  Probably it is
4935      * an illegal conversion specifier.) */
4936     Safefree(buf);
4937     return NULL;
4938 
4939   strftime_success:
4940 
4941 #if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME)
4942 
4943     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_LOCALE);
4944 
4945 #endif
4946     return buf;
4947 
4948 #else
4949     Perl_croak(aTHX_ "panic: no strftime");
4950     return NULL;
4951 #endif
4952 
4953 }
4954 
4955 char *
Perl_my_strftime8_temp(pTHX_ const char * fmt,int sec,int min,int hour,int mday,int mon,int year,int wday,int yday,int isdst,utf8ness_t * utf8ness)4956 Perl_my_strftime8_temp(pTHX_ const char *fmt, int sec, int min, int hour, int mday,
4957                          int mon, int year, int wday, int yday, int isdst,
4958                          utf8ness_t * utf8ness)
4959 {   /* Documented above */
4960     char * retval = my_strftime(fmt, sec, min, hour, mday, mon, year, wday,
4961                                 yday, isdst);
4962 
4963     PERL_ARGS_ASSERT_MY_STRFTIME8_TEMP;
4964 
4965     if (utf8ness) {
4966 
4967 #ifdef USE_LOCALE_TIME
4968         *utf8ness = get_locale_string_utf8ness_i(retval,
4969                                                  LOCALE_UTF8NESS_UNKNOWN,
4970                                                  NULL, LC_TIME_INDEX_);
4971 #else
4972         *utf8ness = UTF8NESS_IMMATERIAL;
4973 #endif
4974 
4975     }
4976 
4977     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4978                         "fmt=%s, retval=%s", fmt,
4979                         ((is_utf8_string((U8 *) retval, 0))
4980                          ? retval
4981                          :_byte_dump_string((U8 *) retval, strlen(retval), 0)));
4982              if (utf8ness) PerlIO_printf(Perl_debug_log, "; utf8ness=%d",
4983                                                          (int) *utf8ness);
4984              PerlIO_printf(Perl_debug_log, "\n");
4985             );
4986 
4987     return retval;
4988 }
4989 
4990 /*
4991  * Initialize locale awareness.
4992  */
4993 int
Perl_init_i18nl10n(pTHX_ int printwarn)4994 Perl_init_i18nl10n(pTHX_ int printwarn)
4995 {
4996     /* printwarn is
4997      *
4998      *    0 if not to output warning when setup locale is bad
4999      *    1 if to output warning based on value of PERL_BADLANG
5000      *    >1 if to output regardless of PERL_BADLANG
5001      *
5002      * returns
5003      *    1 = set ok or not applicable,
5004      *    0 = fallback to a locale of lower priority
5005      *   -1 = fallback to all locales failed, not even to the C locale
5006      *
5007      * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
5008      * set, debugging information is output.
5009      *
5010      * This looks more complicated than it is, mainly due to the #ifdefs and
5011      * error handling.
5012      *
5013      * Besides some asserts, data structure initialization, and specific
5014      * platform complications, this routine is effectively represented by this
5015      * pseudo-code:
5016      *
5017      *      setlocale(LC_ALL, "");                                            x
5018      *      foreach (subcategory) {                                           x
5019      *          curlocales[f(subcategory)] = setlocale(subcategory, NULL);    x
5020      *      }                                                                 x
5021      *      if (platform_so_requires) {
5022      *          foreach (subcategory) {
5023      *            PL_curlocales[f(subcategory)] = curlocales[f(subcategory)]
5024      *          }
5025      *      }
5026      *      foreach (subcategory) {
5027      *          if (needs_special_handling[f(subcategory)] &this_subcat_handler
5028      *      }
5029      *
5030      * This sets all the categories to the values in the current environment,
5031      * saves them temporarily in curlocales[] until they can be handled and/or
5032      * on some platforms saved in a per-thread array PL_curlocales[].
5033      *
5034      * f(foo) is a mapping from the opaque system category numbers to small
5035      * non-negative integers used most everywhere in this file as indices into
5036      * arrays (such as curlocales[]) so the program doesn't have to otherwise
5037      * deal with the opaqueness.
5038      *
5039      * If the platform doesn't have LC_ALL, the lines marked 'x' above are
5040      * effectively replaced by:
5041      *      foreach (subcategory) {                                           y
5042      *          curlocales[f(subcategory)] = setlocale(subcategory, "");      y
5043      *      }                                                                 y
5044      *
5045      * The only differences being the lack of an LC_ALL call, and using ""
5046      * instead of NULL in the setlocale calls.
5047      *
5048      * But there are, of course, complications.
5049      *
5050      * it has to deal with if this is an embedded perl, whose locale doesn't
5051      * come from the environment, but has been set up by the caller.  This is
5052      * pretty simply handled: the "" in the setlocale calls is not a string
5053      * constant, but a variable which is set to NULL in the embedded case.
5054      *
5055      * But the major complication is handling failure and doing fallback.  All
5056      * the code marked 'x' or 'y' above is actually enclosed in an outer loop,
5057      * using the array trial_locales[].  On entry, trial_locales[] is
5058      * initialized to just one entry, containing the NULL or "" locale argument
5059      * shown above.  If, as is almost always the case, everything works, it
5060      * exits after just the one iteration, going on to the next step.
5061      *
5062      * But if there is a failure, the code tries its best to honor the
5063      * environment as much as possible.  It self-modifies trial_locales[] to
5064      * have more elements, one for each of the POSIX-specified settings from
5065      * the environment, such as LANG, ending in the ultimate fallback, the C
5066      * locale.  Thus if there is something bogus with a higher priority
5067      * environment variable, it will try with the next highest, until something
5068      * works.  If everything fails, it limps along with whatever state it got
5069      * to.
5070      *
5071      * A further complication is that Windows has an additional fallback, the
5072      * user-default ANSI code page obtained from the operating system.  This is
5073      * added as yet another loop iteration, just before the final "C"
5074      *
5075      * A slight complication is that in embedded Perls, the locale may already
5076      * be set-up, and we don't want to get it from the normal environment
5077      * variables.  This is handled by having a special environment variable
5078      * indicate we're in this situation.  We simply set setlocale's 2nd
5079      * parameter to be a NULL instead of "".  That indicates to setlocale that
5080      * it is not to change anything, but to return the current value,
5081      * effectively initializing perl's db to what the locale already is.
5082      *
5083      * We play the same trick with NULL if a LC_ALL succeeds.  We call
5084      * setlocale() on the individual categories with NULL to get their existing
5085      * values for our db, instead of trying to change them.
5086      * */
5087 
5088     int ok = 1;
5089 
5090 #ifndef USE_LOCALE
5091 
5092     PERL_UNUSED_ARG(printwarn);
5093 
5094 #else  /* USE_LOCALE */
5095 #  ifdef __GLIBC__
5096 
5097     const char * const language = PerlEnv_getenv("LANGUAGE");
5098 
5099 #  endif
5100 
5101     /* NULL uses the existing already set up locale */
5102     const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
5103                                         ? NULL
5104                                         : "";
5105     typedef struct trial_locales_struct_s {
5106         const char* trial_locale;
5107         const char* fallback_desc;
5108         const char* fallback_name;
5109     } trial_locales_struct;
5110     /* 5 = 1 each for "", LC_ALL, LANG, (Win32) system default locale, C */
5111     trial_locales_struct trial_locales[5];
5112     unsigned int trial_locales_count;
5113     const char * const lc_all     = PerlEnv_getenv("LC_ALL");
5114     const char * const lang       = PerlEnv_getenv("LANG");
5115     bool setlocale_failure = FALSE;
5116     unsigned int i;
5117 
5118     /* A later getenv() could zap this, so only use here */
5119     const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
5120 
5121     const bool locwarn = (printwarn > 1
5122                           || (          printwarn
5123                               && (    ! bad_lang_use_once
5124                                   || (
5125                                          /* disallow with "" or "0" */
5126                                          *bad_lang_use_once
5127                                        && strNE("0", bad_lang_use_once)))));
5128 
5129     /* current locale for given category; should have been copied so aren't
5130      * volatile */
5131     const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
5132 
5133 #  ifndef DEBUGGING
5134 #    define DEBUG_LOCALE_INIT(a,b,c)
5135 #  else
5136 
5137     DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
5138 
5139 #    define DEBUG_LOCALE_INIT(cat_index, locale, result)                    \
5140         DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",                       \
5141                     setlocale_debug_string_i(cat_index, locale, result)));
5142 
5143 /* Make sure the parallel arrays are properly set up */
5144 #    ifdef USE_LOCALE_NUMERIC
5145     assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC);
5146     assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC"));
5147 #      ifdef USE_POSIX_2008_LOCALE
5148     assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK);
5149 #      endif
5150 #    endif
5151 #    ifdef USE_LOCALE_CTYPE
5152     assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE);
5153     assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE"));
5154 #      ifdef USE_POSIX_2008_LOCALE
5155     assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK);
5156 #      endif
5157 #    endif
5158 #    ifdef USE_LOCALE_COLLATE
5159     assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE);
5160     assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE"));
5161 #      ifdef USE_POSIX_2008_LOCALE
5162     assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK);
5163 #      endif
5164 #    endif
5165 #    ifdef USE_LOCALE_TIME
5166     assert(categories[LC_TIME_INDEX_] == LC_TIME);
5167     assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME"));
5168 #      ifdef USE_POSIX_2008_LOCALE
5169     assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK);
5170 #      endif
5171 #    endif
5172 #    ifdef USE_LOCALE_MESSAGES
5173     assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES);
5174     assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES"));
5175 #      ifdef USE_POSIX_2008_LOCALE
5176     assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK);
5177 #      endif
5178 #    endif
5179 #    ifdef USE_LOCALE_MONETARY
5180     assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY);
5181     assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY"));
5182 #      ifdef USE_POSIX_2008_LOCALE
5183     assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK);
5184 #      endif
5185 #    endif
5186 #    ifdef USE_LOCALE_ADDRESS
5187     assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS);
5188     assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS"));
5189 #      ifdef USE_POSIX_2008_LOCALE
5190     assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK);
5191 #      endif
5192 #    endif
5193 #    ifdef USE_LOCALE_IDENTIFICATION
5194     assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION);
5195     assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION"));
5196 #      ifdef USE_POSIX_2008_LOCALE
5197     assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK);
5198 #      endif
5199 #    endif
5200 #    ifdef USE_LOCALE_MEASUREMENT
5201     assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT);
5202     assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT"));
5203 #      ifdef USE_POSIX_2008_LOCALE
5204     assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK);
5205 #      endif
5206 #    endif
5207 #    ifdef USE_LOCALE_PAPER
5208     assert(categories[LC_PAPER_INDEX_] == LC_PAPER);
5209     assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER"));
5210 #      ifdef USE_POSIX_2008_LOCALE
5211     assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK);
5212 #      endif
5213 #    endif
5214 #    ifdef USE_LOCALE_TELEPHONE
5215     assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE);
5216     assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE"));
5217 #      ifdef USE_POSIX_2008_LOCALE
5218     assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK);
5219 #      endif
5220 #    endif
5221 #    ifdef USE_LOCALE_NAME
5222     assert(categories[LC_NAME_INDEX_] == LC_NAME);
5223     assert(strEQ(category_names[LC_NAME_INDEX_], "LC_NAME"));
5224 #      ifdef USE_POSIX_2008_LOCALE
5225     assert(category_masks[LC_NAME_INDEX_] == LC_NAME_MASK);
5226 #      endif
5227 #    endif
5228 #    ifdef USE_LOCALE_SYNTAX
5229     assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX);
5230     assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX"));
5231 #      ifdef USE_POSIX_2008_LOCALE
5232     assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK);
5233 #      endif
5234 #    endif
5235 #    ifdef USE_LOCALE_TOD
5236     assert(categories[LC_TOD_INDEX_] == LC_TOD);
5237     assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD"));
5238 #      ifdef USE_POSIX_2008_LOCALE
5239     assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK);
5240 #      endif
5241 #    endif
5242 #    ifdef LC_ALL
5243     assert(categories[LC_ALL_INDEX_] == LC_ALL);
5244     assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
5245     STATIC_ASSERT_STMT(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_);
5246 #      ifdef USE_POSIX_2008_LOCALE
5247     assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
5248 #      endif
5249 #    endif
5250 #  endif    /* DEBUGGING */
5251 
5252     /* Initialize the per-thread mbrFOO() state variables.  See POSIX.xs for
5253      * why these particular incantations are used. */
5254 #  ifdef HAS_MBRLEN
5255     memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
5256 #  endif
5257 #  ifdef HAS_MBRTOWC
5258     memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
5259 #  endif
5260 #  ifdef HAS_WCTOMBR
5261     wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
5262 #  endif
5263 #  ifdef USE_THREAD_SAFE_LOCALE
5264 #    ifdef WIN32
5265 
5266     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
5267 
5268 #    endif
5269 #  endif
5270 #  ifdef USE_POSIX_2008_LOCALE
5271 
5272     if (! PL_C_locale_obj) {
5273         PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
5274     }
5275     if (! PL_C_locale_obj) {
5276         locale_panic_(Perl_form(aTHX_
5277                                 "Cannot create POSIX 2008 C locale object"));
5278     }
5279 
5280     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
5281                            PL_C_locale_obj));
5282 
5283     /* Switch to using the POSIX 2008 interface now.  This would happen below
5284      * anyway, but deferring it can lead to leaks of memory that would also get
5285      * malloc'd in the interim */
5286     uselocale(PL_C_locale_obj);
5287 
5288 #    ifdef USE_LOCALE_NUMERIC
5289 
5290     PL_underlying_numeric_obj = duplocale(PL_C_locale_obj);
5291 
5292 #    endif
5293 #  endif
5294 #  ifdef USE_LOCALE_NUMERIC
5295 
5296     PL_numeric_radix_sv    = newSV(1);
5297     PL_underlying_radix_sv = newSV(1);
5298     Newxz(PL_numeric_name, 1, char);    /* Single NUL character */
5299     new_numeric("C", false);
5300 
5301 #  endif
5302 #  ifdef USE_LOCALE_COLLATE
5303 
5304     Newxz(PL_collation_name, 1, char);
5305     new_collate("C", false);
5306 
5307 #  endif
5308 #  ifdef USE_LOCALE_CTYPE
5309 
5310     Newxz(PL_ctype_name, 1, char);
5311     new_ctype("C", false);
5312 
5313 #  endif
5314 #  ifdef USE_PL_CURLOCALES
5315 
5316     /* Initialize our records. */
5317     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5318         (void) emulate_setlocale_i(i, posix_setlocale(categories[i], NULL),
5319                                    RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
5320                                    __LINE__);
5321     }
5322 
5323 #  endif
5324 
5325     /* We try each locale in the list until we get one that works, or exhaust
5326      * the list.  Normally the loop is executed just once.  But if setting the
5327      * locale fails, inside the loop we add fallback trials to the array and so
5328      * will execute the loop multiple times */
5329     trial_locales_struct ts = {
5330         .trial_locale = setlocale_init,
5331         .fallback_desc = NULL,
5332         .fallback_name = NULL,
5333     };
5334     trial_locales[0] = ts;
5335     trial_locales_count = 1;
5336 
5337     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5338         curlocales[i] = NULL;
5339     }
5340 
5341     for (i= 0; i < trial_locales_count; i++) {
5342         const char * trial_locale = trial_locales[i].trial_locale;
5343         setlocale_failure = FALSE;
5344 
5345 #  ifdef LC_ALL
5346 
5347         /* setlocale() return vals; not copied so must be looked at
5348          * immediately. */
5349         const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
5350         sl_result[LC_ALL_INDEX_] = stdized_setlocale(LC_ALL, trial_locale);
5351         DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]);
5352         if (! sl_result[LC_ALL_INDEX_]) {
5353             setlocale_failure = TRUE;
5354         }
5355         else {
5356             /* Since LC_ALL succeeded, it should have changed all the other
5357              * categories it can to its value; so we massage things so that the
5358              * setlocales below just return their category's current values.
5359              * This adequately handles the case in NetBSD where LC_COLLATE may
5360              * not be defined for a locale, and setting it individually will
5361              * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
5362              * the POSIX locale. */
5363             trial_locale = NULL;
5364         }
5365 
5366 #  endif /* LC_ALL */
5367 
5368         if (! setlocale_failure) {
5369             unsigned int j;
5370             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
5371                 curlocales[j] = stdized_setlocale(categories[j], trial_locale);
5372                 if (! curlocales[j]) {
5373                     setlocale_failure = TRUE;
5374                 }
5375                 curlocales[j] = savepv(curlocales[j]);
5376                 DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]);
5377             }
5378 
5379             if (LIKELY(! setlocale_failure)) {  /* All succeeded */
5380                 break;  /* Exit trial_locales loop */
5381             }
5382         }
5383 
5384         /* Here, something failed; will need to try a fallback. */
5385         ok = 0;
5386 
5387         if (i == 0) {
5388             unsigned int j;
5389 
5390             if (locwarn) { /* Output failure info only on the first one */
5391 
5392 #  ifdef LC_ALL
5393 
5394                 PerlIO_printf(Perl_error_log,
5395                 "perl: warning: Setting locale failed.\n");
5396 
5397 #  else /* !LC_ALL */
5398 
5399                 PerlIO_printf(Perl_error_log,
5400                 "perl: warning: Setting locale failed for the categories:\n");
5401 
5402                 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
5403                     if (! curlocales[j]) {
5404                         PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
5405                     }
5406                 }
5407 
5408 #  endif /* LC_ALL */
5409 
5410                 PerlIO_printf(Perl_error_log,
5411                     "perl: warning: Please check that your locale settings:\n");
5412 
5413 #  ifdef __GLIBC__
5414 
5415                 PerlIO_printf(Perl_error_log,
5416                             "\tLANGUAGE = %c%s%c,\n",
5417                             language ? '"' : '(',
5418                             language ? language : "unset",
5419                             language ? '"' : ')');
5420 #  endif
5421 
5422                 PerlIO_printf(Perl_error_log,
5423                             "\tLC_ALL = %c%s%c,\n",
5424                             lc_all ? '"' : '(',
5425                             lc_all ? lc_all : "unset",
5426                             lc_all ? '"' : ')');
5427 
5428 #  if defined(USE_ENVIRON_ARRAY)
5429 
5430                 {
5431                     char **e;
5432 
5433                     /* Look through the environment for any variables of the
5434                      * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
5435                      * already handled above.  These are assumed to be locale
5436                      * settings.  Output them and their values. */
5437                     for (e = environ; *e; e++) {
5438                         const STRLEN prefix_len = sizeof("LC_") - 1;
5439                         STRLEN uppers_len;
5440 
5441                         if (     strBEGINs(*e, "LC_")
5442                             && ! strBEGINs(*e, "LC_ALL=")
5443                             && (uppers_len = strspn(*e + prefix_len,
5444                                              "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
5445                             && ((*e)[prefix_len + uppers_len] == '='))
5446                         {
5447                             PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
5448                                 (int) (prefix_len + uppers_len), *e,
5449                                 *e + prefix_len + uppers_len + 1);
5450                         }
5451                     }
5452                 }
5453 
5454 #  else
5455 
5456                 PerlIO_printf(Perl_error_log,
5457                             "\t(possibly more locale environment variables)\n");
5458 
5459 #  endif
5460 
5461                 PerlIO_printf(Perl_error_log,
5462                             "\tLANG = %c%s%c\n",
5463                             lang ? '"' : '(',
5464                             lang ? lang : "unset",
5465                             lang ? '"' : ')');
5466 
5467                 PerlIO_printf(Perl_error_log,
5468                             "    are supported and installed on your system.\n");
5469             }
5470 
5471             /* Calculate what fallback locales to try.  We have avoided this
5472              * until we have to, because failure is quite unlikely.  This will
5473              * usually change the upper bound of the loop we are in.
5474              *
5475              * Since the system's default way of setting the locale has not
5476              * found one that works, We use Perl's defined ordering: LC_ALL,
5477              * LANG, and the C locale.  We don't try the same locale twice, so
5478              * don't add to the list if already there.  (On POSIX systems, the
5479              * LC_ALL element will likely be a repeat of the 0th element "",
5480              * but there's no harm done by doing it explicitly.
5481              *
5482              * Note that this tries the LC_ALL environment variable even on
5483              * systems which have no LC_ALL locale setting.  This may or may
5484              * not have been originally intentional, but there's no real need
5485              * to change the behavior. */
5486             if (lc_all) {
5487                 for (j = 0; j < trial_locales_count; j++) {
5488                     if (strEQ(lc_all, trial_locales[j].trial_locale)) {
5489                         goto done_lc_all;
5490                     }
5491                 }
5492                 trial_locales_struct ts = {
5493                     .trial_locale = lc_all,
5494                     .fallback_desc = (strEQ(lc_all, "C")
5495                                       ? "the standard locale"
5496                                       : "a fallback locale"),
5497                     .fallback_name = lc_all,
5498                 };
5499                 trial_locales[trial_locales_count++] = ts;
5500             }
5501           done_lc_all:
5502 
5503             if (lang) {
5504                 for (j = 0; j < trial_locales_count; j++) {
5505                     if (strEQ(lang, trial_locales[j].trial_locale)) {
5506                         goto done_lang;
5507                     }
5508                 }
5509                 trial_locales_struct ts = {
5510                     .trial_locale = lang,
5511                     .fallback_desc = (strEQ(lang, "C")
5512                                       ? "the standard locale"
5513                                       : "a fallback locale"),
5514                     .fallback_name = lang,
5515                 };
5516                 trial_locales[trial_locales_count++] = ts;
5517             }
5518           done_lang:
5519 
5520 #  if defined(WIN32) && defined(LC_ALL)
5521 
5522             /* For Windows, we also try the system default locale before "C".
5523              * (If there exists a Windows without LC_ALL we skip this because
5524              * it gets too complicated.  For those, the "C" is the next
5525              * fallback possibility). */
5526             {
5527                 /* Note that this may change the locale, but we are going to do
5528                  * that anyway.
5529                  *
5530                  * Our normal Windows setlocale() implementation ignores the
5531                  * system default locale to make things work like POSIX.  This
5532                  * is the only place where we want to consider it, so have to
5533                  * use wrap_wsetlocale(). */
5534                 const char *system_default_locale =
5535                                     stdize_locale(LC_ALL,
5536                                                   wrap_wsetlocale(LC_ALL, ""),
5537                                                   &PL_stdize_locale_buf,
5538                                                   &PL_stdize_locale_bufsize,
5539                                                   __LINE__);
5540                 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale);
5541 
5542                 /* Skip if invalid or if it's already on the list of locales to
5543                  * try */
5544                 if (! system_default_locale) {
5545                     goto done_system_default;
5546                 }
5547                 for (j = 0; j < trial_locales_count; j++) {
5548                     if (strEQ(system_default_locale, trial_locales[j].trial_locale)) {
5549                         goto done_system_default;
5550                     }
5551                 }
5552 
5553                 trial_locales_struct ts = {
5554                     .trial_locale = system_default_locale,
5555                     .fallback_desc = (strEQ(system_default_locale, "C")
5556                                       ? "the standard locale"
5557                                       : "the system default locale"),
5558                     .fallback_name = system_default_locale,
5559                 };
5560                 trial_locales[trial_locales_count++] = ts;
5561             }
5562           done_system_default:
5563 
5564 #  endif
5565 
5566             for (j = 0; j < trial_locales_count; j++) {
5567                 if (strEQ("C", trial_locales[j].trial_locale)) {
5568                     goto done_C;
5569                 }
5570             }
5571             {
5572                 /* new scope to avoid C++ complaining about
5573                    initialization being bypassed by goto.
5574                 */
5575                 trial_locales_struct ts = {
5576                     .trial_locale = "C",
5577                     .fallback_desc = "the standard locale",
5578                     .fallback_name = "C",
5579                 };
5580                 trial_locales[trial_locales_count++] = ts;
5581             }
5582           done_C: ;
5583         }   /* end of first time through the loop */
5584     }   /* end of looping through the trial locales */
5585 
5586     if (ok < 1) {   /* If we tried to fallback */
5587         const char* msg;
5588         if (! setlocale_failure) {  /* fallback succeeded */
5589            msg = "Falling back to";
5590         }
5591         else {  /* fallback failed */
5592             unsigned int j;
5593 
5594             /* We dropped off the end of the loop, so have to decrement i to
5595              * get back to the value the last time through */
5596             i--;
5597 
5598             ok = -1;
5599             msg = "Failed to fall back to";
5600 
5601             /* To continue, we should use whatever values we've got */
5602 
5603             for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
5604                 Safefree(curlocales[j]);
5605                 curlocales[j] = savepv(stdized_setlocale(categories[j], NULL));
5606                 DEBUG_LOCALE_INIT(j, NULL, curlocales[j]);
5607             }
5608         }
5609 
5610         if (locwarn) {
5611             const char * description = trial_locales[i].fallback_desc;
5612             const char * name = trial_locales[i].fallback_name;
5613 
5614             if (name && strNE(name, "")) {
5615                 PerlIO_printf(Perl_error_log,
5616                     "perl: warning: %s %s (\"%s\").\n", msg, description, name);
5617             }
5618             else {
5619                 PerlIO_printf(Perl_error_log,
5620                                    "perl: warning: %s %s.\n", msg, description);
5621             }
5622         }
5623     } /* End of tried to fallback */
5624 
5625 #  ifdef USE_POSIX_2008_LOCALE
5626 
5627     /* The stdized setlocales haven't affected the P2008 locales.  Initialize
5628      * them now, calculating LC_ALL only on the final go round, when all have
5629      * been set. */
5630     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5631         (void) emulate_setlocale_i(i, curlocales[i],
5632                                    RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
5633                                    __LINE__);
5634     }
5635 
5636 #  endif
5637 
5638     /* Done with finding the locales; update the auxiliary records */
5639     new_LC_ALL(NULL, false);
5640 
5641     for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5642         Safefree(curlocales[i]);
5643     }
5644 
5645 #  if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
5646 
5647     /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
5648      * locale is UTF-8.  The call to new_ctype() just above has already
5649      * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
5650      * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
5651      * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
5652      * STDIN, STDOUT, STDERR, _and_ the default open discipline.  */
5653     PL_utf8locale = PL_in_utf8_CTYPE_locale;
5654 
5655     /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
5656        This is an alternative to using the -C command line switch
5657        (the -C if present will override this). */
5658     {
5659          const char *p = PerlEnv_getenv("PERL_UNICODE");
5660          PL_unicode = p ? parse_unicode_opts(&p) : 0;
5661          if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
5662              PL_utf8cache = -1;
5663     }
5664 
5665 #  endif
5666 #endif /* USE_LOCALE */
5667 
5668     /* So won't continue to output stuff */
5669     DEBUG_INITIALIZATION_set(FALSE);
5670 
5671     return ok;
5672 }
5673 
5674 #ifdef USE_LOCALE_COLLATE
5675 
5676 STATIC void
S_compute_collxfrm_coefficients(pTHX)5677 S_compute_collxfrm_coefficients(pTHX)
5678 {
5679 
5680     /* A locale collation definition includes primary, secondary, tertiary,
5681      * etc. weights for each character.  To sort, the primary weights are used,
5682      * and only if they compare equal, then the secondary weights are used, and
5683      * only if they compare equal, then the tertiary, etc.
5684      *
5685      * strxfrm() works by taking the input string, say ABC, and creating an
5686      * output transformed string consisting of first the primary weights,
5687      * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the tertiary,
5688      * etc, yielding A¹B¹C¹ A²B²C² A³B³C³ ....  Some characters may not have
5689      * weights at every level.  In our example, let's say B doesn't have a
5690      * tertiary weight, and A doesn't have a secondary weight.  The constructed
5691      * string is then going to be
5692      *  A¹B¹C¹ B²C² A³C³ ....
5693      * This has the desired effect that strcmp() will look at the secondary or
5694      * tertiary weights only if the strings compare equal at all higher
5695      * priority weights.  The spaces shown here, like in
5696      *  "A¹B¹C¹ A²B²C² "
5697      * are not just for readability.  In the general case, these must actually
5698      * be bytes, which we will call here 'separator weights'; and they must be
5699      * smaller than any other weight value, but since these are C strings, only
5700      * the terminating one can be a NUL (some implementations may include a
5701      * non-NUL separator weight just before the NUL).  Implementations tend to
5702      * reserve 01 for the separator weights.  They are needed so that a shorter
5703      * string's secondary weights won't be misconstrued as primary weights of a
5704      * longer string, etc.  By making them smaller than any other weight, the
5705      * shorter string will sort first.  (Actually, if all secondary weights are
5706      * smaller than all primary ones, there is no need for a separator weight
5707      * between those two levels, etc.)
5708      *
5709      * The length of the transformed string is roughly a linear function of the
5710      * input string.  It's not exactly linear because some characters don't
5711      * have weights at all levels.  When we call strxfrm() we have to allocate
5712      * some memory to hold the transformed string.  The calculations below try
5713      * to find coefficients 'm' and 'b' for this locale so that m*x + b equals
5714      * how much space we need, given the size of the input string in 'x'.  If
5715      * we calculate too small, we increase the size as needed, and call
5716      * strxfrm() again, but it is better to get it right the first time to
5717      * avoid wasted expensive string transformations.
5718      *
5719      * We use the string below to find how long the transformation of it is.
5720      * Almost all locales are supersets of ASCII, or at least the ASCII
5721      * letters.  We use all of them, half upper half lower, because if we used
5722      * fewer, we might hit just the ones that are outliers in a particular
5723      * locale.  Most of the strings being collated will contain a preponderance
5724      * of letters, and even if they are above-ASCII, they are likely to have
5725      * the same number of weight levels as the ASCII ones.  It turns out that
5726      * digits tend to have fewer levels, and some punctuation has more, but
5727      * those are relatively sparse in text, and khw believes this gives a
5728      * reasonable result, but it could be changed if experience so dictates. */
5729     const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
5730     char * x_longer;        /* Transformed 'longer' */
5731     Size_t x_len_longer;    /* Length of 'x_longer' */
5732 
5733     char * x_shorter;   /* We also transform a substring of 'longer' */
5734     Size_t x_len_shorter;
5735 
5736     PL_in_utf8_COLLATE_locale = (PL_collation_standard)
5737                                 ? 0
5738                                 : is_locale_utf8(PL_collation_name);
5739     PL_strxfrm_NUL_replacement = '\0';
5740     PL_strxfrm_max_cp = 0;
5741 
5742     /* mem_collxfrm_() is used get the transformation (though here we are
5743      * interested only in its length).  It is used because it has the
5744      * intelligence to handle all cases, but to work, it needs some values of
5745      * 'm' and 'b' to get it started.  For the purposes of this calculation we
5746      * use a very conservative estimate of 'm' and 'b'.  This assumes a weight
5747      * can be multiple bytes, enough to hold any UV on the platform, and there
5748      * are 5 levels, 4 weight bytes, and a trailing NUL.  */
5749     PL_collxfrm_base = 5;
5750     PL_collxfrm_mult = 5 * sizeof(UV);
5751 
5752     /* Find out how long the transformation really is */
5753     x_longer = mem_collxfrm_(longer,
5754                              sizeof(longer) - 1,
5755                              &x_len_longer,
5756 
5757                              /* We avoid converting to UTF-8 in the called
5758                               * function by telling it the string is in UTF-8
5759                               * if the locale is a UTF-8 one.  Since the string
5760                               * passed here is invariant under UTF-8, we can
5761                               * claim it's UTF-8 even though it isn't.  */
5762                               PL_in_utf8_COLLATE_locale);
5763     Safefree(x_longer);
5764 
5765     /* Find out how long the transformation of a substring of 'longer' is.
5766      * Together the lengths of these transformations are sufficient to
5767      * calculate 'm' and 'b'.  The substring is all of 'longer' except the
5768      * first character.  This minimizes the chances of being swayed by outliers
5769      * */
5770     x_shorter = mem_collxfrm_(longer + 1,
5771                               sizeof(longer) - 2,
5772                               &x_len_shorter,
5773                               PL_in_utf8_COLLATE_locale);
5774     Safefree(x_shorter);
5775 
5776     /* If the results are nonsensical for this simple test, the whole locale
5777      * definition is suspect.  Mark it so that locale collation is not active
5778      * at all for it.  XXX Should we warn? */
5779     if (   x_len_shorter == 0
5780         || x_len_longer == 0
5781         || x_len_shorter >= x_len_longer)
5782     {
5783         PL_collxfrm_mult = 0;
5784         PL_collxfrm_base = 1;
5785         DEBUG_L(PerlIO_printf(Perl_debug_log,
5786                 "Disabling locale collation for LC_COLLATE='%s';"
5787                 " length for shorter sample=%zu; longer=%zu\n",
5788                 PL_collation_name, x_len_shorter, x_len_longer));
5789     }
5790     else {
5791         SSize_t base;       /* Temporary */
5792 
5793         /* We have both: m * strlen(longer)  + b = x_len_longer
5794          *               m * strlen(shorter) + b = x_len_shorter;
5795          * subtracting yields:
5796          *          m * (strlen(longer) - strlen(shorter))
5797          *                             = x_len_longer - x_len_shorter
5798          * But we have set things up so that 'shorter' is 1 byte smaller than
5799          * 'longer'.  Hence:
5800          *          m = x_len_longer - x_len_shorter
5801          *
5802          * But if something went wrong, make sure the multiplier is at least 1.
5803          */
5804         if (x_len_longer > x_len_shorter) {
5805             PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
5806         }
5807         else {
5808             PL_collxfrm_mult = 1;
5809         }
5810 
5811         /*     mx + b = len
5812          * so:      b = len - mx
5813          * but in case something has gone wrong, make sure it is non-negative
5814          * */
5815         base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
5816         if (base < 0) {
5817             base = 0;
5818         }
5819 
5820         /* Add 1 for the trailing NUL */
5821         PL_collxfrm_base = base + 1;
5822     }
5823 
5824     DEBUG_L(PerlIO_printf(Perl_debug_log,
5825                           "?UTF-8 locale=%d; x_len_shorter=%zu, "
5826                           "x_len_longer=%zu,"
5827                           " collate multipler=%zu, collate base=%zu\n",
5828                           PL_in_utf8_COLLATE_locale,
5829                           x_len_shorter, x_len_longer,
5830                           PL_collxfrm_mult, PL_collxfrm_base));
5831 }
5832 
5833 char *
Perl_mem_collxfrm_(pTHX_ const char * input_string,STRLEN len,STRLEN * xlen,bool utf8)5834 Perl_mem_collxfrm_(pTHX_ const char *input_string,
5835                          STRLEN len,    /* Length of 'input_string' */
5836                          STRLEN *xlen,  /* Set to length of returned string
5837                                            (not including the collation index
5838                                            prefix) */
5839                          bool utf8      /* Is the input in UTF-8? */
5840                    )
5841 {
5842     /* mem_collxfrm_() is like strxfrm() but with two important differences.
5843      * First, it handles embedded NULs. Second, it allocates a bit more memory
5844      * than needed for the transformed data itself.  The real transformed data
5845      * begins at offset COLLXFRM_HDR_LEN.  *xlen is set to the length of that,
5846      * and doesn't include the collation index size.
5847      *
5848      * It is the caller's responsibility to eventually free the memory returned
5849      * by this function.
5850      *
5851      * Please see sv_collxfrm() to see how this is used. */
5852 
5853 #  define COLLXFRM_HDR_LEN    sizeof(PL_collation_ix)
5854 
5855     char * s = (char *) input_string;
5856     STRLEN s_strlen = strlen(input_string);
5857     char *xbuf = NULL;
5858     STRLEN xAlloc;          /* xalloc is a reserved word in VC */
5859     STRLEN length_in_chars;
5860     bool first_time = TRUE; /* Cleared after first loop iteration */
5861 
5862 #  ifdef USE_LOCALE_CTYPE
5863         const char * orig_CTYPE_locale = NULL;
5864 #  endif
5865 
5866 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
5867     locale_t constructed_locale = (locale_t) 0;
5868 #  endif
5869 
5870     PERL_ARGS_ASSERT_MEM_COLLXFRM_;
5871 
5872     /* Must be NUL-terminated */
5873     assert(*(input_string + len) == '\0');
5874 
5875     if (PL_collxfrm_mult == 0) {     /* unknown or bad */
5876         if (PL_collxfrm_base != 0) { /* bad collation => skip */
5877             DEBUG_L(PerlIO_printf(Perl_debug_log,
5878                             "mem_collxfrm_: locale's collation is defective\n"));
5879             goto bad;
5880         }
5881 
5882         /* (mult, base) == (0,0) means we need to calculate mult and base
5883          * before proceeding */
5884         S_compute_collxfrm_coefficients(aTHX);
5885     }
5886 
5887     /* Replace any embedded NULs with the control that sorts before any others.
5888      * This will give as good as possible results on strings that don't
5889      * otherwise contain that character, but otherwise there may be
5890      * less-than-perfect results with that character and NUL.  This is
5891      * unavoidable unless we replace strxfrm with our own implementation. */
5892     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
5893                                          NUL */
5894         char * e = s + len;
5895         char * sans_nuls;
5896         STRLEN sans_nuls_len;
5897         int try_non_controls;
5898         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
5899                                                    making sure 2nd byte is NUL.
5900                                                  */
5901         STRLEN this_replacement_len;
5902 
5903         /* If we don't know what non-NUL control character sorts lowest for
5904          * this locale, find it */
5905         if (PL_strxfrm_NUL_replacement == '\0') {
5906             int j;
5907             char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
5908                                            includes the collation index
5909                                            prefixed. */
5910 
5911             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
5912 
5913             /* Unlikely, but it may be that no control will work to replace
5914              * NUL, in which case we instead look for any character.  Controls
5915              * are preferred because collation order is, in general, context
5916              * sensitive, with adjoining characters affecting the order, and
5917              * controls are less likely to have such interactions, allowing the
5918              * NUL-replacement to stand on its own.  (Another way to look at it
5919              * is to imagine what would happen if the NUL were replaced by a
5920              * combining character; it wouldn't work out all that well.) */
5921             for (try_non_controls = 0;
5922                  try_non_controls < 2;
5923                  try_non_controls++)
5924             {
5925 
5926 #  ifdef USE_LOCALE_CTYPE
5927 
5928                 /* In this case we use isCNTRL_LC() below, which relies on
5929                  * LC_CTYPE, so that must be switched to correspond with the
5930                  * LC_COLLATE locale */
5931                 if (! try_non_controls && ! PL_in_utf8_COLLATE_locale) {
5932                     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
5933                 }
5934 #  endif
5935                 /* Look through all legal code points (NUL isn't) */
5936                 for (j = 1; j < 256; j++) {
5937                     char * x;       /* j's xfrm plus collation index */
5938                     STRLEN x_len;   /* length of 'x' */
5939                     STRLEN trial_len = 1;
5940                     char cur_source[] = { '\0', '\0' };
5941 
5942                     /* Skip non-controls the first time through the loop.  The
5943                      * controls in a UTF-8 locale are the L1 ones */
5944                     if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
5945                                                ? ! isCNTRL_L1(j)
5946                                                : ! isCNTRL_LC(j))
5947                     {
5948                         continue;
5949                     }
5950 
5951                     /* Create a 1-char string of the current code point */
5952                     cur_source[0] = (char) j;
5953 
5954                     /* Then transform it */
5955                     x = mem_collxfrm_(cur_source, trial_len, &x_len,
5956                                       0 /* The string is not in UTF-8 */);
5957 
5958                     /* Ignore any character that didn't successfully transform.
5959                      * */
5960                     if (! x) {
5961                         continue;
5962                     }
5963 
5964                     /* If this character's transformation is lower than
5965                      * the current lowest, this one becomes the lowest */
5966                     if (   cur_min_x == NULL
5967                         || strLT(x         + COLLXFRM_HDR_LEN,
5968                                  cur_min_x + COLLXFRM_HDR_LEN))
5969                     {
5970                         PL_strxfrm_NUL_replacement = j;
5971                         Safefree(cur_min_x);
5972                         cur_min_x = x;
5973                     }
5974                     else {
5975                         Safefree(x);
5976                     }
5977                 } /* end of loop through all 255 characters */
5978 
5979 #  ifdef USE_LOCALE_CTYPE
5980                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
5981 #  endif
5982 
5983                 /* Stop looking if found */
5984                 if (cur_min_x) {
5985                     break;
5986                 }
5987 
5988                 /* Unlikely, but possible, if there aren't any controls that
5989                  * work in the locale, repeat the loop, looking for any
5990                  * character that works */
5991                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5992                 "mem_collxfrm_: No control worked.  Trying non-controls\n"));
5993             } /* End of loop to try first the controls, then any char */
5994 
5995             if (! cur_min_x) {
5996                 DEBUG_L(PerlIO_printf(Perl_debug_log,
5997                     "mem_collxfrm_: Couldn't find any character to replace"
5998                     " embedded NULs in locale %s with", PL_collation_name));
5999                 goto bad;
6000             }
6001 
6002             DEBUG_L(PerlIO_printf(Perl_debug_log,
6003                     "mem_collxfrm_: Replacing embedded NULs in locale %s with "
6004                     "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
6005 
6006             Safefree(cur_min_x);
6007         } /* End of determining the character that is to replace NULs */
6008 
6009         /* If the replacement is variant under UTF-8, it must match the
6010          * UTF8-ness of the original */
6011         if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
6012             this_replacement_char[0] =
6013                                 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
6014             this_replacement_char[1] =
6015                                 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
6016             this_replacement_len = 2;
6017         }
6018         else {
6019             this_replacement_char[0] = PL_strxfrm_NUL_replacement;
6020             /* this_replacement_char[1] = '\0' was done at initialization */
6021             this_replacement_len = 1;
6022         }
6023 
6024         /* The worst case length for the replaced string would be if every
6025          * character in it is NUL.  Multiply that by the length of each
6026          * replacement, and allow for a trailing NUL */
6027         sans_nuls_len = (len * this_replacement_len) + 1;
6028         Newx(sans_nuls, sans_nuls_len, char);
6029         *sans_nuls = '\0';
6030 
6031         /* Replace each NUL with the lowest collating control.  Loop until have
6032          * exhausted all the NULs */
6033         while (s + s_strlen < e) {
6034             my_strlcat(sans_nuls, s, sans_nuls_len);
6035 
6036             /* Do the actual replacement */
6037             my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6038 
6039             /* Move past the input NUL */
6040             s += s_strlen + 1;
6041             s_strlen = strlen(s);
6042         }
6043 
6044         /* And add anything that trails the final NUL */
6045         my_strlcat(sans_nuls, s, sans_nuls_len);
6046 
6047         /* Switch so below we transform this modified string */
6048         s = sans_nuls;
6049         len = strlen(s);
6050     } /* End of replacing NULs */
6051 
6052     /* Make sure the UTF8ness of the string and locale match */
6053     if (utf8 != PL_in_utf8_COLLATE_locale) {
6054         /* XXX convert above Unicode to 10FFFF? */
6055         const char * const t = s;   /* Temporary so we can later find where the
6056                                        input was */
6057 
6058         /* Here they don't match.  Change the string's to be what the locale is
6059          * expecting */
6060 
6061         if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
6062             s = (char *) bytes_to_utf8((const U8 *) s, &len);
6063             utf8 = TRUE;
6064         }
6065         else {   /* locale is not UTF-8; but input is; downgrade the input */
6066 
6067             s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
6068 
6069             /* If the downgrade was successful we are done, but if the input
6070              * contains things that require UTF-8 to represent, have to do
6071              * damage control ... */
6072             if (UNLIKELY(utf8)) {
6073 
6074                 /* What we do is construct a non-UTF-8 string with
6075                  *  1) the characters representable by a single byte converted
6076                  *     to be so (if necessary);
6077                  *  2) and the rest converted to collate the same as the
6078                  *     highest collating representable character.  That makes
6079                  *     them collate at the end.  This is similar to how we
6080                  *     handle embedded NULs, but we use the highest collating
6081                  *     code point instead of the smallest.  Like the NUL case,
6082                  *     this isn't perfect, but is the best we can reasonably
6083                  *     do.  Every above-255 code point will sort the same as
6084                  *     the highest-sorting 0-255 code point.  If that code
6085                  *     point can combine in a sequence with some other code
6086                  *     points for weight calculations, us changing something to
6087                  *     be it can adversely affect the results.  But in most
6088                  *     cases, it should work reasonably.  And note that this is
6089                  *     really an illegal situation: using code points above 255
6090                  *     on a locale where only 0-255 are valid.  If two strings
6091                  *     sort entirely equal, then the sort order for the
6092                  *     above-255 code points will be in code point order. */
6093 
6094                 utf8 = FALSE;
6095 
6096                 /* If we haven't calculated the code point with the maximum
6097                  * collating order for this locale, do so now */
6098                 if (! PL_strxfrm_max_cp) {
6099                     int j;
6100 
6101                     /* The current transformed string that collates the
6102                      * highest (except it also includes the prefixed collation
6103                      * index. */
6104                     char * cur_max_x = NULL;
6105 
6106                     /* Look through all legal code points (NUL isn't) */
6107                     for (j = 1; j < 256; j++) {
6108                         char * x;
6109                         STRLEN x_len;
6110                         char cur_source[] = { '\0', '\0' };
6111 
6112                         /* Create a 1-char string of the current code point */
6113                         cur_source[0] = (char) j;
6114 
6115                         /* Then transform it */
6116                         x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
6117 
6118                         /* If something went wrong (which it shouldn't), just
6119                          * ignore this code point */
6120                         if (! x) {
6121                             continue;
6122                         }
6123 
6124                         /* If this character's transformation is higher than
6125                          * the current highest, this one becomes the highest */
6126                         if (   cur_max_x == NULL
6127                             || strGT(x         + COLLXFRM_HDR_LEN,
6128                                      cur_max_x + COLLXFRM_HDR_LEN))
6129                         {
6130                             PL_strxfrm_max_cp = j;
6131                             Safefree(cur_max_x);
6132                             cur_max_x = x;
6133                         }
6134                         else {
6135                             Safefree(x);
6136                         }
6137                     }
6138 
6139                     if (! cur_max_x) {
6140                         DEBUG_L(PerlIO_printf(Perl_debug_log,
6141                             "mem_collxfrm_: Couldn't find any character to"
6142                             " replace above-Latin1 chars in locale %s with",
6143                             PL_collation_name));
6144                         goto bad;
6145                     }
6146 
6147                     DEBUG_L(PerlIO_printf(Perl_debug_log,
6148                             "mem_collxfrm_: highest 1-byte collating character"
6149                             " in locale %s is 0x%02X\n",
6150                             PL_collation_name,
6151                             PL_strxfrm_max_cp));
6152 
6153                     Safefree(cur_max_x);
6154                 }
6155 
6156                 /* Here we know which legal code point collates the highest.
6157                  * We are ready to construct the non-UTF-8 string.  The length
6158                  * will be at least 1 byte smaller than the input string
6159                  * (because we changed at least one 2-byte character into a
6160                  * single byte), but that is eaten up by the trailing NUL */
6161                 Newx(s, len, char);
6162 
6163                 {
6164                     STRLEN i;
6165                     STRLEN d= 0;
6166                     char * e = (char *) t + len;
6167 
6168                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
6169                         U8 cur_char = t[i];
6170                         if (UTF8_IS_INVARIANT(cur_char)) {
6171                             s[d++] = cur_char;
6172                         }
6173                         else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
6174                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
6175                         }
6176                         else {  /* Replace illegal cp with highest collating
6177                                    one */
6178                             s[d++] = PL_strxfrm_max_cp;
6179                         }
6180                     }
6181                     s[d++] = '\0';
6182                     Renew(s, d, char);   /* Free up unused space */
6183                 }
6184             }
6185         }
6186 
6187         /* Here, we have constructed a modified version of the input.  It could
6188          * be that we already had a modified copy before we did this version.
6189          * If so, that copy is no longer needed */
6190         if (t != input_string) {
6191             Safefree(t);
6192         }
6193     }
6194 
6195     length_in_chars = (utf8)
6196                       ? utf8_length((U8 *) s, (U8 *) s + len)
6197                       : len;
6198 
6199     /* The first element in the output is the collation id, used by
6200      * sv_collxfrm(); then comes the space for the transformed string.  The
6201      * equation should give us a good estimate as to how much is needed */
6202     xAlloc = COLLXFRM_HDR_LEN
6203            + PL_collxfrm_base
6204            + (PL_collxfrm_mult * length_in_chars);
6205     Newx(xbuf, xAlloc, char);
6206     if (UNLIKELY(! xbuf)) {
6207         DEBUG_L(PerlIO_printf(Perl_debug_log,
6208                       "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
6209         goto bad;
6210     }
6211 
6212     /* Store the collation id */
6213     *(U32*)xbuf = PL_collation_ix;
6214 
6215 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
6216 #    ifdef USE_LOCALE_CTYPE
6217 
6218     constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
6219                                    duplocale(use_curlocale_scratch()));
6220 #    else
6221 
6222     constructed_locale = duplocale(use_curlocale_scratch());
6223 
6224 #    endif
6225 #    define my_strxfrm(dest, src, n)  strxfrm_l(dest, src, n,           \
6226                                                 constructed_locale)
6227 #    define CLEANUP_STRXFRM                                             \
6228         STMT_START {                                                    \
6229             if (constructed_locale != (locale_t) 0)                     \
6230                 freelocale(constructed_locale);                         \
6231         } STMT_END
6232 #  else
6233 #    define my_strxfrm(dest, src, n)  strxfrm(dest, src, n)
6234 #    ifdef USE_LOCALE_CTYPE
6235 
6236     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
6237 
6238 #      define CLEANUP_STRXFRM                                           \
6239                 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
6240 #    else
6241 #      define CLEANUP_STRXFRM  NOOP
6242 #    endif
6243 #  endif
6244 
6245     /* Then the transformation of the input.  We loop until successful, or we
6246      * give up */
6247     for (;;) {
6248 
6249         errno = 0;
6250         *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
6251 
6252         /* If the transformed string occupies less space than we told strxfrm()
6253          * was available, it means it transformed the whole string. */
6254         if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
6255 
6256             /* But there still could have been a problem */
6257             if (errno != 0) {
6258                 DEBUG_L(PerlIO_printf(Perl_debug_log,
6259                        "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
6260                        PL_collation_name, errno,
6261                        _byte_dump_string((U8 *) s, len, 0)));
6262                 goto bad;
6263             }
6264 
6265             /* Here, the transformation was successful.  Some systems include a
6266              * trailing NUL in the returned length.  Ignore it, using a loop in
6267              * case multiple trailing NULs are returned. */
6268             while (   (*xlen) > 0
6269                    && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
6270             {
6271                 (*xlen)--;
6272             }
6273 
6274             /* If the first try didn't get it, it means our prediction was low.
6275              * Modify the coefficients so that we predict a larger value in any
6276              * future transformations */
6277             if (! first_time) {
6278                 STRLEN needed = *xlen + 1;   /* +1 For trailing NUL */
6279                 STRLEN computed_guess = PL_collxfrm_base
6280                                       + (PL_collxfrm_mult * length_in_chars);
6281 
6282                 /* On zero-length input, just keep current slope instead of
6283                  * dividing by 0 */
6284                 const STRLEN new_m = (length_in_chars != 0)
6285                                      ? needed / length_in_chars
6286                                      : PL_collxfrm_mult;
6287 
6288                 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6289                     "initial size of %zu bytes for a length "
6290                     "%zu string was insufficient, %zu needed\n",
6291                     computed_guess, length_in_chars, needed));
6292 
6293                 /* If slope increased, use it, but discard this result for
6294                  * length 1 strings, as we can't be sure that it's a real slope
6295                  * change */
6296                 if (length_in_chars > 1 && new_m  > PL_collxfrm_mult) {
6297 
6298 #  ifdef DEBUGGING
6299 
6300                     STRLEN old_m = PL_collxfrm_mult;
6301                     STRLEN old_b = PL_collxfrm_base;
6302 
6303 #  endif
6304 
6305                     PL_collxfrm_mult = new_m;
6306                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
6307                     computed_guess = PL_collxfrm_base
6308                                     + (PL_collxfrm_mult * length_in_chars);
6309                     if (computed_guess < needed) {
6310                         PL_collxfrm_base += needed - computed_guess;
6311                     }
6312 
6313                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6314                                     "slope is now %zu; was %zu, base "
6315                         "is now %zu; was %zu\n",
6316                         PL_collxfrm_mult, old_m,
6317                         PL_collxfrm_base, old_b));
6318                 }
6319                 else {  /* Slope didn't change, but 'b' did */
6320                     const STRLEN new_b = needed
6321                                         - computed_guess
6322                                         + PL_collxfrm_base;
6323                     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6324                         "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
6325                     PL_collxfrm_base = new_b;
6326                 }
6327             }
6328 
6329             break;
6330         }
6331 
6332         if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
6333             DEBUG_L(PerlIO_printf(Perl_debug_log,
6334                   "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
6335                   *xlen, PERL_INT_MAX));
6336             goto bad;
6337         }
6338 
6339         /* A well-behaved strxfrm() returns exactly how much space it needs
6340          * (usually not including the trailing NUL) when it fails due to not
6341          * enough space being provided.  Assume that this is the case unless
6342          * it's been proven otherwise */
6343         if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
6344             xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
6345         }
6346         else { /* Here, either:
6347                 *  1)  The strxfrm() has previously shown bad behavior; or
6348                 *  2)  It isn't the first time through the loop, which means
6349                 *      that the strxfrm() is now showing bad behavior, because
6350                 *      we gave it what it said was needed in the previous
6351                 *      iteration, and it came back saying it needed still more.
6352                 *      (Many versions of cygwin fit this.  When the buffer size
6353                 *      isn't sufficient, they return the input size instead of
6354                 *      how much is needed.)
6355                 * Increase the buffer size by a fixed percentage and try again.
6356                 * */
6357             xAlloc += (xAlloc / 4) + 1;
6358             PL_strxfrm_is_behaved = FALSE;
6359 
6360             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6361                      "mem_collxfrm_ required more space than previously"
6362                      " calculated for locale %s, trying again with new"
6363                      " guess=%zu+%zu\n",
6364                 PL_collation_name,  COLLXFRM_HDR_LEN,
6365                      xAlloc - COLLXFRM_HDR_LEN));
6366         }
6367 
6368         Renew(xbuf, xAlloc, char);
6369         if (UNLIKELY(! xbuf)) {
6370             DEBUG_L(PerlIO_printf(Perl_debug_log,
6371                       "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
6372             goto bad;
6373         }
6374 
6375         first_time = FALSE;
6376     }
6377 
6378     CLEANUP_STRXFRM;
6379 
6380     DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
6381 
6382     /* Free up unneeded space; retain enough for trailing NUL */
6383     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
6384 
6385     if (s != input_string) {
6386         Safefree(s);
6387     }
6388 
6389     return xbuf;
6390 
6391   bad:
6392 
6393     CLEANUP_STRXFRM;
6394     DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
6395 
6396     Safefree(xbuf);
6397     if (s != input_string) {
6398         Safefree(s);
6399     }
6400     *xlen = 0;
6401 
6402     return NULL;
6403 }
6404 
6405 #  ifdef DEBUGGING
6406 
6407 STATIC void
S_print_collxfrm_input_and_return(pTHX_ const char * s,const char * e,const char * xbuf,const STRLEN xlen,const bool is_utf8)6408 S_print_collxfrm_input_and_return(pTHX_
6409                                   const char * s,
6410                                   const char * e,
6411                                   const char * xbuf,
6412                                   const STRLEN xlen,
6413                                   const bool is_utf8)
6414 {
6415 
6416     PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
6417 
6418     PerlIO_printf(Perl_debug_log,
6419                   "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n"
6420                   "     input=%s\n    return=%s\n    return len=%zu\n",
6421                   (UV) PL_collation_ix, PL_collation_name,
6422                   get_displayable_string(s, e, is_utf8),
6423                   ((xbuf == NULL)
6424                    ? "(null)"
6425                    : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, xlen, 0)),
6426                   xlen);
6427 }
6428 
6429 #  endif    /* DEBUGGING */
6430 
6431 SV *
Perl_strxfrm(pTHX_ SV * src)6432 Perl_strxfrm(pTHX_ SV * src)
6433 {
6434     PERL_ARGS_ASSERT_STRXFRM;
6435 
6436     /* For use by POSIX::strxfrm().  If they differ, toggle LC_CTYPE to
6437      * LC_COLLATE to avoid potential mojibake.
6438      *
6439      * If we can't calculate a collation, 'src' is instead returned, so that
6440      * future comparisons will be by code point order */
6441 
6442 #  ifdef USE_LOCALE_CTYPE
6443 
6444     const char * orig_ctype = toggle_locale_c(LC_CTYPE,
6445                                               querylocale_c(LC_COLLATE));
6446 #  endif
6447 
6448     SV * dst = src;
6449     STRLEN dstlen;
6450     STRLEN srclen;
6451     const char *p = SvPV_const(src,srclen);
6452     const U32 utf8_flag = SvUTF8(src);
6453     char *d = mem_collxfrm_(p, srclen, &dstlen, cBOOL(utf8_flag));
6454 
6455     assert(utf8_flag == 0 || utf8_flag == SVf_UTF8);
6456 
6457     if (d != NULL) {
6458         assert(dstlen > 0);
6459         dst =newSVpvn_flags(d + COLLXFRM_HDR_LEN,
6460                             dstlen, SVs_TEMP|utf8_flag);
6461         Safefree(d);
6462     }
6463 
6464 #  ifdef USE_LOCALE_CTYPE
6465 
6466     restore_toggled_locale_c(LC_CTYPE, orig_ctype);
6467 
6468 #  endif
6469 
6470     return dst;
6471 }
6472 
6473 #endif /* USE_LOCALE_COLLATE */
6474 #if  defined(DEBUGGING) || defined(USE_POSIX_2008_LOCALE)
6475 
6476 STATIC const char *
S_get_displayable_string(pTHX_ const char * const s,const char * const e,const bool is_utf8)6477 S_get_displayable_string(pTHX_
6478                          const char * const s,
6479                          const char * const e,
6480                          const bool is_utf8)
6481 {
6482     PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING;
6483 
6484     const char * t = s;
6485     bool prev_was_printable = TRUE;
6486     bool first_time = TRUE;
6487     char * ret;
6488 
6489     if (e <= s) {
6490         return "";
6491     }
6492 
6493     /* Worst case scenario: All are non-printable so have a blank between each.
6494      * If UTF-8, all are the largest possible code point; otherwise all are a
6495      * single byte.  '(2 + 1)'  is from each byte takes 2 characters to
6496      * display, and a blank (or NUL for the final one) after it */
6497     Newxz(ret, (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1), char);
6498     SAVEFREEPV(ret);
6499 
6500     while (t < e) {
6501         UV cp = (is_utf8)
6502                 ?  utf8_to_uvchr_buf((U8 *) t, e, NULL)
6503                 : * (U8 *) t;
6504         if (isPRINT(cp)) {
6505             if (! prev_was_printable) {
6506                 my_strlcat(ret, " ", sizeof(ret));
6507             }
6508 
6509             /* Escape these to avoid any ambiguity */
6510             if (cp == ' ' || cp == '\\') {
6511                 my_strlcat(ret, "\\", sizeof(ret));
6512             }
6513             my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), sizeof(ret));
6514             prev_was_printable = TRUE;
6515         }
6516         else {
6517             if (! first_time) {
6518                 my_strlcat(ret, " ", sizeof(ret));
6519             }
6520             my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), sizeof(ret));
6521             prev_was_printable = FALSE;
6522         }
6523         t += (is_utf8) ? UTF8SKIP(t) : 1;
6524         first_time = FALSE;
6525     }
6526 
6527     return ret;
6528 }
6529 
6530 #endif
6531 #ifdef USE_LOCALE
6532 
6533 STATIC const char *
S_toggle_locale_i(pTHX_ const unsigned cat_index,const char * new_locale,const line_t caller_line)6534 S_toggle_locale_i(pTHX_ const unsigned cat_index,
6535                         const char * new_locale,
6536                         const line_t caller_line)
6537 {
6538     /* Changes the locale for the category specified by 'index' to 'new_locale,
6539      * if they aren't already the same.
6540      *
6541      * Returns a copy of the name of the original locale for 'cat_index'
6542      * so can be switched back to with the companion function
6543      * restore_toggled_locale_i(),  (NULL if no restoral is necessary.) */
6544 
6545     const char * locale_to_restore_to = NULL;
6546 
6547     PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
6548     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6549 
6550     /* Find the original locale of the category we may need to change, so that
6551      * it can be restored to later */
6552 
6553     locale_to_restore_to = querylocale_i(cat_index);
6554 
6555     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6556              "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
6557              " actual=%s\n",
6558              caller_line, cat_index, category_names[cat_index],
6559              new_locale, locale_to_restore_to));
6560 
6561     if (! locale_to_restore_to) {
6562         locale_panic_(Perl_form(aTHX_
6563                                 "Could not find current %s locale, errno=%d",
6564                                 category_names[cat_index], errno));
6565     }
6566 
6567     /* If the locales are the same, there's nothing to do */
6568     if (strEQ(locale_to_restore_to, new_locale)) {
6569         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6570                                "(%" LINE_Tf "): %s locale unchanged as %s\n",
6571                                caller_line, category_names[cat_index],
6572                                new_locale));
6573 
6574         return NULL;
6575     }
6576 
6577     /* Finally, change the locale to the new one */
6578     void_setlocale_i(cat_index, new_locale);
6579 
6580     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6581                            "(%" LINE_Tf "): %s locale switched to %s\n",
6582                            caller_line, category_names[cat_index], new_locale));
6583 
6584     return locale_to_restore_to;
6585 
6586 #  ifndef DEBUGGING
6587     PERL_UNUSED_ARG(caller_line);
6588 #  endif
6589 
6590 }
6591 
6592 STATIC void
S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index,const char * restore_locale,const line_t caller_line)6593 S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index,
6594                                  const char * restore_locale,
6595                                  const line_t caller_line)
6596 {
6597     /* Restores the locale for LC_category corresponding to cat_indes to
6598      * 'restore_locale' (which is a copy that will be freed by this function),
6599      * or do nothing if the latter parameter is NULL */
6600 
6601     PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
6602     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6603 
6604     if (restore_locale == NULL) {
6605         DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6606                                "(%" LINE_Tf "): No need to restore %s\n",
6607                                caller_line, category_names[cat_index]));
6608         return;
6609     }
6610 
6611     DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6612                            "(%" LINE_Tf "): %s restoring locale to %s\n",
6613                            caller_line, category_names[cat_index],
6614                            restore_locale));
6615 
6616     void_setlocale_i(cat_index, restore_locale);
6617 
6618 #  ifndef DEBUGGING
6619     PERL_UNUSED_ARG(caller_line);
6620 #  endif
6621 
6622 }
6623 
6624 #  ifdef USE_LOCALE_CTYPE
6625 
6626 STATIC bool
S_is_codeset_name_UTF8(const char * name)6627 S_is_codeset_name_UTF8(const char * name)
6628 {
6629     /* Return a boolean as to if the passed-in name indicates it is a UTF-8
6630      * code set.  Several variants are possible */
6631     const Size_t len = strlen(name);
6632 
6633     PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
6634 
6635 #    ifdef WIN32
6636 
6637     /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
6638     if (memENDs(name, len, "65001")) {
6639         return TRUE;
6640     }
6641 
6642 #    endif
6643                /* 'UTF8' or 'UTF-8' */
6644     return (    inRANGE(len, 4, 5)
6645             &&  name[len-1] == '8'
6646             && (   memBEGINs(name, len, "UTF")
6647                 || memBEGINs(name, len, "utf"))
6648             && (len == 4 || name[3] == '-'));
6649 }
6650 
6651 #  endif
6652 #endif  /* USE_LOCALE */
6653 
6654 bool
Perl__is_in_locale_category(pTHX_ const bool compiling,const int category)6655 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
6656 {
6657     /* Internal function which returns if we are in the scope of a pragma that
6658      * enables the locale category 'category'.  'compiling' should indicate if
6659      * this is during the compilation phase (TRUE) or not (FALSE). */
6660 
6661     const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
6662 
6663     SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
6664     if (! these_categories || these_categories == &PL_sv_placeholder) {
6665         return FALSE;
6666     }
6667 
6668     /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
6669      * a valid unsigned */
6670     assert(category >= -1);
6671     return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
6672 }
6673 
6674 /* my_strerror() returns a mortalized copy of the text of the error message
6675  * associated with 'errnum'.
6676  *
6677  * If not called from within the scope of 'use locale', it uses the text from
6678  * the C locale.  If Perl is compiled to not pay attention to LC_CTYPE nor
6679  * LC_MESSAGES, it uses whatever strerror() returns.  Otherwise the text is
6680  * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
6681  *
6682  * It returns in *utf8ness the result's UTF-8ness
6683  *
6684  * The function just calls strerror(), but temporarily switches locales, if
6685  * needed.  Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
6686  * CODESET in order for the return from strerror() to not contain '?' symbols,
6687  * or worse, mojibaked.  It's cheaper to just use the stricter criteria of
6688  * being in the same locale.  So the code below uses a common locale for both
6689  * categories.  Again, that is C if not within 'use locale' scope; or the
6690  * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
6691  * don't have LC_MESSAGES; and whatever strerror returns if we don't have
6692  * either category.
6693  *
6694  * There are two sets of implementations.  The first below is if we have
6695  * strerror_l().  This is the simpler.  We just use the already-built C locale
6696  * object if not in locale scope, or build up a custom one otherwise.
6697  *
6698  * When strerror_l() is not available, we may have to swap locales temporarily
6699  * to bring the two categories into sync with each other, and possibly to the C
6700  * locale.
6701  *
6702  * Because the prepropessing directives to conditionally compile this function
6703  * would greatly obscure the logic of the various implementations, the whole
6704  * function is repeated for each configuration, with some common macros. */
6705 
6706 /* Used to shorten the definitions of the following implementations of
6707  * my_strerror() */
6708 #define DEBUG_STRERROR_ENTER(errnum, in_locale)                             \
6709     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
6710                            "my_strerror called with errnum %d;"             \
6711                            " Within locale scope=%d\n",                     \
6712                            errnum, in_locale))
6713 #define DEBUG_STRERROR_RETURN(errstr, utf8ness)                             \
6714     DEBUG_Lv(PerlIO_printf(Perl_debug_log,                                  \
6715                            "Strerror returned; saving a copy: '%s';"        \
6716                            " utf8ness=%d\n",                                \
6717                            get_displayable_string(errstr,                   \
6718                                                   errstr + strlen(errstr),  \
6719                                                   *utf8ness),               \
6720                            (int) *utf8ness))
6721 
6722 /* On platforms that have precisely one of these categories (Windows
6723  * qualifies), these yield the correct one */
6724 #if defined(USE_LOCALE_CTYPE)
6725 #  define WHICH_LC_INDEX LC_CTYPE_INDEX_
6726 #elif defined(USE_LOCALE_MESSAGES)
6727 #  define WHICH_LC_INDEX LC_MESSAGES_INDEX_
6728 #endif
6729 
6730 /*==========================================================================*/
6731 /* First set of implementations, when have strerror_l() */
6732 
6733 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
6734 
6735 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
6736 
6737 /* Here, neither category is defined: use the C locale */
6738 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6739 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6740 {
6741     PERL_ARGS_ASSERT_MY_STRERROR;
6742 
6743     DEBUG_STRERROR_ENTER(errnum, 0);
6744 
6745     const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6746     *utf8ness = UTF8NESS_IMMATERIAL;
6747 
6748     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6749 
6750     SAVEFREEPV(errstr);
6751     return errstr;
6752 }
6753 
6754 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
6755 
6756 /*--------------------------------------------------------------------------*/
6757 
6758 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
6759  * are not within 'use locale' scope of the only one defined, we use the C
6760  * locale; otherwise use the current locale object */
6761 
6762 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6763 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6764 {
6765     PERL_ARGS_ASSERT_MY_STRERROR;
6766 
6767     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
6768 
6769     /* Use C if not within locale scope;  Otherwise, use current locale */
6770     const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
6771                                ? PL_C_locale_obj
6772                                : use_curlocale_scratch();
6773 
6774     const char *errstr = savepv(strerror_l(errnum, which_obj));
6775     *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
6776                                              NULL, WHICH_LC_INDEX);
6777     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6778 
6779     SAVEFREEPV(errstr);
6780     return errstr;
6781 }
6782 
6783 /*--------------------------------------------------------------------------*/
6784 #  else     /* Are using both categories.  Place them in the same CODESET,
6785              * either C or the LC_MESSAGES locale */
6786 
6787 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6788 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6789 {
6790     PERL_ARGS_ASSERT_MY_STRERROR;
6791 
6792     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
6793 
6794     const char *errstr;
6795     if (! IN_LC(LC_MESSAGES)) {    /* Use C if not within locale scope */
6796         errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6797         *utf8ness = UTF8NESS_IMMATERIAL;
6798     }
6799     else {  /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
6800                matches */
6801         locale_t cur = duplocale(use_curlocale_scratch());
6802 
6803         cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur);
6804         errstr = savepv(strerror_l(errnum, cur));
6805         *utf8ness = get_locale_string_utf8ness_i(errstr,
6806                                                  LOCALE_UTF8NESS_UNKNOWN,
6807                                                  NULL, LC_MESSAGES_INDEX_);
6808         freelocale(cur);
6809     }
6810 
6811     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6812 
6813     SAVEFREEPV(errstr);
6814     return errstr;
6815 }
6816 #  endif    /* Above is using strerror_l */
6817 /*==========================================================================*/
6818 #else       /* Below is not using strerror_l */
6819 #  if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
6820 
6821 /* If not using using either of the categories, return plain, unadorned
6822  * strerror */
6823 
6824 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6825 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6826 {
6827     PERL_ARGS_ASSERT_MY_STRERROR;
6828 
6829     DEBUG_STRERROR_ENTER(errnum, 0);
6830 
6831     const char *errstr = savepv(Strerror(errnum));
6832     *utf8ness = UTF8NESS_IMMATERIAL;
6833 
6834     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6835 
6836     SAVEFREEPV(errstr);
6837     return errstr;
6838 }
6839 
6840 /*--------------------------------------------------------------------------*/
6841 #  elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
6842 
6843 /* Here one or the other of CTYPE or MESSAGES is defined, but not both.  If we
6844  * are not within 'use locale' scope of the only one defined, we use the C
6845  * locale; otherwise use the current locale */
6846 
6847 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6848 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6849 {
6850     PERL_ARGS_ASSERT_MY_STRERROR;
6851 
6852     DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
6853 
6854     const char *errstr;
6855     if (IN_LC(categories[WHICH_LC_INDEX])) {
6856         errstr = savepv(Strerror(errnum));
6857         *utf8ness = get_locale_string_utf8ness_i(errstr,
6858                                                  LOCALE_UTF8NESS_UNKNOWN,
6859                                                  NULL, WHICH_LC_INDEX);
6860     }
6861     else {
6862 
6863         SETLOCALE_LOCK;
6864 
6865         const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
6866 
6867         errstr = savepv(Strerror(errnum));
6868 
6869         restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
6870 
6871         SETLOCALE_UNLOCK;
6872 
6873         *utf8ness = UTF8NESS_IMMATERIAL;
6874 
6875     }
6876 
6877     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6878 
6879     SAVEFREEPV(errstr);
6880     return errstr;
6881 }
6882 
6883 /*--------------------------------------------------------------------------*/
6884 #  else
6885 
6886 /* Below, have both LC_CTYPE and LC_MESSAGES.  Place them in the same CODESET,
6887  * either C or the LC_MESSAGES locale */
6888 
6889 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6890 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6891 {
6892     PERL_ARGS_ASSERT_MY_STRERROR;
6893 
6894     DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
6895 
6896     const char * desired_locale = (IN_LC(LC_MESSAGES))
6897                                   ? querylocale_c(LC_MESSAGES)
6898                                   : "C";
6899     /* XXX Can fail on z/OS */
6900 
6901     SETLOCALE_LOCK;
6902 
6903     const char* orig_CTYPE_locale    = toggle_locale_c(LC_CTYPE, desired_locale);
6904     const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
6905                                                        desired_locale);
6906     const char *errstr = savepv(Strerror(errnum));
6907 
6908     restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
6909     restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6910 
6911     SETLOCALE_UNLOCK;
6912 
6913     *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
6914                                              NULL, LC_MESSAGES_INDEX_);
6915     DEBUG_STRERROR_RETURN(errstr, utf8ness);
6916 
6917     SAVEFREEPV(errstr);
6918     return errstr;
6919 }
6920 
6921 /*--------------------------------------------------------------------------*/
6922 #  endif /* end of not using strerror_l() */
6923 #endif   /* end of all the my_strerror() implementations */
6924 
6925 /*
6926 
6927 =for apidoc switch_to_global_locale
6928 
6929 This function copies the locale state of the calling thread into the program's
6930 global locale, and converts the thread to use that global locale.
6931 
6932 It is intended so that Perl can safely be used with C libraries that access the
6933 global locale and which can't be converted to not access it.  Effectively, this
6934 means libraries that call C<L<setlocale(3)>> on non-Windows systems.  (For
6935 portability, it is a good idea to use it on Windows as well.)
6936 
6937 A downside of using it is that it disables the services that Perl provides to
6938 hide locale gotchas from your code.  The service you most likely will miss
6939 regards the radix character (decimal point) in floating point numbers.  Code
6940 executed after this function is called can no longer just assume that this
6941 character is correct for the current circumstances.
6942 
6943 To return to Perl control, and restart the gotcha prevention services, call
6944 C<L</sync_locale>>.  Behavior is undefined for any pure Perl code that executes
6945 while the switch is in effect.
6946 
6947 The global locale and the per-thread locales are independent.  As long as just
6948 one thread converts to the global locale, everything works smoothly.  But if
6949 more than one does, they can easily interfere with each other, and races are
6950 likely.  On Windows systems prior to Visual Studio 15 (at which point Microsoft
6951 fixed a bug), races can occur (even if only one thread has been converted to
6952 the global locale), but only if you use the following operations:
6953 
6954 =over
6955 
6956 =item L<POSIX::localeconv|POSIX/localeconv>
6957 
6958 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6959 
6960 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6961 
6962 =back
6963 
6964 The first item is not fixable (except by upgrading to a later Visual Studio
6965 release), but it would be possible to work around the latter two items by
6966 having Perl change its algorithm for calculating these to use Windows API
6967 functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches
6968 welcome.
6969 
6970 XS code should never call plain C<setlocale>, but should instead be converted
6971 to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in
6972 for the system C<setlocale>) or use the methods given in L<perlcall> to call
6973 L<C<POSIX::setlocale>|POSIX/setlocale>.  Either one will transparently properly
6974 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
6975 
6976 =cut
6977 */
6978 
6979 void
Perl_switch_to_global_locale(pTHX)6980 Perl_switch_to_global_locale(pTHX)
6981 {
6982 
6983 #ifdef USE_LOCALE
6984 
6985     DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n",
6986                                           get_LC_ALL_display()));
6987     bool perl_controls = false;
6988 
6989 #  ifdef USE_THREAD_SAFE_LOCALE
6990 
6991    /* In these cases, we use the system state to determine if we are in the
6992     * global locale or not. */
6993 
6994 #    ifdef USE_POSIX_2008_LOCALE
6995 
6996     perl_controls = (LC_GLOBAL_LOCALE != uselocale((locale_t) 0));
6997 
6998 #    elif defined(WIN32)
6999 
7000     perl_controls = (_configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE);
7001 
7002 #    else
7003 #      error Unexpected Configuration
7004 #    endif
7005 #  endif
7006 
7007     /* No-op if already in global */
7008     if (! perl_controls) {
7009         return;
7010     }
7011 
7012 #  ifdef USE_THREAD_SAFE_LOCALE
7013 #    if defined(WIN32)
7014 
7015     const char * thread_locale = posix_setlocale(LC_ALL, NULL);
7016     _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
7017     posix_setlocale(LC_ALL, thread_locale);
7018 
7019 #    else   /* Must be USE_POSIX_2008_LOCALE) */
7020 
7021     const char * cur_thread_locales[NOMINAL_LC_ALL_INDEX + 1];
7022 
7023     /* Save each category's current per-thread state */
7024     for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
7025         cur_thread_locales[i] = querylocale_i(i);
7026     }
7027 
7028     /* Now switch to global */
7029     DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Switching to global locale\n"));
7030 
7031     locale_t old_locale = uselocale(LC_GLOBAL_LOCALE);
7032     if (! old_locale) {
7033         locale_panic_("Could not change to global locale");
7034     }
7035 
7036     /* Free the per-thread memory */
7037     if (old_locale != LC_GLOBAL_LOCALE && old_locale != PL_C_locale_obj) {
7038         freelocale(old_locale);
7039     }
7040 
7041     /* Set the global to what was our per-thread state */
7042     POSIX_SETLOCALE_LOCK;
7043     for (unsigned int i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
7044         posix_setlocale(categories[i], cur_thread_locales[i]);
7045     }
7046     POSIX_SETLOCALE_UNLOCK;
7047 
7048 #    endif
7049 #  endif
7050 #  ifdef USE_LOCALE_NUMERIC
7051 
7052     /* Switch to the underlying C numeric locale; the application is on its
7053      * own. */
7054     POSIX_SETLOCALE_LOCK;
7055     posix_setlocale(LC_NUMERIC, PL_numeric_name);
7056     POSIX_SETLOCALE_UNLOCK;
7057 
7058 #  endif
7059 #endif
7060 
7061 }
7062 
7063 /*
7064 
7065 =for apidoc sync_locale
7066 
7067 This function copies the state of the program global locale into the calling
7068 thread, and converts that thread to using per-thread locales, if it wasn't
7069 already, and the platform supports them.  The LC_NUMERIC locale is toggled into
7070 the standard state (using the C locale's conventions), if not within the
7071 lexical scope of S<C<use locale>>.
7072 
7073 Perl will now consider itself to have control of the locale.
7074 
7075 Since unthreaded perls have only a global locale, this function is a no-op
7076 without threads.
7077 
7078 This function is intended for use with C libraries that do locale manipulation.
7079 It allows Perl to accommodate the use of them.  Call this function before
7080 transferring back to Perl space so that it knows what state the C code has left
7081 things in.
7082 
7083 XS code should not manipulate the locale on its own.  Instead,
7084 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
7085 change the locale (though changing the locale is antisocial and dangerous on
7086 multi-threaded systems that don't have multi-thread safe locale operations.
7087 (See L<perllocale/Multi-threaded operation>).
7088 
7089 Using the libc L<C<setlocale(3)>> function should be avoided.  Nevertheless,
7090 certain non-Perl libraries called from XS, do call it, and their behavior may
7091 not be able to be changed.  This function, along with
7092 C<L</switch_to_global_locale>>, can be used to get seamless behavior in these
7093 circumstances, as long as only one thread is involved.
7094 
7095 If the library has an option to turn off its locale manipulation, doing that is
7096 preferable to using this mechanism.  C<Gtk> is such a library.
7097 
7098 The return value is a boolean: TRUE if the global locale at the time of call
7099 was in effect for the caller; and FALSE if a per-thread locale was in effect.
7100 
7101 =cut
7102 */
7103 
7104 bool
Perl_sync_locale(pTHX)7105 Perl_sync_locale(pTHX)
7106 {
7107 
7108 #ifndef USE_LOCALE
7109 
7110     return TRUE;
7111 
7112 #else
7113 
7114     bool was_in_global = TRUE;
7115 
7116 #  ifdef USE_THREAD_SAFE_LOCALE
7117 #    if defined(WIN32)
7118 
7119     was_in_global = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
7120                                      == _DISABLE_PER_THREAD_LOCALE;
7121 
7122 #    elif defined(USE_POSIX_2008_LOCALE)
7123 
7124     was_in_global = (LC_GLOBAL_LOCALE == uselocale((locale_t) 0));
7125 
7126 #    else
7127 #      error Unexpected Configuration
7128 #    endif
7129 #  endif    /* USE_THREAD_SAFE_LOCALE */
7130 
7131     /* Here, we are in the global locale.  Get and save the values for each
7132      * category. */
7133     const char * current_globals[NOMINAL_LC_ALL_INDEX];
7134     for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
7135         POSIX_SETLOCALE_LOCK;
7136         current_globals[i] = savepv(stdized_setlocale(categories[i], NULL));
7137         POSIX_SETLOCALE_UNLOCK;
7138     }
7139 
7140     /* Now we have to convert the current thread to use them */
7141 
7142 #  if defined(WIN32)
7143 
7144     /* On Windows, convert to per-thread behavior.  This isn't necessary in
7145      * POSIX 2008, as the conversion gets done automatically in the loop below.
7146      * */
7147     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
7148 
7149 #  endif
7150 
7151     for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
7152         setlocale_i(i, current_globals[i]);
7153         Safefree(current_globals[i]);
7154     }
7155 
7156     /* And update our remaining records.  'true' => force recalculation */
7157     new_LC_ALL(NULL, true);
7158 
7159     return was_in_global;
7160 
7161 #endif
7162 
7163 }
7164 
7165 #if defined(DEBUGGING) && defined(USE_LOCALE)
7166 
7167 STATIC char *
S_my_setlocale_debug_string_i(pTHX_ const unsigned cat_index,const char * locale,const char * retval,const line_t line)7168 S_my_setlocale_debug_string_i(pTHX_
7169                               const unsigned cat_index,
7170                               const char* locale, /* Optional locale name */
7171 
7172                               /* return value from setlocale() when attempting
7173                                * to set 'category' to 'locale' */
7174                               const char* retval,
7175 
7176                               const line_t line)
7177 {
7178     /* Returns a pointer to a NUL-terminated string in static storage with
7179      * added text about the info passed in.  This is not thread safe and will
7180      * be overwritten by the next call, so this should be used just to
7181      * formulate a string to immediately print or savepv() on. */
7182 
7183     const char * locale_quote;
7184     const char * retval_quote;
7185 
7186     assert(cat_index <= NOMINAL_LC_ALL_INDEX);
7187 
7188     if (locale == NULL) {
7189         locale_quote = "";
7190         locale = "NULL";
7191     }
7192     else {
7193         locale_quote = "\"";
7194     }
7195 
7196     if (retval == NULL) {
7197         retval_quote = "";
7198         retval = "NULL";
7199     }
7200     else {
7201         retval_quote = "\"";
7202     }
7203 
7204 #  ifdef USE_LOCALE_THREADS
7205 #    define THREAD_FORMAT "%p:"
7206 #    define THREAD_ARGUMENT aTHX_
7207 #  else
7208 #    define THREAD_FORMAT
7209 #    define THREAD_ARGUMENT
7210 #  endif
7211 
7212     return Perl_form(aTHX_
7213                      "%s:%" LINE_Tf ": " THREAD_FORMAT
7214                      " setlocale(%s[%d], %s%s%s) returned %s%s%s\n",
7215 
7216                      __FILE__, line, THREAD_ARGUMENT
7217                      category_names[cat_index], categories[cat_index],
7218                      locale_quote, locale, locale_quote,
7219                      retval_quote, retval, retval_quote);
7220 }
7221 
7222 #endif
7223 #ifdef USE_PERL_SWITCH_LOCALE_CONTEXT
7224 
7225 void
Perl_switch_locale_context()7226 Perl_switch_locale_context()
7227 {
7228     /* libc keeps per-thread locale status information in some configurations.
7229      * So, we can't just switch out aTHX to switch to a new thread.  libc has
7230      * to follow along.  This routine does that based on per-interpreter
7231      * variables we keep just for this purpose */
7232 
7233     /* Can't use pTHX, because we may be called from a place where that
7234      * isn't available */
7235     dTHX;
7236 
7237     if (UNLIKELY(   aTHX == NULL
7238                  || PL_veto_switch_non_tTHX_context
7239                  || PL_phase == PERL_PHASE_CONSTRUCT))
7240     {
7241         return;
7242     }
7243 
7244 #  ifdef USE_POSIX_2008_LOCALE
7245 
7246     if (! uselocale(PL_cur_locale_obj)) {
7247         locale_panic_(Perl_form(aTHX_
7248                                 "Can't uselocale(%p), LC_ALL supposed to be '%s",
7249                                 PL_cur_locale_obj, get_LC_ALL_display()));
7250     }
7251 
7252 #  elif defined(WIN32)
7253 
7254     if (! bool_setlocale_c(LC_ALL, PL_cur_LC_ALL)) {
7255         locale_panic_(Perl_form(aTHX_ "Can't setlocale(%s)", PL_cur_LC_ALL));
7256     }
7257 
7258 #  endif
7259 
7260 }
7261 
7262 #endif
7263 
7264 void
Perl_thread_locale_init(pTHX)7265 Perl_thread_locale_init(pTHX)
7266 {
7267 
7268 #ifdef USE_THREAD_SAFE_LOCALE
7269 #  ifdef USE_POSIX_2008_LOCALE
7270 
7271     /* Called from a thread on startup.
7272      *
7273      * The operations here have to be done from within the calling thread, as
7274      * they affect libc's knowledge of the thread; libc has no knowledge of
7275      * aTHX */
7276 
7277      DEBUG_L(PerlIO_printf(Perl_debug_log,
7278                            "new thread, initial locale is %s;"
7279                            " calling setlocale(LC_ALL, \"C\")\n",
7280                            get_LC_ALL_display()));
7281 
7282     uselocale(PL_C_locale_obj);
7283 
7284 #  elif defined(WIN32)
7285 
7286     /* On Windows, make sure new thread has per-thread locales enabled */
7287     _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
7288     void_setlocale_c(LC_ALL, "C");
7289 
7290 #  endif
7291 #endif
7292 
7293 }
7294 
7295 void
Perl_thread_locale_term(pTHX)7296 Perl_thread_locale_term(pTHX)
7297 {
7298     /* Called from a thread as it gets ready to terminate.
7299      *
7300      * The operations here have to be done from within the calling thread, as
7301      * they affect libc's knowledge of the thread; libc has no knowledge of
7302      * aTHX */
7303 
7304 #ifdef USE_POSIX_2008_LOCALE
7305 
7306     /* C starts the new thread in the global C locale.  If we are thread-safe,
7307      * we want to not be in the global locale */
7308 
7309     /* Free up */
7310     locale_t actual_obj   = uselocale(LC_GLOBAL_LOCALE);
7311     if (actual_obj != LC_GLOBAL_LOCALE && actual_obj != PL_C_locale_obj) {
7312         freelocale(actual_obj);
7313     }
7314 
7315     /* Prevent leaks even if something has gone wrong */
7316     locale_t expected_obj = PL_cur_locale_obj;
7317     if (UNLIKELY(   expected_obj != actual_obj
7318                  && expected_obj != LC_GLOBAL_LOCALE
7319                  && expected_obj != PL_C_locale_obj))
7320     {
7321         freelocale(expected_obj);
7322     }
7323 
7324     PL_cur_locale_obj = LC_GLOBAL_LOCALE;
7325 
7326 #endif
7327 
7328 }
7329 
7330 /*
7331  * ex: set ts=8 sts=4 sw=4 et:
7332  */
7333