1 /* locale.c
2 *
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11 /*
12 * A Elbereth Gilthoniel,
13 * silivren penna míriel
14 * o menel aglar elenath!
15 * Na-chaered palan-díriel
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
21 */
22
23 /* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
25 *
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
35 *
36 * There is more than the typical amount of variation between platforms with
37 * regard to locale handling. At the end of these introductory comments, are
38 * listed various relevent Configuration options, including some that can be
39 * used to pretend to some extent that this is being developed on a different
40 * platform than it actually is. This allows you to make changes and catch
41 * some errors without having access to those other platforms.
42 *
43 * This code now has multi-thread-safe locale handling on systems that support
44 * that. This is completely transparent to most XS code. On earlier systems,
45 * it would be possible to emulate thread-safe locales, but this likely would
46 * involve a lot of locale switching, and would require XS code changes.
47 * Macros could be written so that the code wouldn't have to know which type of
48 * system is being used.
49 *
50 * Table-driven code is used for simplicity and clarity, as many operations
51 * differ only in which category is being worked on. However the system
52 * categories need not be small contiguous integers, so do not lend themselves
53 * to table lookup. Instead we have created our own equivalent values which
54 * are all small contiguous non-negative integers, and translation functions
55 * between the two sets. For category 'LC_foo', the name of our index is
56 * LC_foo_INDEX_. Various parallel tables, indexed by these, are used for the
57 * translation. The tables are generated at compile-time based on platform
58 * characteristics and Configure options. They hide from the code many of the
59 * vagaries of the different locale implementations out there.
60 *
61 * On unthreaded perls, most operations expand out to just the basic
62 * setlocale() calls. That sort of is true on threaded perls on modern Windows
63 * systems where the same API, after set up, is used for thread-safe locale
64 * handling. (But there are complications on Windows due to internal character
65 * set issues.) On other systems, there is a completely different API,
66 * specified in POSIX 2008, to do thread-safe locales. On these systems, our
67 * bool_setlocale_2008_i() function is used to hide the different API from the
68 * outside. This makes it completely transparent to most XS code.
69 *
70 * A huge complicating factor is that the LC_NUMERIC category is normally held
71 * in the C locale, except during those relatively rare times when it needs to
72 * be in the underlying locale. There is a bunch of code to accomplish this,
73 * and to allow easy switches from one state to the other.
74 *
75 * In addition, the setlocale equivalents have versions for the return context,
76 * 'void' and 'bool', besides the full return value. This can present
77 * opportunities for avoiding work. We don't have to necessarily create a safe
78 * copy to return if no return is desired.
79 *
80 * There are 3.5 major implementations here; which one chosen depends on what
81 * the platform has available, and Configuration options.
82 *
83 * 1) Raw posix_setlocale(). This implementation is basically the libc
84 * setlocale(), with possibly minor tweaks. This is used for startup, and
85 * always for unthreaded perls, and when the API for safe locale threading
86 * is identical to the unsafe API (Windows, currently).
87 *
88 * This implementation is composed of two layers:
89 * a) posix_setlocale() implements the libc setlocale(). In most cases,
90 * it is just an alias for the libc version. But Windows doesn't
91 * fully conform to the POSIX standard, and this is a layer on top of
92 * libc to bring it more into conformance. And in Configurations
93 * where perl is to ignore some locale categories that the libc
94 * setlocale() knows about, there is a layer to cope with that.
95 * b) stdized_setlocale() is a layer above a) that fixes some vagaries in
96 * the return value of the libc setlocale(). On most platforms this
97 * layer is empty; in order to be activated, it requires perl to be
98 * Configured with a parameter indicating the platform's defect. The
99 * current ones are listed at the definition of the macro.
100 *
101 * 2) An implementation that adds a minimal layer above implementation 1),
102 * making that implementation uninterruptible and returning a
103 * per-thread/per-category value.
104 *
105 * 3a and 3b) An implementation of POSIX 2008 thread-safe locale handling,
106 * hiding from the programmer the completely different API for this.
107 * This automatically makes almost all code thread-safe without need for
108 * changes. This implementation is chosen on threaded perls when the
109 * platform properly supports the POSIX 2008 functions, and when there is no
110 * manual override to the contrary passed to Configure.
111 *
112 * 3a) is when the platform has a documented reliable querylocale() function
113 * or equivalent that is selected to be used.
114 * 3b) is when we have to emulate that functionality.
115 *
116 * Unfortunately, it seems that some platforms that claim to support these
117 * are buggy, in one way or another. There are workarounds encoded here,
118 * where feasible, for platforms where the bugs are amenable to that
119 * (glibc, for example). But other platforms instead don't use this
120 * implementation.
121 *
122 * z/OS (os390) is an outlier. Locales really don't work under threads when
123 * either the radix character isn't a dot, or attempts are made to change
124 * locales after the first thread is created. The reason is that IBM has made
125 * it thread-safe by refusing to change locales (returning failure if
126 * attempted) any time after an application has called pthread_create() to
127 * create another thread. The expectation is that an application will set up
128 * its locale information before the first fork, and be stable thereafter. But
129 * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
130 * the other toggles, which are less common.
131 *
132 * Associated with each implementation are three sets of macros that translate
133 * a consistent API into what that implementation needs. Each set consists of
134 * three macros with the suffixes:
135 * _c Means the argument is a locale category number known at compile time.
136 * An example would be LC_TIME. This token is a compile-time constant
137 * and can be passed to a '_c' macro.
138 * _r Means the argument is a locale category number whose value might not be
139 * known until runtime
140 * _i Means the argument is our internal index of a locale category
141 *
142 * The three sets are: ('_X' means one of '_c', '_r', '_i')
143 * 1) bool_setlocale_X()
144 * This calls the appropriate setlocale()-equivalent for the
145 * implementation, with the category and new locale. The input locale is
146 * not necessarily valid, so the return is true or false depending on
147 * whether or not the setlocale() succeeded. This is not used for
148 * querying the locale, so the input locale must not be NULL.
149 *
150 * This macro is suitable for toggling the locale back and forth during an
151 * operation. For example, the names of days and months under LC_TIME are
152 * strings that are also subject to LC_CTYPE. If the locales of these two
153 * categories differ, mojibake can result on many platforms. The code
154 * here will toggle LC_CTYPE into the locale of LC_TIME temporarily to
155 * avoid this.
156 *
157 * Several categories require extra work when their locale is changed.
158 * LC_CTYPE, for example, requires the calculation of the table of which
159 * characters fold to which others under /i pattern matching or fc(), as
160 * folding is not a concept in POSIX. This table isn't needed when the
161 * LC_CTYPE locale gets toggled during an operation, and will be toggled
162 * back before return to the caller. To save work that would be
163 * discarded, the bool_setlocale_X() implementations don't do this extra
164 * work. Instead, there is a separate function for just this purpose to
165 * be done before control is transferred back to the external caller. All
166 * categories that have such requirements have such a function. The
167 * update_functions[] array contains pointers to them (or NULL for
168 * categories which don't need a function).
169 *
170 * Care must be taken to remember to call the separate function before
171 * returning to an external caller, and to not use things it updates
172 * before its call. An alternative approach would be to have
173 * bool_setlocale_X() always call the update, which would return
174 * immediately if a flag wasn't set indicating it was time to actually
175 * perform it.
176 *
177 * 2) void_setlocale_X()
178 * This is like bool_setlocale_X(), but it is used only when it is
179 * expected that the call must succeed, or something is seriously wrong.
180 * A panic is issued if it fails. The caller uses this form when it just
181 * wants to assume things worked.
182 *
183 * 3) querylocale_X()
184 * This returns a string that specifies the current locale for the given
185 * category given by the input argument. The string is safe from other
186 * threads zapping it, and the caller need not worry about freeing it, but
187 * it may be mortalized, so must be copied if you need to preserve it
188 * across calls, or long term. This returns the actual current locale,
189 * not the nominal. These differ, for example, when LC_NUMERIC is
190 * supposed to be a locale whose decimal radix character is a comma. As
191 * mentioned above, Perl actually keeps this category set to C in such
192 * circumstances so that XS code can just assume a dot radix character.
193 * querylocale_X() returns the locale that libc has stored at this moment,
194 * so most of the time will return a locale whose radix character is a
195 * dot. The macro query_nominal_locale_i() can be used to get the nominal
196 * locale that an external caller would expect, for all categories except
197 * LC_ALL. For that, you can use the function
198 * S_calculate_LC_ALL_string(). Or S_native_querylocale_i() will operate
199 * on any category.
200 *
201 * The underlying C API that this implements uses category numbers, hence the
202 * code is structured to use '_r' at the API level to convert to indexes, which
203 * are then used internally with the '_i' forms.
204 *
205 * The splitting apart into setting vs querying means that the return value of
206 * the bool macros is not subject to potential clashes with other threads,
207 * eliminating any need for the calling code to worry about that and get it
208 * wrong. Whereas, you do have to think about thread interactions when using a
209 * query.
210 *
211 * Additionally, for the implementations where there aren't any complications,
212 * a setlocale_i() is defined that is like plain setlocale(), returning the new
213 * locale. Thus it combines a bool_setlocale_X() with a querylocale_X(). It
214 * is used only for performance on implementations that allow it, such as
215 * non-threaded perls.
216 *
217 * There are also a few other macros herein that use this naming convention to
218 * describe their category parameter.
219 *
220 * Relevant Configure options
221 *
222 * -Accflags=-DNO_LOCALE
223 * This compiles perl to always use the C locale, ignoring any
224 * attempts to change it. This could be useful on platforms with a
225 * crippled locale implementation.
226 *
227 * -Accflags=-DNO_THREAD_SAFE_LOCALE
228 * Even if thread-safe operations are available on this platform and
229 * would otherwise be used (because this is a perl with multiplicity),
230 * perl is compiled to not use them. This could be useful on
231 * platforms where the libc is buggy.
232 *
233 * -Accflags=-DNO_POSIX_2008_LOCALE
234 * Even if the libc locale operations specified by the Posix 2008
235 * Standard are available on this platform and would otherwise be used
236 * (because this is a threaded perl), perl is compiled to not use
237 * them. This could be useful on platforms where the libc is buggy.
238 * This is like NO_THREAD_SAFE_LOCALE, but has no effect on platforms
239 * that don't have these functions.
240 *
241 * -Accflags=-DUSE_POSIX_2008_LOCALE
242 * Normally, setlocale() is used for locale operations on perls
243 * compiled without threads. This option causes the locale operations
244 * defined by the Posix 2008 Standard to always be used instead. This
245 * could be useful on platforms where the libc setlocale() is buggy.
246 *
247 * -Accflags=-DNO_THREAD_SAFE_QUERYLOCALE
248 * This applies only to platforms that have a querylocale() libc
249 * function. perl assumes that that function is thread-safe, unless
250 * overridden by this, typically in a hints file. When overridden,
251 * querylocale() is called only while the locale mutex is locked, and
252 * the result is copied to a per-thread place before unlocking.
253 *
254 * -Accflags=-DNO_USE_NL_LOCALE_NAME
255 * glibc has an undocumented equivalent function to querylocale(),
256 * which our experience indicates is reliable. But you can forbid its
257 * use by specifying this Configure option (with no effect on systems
258 * lacking it). When this is function is enabled, it removes the need
259 * for perl to keep its own records, hence is more efficient and
260 * guaranteed to be accurate.
261 *
262 * -Accflags=-DNO_LOCALE_CTYPE
263 * -Accflags=-DNO_LOCALE_NUMERIC
264 * etc.
265 *
266 * If the named category(ies) does(do) not exist on this platform,
267 * these have no effect. Otherwise they cause perl to be compiled to
268 * always keep the named category(ies) in the C locale.
269 *
270 * -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
271 * This would be set in a hints file to tell perl that doing a libc
272 * setlocale(LC_ALL, NULL)
273 * can give erroneous results, and perl will compensate to get the
274 * correct results. This is known to be a problem in earlier AIX
275 * versions
276 *
277 * -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
278 * This would be set in a hints file to tell perl that a libc
279 * setlocale() can return results containing \n characters that need
280 * to be stripped off. khw believes there aren't any such platforms
281 * still in existence.
282 *
283 * -Accflags=-DLIBC_HANDLES_MISMATCHED_CTYPE
284 * Consider the name of a month in some language, Chinese for example.
285 * If LC_TIME has been set to a Chinese locale, strftime() can be used
286 * to generate the Chinese month name for any given date, by using the
287 * %B format. But also suppose that LC_CTYPE is set to, say, "C".
288 * The return from strftime() on many platforms will be mojibake given
289 * that no Chinese month name is composed of just ASCII characters.
290 * Perl handles this for you by automatically toggling LC_CTYPE to
291 * whatever LC_TIME is during the execution of strftime(), and
292 * afterwards restoring it to its prior value. But the strftime()
293 * (and similar functions) in some libc implementations already do
294 * this toggle, meaning perl's action is redundant. You can tell perl
295 * that a libc does this by setting this Configure option, and it will
296 * skip its syncing LC_CTYPE and whatever the other locale is.
297 * Currently, perl ignores this Configuration option and syncs anyway
298 * for LC_COLLATE-related operations, due to perl's internal needs.
299 *
300 * -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
301 * This is used when developing Perl on a platform that uses
302 * 'name=value;' notation to represent LC_ALL when not all categories
303 * are the same. When so compiled, much of the code gets compiled
304 * and exercised that applies to platforms that instead use positional
305 * notation. This allows for finding many bugs in that portion of the
306 * implementation, without having to access such a platform.
307 *
308 * -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES
309 * This is used when developing Perl on a non-Windows platform to
310 * compile and exercise much of the locale-related code that instead
311 * applies to MingW platforms that don't use the more modern UCRT
312 * library. This allows for finding many bugs in that portion of the
313 * implementation, without having to access such a platform.
314 */
315
316 /* If the environment says to, we can output debugging information during
317 * initialization. This is done before option parsing, and before any thread
318 * creation, so can be a file-level static. (Must come before #including
319 * perl.h) */
320 #include "config.h"
321
322 /* Returns the Unix errno portion; ignoring any others. This is a macro here
323 * instead of putting it into perl.h, because unclear to khw what should be
324 * done generally. */
325 #define GET_ERRNO saved_errno
326
327 #ifdef DEBUGGING
328 static int debug_initialization = 0;
329 # define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
330 # define DEBUG_LOCALE_INITIALIZATION_ debug_initialization
331
332 # ifdef HAS_EXTENDED_OS_ERRNO
333 /* Output the non-zero errno and/or the non-zero extended errno */
334 # define DEBUG_ERRNO \
335 dSAVE_ERRNO; dTHX; \
336 int extended = get_extended_os_errno(); \
337 const char * errno_string; \
338 if (GET_ERRNO == 0) { /* Skip output if both errno types are 0 */ \
339 if (LIKELY(extended == 0)) errno_string = ""; \
340 else errno_string = Perl_form(aTHX_ "; $^E=%d", extended); \
341 } \
342 else if (LIKELY(extended == GET_ERRNO)) \
343 errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO); \
344 else errno_string = Perl_form(aTHX_ "; $!=%d, $^E=%d", \
345 GET_ERRNO, extended);
346 # else
347 /* Output the errno, if non-zero */
348 # define DEBUG_ERRNO \
349 dSAVE_ERRNO; \
350 const char * errno_string = ""; \
351 if (GET_ERRNO != 0) { \
352 dTHX; \
353 errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO); \
354 }
355 # endif
356
357 /* Automatically include the caller's file, and line number in debugging output;
358 * and the errno (and/or extended errno) if non-zero. On threaded perls add
359 * the aTHX too. */
360 # if defined(MULTIPLICITY) && ! defined(NO_LOCALE_THREADS)
361 # define DEBUG_PRE_STMTS \
362 DEBUG_ERRNO; \
363 PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": 0x%p%s: ", \
364 __FILE__, (line_t)__LINE__, aTHX_ \
365 errno_string);
366 # else
367 # define DEBUG_PRE_STMTS \
368 DEBUG_ERRNO; \
369 PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf "%s: ", \
370 __FILE__, (line_t)__LINE__, \
371 errno_string);
372 # endif
373 # define DEBUG_POST_STMTS RESTORE_ERRNO;
374 #else
375 # define debug_initialization 0
376 # define DEBUG_INITIALIZATION_set(v)
377 # define DEBUG_PRE_STMTS
378 # define DEBUG_POST_STMTS
379 #endif
380
381 #include "EXTERN.h"
382 #define PERL_IN_LOCALE_C
383 #include "perl.h"
384
385 /* Some platforms require LC_CTYPE to be congruent with the category we are
386 * looking for. XXX This still presumes that we have to match COLLATE and
387 * CTYPE even on platforms that apparently handle this. */
388 #if defined(USE_LOCALE_CTYPE) && ! defined(LIBC_HANDLES_MISMATCHED_CTYPE)
389 # define WE_MUST_DEAL_WITH_MISMATCHED_CTYPE /* no longer used; kept for
390 possible future use */
391 # define start_DEALING_WITH_MISMATCHED_CTYPE(locale) \
392 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale)
393 # define end_DEALING_WITH_MISMATCHED_CTYPE(locale) \
394 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
395 #else
396 # define start_DEALING_WITH_MISMATCHED_CTYPE(locale)
397 # define end_DEALING_WITH_MISMATCHED_CTYPE(locale)
398 #endif
399
400 #ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
401
402 /* Use -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES on a POSIX or *nix box
403 * to get a semblance of pretending the locale handling is that of a MingW
404 * that doesn't use UCRT (hence 'OLD' in the name). This exercizes code
405 * paths that are not compiled on non-Windows boxes, and allows for ASAN
406 * and PERL_MEMLOG. This is thus a way to see if locale.c on Windows is
407 * likely going to compile, without having to use a real Win32 box. And
408 * running the test suite will verify to a large extent our logic and memory
409 * allocation handling for such boxes. Of course the underlying calls are
410 * to the POSIX libc, so any differences in implementation between those and
411 * the Windows versions will not be caught by this. */
412
413 # define WIN32
414 # undef P_CS_PRECEDES
415 # undef CURRENCY_SYMBOL
416 # define CP_UTF8 -1
417 # undef _configthreadlocale
418 # define _configthreadlocale(arg) NOOP
419
420 # define MultiByteToWideChar(cp, flags, byte_string, m1, wstring, req_size) \
421 (PERL_UNUSED_ARG(cp), \
422 mbsrtowcs(wstring, &(byte_string), req_size, NULL) + 1)
423 # define WideCharToMultiByte(cp, flags, wstring, m1, byte_string, \
424 req_size, default_char, found_default_char) \
425 (PERL_UNUSED_ARG(cp), \
426 wcsrtombs(byte_string, &(wstring), req_size, NULL) + 1)
427
428 # ifdef USE_LOCALE
429
430 static const wchar_t * wsetlocale_buf = NULL;
431 static Size_t wsetlocale_buf_size = 0;
432
433 # ifdef MULTIPLICITY
434
435 static PerlInterpreter * wsetlocale_buf_aTHX = NULL;
436
437 # endif
438
439 STATIC
440 const wchar_t *
S_wsetlocale(const int category,const wchar_t * wlocale)441 S_wsetlocale(const int category, const wchar_t * wlocale)
442 {
443 /* Windows uses a setlocale that takes a wchar_t* locale. Other boxes
444 * don't have this, so this Windows replacement converts the wchar_t input
445 * to plain 'char*', calls plain setlocale(), and converts the result back
446 * to 'wchar_t*' */
447
448 const char * byte_locale = NULL;
449 if (wlocale) {
450 byte_locale = Win_wstring_to_byte_string(CP_UTF8, wlocale);
451 }
452
453 const char * byte_result = setlocale(category, byte_locale);
454 Safefree(byte_locale);
455 if (byte_result == NULL) {
456 return NULL;
457 }
458
459 const wchar_t * wresult = Win_byte_string_to_wstring(CP_UTF8, byte_result);
460
461 if (! wresult) {
462 return NULL;
463 }
464
465 /* Emulate a global static memory return from wsetlocale(). This currently
466 * leaks at process end; would require changing LOCALE_TERM to fix that */
467 Size_t string_size = wcslen(wresult) + 1;
468
469 if (wsetlocale_buf_size == 0) {
470 Newx(wsetlocale_buf, string_size, wchar_t);
471 wsetlocale_buf_size = string_size;
472
473 # ifdef MULTIPLICITY
474
475 dTHX;
476 wsetlocale_buf_aTHX = aTHX;
477
478 # endif
479
480 }
481 else if (string_size > wsetlocale_buf_size) {
482 Renew(wsetlocale_buf, string_size, wchar_t);
483 wsetlocale_buf_size = string_size;
484 }
485
486 Copy(wresult, wsetlocale_buf, string_size, wchar_t);
487 Safefree(wresult);
488
489 return wsetlocale_buf;
490 }
491
492 # define _wsetlocale(category, wlocale) S_wsetlocale(category, wlocale)
493 # endif
494 #endif /* WIN32_USE_FAKE_OLD_MINGW_LOCALES */
495
496 /* 'for' loop headers to hide the necessary casts */
497 #define for_category_indexes_between(i, m, n) \
498 for (locale_category_index i = (locale_category_index) (m); \
499 i <= (locale_category_index) (n); \
500 i = (locale_category_index) ((int) i + 1))
501 #define for_all_individual_category_indexes(i) \
502 for_category_indexes_between(i, 0, LC_ALL_INDEX_ - 1)
503 #define for_all_but_0th_individual_category_indexes(i) \
504 for_category_indexes_between(i, 1, LC_ALL_INDEX_ - 1)
505 #define for_all_category_indexes(i) \
506 for_category_indexes_between(i, 0, LC_ALL_INDEX_)
507
508 #ifdef USE_LOCALE
509 # if defined(USE_FAKE_LC_ALL_POSITIONAL_NOTATION) && defined(LC_ALL)
510
511 /* This simulates an underlying positional notation for LC_ALL when compiled on
512 * a system that uses name=value notation. Use this to develop on Linux and
513 * make a quick check that things have some chance of working on a positional
514 * box. Enable by adding to the Congfigure parameters:
515 * -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
516 *
517 * NOTE it redefines setlocale() and usequerylocale()
518 * */
519
520 STATIC const char *
S_positional_name_value_xlation(const char * locale,bool direction)521 S_positional_name_value_xlation(const char * locale, bool direction)
522 { /* direction == 1 is from name=value to positional
523 direction == 0 is from positional to name=value */
524 assert(locale);
525
526 dTHX;
527 const char * individ_locales[LC_ALL_INDEX_] = { NULL };
528
529 /* This parses either notation */
530 switch (parse_LC_ALL_string(locale,
531 (const char **) &individ_locales,
532 no_override, /* Handled by other code */
533 false, /* Return only [0] if suffices */
534 false, /* Don't panic on error */
535 __LINE__))
536 {
537 default: /* Some compilers don't realize that below is the complete
538 list of the available enum values */
539 case invalid:
540 return NULL;
541
542 case no_array:
543 return locale;
544 case only_element_0:
545 SAVEFREEPV(individ_locales[0]);
546 return individ_locales[0];
547 case full_array:
548 {
549 calc_LC_ALL_format format = (direction)
550 ? EXTERNAL_FORMAT_FOR_SET
551 : INTERNAL_FORMAT;
552 const char * retval = calculate_LC_ALL_string(individ_locales,
553 format,
554 WANT_TEMP_PV,
555 __LINE__);
556
557 for_all_individual_category_indexes(i) {
558 Safefree(individ_locales[i]);
559 }
560
561 return retval;
562 }
563 }
564 }
565
566 STATIC const char *
S_positional_setlocale(int cat,const char * locale)567 S_positional_setlocale(int cat, const char * locale)
568 {
569 if (cat != LC_ALL) return setlocale(cat, locale);
570
571 if (locale && strNE(locale, "")) {
572 locale = S_positional_name_value_xlation(locale, 0);
573 if (! locale) return NULL;
574 }
575
576 locale = setlocale(cat, locale);
577 if (locale == NULL) return NULL;
578 return S_positional_name_value_xlation(locale, 1);
579 }
580
581 # undef setlocale
582 # define setlocale(a,b) S_positional_setlocale(a,b)
583 # ifdef USE_POSIX_2008_LOCALE
584
585 STATIC locale_t
S_positional_newlocale(int mask,const char * locale,locale_t base)586 S_positional_newlocale(int mask, const char * locale, locale_t base)
587 {
588 assert(locale);
589
590 if (mask != LC_ALL_MASK) return newlocale(mask, locale, base);
591
592 if (strNE(locale, "")) locale = S_positional_name_value_xlation(locale, 0);
593 if (locale == NULL) return NULL;
594 return newlocale(LC_ALL_MASK, locale, base);
595 }
596
597 # undef newlocale
598 # define newlocale(a,b,c) S_positional_newlocale(a,b,c)
599 # endif
600 # endif
601 #endif /* End of fake positional notation */
602
603 #include "reentr.h"
604
605 #ifdef I_WCHAR
606 # include <wchar.h>
607 #endif
608 #ifdef I_WCTYPE
609 # include <wctype.h>
610 #endif
611
612 /* The main errno that gets used is this one, on platforms that support it */
613 #ifdef EINVAL
614 # define SET_EINVAL SETERRNO(EINVAL, LIB_INVARG)
615 #else
616 # define SET_EINVAL
617 #endif
618
619 /* This is a starting guess as to when this is true. It definititely isn't
620 * true on *BSD where positional LC_ALL notation is used. Likely this will end
621 * up being defined in hints files. */
622 #ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
623 # define NEWLOCALE_HANDLES_DISPARATE_LC_ALL
624 #endif
625
626 /* But regardless, we have to look at individual categories if some are
627 * ignored. */
628 #ifdef HAS_IGNORED_LOCALE_CATEGORIES_
629 # undef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
630 #endif
631 #ifdef USE_LOCALE
632
633 /* Not all categories need be set to the same locale. This macro determines if
634 * 'name' which represents LC_ALL is uniform or disparate. There are two
635 * situations: 1) the platform uses unordered name=value pairs; 2) the platform
636 * uses ordered positional values, with a separator string between them */
637 # ifdef PERL_LC_ALL_SEPARATOR /* positional */
638 # define is_disparate_LC_ALL(name) cBOOL(instr(name, PERL_LC_ALL_SEPARATOR))
639 # else /* name=value */
640
641 /* In the, hopefully never occurring, event that the platform doesn't use
642 * either mechanism for disparate LC_ALL's, assume the name=value pairs
643 * form, rather than taking the extreme step of refusing to compile. Many
644 * programs won't have disparate locales, so will generally work */
645 # define PERL_LC_ALL_SEPARATOR ";"
646 # define is_disparate_LC_ALL(name) cBOOL( strchr(name, ';') \
647 && strchr(name, '='))
648 # endif
649
650 /* It is possible to compile perl to always keep any individual category in the
651 * C locale. This would be done where the implementation on a platform is
652 * flawed or incomplete. At the time of this writing, for example, OpenBSD has
653 * not implemented LC_COLLATE beyond the C locale. The 'category_available[]'
654 * table is a bool that says whether a category is changeable, or must be kept
655 * in C. This macro substitutes C for the locale appropriately, expanding to
656 * nothing on the more typical case where all possible categories present on
657 * the platform are handled. */
658 # if defined(HAS_IGNORED_LOCALE_CATEGORIES_) \
659 || defined(HAS_MISSING_LANGINFO_ITEM_)
660 # define need_to_override_category(i) (! category_available[i])
661 # define override_ignored_category(i, new_locale) \
662 ((need_to_override_category(i)) ? "C" : (new_locale))
663 # else
664 # define need_to_override_category(i) 0
665 # define override_ignored_category(i, new_locale) (new_locale)
666 # endif
667
668 PERL_STATIC_INLINE const char *
S_mortalized_pv_copy(pTHX_ const char * const pv)669 S_mortalized_pv_copy(pTHX_ const char * const pv)
670 {
671 PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
672
673 /* Copies the input pv, and arranges for it to be freed at an unspecified
674 * later time. */
675
676 if (pv == NULL) {
677 return NULL;
678 }
679
680 const char * copy = savepv(pv);
681 SAVEFREEPV(copy);
682 return copy;
683 }
684
685 #endif
686
687 /* Default values come from the C locale */
688 #define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
689 a single instance, so is a #define */
690 static const char C_decimal_point[] = ".";
691
692 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
693 # define HAS_SOME_LANGINFO
694 #endif
695
696 #if (defined(USE_LOCALE_NUMERIC) && ! defined(TS_W32_BROKEN_LOCALECONV)) \
697 || ! ( defined(USE_LOCALE_NUMERIC) \
698 && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)))
699 static const char C_thousands_sep[] = "";
700 #endif
701
702 /* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
703 * return of setlocale(), then this is extremely likely to be the C or POSIX
704 * locale. However, the output of setlocale() is documented to be opaque, but
705 * the odds are extremely small that it would return these two strings for some
706 * other locale. Note that VMS includes many non-ASCII characters in these two
707 * locales as controls and punctuation (below are hex bytes):
708 * cntrl: 84-97 9B-9F
709 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
710 * Oddly, none there are listed as alphas, though some represent alphabetics
711 * https://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
712 #define isNAME_C_OR_POSIX(name) \
713 ( (name) != NULL \
714 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
715 || strEQ((name), "POSIX")))
716
717 /* If this interface to nl_langinfo() isn't defined by embed.fnc, it means it
718 * isn't available on this platform, so instead emulate it */
719 #ifndef langinfo_sv_i
720 # define langinfo_sv_i(i, c, l, s, u) \
721 (PERL_UNUSED_VAR(c), emulate_langinfo(i, l, s, u))
722 #endif
723
724 /* In either case, create a version that takes things like 'LC_NUMERIC' as a
725 * parameter */
726 #define langinfo_sv_c(item, category, locale, sv, utf8ness) \
727 langinfo_sv_i(item, category##_INDEX_, locale, sv, utf8ness)
728
729 /* The normal method for interfacing with nl_langinfo() in this file is to use
730 * a scratch buffer (whose existence is hidden from the caller by these
731 * macros). */
732 #define langinfo_i(item, index, locale, utf8ness) \
733 langinfo_sv_i(item, index, locale, PL_scratch_langinfo, utf8ness)
734
735 #define langinfo_c(item, category, locale, utf8ness) \
736 langinfo_i(item, category##_INDEX_, locale, utf8ness)
737
738 #ifndef USE_LOCALE /* A no-op unless locales are enabled */
739 # define toggle_locale_i(index, locale) \
740 ((const char *) (PERL_UNUSED_VAR(locale), NULL))
741 # define restore_toggled_locale_i(index, locale) PERL_UNUSED_VAR(locale)
742 #else
743 # define toggle_locale_i(index, locale) \
744 S_toggle_locale_i(aTHX_ index, locale, __LINE__)
745 # define restore_toggled_locale_i(index, locale) \
746 S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
747 #endif
748
749 # define toggle_locale_c(cat, locale) toggle_locale_i(cat##_INDEX_, locale)
750 # define restore_toggled_locale_c(cat, locale) \
751 restore_toggled_locale_i(cat##_INDEX_, locale)
752 #ifdef USE_LOCALE
753 # ifdef DEBUGGING
754 # define setlocale_debug_string_i(index, locale, result) \
755 my_setlocale_debug_string_i(index, locale, result, __LINE__)
756 # define setlocale_debug_string_c(category, locale, result) \
757 setlocale_debug_string_i(category##_INDEX_, locale, result)
758 # define setlocale_debug_string_r(category, locale, result) \
759 setlocale_debug_string_i(get_category_index(category), \
760 locale, result)
761 # endif
762
763 /* On systems without LC_ALL, pretending it exists anyway simplifies things.
764 * Choose a value for it that is very unlikely to clash with any actual
765 * category */
766 # define FAKE_LC_ALL PERL_INT_MIN
767
768 /* Below are parallel arrays for locale information indexed by our mapping of
769 * category numbers into small non-negative indexes. locale_table.h contains
770 * an entry like this for each individual category used on this system:
771 * PERL_LOCALE_TABLE_ENTRY(CTYPE, S_new_ctype)
772 *
773 * Each array redefines PERL_LOCALE_TABLE_ENTRY to generate the information
774 * needed for that array, and #includes locale_table.h to get the valid
775 * categories.
776 *
777 * An entry for the conglomerate category LC_ALL is added here, immediately
778 * following the individual categories. (The treatment for it varies, so can't
779 * be in locale_table.h.)
780 *
781 * Following this, each array ends with an entry for illegal categories. All
782 * category numbers unknown to perl get mapped to this entry. This is likely
783 * to be a parameter error from the calling program; but it could be that this
784 * platform has a category we don't know about, in which case it needs to be
785 * added, using the paradigm of one of the existing categories. */
786
787 /* The first array is the locale categories perl uses on this system, used to
788 * map our index back to the system's category number. */
789 STATIC const int categories[] = {
790
791 # undef PERL_LOCALE_TABLE_ENTRY
792 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) LC_ ## name,
793 # include "locale_table.h"
794
795 # ifdef LC_ALL
796 LC_ALL,
797 # else
798 FAKE_LC_ALL,
799 # endif
800
801 (FAKE_LC_ALL + 1) /* Entry for unknown category; this number is unlikely
802 to clash with a real category */
803 };
804
805 # define GET_NAME_AS_STRING(token) # token
806 # define GET_LC_NAME_AS_STRING(token) GET_NAME_AS_STRING(LC_ ## token)
807
808 /* The second array is the category names. */
809 STATIC const char * const category_names[] = {
810
811 # undef PERL_LOCALE_TABLE_ENTRY
812 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) GET_LC_NAME_AS_STRING(name),
813 # include "locale_table.h"
814
815 # ifdef LC_ALL
816 # define LC_ALL_STRING "LC_ALL"
817 # else
818 # define LC_ALL_STRING "If you see this, it is a bug in perl;" \
819 " please report it via perlbug"
820 # endif
821
822 LC_ALL_STRING,
823
824 # define LC_UNKNOWN_STRING "Locale category unknown to Perl; if you see" \
825 " this, it is a bug in perl; please report it" \
826 " via perlbug"
827 LC_UNKNOWN_STRING
828 };
829
830 STATIC const Size_t category_name_lengths[] = {
831
832 # undef PERL_LOCALE_TABLE_ENTRY
833 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) \
834 STRLENs(GET_LC_NAME_AS_STRING(name)),
835 # include "locale_table.h"
836
837 STRLENs(LC_ALL_STRING),
838 STRLENs(LC_UNKNOWN_STRING)
839 };
840
841 /* Each entry includes space for the '=' and ';' */
842 # undef PERL_LOCALE_TABLE_ENTRY
843 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) \
844 + STRLENs(GET_LC_NAME_AS_STRING(name)) + 2
845
846 STATIC const Size_t lc_all_boiler_plate_length = 1 /* space for trailing NUL */
847 # include "locale_table.h"
848 ;
849
850 /* A few categories require additional setup when they are changed. This table
851 * points to the functions that do that setup */
852 STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = {
853
854 # undef PERL_LOCALE_TABLE_ENTRY
855 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) call_back,
856 # include "locale_table.h"
857
858 S_new_LC_ALL,
859 NULL, /* No update for unknown category */
860 };
861
862 # if defined(HAS_IGNORED_LOCALE_CATEGORIES_) \
863 || defined(HAS_MISSING_LANGINFO_ITEM_)
864
865 /* Indicates if each category on this platform is available to use not in
866 * the C locale */
867 STATIC const bool category_available[] = {
868
869 # undef PERL_LOCALE_TABLE_ENTRY
870 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) LC_ ## name ## _AVAIL_,
871 # include "locale_table.h"
872
873 # ifdef LC_ALL
874 true,
875 # else
876 false,
877 # endif
878
879 false /* LC_UNKNOWN_AVAIL_ */
880 };
881
882 # endif
883 # if defined(USE_POSIX_2008_LOCALE)
884
885 STATIC const int category_masks[] = {
886
887 # undef PERL_LOCALE_TABLE_ENTRY
888 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) LC_ ## name ## _MASK,
889 # include "locale_table.h"
890
891 LC_ALL_MASK, /* Will rightly refuse to compile unless this is defined */
892 0 /* Empty mask for unknown category */
893 };
894
895 # endif
896 # if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS)
897
898 /* On platforms that use positional notation for expressing LC_ALL, this maps
899 * the position of each category to our corresponding internal index for it.
900 * This is initialized at run time if needed. LC_ALL_INDEX_ is not legal for
901 * an individual locale, hence marks the elements here as not actually
902 * initialized. */
903 STATIC
904 unsigned int
905 map_LC_ALL_position_to_index[LC_ALL_INDEX_] = { LC_ALL_INDEX_ };
906
907 # endif
908 #endif
909 #if defined(USE_LOCALE) || defined(DEBUGGING)
910
911 STATIC const char *
S_get_displayable_string(pTHX_ const char * const s,const char * const e,const bool is_utf8)912 S_get_displayable_string(pTHX_
913 const char * const s,
914 const char * const e,
915 const bool is_utf8)
916 {
917 PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING;
918
919 if (e <= s) {
920 return "";
921 }
922
923 const char * t = s;
924 bool prev_was_printable = TRUE;
925 bool first_time = TRUE;
926 char * ret;
927
928 /* Worst case scenario: All are non-printable so have a blank between each.
929 * If UTF-8, all are the largest possible code point; otherwise all are a
930 * single byte. '(2 + 1)' is from each byte takes 2 characters to
931 * display, and a blank (or NUL for the final one) after it */
932 const Size_t size = (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1);
933 Newxz(ret, size, char);
934 SAVEFREEPV(ret);
935
936 while (t < e) {
937 UV cp = (is_utf8)
938 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
939 : * (U8 *) t;
940 if (isPRINT(cp)) {
941 if (! prev_was_printable) {
942 my_strlcat(ret, " ", size);
943 }
944
945 /* Escape these to avoid any ambiguity */
946 if (cp == ' ' || cp == '\\') {
947 my_strlcat(ret, "\\", size);
948 }
949 my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), size);
950 prev_was_printable = TRUE;
951 }
952 else {
953 if (! first_time) {
954 my_strlcat(ret, " ", size);
955 }
956 my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), size);
957 prev_was_printable = FALSE;
958 }
959 t += (is_utf8) ? UTF8SKIP(t) : 1;
960 first_time = FALSE;
961 }
962
963 return ret;
964 }
965
966 #endif
967 #ifdef USE_LOCALE
968
969 # define get_category_index(cat) get_category_index_helper(cat, NULL, __LINE__)
970
971 STATIC locale_category_index
S_get_category_index_helper(pTHX_ const int category,bool * succeeded,const line_t caller_line)972 S_get_category_index_helper(pTHX_ const int category, bool * succeeded,
973 const line_t caller_line)
974 {
975 PERL_ARGS_ASSERT_GET_CATEGORY_INDEX_HELPER;
976
977 /* Given a category, return the equivalent internal index we generally use
978 * instead, warn or panic if not found. */
979
980 locale_category_index i;
981
982 # undef PERL_LOCALE_TABLE_ENTRY
983 # define PERL_LOCALE_TABLE_ENTRY(name, call_back) \
984 case LC_ ## name: i = LC_ ## name ## _INDEX_; break;
985
986 switch (category) {
987
988 # include "locale_table.h"
989 # ifdef LC_ALL
990 case LC_ALL: i = LC_ALL_INDEX_; break;
991 # endif
992
993 default: goto unknown_locale;
994 }
995
996 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
997 "index of category %d (%s) is %d;"
998 " called from %" LINE_Tf "\n",
999 category, category_names[i], i, caller_line));
1000
1001 if (succeeded) {
1002 *succeeded = true;
1003 }
1004
1005 return i;
1006
1007 unknown_locale:
1008
1009 if (succeeded) {
1010 *succeeded = false;
1011 return LC_ALL_INDEX_; /* Arbitrary */
1012 }
1013
1014 locale_panic_via_(Perl_form(aTHX_ "Unknown locale category %d", category),
1015 __FILE__, caller_line);
1016 NOT_REACHED; /* NOTREACHED */
1017 }
1018
1019 #endif /* ifdef USE_LOCALE */
1020
1021 void
Perl_force_locale_unlock(pTHX)1022 Perl_force_locale_unlock(pTHX)
1023 {
1024 /* Remove any locale mutex, in preperation for an inglorious termination,
1025 * typically a panic */
1026
1027 #if defined(USE_LOCALE_THREADS)
1028
1029 /* If recursively locked, clear all at once */
1030 if (PL_locale_mutex_depth > 1) {
1031 PL_locale_mutex_depth = 1;
1032 }
1033
1034 if (PL_locale_mutex_depth > 0) {
1035 LOCALE_UNLOCK_;
1036 }
1037
1038 #endif
1039
1040 }
1041
1042 #ifdef USE_POSIX_2008_LOCALE
1043
1044 STATIC locale_t
S_use_curlocale_scratch(pTHX)1045 S_use_curlocale_scratch(pTHX)
1046 {
1047 /* This function is used to hide from the caller the case where the current
1048 * locale_t object in POSIX 2008 is the global one, which is illegal in
1049 * many of the P2008 API calls. This checks for that and, if necessary
1050 * creates a proper P2008 object. Any prior object is deleted, as is any
1051 * remaining object during global destruction. */
1052
1053 locale_t cur = uselocale((locale_t) 0);
1054
1055 if (cur != LC_GLOBAL_LOCALE) {
1056 return cur;
1057 }
1058
1059 if (PL_scratch_locale_obj) {
1060 freelocale(PL_scratch_locale_obj);
1061 }
1062
1063 PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
1064 return PL_scratch_locale_obj;
1065 }
1066
1067 #endif
1068
1069 void
Perl_locale_panic(const char * msg,const line_t immediate_caller_line,const char * const higher_caller_file,const line_t higher_caller_line)1070 Perl_locale_panic(const char * msg,
1071 const line_t immediate_caller_line,
1072 const char * const higher_caller_file,
1073 const line_t higher_caller_line)
1074 {
1075 PERL_ARGS_ASSERT_LOCALE_PANIC;
1076 dTHX;
1077 dSAVE_ERRNO;
1078
1079 force_locale_unlock();
1080
1081 #ifdef USE_C_BACKTRACE
1082 dump_c_backtrace(Perl_debug_log, 20, 1);
1083 #endif
1084
1085 const char * called_by = "";
1086 if ( strNE(__FILE__, higher_caller_file)
1087 || immediate_caller_line != higher_caller_line)
1088 {
1089 called_by = Perl_form(aTHX_ "\nCalled by %s: %" LINE_Tf "\n",
1090 higher_caller_file, higher_caller_line);
1091 }
1092
1093 RESTORE_ERRNO;
1094
1095 const char * errno_text;
1096
1097 #ifdef HAS_EXTENDED_OS_ERRNO
1098
1099 const int extended_errnum = get_extended_os_errno();
1100 if (errno != extended_errnum) {
1101 errno_text = Perl_form(aTHX_ "; errno=%d, $^E=%d",
1102 errno, extended_errnum);
1103 }
1104 else
1105
1106 #endif
1107
1108 {
1109 errno_text = Perl_form(aTHX_ "; errno=%d", errno);
1110 }
1111
1112 /* diag_listed_as: panic: %s */
1113 Perl_croak(aTHX_ "%s: %" LINE_Tf ": panic: %s%s%s\n",
1114 __FILE__, immediate_caller_line,
1115 msg, errno_text, called_by);
1116 }
1117
1118 /* Macros to report and croak on an unexpected failure to set the locale. The
1119 * via version has more stack trace information */
1120 #define setlocale_failure_panic_i(i, cur, fail, line, higher_line) \
1121 setlocale_failure_panic_via_i(i, cur, fail, __LINE__, line, \
1122 __FILE__, higher_line)
1123
1124 #define setlocale_failure_panic_c(cat, cur, fail, line, higher_line) \
1125 setlocale_failure_panic_i(cat##_INDEX_, cur, fail, line, higher_line)
1126
1127 #if defined(USE_LOCALE)
1128
1129 /* Expands to the code to
1130 * result = savepvn(s, len)
1131 * if the category whose internal index is 'i' doesn't need to be kept in the C
1132 * locale on this system, or if 'action is 'no_override'. Otherwise it expands
1133 * to
1134 * result = savepv("C")
1135 * unless 'action' isn't 'check_that_overridden', in which case if the string
1136 * 's' isn't already "C" it panics */
1137 # ifndef HAS_IGNORED_LOCALE_CATEGORIES_
1138 # define OVERRIDE_AND_SAVEPV(s, len, result, i, action) \
1139 result = savepvn(s, len)
1140 # else
1141 # define OVERRIDE_AND_SAVEPV(s, len, result, i, action) \
1142 STMT_START { \
1143 if (LIKELY( ! need_to_override_category(i) \
1144 || action == no_override)) { \
1145 result = savepvn(s, len); \
1146 } \
1147 else { \
1148 const char * temp = savepvn(s, len); \
1149 result = savepv(override_ignored_category(i, temp)); \
1150 if (action == check_that_overridden && strNE(result, temp)) { \
1151 locale_panic_(Perl_form(aTHX_ \
1152 "%s expected to be '%s', instead is '%s'", \
1153 category_names[i], result, temp)); \
1154 } \
1155 Safefree(temp); \
1156 } \
1157 } STMT_END
1158 # endif
1159
1160 STATIC parse_LC_ALL_string_return
S_parse_LC_ALL_string(pTHX_ const char * string,const char ** output,const parse_LC_ALL_STRING_action override,bool always_use_full_array,const bool panic_on_error,const line_t caller_line)1161 S_parse_LC_ALL_string(pTHX_ const char * string,
1162 const char ** output,
1163 const parse_LC_ALL_STRING_action override,
1164 bool always_use_full_array,
1165 const bool panic_on_error,
1166 const line_t caller_line)
1167 {
1168 /* This function parses the value of the input 'string' which is expected
1169 * to be the representation of an LC_ALL locale, and splits the result into
1170 * the values for the individual component categories, returning those in
1171 * the 'output' array. Each array value will be a savepv() copy that is
1172 * the responsibility of the caller to make sure gets freed
1173 *
1174 * The locale for each category is independent of the other categories.
1175 * Often, they are all the same, but certainly not always. Perl, in fact,
1176 * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
1177 * locale. LC_ALL has to be able to represent the case of when not all
1178 * categories have the same locale. Platforms have differing ways of
1179 * representing this. Internally, this file uses the 'name=value;'
1180 * representation found on some platforms, so this function always looks
1181 * for and parses that. Other platforms use a positional notation. On
1182 * those platforms, this function also parses that form. It examines the
1183 * input to see which form is being parsed.
1184 *
1185 * Often, all categories will have the same locale. This is special cased
1186 * if 'always_use_full_array' is false on input:
1187 * 1) If the input 'string' is a single value, this function doesn't
1188 * store anything into 'output', and returns 'no_array'
1189 * 2) Some platforms will return multiple occurrences of the same
1190 * value rather than coalescing them down to a single one. HP-UX
1191 * is such a one. This function will do that collapsing for you,
1192 * returning 'only_element_0' and saving the single value in
1193 * output[0], which the caller will need to arrange to be freed.
1194 * The rest of output[] is undefined, and does not need to be
1195 * freed.
1196 *
1197 * Otherwise, the input 'string' may not be valid. This function looks
1198 * mainly for syntactic errors, and if found, returns 'invalid'. 'output'
1199 * will not be filled in in that case, but the input state of it isn't
1200 * necessarily preserved. Turning on -DL debugging will give details as to
1201 * the error. If 'panic_on_error' is 'true', the function panics instead
1202 * of returning on error, with a message giving the details.
1203 *
1204 * Otherwise, output[] will be filled with the individual locale names for
1205 * all categories on the system, 'full_array' will be returned, and the
1206 * caller needs to arrange for each to be freed. This means that either at
1207 * least one category differed from the others, or 'always_use_full_array' was
1208 * true on input.
1209 *
1210 * perl may be configured to ignore changes to a category's locale to
1211 * non-C. The parameter 'override' tells this function what to do when
1212 * encountering such an illegal combination:
1213 *
1214 * no_override indicates to take no special action
1215 * override_if_ignored, indicates to return 'C' instead of what the
1216 * input string actually says.
1217 * check_that_overridden indicates to panic if the string says the
1218 * category is not 'C'. This is used when
1219 * non-C is very unexpected behavior.
1220 * */
1221
1222 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1223 "Entering parse_LC_ALL_string; called from %" \
1224 LINE_Tf "\nnew='%s'\n", caller_line, string));
1225
1226 # ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1227
1228 const char separator[] = ";";
1229 const Size_t separator_len = 1;
1230 const bool single_component = (strchr(string, ';') == NULL);
1231
1232 # else
1233
1234 /* It's possible (but quite unlikely) that the separator string is an '='
1235 * or a ';'. Requiring both to be present for using the 'name=value;' form
1236 * properly handles those possibilities */
1237 const bool name_value = strchr(string, '=') && strchr(string, ';');
1238 const char * separator;
1239 Size_t separator_len;
1240 bool single_component;
1241 if (name_value) {
1242 separator = ";";
1243 separator_len = 1;
1244 single_component = false; /* Since has both [;=], must be multi */
1245 }
1246 else {
1247 separator = PERL_LC_ALL_SEPARATOR;
1248 separator_len = STRLENs(PERL_LC_ALL_SEPARATOR);
1249 single_component = instr(string, separator) == NULL;
1250 }
1251
1252 Size_t component_number = 0; /* Position in the parsing loop below */
1253
1254 # endif
1255 # ifndef HAS_IGNORED_LOCALE_CATEGORIES_
1256 PERL_UNUSED_ARG(override);
1257 # else
1258
1259 /* Any ignored categories are to be set to "C", so if this single-component
1260 * LC_ALL isn't to C, it has both "C" and non-C, so isn't really a single
1261 * component. All the non-ignored categories are set to the input
1262 * component, but the ignored ones are overridden to be C.
1263 *
1264 * This incidentally handles the case where the string is "". The return
1265 * will be C for each ignored category and "" for the others. Then the
1266 * caller can individually set each category, and get the right answer. */
1267 if (single_component && ! isNAME_C_OR_POSIX(string)) {
1268 for_all_individual_category_indexes(i) {
1269 OVERRIDE_AND_SAVEPV(string, strlen(string), output[i], i, override);
1270 }
1271
1272 return full_array;
1273 }
1274
1275 # endif
1276
1277 if (single_component) {
1278 if (! always_use_full_array) {
1279 return no_array;
1280 }
1281
1282 for_all_individual_category_indexes(i) {
1283 output[i] = savepv(string);
1284 }
1285
1286 return full_array;
1287 }
1288
1289 /* Here the input is multiple components. Parse through them. (It is
1290 * possible that these components are all the same, so we check, and if so,
1291 * return just the 0th component (unless 'always_use_full_array' is true)
1292 *
1293 * This enum notes the possible errors findable in parsing */
1294 enum {
1295 incomplete,
1296 no_equals,
1297 unknown_category,
1298 contains_LC_ALL_element
1299 } error;
1300
1301 /* Keep track of the categories we have encountered so far */
1302 bool seen[LC_ALL_INDEX_] = { false };
1303
1304 Size_t index; /* Our internal index for the current category */
1305 const char * s = string;
1306 const char * e = s + strlen(string);
1307 const char * category_end = NULL;
1308 const char * saved_first = NULL;
1309
1310 /* Parse the input locale string */
1311 while (s < e) {
1312
1313 /* 'separator' has been set up to delimit the components */
1314 const char * next_sep = instr(s, separator);
1315 if (! next_sep) { /* At the end of the input */
1316 next_sep = e;
1317 }
1318
1319 # ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1320
1321 if (! name_value) {
1322 /* Get the index of the category in this position */
1323 index = map_LC_ALL_position_to_index[component_number++];
1324 }
1325 else
1326
1327 # endif
1328
1329 { /* Get the category part when each component is the
1330 * 'category=locale' form */
1331
1332 category_end = strchr(s, '=');
1333
1334 /* The '=' terminates the category name. If no '=', is improper
1335 * form */
1336 if (! category_end) {
1337 error = no_equals;
1338 goto failure;
1339 }
1340
1341 /* Find our internal index of the category name; uses a linear
1342 * search. (XXX This could be avoided by various means, but the
1343 * maximum likely search is 6 items, and khw doesn't think the
1344 * added complexity would save very much at all.) */
1345 const unsigned int name_len = (unsigned int) (category_end - s);
1346 for (index = 0; index < C_ARRAY_LENGTH(category_names); index++) {
1347 if ( name_len == category_name_lengths[index]
1348 && memEQ(s, category_names[index], name_len))
1349 {
1350 goto found_category;
1351 }
1352 }
1353
1354 /* Here, the category is not in our list. */
1355 error = unknown_category;
1356 goto failure;
1357
1358 found_category: /* The system knows about this category. */
1359
1360 if (index == LC_ALL_INDEX_) {
1361 error = contains_LC_ALL_element;
1362 goto failure;
1363 }
1364
1365 /* The locale name starts just beyond the '=' */
1366 s = category_end + 1;
1367
1368 /* Linux (and maybe others) doesn't treat a duplicate category in
1369 * the string as an error. Instead it uses the final occurrence as
1370 * the intended value. So if this is a duplicate, free the former
1371 * value before setting the new one */
1372 if (seen[index]) {
1373 Safefree(output[index]);
1374 }
1375 else {
1376 seen[index] = true;
1377 }
1378 }
1379
1380 /* Here, 'index' contains our internal index number for the current
1381 * category, and 's' points to the beginning of the locale name for
1382 * that category. */
1383 OVERRIDE_AND_SAVEPV(s, next_sep - s, output[index], index, override);
1384
1385 if (! always_use_full_array) {
1386 if (! saved_first) {
1387 saved_first = output[index];
1388 }
1389 else {
1390 if (strNE(saved_first, output[index])) {
1391 always_use_full_array = true;
1392 }
1393 }
1394 }
1395
1396 /* Next time start from the new position */
1397 s = next_sep + separator_len;
1398 }
1399
1400 /* Finished looping through all the categories
1401 *
1402 * Check if the input was incomplete. */
1403
1404 # ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
1405
1406 if (! name_value) { /* Positional notation */
1407 if (component_number != LC_ALL_INDEX_) {
1408 error = incomplete;
1409 goto failure;
1410 }
1411 }
1412 else
1413
1414 # endif
1415
1416 { /* Here is the name=value notation */
1417 for_all_individual_category_indexes(i) {
1418 if (! seen[i]) {
1419 error = incomplete;
1420 goto failure;
1421 }
1422 }
1423 }
1424
1425 /* In the loop above, we changed 'always_use_full_array' to true iff not all
1426 * categories have the same locale. Hence, if it is still 'false', all of
1427 * them are the same. */
1428 if (always_use_full_array) {
1429 return full_array;
1430 }
1431
1432 /* Free the dangling ones */
1433 for_all_but_0th_individual_category_indexes(i) {
1434 Safefree(output[i]);
1435 output[i] = NULL;
1436 }
1437
1438 return only_element_0;
1439
1440 failure:
1441
1442 /* Don't leave memory dangling that we allocated before the failure */
1443 for_all_individual_category_indexes(i) {
1444 if (seen[i]) {
1445 Safefree(output[i]);
1446 output[i] = NULL;
1447 }
1448 }
1449
1450 const char * msg;
1451 const char * display_start = s;
1452 const char * display_end = e;
1453
1454 switch (error) {
1455 case incomplete:
1456 msg = "doesn't list every locale category";
1457 display_start = string;
1458 break;
1459 case no_equals:
1460 msg = "needs an '=' to split name=value";
1461 break;
1462 case unknown_category:
1463 msg = "is an unknown category";
1464 display_end = (category_end && category_end > display_start)
1465 ? category_end
1466 : e;
1467 break;
1468 case contains_LC_ALL_element:
1469 msg = "has LC_ALL, which is illegal here";
1470 break;
1471 }
1472
1473 msg = Perl_form(aTHX_ "'%.*s' %s\n",
1474 (int) (display_end - display_start),
1475 display_start, msg);
1476
1477 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s", msg));
1478
1479 if (panic_on_error) {
1480 locale_panic_via_(msg, __FILE__, caller_line);
1481 }
1482
1483 return invalid;
1484 }
1485
1486 # undef OVERRIDE_AND_SAVEPV
1487 #endif
1488
1489 /*==========================================================================
1490 * Here starts the code that gives a uniform interface to its callers, hiding
1491 * the differences between platforms.
1492 *
1493 * base_posix_setlocale_() presents a consistent POSIX-compliant interface to
1494 * setlocale(). Windows requres a customized base-level setlocale(). This
1495 * layer should only be used by the next level up: the plain posix_setlocale
1496 * layer. Any necessary mutex locking needs to be done at a higher level. The
1497 * return may be overwritten by the next call to this function */
1498 #ifdef WIN32
1499 # define base_posix_setlocale_(cat, locale) win32_setlocale(cat, locale)
1500 #else
1501 # define base_posix_setlocale_(cat, locale) \
1502 ((const char *) setlocale(cat, locale))
1503 #endif
1504
1505 /*==========================================================================
1506 * Here is the main posix layer. It is the same as the base one unless the
1507 * system is lacking LC_ALL, or there are categories that we ignore, but that
1508 * the system libc knows about */
1509
1510 #if ! defined(USE_LOCALE) \
1511 || (defined(LC_ALL) && ! defined(HAS_IGNORED_LOCALE_CATEGORIES_))
1512 # define posix_setlocale(cat, locale) base_posix_setlocale_(cat, locale)
1513 #else
1514 # define posix_setlocale(cat, locale) \
1515 S_posix_setlocale_with_complications(aTHX_ cat, locale, __LINE__)
1516
1517 STATIC const char *
S_posix_setlocale_with_complications(pTHX_ const int cat,const char * new_locale,const line_t caller_line)1518 S_posix_setlocale_with_complications(pTHX_ const int cat,
1519 const char * new_locale,
1520 const line_t caller_line)
1521 {
1522 /* This implements the posix layer above the base posix layer.
1523 * It is needed to reconcile our internal records that reflect only a
1524 * proper subset of the categories known by the system. */
1525
1526 /* Querying the current locale returns the real value */
1527 if (new_locale == NULL) {
1528 new_locale = base_posix_setlocale_(cat, NULL);
1529 assert(new_locale);
1530 return new_locale;
1531 }
1532
1533 const char * locale_on_entry = NULL;
1534
1535 /* If setting from the environment, actually do the set to get the system's
1536 * idea of what that means; we may have to override later. */
1537 if (strEQ(new_locale, "")) {
1538 locale_on_entry = base_posix_setlocale_(cat, NULL);
1539 assert(locale_on_entry);
1540 new_locale = base_posix_setlocale_(cat, "");
1541 if (! new_locale) {
1542 SET_EINVAL;
1543 return NULL;
1544 }
1545 }
1546
1547 # ifdef LC_ALL
1548
1549 const char * new_locales[LC_ALL_INDEX_] = { NULL };
1550
1551 if (cat == LC_ALL) {
1552 switch (parse_LC_ALL_string(new_locale,
1553 (const char **) &new_locales,
1554 override_if_ignored, /* Override any
1555 ignored
1556 categories */
1557 false, /* Return only [0] if suffices */
1558 false, /* Don't panic on error */
1559 caller_line))
1560 {
1561 case invalid:
1562 SET_EINVAL;
1563 return NULL;
1564
1565 case no_array:
1566 break;
1567
1568 case only_element_0:
1569 new_locale = new_locales[0];
1570 SAVEFREEPV(new_locale);
1571 break;
1572
1573 case full_array:
1574
1575 /* Turn the array into a string that the libc setlocale() should
1576 * understand. (Another option would be to loop, setting the
1577 * individual locales, and then return base(cat, NULL) */
1578 new_locale = calculate_LC_ALL_string(new_locales,
1579 EXTERNAL_FORMAT_FOR_SET,
1580 WANT_TEMP_PV,
1581 caller_line);
1582
1583 for_all_individual_category_indexes(i) {
1584 Safefree(new_locales[i]);
1585 }
1586
1587 /* And call the libc setlocale. We could avoid this call if
1588 * locale_on_entry is set and eq the new_locale. But that would be
1589 * only for the relatively rare case of the desired locale being
1590 * "", and the time spent in doing the string compare might be more
1591 * than that of just setting it unconditionally */
1592 new_locale = base_posix_setlocale_(cat, new_locale);
1593 if (! new_locale) {
1594 goto failure;
1595 }
1596
1597 return new_locale;
1598 }
1599 }
1600
1601 # endif
1602
1603 /* Here, 'new_locale' is a single value, not an aggregation. Just set it.
1604 * */
1605 new_locale =
1606 base_posix_setlocale_(cat,
1607 override_ignored_category(
1608 get_category_index(cat), new_locale));
1609 if (! new_locale) {
1610 goto failure;
1611 }
1612
1613 return new_locale;
1614
1615 failure:
1616
1617 /* 'locale_on_entry' being set indicates there has likely been a change in
1618 * locale which needs to be restored */
1619 if (locale_on_entry) {
1620 if (! base_posix_setlocale_(cat, locale_on_entry)) {
1621 setlocale_failure_panic_i(get_category_index(cat),
1622 NULL, locale_on_entry,
1623 __LINE__, caller_line);
1624 }
1625 }
1626
1627 SET_EINVAL;
1628 return NULL;
1629 }
1630
1631 #endif
1632
1633 /* End of posix layer
1634 *==========================================================================
1635 *
1636 * The next layer up is to catch vagaries and bugs in the libc setlocale return
1637 * value. The return is not guaranteed to be stable.
1638 *
1639 * Any necessary mutex locking needs to be done at a higher level.
1640 *
1641 * On most platforms this layer is empty, expanding to just the layer
1642 * below. To enable it, call Configure with either or both:
1643 * -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
1644 * to indicate that extraneous \n characters can be returned
1645 * by setlocale()
1646 * -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
1647 * to indicate that setlocale(LC_ALL, NULL) cannot be relied
1648 * on
1649 */
1650
1651 #define STDIZED_SETLOCALE_LOCK POSIX_SETLOCALE_LOCK
1652 #define STDIZED_SETLOCALE_UNLOCK POSIX_SETLOCALE_UNLOCK
1653 #if ! defined(USE_LOCALE) \
1654 || ! ( defined(HAS_LF_IN_SETLOCALE_RETURN) \
1655 || defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL))
1656 # define stdized_setlocale(cat, locale) posix_setlocale(cat, locale)
1657 # define stdize_locale(cat, locale) (locale)
1658 #else
1659 # define stdized_setlocale(cat, locale) \
1660 S_stdize_locale(aTHX_ cat, posix_setlocale(cat, locale), __LINE__)
1661
1662 STATIC const char *
S_stdize_locale(pTHX_ const int category,const char * input_locale,const line_t caller_line)1663 S_stdize_locale(pTHX_ const int category,
1664 const char *input_locale,
1665 const line_t caller_line)
1666 {
1667 /* The return value of setlocale() is opaque, but is required to be usable
1668 * as input to a future setlocale() to create the same state.
1669 * Unfortunately not all systems are compliant. This function brings those
1670 * outliers into conformance. It is based on what problems have arisen in
1671 * the field.
1672 *
1673 * This has similar constraints as the posix layer. You need to lock
1674 * around it until its return is safely copied or no longer needed. (The
1675 * return may point to a global static buffer or may be mortalized.)
1676 *
1677 * The current things this corrects are:
1678 * 1) A new-line. This function chops any \n characters
1679 * 2) A broken 'setlocale(LC_ALL, foo)' This constructs a proper returned
1680 * string from the constituent categories
1681 *
1682 * If no changes were made, the input is returned as-is */
1683
1684 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1685 "Entering stdize_locale(%d, '%s');"
1686 " called from %" LINE_Tf "\n",
1687 category, input_locale, caller_line));
1688
1689 if (input_locale == NULL) {
1690 SET_EINVAL;
1691 return NULL;
1692 }
1693
1694 char * retval = (char *) input_locale;
1695
1696 # if defined(LC_ALL) && defined(HAS_BROKEN_SETLOCALE_QUERY_LC_ALL)
1697
1698 /* If setlocale(LC_ALL, NULL) is broken, compute what the system
1699 * actually thinks it should be from its individual components */
1700 if (category == LC_ALL) {
1701 retval = (char *) calculate_LC_ALL_string(
1702 NULL, /* query each individ locale */
1703 EXTERNAL_FORMAT_FOR_SET,
1704 WANT_TEMP_PV,
1705 caller_line);
1706 }
1707
1708 # endif
1709 # ifdef HAS_NL_IN_SETLOCALE_RETURN
1710
1711 char * first_bad = NULL;
1712
1713 # ifndef LC_ALL
1714
1715 PERL_UNUSED_ARG(category);
1716 PERL_UNUSED_ARG(caller_line);
1717
1718 # define INPUT_LOCALE retval
1719 # define MARK_CHANGED
1720 # else
1721
1722 char * individ_locales[LC_ALL_INDEX_] = { NULL };
1723 bool made_changes = false;
1724 Size_t upper;
1725 if (category != LC_ALL) {
1726 individ_locales[0] = retval;
1727 upper = 0;
1728 }
1729 else {
1730
1731 /* And parse the locale string, splitting into its individual
1732 * components. */
1733 switch (parse_LC_ALL_string(retval,
1734 (const char **) &individ_locales,
1735 check_that_overridden, /* ignored
1736 categories should
1737 already have been
1738 overridden */
1739 false, /* Return only [0] if suffices */
1740 false, /* Don't panic on error */
1741 caller_line))
1742 {
1743 case invalid:
1744 SET_EINVAL;
1745 return NULL;
1746
1747 case full_array: /* Loop below through all the component categories.
1748 */
1749 upper = LC_ALL_INDEX_ - 1;
1750 break;
1751
1752 case no_array:
1753 /* All categories here are set to the same locale, and the parse
1754 * didn't fill in any of 'individ_locales'. Set the 0th element to
1755 * that locale. */
1756 individ_locales[0] = retval;
1757 /* FALLTHROUGH */
1758
1759 case only_element_0: /* Element 0 is the only element we need to look
1760 at */
1761 upper = 0;
1762 break;
1763 }
1764 }
1765
1766 for (unsigned int i = 0; i <= upper; i++)
1767
1768 # define INPUT_LOCALE individ_locales[i]
1769 # define MARK_CHANGED made_changes = true;
1770 # endif /* Has LC_ALL */
1771
1772 {
1773 first_bad = (char *) strchr(INPUT_LOCALE, '\n');
1774
1775 /* Most likely, there isn't a problem with the input */
1776 if (UNLIKELY(first_bad)) {
1777
1778 /* This element will need to be adjusted. Create a modifiable
1779 * copy. */
1780 MARK_CHANGED
1781 retval = savepv(INPUT_LOCALE);
1782 SAVEFREEPV(retval);
1783
1784 /* Translate the found position into terms of the copy */
1785 first_bad = retval + (first_bad - INPUT_LOCALE);
1786
1787 /* Get rid of the \n and what follows. (Originally, only a
1788 * trailing \n was stripped. Unsure what to do if not trailing) */
1789 *((char *) first_bad) = '\0';
1790 } /* End of needs adjusting */
1791 } /* End of looking for problems */
1792
1793 # ifdef LC_ALL
1794
1795 /* If we had multiple elements, extra work is required */
1796 if (upper != 0) {
1797
1798 /* If no changes were made to the input, 'retval' already contains it
1799 * */
1800 if (made_changes) {
1801
1802 /* But if did make changes, need to calculate the new value */
1803 retval = (char *) calculate_LC_ALL_string(
1804 (const char **) &individ_locales,
1805 EXTERNAL_FORMAT_FOR_SET,
1806 WANT_TEMP_PV,
1807 caller_line);
1808 }
1809
1810 /* And free the no-longer needed memory */
1811 for (unsigned int i = 0; i <= upper; i++) {
1812 Safefree(individ_locales[i]);
1813 }
1814 }
1815
1816 # endif
1817 # undef INPUT_LOCALE
1818 # undef MARK_CHANGED
1819 # endif /* HAS_NL_IN_SETLOCALE_RETURN */
1820
1821 return (const char *) retval;
1822 }
1823
1824 #endif /* USE_LOCALE */
1825
1826 /* End of stdize_locale layer
1827 *
1828 * ==========================================================================
1829 *
1830 * The next many lines form several implementations of a layer above the
1831 * close-to-the-metal 'posix' and 'stdized' macros. They are used to present a
1832 * uniform API to the rest of the code in this file in spite of the disparate
1833 * underlying implementations. Which implementation gets compiled depends on
1834 * the platform capabilities (and some user choice) as determined by Configure.
1835 *
1836 * As more fully described in the introductory comments in this file, the
1837 * API of each implementation consists of three sets of macros. Each set has
1838 * three variants with suffixes '_c', '_r', and '_i'. In the list below '_X'
1839 * is to be replaced by any of these suffixes.
1840 *
1841 * 1) bool_setlocale_X attempts to set the given category's locale to the
1842 * given value, returning if it worked or not.
1843 * 2) void_setlocale_X is like the corresponding bool_setlocale, but used when
1844 * success is the only sane outcome, so failure causes it
1845 * to panic.
1846 * 3) querylocale_X to see what the given category's locale is
1847 *
1848 * 4) setlocale_i() is defined only in those implementations where the bool
1849 * and query forms are essentially the same, and can be
1850 * combined to save CPU time.
1851 *
1852 * Each implementation is fundamentally defined by just two macros: a
1853 * bool_setlocale_X() and a querylocale_X(). The other macros are all
1854 * derivable from these. Each fundamental macro is either a '_i' suffix one or
1855 * an '_r' suffix one, depending on what is the most efficient in getting to an
1856 * input form that the underlying libc functions want. The derived macro
1857 * definitions are deferred in this file to after the code for all the
1858 * implementations. This makes each implementation shorter and clearer, and
1859 * removes duplication.
1860 *
1861 * Each implementation below is separated by ==== lines, and includes bool,
1862 * void, and query macros. The query macros are first, followed by any
1863 * functions needed to implement them. Then come the bool, again followed by
1864 * any implementing functions Then are the void macros; next is setlocale_i if
1865 * present on this implementation. Finally are any helper functions. The sets
1866 * in each implementation are separated by ---- lines.
1867 *
1868 * The returned strings from all the querylocale...() forms in all
1869 * implementations are thread-safe, and the caller should not free them,
1870 * but each may be a mortalized copy. If you need something stable across
1871 * calls, you need to savepv() the result yourself.
1872 *
1873 *===========================================================================*/
1874
1875 #if (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE)) \
1876 || ( defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE))
1877
1878 /* For non-threaded perls, the implementation just expands to the base-level
1879 * functions (except if we are Configured to nonetheless use the POSIX 2008
1880 * interface) This implementation is also used on threaded perls where
1881 * threading is invisible to us. Currently this is only on later Windows
1882 * versions. */
1883
1884 # define querylocale_r(cat) mortalized_pv_copy(stdized_setlocale(cat, NULL))
1885 # define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale))
1886
1887 /*---------------------------------------------------------------------------*/
1888
1889 /* setlocale_i is only defined for Configurations where the libc setlocale()
1890 * doesn't need any tweaking. It allows for some shortcuts */
1891 # ifndef USE_LOCALE_THREADS
1892 # define setlocale_i(i, locale) stdized_setlocale(categories[i], locale)
1893
1894 # elif defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
1895
1896 /* On Windows, we don't know at compile time if we are in thread-safe mode or
1897 * not. If we are, we can just return the result of the layer below us. If we
1898 * are in unsafe mode, we need to first copy that result to a safe place while
1899 * in a critical section */
1900
1901 # define setlocale_i(i, locale) S_setlocale_i(aTHX_ categories[i], locale)
1902
1903 STATIC const char *
S_setlocale_i(pTHX_ const int category,const char * locale)1904 S_setlocale_i(pTHX_ const int category, const char * locale)
1905 {
1906 if (LIKELY(_configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE)) {
1907 return stdized_setlocale(category, locale);
1908 }
1909
1910 gwLOCALE_LOCK;
1911 const char * retval = save_to_buffer(stdized_setlocale(category, locale),
1912 &PL_setlocale_buf,
1913 &PL_setlocale_bufsize);
1914 gwLOCALE_UNLOCK;
1915
1916 return retval;
1917 }
1918
1919 # endif
1920
1921 /*===========================================================================*/
1922 #elif defined(USE_LOCALE_THREADS) \
1923 && ! defined(USE_THREAD_SAFE_LOCALE)
1924
1925 /* Here, there are threads, and there is no support for thread-safe
1926 * operation. This is a dangerous situation, which perl is documented as
1927 * not supporting, but it arises in practice. We can do a modicum of
1928 * automatic mitigation by making sure there is a per-thread return from
1929 * setlocale(), and that a mutex protects it from races */
1930
1931 # define querylocale_r(cat) \
1932 mortalized_pv_copy(less_dicey_setlocale_r(cat, NULL))
1933
1934 STATIC const char *
S_less_dicey_setlocale_r(pTHX_ const int category,const char * locale)1935 S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale)
1936 {
1937 const char * retval;
1938
1939 PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R;
1940
1941 STDIZED_SETLOCALE_LOCK;
1942
1943 retval = save_to_buffer(stdized_setlocale(category, locale),
1944 &PL_less_dicey_locale_buf,
1945 &PL_less_dicey_locale_bufsize);
1946
1947 STDIZED_SETLOCALE_UNLOCK;
1948
1949 return retval;
1950 }
1951
1952 /*---------------------------------------------------------------------------*/
1953
1954 # define bool_setlocale_r(cat, locale) \
1955 less_dicey_bool_setlocale_r(cat, locale)
1956
1957 STATIC bool
S_less_dicey_bool_setlocale_r(pTHX_ const int cat,const char * locale)1958 S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale)
1959 {
1960 bool retval;
1961
1962 PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R;
1963
1964 /* Unlikely, but potentially possible that another thread could zap the
1965 * buffer from true to false or vice-versa, so need to lock here */
1966 POSIX_SETLOCALE_LOCK;
1967 retval = cBOOL(posix_setlocale(cat, locale));
1968 POSIX_SETLOCALE_UNLOCK;
1969
1970 return retval;
1971 }
1972
1973 /*---------------------------------------------------------------------------*/
1974
1975 /* setlocale_i is only defined for Configurations where the libc setlocale()
1976 * suffices for both querying and setting the locale. It allows for some
1977 * shortcuts */
1978 # define setlocale_i(i, locale) less_dicey_setlocale_r(categories[i], locale)
1979
1980 /* The code in this file may change the locale briefly during certain
1981 * operations. This should be a critical section when that could interfere
1982 * with other instances executing at the same time. */
1983 # define TOGGLE_LOCK(i) POSIX_SETLOCALE_LOCK
1984 # define TOGGLE_UNLOCK(i) POSIX_SETLOCALE_UNLOCK
1985
1986 /*===========================================================================*/
1987
1988 #elif defined(USE_POSIX_2008_LOCALE)
1989 # ifndef LC_ALL
1990 # error This code assumes that LC_ALL is available on a system modern enough to have POSIX 2008
1991 # endif
1992
1993 /* Here, there is a completely different API to get thread-safe locales. We
1994 * emulate the setlocale() API with our own function(s). setlocale categories,
1995 * like LC_NUMERIC, are not valid here for the POSIX 2008 API. Instead, there
1996 * are equivalents, like LC_NUMERIC_MASK, which we use instead, which we find
1997 * by table lookup. */
1998
1999 # if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
2000 /* https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */
2001 # define HAS_GLIBC_LC_MESSAGES_BUG
2002 # include <libintl.h>
2003 # endif
2004
2005 # define querylocale_i(i) querylocale_2008_i(i, __LINE__)
2006
2007 /* We need to define this derivative macro here, as it is needed in
2008 * the implementing function (for recursive calls). It also gets defined
2009 * where all the other derivative macros are defined, and the compiler
2010 * will complain if the definition gets out of sync */
2011 # define querylocale_c(cat) querylocale_i(cat##_INDEX_)
2012
2013 STATIC const char *
S_querylocale_2008_i(pTHX_ const locale_category_index index,const line_t caller_line)2014 S_querylocale_2008_i(pTHX_ const locale_category_index index,
2015 const line_t caller_line)
2016 {
2017 PERL_ARGS_ASSERT_QUERYLOCALE_2008_I;
2018
2019 /* This function returns the name of the locale category given by the input
2020 * 'index' into our parallel tables of them.
2021 *
2022 * POSIX 2008, for some sick reason, chose not to provide a method to find
2023 * the category name of a locale, disregarding a basic linguistic tenet
2024 * that for any object, people will create a name for it. (The next
2025 * version of the POSIX standard is proposed to fix this.) Some vendors
2026 * have created a querylocale() function to do this in the meantime. On
2027 * systems without querylocale(), we have to keep track of what the locale
2028 * has been set to, so that we can return its name so as to emulate
2029 * setlocale(). There are potential problems with this:
2030 *
2031 * 1) We don't know what calling newlocale() with the locale argument ""
2032 * actually does. It gets its values from the program's environment.
2033 * find_locale_from_environment() is used to work around this. But it
2034 * isn't fool-proof. See the comments for that function for details.
2035 * 2) It's possible for C code in some library to change the locale
2036 * without us knowing it, and thus our records become wrong;
2037 * querylocale() would catch this. But as of September 2017, there
2038 * are no occurrences in CPAN of uselocale(). Some libraries do use
2039 * setlocale(), but that changes the global locale, and threads using
2040 * per-thread locales will just ignore those changes.
2041 * 3) Many systems have multiple names for the same locale. Generally,
2042 * there is an underlying base name, with aliases that evaluate to it.
2043 * On some systems, if you set the locale to an alias, and then
2044 * retrieve the name, you get the alias as expected; but on others you
2045 * get the base name, not the alias you used. And sometimes the
2046 * charade is incomplete. See
2047 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375.
2048 *
2049 * The code is structured so that the returned locale name when the
2050 * locale is changed is whatever the result of querylocale() on the
2051 * new locale is. This effectively gives the result the system
2052 * expects. Without querylocale, the name returned is always the
2053 * input name. Theoretically this could cause problems, but khw knows
2054 * of none so far, but mentions it here in case you are trying to
2055 * debug something. (This could be worked around by messing with the
2056 * global locale temporarily, using setlocale() to get the base name;
2057 * but that could cause a race. The comments for
2058 * find_locale_from_environment() give details on the potential race.)
2059 */
2060
2061 const locale_t cur_obj = uselocale((locale_t) 0);
2062 const char * retval;
2063
2064 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "querylocale_2008_i(%s) on %p;"
2065 " called from %" LINE_Tf "\n",
2066 category_names[index], cur_obj,
2067 caller_line));
2068
2069 if (UNLIKELY(cur_obj == LC_GLOBAL_LOCALE)) {
2070
2071 /* Even on platforms that have querylocale(), it is unclear if they
2072 * work in the global locale, and we have the means to get the correct
2073 * answer anyway. khw is unsure this situation even comes up these
2074 * days, hence the branch prediction */
2075 POSIX_SETLOCALE_LOCK;
2076 retval = mortalized_pv_copy(posix_setlocale(categories[index], NULL));
2077 POSIX_SETLOCALE_UNLOCK;
2078 }
2079
2080 /* Here we have handled the case of the current locale being the global
2081 * one. Below is the 'else' case of that. There are two different
2082 * implementations, depending on USE_PL_CURLOCALES */
2083
2084 # ifdef USE_PL_CURLOCALES
2085
2086 else {
2087
2088 /* PL_curlocales[] is kept up-to-date for all categories except LC_ALL,
2089 * which may have been invalidated by setting it to NULL, and if so,
2090 * should now be calculated. (The called function updates that
2091 * element.) */
2092 if (index == LC_ALL_INDEX_ && PL_curlocales[LC_ALL_INDEX_] == NULL) {
2093 calculate_LC_ALL_string((const char **) &PL_curlocales,
2094 INTERNAL_FORMAT,
2095 WANT_VOID,
2096 caller_line);
2097 }
2098
2099 if (cur_obj == PL_C_locale_obj) {
2100
2101 /* If the current locale object is the C object, then the answer is
2102 * "C" or POSIX, regardless of the category. Handling this
2103 * reasonably likely case specially shortcuts extra effort, and
2104 * hides some bugs from us in OS's that alias other locales to C,
2105 * but do so incompletely. If our records say it is POSIX, use
2106 * that; otherwise use C. See
2107 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375 */
2108 retval = (strEQ(PL_curlocales[index], "POSIX"))
2109 ? "POSIX"
2110 : "C";
2111 }
2112 else {
2113 retval = mortalized_pv_copy(PL_curlocales[index]);
2114 }
2115 }
2116
2117 # else
2118
2119 /* Below is the implementation of the 'else' clause which handles the case
2120 * of the current locale not being the global one on platforms where
2121 * USE_PL_CURLOCALES is NOT in effect. That means the system must have
2122 * some form of querylocale. But these have varying characteristics, so
2123 * first create some #defines to make the actual 'else' clause uniform.
2124 *
2125 * First, glibc has a function that implements querylocale(), but is called
2126 * something else, and takes the category number; the others take the mask.
2127 * */
2128 # if defined(USE_QUERYLOCALE) && ( defined(_NL_LOCALE_NAME) \
2129 && defined(HAS_NL_LANGINFO_L))
2130 # define my_querylocale(index, cur_obj) \
2131 nl_langinfo_l(_NL_LOCALE_NAME(categories[index]), cur_obj)
2132
2133 /* Experience so far shows it is thread-safe, as well as glibc's
2134 * nl_langinfo_l(), so unless overridden, mark it so */
2135 # ifdef NO_THREAD_SAFE_QUERYLOCALE
2136 # undef HAS_THREAD_SAFE_QUERYLOCALE
2137 # else
2138 # define HAS_THREAD_SAFE_QUERYLOCALE
2139 # endif
2140 # else /* below, ! glibc */
2141
2142 /* Otherwise, use the system's querylocale(). */
2143 # define my_querylocale(index, cur_obj) \
2144 querylocale(category_masks[index], cur_obj)
2145
2146 /* There is no standard for this function, and khw has never seen
2147 * anything beyond minimal vendor documentation, lacking important
2148 * details. Experience has shown that some implementations have race
2149 * condiions, and their returns may not be thread safe. It would be
2150 * unreliable to test for complete thread safety in Configure. What we
2151 * do instead is to assume that it is thread-safe, unless overriden by,
2152 * say, a hints file specifying
2153 * -Accflags='-DNO_THREAD_SAFE_QUERYLOCALE */
2154 # ifdef NO_THREAD_SAFE_QUERYLOCALE
2155 # undef HAS_THREAD_SAFE_QUERYLOCALE
2156 # else
2157 # define HAS_THREAD_SAFE_QUERYLOCALE
2158 # endif
2159 # endif
2160
2161 /* Here, we have set up enough information to know if this querylocale()
2162 * is thread-safe, or needs to use a mutex */
2163 # ifdef HAS_THREAD_SAFE_QUERYLOCALE
2164 # define QUERYLOCALE_LOCK
2165 # define QUERYLOCALE_UNLOCK
2166 # else
2167 # define QUERYLOCALE_LOCK gwLOCALE_LOCK
2168 # define QUERYLOCALE_UNLOCK gwLOCALE_UNLOCK
2169 # endif
2170
2171 /* Finally, everything is ready, so here is the 'else' clause to implement
2172 * the case of the current locale not being the global one on systems that
2173 * have some form of querylocale(). (POSIX will presumably eventually
2174 * publish their next version in their pipeline, which will define a
2175 * precisely specified querylocale equivalent, and there can be a new
2176 * #ifdef to use it without having to guess at its characteristics) */
2177
2178 else {
2179 /* We don't keep records when there is querylocale(), so as to avoid the
2180 * pitfalls mentioned at the beginning of this function.
2181 *
2182 * That means LC_ALL has to be calculated from all its constituent
2183 * categories each time, since the querylocale() forms on many (if not
2184 * all) platforms only work on individual categories */
2185 if (index == LC_ALL_INDEX_) {
2186 retval = calculate_LC_ALL_string(NULL, INTERNAL_FORMAT,
2187 WANT_TEMP_PV,
2188 caller_line);
2189 }
2190 else {
2191
2192 QUERYLOCALE_LOCK;
2193 retval = my_querylocale(index, cur_obj);
2194
2195 /* querylocale() may conflate the C locale with something that
2196 * isn't exactly the same. See for example
2197 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=269375
2198 * We know that if the locale object is the C one, we
2199 * are in the C locale, which may go by the name POSIX, as both, by
2200 * definition, are equivalent. But we consider any other name
2201 * spurious, so override with "C". As in the PL_CURLOCALES case
2202 * above, this hides those glitches, for the most part, from the
2203 * rest of our code. (The code is ordered this way so that if the
2204 * system distinugishes "C" from "POSIX", we do too.) */
2205 if (cur_obj == PL_C_locale_obj && ! isNAME_C_OR_POSIX(retval)) {
2206 QUERYLOCALE_UNLOCK;
2207 retval = "C";
2208 }
2209 else {
2210 retval = savepv(retval);
2211 QUERYLOCALE_UNLOCK;
2212 SAVEFREEPV(retval);
2213 }
2214 }
2215 }
2216
2217 # undef QUERYLOCALE_LOCK
2218 # undef QUERYLOCALE_UNLOCK
2219 # endif
2220
2221 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2222 "querylocale_2008_i(%s) returning '%s'\n",
2223 category_names[index], retval));
2224 assert(strNE(retval, ""));
2225 return retval;
2226 }
2227
2228 /*---------------------------------------------------------------------------*/
2229
2230 # define bool_setlocale_i(i, locale) \
2231 bool_setlocale_2008_i(i, locale, __LINE__)
2232
2233 /* If this doesn't exist on this platform, make it a no-op (to save #ifdefs) */
2234 # ifndef update_PL_curlocales_i
2235 # define update_PL_curlocales_i(index, new_locale, caller_line)
2236 # endif
2237
2238 STATIC bool
S_bool_setlocale_2008_i(pTHX_ const locale_category_index index,const char * new_locale,const line_t caller_line)2239 S_bool_setlocale_2008_i(pTHX_
2240
2241 /* Our internal index of the 'category' setlocale is called with */
2242 const locale_category_index index,
2243 const char * new_locale, /* The locale to set the category to */
2244 const line_t caller_line /* Called from this line number */
2245 )
2246 {
2247 PERL_ARGS_ASSERT_BOOL_SETLOCALE_2008_I;
2248
2249 /* This function effectively performs a setlocale() on just the current
2250 * thread; thus it is thread-safe. It does this by using the POSIX 2008
2251 * locale functions to emulate the behavior of setlocale(). Similar to
2252 * regular setlocale(), the return from this function points to memory that
2253 * can be overwritten by other system calls, so needs to be copied
2254 * immediately if you need to retain it. The difference here is that
2255 * system calls besides another setlocale() can overwrite it.
2256 *
2257 * By doing this, most locale-sensitive functions become thread-safe. The
2258 * exceptions are mostly those that return a pointer to static memory.
2259 */
2260
2261 int mask = category_masks[index];
2262 const locale_t entry_obj = uselocale((locale_t) 0);
2263 const char * locale_on_entry = querylocale_i(index);
2264
2265 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2266 "bool_setlocale_2008_i: input=%d (%s), mask=0x%x,"
2267 " new locale=\"%s\", current locale=\"%s\","
2268 " index=%d, entry object=%p;"
2269 " called from %" LINE_Tf "\n",
2270 categories[index], category_names[index], mask,
2271 ((new_locale == NULL) ? "(nil)" : new_locale),
2272 locale_on_entry, index, entry_obj, caller_line));
2273
2274 /* Here, trying to change the locale, but it is a no-op if the new boss is
2275 * the same as the old boss. Except this routine is called when converting
2276 * from the global locale, so in that case we will create a per-thread
2277 * locale below (with the current values). It also seemed that newlocale()
2278 * could free up the basis locale memory if we called it with the new and
2279 * old being the same, but khw now thinks that this was due to some other
2280 * bug, since fixed, as there are other places where newlocale() gets
2281 * similarly called without problems. */
2282 if ( entry_obj != LC_GLOBAL_LOCALE
2283 && locale_on_entry
2284 && strEQ(new_locale, locale_on_entry))
2285 {
2286 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2287 "bool_setlocale_2008_i: no-op to change to"
2288 " what it already was\n"));
2289 return true;
2290 }
2291
2292 # ifndef USE_QUERYLOCALE
2293
2294 /* Without a querylocale() mechanism, we have to figure out ourselves what
2295 * happens with setting a locale to "" */
2296
2297 if (strEQ(new_locale, "")) {
2298 new_locale = find_locale_from_environment(index);
2299 if (! new_locale) {
2300 SET_EINVAL;
2301 return false;
2302 }
2303 }
2304
2305 # endif
2306 # ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2307
2308 const bool need_loop = false;
2309
2310 # else
2311
2312 bool need_loop = false;
2313 const char * new_locales[LC_ALL_INDEX_] = { NULL };
2314
2315 /* If we're going to have to parse the LC_ALL string, might as well do it
2316 * now before we have made changes that we would have to back out of if the
2317 * parse fails */
2318 if (index == LC_ALL_INDEX_) {
2319 switch (parse_LC_ALL_string(new_locale,
2320 (const char **) &new_locales,
2321 override_if_ignored,
2322 false, /* Return only [0] if suffices */
2323 false, /* Don't panic on error */
2324 caller_line))
2325 {
2326 case invalid:
2327 SET_EINVAL;
2328 return false;
2329
2330 case no_array:
2331 need_loop = false;
2332 break;
2333
2334 case only_element_0:
2335 SAVEFREEPV(new_locales[0]);
2336 new_locale = new_locales[0];
2337 need_loop = false;
2338 break;
2339
2340 case full_array:
2341 need_loop = true;
2342 break;
2343 }
2344 }
2345
2346 # endif
2347 # ifdef HAS_GLIBC_LC_MESSAGES_BUG
2348
2349 /* For this bug, if the LC_MESSAGES locale changes, we have to do an
2350 * expensive workaround. Save the current value so we can later determine
2351 * if it changed. */
2352 const char * old_messages_locale = NULL;
2353 if ( (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
2354 && LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
2355 {
2356 old_messages_locale = querylocale_c(LC_MESSAGES);
2357 }
2358
2359 # endif
2360
2361 assert(PL_C_locale_obj);
2362
2363 /* Now ready to switch to the input 'new_locale' */
2364
2365 /* Switching locales generally entails freeing the current one's space (at
2366 * the C library's discretion), hence we can't be using that locale at the
2367 * time of the switch (this wasn't obvious to khw from the man pages). So
2368 * switch to a known locale object that we don't otherwise mess with. */
2369 if (! uselocale(PL_C_locale_obj)) {
2370
2371 /* Not being able to change to the C locale is severe; don't keep
2372 * going. */
2373 setlocale_failure_panic_i(index, locale_on_entry, "C",
2374 __LINE__, caller_line);
2375 NOT_REACHED; /* NOTREACHED */
2376 }
2377
2378 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2379 "bool_setlocale_2008_i: now using C"
2380 " object=%p\n", PL_C_locale_obj));
2381
2382 /* These two objects are special:
2383 * LC_GLOBAL_LOCALE because it is undefined behavior to call
2384 * newlocale() with it as a parameter.
2385 * PL_C_locale_obj because newlocale() generally destroys its locale
2386 * object parameter when it succeeds; and we don't
2387 * want that happening to this immutable object.
2388 * Copies will be made for them to use instead if we get so far as to call
2389 * newlocale(). */
2390 bool entry_obj_is_special = ( entry_obj == LC_GLOBAL_LOCALE
2391 || entry_obj == PL_C_locale_obj);
2392 locale_t new_obj;
2393
2394 /* PL_C_locale_obj is LC_ALL set to the C locale. If this call is to
2395 * switch to LC_ALL => C, simply use that object. But in fact, we already
2396 * have switched to it just above, in preparation for the general case.
2397 * Since we're already there, no need to do further switching. */
2398 if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
2399 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2400 "bool_setlocale_2008_i: will stay in C"
2401 " object\n"));
2402 new_obj = PL_C_locale_obj;
2403
2404 /* 'entry_obj' is now dangling, of no further use to anyone (unless it
2405 * is one of the special ones). Free it to avoid a leak */
2406 if (! entry_obj_is_special) {
2407 freelocale(entry_obj);
2408 }
2409
2410 update_PL_curlocales_i(index, new_locale, caller_line);
2411 }
2412 else { /* Here is the general case, not to LC_ALL => C */
2413
2414 /* The newlocale() call(s) below take a basis object to build upon to
2415 * create the changed locale, trashing it iff successful.
2416 *
2417 * For the objects that are not to be modified by this function, we
2418 * create a duplicate that gets trashed instead.
2419 *
2420 * Also if we will have to loop doing multiple newlocale()s, there is a
2421 * chance we will succeed for the first few, and then fail, having to
2422 * back out. We need to duplicate 'entry_obj' in this case as well, so
2423 * it remains valid as something to back out to. */
2424 locale_t basis_obj = entry_obj;
2425
2426 if (entry_obj_is_special || need_loop) {
2427 basis_obj = duplocale(basis_obj);
2428 if (! basis_obj) {
2429 locale_panic_via_("duplocale failed", __FILE__, caller_line);
2430 NOT_REACHED; /* NOTREACHED */
2431 }
2432
2433 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2434 "bool_setlocale_2008_i created %p by"
2435 " duping the input\n", basis_obj));
2436 }
2437
2438 # define DEBUG_NEW_OBJECT_CREATED(category, locale, new, old, caller_line) \
2439 DEBUG_Lv(PerlIO_printf(Perl_debug_log, \
2440 "bool_setlocale_2008_i(%s, %s): created %p" \
2441 " while freeing %p; called from %" LINE_Tf \
2442 " via %" LINE_Tf "\n", \
2443 category, locale, new, old, \
2444 caller_line, (line_t)__LINE__))
2445 # define DEBUG_NEW_OBJECT_FAILED(category, locale, basis_obj) \
2446 DEBUG_L(PerlIO_printf(Perl_debug_log, \
2447 "bool_setlocale_2008_i: creating new object" \
2448 " for (%s '%s') from %p failed; called from %" \
2449 LINE_Tf " via %" LINE_Tf "\n", \
2450 category, locale, basis_obj, \
2451 caller_line, (line_t)__LINE__));
2452
2453 /* Ready to create a new locale by modification of the existing one.
2454 *
2455 * NOTE: This code may incorrectly show up as a leak under the address
2456 * sanitizer. We do not free this object under normal teardown, however
2457 * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
2458 */
2459
2460 # ifdef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2461
2462 /* Some platforms have a newlocale() that can handle disparate LC_ALL
2463 * input, so on these a single call to newlocale() always works */
2464 # else
2465
2466 /* If a single call to newlocale() will do */
2467 if (! need_loop)
2468
2469 # endif
2470
2471 {
2472 new_obj = newlocale(mask,
2473 override_ignored_category(index, new_locale),
2474 basis_obj);
2475 if (! new_obj) {
2476 DEBUG_NEW_OBJECT_FAILED(category_names[index], new_locale,
2477 basis_obj);
2478
2479 /* Since the call failed, it didn't trash 'basis_obj', which is
2480 * a dup for these objects, and hence would leak if we don't
2481 * free it. XXX However, something is seriously wrong if we
2482 * can't switch to C or the global locale, so maybe should
2483 * panic instead */
2484 if (entry_obj_is_special) {
2485 freelocale(basis_obj);
2486 }
2487
2488 goto must_restore_state;
2489 }
2490
2491 DEBUG_NEW_OBJECT_CREATED(category_names[index], new_locale,
2492 new_obj, basis_obj, caller_line);
2493
2494 update_PL_curlocales_i(index, new_locale, caller_line);
2495 }
2496
2497 # ifndef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
2498
2499 else { /* Need multiple newlocale() calls */
2500
2501 /* Loop through the individual categories, setting the locale of
2502 * each to the corresponding name previously populated into
2503 * newlocales[]. Each iteration builds on the previous one, adding
2504 * its category to what's already been calculated, and taking as a
2505 * basis for what's been calculated 'basis_obj', which is updated
2506 * each iteration to be the result of the previous one. Upon
2507 * success, newlocale() trashes the 'basis_obj' parameter to it.
2508 * If any iteration fails, we immediately give up, restore the
2509 * locale to what it was at the time this function was called
2510 * (saved in 'entry_obj'), and return failure. */
2511
2512 /* Loop, using the previous iteration's result as the basis for the
2513 * next one. (The first time we effectively use the locale in
2514 * force upon entry to this function.) */
2515 for_all_individual_category_indexes(i) {
2516 new_obj = newlocale(category_masks[i],
2517 new_locales[i],
2518 basis_obj);
2519 if (new_obj) {
2520 DEBUG_NEW_OBJECT_CREATED(category_names[i],
2521 new_locales[i],
2522 new_obj, basis_obj,
2523 caller_line);
2524 basis_obj = new_obj;
2525 continue;
2526 }
2527
2528 /* Failed. Likely this is because the proposed new locale
2529 * isn't valid on this system. */
2530
2531 DEBUG_NEW_OBJECT_FAILED(category_names[i],
2532 new_locales[i],
2533 basis_obj);
2534
2535 /* newlocale() didn't trash this, since the function call
2536 * failed */
2537 freelocale(basis_obj);
2538
2539 for_all_individual_category_indexes(j) {
2540 Safefree(new_locales[j]);
2541 }
2542
2543 goto must_restore_state;
2544 }
2545
2546 /* Success for all categories. */
2547 for_all_individual_category_indexes(i) {
2548 update_PL_curlocales_i(i, new_locales[i], caller_line);
2549 Safefree(new_locales[i]);
2550 }
2551
2552 /* We dup'd entry_obj in case we had to fall back to it. The
2553 * newlocale() above destroyed the dup when it first succeeded, but
2554 * entry_obj itself is left dangling, so free it */
2555 if (! entry_obj_is_special) {
2556 freelocale(entry_obj);
2557 }
2558 }
2559
2560 # endif /* End of newlocale can't handle disparate LC_ALL input */
2561
2562 }
2563
2564 # undef DEBUG_NEW_OBJECT_CREATED
2565 # undef DEBUG_NEW_OBJECT_FAILED
2566
2567 /* Here, successfully created an object representing the desired locale;
2568 * now switch into it */
2569 if (! uselocale(new_obj)) {
2570 freelocale(new_obj);
2571 locale_panic_(Perl_form(aTHX_ "(called from %" LINE_Tf "):"
2572 " bool_setlocale_2008_i: switching"
2573 " into new locale failed",
2574 caller_line));
2575 }
2576
2577 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2578 "bool_setlocale_2008_i: now using %p\n", new_obj));
2579
2580 # ifdef MULTIPLICITY /* Unlikely, but POSIX 2008 functions could be
2581 Configured to be used on unthreaded perls, in which
2582 case this object doesn't exist */
2583
2584 if (DEBUG_Lv_TEST) {
2585 if (PL_cur_locale_obj != new_obj) {
2586 PerlIO_printf(Perl_debug_log,
2587 "bool_setlocale_2008_i: PL_cur_locale_obj"
2588 " was %p, now is %p\n",
2589 PL_cur_locale_obj, new_obj);
2590 }
2591 }
2592
2593 /* Update the current object */
2594 PL_cur_locale_obj = new_obj;
2595
2596 # endif
2597 # ifdef HAS_GLIBC_LC_MESSAGES_BUG
2598
2599 /* Invalidate the glibc cache of loaded translations if the locale has
2600 * changed, see [perl #134264] and
2601 * https://sourceware.org/bugzilla/show_bug.cgi?id=24936 */
2602 if (old_messages_locale) {
2603 if (strNE(old_messages_locale, querylocale_c(LC_MESSAGES))) {
2604 textdomain(textdomain(NULL));
2605 }
2606 }
2607
2608 # endif
2609
2610 return true;
2611
2612 must_restore_state:
2613
2614 /* We earlier switched to the LC_ALL => C locale in anticipation of it
2615 * succeeding, Now have to switch back to the state upon entry. */
2616 if (! uselocale(entry_obj)) {
2617 setlocale_failure_panic_i(index, "switching back to",
2618 locale_on_entry, __LINE__, caller_line);
2619 }
2620
2621 return false;
2622 }
2623
2624 /*===========================================================================*/
2625
2626 #else
2627 # error Unexpected Configuration
2628 #endif /* End of the various implementations of the setlocale and
2629 querylocale macros used in the remainder of this program */
2630
2631 /*===========================================================================*/
2632
2633 /* Each implementation above is based on two fundamental macros #defined above:
2634 * 1) either a querylocale_r or a querylocale_i
2635 * 2) either a bool_setlocale_r or a bool_setlocale_i
2636 *
2637 * (Which one of each got #defined is based on which is most efficient in
2638 * interacting with the underlying libc functions called.)
2639 *
2640 * To complete the implementation, macros for the missing two suffixes must be
2641 * #defined, as well as all the void_setlocale_X() forms. These all can be
2642 * mechanically derived from the fundamental ones. */
2643
2644 #ifdef querylocale_r
2645 # define querylocale_c(cat) querylocale_r(cat)
2646 # define querylocale_i(i) querylocale_r(categories[i])
2647 #elif defined(querylocale_i)
2648 # define querylocale_c(cat) querylocale_i(cat##_INDEX_)
2649 # define querylocale_r(cat) querylocale_i(get_category_index(cat))
2650 #else
2651 # error No querylocale() form defined
2652 #endif
2653
2654 #ifdef bool_setlocale_r
2655 # define bool_setlocale_i(i, l) bool_setlocale_r(categories[i], l)
2656 # define bool_setlocale_c(cat, l) bool_setlocale_r(cat, l)
2657
2658 # define void_setlocale_r_with_caller(cat, locale, file, line) \
2659 STMT_START { \
2660 if (! bool_setlocale_r(cat, locale)) \
2661 setlocale_failure_panic_via_i(get_category_index(cat), \
2662 NULL, locale, __LINE__, 0, \
2663 file, line); \
2664 } STMT_END
2665
2666 # define void_setlocale_c_with_caller(cat, locale, file, line) \
2667 void_setlocale_r_with_caller(cat, locale, file, line)
2668
2669 # define void_setlocale_i_with_caller(i, locale, file, line) \
2670 void_setlocale_r_with_caller(categories[i], locale, file, line)
2671
2672 # define void_setlocale_r(cat, locale) \
2673 void_setlocale_r_with_caller(cat, locale, __FILE__, __LINE__)
2674 # define void_setlocale_c(cat, locale) \
2675 void_setlocale_r(cat, locale)
2676 # define void_setlocale_i(i, locale) \
2677 void_setlocale_r(categories[i], locale)
2678
2679 #elif defined(bool_setlocale_i)
2680 # define bool_setlocale_c(cat, loc) bool_setlocale_i(cat##_INDEX_, loc)
2681 # define bool_setlocale_r(c, loc) bool_setlocale_i(get_category_index(c), l)
2682
2683 # define void_setlocale_i_with_caller(i, locale, file, line) \
2684 STMT_START { \
2685 if (! bool_setlocale_i(i, locale)) \
2686 setlocale_failure_panic_via_i(i, NULL, locale, __LINE__, 0, \
2687 file, line); \
2688 } STMT_END
2689
2690 # define void_setlocale_r_with_caller(cat, locale, file, line) \
2691 void_setlocale_i_with_caller(get_category_index(cat), locale, \
2692 file, line)
2693
2694 # define void_setlocale_c_with_caller(cat, locale, file, line) \
2695 void_setlocale_i_with_caller(cat##_INDEX_, locale, file, line)
2696
2697 # define void_setlocale_i(i, locale) \
2698 void_setlocale_i_with_caller(i, locale, __FILE__, __LINE__)
2699 # define void_setlocale_c(cat, locale) \
2700 void_setlocale_i(cat##_INDEX_, locale)
2701 # define void_setlocale_r(cat, locale) \
2702 void_setlocale_i(get_category_index(cat), locale)
2703
2704 #else
2705 # error No bool_setlocale() form defined
2706 #endif
2707
2708 /*===========================================================================*/
2709
2710 /* Most of the cases in this file just toggle the locale briefly; but there are
2711 * a few instances where a longer toggled interval, over multiple operations,
2712 * is desirable, since toggling and untoggling have a cost. But on platforms
2713 * where toggling must be done in a critical section, it is even more desirable
2714 * to minimize the length of time in an uninterruptable state.
2715 *
2716 * The macros below try to balance these competing interests. When the
2717 * toggling is to be brief, simply use the plain "toggle_locale" macros. But
2718 * in addition, in the places where an over-arching toggle would be nice, add
2719 * calls to the macros below that have the "_locking" suffix. These are no-ops
2720 * except on systems where the toggling doesn't force a critical section. But
2721 * otherwise these toggle to the over-arching locale. When the individual
2722 * toggles are executed, they will check and find that the locale is already in
2723 * the right state, and return without doing anything. */
2724 #if TOGGLING_LOCKS
2725 # define toggle_locale_c_unless_locking(cat, locale) NULL
2726 # define toggle_locale_c_if_locking( cat, locale) \
2727 toggle_locale_i(cat##_INDEX_, locale)
2728
2729 # define restore_toggled_locale_c_unless_locking(cat, locale) \
2730 PERL_UNUSED_ARG(locale)
2731 # define restore_toggled_locale_c_if_locking( cat, locale) \
2732 restore_toggled_locale_i( cat##_INDEX_, locale)
2733 #else
2734 # define toggle_locale_c_unless_locking(cat, locale) \
2735 toggle_locale_i(cat##_INDEX_, locale)
2736 # define toggle_locale_c_if_locking( cat, locale) NULL
2737
2738 # define restore_toggled_locale_c_unless_locking(cat, locale) \
2739 restore_toggled_locale_i(cat##_INDEX_, locale)
2740 # define restore_toggled_locale_c_if_locking( cat, locale) \
2741 PERL_UNUSED_ARG(locale)
2742 #endif
2743
2744 /* query_nominal_locale_i() is used when the caller needs the locale that an
2745 * external caller would be expecting, and not what we're secretly using
2746 * behind the scenes. It deliberately doesn't handle LC_ALL; use
2747 * calculate_LC_ALL_string() for that. */
2748 #ifdef USE_LOCALE_NUMERIC
2749 # define query_nominal_locale_i(i) \
2750 (__ASSERT_(i != LC_ALL_INDEX_) \
2751 ((i == LC_NUMERIC_INDEX_) ? PL_numeric_name : querylocale_i(i)))
2752 #elif defined(USE_LOCALE)
2753 # define query_nominal_locale_i(i) \
2754 (__ASSERT_(i != LC_ALL_INDEX_) querylocale_i(i))
2755 #else
2756 # define query_nominal_locale_i(i) "C"
2757 #endif
2758
2759 #ifdef USE_PL_CURLOCALES
2760
2761 STATIC void
S_update_PL_curlocales_i(pTHX_ const locale_category_index index,const char * new_locale,const line_t caller_line)2762 S_update_PL_curlocales_i(pTHX_
2763 const locale_category_index index,
2764 const char * new_locale,
2765 const line_t caller_line)
2766 {
2767 /* Update PL_curlocales[], which is parallel to the other ones indexed by
2768 * our mapping of libc category number to our internal equivalents. */
2769
2770 PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
2771
2772 if (index == LC_ALL_INDEX_) {
2773
2774 /* For LC_ALL, we change all individual categories to correspond,
2775 * including the LC_ALL element */
2776 for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) {
2777 Safefree(PL_curlocales[i]);
2778 PL_curlocales[i] = NULL;
2779 }
2780
2781 switch (parse_LC_ALL_string(new_locale,
2782 (const char **) &PL_curlocales,
2783 check_that_overridden, /* things should
2784 have already
2785 been overridden
2786 */
2787 true, /* Always fill array */
2788 true, /* Panic if fails, as to get here
2789 it earlier had to have succeeded
2790 */
2791 caller_line))
2792 {
2793 case invalid:
2794 case no_array:
2795 case only_element_0:
2796 locale_panic_via_("Unexpected return from parse_LC_ALL_string",
2797 __FILE__, caller_line);
2798
2799 case full_array:
2800 /* parse_LC_ALL_string() has already filled PL_curlocales properly,
2801 * except for the LC_ALL element, which should be set to
2802 * 'new_locale'. */
2803 PL_curlocales[LC_ALL_INDEX_] = savepv(new_locale);
2804 }
2805 }
2806 else { /* Not LC_ALL */
2807
2808 /* Update the single category's record */
2809 Safefree(PL_curlocales[index]);
2810 PL_curlocales[index] = savepv(new_locale);
2811
2812 /* Invalidate LC_ALL */
2813 Safefree(PL_curlocales[LC_ALL_INDEX_]);
2814 PL_curlocales[LC_ALL_INDEX_] = NULL;
2815 }
2816 }
2817
2818 # endif /* Need PL_curlocales[] */
2819
2820 /*===========================================================================*/
2821
2822 #if defined(USE_LOCALE)
2823
2824 /* This paradigm is needed in several places in the function below. We have to
2825 * substitute the nominal locale for LC_NUMERIC when returning a value for
2826 * external consumption */
2827 # ifndef USE_LOCALE_NUMERIC
2828 # define ENTRY(i, array, format) array[i]
2829 # else
2830 # define ENTRY(i, array, format) \
2831 (UNLIKELY( format == EXTERNAL_FORMAT_FOR_QUERY \
2832 && i == LC_NUMERIC_INDEX_) \
2833 ? PL_numeric_name \
2834 : array[i])
2835 # endif
2836
2837 STATIC
2838 const char *
S_calculate_LC_ALL_string(pTHX_ const char ** category_locales_list,const calc_LC_ALL_format format,const calc_LC_ALL_return returning,const line_t caller_line)2839 S_calculate_LC_ALL_string(pTHX_ const char ** category_locales_list,
2840 const calc_LC_ALL_format format,
2841 const calc_LC_ALL_return returning,
2842 const line_t caller_line)
2843 {
2844 PERL_ARGS_ASSERT_CALCULATE_LC_ALL_STRING;
2845
2846 /* NOTE: On Configurations that have PL_curlocales[], this function has the
2847 * side effect of updating the LC_ALL_INDEX_ element with its result.
2848 *
2849 * This function calculates a string that defines the locale(s) LC_ALL is
2850 * set to, in either:
2851 * 1) Our internal format if 'format' is set to INTERNAL_FORMAT.
2852 * 2) The external format returned by Perl_setlocale() if 'format' is set
2853 * to EXTERNAL_FORMAT_FOR_QUERY or EXTERNAL_FORMAT_FOR_SET.
2854 *
2855 * These two are distinguished by:
2856 * a) EXTERNAL_FORMAT_FOR_SET returns the actual locale currently in
2857 * effect.
2858 * b) EXTERNAL_FORMAT_FOR_QUERY returns the nominal locale.
2859 * Currently this can differ only from the actual locale in the
2860 * LC_NUMERIC category when it is set to a locale whose radix is
2861 * not a dot. (The actual locale is kept as a dot to accommodate
2862 * the large corpus of XS code that expects it to be that;
2863 * switched to a non-dot temporarily during certain operations
2864 * that require the actual radix.)
2865 *
2866 * In both 1) and 2), LC_ALL's values are passed to this function by
2867 * 'category_locales_list' which is either:
2868 * 1) a pointer to an array of strings with up-to-date values of all the
2869 * individual categories; or
2870 * 2) NULL, to indicate to use querylocale_i() to get each individual
2871 * value.
2872 *
2873 * The caller sets 'returning' to
2874 * WANT_TEMP_PV the function returns the calculated string
2875 * as a mortalized temporary, so the caller
2876 * doesn't have to worry about it being
2877 * per-thread, nor needs to arrange for its
2878 * clean-up.
2879 * WANT_PL_setlocale_buf the function stores the calculated string
2880 * into the per-thread buffer PL_setlocale_buf
2881 * and returns a pointer to that. The buffer
2882 * is cleaned up automatically in process
2883 * destruction. This return method avoids
2884 * extra copies in some circumstances.
2885 * WANT_VOID NULL is returned. This is used when the
2886 * function is being called only for its side
2887 * effect of updating
2888 * PL_curlocales[LC_ALL_INDEX_]
2889 *
2890 * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
2891 * So we have to construct the answer ourselves based on the passed in
2892 * data.
2893 *
2894 * If all individual categories are the same locale, we can just set LC_ALL
2895 * to that locale. But if not, we have to create an aggregation of all the
2896 * categories on the system. Platforms differ as to the syntax they use
2897 * for these non-uniform locales for LC_ALL. Some, like glibc and Windows,
2898 * use an unordered series of name=value pairs, like
2899 * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
2900 * to specify LC_ALL; others, like *BSD, use a positional notation with a
2901 * delimitter, typically a single '/' character:
2902 * C/en_UK.UTF-8/...
2903 *
2904 * When the external format is desired, this function returns whatever the
2905 * system expects. The internal format is always name=value pairs.
2906 *
2907 * For systems that have categories we don't know about, the algorithm
2908 * below won't know about those missing categories, leading to potential
2909 * bugs for code that looks at them. If there is an environment variable
2910 * that sets that category, we won't know to look for it, and so our use of
2911 * LANG or "C" improperly overrides it. On the other hand, if we don't do
2912 * what is done here, and there is no environment variable, the category's
2913 * locale should be set to LANG or "C". So there is no good solution. khw
2914 * thinks the best is to make sure we have a complete list of possible
2915 * categories, adding new ones as they show up on obscure platforms.
2916 */
2917
2918 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2919 "Entering calculate_LC_ALL_string(%s);"
2920 " called from %" LINE_Tf "\n",
2921 ((format == EXTERNAL_FORMAT_FOR_QUERY)
2922 ? "EXTERNAL_FORMAT_FOR_QUERY"
2923 : ((format == EXTERNAL_FORMAT_FOR_SET)
2924 ? "EXTERNAL_FORMAT_FOR_SET"
2925 : "INTERNAL_FORMAT")),
2926 caller_line));
2927
2928 bool input_list_was_NULL = (category_locales_list == NULL);
2929
2930 /* If there was no input category list, construct a temporary one
2931 * ourselves. */
2932 const char * my_category_locales_list[LC_ALL_INDEX_];
2933 const char ** locales_list = category_locales_list;
2934 if (locales_list == NULL) {
2935 locales_list = my_category_locales_list;
2936
2937 if (format == EXTERNAL_FORMAT_FOR_QUERY) {
2938 for_all_individual_category_indexes(i) {
2939 locales_list[i] = query_nominal_locale_i(i);
2940 }
2941 }
2942 else {
2943 for_all_individual_category_indexes(i) {
2944 locales_list[i] = querylocale_i(i);
2945 }
2946 }
2947 }
2948
2949 /* While we are calculating LC_ALL, we see if every category's locale is
2950 * the same as every other's or not. */
2951 # ifndef HAS_IGNORED_LOCALE_CATEGORIES_
2952
2953 /* When we pay attention to all categories, we assume they are all the same
2954 * until proven different */
2955 bool disparate = false;
2956
2957 # else
2958
2959 /* But if there are ignored categories, those will be set to "C", so try an
2960 * arbitrary category, and if it isn't C, we know immediately that the
2961 * locales are disparate. (The #if conditionals are to handle the case
2962 * where LC_NUMERIC_INDEX_ is 0. We don't want to use LC_NUMERIC to
2963 * compare, as that may be different between external and internal forms.)
2964 * */
2965 # if ! defined(USE_LOCALE_NUMERIC)
2966
2967 bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]);
2968
2969 # elif LC_NUMERIC_INDEX_ != 0
2970
2971 bool disparate = ! isNAME_C_OR_POSIX(locales_list[0]);
2972
2973 # else
2974
2975 /* Would need revision to handle the very unlikely case where only a single
2976 * category, LC_NUMERIC, is defined */
2977 assert(LOCALE_CATEGORIES_COUNT_ > 0);
2978
2979 bool disparate = ! isNAME_C_OR_POSIX(locales_list[1]);
2980
2981 # endif
2982 # endif
2983
2984 /* Calculate the needed size for the string listing the individual locales.
2985 * Initialize with values known at compile time. */
2986 Size_t total_len;
2987 const char *separator;
2988
2989 # ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS /* Positional formatted LC_ALL */
2990 PERL_UNUSED_ARG(format);
2991 # else
2992
2993 if (format != INTERNAL_FORMAT) {
2994
2995 /* Here, we will be using positional notation. it includes n-1
2996 * separators */
2997 total_len = ( LOCALE_CATEGORIES_COUNT_ - 1)
2998 * STRLENs(PERL_LC_ALL_SEPARATOR)
2999 + 1; /* And a trailing NUL */
3000 separator = PERL_LC_ALL_SEPARATOR;
3001 }
3002 else
3003
3004 # endif
3005
3006 {
3007 /* name=value output is always used in internal format, and when
3008 * positional isn't available on the platform. */
3009 total_len = lc_all_boiler_plate_length;
3010 separator = ";";
3011 }
3012
3013 /* The total length then is just the sum of the above boiler-plate plus the
3014 * total strlen()s of the locale name of each individual category. */
3015 for_all_individual_category_indexes(i) {
3016 const char * entry = ENTRY(i, locales_list, format);
3017
3018 total_len += strlen(entry);
3019 if (! disparate && strNE(entry, locales_list[0])) {
3020 disparate = true;
3021 }
3022 }
3023
3024 bool free_if_void_return = false;
3025 const char * retval;
3026
3027 /* If all categories have the same locale, we already know the answer */
3028 if (! disparate) {
3029 if (returning == WANT_PL_setlocale_buf) {
3030 save_to_buffer(locales_list[0],
3031 &PL_setlocale_buf,
3032 &PL_setlocale_bufsize);
3033 retval = PL_setlocale_buf;
3034 }
3035 else {
3036
3037 retval = locales_list[0];
3038
3039 /* If a temporary is wanted for the return, and we had to create
3040 * the input list ourselves, we created it into such a temporary,
3041 * so no further work is needed; but otherwise, make a mortal copy
3042 * of this passed-in list element */
3043 if (returning == WANT_TEMP_PV && ! input_list_was_NULL) {
3044 retval = savepv(retval);
3045 SAVEFREEPV(retval);
3046 }
3047
3048 /* In all cases here, there's nothing we create that needs to be
3049 * freed, so leave 'free_if_void_return' set to the default
3050 * 'false'. */
3051 }
3052 }
3053 else { /* Here, not all categories have the same locale */
3054
3055 char * constructed;
3056
3057 /* If returning to PL_setlocale_buf, set up to write directly to it,
3058 * being sure it is resized to be large enough */
3059 if (returning == WANT_PL_setlocale_buf) {
3060 set_save_buffer_min_size(total_len,
3061 &PL_setlocale_buf,
3062 &PL_setlocale_bufsize);
3063 constructed = PL_setlocale_buf;
3064 }
3065 else { /* Otherwise we need new memory to hold the calculated value. */
3066
3067 Newx(constructed, total_len, char);
3068
3069 /* If returning the new memory, it must be set up to be freed
3070 * later; otherwise at the end of this function */
3071 if (returning == WANT_TEMP_PV) {
3072 SAVEFREEPV(constructed);
3073 }
3074 else {
3075 free_if_void_return = true;
3076 }
3077 }
3078
3079 constructed[0] = '\0';
3080
3081 /* Loop through all the categories */
3082 for_all_individual_category_indexes(j) {
3083
3084 /* Add a separator, except before the first one */
3085 if (j != 0) {
3086 my_strlcat(constructed, separator, total_len);
3087 }
3088
3089 const char * entry;
3090 Size_t needed_len;
3091 unsigned int i = j;
3092
3093 # ifndef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
3094
3095 if (UNLIKELY(format != INTERNAL_FORMAT)) {
3096
3097 /* In positional notation 'j' means the position, and we have
3098 * to convert to the index 'i' */
3099 i = map_LC_ALL_position_to_index[j];
3100
3101 entry = ENTRY(i, locales_list, format);
3102 needed_len = my_strlcat(constructed, entry, total_len);
3103 }
3104 else
3105
3106 # endif
3107 {
3108 /* Below, we are to use name=value notation, either because
3109 * that's what the platform uses, or because this is the
3110 * internal format, which uses that notation regardless of the
3111 * external form */
3112
3113 entry = ENTRY(i, locales_list, format);
3114
3115 /* "name=locale;" */
3116 my_strlcat(constructed, category_names[i], total_len);
3117 my_strlcat(constructed, "=", total_len);
3118 needed_len = my_strlcat(constructed, entry, total_len);
3119 }
3120
3121 if (LIKELY(needed_len <= total_len)) {
3122 continue;
3123 }
3124
3125 /* If would have overflowed, panic */
3126 locale_panic_via_(Perl_form(aTHX_
3127 "Internal length calculation wrong.\n"
3128 "\"%s\" was not entirely added to"
3129 " \"%.*s\"; needed=%zu, had=%zu",
3130 entry, (int) total_len,
3131 constructed,
3132 needed_len, total_len),
3133 __FILE__,
3134 caller_line);
3135 } /* End of loop through the categories */
3136
3137 retval = constructed;
3138 } /* End of the categories' locales are displarate */
3139
3140 # if defined(USE_PL_CURLOCALES) && defined(LC_ALL)
3141
3142 if (format == INTERNAL_FORMAT) {
3143
3144 /* PL_curlocales[LC_ALL_INDEX_] is updated as a side-effect of this
3145 * function for internal format. */
3146 Safefree(PL_curlocales[LC_ALL_INDEX_]);
3147 PL_curlocales[LC_ALL_INDEX_] = savepv(retval);
3148 }
3149
3150 # endif
3151
3152 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3153 "calculate_LC_ALL_string calculated '%s'\n",
3154 retval));
3155
3156 if (returning == WANT_VOID) {
3157 if (free_if_void_return) {
3158 Safefree(retval);
3159 }
3160
3161 return NULL;
3162 }
3163
3164 return retval;
3165 }
3166
3167 # if defined(WIN32) || ( defined(USE_POSIX_2008_LOCALE) \
3168 && ! defined(USE_QUERYLOCALE))
3169
3170 STATIC const char *
S_find_locale_from_environment(pTHX_ const locale_category_index index)3171 S_find_locale_from_environment(pTHX_ const locale_category_index index)
3172 {
3173 /* NB: This function may actually change the locale on Windows. It
3174 * currently is designed to be called only from setting the locale on
3175 * Windows, and POSIX 2008
3176 *
3177 * This function returns the locale specified by the program's environment
3178 * for the category specified by our internal index number 'index'. It
3179 * therefore simulates:
3180 * setlocale(cat, "")
3181 * but, except for some cases in Windows, doesn't actually change the
3182 * locale; merely returns it.
3183 *
3184 * The return need not be freed by the caller. This
3185 * promise relies on PerlEnv_getenv() returning a mortalized copy to us.
3186 *
3187 * The simulation is needed only on certain platforms; otherwise, libc is
3188 * called with "" to get the actual value(s). The simulation is needed
3189 * for:
3190 *
3191 * 1) On Windows systems, the concept of the POSIX ordering of
3192 * environment variables is missing. To increase portability of
3193 * programs across platforms, the POSIX ordering is emulated on
3194 * Windows.
3195 *
3196 * 2) On POSIX 2008 systems without querylocale(), it is problematic
3197 * getting the results of the POSIX 2008 equivalent of
3198 *
3199 * setlocale(category, "")
3200 *
3201 * To ensure that we know exactly what those values are, we do the
3202 * setting ourselves, using the documented algorithm specified by the
3203 * POSIX standard (assuming the platform follows the Standard) rather
3204 * than use "" as the locale. This will lead to results that differ
3205 * from native behavior if the native behavior differs from the
3206 * Standard's documented value, but khw believes it is better to know
3207 * what's going on, even if different from native, than to just guess.
3208 *
3209 * glibc systems differ from this standard in having a LANGUAGE
3210 * environment variable used for just LC_MESSAGES. This function does
3211 * NOT handle that.
3212 *
3213 * Another option for the POSIX 2008 case would be, in a critical
3214 * section, to save the global locale's current value, and do a
3215 * straight setlocale(LC_ALL, ""). That would return our desired
3216 * values, destroying the global locale's, which we would then
3217 * restore. But that could cause races with any other thread that is
3218 * using the global locale and isn't using the mutex. And, the only
3219 * reason someone would have done that is because they are calling a
3220 * library function, like in gtk, that calls setlocale(), and which
3221 * can't be changed to use the mutex. That wouldn't be a problem if
3222 * this were to be done before any threads had switched, say during
3223 * perl construction time. But this code would still be needed for
3224 * the general case.
3225 *
3226 * The Windows and POSIX 2008 differ in that the ultimate fallback is "C"
3227 * in POSIX, and is the system default locale in Windows. To get that
3228 * system default value, we actually have to call setlocale() on Windows.
3229 */
3230
3231 const char * const lc_all = PerlEnv_getenv("LC_ALL");
3232 const char * locale_names[LC_ALL_INDEX_] = { NULL };
3233
3234 /* Use any "LC_ALL" environment variable, as it overrides everything else.
3235 * */
3236 if (lc_all && strNE(lc_all, "")) {
3237 return lc_all;
3238 }
3239
3240 /* Here, no usable LC_ALL environment variable. We have to handle each
3241 * category separately. If all categories are desired, we loop through
3242 * them all. If only an individual category is desired, to avoid
3243 * duplicating logic, we use the same loop, but set up the limits so it is
3244 * only executed once, for that particular category. */
3245 locale_category_index lower, upper, offset;
3246 if (index == LC_ALL_INDEX_) {
3247 lower = (locale_category_index) 0;
3248 upper = (locale_category_index) ((int) LC_ALL_INDEX_ - 1);
3249 offset = (locale_category_index) 0;
3250 }
3251 else {
3252 lower = index;
3253 upper = index;
3254
3255 /* 'offset' is used so that the result of the single loop iteration is
3256 * stored into output[0] */
3257 offset = lower;
3258 }
3259
3260 /* When no LC_ALL environment variable, LANG is used as a default, but
3261 * overridden for individual categories that have corresponding environment
3262 * variables. If no LANG exists, the default is "C" on POSIX 2008, or the
3263 * system default for the category on Windows. */
3264 const char * env_lang = NULL;
3265
3266 /* For each desired category, use any corresponding environment variable;
3267 * or the default if none such exists. */
3268 bool is_disparate = false; /* Assume is uniform until proven otherwise */
3269 for_category_indexes_between(i, lower, upper) {
3270 const char * const env_override = PerlEnv_getenv(category_names[i]);
3271 locale_category_index j = (locale_category_index) (i - offset);
3272
3273 if (env_override && strNE(env_override, "")) {
3274 locale_names[j] = env_override;
3275 }
3276 else { /* Here, no corresponding environment variable, see if LANG
3277 exists and is usable. Done this way to avoid fetching LANG
3278 unless it is actually needed */
3279 if (env_lang == NULL) {
3280 env_lang = PerlEnv_getenv("LANG");
3281
3282 /* If not usable, set it to a non-NULL illegal value so won't
3283 * try to use it below */
3284 if (env_lang == NULL || strEQ(env_lang, "")) {
3285 env_lang = (const char *) 1;
3286 }
3287 }
3288
3289 /* If a usable LANG exists, use it. */
3290 if (env_lang != NULL && env_lang != (const char *) 1) {
3291 locale_names[j] = env_lang;
3292 }
3293 else {
3294
3295 # ifdef WIN32
3296 /* If no LANG, use the system default on Windows. */
3297 locale_names[j] = wrap_wsetlocale(categories[i], ".ACP");
3298 if (locale_names[j]) {
3299 SAVEFREEPV(locale_names[j]);
3300 }
3301 else
3302 # endif
3303 { /* If nothing was found or worked, use C */
3304 locale_names[j] = "C";
3305 }
3306 }
3307 }
3308
3309 if (j > 0 && ! is_disparate && strNE(locale_names[0], locale_names[j]))
3310 {
3311 is_disparate = true;
3312 }
3313
3314 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3315 "find_locale_from_environment i=%u, j=%u, name=%s,"
3316 " locale=%s, locale of 0th category=%s, disparate=%d\n",
3317 i, j, category_names[i],
3318 locale_names[j], locale_names[0], is_disparate));
3319 }
3320
3321 if (! is_disparate) {
3322 return locale_names[0];
3323 }
3324
3325 return calculate_LC_ALL_string(locale_names, INTERNAL_FORMAT,
3326 WANT_TEMP_PV,
3327 __LINE__);
3328 }
3329
3330 # endif
3331 # if defined(DEBUGGING) || defined(USE_PERL_SWITCH_LOCALE_CONTEXT)
3332
3333 STATIC const char *
S_get_LC_ALL_display(pTHX)3334 S_get_LC_ALL_display(pTHX)
3335 {
3336 return calculate_LC_ALL_string(NULL, INTERNAL_FORMAT,
3337 WANT_TEMP_PV,
3338 __LINE__);
3339 }
3340
3341 # endif
3342
3343 STATIC void
S_setlocale_failure_panic_via_i(pTHX_ const locale_category_index cat_index,const char * current,const char * failed,const line_t proxy_caller_line,const line_t immediate_caller_line,const char * const higher_caller_file,const line_t higher_caller_line)3344 S_setlocale_failure_panic_via_i(pTHX_
3345 const locale_category_index cat_index,
3346 const char * current,
3347 const char * failed,
3348 const line_t proxy_caller_line,
3349 const line_t immediate_caller_line,
3350 const char * const higher_caller_file,
3351 const line_t higher_caller_line)
3352 {
3353 PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_VIA_I;
3354
3355 /* Called to panic when a setlocale form unexpectedly failed for the
3356 * category determined by 'cat_index', and the locale that was in effect
3357 * (and likely still is) is 'current'. 'current' may be NULL, which causes
3358 * this function to query what it is.
3359 *
3360 * The extra caller information is used for when a function acts as a
3361 * stand-in for another function, which a typical reader would more likely
3362 * think would be the caller
3363 *
3364 * If a line number is 0, its stack (sort-of) frame is omitted; same if
3365 * it's the same line number as the next higher caller. */
3366
3367 const int cat = categories[cat_index];
3368 const char * name = category_names[cat_index];
3369
3370 dSAVE_ERRNO;
3371
3372 if (current == NULL) {
3373 current = querylocale_i(cat_index);
3374 }
3375
3376 const char * proxy_text = "";
3377 if (proxy_caller_line != 0 && proxy_caller_line != immediate_caller_line)
3378 {
3379 proxy_text = Perl_form(aTHX_ "\nCalled via %s: %" LINE_Tf,
3380 __FILE__, proxy_caller_line);
3381 }
3382 if ( strNE(__FILE__, higher_caller_file)
3383 || ( immediate_caller_line != 0
3384 && immediate_caller_line != higher_caller_line))
3385 {
3386 proxy_text = Perl_form(aTHX_ "%s\nCalled via %s: %" LINE_Tf,
3387 proxy_text, __FILE__,
3388 immediate_caller_line);
3389 }
3390
3391 /* 'false' in the get_displayable_string() calls makes it not think the
3392 * locale is UTF-8, so just dumps bytes. Actually figuring it out can be
3393 * too complicated for a panic situation. */
3394 const char * msg = Perl_form(aTHX_
3395 "Can't change locale for %s (%d) from '%s' to '%s'"
3396 " %s",
3397 name, cat,
3398 get_displayable_string(current,
3399 current + strlen(current),
3400 false),
3401 get_displayable_string(failed,
3402 failed + strlen(failed),
3403 false),
3404 proxy_text);
3405 RESTORE_ERRNO;
3406
3407 Perl_locale_panic(msg, __LINE__, higher_caller_file, higher_caller_line);
3408 NOT_REACHED; /* NOTREACHED */
3409 }
3410
3411 # ifdef USE_LOCALE_NUMERIC
3412
3413 STATIC void
S_new_numeric(pTHX_ const char * newnum,bool force)3414 S_new_numeric(pTHX_ const char *newnum, bool force)
3415 {
3416 PERL_ARGS_ASSERT_NEW_NUMERIC;
3417
3418 /* Called after each libc setlocale() or uselocale() call affecting
3419 * LC_NUMERIC, to tell core Perl this and that 'newnum' is the name of the
3420 * new locale, and we are switched into it. It installs this locale as the
3421 * current underlying default, and then switches to the C locale, if
3422 * necessary, so that the code that has traditionally expected the radix
3423 * character to be a dot may continue to do so.
3424 *
3425 * The default locale and the C locale can be toggled between by use of the
3426 * set_numeric_underlying() and set_numeric_standard() functions, which
3427 * should probably not be called directly, but only via macros like
3428 * SET_NUMERIC_STANDARD() in perl.h.
3429 *
3430 * The toggling is necessary mainly so that a non-dot radix decimal point
3431 * character can be input and output, while allowing internal calculations
3432 * to use a dot.
3433 *
3434 * This sets several interpreter-level variables:
3435 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
3436 * PL_numeric_underlying A boolean indicating if the toggled state is
3437 * such that the current locale is the program's
3438 * underlying locale
3439 * PL_numeric_standard An int indicating if the toggled state is such
3440 * that the current locale is the C locale or
3441 * indistinguishable from the C locale. If non-zero, it
3442 * is in C; if > 1, it means it may not be toggled away
3443 * from C.
3444 * PL_numeric_underlying_is_standard A bool kept by this function
3445 * indicating that the underlying locale and the standard
3446 * C locale are indistinguishable for the purposes of
3447 * LC_NUMERIC. This happens when both of the above two
3448 * variables are true at the same time. (Toggling is a
3449 * no-op under these circumstances.) This variable is
3450 * used to avoid having to recalculate.
3451 * PL_numeric_radix_sv Contains the string that code should use for the
3452 * decimal point. It is set to either a dot or the
3453 * program's underlying locale's radix character string,
3454 * depending on the situation.
3455 * PL_underlying_radix_sv Contains the program's underlying locale's
3456 * radix character string. This is copied into
3457 * PL_numeric_radix_sv when the situation warrants. It
3458 * exists to avoid having to recalculate it when toggling.
3459 */
3460
3461 DEBUG_L( PerlIO_printf(Perl_debug_log,
3462 "Called new_numeric with %s, PL_numeric_name=%s\n",
3463 newnum, PL_numeric_name));
3464
3465 /* We keep records comparing the characteristics of the LC_NUMERIC catetory
3466 * of the current locale vs the standard C locale. If the new locale that
3467 * has just been changed to is the same as the one our records are for,
3468 * they are still valid, and we don't have to recalculate them. 'force' is
3469 * true if the caller suspects that the records are out-of-date, so do go
3470 * ahead and recalculate them. (This can happen when an external library
3471 * has had control and now perl is reestablishing control; we have to
3472 * assume that that library changed the locale in unknown ways.)
3473 *
3474 * Even if our records are valid, the new locale will likely have been
3475 * switched to before this function gets called, and we must toggle into
3476 * one indistinguishable from the C locale with regards to LC_NUMERIC
3477 * handling, so that all the libc functions that are affected by LC_NUMERIC
3478 * will work as expected. This can be skipped if we already know that the
3479 * locale is indistinguishable from the C locale. */
3480 if (! force && strEQ(PL_numeric_name, newnum)) {
3481 if (! PL_numeric_underlying_is_standard) {
3482 set_numeric_standard(__FILE__, __LINE__);
3483 }
3484
3485 return;
3486 }
3487
3488 Safefree(PL_numeric_name);
3489 PL_numeric_name = savepv(newnum);
3490
3491 /* Handle the trivial case. Since this is called at process
3492 * initialization, be aware that this bit can't rely on much being
3493 * available. */
3494 if (isNAME_C_OR_POSIX(PL_numeric_name)) {
3495 PL_numeric_standard = TRUE;
3496 PL_numeric_underlying_is_standard = TRUE;
3497 PL_numeric_underlying = TRUE;
3498 sv_setpv(PL_numeric_radix_sv, C_decimal_point);
3499 SvUTF8_off(PL_numeric_radix_sv);
3500 sv_setpv(PL_underlying_radix_sv, C_decimal_point);
3501 SvUTF8_off(PL_underlying_radix_sv);
3502 return;
3503 }
3504
3505 /* We are in the underlying locale until changed at the end of this
3506 * function */
3507 PL_numeric_underlying = TRUE;
3508
3509 /* Passing a non-NULL causes the function call just below to
3510 automatically set the UTF-8 flag on PL_underlying_radix_sv */
3511 utf8ness_t dummy;
3512
3513 /* Find and save this locale's radix character. */
3514 langinfo_sv_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
3515 PL_underlying_radix_sv, &dummy);
3516 DEBUG_L(PerlIO_printf(Perl_debug_log,
3517 "Locale radix is '%s', ?UTF-8=%d\n",
3518 SvPVX(PL_underlying_radix_sv),
3519 cBOOL(SvUTF8(PL_underlying_radix_sv))));
3520
3521 /* This locale is indistinguishable from C (for numeric purposes) if both
3522 * the radix character and the thousands separator are the same as C's.
3523 * Start with the radix. */
3524 PL_numeric_underlying_is_standard = strEQ(C_decimal_point,
3525 SvPVX(PL_underlying_radix_sv));
3526
3527 # ifndef TS_W32_BROKEN_LOCALECONV
3528
3529 /* If the radix isn't the same as C's, we know it is distinguishable from
3530 * C; otherwise check the thousands separator too. Only if both are the
3531 * same as C's is the locale indistinguishable from C.
3532 *
3533 * But on earlier Windows versions, there is a potential race. This code
3534 * knows that localeconv() (elsewhere in this file) will be used to extract
3535 * the needed value, and localeconv() was buggy for quite a while, and that
3536 * code in this file hence uses a workaround. And that workaround may have
3537 * an (unlikely) race. Gathering the radix uses a different workaround on
3538 * Windows that doesn't involve a race. It might be possible to do the
3539 * same for this (patches welcome).
3540 *
3541 * Until then khw doesn't think it's worth even the small risk of a race to
3542 * get this value, which doesn't appear to be used in any of the Microsoft
3543 * library routines anyway. */
3544
3545 if (PL_numeric_underlying_is_standard) {
3546 PL_numeric_underlying_is_standard = strEQ(C_thousands_sep,
3547 langinfo_c(THOUSEP,
3548 LC_NUMERIC,
3549 PL_numeric_name,
3550 NULL));
3551 }
3552
3553 # endif
3554
3555 PL_numeric_standard = PL_numeric_underlying_is_standard;
3556
3557 /* Keep LC_NUMERIC so that it has the C locale radix and thousands
3558 * separator. This is for XS modules, so they don't have to worry about
3559 * the radix being a non-dot. (Core operations that need the underlying
3560 * locale change to it temporarily). */
3561 if (! PL_numeric_standard) {
3562 set_numeric_standard(__FILE__, __LINE__);
3563 }
3564 }
3565
3566 # endif
3567
3568 void
Perl_set_numeric_standard(pTHX_ const char * const file,const line_t line)3569 Perl_set_numeric_standard(pTHX_ const char * const file, const line_t line)
3570 {
3571 PERL_ARGS_ASSERT_SET_NUMERIC_STANDARD;
3572 PERL_UNUSED_ARG(file); /* Some Configurations ignore these */
3573 PERL_UNUSED_ARG(line);
3574
3575 # ifdef USE_LOCALE_NUMERIC
3576
3577 /* Unconditionally toggle the LC_NUMERIC locale to the C locale
3578 *
3579 * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
3580 * instead of calling this directly. The macro avoids calling this routine
3581 * if toggling isn't necessary according to our records (which could be
3582 * wrong if some XS code has changed the locale behind our back) */
3583
3584 DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to"
3585 " standard C; called from %s: %"
3586 LINE_Tf "\n", file, line));
3587
3588 void_setlocale_c_with_caller(LC_NUMERIC, "C", file, line);
3589 PL_numeric_standard = TRUE;
3590 sv_setpv(PL_numeric_radix_sv, C_decimal_point);
3591 SvUTF8_off(PL_numeric_radix_sv);
3592
3593 PL_numeric_underlying = PL_numeric_underlying_is_standard;
3594
3595 # endif /* USE_LOCALE_NUMERIC */
3596
3597 }
3598
3599 void
Perl_set_numeric_underlying(pTHX_ const char * const file,const line_t line)3600 Perl_set_numeric_underlying(pTHX_ const char * const file, const line_t line)
3601 {
3602 PERL_ARGS_ASSERT_SET_NUMERIC_UNDERLYING;
3603 PERL_UNUSED_ARG(file); /* Some Configurations ignore these */
3604 PERL_UNUSED_ARG(line);
3605
3606 # ifdef USE_LOCALE_NUMERIC
3607
3608 /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
3609 * default.
3610 *
3611 * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
3612 * instead of calling this directly. The macro avoids calling this routine
3613 * if toggling isn't necessary according to our records (which could be
3614 * wrong if some XS code has changed the locale behind our back) */
3615
3616 DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s;"
3617 " called from %s: %" LINE_Tf "\n",
3618 PL_numeric_name, file, line));
3619 /* Maybe not in init? assert(PL_locale_mutex_depth > 0);*/
3620
3621 void_setlocale_c_with_caller(LC_NUMERIC, PL_numeric_name, file, line);
3622 PL_numeric_underlying = TRUE;
3623 sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
3624
3625 PL_numeric_standard = PL_numeric_underlying_is_standard;
3626
3627 # endif /* USE_LOCALE_NUMERIC */
3628
3629 }
3630
3631 # ifdef USE_LOCALE_CTYPE
3632
3633 STATIC void
S_new_ctype(pTHX_ const char * newctype,bool force)3634 S_new_ctype(pTHX_ const char *newctype, bool force)
3635 {
3636 PERL_ARGS_ASSERT_NEW_CTYPE;
3637 PERL_UNUSED_ARG(force);
3638
3639 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
3640 * core Perl this and that 'newctype' is the name of the new locale.
3641 *
3642 * This function sets up the folding arrays for all 256 bytes, assuming
3643 * that tofold() is tolc() since fold case is not a concept in POSIX,
3644 */
3645
3646 DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n",
3647 newctype));
3648
3649 /* No change means no-op */
3650 if (strEQ(PL_ctype_name, newctype)) {
3651 return;
3652 }
3653
3654 /* We will replace any bad locale warning with
3655 * 1) nothing if the new one is ok; or
3656 * 2) a new warning for the bad new locale */
3657 if (PL_warn_locale) {
3658 SvREFCNT_dec_NN(PL_warn_locale);
3659 PL_warn_locale = NULL;
3660 }
3661
3662 /* Clear cache */
3663 Safefree(PL_ctype_name);
3664 PL_ctype_name = "";
3665
3666 PL_in_utf8_turkic_locale = FALSE;
3667
3668 /* For the C locale, just use the standard folds, and we know there are no
3669 * glitches possible, so return early. Since this is called at process
3670 * initialization, be aware that this bit can't rely on much being
3671 * available. */
3672 if (isNAME_C_OR_POSIX(newctype)) {
3673 Copy(PL_fold, PL_fold_locale, 256, U8);
3674 PL_ctype_name = savepv(newctype);
3675 PL_in_utf8_CTYPE_locale = FALSE;
3676 return;
3677 }
3678
3679 /* The cache being cleared signals the called function to compute a new
3680 * value */
3681 PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
3682
3683 PL_ctype_name = savepv(newctype);
3684 bool maybe_utf8_turkic = FALSE;
3685
3686 /* Don't check for problems if we are suppressing the warnings */
3687 bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
3688
3689 if (PL_in_utf8_CTYPE_locale) {
3690
3691 /* A UTF-8 locale gets standard rules. But note that code still has to
3692 * handle this specially because of the three problematic code points
3693 * */
3694 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
3695
3696 /* UTF-8 locales can have special handling for 'I' and 'i' if they are
3697 * Turkic. Make sure these two are the only anomalies. (We don't
3698 * require towupper and towlower because they aren't in C89.) */
3699
3700 # if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
3701
3702 if (towupper('i') == 0x130 && towlower('I') == 0x131)
3703
3704 # else
3705
3706 if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
3707
3708 # endif
3709
3710 {
3711 /* This is how we determine it really is Turkic */
3712 check_for_problems = TRUE;
3713 maybe_utf8_turkic = TRUE;
3714 }
3715 }
3716 else { /* Not a canned locale we know the values for. Compute them */
3717
3718 # ifdef DEBUGGING
3719
3720 bool has_non_ascii_fold = FALSE;
3721 bool found_unexpected = FALSE;
3722
3723 /* Under -DLv, see if there are any folds outside the ASCII range.
3724 * This factoid is used below */
3725 if (DEBUG_Lv_TEST) {
3726 for (unsigned i = 128; i < 256; i++) {
3727 int j = LATIN1_TO_NATIVE(i);
3728 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
3729 has_non_ascii_fold = TRUE;
3730 break;
3731 }
3732 }
3733 }
3734
3735 # endif
3736
3737 for (unsigned i = 0; i < 256; i++) {
3738 if (isU8_UPPER_LC(i))
3739 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
3740 else if (isU8_LOWER_LC(i))
3741 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
3742 else
3743 PL_fold_locale[i] = (U8) i;
3744
3745 # ifdef DEBUGGING
3746
3747 /* Most locales these days are supersets of ASCII. When debugging,
3748 * it is helpful to know what the exceptions to that are in this
3749 * locale */
3750 if (DEBUG_L_TEST) {
3751 bool unexpected = FALSE;
3752
3753 if (isUPPER_L1(i)) {
3754 if (isUPPER_A(i)) {
3755 if (PL_fold_locale[i] != toLOWER_A(i)) {
3756 unexpected = TRUE;
3757 }
3758 }
3759 else if (has_non_ascii_fold) {
3760 if (PL_fold_locale[i] != toLOWER_L1(i)) {
3761 unexpected = TRUE;
3762 }
3763 }
3764 else if (PL_fold_locale[i] != i) {
3765 unexpected = TRUE;
3766 }
3767 }
3768 else if ( isLOWER_L1(i)
3769 && i != LATIN_SMALL_LETTER_SHARP_S
3770 && i != MICRO_SIGN)
3771 {
3772 if (isLOWER_A(i)) {
3773 if (PL_fold_locale[i] != toUPPER_A(i)) {
3774 unexpected = TRUE;
3775 }
3776 }
3777 else if (has_non_ascii_fold) {
3778 if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
3779 unexpected = TRUE;
3780 }
3781 }
3782 else if (PL_fold_locale[i] != i) {
3783 unexpected = TRUE;
3784 }
3785 }
3786 else if (PL_fold_locale[i] != i) {
3787 unexpected = TRUE;
3788 }
3789
3790 if (unexpected) {
3791 found_unexpected = TRUE;
3792 DEBUG_L(PerlIO_printf(Perl_debug_log,
3793 "For %s, fold of %02x is %02x\n",
3794 newctype, i, PL_fold_locale[i]));
3795 }
3796 }
3797 }
3798
3799 if (found_unexpected) {
3800 DEBUG_L(PerlIO_printf(Perl_debug_log,
3801 "All bytes not mentioned above either fold to"
3802 " themselves or are the expected ASCII or"
3803 " Latin1 ones\n"));
3804 }
3805 else {
3806 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3807 "No nonstandard folds were found\n"));
3808 # endif
3809
3810 }
3811 }
3812
3813 # ifdef MB_CUR_MAX
3814
3815 /* We only handle single-byte locales (outside of UTF-8 ones); so if this
3816 * locale requires more than one byte, there are going to be BIG problems.
3817 * */
3818
3819 const int mb_cur_max = MB_CUR_MAX;
3820
3821 if (mb_cur_max > 1 && ! PL_in_utf8_CTYPE_locale
3822
3823 /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
3824 * Just assume that the implementation for them (plus for POSIX) is
3825 * correct and the > 1 value is spurious. (Since these are
3826 * specially handled to never be considered UTF-8 locales, as long
3827 * as this is the only problem, everything should work fine */
3828 && ! isNAME_C_OR_POSIX(newctype))
3829 {
3830 DEBUG_L(PerlIO_printf(Perl_debug_log,
3831 "Unsupported, MB_CUR_MAX=%d\n", mb_cur_max));
3832
3833 if (! IN_LC(LC_CTYPE) || ckWARN_d(WARN_LOCALE)) {
3834 char * msg = Perl_form(aTHX_
3835 "Locale '%s' is unsupported, and may hang"
3836 " or crash the interpreter",
3837 newctype);
3838 if (IN_LC(LC_CTYPE)) {
3839 Perl_warner(aTHX_ packWARN(WARN_LOCALE), "%s", msg);
3840 }
3841 else {
3842 PL_warn_locale = newSV(0);
3843 sv_setpvn(PL_warn_locale, msg, strlen(msg));
3844 }
3845 }
3846 }
3847
3848 # endif
3849
3850 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
3851 check_for_problems));
3852
3853 /* We don't populate the other lists if a UTF-8 locale, but do check that
3854 * everything works as expected, unless checking turned off */
3855 if (check_for_problems) {
3856 /* Assume enough space for every character being bad. 4 spaces each
3857 * for the 94 printable characters that are output like "'x' "; and 5
3858 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
3859 * NUL */
3860 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
3861 unsigned int bad_count = 0; /* Count of bad characters */
3862
3863 for (unsigned i = 0; i < 256; i++) {
3864
3865 /* If checking for locale problems, see if the native ASCII-range
3866 * printables plus \n and \t are in their expected categories in
3867 * the new locale. If not, this could mean big trouble, upending
3868 * Perl's and most programs' assumptions, like having a
3869 * metacharacter with special meaning become a \w. Fortunately,
3870 * it's very rare to find locales that aren't supersets of ASCII
3871 * nowadays. It isn't a problem for most controls to be changed
3872 * into something else; we check only \n and \t, though perhaps \r
3873 * could be an issue as well. */
3874 if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
3875 bool is_bad = FALSE;
3876 char name[4] = { '\0' };
3877
3878 /* Convert the name into a string */
3879 if (isGRAPH_A(i)) {
3880 name[0] = i;
3881 name[1] = '\0';
3882 }
3883 else if (i == '\n') {
3884 my_strlcpy(name, "\\n", sizeof(name));
3885 }
3886 else if (i == '\t') {
3887 my_strlcpy(name, "\\t", sizeof(name));
3888 }
3889 else {
3890 assert(i == ' ');
3891 my_strlcpy(name, "' '", sizeof(name));
3892 }
3893
3894 /* Check each possibe class */
3895 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) !=
3896 cBOOL(isALPHANUMERIC_A(i))))
3897 {
3898 is_bad = TRUE;
3899 DEBUG_L(PerlIO_printf(Perl_debug_log,
3900 "isalnum('%s') unexpectedly is %x\n",
3901 name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
3902 }
3903 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i)))) {
3904 is_bad = TRUE;
3905 DEBUG_L(PerlIO_printf(Perl_debug_log,
3906 "isalpha('%s') unexpectedly is %x\n",
3907 name, cBOOL(isU8_ALPHA_LC(i))));
3908 }
3909 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i)))) {
3910 is_bad = TRUE;
3911 DEBUG_L(PerlIO_printf(Perl_debug_log,
3912 "isdigit('%s') unexpectedly is %x\n",
3913 name, cBOOL(isU8_DIGIT_LC(i))));
3914 }
3915 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i)))) {
3916 is_bad = TRUE;
3917 DEBUG_L(PerlIO_printf(Perl_debug_log,
3918 "isgraph('%s') unexpectedly is %x\n",
3919 name, cBOOL(isU8_GRAPH_LC(i))));
3920 }
3921 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i)))) {
3922 is_bad = TRUE;
3923 DEBUG_L(PerlIO_printf(Perl_debug_log,
3924 "islower('%s') unexpectedly is %x\n",
3925 name, cBOOL(isU8_LOWER_LC(i))));
3926 }
3927 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i)))) {
3928 is_bad = TRUE;
3929 DEBUG_L(PerlIO_printf(Perl_debug_log,
3930 "isprint('%s') unexpectedly is %x\n",
3931 name, cBOOL(isU8_PRINT_LC(i))));
3932 }
3933 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i)))) {
3934 is_bad = TRUE;
3935 DEBUG_L(PerlIO_printf(Perl_debug_log,
3936 "ispunct('%s') unexpectedly is %x\n",
3937 name, cBOOL(isU8_PUNCT_LC(i))));
3938 }
3939 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i)))) {
3940 is_bad = TRUE;
3941 DEBUG_L(PerlIO_printf(Perl_debug_log,
3942 "isspace('%s') unexpectedly is %x\n",
3943 name, cBOOL(isU8_SPACE_LC(i))));
3944 }
3945 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i)))) {
3946 is_bad = TRUE;
3947 DEBUG_L(PerlIO_printf(Perl_debug_log,
3948 "isupper('%s') unexpectedly is %x\n",
3949 name, cBOOL(isU8_UPPER_LC(i))));
3950 }
3951 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) {
3952 is_bad = TRUE;
3953 DEBUG_L(PerlIO_printf(Perl_debug_log,
3954 "isxdigit('%s') unexpectedly is %x\n",
3955 name, cBOOL(isU8_XDIGIT_LC(i))));
3956 }
3957 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
3958 is_bad = TRUE;
3959 DEBUG_L(PerlIO_printf(Perl_debug_log,
3960 "tolower('%s')=0x%x instead of the expected 0x%x\n",
3961 name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
3962 }
3963 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
3964 is_bad = TRUE;
3965 DEBUG_L(PerlIO_printf(Perl_debug_log,
3966 "toupper('%s')=0x%x instead of the expected 0x%x\n",
3967 name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
3968 }
3969 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) {
3970 is_bad = TRUE;
3971 DEBUG_L(PerlIO_printf(Perl_debug_log,
3972 "'\\n' (=%02X) is not a control\n", (int) i));
3973 }
3974
3975 /* Add to the list; Separate multiple entries with a blank */
3976 if (is_bad) {
3977 if (bad_count) {
3978 my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
3979 }
3980 my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
3981 bad_count++;
3982 }
3983 }
3984 }
3985
3986 if (bad_count == 2 && maybe_utf8_turkic) {
3987 bad_count = 0;
3988 *bad_chars_list = '\0';
3989
3990 /* The casts are because otherwise some compilers warn:
3991 gcc.gnu.org/bugzilla/show_bug.cgi?id=99950
3992 gcc.gnu.org/bugzilla/show_bug.cgi?id=94182
3993 */
3994 PL_fold_locale[ (U8) 'I' ] = 'I';
3995 PL_fold_locale[ (U8) 'i' ] = 'i';
3996 PL_in_utf8_turkic_locale = TRUE;
3997 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
3998 }
3999
4000 /* If we found problems and we want them output, do so */
4001 if ( (UNLIKELY(bad_count))
4002 && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
4003 {
4004 /* WARNING. If you change the wording of these; be sure to update
4005 * t/loc_tools.pl correspondingly */
4006
4007 if (PL_warn_locale) {
4008 sv_catpvs(PL_warn_locale, "\n");
4009 }
4010 else {
4011 PL_warn_locale = newSVpvs("");
4012 }
4013
4014 if (PL_in_utf8_CTYPE_locale) {
4015 Perl_sv_catpvf(aTHX_ PL_warn_locale,
4016 "Locale '%s' contains (at least) the following characters"
4017 " which have\nunexpected meanings: %s\nThe Perl program"
4018 " will use the expected meanings",
4019 newctype, bad_chars_list);
4020 }
4021 else {
4022 Perl_sv_catpvf(aTHX_ PL_warn_locale,
4023 "\nThe following characters (and maybe"
4024 " others) may not have the same meaning as"
4025 " the Perl program expects: %s\n",
4026 bad_chars_list
4027 );
4028 }
4029
4030 # if defined(HAS_SOME_LANGINFO) || defined(WIN32)
4031
4032 Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
4033 langinfo_c(CODESET, LC_CTYPE, newctype, NULL));
4034
4035 # endif
4036
4037 Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
4038
4039 /* If we are actually in the scope of the locale or are debugging,
4040 * output the message now. If not in that scope, we save the
4041 * message to be output at the first operation using this locale,
4042 * if that actually happens. Most programs don't use locales, so
4043 * they are immune to bad ones. */
4044 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
4045
4046 /* The '0' below suppresses a bogus gcc compiler warning */
4047 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
4048 0);
4049 if (IN_LC(LC_CTYPE)) {
4050 SvREFCNT_dec_NN(PL_warn_locale);
4051 PL_warn_locale = NULL;
4052 }
4053 }
4054 }
4055 }
4056 }
4057
4058 void
Perl_warn_problematic_locale()4059 Perl_warn_problematic_locale()
4060 {
4061 dTHX;
4062
4063 /* Core-only function that outputs the message in PL_warn_locale,
4064 * and then NULLS it. Should be called only through the macro
4065 * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
4066
4067 if (PL_warn_locale) {
4068 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
4069 SvPVX(PL_warn_locale),
4070 0 /* dummy to avoid compiler warning */ );
4071 SvREFCNT_dec_NN(PL_warn_locale);
4072 PL_warn_locale = NULL;
4073 }
4074 }
4075
4076 # endif /* USE_LOCALE_CTYPE */
4077
4078 STATIC void
S_new_LC_ALL(pTHX_ const char * lc_all,bool force)4079 S_new_LC_ALL(pTHX_ const char *lc_all, bool force)
4080 {
4081 PERL_ARGS_ASSERT_NEW_LC_ALL;
4082
4083 /* new_LC_ALL() updates all the things we care about. Note that this is
4084 * called just after a change, so uses the actual underlying locale just
4085 * set, and not the nominal one (should they differ, as they may in
4086 * LC_NUMERIC). */
4087
4088 const char * individ_locales[LC_ALL_INDEX_] = { NULL };
4089
4090 switch (parse_LC_ALL_string(lc_all,
4091 individ_locales,
4092 override_if_ignored, /* Override any ignored
4093 categories */
4094 true, /* Always fill array */
4095 true, /* Panic if fails, as to get here it
4096 earlier had to have succeeded */
4097 __LINE__))
4098 {
4099 case invalid:
4100 case no_array:
4101 case only_element_0:
4102 locale_panic_("Unexpected return from parse_LC_ALL_string");
4103
4104 case full_array:
4105 break;
4106 }
4107
4108 for_all_individual_category_indexes(i) {
4109 if (update_functions[i]) {
4110 const char * this_locale = individ_locales[i];
4111 update_functions[i](aTHX_ this_locale, force);
4112 }
4113
4114 Safefree(individ_locales[i]);
4115 }
4116 }
4117
4118 # ifdef USE_LOCALE_COLLATE
4119
4120 STATIC void
S_new_collate(pTHX_ const char * newcoll,bool force)4121 S_new_collate(pTHX_ const char *newcoll, bool force)
4122 {
4123 PERL_ARGS_ASSERT_NEW_COLLATE;
4124 PERL_UNUSED_ARG(force);
4125
4126 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
4127 * core Perl this and that 'newcoll' is the name of the new locale.
4128 *
4129 * The design of locale collation is that every locale change is given an
4130 * index 'PL_collation_ix'. The first time a string participates in an
4131 * operation that requires collation while locale collation is active, it
4132 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
4133 * magic includes the collation index, and the transformation of the string
4134 * by strxfrm(), q.v. That transformation is used when doing comparisons,
4135 * instead of the string itself. If a string changes, the magic is
4136 * cleared. The next time the locale changes, the index is incremented,
4137 * and so we know during a comparison that the transformation is not
4138 * necessarily still valid, and so is recomputed. Note that if the locale
4139 * changes enough times, the index could wrap, and it is possible that a
4140 * transformation would improperly be considered valid, leading to an
4141 * unlikely bug. The value is declared to the widest possible type on this
4142 * platform. */
4143
4144 /* Return if the locale isn't changing */
4145 if (strEQ(PL_collation_name, newcoll)) {
4146 return;
4147 }
4148
4149 Safefree(PL_collation_name);
4150 PL_collation_name = savepv(newcoll);
4151 ++PL_collation_ix;
4152
4153 /* Set the new one up if trivial. Since this is called at process
4154 * initialization, be aware that this bit can't rely on much being
4155 * available. */
4156 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
4157 if (PL_collation_standard) {
4158 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4159 "Setting PL_collation name='%s'\n",
4160 PL_collation_name));
4161 PL_collxfrm_base = 0;
4162 PL_collxfrm_mult = 2;
4163 PL_in_utf8_COLLATE_locale = FALSE;
4164 PL_strxfrm_NUL_replacement = '\0';
4165 PL_strxfrm_max_cp = 0;
4166 return;
4167 }
4168
4169 /* Flag that the remainder of the set up is being deferred until first
4170 * need. */
4171 PL_collxfrm_mult = 0;
4172 PL_collxfrm_base = 0;
4173
4174 }
4175
4176 # endif /* USE_LOCALE_COLLATE */
4177
4178 # ifdef WIN32
4179
4180 STATIC wchar_t *
S_Win_byte_string_to_wstring(const UINT code_page,const char * byte_string)4181 S_Win_byte_string_to_wstring(const UINT code_page, const char * byte_string)
4182 {
4183 /* Caller must arrange to free the returned string */
4184
4185 int req_size = MultiByteToWideChar(code_page, 0, byte_string, -1, NULL, 0);
4186 if (! req_size) {
4187 SET_EINVAL;
4188 return NULL;
4189 }
4190
4191 wchar_t *wstring;
4192 Newx(wstring, req_size, wchar_t);
4193
4194 if (! MultiByteToWideChar(code_page, 0, byte_string, -1, wstring, req_size))
4195 {
4196 Safefree(wstring);
4197 SET_EINVAL;
4198 return NULL;
4199 }
4200
4201 return wstring;
4202 }
4203
4204 # define Win_utf8_string_to_wstring(s) \
4205 Win_byte_string_to_wstring(CP_UTF8, (s))
4206
4207 STATIC char *
S_Win_wstring_to_byte_string(const UINT code_page,const wchar_t * wstring)4208 S_Win_wstring_to_byte_string(const UINT code_page, const wchar_t * wstring)
4209 {
4210 /* Caller must arrange to free the returned string */
4211
4212 int req_size =
4213 WideCharToMultiByte(code_page, 0, wstring, -1, NULL, 0, NULL, NULL);
4214
4215 char *byte_string;
4216 Newx(byte_string, req_size, char);
4217
4218 if (! WideCharToMultiByte(code_page, 0, wstring, -1, byte_string,
4219 req_size, NULL, NULL))
4220 {
4221 Safefree(byte_string);
4222 SET_EINVAL;
4223 return NULL;
4224 }
4225
4226 return byte_string;
4227 }
4228
4229 # define Win_wstring_to_utf8_string(ws) \
4230 Win_wstring_to_byte_string(CP_UTF8, (ws))
4231
4232 STATIC const char *
S_wrap_wsetlocale(pTHX_ const int category,const char * locale)4233 S_wrap_wsetlocale(pTHX_ const int category, const char *locale)
4234 {
4235 PERL_ARGS_ASSERT_WRAP_WSETLOCALE;
4236
4237 /* Calls _wsetlocale(), converting the parameters/return to/from
4238 * Perl-expected forms as if plain setlocale() were being called instead.
4239 *
4240 * Caller must arrange for the returned PV to be freed.
4241 */
4242
4243 const wchar_t * wlocale = NULL;
4244
4245 if (locale) {
4246 wlocale = Win_utf8_string_to_wstring(locale);
4247 if (! wlocale) {
4248 return NULL;
4249 }
4250 }
4251
4252 WSETLOCALE_LOCK;
4253 const wchar_t * wresult = _wsetlocale(category, wlocale);
4254
4255 if (! wresult) {
4256 WSETLOCALE_UNLOCK;
4257 Safefree(wlocale);
4258 return NULL;
4259 }
4260
4261 const char * result = Win_wstring_to_utf8_string(wresult);
4262 WSETLOCALE_UNLOCK;
4263
4264 Safefree(wlocale);
4265 return result;
4266 }
4267
4268 STATIC const char *
S_win32_setlocale(pTHX_ int category,const char * locale)4269 S_win32_setlocale(pTHX_ int category, const char* locale)
4270 {
4271 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
4272 * difference between the two unless the input locale is "", which normally
4273 * means on Windows to get the machine default, which is set via the
4274 * computer's "Regional and Language Options" (or its current equivalent).
4275 * In POSIX, it instead means to find the locale from the user's
4276 * environment. This routine changes the Windows behavior to try the POSIX
4277 * behavior first. Further details are in the called function
4278 * find_locale_from_environment().
4279 */
4280
4281 if (locale != NULL && strEQ(locale, "")) {
4282 /* Note this function may change the locale, but that's ok because we
4283 * are about to change it anyway */
4284 locale = find_locale_from_environment(get_category_index(category));
4285 if (locale == NULL) {
4286 SET_EINVAL;
4287 return NULL;
4288 }
4289 }
4290
4291 const char * result = wrap_wsetlocale(category, locale);
4292 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
4293 setlocale_debug_string_r(category, locale, result)));
4294
4295 if (! result) {
4296 SET_EINVAL;
4297 return NULL;
4298 }
4299
4300 save_to_buffer(result, &PL_setlocale_buf, &PL_setlocale_bufsize);
4301
4302 # ifndef USE_PL_CUR_LC_ALL
4303
4304 Safefree(result);
4305
4306 # else
4307
4308 /* Here, we need to keep track of LC_ALL, so store the new value. but if
4309 * the input locale is NULL, we were just querying, so the original value
4310 * hasn't changed */
4311 if (locale == NULL) {
4312 Safefree(result);
4313 }
4314 else {
4315
4316 /* If we set LC_ALL directly above, we already know its new value; but
4317 * if we changed just an individual category, find the new LC_ALL */
4318 if (category != LC_ALL) {
4319 Safefree(result);
4320 result = wrap_wsetlocale(LC_ALL, NULL);
4321 }
4322
4323 Safefree(PL_cur_LC_ALL);
4324 PL_cur_LC_ALL = result;
4325 }
4326
4327 DEBUG_L(PerlIO_printf(Perl_debug_log, "new PL_cur_LC_ALL=%s\n",
4328 PL_cur_LC_ALL));
4329 # endif
4330
4331 return PL_setlocale_buf;
4332 }
4333
4334 # endif
4335
4336 STATIC const char *
S_native_querylocale_i(pTHX_ const locale_category_index cat_index)4337 S_native_querylocale_i(pTHX_ const locale_category_index cat_index)
4338 {
4339 /* Determine the current locale and return it in the form the platform's
4340 * native locale handling understands. This is different only from our
4341 * internal form for the LC_ALL category, as platforms differ in how they
4342 * represent that.
4343 *
4344 * This is only called from Perl_setlocale(). As such it returns in
4345 * PL_setlocale_buf */
4346
4347 # ifdef USE_LOCALE_NUMERIC
4348
4349 /* We have the LC_NUMERIC name saved, because we are normally switched into
4350 * the C locale (or equivalent) for it. */
4351 if (cat_index == LC_NUMERIC_INDEX_) {
4352
4353 /* We don't have to copy this return value, as it is a per-thread
4354 * variable, and won't change until a future setlocale */
4355 return PL_numeric_name;
4356 }
4357
4358 # endif
4359 # ifdef LC_ALL
4360
4361 if (cat_index != LC_ALL_INDEX_)
4362
4363 # endif
4364
4365 {
4366 /* Here, not LC_ALL, and not LC_NUMERIC: the actual and native values
4367 * match */
4368
4369 # ifdef setlocale_i /* Can shortcut if this is defined */
4370
4371 return setlocale_i(cat_index, NULL);
4372
4373 # else
4374
4375 return save_to_buffer(querylocale_i(cat_index),
4376 &PL_setlocale_buf, &PL_setlocale_bufsize);
4377 # endif
4378
4379 }
4380
4381 /* Below, querying LC_ALL */
4382
4383 # ifdef LC_ALL
4384 # ifdef USE_PL_CURLOCALES
4385 # define LC_ALL_ARG PL_curlocales
4386 # else
4387 # define LC_ALL_ARG NULL /* Causes calculate_LC_ALL_string() to find the
4388 locale using a querylocale function */
4389 # endif
4390
4391 return calculate_LC_ALL_string(LC_ALL_ARG, EXTERNAL_FORMAT_FOR_QUERY,
4392 WANT_PL_setlocale_buf,
4393 __LINE__);
4394 # undef LC_ALL_ARG
4395 # endif /* has LC_ALL */
4396
4397 }
4398
4399 #endif /* USE_LOCALE */
4400
4401 /*
4402 =for apidoc Perl_setlocale
4403
4404 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
4405 taking the same parameters, and returning the same information, except that it
4406 returns the correct underlying C<LC_NUMERIC> locale. Regular C<setlocale> will
4407 instead return C<C> if the underlying locale has a non-dot decimal point
4408 character, or a non-empty thousands separator for displaying floating point
4409 numbers. This is because perl keeps that locale category such that it has a
4410 dot and empty separator, changing the locale briefly during the operations
4411 where the underlying one is required. C<Perl_setlocale> knows about this, and
4412 compensates; regular C<setlocale> doesn't.
4413
4414 Another reason it isn't completely a drop-in replacement is that it is
4415 declared to return S<C<const char *>>, whereas the system setlocale omits the
4416 C<const> (presumably because its API was specified long ago, and can't be
4417 updated; it is illegal to change the information C<setlocale> returns; doing
4418 so leads to segfaults.)
4419
4420 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
4421 C<setlocale> can be completely ineffective on some platforms under some
4422 configurations.
4423
4424 Changing the locale is not a good idea when more than one thread is running,
4425 except on systems where the predefined variable C<${^SAFE_LOCALES}> is
4426 non-zero. This is because on such systems the locale is global to the whole
4427 process and not local to just the thread calling the function. So changing it
4428 in one thread instantaneously changes it in all. On some such systems, the
4429 system C<setlocale()> is ineffective, returning the wrong information, and
4430 failing to actually change the locale. z/OS refuses to try to change the
4431 locale once a second thread is created. C<Perl_setlocale>, should give you
4432 accurate results of what actually happened on these problematic platforms,
4433 returning NULL if the system forbade the locale change.
4434
4435 The return points to a per-thread static buffer, which is overwritten the next
4436 time C<Perl_setlocale> is called from the same thread.
4437
4438 =cut
4439
4440 */
4441
4442 const char *
Perl_setlocale(const int category,const char * locale)4443 Perl_setlocale(const int category, const char * locale)
4444 {
4445 /* This wraps POSIX::setlocale() */
4446
4447 #ifndef USE_LOCALE
4448
4449 PERL_UNUSED_ARG(category);
4450 PERL_UNUSED_ARG(locale);
4451
4452 return "C";
4453
4454 #else
4455
4456 dTHX;
4457
4458 DEBUG_L(PerlIO_printf(Perl_debug_log,
4459 "Entering Perl_setlocale(%d, \"%s\")\n",
4460 category, locale));
4461
4462 bool valid_category;
4463 locale_category_index cat_index = get_category_index_helper(category,
4464 &valid_category,
4465 __LINE__);
4466 if (! valid_category) {
4467 if (ckWARN(WARN_LOCALE)) {
4468 const char * conditional_warn_text;
4469 if (locale == NULL) {
4470 conditional_warn_text = "";
4471 locale = "";
4472 }
4473 else {
4474 conditional_warn_text = "; can't set it to ";
4475 }
4476
4477 /* diag_listed_as: Unknown locale category %d; can't set it to %s */
4478 Perl_warner(aTHX_
4479 packWARN(WARN_LOCALE),
4480 "Unknown locale category %d%s%s",
4481 category, conditional_warn_text, locale);
4482 }
4483
4484 SET_EINVAL;
4485 return NULL;
4486 }
4487
4488 # ifdef setlocale_i
4489
4490 /* setlocale_i() gets defined only on Configurations that use setlocale()
4491 * in a simple manner that adequately handles all cases. If this category
4492 * doesn't have any perl complications, just do that. */
4493 if (! update_functions[cat_index]) {
4494 return setlocale_i(cat_index, locale);
4495 }
4496
4497 # endif
4498
4499 /* Get current locale */
4500 const char * current_locale = native_querylocale_i(cat_index);
4501
4502 /* A NULL locale means only query what the current one is. */
4503 if (locale == NULL) {
4504 return current_locale;
4505 }
4506
4507 if (strEQ(current_locale, locale)) {
4508 DEBUG_L(PerlIO_printf(Perl_debug_log,
4509 "Already in requested locale: no action taken\n"));
4510 return current_locale;
4511 }
4512
4513 /* Here, an actual change is being requested. Do it */
4514 if (! bool_setlocale_i(cat_index, locale)) {
4515 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
4516 setlocale_debug_string_i(cat_index, locale, "NULL")));
4517 return NULL;
4518 }
4519
4520 /* At this point, the locale has been changed based on the requested value,
4521 * and the querylocale_i() will return the actual new value that the system
4522 * has for the category. That may not be the same as the input, as libc
4523 * may have returned a synonymous locale name instead of the input one; or,
4524 * if there are locale categories that we are compiled to ignore, any
4525 * attempt to change them away from "C" is overruled */
4526 current_locale = querylocale_i(cat_index);
4527
4528 /* But certain categories need further work. For example we may need to
4529 * calculate new folding or collation rules. And for LC_NUMERIC, we have
4530 * to switch into a locale that has a dot radix. */
4531 if (update_functions[cat_index]) {
4532 update_functions[cat_index](aTHX_ current_locale,
4533 /* No need to force recalculation, as
4534 * aren't coming from a situation
4535 * where Perl hasn't been controlling
4536 * the locale, so has accurate
4537 * records. */
4538 false);
4539 }
4540
4541 /* Make sure the result is in a stable buffer for the caller's use, and is
4542 * in the expected format */
4543 current_locale = native_querylocale_i(cat_index);
4544
4545 DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", current_locale));
4546
4547 return current_locale;
4548
4549 #endif
4550
4551 }
4552
4553 #ifdef USE_LOCALE
4554 # ifdef DEBUGGING
4555
4556 STATIC char *
S_my_setlocale_debug_string_i(pTHX_ const locale_category_index cat_index,const char * locale,const char * retval,const line_t line)4557 S_my_setlocale_debug_string_i(pTHX_
4558 const locale_category_index cat_index,
4559 const char* locale, /* Optional locale name */
4560
4561 /* return value from setlocale() when attempting
4562 * to set 'category' to 'locale' */
4563 const char* retval,
4564
4565 const line_t line)
4566 {
4567 /* Returns a pointer to a NUL-terminated string in static storage with
4568 * added text about the info passed in. This is not thread safe and will
4569 * be overwritten by the next call, so this should be used just to
4570 * formulate a string to immediately print or savepv() on. */
4571
4572 const char * locale_quote;
4573 const char * retval_quote;
4574
4575 if (locale == NULL) {
4576 locale_quote = "";
4577 locale = "NULL";
4578 }
4579 else {
4580 locale_quote = "\"";
4581 }
4582
4583 if (retval == NULL) {
4584 retval_quote = "";
4585 retval = "NULL";
4586 }
4587 else {
4588 retval_quote = "\"";
4589 }
4590
4591 # ifdef MULTIPLICITY
4592 # define THREAD_FORMAT "%p:"
4593 # define THREAD_ARGUMENT aTHX_
4594 # else
4595 # define THREAD_FORMAT
4596 # define THREAD_ARGUMENT
4597 # endif
4598
4599 return Perl_form(aTHX_
4600 "%s:%" LINE_Tf ": " THREAD_FORMAT
4601 " setlocale(%s[%d], %s%s%s) returned %s%s%s\n",
4602
4603 __FILE__, line, THREAD_ARGUMENT
4604 category_names[cat_index], categories[cat_index],
4605 locale_quote, locale, locale_quote,
4606 retval_quote, retval, retval_quote);
4607 }
4608
4609 # endif
4610
4611 /* If this implementation hasn't defined these macros, they aren't needed */
4612 # ifndef TOGGLE_LOCK
4613 # define TOGGLE_LOCK(i)
4614 # define TOGGLE_UNLOCK(i)
4615 # endif
4616
4617 STATIC const char *
S_toggle_locale_i(pTHX_ const locale_category_index cat_index,const char * new_locale,const line_t caller_line)4618 S_toggle_locale_i(pTHX_ const locale_category_index cat_index,
4619 const char * new_locale,
4620 const line_t caller_line)
4621 {
4622 PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
4623
4624 /* Changes the locale for the category specified by 'index' to 'new_locale,
4625 * if they aren't already the same. EVERY CALL to this function MUST HAVE
4626 * a corresponding call to restore_toggled_locale_i()
4627 *
4628 * Returns a copy of the name of the original locale for 'cat_index'
4629 * so can be switched back to with the companion function
4630 * restore_toggled_locale_i(), (NULL if no restoral is necessary.) */
4631
4632 /* Find the original locale of the category we may need to change, so that
4633 * it can be restored to later */
4634 const char * locale_to_restore_to = querylocale_i(cat_index);
4635
4636 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4637 "Entering toggle_locale_i: index=%d(%s)," \
4638 " wanted=%s, actual=%s; called from %" LINE_Tf \
4639 "\n", cat_index, category_names[cat_index],
4640 new_locale, locale_to_restore_to ? locale_to_restore_to : "(null)",
4641 caller_line));
4642
4643 if (! locale_to_restore_to) {
4644 locale_panic_via_(Perl_form(aTHX_
4645 "Could not find current %s locale",
4646 category_names[cat_index]),
4647 __FILE__, caller_line);
4648 }
4649
4650 /* Begin a critical section on platforms that need it. We do this even if
4651 * we don't have to change here, so as to prevent other instances from
4652 * changing the locale out from under us. */
4653 TOGGLE_LOCK(cat_index);
4654
4655 /* If the locales are the same, there's nothing to do */
4656 if (strEQ(locale_to_restore_to, new_locale)) {
4657 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
4658 category_names[cat_index],
4659 new_locale));
4660 return NULL;
4661 }
4662
4663 /* Finally, change the locale to the new one */
4664 void_setlocale_i_with_caller(cat_index, new_locale, __FILE__, caller_line);
4665
4666 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4667 "%s locale switched to %s\n",
4668 category_names[cat_index], new_locale));
4669
4670 return locale_to_restore_to;
4671
4672 # ifndef DEBUGGING
4673 PERL_UNUSED_ARG(caller_line);
4674 # endif
4675
4676 }
4677
4678 STATIC void
S_restore_toggled_locale_i(pTHX_ const locale_category_index cat_index,const char * restore_locale,const line_t caller_line)4679 S_restore_toggled_locale_i(pTHX_ const locale_category_index cat_index,
4680 const char * restore_locale,
4681 const line_t caller_line)
4682 {
4683 /* Restores the locale for LC_category corresponding to cat_index to
4684 * 'restore_locale' (which is a copy that will be freed by this function),
4685 * or do nothing if the latter parameter is NULL */
4686
4687 PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
4688
4689 if (restore_locale == NULL) {
4690 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4691 "restore_toggled_locale_i: No need to" \
4692 " restore %s; called from %" LINE_Tf "\n", \
4693 category_names[cat_index], caller_line));
4694 TOGGLE_UNLOCK(cat_index);
4695 return;
4696 }
4697
4698 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4699 "restore_toggled_locale_i: restoring locale for" \
4700 " %s to %s; called from %" LINE_Tf "\n", \
4701 category_names[cat_index], restore_locale,
4702 caller_line));
4703
4704 void_setlocale_i_with_caller(cat_index, restore_locale,
4705 __FILE__, caller_line);
4706 TOGGLE_UNLOCK(cat_index);
4707
4708 # ifndef DEBUGGING
4709 PERL_UNUSED_ARG(caller_line);
4710 # endif
4711
4712 }
4713
4714 #endif
4715 #if defined(USE_LOCALE) || defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)
4716
4717 STATIC utf8ness_t
S_get_locale_string_utf8ness_i(pTHX_ const char * string,const locale_utf8ness_t known_utf8,const char * locale,const locale_category_index cat_index)4718 S_get_locale_string_utf8ness_i(pTHX_ const char * string,
4719 const locale_utf8ness_t known_utf8,
4720 const char * locale,
4721 const locale_category_index cat_index)
4722 {
4723 PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
4724
4725 # ifndef USE_LOCALE
4726
4727 return UTF8NESS_NO;
4728 PERL_UNUSED_ARG(string);
4729 PERL_UNUSED_ARG(known_utf8);
4730 PERL_UNUSED_ARG(locale);
4731 PERL_UNUSED_ARG(cat_index);
4732
4733 # else
4734
4735 /* Return to indicate if 'string' in the locale given by the input
4736 * arguments should be considered UTF-8 or not.
4737 *
4738 * If the input 'locale' is not NULL, use that for the locale; otherwise
4739 * use the current locale for the category specified by 'cat_index'.
4740 */
4741
4742 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4743 "Entering get_locale_string_utf8ness_i; locale=%s,"
4744 " index=%u(%s), string=%s, known_utf8=%d\n",
4745 locale, cat_index, category_names[cat_index],
4746 ((string)
4747 ? _byte_dump_string((U8 *) string,
4748 strlen(string),
4749 0)
4750 : "nil"),
4751 known_utf8));
4752 if (string == NULL) {
4753 return UTF8NESS_IMMATERIAL;
4754 }
4755
4756 if (IN_BYTES) { /* respect 'use bytes' */
4757 return UTF8NESS_NO;
4758 }
4759
4760 Size_t len = strlen(string);
4761
4762 /* UTF8ness is immaterial if the representation doesn't vary */
4763 const U8 * first_variant = NULL;
4764 if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
4765 return UTF8NESS_IMMATERIAL;
4766 }
4767
4768 /* Can't be UTF-8 if invalid */
4769 if (! is_strict_utf8_string((U8 *) first_variant,
4770 len - ((char *) first_variant - string)))
4771 {
4772 return UTF8NESS_NO;
4773 }
4774
4775 /* Here and below, we know the string is legal UTF-8, containing at least
4776 * one character requiring a sequence of two or more bytes. It is quite
4777 * likely to be UTF-8. But it pays to be paranoid and do further checking.
4778 *
4779 * If we already know the UTF-8ness of the locale, then we immediately know
4780 * what the string is */
4781 if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
4782 return (known_utf8 == LOCALE_IS_UTF8) ? UTF8NESS_YES : UTF8NESS_NO;
4783 }
4784
4785 if (locale == NULL) {
4786 locale = querylocale_i(cat_index);
4787 }
4788
4789 /* If the locale is UTF-8, the string is UTF-8; otherwise it was
4790 * coincidental that the string is legal UTF-8
4791 *
4792 * However, if the perl is compiled to not pay attention to the category
4793 * being passed in, you might think that that locale is essentially always
4794 * the C locale, so it would make sense to say it isn't UTF-8. But to get
4795 * here, the string has to contain characters unknown in the C locale. And
4796 * in fact, Windows boxes are compiled without LC_MESSAGES, as their
4797 * message catalog isn't really a part of the locale system. But those
4798 * messages really could be UTF-8, and given that the odds are rather small
4799 * of something not being UTF-8 but being syntactically valid UTF-8, khw
4800 * has decided to call such strings as UTF-8. */
4801 return (is_locale_utf8(locale)) ? UTF8NESS_YES : UTF8NESS_NO;
4802
4803 # endif
4804
4805 }
4806
4807 STATIC bool
S_is_locale_utf8(pTHX_ const char * locale)4808 S_is_locale_utf8(pTHX_ const char * locale)
4809 {
4810 PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
4811
4812 /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. */
4813
4814 # if ! defined(USE_LOCALE) \
4815 || ! defined(USE_LOCALE_CTYPE) \
4816 || defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
4817
4818 PERL_UNUSED_ARG(locale);
4819
4820 return FALSE;
4821
4822 /* Definitively, can't be UTF-8 */
4823 # define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4824 # else
4825
4826 /* If the input happens to be the same locale as we are currently setup
4827 * for, the answer has already been cached. */
4828 if (strEQ(locale, PL_ctype_name)) {
4829 return PL_in_utf8_CTYPE_locale;
4830 }
4831
4832 if (isNAME_C_OR_POSIX(locale)) {
4833 return false;
4834 }
4835
4836 # if ! defined(HAS_SOME_LANGINFO) && ! defined(WIN32)
4837
4838 /* On non-Windows without nl_langinfo(), we have to do some digging to get
4839 * the answer. First, toggle to the desired locale so can query its state
4840 * */
4841 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
4842
4843 # define TEARDOWN_FOR_IS_LOCALE_UTF8 \
4844 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
4845
4846 # ifdef MB_CUR_MAX
4847
4848 /* If there are fewer bytes available in this locale than are required
4849 * to represent the largest legal UTF-8 code point, this isn't a UTF-8
4850 * locale. */
4851 const int mb_cur_max = MB_CUR_MAX;
4852 if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
4853 TEARDOWN_FOR_IS_LOCALE_UTF8;
4854 return false;
4855 }
4856
4857 # endif
4858 # if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4859
4860 /* With these functions, we can definitively determine a locale's
4861 * UTF-8ness */
4862 # define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4863
4864 /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT CHARACTER
4865 * as that Unicode code point, this has to be a UTF-8 locale; otherwise it
4866 * can't be */
4867 wchar_t wc = 0;
4868 (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4869 int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4870 STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4871 TEARDOWN_FOR_IS_LOCALE_UTF8;
4872 return ( mbtowc_ret == STRLENs(REPLACEMENT_CHARACTER_UTF8)
4873 && wc == UNICODE_REPLACEMENT);
4874
4875 # else
4876
4877 /* If the above two C99 functions aren't working, you could try some
4878 * different methods. It seems likely that the obvious choices,
4879 * wctomb() and wcrtomb(), wouldn't be working either. But you could
4880 * choose one of the dozen-ish Unicode titlecase triples and verify
4881 * that towupper/towlower work as expected.
4882 *
4883 * But, our emulation of nl_langinfo() works quite well, so avoid the
4884 * extra code until forced to by some weird non-conforming platform. */
4885 # define USE_LANGINFO_FOR_UTF8NESS
4886 # undef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4887 # endif
4888 # else
4889
4890 /* On Windows or on platforms with nl_langinfo(), there is a direct way to
4891 * get the locale's codeset, which will be some form of 'UTF-8' for a
4892 * UTF-8 locale. langinfo_c() handles this, and we will call that
4893 * below */
4894 # define HAS_DEFINITIVE_UTF8NESS_DETERMINATION
4895 # define USE_LANGINFO_FOR_UTF8NESS
4896 # define TEARDOWN_FOR_IS_LOCALE_UTF8
4897 # endif /* USE_LANGINFO_FOR_UTF8NESS */
4898
4899 /* If the above compiled into code, it found the locale's UTF-8ness,
4900 * nothing more to do; if it didn't get compiled,
4901 * USE_LANGINFO_FOR_UTF8NESS is defined. There are two possible reasons:
4902 * 1) it is the preferred method because it knows directly for sure
4903 * what the codeset is because the platform has libc functions that
4904 * return this; or
4905 * 2) the functions the above code section would compile to use don't
4906 * exist or are unreliable on this platform; we are less sure of the
4907 * langinfo_c() result, though it is very unlikely to be wrong
4908 * about if it is UTF-8 or not */
4909 # ifdef USE_LANGINFO_FOR_UTF8NESS
4910
4911 const char * codeset = langinfo_c(CODESET, LC_CTYPE, locale, NULL);
4912 bool retval = is_codeset_name_UTF8(codeset);
4913
4914 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4915 "found codeset=%s, is_utf8=%d\n", codeset, retval));
4916 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "is_locale_utf8(%s) returning %d\n",
4917 locale, retval));
4918 TEARDOWN_FOR_IS_LOCALE_UTF8;
4919 return retval;
4920
4921 # endif
4922 # endif /* End of the #else clause, for the non-trivial case */
4923
4924 }
4925
4926 #endif
4927
4928 #ifdef USE_LOCALE
4929 # ifdef USE_LOCALE_CTYPE
4930
4931 STATIC bool
S_is_codeset_name_UTF8(const char * name)4932 S_is_codeset_name_UTF8(const char * name)
4933 {
4934 /* Return a boolean as to if the passed-in name indicates it is a UTF-8
4935 * code set. Several variants are possible */
4936 const Size_t len = strlen(name);
4937
4938 PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
4939
4940 # ifdef WIN32
4941
4942 /* https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers */
4943 if (memENDs(name, len, "65001")) {
4944 return TRUE;
4945 }
4946
4947 # endif
4948 /* 'UTF8' or 'UTF-8' */
4949 return ( inRANGE(len, 4, 5)
4950 && name[len-1] == '8'
4951 && ( memBEGINs(name, len, "UTF")
4952 || memBEGINs(name, len, "utf"))
4953 && (len == 4 || name[3] == '-'));
4954 }
4955
4956 # endif
4957 # ifdef WIN32
4958
4959 bool
Perl_get_win32_message_utf8ness(pTHX_ const char * string)4960 Perl_get_win32_message_utf8ness(pTHX_ const char * string)
4961 {
4962 /* This is because Windows doesn't have LC_MESSAGES. */
4963
4964 # ifdef USE_LOCALE_CTYPE
4965
4966 /* We don't know the locale utf8ness here, and not even the locale itself.
4967 * Since Windows uses a different mechanism to specify message language
4968 * output than the locale system, it is going to be problematic deciding
4969 * if we are to store it as UTF-8 or not. By specifying LOCALE_IS_UTF8, we
4970 * are telling the called function to return true iff the string has
4971 * non-ASCII characters in it that are all syntactically UTF-8. We are
4972 * thus relying on the fact that a string that is syntactically valid UTF-8
4973 * is likely to be UTF-8. Should this ever cause problems, this function
4974 * could be replaced by something more Windows-specific */
4975 return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8,
4976 NULL, LC_CTYPE_INDEX_);
4977 # else
4978
4979 PERL_UNUSED_ARG(string);
4980 return false;
4981
4982 # endif
4983
4984 }
4985
4986 # endif
4987
4988 STATIC void
S_set_save_buffer_min_size(pTHX_ Size_t min_len,char ** buf,Size_t * buf_cursize)4989 S_set_save_buffer_min_size(pTHX_ Size_t min_len,
4990 char **buf,
4991 Size_t * buf_cursize)
4992 {
4993 /* Make sure the buffer pointed to by *buf is at least as large 'min_len';
4994 * *buf_cursize is the size of 'buf' upon entry; it will be updated to the
4995 * new size on exit. 'buf_cursize' being NULL is to be used when this is a
4996 * single use buffer, which will shortly be freed by the caller. */
4997
4998 if (buf_cursize == NULL) {
4999 Newx(*buf, min_len, char);
5000 }
5001 else if (*buf_cursize == 0) {
5002 Newx(*buf, min_len, char);
5003 *buf_cursize = min_len;
5004 }
5005 else if (min_len > *buf_cursize) {
5006 Renew(*buf, min_len, char);
5007 *buf_cursize = min_len;
5008 }
5009 }
5010
5011 STATIC const char *
S_save_to_buffer(pTHX_ const char * string,char ** buf,Size_t * buf_size)5012 S_save_to_buffer(pTHX_ const char * string, char **buf, Size_t *buf_size)
5013 {
5014 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
5015
5016 /* Copy the NUL-terminated 'string' to a buffer whose address before this
5017 * call began at *buf, and whose available length before this call was
5018 * *buf_size.
5019 *
5020 * If the length of 'string' is greater than the space available, the
5021 * buffer is grown accordingly, which may mean that it gets relocated.
5022 * *buf and *buf_size will be updated to reflect this.
5023 *
5024 * Regardless, the function returns a pointer to where 'string' is now
5025 * stored.
5026 *
5027 * 'string' may be NULL, which means no action gets taken, and NULL is
5028 * returned.
5029 *
5030 * 'buf_size' being NULL is to be used when this is a single use buffer,
5031 * which will shortly be freed by the caller.
5032 *
5033 * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
5034 * empty, and memory is malloc'd.
5035 */
5036
5037 if (! string) {
5038 return NULL;
5039 }
5040
5041 /* No-op to copy over oneself */
5042 if (string == *buf) {
5043 return string;
5044 }
5045
5046 Size_t string_size = strlen(string) + 1;
5047 set_save_buffer_min_size(string_size, buf, buf_size);
5048
5049 # ifdef DEBUGGING
5050
5051 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
5052 "Copying '%s' to %p\n",
5053 ((is_strict_utf8_string((U8 *) string, 0))
5054 ? string
5055 :_byte_dump_string((U8 *) string, strlen(string), 0)),
5056 *buf));
5057
5058 # ifdef USE_LOCALE_CTYPE
5059
5060 /* Catch glitches. Usually this is because LC_CTYPE needs to be the same
5061 * locale as whatever is being worked on */
5062 if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) {
5063 locale_panic_(Perl_form(aTHX_
5064 "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s",
5065 string, get_LC_ALL_display()));
5066 }
5067
5068 # endif
5069 # endif
5070
5071 Copy(string, *buf, string_size, char);
5072 return *buf;
5073 }
5074
5075 #endif
5076
5077 int
Perl_mbtowc_(pTHX_ const wchar_t * pwc,const char * s,const Size_t len)5078 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
5079 {
5080
5081 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
5082
5083 PERL_UNUSED_ARG(pwc);
5084 PERL_UNUSED_ARG(s);
5085 PERL_UNUSED_ARG(len);
5086 return -1;
5087
5088 #else /* Below we have some form of mbtowc() */
5089 # if defined(HAS_MBRTOWC) \
5090 && (defined(MULTIPLICITY) || ! defined(HAS_MBTOWC))
5091 # define USE_MBRTOWC
5092 # else
5093 # undef USE_MBRTOWC
5094 # endif
5095
5096 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
5097 int retval = -1;
5098
5099 if (s == NULL) { /* Initialize the shift state to all zeros in
5100 PL_mbrtowc_ps. */
5101
5102 # if defined(USE_MBRTOWC)
5103
5104 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
5105 return 0;
5106
5107 # else
5108
5109 SETERRNO(0, 0);
5110 MBTOWC_LOCK_;
5111 retval = mbtowc(NULL, NULL, 0);
5112 MBTOWC_UNLOCK_;
5113 return retval;
5114
5115 # endif
5116
5117 }
5118
5119 # if defined(USE_MBRTOWC)
5120
5121 SETERRNO(0, 0);
5122 MBRTOWC_LOCK_;
5123 retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
5124 MBRTOWC_UNLOCK_;
5125
5126 # else
5127
5128 /* Locking prevents races, but locales can be switched out without locking,
5129 * so this isn't a cure all */
5130 SETERRNO(0, 0);
5131 MBTOWC_LOCK_;
5132 retval = mbtowc((wchar_t *) pwc, s, len);
5133 MBTOWC_UNLOCK_;
5134
5135 # endif
5136
5137 return retval;
5138
5139 #endif
5140
5141 }
5142
5143 /*
5144 =for apidoc Perl_localeconv
5145
5146 This is a thread-safe version of the libc L<localeconv(3)>. It is the same as
5147 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
5148 fields), but directly callable from XS code. The hash is mortalized, so must
5149 be dealt with immediately.
5150
5151 =cut
5152 */
5153
5154 /* All Wndows versions we support, except possibly MingW, have general
5155 * thread-safety, and even localeconv() is thread safe, returning into a
5156 * per-thread buffer. MingW when built with a modern MS C runtime (UCRT as of
5157 * this writing), also has those things.
5158 *
5159 * FreeBSD's localeconv() when used with uselocale() is supposed to be
5160 * thread-safe (as is their localeconv_l()), but we currently don't use
5161 * thread-safe locales there because of bugs. There may be other thread-safe
5162 * localeconv() implementations, especially on *BSD derivatives, but khw knows
5163 * of none, and hasn't really investigated, in part because of the past
5164 * unreliability of vendor thread-safety claims */
5165 #if defined(WIN32) && (defined(_MSC_VER) || (defined(_UCRT)))
5166 # define LOCALECONV_IS_THREAD_SAFE
5167 #endif
5168
5169 /* When multiple threads can be going at once, we need a critical section
5170 * around doing the localeconv() and saving its return, unless localeconv() is
5171 * thread-safe, and we are using it in a thread-safe manner, which we are only
5172 * doing if safe threads are available and we don't have a broken localeconv()
5173 * */
5174 #if defined(USE_THREADS) \
5175 && ( ! defined(LOCALECONV_IS_THREAD_SAFE) \
5176 || ! defined(USE_THREAD_SAFE_LOCALE) \
5177 || defined(TS_W32_BROKEN_LOCALECONV))
5178 # define LOCALECONV_NEEDS_CRITICAL_SECTION
5179 #endif
5180
5181 HV *
Perl_localeconv(pTHX)5182 Perl_localeconv(pTHX)
5183 {
5184 return (HV *) sv_2mortal((SV *) my_localeconv(0));
5185 }
5186
5187 HV *
S_my_localeconv(pTHX_ const int item)5188 S_my_localeconv(pTHX_ const int item)
5189 {
5190 PERL_ARGS_ASSERT_MY_LOCALECONV;
5191
5192 /* This returns a mortalized hash containing all or certain elements
5193 * returned by localeconv(). */
5194 HV * hv = newHV(); /* The returned hash, initially empty */
5195
5196 /* The function is used by Perl_localeconv() and POSIX::localeconv(), or
5197 * internally from this file, and is thread-safe.
5198 *
5199 * localeconv() returns items from two different locale categories,
5200 * LC_MONETARY and LC_NUMERIC. Various data structures in this function
5201 * are arrays with two elements, one for each category, and these indexes
5202 * indicate which array element applies to which category */
5203 #define NUMERIC_OFFSET 0
5204 #define MONETARY_OFFSET 1
5205
5206 /* Some operations apply to one or the other category, or both. A mask
5207 * is used to specify all the possibilities. This macro converts from the
5208 * category offset to its bit position in the mask. */
5209 #define OFFSET_TO_BIT(i) (1 << (i))
5210
5211 /* There are two use cases for this function:
5212 * 1) Called as Perl_localeconv(), or from POSIX::locale_conv(). This
5213 * returns the lconv structure copied to a hash, based on the current
5214 * underlying locales for LC_NUMERIC and LC_MONETARY. An input item==0
5215 * signifies this case, or on many platforms it is the only use case
5216 * compiled.
5217 * 2) Certain items that nl_langinfo() provides are also derivable from
5218 * the return of localeconv(). Windows notably doesn't have
5219 * nl_langinfo(), so on that, and actually any platform lacking it,
5220 * my_localeconv() is used also to emulate it for those particular
5221 * items. The code to do this is compiled only on such platforms.
5222 * Rather than going to the expense of creating a full hash when only
5223 * one item is needed, the returned hash has just the desired item in
5224 * it.
5225 *
5226 * To access all the localeconv() struct lconv fields, there is a data
5227 * structure that contains every commonly documented field in it. (Maybe
5228 * some minority platforms have extra fields. Those could be added here
5229 * without harm; they would just be ignored on platforms lacking them.)
5230 *
5231 * Our structure is compiled to make looping through the fields easier by
5232 * pointing each name to its value's offset within lconv, e.g.,
5233 { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) }
5234 */
5235 #define LCONV_ENTRY(name) {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
5236
5237 /* These synonyms are just for clarity, and to make it easier in case
5238 * something needs to change in the future */
5239 #define LCONV_NUMERIC_ENTRY(name) LCONV_ENTRY(name)
5240 #define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name)
5241
5242 /* There are just a few fields for NUMERIC strings */
5243 const lconv_offset_t lconv_numeric_strings[] = {
5244 #ifndef NO_LOCALECONV_GROUPING
5245 LCONV_NUMERIC_ENTRY(grouping),
5246 # endif
5247 LCONV_NUMERIC_ENTRY(thousands_sep),
5248 # define THOUSANDS_SEP_LITERAL "thousands_sep"
5249 LCONV_NUMERIC_ENTRY(decimal_point),
5250 # define DECIMAL_POINT_LITERAL "decimal_point"
5251 {NULL, 0}
5252 };
5253
5254 /* When used to implement nl_langinfo(), we save time by only populating
5255 * the hash with the field(s) needed. Thus we would need a data structure
5256 * of just:
5257 * LCONV_NUMERIC_ENTRY(decimal_point),
5258 * {NULL, 0}
5259 *
5260 * By placing the decimal_point field last in the full structure, we can
5261 * use just the tail for this bit of it, saving space. This macro yields
5262 * the address of the sub structure. */
5263 #define DECIMAL_POINT_ADDRESS \
5264 &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)]
5265
5266 /* And the MONETARY string fields */
5267 const lconv_offset_t lconv_monetary_strings[] = {
5268 LCONV_MONETARY_ENTRY(int_curr_symbol),
5269 LCONV_MONETARY_ENTRY(mon_decimal_point),
5270 #ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
5271 LCONV_MONETARY_ENTRY(mon_thousands_sep),
5272 #endif
5273 #ifndef NO_LOCALECONV_MON_GROUPING
5274 LCONV_MONETARY_ENTRY(mon_grouping),
5275 #endif
5276 LCONV_MONETARY_ENTRY(positive_sign),
5277 LCONV_MONETARY_ENTRY(negative_sign),
5278 LCONV_MONETARY_ENTRY(currency_symbol),
5279 #define CURRENCY_SYMBOL_LITERAL "currency_symbol"
5280 {NULL, 0}
5281 };
5282
5283 /* Like above, this field being last can be used as a sub structure */
5284 #define CURRENCY_SYMBOL_ADDRESS \
5285 &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)]
5286
5287 /* Finally there are integer fields, all are for monetary purposes */
5288 const lconv_offset_t lconv_integers[] = {
5289 LCONV_ENTRY(int_frac_digits),
5290 LCONV_ENTRY(frac_digits),
5291 LCONV_ENTRY(p_sep_by_space),
5292 LCONV_ENTRY(n_cs_precedes),
5293 LCONV_ENTRY(n_sep_by_space),
5294 LCONV_ENTRY(p_sign_posn),
5295 LCONV_ENTRY(n_sign_posn),
5296 #ifdef HAS_LC_MONETARY_2008
5297 LCONV_ENTRY(int_p_cs_precedes),
5298 LCONV_ENTRY(int_p_sep_by_space),
5299 LCONV_ENTRY(int_n_cs_precedes),
5300 LCONV_ENTRY(int_n_sep_by_space),
5301 LCONV_ENTRY(int_p_sign_posn),
5302 LCONV_ENTRY(int_n_sign_posn),
5303 #endif
5304 # define P_CS_PRECEDES_LITERAL "p_cs_precedes"
5305 LCONV_ENTRY(p_cs_precedes),
5306 {NULL, 0}
5307 };
5308
5309 /* Like above, this field being last can be used as a sub structure */
5310 #define P_CS_PRECEDES_ADDRESS \
5311 &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)]
5312
5313 /* The actual populating of the hash is done by two sub functions that get
5314 * passed an array of length two containing the data structure they are
5315 * supposed to use to get the key names to fill the hash with. One element
5316 * is always for the NUMERIC strings (or NULL if none to use), and the
5317 * other element similarly for the MONETARY ones. */
5318 const lconv_offset_t * strings[2] = { lconv_numeric_strings,
5319 lconv_monetary_strings
5320 };
5321
5322 /* The LC_MONETARY category also has some integer-valued fields, whose
5323 * information is kept in a separate parallel array to 'strings' */
5324 const lconv_offset_t * integers[2] = {
5325 NULL,
5326 lconv_integers
5327 };
5328
5329 #if ! defined(HAS_LOCALECONV) \
5330 || (! defined(USE_LOCALE_NUMERIC) && ! defined(USE_LOCALE_MONETARY))
5331
5332 /* If both NUMERIC and MONETARY must be the "C" locale, simply populate the
5333 * hash using the function that works on just that locale. */
5334 populate_hash_from_C_localeconv(hv,
5335 "C",
5336 ( OFFSET_TO_BIT(NUMERIC_OFFSET)
5337 | OFFSET_TO_BIT(MONETARY_OFFSET)),
5338 strings, integers);
5339
5340 /* We shouldn't get to here for the case of an individual item, as
5341 * preprocessor directives elsewhere in this file should have filled in the
5342 * correct values at a higher level */
5343 assert(item == 0);
5344 PERL_UNUSED_ARG(item);
5345
5346 return hv;
5347
5348 #else
5349
5350 /* From here to the end of this function, at least one of NUMERIC or
5351 * MONETARY can be non-C */
5352
5353 /* This is a mask, with one bit to tell the populate functions to populate
5354 * the NUMERIC items; another bit for the MONETARY ones. This way they can
5355 * choose which (or both) to populate from */
5356 U32 index_bits = 0;
5357
5358 /* Some platforms, for correct non-mojibake results, require LC_CTYPE's
5359 * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's
5360 * for the monetary ones. What happens if LC_NUMERIC and LC_MONETARY
5361 * aren't compatible? Wrong results. To avoid that, we call localeconv()
5362 * twice, once for each locale, setting LC_CTYPE to match the category.
5363 * But if the locales of both categories are the same, there is no need for
5364 * a second call. Assume this is the case unless overridden below */
5365 bool requires_2nd_localeconv = false;
5366
5367 /* The actual hash populating is done by one of the two populate functions.
5368 * Which one is appropriate for either the MONETARY_OFFSET or the
5369 * NUMERIC_OFFSET is calculated and then stored in this table */
5370 void (*populate[2]) (pTHX_
5371 HV * ,
5372 const char *,
5373 const U32,
5374 const lconv_offset_t **,
5375 const lconv_offset_t **);
5376
5377 /* This gives the locale to use for the corresponding OFFSET, like the
5378 * 'populate' array above */
5379 const char * locales[2];
5380
5381 # ifdef HAS_SOME_LANGINFO
5382
5383 /* If the only use-case for this is the full localeconv(), the 'item'
5384 * parameter is ignored. */
5385 PERL_UNUSED_ARG(item);
5386
5387 # else /* This only gets compiled for the use-case of using localeconv()
5388 to emulate nl_langinfo() when missing from the platform. */
5389
5390 # ifdef USE_LOCALE_NUMERIC
5391
5392 /* We need this substructure to only return this field for the THOUSEP
5393 * item. The other items also need substructures, but they were handled
5394 * above by placing the substructure's item at the end of the full one, so
5395 * the data structure could do double duty. However, both this and
5396 * RADIXCHAR would need to be in the final position of the same full
5397 * structure; an impossibility. So make this into a separate structure */
5398 const lconv_offset_t thousands_sep_string[] = {
5399 LCONV_NUMERIC_ENTRY(thousands_sep),
5400 {NULL, 0}
5401 };
5402
5403 # endif
5404
5405 /* End of all the initialization of data structures. Now for actual code.
5406 *
5407 * Without nl_langinfo(), the call to my_localeconv() could be for all of
5408 * the localeconv() items or for just one of the following 3 items to
5409 * emulate nl_langinfo().
5410 *
5411 * This is compiled only when using perl_langinfo.h, which we control, and
5412 * it has been constructed so that no item is numbered 0.
5413 *
5414 * For each individual item, either return the known value if the current
5415 * locale is "C", or set up the appropriate parameters for the call below
5416 * to the populate function */
5417 if (item != 0) {
5418 const char *locale;
5419
5420 switch (item) {
5421 default:
5422 locale_panic_(Perl_form(aTHX_
5423 "Unexpected item passed to my_localeconv: %d", item));
5424 break;
5425
5426 # ifdef USE_LOCALE_NUMERIC
5427
5428 case RADIXCHAR:
5429 if (isNAME_C_OR_POSIX(PL_numeric_name)) {
5430 (void) hv_stores(hv, DECIMAL_POINT_LITERAL, newSVpvs("."));
5431 return hv;
5432 }
5433
5434 strings[NUMERIC_OFFSET] = DECIMAL_POINT_ADDRESS;
5435 goto numeric_common;
5436
5437 case THOUSEP:
5438 if (isNAME_C_OR_POSIX(PL_numeric_name)) {
5439 (void) hv_stores(hv, THOUSANDS_SEP_LITERAL, newSVpvs(""));
5440 return hv;
5441 }
5442
5443 strings[NUMERIC_OFFSET] = thousands_sep_string;
5444
5445 numeric_common:
5446 index_bits = OFFSET_TO_BIT(NUMERIC_OFFSET);
5447 locale = PL_numeric_name;
5448 break;
5449
5450 # endif
5451 # ifdef USE_LOCALE_MONETARY
5452
5453 case CRNCYSTR: /* This item needs the values for both the currency
5454 symbol, and another one used to construct the
5455 nl_langino()-compatible return. */
5456
5457 locale = querylocale_c(LC_MONETARY);
5458 if (isNAME_C_OR_POSIX(locale)) {
5459 (void) hv_stores(hv, CURRENCY_SYMBOL_LITERAL, newSVpvs(""));
5460 (void) hv_stores(hv, P_CS_PRECEDES_LITERAL, newSViv(-1));
5461 return hv;
5462 }
5463
5464 strings[MONETARY_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
5465 integers[MONETARY_OFFSET] = P_CS_PRECEDES_ADDRESS;
5466
5467 index_bits = OFFSET_TO_BIT(MONETARY_OFFSET);
5468 break;
5469
5470 # endif
5471
5472 } /* End of switch() */
5473
5474 /* There's only one item, so only one of each of these will get used,
5475 * but cheap to initialize both */
5476 populate[MONETARY_OFFSET] =
5477 populate[NUMERIC_OFFSET] = S_populate_hash_from_localeconv;
5478 locales[MONETARY_OFFSET] = locales[NUMERIC_OFFSET] = locale;
5479 }
5480 else /* End of for just one item to emulate nl_langinfo() */
5481
5482 # endif
5483
5484 {
5485 /* Here, the call is for all of localeconv(). It has a bunch of
5486 * items. The first function call always gets the MONETARY values */
5487 index_bits = OFFSET_TO_BIT(MONETARY_OFFSET);
5488
5489 # ifdef USE_LOCALE_MONETARY
5490
5491 locales[MONETARY_OFFSET] = querylocale_c(LC_MONETARY);
5492 populate[MONETARY_OFFSET] =
5493 (isNAME_C_OR_POSIX(locales[MONETARY_OFFSET]))
5494 ? S_populate_hash_from_C_localeconv
5495 : S_populate_hash_from_localeconv;
5496
5497 # else
5498
5499 locales[MONETARY_OFFSET] = "C";
5500 populate[MONETARY_OFFSET] = S_populate_hash_from_C_localeconv;
5501
5502 # endif
5503 # ifdef USE_LOCALE_NUMERIC
5504
5505 /* And if the locales for the two categories are the same, we can also
5506 * do the NUMERIC values in the same call */
5507 if (strEQ(PL_numeric_name, locales[MONETARY_OFFSET])) {
5508 index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET);
5509 locales[NUMERIC_OFFSET] = locales[MONETARY_OFFSET];
5510 populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET];
5511 }
5512 else {
5513 requires_2nd_localeconv = true;
5514 locales[NUMERIC_OFFSET] = PL_numeric_name;
5515 populate[NUMERIC_OFFSET] = (isNAME_C_OR_POSIX(PL_numeric_name))
5516 ? S_populate_hash_from_C_localeconv
5517 : S_populate_hash_from_localeconv;
5518 }
5519
5520 # else
5521
5522 /* When LC_NUMERIC is confined to "C", the two locales are the same
5523 iff LC_MONETARY in this case is also "C". We set up the function
5524 for that case above, so fastest to test just its address */
5525 locales[NUMERIC_OFFSET] = "C";
5526 if (populate[MONETARY_OFFSET] == S_populate_hash_from_C_localeconv) {
5527 index_bits |= OFFSET_TO_BIT(NUMERIC_OFFSET);
5528 populate[NUMERIC_OFFSET] = populate[MONETARY_OFFSET];
5529 }
5530 else {
5531 requires_2nd_localeconv = true;
5532 populate[NUMERIC_OFFSET] = S_populate_hash_from_C_localeconv;
5533 }
5534
5535 # endif
5536
5537 } /* End of call is for localeconv() */
5538
5539 /* Call the proper populate function (which may call localeconv()) and copy
5540 * its results into the hash. All the parameters have been initialized
5541 * above */
5542 (*populate[MONETARY_OFFSET])(aTHX_
5543 hv, locales[MONETARY_OFFSET],
5544 index_bits, strings, integers);
5545
5546 # ifndef HAS_SOME_LANGINFO /* Could be using this function to emulate
5547 nl_langinfo() */
5548
5549 /* We are done when called with an individual item. There are no integer
5550 * items to adjust, and it's best for the caller to determine if this
5551 * string item is UTF-8 or not. This is because the locale's UTF-8ness is
5552 * calculated below, and in some Configurations, that can lead to a
5553 * recursive call to here, which could recurse infinitely. */
5554 if (item != 0) {
5555 return hv;
5556 }
5557
5558 # endif
5559
5560 /* The above call may have done all the hash fields, but not always, as
5561 * already explained. If we need a second call it is always for the
5562 * NUMERIC fields */
5563 if (requires_2nd_localeconv) {
5564 (*populate[NUMERIC_OFFSET])(aTHX_
5565 hv,
5566 locales[NUMERIC_OFFSET],
5567 OFFSET_TO_BIT(NUMERIC_OFFSET),
5568 strings, integers);
5569 }
5570
5571 /* Here, the hash has been completely populated. */
5572
5573 # ifdef LOCALECONV_NEEDS_CRITICAL_SECTION
5574
5575 /* When the hash was populated during a critical section, the determination
5576 * of whether or not a string element should be marked as UTF-8 was
5577 * deferred, so as to minimize the amount of time in the critical section.
5578 * But now we have the hash specific to this thread, and can do the
5579 * adjusting without worrying about delaying other threads. */
5580 for (unsigned int i = 0; i < 2; i++) { /* Try both types of strings */
5581
5582 /* The return from this function is already adjusted */
5583 if (populate[i] == S_populate_hash_from_C_localeconv) {
5584 continue;
5585 }
5586
5587 /* Examine each string */
5588 for (const lconv_offset_t *strp = strings[i]; strp->name; strp++) {
5589 const char * name = strp->name;
5590
5591 /* 'value' will contain the string that may need to be marked as
5592 * UTF-8 */
5593 SV ** value = hv_fetch(hv, name, strlen(name), true);
5594 if (value == NULL) {
5595 continue;
5596 }
5597
5598 /* Determine if the string should be marked as UTF-8. */
5599 if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value),
5600 LOCALE_UTF8NESS_UNKNOWN,
5601 locales[i],
5602 LC_ALL_INDEX_ /* OOB */)))
5603 {
5604 SvUTF8_on(*value);
5605 }
5606 }
5607 }
5608
5609 # endif
5610
5611 return hv;
5612
5613 #endif /* End of must have one or both USE_MONETARY, USE_NUMERIC */
5614
5615 }
5616
5617 STATIC void
S_populate_hash_from_C_localeconv(pTHX_ HV * hv,const char * locale,const U32 which_mask,const lconv_offset_t * strings[2],const lconv_offset_t * integers[2])5618 S_populate_hash_from_C_localeconv(pTHX_ HV * hv,
5619 const char * locale, /* Unused */
5620
5621 /* bit mask of which categories to
5622 * populate */
5623 const U32 which_mask,
5624
5625 /* The string type values to return;
5626 * one element for numeric; the other
5627 * for monetary */
5628 const lconv_offset_t * strings[2],
5629
5630 /* And the integer fields */
5631 const lconv_offset_t * integers[2])
5632 {
5633 PERL_ARGS_ASSERT_POPULATE_HASH_FROM_C_LOCALECONV;
5634 PERL_UNUSED_ARG(locale);
5635 assert(isNAME_C_OR_POSIX(locale));
5636
5637 /* Fill hv with the values that localeconv() is supposed to return for
5638 * the C locale */
5639
5640 U32 working_mask = which_mask;
5641 while (working_mask) {
5642
5643 /* Get the bit position of the next lowest set bit. That is the
5644 * index into the 'strings' array of the category we use in this loop
5645 * iteration. Turn the bit off so we don't work on this category
5646 * again in this function call. */
5647 const PERL_UINT_FAST8_T i = lsbit_pos(working_mask);
5648 working_mask &= ~ (1 << i);
5649
5650 /* This category's string fields */
5651 const lconv_offset_t * category_strings = strings[i];
5652
5653 #ifndef HAS_SOME_LANGINFO /* This doesn't work properly if called on a single
5654 item, which could only happen when there isn't
5655 nl_langinfo on the platform */
5656 assert(category_strings[1].name != NULL);
5657 #endif
5658
5659 /* All string fields are empty except for one NUMERIC one. That one
5660 * has been initialized to be the final one in the NUMERIC strings, so
5661 * stop the loop early in that case. Otherwise, we would store an
5662 * empty string to the hash, and immediately overwrite it with the
5663 * correct value */
5664 const unsigned int stop_early = (i == NUMERIC_OFFSET) ? 1 : 0;
5665
5666 /* A NULL element terminates the list */
5667 while ((category_strings + stop_early)->name) {
5668 (void) hv_store(hv,
5669 category_strings->name,
5670 strlen(category_strings->name),
5671 newSVpvs(""),
5672 0);
5673
5674 category_strings++;
5675 }
5676
5677 /* And fill in the NUMERIC exception */
5678 if (i == NUMERIC_OFFSET) {
5679 (void) hv_stores(hv, "decimal_point", newSVpvs("."));
5680 category_strings++;
5681 }
5682
5683 /* Add any int fields. In the C locale, all are -1 */
5684 if (integers[i]) {
5685 const lconv_offset_t * current = integers[i];
5686 while (current->name) {
5687 (void) hv_store(hv,
5688 current->name, strlen(current->name),
5689 newSViv(-1),
5690 0);
5691 current++;
5692 }
5693 }
5694 }
5695 }
5696
5697 #if defined(HAS_LOCALECONV) && ( defined(USE_LOCALE_NUMERIC) \
5698 || defined(USE_LOCALE_MONETARY))
5699
5700 STATIC void
S_populate_hash_from_localeconv(pTHX_ HV * hv,const char * locale,const U32 which_mask,const lconv_offset_t * strings[2],const lconv_offset_t * integers[2])5701 S_populate_hash_from_localeconv(pTHX_ HV * hv,
5702
5703 /* Switch to this locale to run
5704 * localeconv() from */
5705 const char * locale,
5706
5707 /* bit mask of which categories to
5708 * populate */
5709 const U32 which_mask,
5710
5711 /* The string type values to return; one
5712 * element for numeric; the other for
5713 * monetary */
5714 const lconv_offset_t * strings[2],
5715
5716 /* And similarly the integer fields */
5717 const lconv_offset_t * integers[2])
5718 {
5719 PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV;
5720
5721 /* Run localeconv() and copy some or all of its results to the input 'hv'
5722 * hash. Most localeconv() implementations return the values in a global
5723 * static buffer, so for them, the operation must be performed in a
5724 * critical section, ending only after the copy is completed. There are so
5725 * many locks because localeconv() deals with two categories, and returns
5726 * in a single global static buffer. Some locks might be no-ops on this
5727 * platform, but not others. We need to lock if any one isn't a no-op. */
5728
5729 /* If the call could be for either or both of the two categories, we need
5730 * to test which one; but if the Configuration is such that we will never
5731 * be called with one of them, the code for that one will be #ifdef'd out
5732 * below, leaving code for just the other category. That code will always
5733 * want to be executed, no conditional required. Create a macro that
5734 * replaces the condition with an always-true value so the compiler will
5735 * omit the conditional */
5736 # if defined(USE_LOCALE_NUMERIC) && defined(USE_LOCALE_MONETARY)
5737 # define CALL_IS_FOR(x) (which_mask & OFFSET_TO_BIT(x ## _OFFSET))
5738 # else
5739 # define CALL_IS_FOR(x) 1
5740 # endif
5741
5742 /* This function is unfortunately full of #ifdefs. It consists of three
5743 * sections:
5744 * 1) Setup:
5745 * a) On platforms where it matters, toggle LC_CTYPE to the same
5746 * locale that LC_NUMERIC and LC_MONETARY will be toggled to
5747 * b) On calls that process LC_NUMERIC, toggle to the desired locale
5748 * c) On calls that process LC_MONETARY, toggle to the desired
5749 * locale
5750 * d) Do any necessary mutex locking not (automatically) done by
5751 * the toggling
5752 * e) Work around some Windows-only issues and bugs
5753 * 2) Do the localeconv(), copying the results.
5754 * 3) Teardown, which is the inverse of setup.
5755 *
5756 * The setup and teardown are highly variable due to the variance in the
5757 * possible Configurations. What is done here to make it slightly more
5758 * understandable is each setup section creates the details of its
5759 * corresponding teardown section, and macroizes them. So that the
5760 * finished teardown product is just a linear series of macros. You can
5761 * thus easily see the logic there. */
5762
5763 /* Setup any LC_CTYPE handling */
5764 start_DEALING_WITH_MISMATCHED_CTYPE(locale);
5765 # define CTYPE_TEARDOWN end_DEALING_WITH_MISMATCHED_CTYPE(locale)
5766
5767 /* Setup any LC_NUMERIC handling */
5768 # ifndef USE_LOCALE_NUMERIC
5769 # define NUMERIC_TEARDOWN
5770 # else
5771
5772 /* We need to toggle the NUMERIC locale to the desired one if we are
5773 * getting NUMERIC strings */
5774 const char * orig_NUMERIC_locale = NULL;
5775 if (CALL_IS_FOR(NUMERIC)) {
5776
5777 # ifdef WIN32
5778
5779 /* There is a bug in Windows in which setting LC_CTYPE after the others
5780 * doesn't actually take effect for localeconv(). See commit
5781 * 418efacd1950763f74ed3cc22f8cf9206661b892 for details. Thus we have
5782 * to make sure that the locale we want is set after LC_CTYPE. We
5783 * unconditionally toggle away from and back to the current locale
5784 * prior to calling localeconv(). */
5785 orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, "C");
5786 (void) toggle_locale_c(LC_NUMERIC, locale);
5787
5788 # define NUMERIC_TEARDOWN \
5789 STMT_START { \
5790 if (CALL_IS_FOR(NUMERIC)) { \
5791 restore_toggled_locale_c(LC_NUMERIC, "C"); \
5792 restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale); \
5793 } \
5794 } STMT_END
5795
5796 # else
5797
5798 /* No need for the extra toggle when not on Windows */
5799 orig_NUMERIC_locale = toggle_locale_c(LC_NUMERIC, locale);
5800
5801 # define NUMERIC_TEARDOWN \
5802 STMT_START { \
5803 if (CALL_IS_FOR(NUMERIC)) { \
5804 restore_toggled_locale_c(LC_NUMERIC, orig_NUMERIC_locale); \
5805 } \
5806 } STMT_END
5807 # endif
5808
5809 }
5810
5811 # endif /* End of LC_NUMERIC setup */
5812
5813 /* Setup any LC_MONETARY handling, using the same logic as for
5814 * USE_LOCALE_NUMERIC just above */
5815 # ifndef USE_LOCALE_MONETARY
5816 # define MONETARY_TEARDOWN
5817 # else
5818
5819 /* Same logic as LC_NUMERIC, and same Windows bug */
5820 const char * orig_MONETARY_locale = NULL;
5821 if (CALL_IS_FOR(MONETARY)) {
5822
5823 # ifdef WIN32
5824
5825 orig_MONETARY_locale = toggle_locale_c(LC_MONETARY, "C");
5826 (void) toggle_locale_c(LC_MONETARY, locale);
5827
5828 # define MONETARY_TEARDOWN \
5829 STMT_START { \
5830 if (CALL_IS_FOR(MONETARY)) { \
5831 restore_toggled_locale_c(LC_MONETARY, "C"); \
5832 restore_toggled_locale_c(LC_MONETARY, orig_MONETARY_locale);\
5833 } \
5834 } STMT_END
5835
5836 # else
5837
5838 /* No need for the extra toggle when not on Windows */
5839 orig_MONETARY_locale = toggle_locale_c(LC_MONETARY, locale);
5840
5841 # define MONETARY_TEARDOWN \
5842 STMT_START { \
5843 if (CALL_IS_FOR(MONETARY)) { \
5844 restore_toggled_locale_c(LC_MONETARY, orig_MONETARY_locale);\
5845 } \
5846 } STMT_END
5847
5848 # endif
5849
5850 }
5851
5852 # endif /* End of LC_MONETARY setup */
5853
5854 /* Here, have toggled to the correct locale.
5855 *
5856 * We don't need to worry about locking at all if localeconv() is
5857 * thread-safe, regardless of if using threads or not. */
5858 # ifdef LOCALECONV_IS_THREAD_SAFE
5859 # define LOCALECONV_UNLOCK
5860 # else
5861
5862 /* Otherwise, the gwLOCALE_LOCK macro expands to whatever locking is
5863 * needed (none if there is only a single perl instance) */
5864 gwLOCALE_LOCK;
5865
5866 # define LOCALECONV_UNLOCK gwLOCALE_UNLOCK
5867 # endif
5868 # if ! defined(TS_W32_BROKEN_LOCALECONV) || ! defined(USE_THREAD_SAFE_LOCALE)
5869 # define WIN32_TEARDOWN
5870 # else
5871
5872 /* This is a workaround for another bug in Windows. localeconv() was
5873 * broken with thread-safe locales prior to VS 15. It looks at the global
5874 * locale instead of the thread one. As a work-around, we toggle to the
5875 * global locale; populate the return; then toggle back. We have to use
5876 * LC_ALL instead of the individual categories because of yet another bug
5877 * in Windows. And this all has to be done in a critical section.
5878 *
5879 * This introduces a potential race with any other thread that has also
5880 * converted to use the global locale, and doesn't protect its locale calls
5881 * with mutexes. khw can't think of any reason for a thread to do so on
5882 * Windows, as the locale API is the same regardless of thread-safety,
5883 * except if the code is ported from working on another platform where
5884 * there might be some reason to do this. But this is typically due to
5885 * some alien-to-Perl library that thinks it owns locale setting. Such a
5886 * library isn't likely to exist on Windows, so such an application is
5887 * unlikely to be run on Windows
5888 */
5889 bool restore_per_thread = FALSE;
5890
5891 /* Save the per-thread locale state */
5892 const char * save_thread = querylocale_c(LC_ALL);
5893
5894 /* Change to the global locale, and note if we already were there */
5895 int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
5896 if (config_return != _DISABLE_PER_THREAD_LOCALE) {
5897 if (config_return == -1) {
5898 locale_panic_("_configthreadlocale returned an error");
5899 }
5900
5901 restore_per_thread = TRUE;
5902 }
5903
5904 /* Save the state of the global locale; then convert to our desired
5905 * state. */
5906 const char * save_global = querylocale_c(LC_ALL);
5907 void_setlocale_c(LC_ALL, save_thread);
5908
5909 # define WIN32_TEARDOWN \
5910 STMT_START { \
5911 /* Restore the global locale's prior state */ \
5912 void_setlocale_c(LC_ALL, save_global); \
5913 \
5914 /* And back to per-thread locales */ \
5915 if (restore_per_thread) { \
5916 if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) { \
5917 locale_panic_("_configthreadlocale returned an error"); \
5918 } \
5919 } \
5920 \
5921 /* Restore the per-thread locale state */ \
5922 void_setlocale_c(LC_ALL, save_thread); \
5923 } STMT_END
5924 # endif /* TS_W32_BROKEN_LOCALECONV */
5925
5926
5927 /* Finally, everything is locked and loaded; do the actual call to
5928 * localeconv() */
5929 const char *lcbuf_as_string = (const char *) localeconv();
5930
5931 /* Copy its results for each desired category as determined by
5932 * 'which_mask' */
5933 U32 working_mask = which_mask;
5934 while (working_mask) {
5935
5936 /* Get the bit position of the next lowest set bit. That is the
5937 * index into the 'strings' array of the category we use in this loop
5938 * iteration. Turn the bit off so we don't work on this category
5939 * again in this function call. */
5940 const PERL_UINT_FAST8_T i = lsbit_pos32(working_mask);
5941 working_mask &= ~ (1 << i);
5942
5943 /* Point to the string field list for the given category ... */
5944 const lconv_offset_t * category_strings = strings[i];
5945
5946 /* The string fields returned by localeconv() are stored as SVs in the
5947 * hash. Their utf8ness needs to be calculated at some point, and the
5948 * SV flagged accordingly. It is easier to do that now as we go
5949 * through them, but strongly countering this is the need to minimize
5950 * the length of time spent in a critical section with other threads
5951 * locked out. Therefore, when this is being executed in a critical
5952 * section, the strings are stored as-is, and the utf8ness calculation
5953 * is done by our caller, outside the critical section, in an extra
5954 * pass through the hash. But when this code is not being executed in
5955 * a critical section, that extra pass would be extra work, so the
5956 * calculation is done here. We have #defined a symbol that indicates
5957 * whether or not this is being done in a critical section. But there
5958 * is a complication. When this is being called with just a single
5959 * string to populate the hash with, there may be extra adjustments
5960 * needed, and the ultimate caller is expecting to do all adjustments,
5961 * so the adjustment is deferred in this case even if there is no
5962 * critical section. (This case is indicated by element [1] being a
5963 * NULL marker, hence having only one real element.) */
5964 # ifndef LOCALECONV_NEEDS_CRITICAL_SECTION
5965 const bool calculate_utf8ness_here = category_strings[1].name;
5966 # endif
5967 bool utf8ness = false;
5968
5969 /* For each string field */
5970 while (category_strings->name) {
5971
5972 /* We have set things up so that we know where in the returned
5973 * structure, when viewed as a string, the corresponding value is.
5974 * */
5975 char *value = *((char **)( lcbuf_as_string
5976 + category_strings->offset));
5977 if (value) { /* Copy to the hash */
5978
5979 # ifndef LOCALECONV_NEEDS_CRITICAL_SECTION
5980
5981 if (calculate_utf8ness_here) {
5982 utf8ness =
5983 ( UTF8NESS_YES
5984 == get_locale_string_utf8ness_i(value,
5985 LOCALE_UTF8NESS_UNKNOWN,
5986 locale,
5987 LC_ALL_INDEX_ /* OOB */));
5988 }
5989 # endif
5990 (void) hv_store(hv,
5991 category_strings->name,
5992 strlen(category_strings->name),
5993 newSVpvn_utf8(value, strlen(value), utf8ness),
5994 0);
5995 }
5996
5997 category_strings++;
5998 }
5999
6000 /* Add any int fields to the HV*. */
6001 if (integers[i]) {
6002 const lconv_offset_t * current = integers[i];
6003 while (current->name) {
6004 int value = *((const char *)( lcbuf_as_string
6005 + current->offset));
6006 if (value == CHAR_MAX) { /* Change CHAR_MAX to -1 */
6007 value = -1;
6008 }
6009
6010 (void) hv_store(hv,
6011 current->name, strlen(current->name),
6012 newSViv(value),
6013 0);
6014 current++;
6015 }
6016 }
6017 } /* End of loop through the fields */
6018
6019 /* Done with copying to the hash. Can unwind the critical section locks */
6020
6021 /* Back out of what we set up */
6022 WIN32_TEARDOWN;
6023 LOCALECONV_UNLOCK;
6024 MONETARY_TEARDOWN;
6025 NUMERIC_TEARDOWN;
6026 CTYPE_TEARDOWN;
6027 }
6028
6029 #endif /* defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY) */
6030
6031 /*
6032
6033 =for apidoc Perl_langinfo
6034 =for apidoc_item Perl_langinfo8
6035
6036 C<Perl_langinfo> is an (almost) drop-in replacement for the system
6037 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
6038 the same information. But it is more thread-safe than regular
6039 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
6040 code, and can be used on systems that lack a native C<nl_langinfo>.
6041
6042 However, you should instead use either the improved version of this,
6043 L</Perl_langinfo8>, or even better, L</sv_langinfo>. The latter returns an SV,
6044 handling all the possible non-standard returns of C<nl_langinfo()>, including
6045 the UTF8ness of any returned string.
6046
6047 C<Perl_langinfo8> is identical to C<Perl_langinfo> except for an additional
6048 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
6049 returns to you how you should treat the returned string with regards to it
6050 being encoded in UTF-8 or not.
6051
6052 These two functions share private per-thread memory that will be changed the
6053 next time either one of them is called with any input, but not before.
6054
6055 Concerning the differences between these and plain C<nl_langinfo()>:
6056
6057 =over
6058
6059 =item a.
6060
6061 C<Perl_langinfo8> has an extra parameter, described above. Besides this, the
6062 other reason they aren't quite a drop-in replacement is actually an advantage.
6063 The C<const>ness of the return allows the compiler to catch attempts to write
6064 into the returned buffer, which is illegal and could cause run-time crashes.
6065
6066 =item b.
6067
6068 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
6069 without you having to write extra code. The reason for the extra code would be
6070 because these are from the C<LC_NUMERIC> locale category, which is normally
6071 kept set by Perl so that the radix is a dot, and the separator is the empty
6072 string, no matter what the underlying locale is supposed to be, and so to get
6073 the expected results, you have to temporarily toggle into the underlying
6074 locale, and later toggle back. (You could use plain C<nl_langinfo> and
6075 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
6076 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
6077 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
6078 (decimal point) character to be a dot.)
6079
6080 =item c.
6081
6082 The system function they replace can have its static return buffer trashed,
6083 not only by a subsequent call to that function, but by a C<freelocale>,
6084 C<setlocale>, or other locale change. The returned buffer of these functions
6085 is not changed until the next call to one or the other, so the buffer is never
6086 in a trashed state.
6087
6088 =item d.
6089
6090 The return buffer is per-thread, so it also is never overwritten by a call to
6091 these functions from another thread; unlike the function it replaces.
6092
6093 =item e.
6094
6095 But most importantly, they work on systems that don't have C<nl_langinfo>, such
6096 as Windows, hence making your code more portable. Of the fifty-some possible
6097 items specified by the POSIX 2008 standard,
6098 L<https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
6099 only one is completely unimplemented, though on non-Windows platforms, another
6100 significant one is not fully implemented). They use various techniques to
6101 recover the other items, including calling C<L<localeconv(3)>>, and
6102 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
6103 available. Later C<strftime()> versions have additional capabilities.
6104 If an item is not available on your system, this returns either the value
6105 associated with the C locale, or simply C<"">, whichever is more appropriate.
6106
6107 It is important to note that, when called with an item that is recovered by
6108 using C<localeconv>, the buffer from any previous explicit call to
6109 C<L<localeconv(3)>> will be overwritten. But you shouldn't be using
6110 C<localeconv> anyway because it is is very much not thread-safe, and suffers
6111 from the same problems outlined in item 'b.' above for the fields it returns
6112 that are controlled by the LC_NUMERIC locale category. Instead, avoid all of
6113 those problems by calling L</Perl_localeconv>, which is thread-safe; or by
6114 using the methods given in L<perlcall> to call
6115 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
6116
6117 =back
6118
6119 The details for those items which may deviate from what this emulation returns
6120 and what a native C<nl_langinfo()> would return are specified in
6121 L<I18N::Langinfo>.
6122
6123 =for apidoc sv_langinfo
6124
6125 This is the preferred interface for accessing the data that L<nl_langinfo(3)>
6126 provides (or Perl's emulation of it on platforms lacking it), returning an SV.
6127 Unlike, the earlier-defined interfaces to this (L</Perl_langinfo> and
6128 L</Perl_langinfo8>), which return strings, the UTF8ness of the result is
6129 automatically handled for you. And like them, it is thread-safe and
6130 automatically handles getting the proper values for the C<RADIXCHAR> and
6131 C<THOUSEP> items (that calling the plain libc C<nl_langinfo()> could give the
6132 wrong results for). Like them, this also doesn't play well with the libc
6133 C<localeconv()>; use L<C<POSIX::localeconv()>|POSIX/localeconv> instead.
6134
6135 There are a few deviations from what a native C<nl_langinfo()> would return and
6136 what this returns on platforms that don't implement that function. These are
6137 detailed in L<I18N::Langinfo>.
6138
6139 =cut
6140
6141 */
6142
6143 /* external_call_langinfo() is an interface to callers from outside this file to
6144 * langinfo_sv_i(), calculating a necessary value for it. If those functions
6145 * aren't defined, the fallback function is emulate_langinfo(), which doesn't
6146 * use that value (as everything in this situation takes place in the "C"
6147 * locale), and so we define this macro to transparently hide the absence of
6148 * the missing functions */
6149 #ifndef external_call_langinfo
6150 # define external_call_langinfo(item, sv, utf8p) \
6151 emulate_langinfo(item, "C", sv, utf8p)
6152 #endif
6153
6154 SV *
Perl_sv_langinfo(pTHX_ const nl_item item)6155 Perl_sv_langinfo(pTHX_ const nl_item item) {
6156 utf8ness_t dummy; /* Having this tells the layers below that we want the
6157 UTF-8 flag in 'sv' to be set properly. */
6158
6159 SV * sv = newSV_type(SVt_PV);
6160 (void) external_call_langinfo(item, sv, &dummy);
6161
6162 return sv;
6163 }
6164
6165 const char *
Perl_langinfo(const nl_item item)6166 Perl_langinfo(const nl_item item)
6167 {
6168 dTHX;
6169
6170 (void) external_call_langinfo(item, PL_langinfo_sv, NULL);
6171 return SvPV_nolen(PL_langinfo_sv);
6172 }
6173
6174 const char *
Perl_langinfo8(const nl_item item,utf8ness_t * utf8ness)6175 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
6176 {
6177 PERL_ARGS_ASSERT_PERL_LANGINFO8;
6178 dTHX;
6179
6180 (void) external_call_langinfo(item, PL_langinfo_sv, utf8ness);
6181 return SvPV_nolen(PL_langinfo_sv);
6182 }
6183
6184 #ifdef USE_LOCALE
6185
6186 const char *
S_external_call_langinfo(pTHX_ const nl_item item,SV * sv,utf8ness_t * utf8ness)6187 S_external_call_langinfo(pTHX_ const nl_item item,
6188 SV * sv,
6189 utf8ness_t * utf8ness)
6190 {
6191 PERL_ARGS_ASSERT_EXTERNAL_CALL_LANGINFO;
6192
6193 /* Find the locale category that controls the input 'item', and call
6194 * langinfo_sv_i() including that value.
6195 *
6196 * If we are not paying attention to that category, instead call
6197 * emulate_langinfo(), which knows how to handle this situation. */
6198 locale_category_index cat_index = LC_ALL_INDEX_; /* Out-of-bounds */
6199
6200 switch (item) {
6201 case CODESET:
6202
6203 # ifdef USE_LOCALE_CTYPE
6204 cat_index = LC_CTYPE_INDEX_;
6205 # endif
6206 break;
6207
6208
6209 case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
6210
6211 # ifdef USE_LOCALE_MESSAGES
6212 cat_index = LC_MESSAGES_INDEX_;
6213 # endif
6214 break;
6215
6216
6217 case CRNCYSTR:
6218
6219 # ifdef USE_LOCALE_MONETARY
6220 cat_index = LC_MONETARY_INDEX_;
6221 # endif
6222 break;
6223
6224
6225 case RADIXCHAR: case THOUSEP:
6226
6227 # ifdef USE_LOCALE_NUMERIC
6228 cat_index = LC_NUMERIC_INDEX_;
6229 # endif
6230 break;
6231
6232
6233 case _NL_ADDRESS_POSTAL_FMT:
6234 case _NL_ADDRESS_COUNTRY_NAME:
6235 case _NL_ADDRESS_COUNTRY_POST:
6236 case _NL_ADDRESS_COUNTRY_AB2:
6237 case _NL_ADDRESS_COUNTRY_AB3:
6238 case _NL_ADDRESS_COUNTRY_CAR:
6239 case _NL_ADDRESS_COUNTRY_NUM:
6240 case _NL_ADDRESS_COUNTRY_ISBN:
6241 case _NL_ADDRESS_LANG_NAME:
6242 case _NL_ADDRESS_LANG_AB:
6243 case _NL_ADDRESS_LANG_TERM:
6244 case _NL_ADDRESS_LANG_LIB:
6245 # ifdef USE_LOCALE_ADDRESS
6246 cat_index = LC_ADDRESS_INDEX_;
6247 # endif
6248 break;
6249
6250
6251 case _NL_IDENTIFICATION_TITLE:
6252 case _NL_IDENTIFICATION_SOURCE:
6253 case _NL_IDENTIFICATION_ADDRESS:
6254 case _NL_IDENTIFICATION_CONTACT:
6255 case _NL_IDENTIFICATION_EMAIL:
6256 case _NL_IDENTIFICATION_TEL:
6257 case _NL_IDENTIFICATION_FAX:
6258 case _NL_IDENTIFICATION_LANGUAGE:
6259 case _NL_IDENTIFICATION_TERRITORY:
6260 case _NL_IDENTIFICATION_AUDIENCE:
6261 case _NL_IDENTIFICATION_APPLICATION:
6262 case _NL_IDENTIFICATION_ABBREVIATION:
6263 case _NL_IDENTIFICATION_REVISION:
6264 case _NL_IDENTIFICATION_DATE:
6265 case _NL_IDENTIFICATION_CATEGORY:
6266 # ifdef USE_LOCALE_IDENTIFICATION
6267 cat_index = LC_IDENTIFICATION_INDEX_;
6268 # endif
6269 break;
6270
6271
6272 case _NL_MEASUREMENT_MEASUREMENT:
6273 # ifdef USE_LOCALE_MEASUREMENT
6274 cat_index = LC_MEASUREMENT_INDEX_;
6275 # endif
6276 break;
6277
6278
6279 case _NL_NAME_NAME_FMT:
6280 case _NL_NAME_NAME_GEN:
6281 case _NL_NAME_NAME_MR:
6282 case _NL_NAME_NAME_MRS:
6283 case _NL_NAME_NAME_MISS:
6284 case _NL_NAME_NAME_MS:
6285 # ifdef USE_LOCALE_NAME
6286 cat_index = LC_NAME_INDEX_;
6287 # endif
6288 break;
6289
6290
6291 case _NL_PAPER_HEIGHT:
6292 case _NL_PAPER_WIDTH:
6293 # ifdef USE_LOCALE_PAPER
6294 cat_index = LC_PAPER_INDEX_;
6295 # endif
6296 break;
6297
6298
6299 case _NL_TELEPHONE_TEL_INT_FMT:
6300 case _NL_TELEPHONE_TEL_DOM_FMT:
6301 case _NL_TELEPHONE_INT_SELECT:
6302 case _NL_TELEPHONE_INT_PREFIX:
6303 # ifdef USE_LOCALE_TELEPHONE
6304 cat_index = LC_TELEPHONE_INDEX_;
6305 # endif
6306 break;
6307
6308
6309 default: /* The other possible items are all in LC_TIME. */
6310 # ifdef USE_LOCALE_TIME
6311 cat_index = LC_TIME_INDEX_;
6312 # endif
6313 break;
6314
6315 } /* End of switch on item */
6316
6317 # if defined(HAS_MISSING_LANGINFO_ITEM_)
6318
6319 /* If the above didn't find the category's index, it has to be because the
6320 * item is unknown to us (and the callee will handle that), or the category
6321 * is confined to the "C" locale on this platform, which the callee also
6322 * handles. (LC_MESSAGES is not required by the C Standard (the others
6323 * above are), so we have to emulate it on platforms lacking it (such as
6324 * Windows).) */
6325 if (cat_index == LC_ALL_INDEX_) {
6326 return emulate_langinfo(item, "C", sv, utf8ness);
6327 }
6328
6329 # endif
6330
6331 /* And get the value for this 'item', whose category has now been
6332 * calculated. We need to find the current corresponding locale, and pass
6333 * that as well. */
6334 return langinfo_sv_i(item, cat_index,
6335 query_nominal_locale_i(cat_index),
6336 sv, utf8ness);
6337 }
6338
6339 #endif
6340 #if defined(USE_LOCALE) && defined(HAS_NL_LANGINFO)
6341
6342 STATIC const char *
S_langinfo_sv_i(pTHX_ const nl_item item,locale_category_index cat_index,const char * locale,SV * sv,utf8ness_t * utf8ness)6343 S_langinfo_sv_i(pTHX_
6344 const nl_item item, /* The item to look up */
6345
6346 /* The locale category that controls it */
6347 locale_category_index cat_index,
6348
6349 /* The locale to look up 'item' in. */
6350 const char * locale,
6351
6352 /* The SV to store the result in; see below */
6353 SV * sv,
6354
6355 /* If not NULL, the location to store the UTF8-ness of 'item's
6356 * value, as documented */
6357 utf8ness_t * utf8ness)
6358 {
6359 PERL_ARGS_ASSERT_LANGINFO_SV_I;
6360 assert(cat_index < LC_ALL_INDEX_);
6361
6362 /* This function is the interface to nl_langinfo(), returning a thread-safe
6363 * result, valid until its next call that uses the same 'sv'. Similarly,
6364 * the S_emulate_langinfo() function below does the same, when
6365 * nl_langinfo() isn't available for the desired locale, or is completely
6366 * absent from the system. It is hopefully invisible to an outside caller
6367 * as to which one of the two actually ends up processing the request.
6368 * This comment block hence generally describes the two functions as a
6369 * unit.
6370 *
6371 * The two functions both return values (using 'return' statements) and
6372 * potentially change the contents of the passed in SV 'sv'. However, in
6373 * any given call, only one of the return types is reliable.
6374 *
6375 * When the passed in SV is 'PL_scratch_langinfo', the functions make sure
6376 * that the 'return' statements return the correct value, but whatever
6377 * value is in 'PL_scratch_langinfo' should be considered garbage. When it
6378 * is any other SV, that SV will get the correct result, and the value
6379 * returned by a 'return' statement should be considered garbage.
6380 *
6381 * The reason for this is twofold:
6382 *
6383 * 1) These functions serve two masters. For most purposes when called
6384 * from within this file, the desired value is used immediately, and
6385 * then no longer required. For these, the 'return' statement values
6386 * are most convenient.
6387 *
6388 * But when the call is initiated from an external XS source, like
6389 * I18N::Langinfo, the value needs to be able to be stable for a longer
6390 * time and likely returned to Perl space. An SV return is most
6391 * convenient for these
6392 *
6393 * Further, some Configurations use these functions reentrantly. For
6394 * those, an SV must be passed.
6395 *
6396 * 2) In S_emulate_langinfo(), most langinfo items are easy or even
6397 * trivial to get. These are amenable to being returned by 'return'
6398 * statements. But others are more complex, and use the infrastructure
6399 * provided by perl's SV functions to help out.
6400 *
6401 * So for some items, it is most convenient to 'return' a simple value; for
6402 * others an SV is most convenient. And some callers want a simple value;
6403 * others want or need an SV. It would be wasteful to have an SV, convert
6404 * it to a simple value, discarding the SV, then create a new SV.
6405 *
6406 * The solution adopted here is to always pass an SV, and have a reserved
6407 * one, PL_scratch_langinfo, indicate that a 'return' is desired. That SV
6408 * is then used as scratch for the items that it is most convenient to use
6409 * an SV in calculating. Besides these two functions and initialization,
6410 * the only mention of PL_scratch_langinfo is in the expansion of a single
6411 * macro that is used by the code in this file that desires a non-SV return
6412 * value.
6413 *
6414 * A wart of this interface is that to get the UTF-8 flag of the passed-in
6415 * SV set, you have to also pass a non-null 'utf8ness' parameter. This is
6416 * entirely to prevent the extra expense of calculating UTF-8ness when the
6417 * caller is plain Perl_langinfo(), which doesn't care about this. If that
6418 * seems too kludgy, other mechanisms could be devised. But be aware that
6419 * the SV interface has to have a way to not calculate UTF-8ness, or else
6420 * the reentrant uses could infinitely recurse */
6421
6422 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6423 "Entering langinfo_sv_i item=%jd, using locale %s\n",
6424 (PERL_INTMAX_T) item, locale));
6425
6426 # ifdef HAS_MISSING_LANGINFO_ITEM_
6427
6428 if (! category_available[cat_index]) {
6429 return emulate_langinfo(item, locale, sv, utf8ness);
6430 }
6431
6432 # endif
6433
6434 /* One might be tempted to avoid any toggling by instead using
6435 * nl_langinfo_l() on platforms that have it. This would entail creating a
6436 * locale object with newlocale() and freeing it afterwards. But doing so
6437 * runs significantly slower than just doing the toggle ourselves.
6438 * lib/locale_threads.t was slowed down by 25% on Ubuntu 22.04 */
6439
6440 start_DEALING_WITH_MISMATCHED_CTYPE(locale);
6441
6442 const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
6443
6444 /* nl_langinfo() is supposedly thread-safe except for its return value. The
6445 * POSIX 2017 Standard states:
6446 *
6447 * "The pointer returned by nl_langinfo() might be invalidated or the string
6448 * content might be overwritten by a subsequent call to nl_langinfo() in any
6449 * thread or to nl_langinfo_l() in the same thread or the initial thread, by
6450 * subsequent calls to setlocale() with a category corresponding to the
6451 * category of item (see <langinfo.h>) or the category LC_ALL, or by
6452 * subsequent calls to uselocale() which change the category corresponding
6453 * to the category of item."
6454 *
6455 * The implications of this are:
6456 * a) Threaded: nl_langinfo()'s return must be saved in a critical section
6457 * to avoid having another thread's call to it destroying the
6458 * result. That means that the whole call to nl_langinfo()
6459 * plus the save must be done in a critical section.
6460 * b) Unthreaded: No critical section is needed (accomplished by having the
6461 * locks below be no-ops in this case). But any subsequent
6462 * setlocale() or uselocale() could still destroy it.
6463 * Note that before returning, this function restores any
6464 * toggled locale categories. These could easily end up
6465 * calling uselocale() or setlocale(), destroying our
6466 * result. (And in some Configurations, this file currently
6467 * calls nl_langinfo_l() to determine if a uselocale() is
6468 * needed.) So, a copy of the result is made in this case as
6469 * well.
6470 */
6471 const char * retval = NULL;
6472 utf8ness_t is_utf8 = UTF8NESS_UNKNOWN;
6473
6474 /* Do a bit of extra work so avoid
6475 * switch() { default: ... }
6476 * where the only case in it is the default: */
6477 # if defined(USE_LOCALE_PAPER) \
6478 || defined(USE_LOCALE_MEASUREMENT) \
6479 || defined(USE_LOCALE_ADDRESS)
6480 # define IS_SWITCH 1
6481 # define MAYBE_SWITCH(n) switch(n)
6482 # else
6483 # define IS_SWITCH 0
6484 # define MAYBE_SWITCH(n)
6485 # endif
6486
6487 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
6488
6489 MAYBE_SWITCH(item) {
6490
6491 # if defined(USE_LOCALE_MEASUREMENT)
6492
6493 case _NL_MEASUREMENT_MEASUREMENT:
6494 {
6495 /* An ugly API; only the first byte of the returned char* address means
6496 * anything */
6497 gwLOCALE_LOCK;
6498 char char_value = nl_langinfo(item)[0];
6499 gwLOCALE_UNLOCK;
6500
6501 sv_setuv(sv, char_value);
6502 }
6503
6504 goto non_string_common;
6505
6506 # endif
6507 # if defined(USE_LOCALE_ADDRESS) || defined(USE_LOCALE_PAPER)
6508 # if defined(USE_LOCALE_ADDRESS)
6509
6510 case _NL_ADDRESS_COUNTRY_NUM:
6511
6512 /* Some glibc's return random values for this item and locale;
6513 * workaround by special casing it. */
6514 if (isNAME_C_OR_POSIX(locale)) {
6515 sv_setuv(sv, 0);
6516 goto non_string_common;
6517 }
6518
6519 /* FALLTHROUGH */
6520
6521 # endif
6522 # if defined(USE_LOCALE_PAPER)
6523
6524 case _NL_PAPER_HEIGHT: case _NL_PAPER_WIDTH:
6525
6526 # endif
6527
6528 { /* A slightly less ugly API; the int portion of the returned char*
6529 * address is an integer. */
6530 gwLOCALE_LOCK;
6531 int int_value = (int) PTR2UV(nl_langinfo(item));
6532 gwLOCALE_UNLOCK;
6533
6534 sv_setuv(sv, int_value);
6535 }
6536
6537 # endif
6538 # if IS_SWITCH
6539 # if defined(USE_LOCALE_MEASUREMENT)
6540
6541 non_string_common:
6542
6543 # endif
6544
6545 /* In all cases that get here, the char* instead delivers a numeric
6546 * value, so its UTF-8ness is meaningless */
6547 is_utf8 = UTF8NESS_IMMATERIAL;
6548
6549 if (sv == PL_scratch_langinfo) {
6550 retval = SvPV_nomg_const_nolen(sv);
6551 }
6552
6553 break;
6554
6555 default:
6556
6557 # endif
6558
6559 /* The rest of the possibilities deliver a true char* pointer to a
6560 * string (or sequence of strings in the case of ALT_DIGITS) */
6561 gwLOCALE_LOCK;
6562
6563 retval = nl_langinfo(item);
6564 Size_t total_len = strlen(retval);
6565
6566 /* Initialized only to silence some dumber compilers warning that
6567 * might be uninitialized */
6568 char separator = ';';
6569
6570 if (UNLIKELY(item == ALT_DIGITS) && total_len > 0) {
6571
6572 /* The return from nl_langinfo(ALT_DIGITS) is specified by the
6573 * 2017 POSIX Standard as a string consisting of "semicolon-
6574 * separated symbols. The first is the alternative symbol
6575 * corresponding to zero, the second is the symbol corresponding to
6576 * one, and so on. Up to 100 alternative symbols may be
6577 * specified". Infuriatingly, Linux does not follow this, and uses
6578 * the least C-language-friendly separator possible, the NUL. In
6579 * case other platforms also violate the standard, the code below
6580 * looks for NUL and any graphic \W character as a potential
6581 * separator. */
6582 const char * sep_pos = strchr(retval, ';');
6583 if (! sep_pos) {
6584 sep_pos = strpbrk(retval, " !\"#$%&'()*+,-./:<=>?@[\\]^_`{|}~");
6585 }
6586 if (sep_pos) {
6587 separator = *sep_pos;
6588 }
6589 else if (strpbrk(retval, "123456789")) {
6590
6591 /* Alternate digits, with the possible exception of 0,
6592 * shouldn't be standard digits, so if we get any back, return
6593 * that there aren't alternate digits. 0 is an exception
6594 * because there may be locales that do not have a zero, such
6595 * as Roman numerals. It could therefore be that alt-0 is 0,
6596 * but alt-1 better be some multi-byte Unicode character(s)
6597 * like U+2160, ROMAN NUMERAL ONE. This clause is necessary
6598 * because the total length of the ASCII digits won't trigger
6599 * the conditional in the next clause that protects against
6600 * non-Standard libc returns, such as in Alpine platforms, but
6601 * multi-byte returns will trigger it */
6602 retval = "";
6603 total_len = 0;
6604 }
6605 else if (UNLIKELY(total_len >
6606 2 * UVCHR_SKIP(PERL_UNICODE_MAX) * 4))
6607 { /* But as a check against the possibility that the separator is
6608 * some other character, look at the length of the returned
6609 * string. If the separator is a NUL, the length will be just
6610 * for the first NUL-terminated segment; if it is some other
6611 * character, there is only a single segment with all returned
6612 * alternate digits, which will be quite a bit longer than just
6613 * the first one. Many locales will always have a leading zero
6614 * to represent 0-9 (hence the 2* in the conditional above).
6615 * The conditional uses the worst case value of the most number
6616 * of byte possible for a Unicode character, and it is possible
6617 * that it requires several characters to represent a single
6618 * value; hence the final multiplier. This length represents a
6619 * conservative upper limit of the number of bytes for the
6620 * alternative representation of 00, but if the string
6621 * represents even only the first 10 alternative digits, it
6622 * will be much longer than that. So to reach here, the
6623 * separator must be some other byte. */
6624 locale_panic_(Perl_form(aTHX_
6625 "Can't find separator in ALT_DIGITS"
6626 " representation '%s' for locale '%s'",
6627 _byte_dump_string((U8 *) retval,
6628 total_len, 0),
6629 locale));
6630 }
6631 else {
6632 separator = '\0';
6633
6634 /* Must be using NUL to separate the digits. There are up to
6635 * 100 of them. Find the length of the entire sequence.
6636 *
6637 * The only way it could work if fewer is if it ends in two
6638 * NULs. khw has seen cases where there is no 2nd NUL on a 100
6639 * digit return. */
6640 const char * s = retval + total_len + 1;
6641
6642 for (unsigned int i = 1; i <= 99; i++) {
6643 Size_t len = strlen(s) + 1;
6644 total_len += len;
6645
6646 if (len == 1) { /* Only a NUL */
6647 break;
6648 }
6649
6650 s += len;
6651 }
6652 }
6653 }
6654
6655 sv_setpvn(sv, retval, total_len);
6656
6657 gwLOCALE_UNLOCK;
6658
6659 /* Convert the ALT_DIGITS separator to a semi-colon if not already */
6660 if (UNLIKELY(item == ALT_DIGITS) && total_len > 0 && separator != ';') {
6661
6662 /* Operate directly on the string in the SV */
6663 char * digit_string = SvPVX(sv);
6664 char * s = digit_string;
6665 char * e = s + total_len;
6666
6667 do {
6668 char * this_end = (char *) memchr(s, separator, total_len);
6669 if (! this_end || this_end >= e) {
6670 break;
6671 }
6672
6673 *this_end = ';';
6674 s = this_end;
6675 } while (1);
6676 }
6677
6678 SvUTF8_off(sv);
6679 retval = SvPV_nomg_const_nolen(sv);
6680 }
6681
6682 GCC_DIAG_RESTORE_STMT;
6683
6684 restore_toggled_locale_i(cat_index, orig_switched_locale);
6685 end_DEALING_WITH_MISMATCHED_CTYPE(locale);
6686
6687 if (utf8ness) {
6688 if (LIKELY(is_utf8 == UTF8NESS_UNKNOWN)) { /* default: case above */
6689 is_utf8 = get_locale_string_utf8ness_i(retval,
6690 LOCALE_UTF8NESS_UNKNOWN,
6691 locale, cat_index);
6692 }
6693
6694 *utf8ness = is_utf8;
6695
6696 if (*utf8ness == UTF8NESS_YES) {
6697 SvUTF8_on(sv);
6698 }
6699 }
6700
6701 return retval;
6702 }
6703
6704 # undef IS_SWITCH
6705 # undef MAYBE_SWITCH
6706 #endif
6707 #ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
6708
6709 /* Forward declaration of function that we don't put into embed.fnc so as to
6710 * make its removal easier, as there may not be any extant platforms that need
6711 * it; and the function is located after emulate_langinfo() because it's easier
6712 * to understand when placed in the context of that code */
6713 STATIC bool
6714 S_maybe_override_codeset(pTHX_ const char * codeset,
6715 const char * locale,
6716 const char ** new_codeset);
6717 #endif
6718 #if ! defined(HAS_NL_LANGINFO) || defined(HAS_MISSING_LANGINFO_ITEM_)
6719
6720 STATIC const char *
S_emulate_langinfo(pTHX_ const PERL_INTMAX_T item,const char * locale,SV * sv,utf8ness_t * utf8ness)6721 S_emulate_langinfo(pTHX_ const PERL_INTMAX_T item,
6722 const char * locale,
6723 SV * sv,
6724 utf8ness_t * utf8ness)
6725 {
6726 PERL_ARGS_ASSERT_EMULATE_LANGINFO;
6727 PERL_UNUSED_ARG(locale); /* Too complicated to specify which
6728 Configurations use this vs which don't */
6729
6730 /* This emulates nl_langinfo() on platforms:
6731 * 1) where it doesn't exist; or
6732 * 2) where it does exist, but there are categories that it shouldn't be
6733 * called on because they don't exist on the platform or we are
6734 * supposed to always stay in the C locale for them. This function
6735 * has hard-coded in the results for those for the C locale.
6736 *
6737 * This function returns a thread-safe result, valid until its next call
6738 * that uses the same 'sv'. Similarly, the S_langinfo_sv_i() function
6739 * above does the same when nl_langinfo() is available. Its comments
6740 * include a general description of the interface for both it and this
6741 * function. That function should be the one called by code outside this
6742 * little group. If it can't handle the request, it gets handed off to
6743 * this function.
6744 *
6745 * The major platform lacking nl_langinfo() is Windows. It does have
6746 * GetLocaleInfoEx() that could be used to get most of the items, but it
6747 * (and other similar Windows API functions) use what MS calls "locale
6748 * names", whereas the C functions use what MS calls "locale strings". The
6749 * locale string "English_United_States.1252" is equivalent to the locale
6750 * name "en_US". There are tables inside Windows that translate between
6751 * the two forms, but they are not exposed. Also calling setlocale(), then
6752 * calling GetThreadLocale() doesn't work, as the former doesn't change the
6753 * latter's return. Therefore we are stuck using the mechanisms below. */
6754
6755 /* Almost all the items will have ASCII return values. Set that here, and
6756 * override if necessary */
6757 utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
6758 const char * retval = NULL;
6759
6760 /* This function returns its result either by returning the calculated
6761 * value 'retval' if the 'sv' argument is PL_scratch_langinfo; or for any
6762 * other value of 'sv', it places the result into that 'sv'. For some
6763 * paths through the code, it is more convenient, in the moment, to use one
6764 * or the other to hold the calculated result. And, the calculation could
6765 * end up with the value in both places. At the end, if the caller
6766 * wants the convenient result, we are done; but if it wants the opposite
6767 * type of value, it must be converted. These macros are used to tell the
6768 * code at the end where the value got placed. */
6769 # define RETVAL_IN_retval -1
6770 # define RETVAL_IN_BOTH 0
6771 # define RETVAL_IN_sv 1
6772 # define isRETVAL_IN_sv(type) ((type) >= RETVAL_IN_BOTH)
6773 # define isRETVAL_IN_retval(type) ((type) <= RETVAL_IN_BOTH)
6774
6775 /* Most calculations place the result in 'retval', so initialize to that,
6776 * and override if necessary */
6777 int retval_type = RETVAL_IN_retval;
6778
6779 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6780 "Entering emulate_langinfo item=%jd, using locale %s\n",
6781 item, locale));
6782
6783 # if defined(HAS_LOCALECONV) \
6784 && ! defined(HAS_SOME_LANGINFO) \
6785 && (defined(USE_LOCALE_NUMERIC) || defined(USE_LOCALE_MONETARY))
6786
6787 locale_category_index cat_index;
6788 const char * localeconv_key;
6789 I32 localeconv_klen;
6790
6791 # endif
6792
6793 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
6794
6795 switch (item) {
6796
6797 # if ! defined(HAS_SOME_LANGINFO) || ! LC_MESSAGES_AVAIL_
6798
6799 /* The following items have no way khw could figure out how to get except
6800 * via nl_langinfo() */
6801 case YESEXPR: retval = "^[+1yY]"; break;
6802 case YESSTR: retval = "yes"; break;
6803 case NOEXPR: retval = "^[-0nN]"; break;
6804 case NOSTR: retval = "no"; break;
6805
6806 # endif
6807 # if ! defined(HAS_SOME_LANGINFO) || ! LC_MONETARY_AVAIL_
6808 # if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
6809 # define NEED_USE_LOCALECONV
6810
6811 case CRNCYSTR:
6812 cat_index = LC_MONETARY_INDEX_;
6813 localeconv_key = CURRENCY_SYMBOL_LITERAL;
6814 localeconv_klen = STRLENs(CURRENCY_SYMBOL_LITERAL);
6815 goto use_localeconv;
6816
6817 # else
6818
6819 case CRNCYSTR:
6820
6821 /* The locale's currency symbol may be empty. But if not, the return
6822 * from nl_langinfo() prefixes it with a character that indicates where
6823 * in the monetary value the symbol is to be placed
6824 * a) before, like $9.99
6825 * b) middle, rare, but would like be 9$99
6826 * c) after, like 9.99USD
6827 *
6828 * The POSIX Standard permits an implementation to choose whether or
6829 * not to omit the prefix character if the symbol is empty (the
6830 * placement position is meaningless if there is nothing to place).
6831 * glibc has chosen to always prefix an empty symbol by a minus (which
6832 * is the prefix for 'before' positioning). FreeBSD has chosen to
6833 * return an empty string for an empty symbol. Perl has always
6834 * emulated the glibc way (probably with little thought). */
6835 retval = "-";
6836 break;
6837
6838 # endif
6839 # endif
6840 # if ! defined(HAS_SOME_LANGINFO) || ! LC_NUMERIC_AVAIL_
6841 # if defined(USE_LOCALE_NUMERIC) && defined(HAS_LOCALECONV)
6842 # define NEED_USE_LOCALECONV
6843
6844 case THOUSEP:
6845 cat_index = LC_NUMERIC_INDEX_;
6846 localeconv_key = THOUSANDS_SEP_LITERAL;
6847 localeconv_klen = STRLENs(THOUSANDS_SEP_LITERAL);
6848 goto use_localeconv;
6849
6850 # else
6851
6852 case THOUSEP:
6853 retval = C_thousands_sep;
6854 break;
6855
6856 # endif
6857
6858 case RADIXCHAR:
6859
6860 # if defined(USE_LOCALE_NUMERIC) && defined(HAS_STRTOD)
6861
6862 {
6863 /* khw knows of only three possible radix characters used in the world.
6864 * By far the two most common are comma and dot. We can use strtod()
6865 * to quickly check for those without without much fuss. If it is
6866 * something other than those two, the code drops down and lets
6867 * localeconv() find it.
6868 *
6869 * We don't have to toggle LC_CTYPE here because all locales Perl
6870 * supports are compatible with ASCII, which the two possibilities are.
6871 * */
6872 const char * orig_switched_locale = toggle_locale_c(LC_NUMERIC, locale);
6873
6874 /* Comma tried first in case strtod() always accepts dot regardless of
6875 * the locale */
6876 if (strtod("1,5", NULL) > 1.4) {
6877 retval = ",";
6878 }
6879 else if (strtod("1.5", NULL) > 1.4) {
6880 retval = ".";
6881 }
6882 else {
6883 retval = NULL;
6884 }
6885
6886 restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
6887
6888 if (retval) {
6889 break;
6890 }
6891 }
6892
6893 # endif /* Trying strtod() */
6894
6895 /* If gets to here, the strtod() method wasn't compiled, or it failed;
6896 * drop down.
6897 *
6898 * (snprintf() used to be used instead of strtod(), but it was removed
6899 * as being somewhat more clumsy, and maybe non-conforming on some
6900 * platforms. But before resorting to localeconv(), the code that was
6901 * removed by the strtod commit could be inserted here. This seems
6902 * unlikely to be wanted unless some really broken localeconv() shows
6903 * up) */
6904
6905 # if ! defined(USE_LOCALE_NUMERIC) || ! defined(HAS_LOCALECONV)
6906
6907 retval = C_decimal_point;
6908 break;
6909
6910 # else
6911 # define NEED_USE_LOCALECONV
6912
6913 cat_index = LC_NUMERIC_INDEX_;
6914 localeconv_key = DECIMAL_POINT_LITERAL;
6915 localeconv_klen = STRLENs(DECIMAL_POINT_LITERAL);
6916
6917 # endif
6918 # endif
6919 # ifdef NEED_USE_LOCALECONV
6920
6921 /* These items are available from localeconv(). */
6922
6923 /* case RADIXCHAR: // May drop down to here in some configurations
6924 case THOUSEP: // Jumps to here
6925 case CRNCYSTR: // Jumps to here */
6926 use_localeconv:
6927 {
6928
6929 /* The hash gets populated with just the field(s) related to 'item'. */
6930 HV * result_hv = my_localeconv(item);
6931 SV* string = hv_delete(result_hv, localeconv_key, localeconv_klen, 0);
6932
6933 # ifdef USE_LOCALE_MONETARY
6934
6935 if (item == CRNCYSTR) {
6936
6937 /* CRNCYSTR localeconv() returns a slightly different value
6938 * than the nl_langinfo() API calls for, so have to modify this one
6939 * to conform. We need another value from localeconv() to know
6940 * what to change it to. my_localeconv() has populated the hash
6941 * with exactly both fields. */
6942 SV* precedes = hv_deletes(result_hv, P_CS_PRECEDES_LITERAL, 0);
6943 if (! precedes) {
6944 locale_panic_("my_localeconv() unexpectedly didn't return"
6945 " a value for " P_CS_PRECEDES_LITERAL);
6946 }
6947
6948 /* The modification is to prefix the localeconv() return with a
6949 * single byte, calculated as follows: */
6950 const char * prefix = (LIKELY(SvIV(precedes) != -1))
6951 ? ((precedes != 0) ? "-" : "+")
6952 : ".";
6953 /* (khw couldn't find any documentation that the dot is signalled
6954 * by CHAR_MAX (which we modify to -1), but cygwin uses it thusly,
6955 * and it makes sense given that CHAR_MAX indicates the value isn't
6956 * used, so it neither precedes nor succeeds) */
6957
6958 /* Perform the modification */
6959 sv_insert(string, 0, 0, prefix, 1);
6960 }
6961
6962 # endif
6963
6964 /* Here, 'string' contains the value we want to return, and the
6965 * hv_delete() has left it mortalized so its PV may be reused instead of
6966 * copied */
6967 sv_setsv_nomg(sv, string);
6968 retval_type = RETVAL_IN_sv;
6969
6970 if (utf8ness) {
6971 is_utf8 = get_locale_string_utf8ness_i(SvPVX(sv),
6972 LOCALE_UTF8NESS_UNKNOWN,
6973 locale,
6974 cat_index);
6975 }
6976
6977 SvREFCNT_dec_NN(result_hv);
6978 break;
6979 }
6980
6981 # endif /* Using localeconv() for something or other */
6982 # undef NEED_USE_LOCALECONV
6983 # if ! defined(HAS_SOME_LANGINFO) || ! LC_CTYPE_AVAIL_
6984 # ifndef USE_LOCALE_CTYPE
6985
6986 case CODESET:
6987 retval = C_codeset;
6988 break;
6989
6990 # else
6991
6992 case CODESET:
6993
6994 /* The trivial case */
6995 if (isNAME_C_OR_POSIX(locale)) {
6996 retval = C_codeset;
6997 break;
6998 }
6999
7000 /* If this happens to match our cached value */
7001 if (PL_in_utf8_CTYPE_locale && strEQ(locale, PL_ctype_name)) {
7002 retval = "UTF-8";
7003 break;
7004 }
7005
7006 # ifdef WIN32
7007 # ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
7008 # define CODE_PAGE_FORMAT "%s"
7009 # define CODE_PAGE_FUNCTION nl_langinfo(CODESET)
7010 # else
7011 # define CODE_PAGE_FORMAT "%d"
7012
7013 /* This Windows function retrieves the code page. It is subject to
7014 * change, but is documented, and has been stable for many releases */
7015 # define CODE_PAGE_FUNCTION ___lc_codepage_func()
7016 # endif
7017
7018 const char * orig_CTYPE_locale;
7019 orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
7020 Perl_sv_setpvf(aTHX_ sv, CODE_PAGE_FORMAT, CODE_PAGE_FUNCTION);
7021 retval_type = RETVAL_IN_sv;
7022
7023 /* We just assume the codeset is ASCII; no need to check for it being
7024 * UTF-8 */
7025 SvUTF8_off(sv);
7026
7027 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7028
7029 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
7030 locale, SvPVX(sv)));
7031 break;
7032
7033 # else /* Below is ! Win32 */
7034
7035 /* The codeset is important, but khw did not figure out a way for it to
7036 * be retrieved on non-Windows boxes without nl_langinfo(). But even
7037 * if we can't get it directly, we can usually determine if it is a
7038 * UTF-8 locale or not. If it is UTF-8, we (correctly) use that for
7039 * the code set. */
7040
7041 # ifdef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
7042
7043 if (is_locale_utf8(locale)) {
7044 retval = "UTF-8";
7045 break;
7046 }
7047
7048 # endif
7049
7050 /* Here, the code set has not been found. The only other option khw
7051 * could think of is to see if the codeset is part of the locale name.
7052 * This is very less than ideal; often there is no code set in the
7053 * name; and at other times they even lie.
7054 *
7055 * But there is an XPG standard syntax, which many locales follow:
7056 *
7057 * language[_territory[.codeset]][@modifier]
7058 *
7059 * So we take the part between the dot and any '@' */
7060 const char * name;
7061 name = strchr(locale, '.');
7062 if (! name) {
7063 retval = ""; /* Alas, no dot */
7064 }
7065 else {
7066
7067 /* Don't include the dot */
7068 name++;
7069
7070 /* The code set name is considered to be everything between the dot
7071 * and any '@', so stop before any '@' */
7072 const char * modifier = strchr(name, '@');
7073 if (modifier) {
7074 sv_setpvn(sv, name, modifier - name);
7075 }
7076 else {
7077 sv_setpv(sv, name);
7078 }
7079 SvUTF8_off(sv);
7080
7081 retval_type = RETVAL_IN_sv;
7082 }
7083
7084 # ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
7085
7086 /* Here, 'retval' contains any codeset name derived from the locale
7087 * name. That derived name may be empty or not necessarily indicative
7088 * of the real codeset. But we can often determine if it should be
7089 * UTF-8, regardless of what the name is. On most platforms, that
7090 * determination is definitive, and was already done. But for this
7091 * code to be compiled, this platform is not one of them. However,
7092 * there are typically tools available to make a very good guess, and
7093 * knowing the derived codeset name improves the quality of that guess.
7094 * The following function overrides the derived codeset name when it
7095 * guesses that it actually should be UTF-8. It could be inlined here,
7096 * but was moved out of this switch() so as to make the switch()
7097 * control flow easier to follow */
7098 if (isRETVAL_IN_sv(retval_type)) {
7099 retval = SvPVX_const(sv);
7100 retval_type = RETVAL_IN_BOTH;
7101 }
7102
7103 if (S_maybe_override_codeset(aTHX_ retval, locale, &retval)) {
7104 retval_type = RETVAL_IN_retval;
7105 }
7106
7107 # endif
7108
7109 break;
7110
7111 # endif /* ! WIN32 */
7112 # endif /* USE_LOCALE_CTYPE */
7113 # endif
7114
7115 /* The _NL_foo items are mostly empty; the rest are copied from Ubuntu C
7116 * locale values. khw fairly arbitrarily decided which of its non-empty
7117 * values to copy and which to change to empty. All the numeric ones needed
7118 * some value */
7119
7120 # if ! defined(HAS_SOME_LANGINFO) || ! LC_ADDRESS_AVAIL_
7121
7122 case _NL_ADDRESS_POSTAL_FMT:
7123 case _NL_ADDRESS_COUNTRY_NAME:
7124 case _NL_ADDRESS_COUNTRY_POST:
7125 case _NL_ADDRESS_COUNTRY_AB2:
7126 case _NL_ADDRESS_COUNTRY_AB3:
7127 case _NL_ADDRESS_COUNTRY_CAR:
7128 case _NL_ADDRESS_COUNTRY_ISBN:
7129 case _NL_ADDRESS_LANG_NAME:
7130 case _NL_ADDRESS_LANG_AB:
7131 case _NL_ADDRESS_LANG_TERM:
7132 case _NL_ADDRESS_LANG_LIB:
7133 retval = "";
7134 break;
7135
7136 case _NL_ADDRESS_COUNTRY_NUM:
7137 sv_setuv(sv, 0);
7138 retval_type = RETVAL_IN_sv;
7139 break;
7140
7141 # endif
7142 # if ! defined(HAS_SOME_LANGINFO) || ! LC_IDENTIFICATION_AVAIL_
7143
7144 case _NL_IDENTIFICATION_ADDRESS:
7145 case _NL_IDENTIFICATION_CONTACT:
7146 case _NL_IDENTIFICATION_EMAIL:
7147 case _NL_IDENTIFICATION_TEL:
7148 case _NL_IDENTIFICATION_FAX:
7149 case _NL_IDENTIFICATION_LANGUAGE:
7150 case _NL_IDENTIFICATION_AUDIENCE:
7151 case _NL_IDENTIFICATION_APPLICATION:
7152 case _NL_IDENTIFICATION_ABBREVIATION:
7153 retval = "";
7154 break;
7155
7156 case _NL_IDENTIFICATION_DATE: retval = "1997-12-20"; break;
7157 case _NL_IDENTIFICATION_REVISION: retval = "1.0"; break;
7158 case _NL_IDENTIFICATION_CATEGORY: retval = "i18n:1999"; break;
7159 case _NL_IDENTIFICATION_TERRITORY:retval = "ISO"; break;
7160
7161 case _NL_IDENTIFICATION_TITLE:
7162 retval = "ISO/IEC 14652 i18n FDCC-set";
7163 break;
7164
7165 case _NL_IDENTIFICATION_SOURCE:
7166 retval = "ISO/IEC JTC1/SC22/WG20 - internationalization";
7167 break;
7168
7169 # endif
7170 # if ! defined(HAS_SOME_LANGINFO) || ! LC_MEASUREMENT_AVAIL_
7171
7172 case _NL_MEASUREMENT_MEASUREMENT:
7173 sv_setuv(sv, 1);
7174 retval_type = RETVAL_IN_sv;
7175 break;
7176
7177 # endif
7178 # if ! defined(HAS_SOME_LANGINFO) || ! LC_NAME_AVAIL_
7179
7180 case _NL_NAME_NAME_FMT:
7181 case _NL_NAME_NAME_GEN:
7182 case _NL_NAME_NAME_MR:
7183 case _NL_NAME_NAME_MRS:
7184 case _NL_NAME_NAME_MISS:
7185 case _NL_NAME_NAME_MS:
7186 retval = "";
7187 break;
7188
7189 # endif
7190 # if ! defined(HAS_SOME_LANGINFO) || ! LC_PAPER_AVAIL_
7191
7192 case _NL_PAPER_HEIGHT:
7193 sv_setuv(sv, 297);
7194 retval_type = RETVAL_IN_sv;
7195 break;
7196
7197 case _NL_PAPER_WIDTH:
7198 sv_setuv(sv, 210);
7199 retval_type = RETVAL_IN_sv;
7200 break;
7201
7202 # endif
7203 # if ! defined(HAS_SOME_LANGINFO) || ! LC_TELEPHONE_AVAIL_
7204
7205 case _NL_TELEPHONE_INT_SELECT:
7206 case _NL_TELEPHONE_INT_PREFIX:
7207 case _NL_TELEPHONE_TEL_DOM_FMT:
7208 retval = "";
7209 break;
7210
7211 case _NL_TELEPHONE_TEL_INT_FMT:
7212 retval = "+%c %a %l";
7213 break;
7214
7215 # endif
7216
7217 /* When we have to emulate TIME-related items, this bit of code is compiled
7218 * to have the default: case be a nested switch() which distinguishes
7219 * between legal inputs and unknown ones. This bit does initialization and
7220 * then at the end calls switch(). But when we aren't emulating TIME, by
7221 * the time we get to here all legal inputs have been handled above, and it
7222 * is cleaner to not have a nested switch(). So this bit of code is skipped
7223 * and the other-wise nested default: case is compiled as part of the outer
7224 * (and actually only) switch() */
7225 # if ! defined(HAS_SOME_LANGINFO) || ! LC_TIME_AVAIL_
7226
7227 default: /* Anything else that is legal is LC_TIME-related */
7228 {
7229
7230 const char * format = NULL;
7231 retval = NULL;
7232
7233 # ifdef HAS_STRFTIME
7234
7235 bool return_format = FALSE;
7236
7237 /* Without strftime(), default compiled-in values are returned.
7238 * Otherwise, we generally compute a date as explained below.
7239 * Initialize default values for that computation */
7240 int mon = 0;
7241 int mday = 1;
7242 int hour = 6;
7243
7244 # endif
7245
7246 /* Nested switch for LC_TIME items, plus the default: case is for
7247 * unknown items */
7248 switch (item) {
7249
7250 # endif /* ! defined(HAS_SOME_LANGINFO) || ! LC_TIME_AVAIL_ */
7251
7252 default:
7253
7254 /* On systems with langinfo.h, 'item' is an enum. If we don't
7255 * handle one of those, the code needs to change to be able to do
7256 * so. But otherwise, the parameter can be any int, and so could
7257 * be a garbage value and all we can do is to return that it is
7258 * invalid. */;
7259 # if defined(I_LANGINFO)
7260
7261 Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %jd",
7262 item);
7263
7264 # else
7265 assert(item < 0); /* Make sure using perl_langinfo.h */
7266 SET_EINVAL;
7267 retval = "";
7268 break;
7269 # endif
7270
7271 /* Back to the nested switch() */
7272 # if ! defined(HAS_SOME_LANGINFO) || ! LC_TIME_AVAIL_
7273
7274 /* The case: statments in this switch are all for LC_TIME related
7275 * values. There are four types of values returned. One type is
7276 * "Give me the name in this locale of the 3rd month of the year"
7277 * (March in an English locale). The second main type is "Give me
7278 * the best format string understood by strftime(), like '%c', for
7279 * formatting the date and time in this locale." The other two
7280 * types are for ERA and ALT_DIGITS, and are explained at the case
7281 * statements for them.
7282 *
7283 * For the first type, suppose we want to find the name of the 3rd
7284 * month of the year. We pass a date/time to strftime() that is
7285 * known to evaluate to sometime in March, along with a format that
7286 * tells strftime() to return the month's name. We then return
7287 * that to our caller. Similarly for the names of the days of the
7288 * week, like "Tuesday". There are also abbreviated versions for
7289 * each of these.
7290 *
7291 * To implement the second type (returning to the caller a string
7292 * containing a format suitable for passing to strftime() ) we
7293 * guess a format, pass that to strftime, and examine its return to
7294 * see if that format is known on this platform. If so, we return
7295 * that guess. Otherwise we return the empty string "". There are
7296 * no second guesses, as there don't seem to be alternatives
7297 * lurking out there. For some formats that are supposed to be
7298 * known to all strftime()s since C89, we just assume that they are
7299 * valid, not bothering to check. The guesses may not be the best
7300 * available for this locale on this platform, but should be good
7301 * enough, so that a native speaker would find them understandable.
7302 * */
7303
7304 /* Unimplemented by perl; for use with strftime() %E modifier */
7305 case ERA: retval = ""; break;
7306
7307 # if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7308
7309 case AM_STR: retval = "AM"; break;
7310 case PM_STR: retval = "PM"; break;
7311 # else
7312 case PM_STR: hour = 18;
7313 case AM_STR:
7314 format = "%p";
7315 break;
7316 # endif
7317 # if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7318
7319 case ABDAY_1: retval = "Sun"; break;
7320 case ABDAY_2: retval = "Mon"; break;
7321 case ABDAY_3: retval = "Tue"; break;
7322 case ABDAY_4: retval = "Wed"; break;
7323 case ABDAY_5: retval = "Thu"; break;
7324 case ABDAY_6: retval = "Fri"; break;
7325 case ABDAY_7: retval = "Sat"; break;
7326 # else
7327 case ABDAY_7: mday++;
7328 case ABDAY_6: mday++;
7329 case ABDAY_5: mday++;
7330 case ABDAY_4: mday++;
7331 case ABDAY_3: mday++;
7332 case ABDAY_2: mday++;
7333 case ABDAY_1:
7334 format = "%a";
7335 break;
7336 # endif
7337 # if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7338
7339 case DAY_1: retval = "Sunday"; break;
7340 case DAY_2: retval = "Monday"; break;
7341 case DAY_3: retval = "Tuesday"; break;
7342 case DAY_4: retval = "Wednesday"; break;
7343 case DAY_5: retval = "Thursday"; break;
7344 case DAY_6: retval = "Friday"; break;
7345 case DAY_7: retval = "Saturday"; break;
7346 # else
7347 case DAY_7: mday++;
7348 case DAY_6: mday++;
7349 case DAY_5: mday++;
7350 case DAY_4: mday++;
7351 case DAY_3: mday++;
7352 case DAY_2: mday++;
7353 case DAY_1:
7354 format = "%A";
7355 break;
7356 # endif
7357 # if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7358 case ABMON_1: retval = "Jan"; break;
7359 case ABMON_2: retval = "Feb"; break;
7360 case ABMON_3: retval = "Mar"; break;
7361 case ABMON_4: retval = "Apr"; break;
7362 case ABMON_5: retval = "May"; break;
7363 case ABMON_6: retval = "Jun"; break;
7364 case ABMON_7: retval = "Jul"; break;
7365 case ABMON_8: retval = "Aug"; break;
7366 case ABMON_9: retval = "Sep"; break;
7367 case ABMON_10: retval = "Oct"; break;
7368 case ABMON_11: retval = "Nov"; break;
7369 case ABMON_12: retval = "Dec"; break;
7370 # else
7371 case ABMON_12: mon++;
7372 case ABMON_11: mon++;
7373 case ABMON_10: mon++;
7374 case ABMON_9: mon++;
7375 case ABMON_8: mon++;
7376 case ABMON_7: mon++;
7377 case ABMON_6: mon++;
7378 case ABMON_5: mon++;
7379 case ABMON_4: mon++;
7380 case ABMON_3: mon++;
7381 case ABMON_2: mon++;
7382 case ABMON_1:
7383 format = "%b";
7384 break;
7385 # endif
7386 # if ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7387
7388 case MON_1: retval = "January"; break;
7389 case MON_2: retval = "February"; break;
7390 case MON_3: retval = "March"; break;
7391 case MON_4: retval = "April"; break;
7392 case MON_5: retval = "May"; break;
7393 case MON_6: retval = "June"; break;
7394 case MON_7: retval = "July"; break;
7395 case MON_8: retval = "August"; break;
7396 case MON_9: retval = "September";break;
7397 case MON_10: retval = "October"; break;
7398 case MON_11: retval = "November"; break;
7399 case MON_12: retval = "December"; break;
7400 # else
7401 case MON_12: mon++;
7402 case MON_11: mon++;
7403 case MON_10: mon++;
7404 case MON_9: mon++;
7405 case MON_8: mon++;
7406 case MON_7: mon++;
7407 case MON_6: mon++;
7408 case MON_5: mon++;
7409 case MON_4: mon++;
7410 case MON_3: mon++;
7411 case MON_2: mon++;
7412 case MON_1:
7413 format = "%B";
7414 break;
7415 # endif
7416 # ifndef HAS_STRFTIME
7417
7418 /* If no strftime() on this system, no format will be recognized, so
7419 * return empty */
7420 case D_FMT: case T_FMT: case D_T_FMT:
7421 case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT:
7422 case T_FMT_AMPM:
7423 retval = "";
7424 break;
7425 # else
7426 /* These strftime formats are defined by C89, so we assume that
7427 * strftime supports them, and so are returned unconditionally; they
7428 * may not be what the locale actually says, but should give good
7429 * enough results for someone using them as formats (as opposed to
7430 * trying to parse them to figure out what the locale says). The
7431 * other format items are actually tested to verify they work on the
7432 * platform */
7433 case D_FMT: retval = "%x"; break;
7434 case T_FMT: retval = "%X"; break;
7435 case D_T_FMT: retval = "%c"; break;
7436
7437 /* This format isn't in C89; test that it actually works on the
7438 * platform */
7439 case T_FMT_AMPM:
7440 format = "%r";
7441 return_format = TRUE;
7442 break;
7443
7444 # if defined(WIN32) || ! defined(USE_LOCALE_TIME)
7445
7446 /* strftime() on Windows doesn't have the POSIX (beyond C89)
7447 * extensions that would allow it to recover these, so use the plain
7448 * non-ERA formats. Also, when LC_TIME is constrained to the C
7449 * locale, the %E modifier is useless, so don't return it. */
7450 case ERA_D_FMT: retval = "%x"; break;
7451 case ERA_T_FMT: retval = "%X"; break;
7452 case ERA_D_T_FMT: retval = "%c"; break;
7453 # else
7454 case ERA_D_FMT:
7455 format = "%Ex";
7456 return_format = TRUE; /* Test that this works on the platform */
7457 break;
7458
7459 case ERA_T_FMT:
7460 format = "%EX";
7461 return_format = TRUE;
7462 break;
7463
7464 case ERA_D_T_FMT:
7465 format = "%Ec";
7466 return_format = TRUE;
7467 break;
7468 # endif
7469 # endif
7470 # if defined(WIN32) || ! defined(USE_LOCALE_TIME) || ! defined(HAS_STRFTIME)
7471
7472 case ALT_DIGITS: retval = ""; break;
7473 # else
7474 # define CAN_BE_ALT_DIGITS
7475
7476 case ALT_DIGITS:
7477 format = "%Ow"; /* Find the alternate digit for 0 */
7478 break;
7479 # endif
7480
7481 } /* End of inner switch() */
7482
7483 /* The inner switch() above has set 'retval' iff that is the final
7484 * answer */
7485 if (retval) {
7486 break;
7487 }
7488
7489 /* And it hasn't set 'format' iff it can't figure out a good value on
7490 * this platform. */
7491 if (! format) {
7492 retval = "";
7493 break;
7494 }
7495
7496 # ifdef HAS_STRFTIME
7497
7498 /* Here we have figured out what to call strftime() with */
7499
7500 struct tm mytm;
7501 const char * orig_TIME_locale
7502 = toggle_locale_c_unless_locking(LC_TIME, locale);
7503
7504 /* The year was deliberately chosen so that January 1 is on the
7505 * first day of the week. Since we're only getting one thing at a
7506 * time, it all works */
7507 ints_to_tm(&mytm, locale, 30, 30, hour, mday, mon, 2011, 0, 0, 0);
7508 bool succeeded;
7509 if (utf8ness) {
7510 succeeded = strftime8(format,
7511 sv,
7512 locale,
7513 &mytm,
7514
7515 /* All possible formats specified above are
7516 * entirely ASCII */
7517 UTF8NESS_IMMATERIAL,
7518
7519 &is_utf8,
7520 false /* not calling from sv_strftime */
7521 );
7522 }
7523 else {
7524 succeeded = strftime_tm(format, sv, locale, &mytm);
7525 }
7526
7527 restore_toggled_locale_c_unless_locking(LC_TIME, orig_TIME_locale);
7528
7529 if (UNLIKELY(! succeeded)) {
7530 retval = "";
7531 break;
7532 }
7533
7534 # ifdef CAN_BE_ALT_DIGITS
7535
7536 if (LIKELY(item != ALT_DIGITS))
7537
7538 # endif
7539
7540 {
7541
7542 /* If to return what strftime() returns, are done */
7543 if (! return_format) {
7544 retval_type = RETVAL_IN_sv;
7545 break;
7546 }
7547
7548 /* Here are to return the format, not the value. This is used when
7549 * we are testing if the format we expect to return is legal on
7550 * this platform. We have passed the format, say "%r, to
7551 * strftime(), and now have in 'sv' what strftime processed it
7552 * to be. But the caller doesnt't want that; it wants the actual
7553 * %r, if it is understood on this platform, and "" if it isn't.
7554 * Some strftime()s return "" for an unknown format. (None of the
7555 * formats exposed by langinfo can have "" be a legal result.)
7556 * Other strftime()s return the format unchanged if not understood.
7557 * So if we pass "%r" to strftime(), and it's illegal, we will get
7558 * back either "" or "%r", and we return "" to our caller. If the
7559 * strftime() return is anything else, we conclude that "%r" is
7560 * understood by the platform, and return "%r". */
7561 if (strEQ(SvPVX(sv), format)) {
7562 retval = "";
7563 }
7564 else {
7565 retval = format;
7566 }
7567
7568 /* A format is always in ASCII */
7569 is_utf8 = UTF8NESS_IMMATERIAL;
7570
7571 break;
7572 }
7573
7574 # ifdef CAN_BE_ALT_DIGITS
7575
7576 /* Here, the item is 'ALT_DIGITS' and 'sv' contains the zeroth
7577 * alternate digit. If empty, return that there aren't alternate
7578 * digits */
7579 Size_t alt0_len = SvCUR(sv);
7580 if (alt0_len == 0) {
7581 retval_type = RETVAL_IN_sv;
7582 break;
7583 }
7584
7585 /* ALT_DIGITS requires special handling because it requires up to 100
7586 * values. Below we generate those by using the %O modifier to
7587 * strftime() formats.
7588 *
7589 * We already have the alternate digit for zero in 'sv', generated
7590 * using the %Ow format, which was used because it seems least likely
7591 * to have a leading zero. But some locales return the equivalent of
7592 * 00 anyway. If the first half of 'sv' is identical to the second
7593 * half, assume that is the case, and use just the first half */
7594 if ((alt0_len & 1) == 0) {
7595 Size_t half_alt0_len = alt0_len / 2;
7596 if (strnEQ(SvPVX(sv), SvPVX(sv) + half_alt0_len, half_alt0_len)) {
7597 alt0_len = half_alt0_len;
7598 SvCUR_set(sv, alt0_len);
7599 }
7600 }
7601
7602 sv_catpvn_nomg (sv, ";", 1);
7603
7604 /* Many of the remaining digits have representations that include at
7605 * least two 0-sized strings */
7606 SV* alt_dig_sv = newSV(2 * alt0_len);
7607
7608 /* Various %O formats can be used to derive the alternate digits. Only
7609 * %Oy can go up to the full 100 values. If it doesn't work, we try
7610 * various fallbacks in decreasing order of how many values they can
7611 * deliver. maxes[] tells the highest value that the format applies
7612 * to; offsets[] compensates for 0-based vs 1-based indices; and vars[]
7613 * holds what field in the 'struct tm' to applies to the corresponding
7614 * format */
7615 int year, min, sec;
7616 const char * fmts[] = {"%Oy", "%OM", "%OS", "%Od", "%OH", "%Om", "%Ow" };
7617 const Size_t maxes[] = { 99, 59, 59, 31, 23, 11, 6 };
7618 const int offsets[] = { 0, 0, 0, 1, 0, 1, 0 };
7619 int * vars[] = {&year, &min, &sec, &mday, &hour, &mon, &mday };
7620 Size_t j = 0; /* Current index into the above tables */
7621
7622 orig_TIME_locale = toggle_locale_c_unless_locking(LC_TIME, locale);
7623
7624 for (unsigned int i = 1; i <= 99; i++) {
7625 struct tm mytm;
7626
7627 redo:
7628 if (j >= C_ARRAY_LENGTH(fmts)) {
7629 break; /* Exhausted formats early; can't continue */
7630 }
7631
7632 if (i > maxes[j]) {
7633 j++; /* Exhausted this format; try next one */
7634 goto redo;
7635 }
7636
7637 year = (strchr(fmts[j], 'y')) ? 1900 : 2011;
7638 hour = 0;
7639 min = 0;
7640 sec = 0;
7641 mday = 1;
7642 mon = 0;
7643
7644 /* Change the variable corresponding to this format to the
7645 * current time being run in 'i' */
7646 *(vars[j]) += i - offsets[j];
7647
7648 /* Do the strftime. Once we have determined the UTF8ness (if
7649 * we want it), assume the rest will be the same, and use
7650 * strftime_tm(), which doesn't recalculate UTF8ness */
7651 ints_to_tm(&mytm, locale, sec, min, hour, mday, mon, year, 0, 0, 0);
7652 if (utf8ness && is_utf8 != UTF8NESS_NO && is_utf8 != UTF8NESS_YES) {
7653 succeeded = strftime8(fmts[j],
7654 alt_dig_sv,
7655 locale,
7656 &mytm,
7657 UTF8NESS_IMMATERIAL,
7658 &is_utf8,
7659 false /* not calling from sv_strftime */
7660 );
7661 }
7662 else {
7663 succeeded = strftime_tm(fmts[j], alt_dig_sv, locale, &mytm);
7664 }
7665
7666 /* If didn't recognize this format, try the next */
7667 if (UNLIKELY(! succeeded)) {
7668 j++;
7669 goto redo;
7670 }
7671
7672 const char * current = SvPVX(alt_dig_sv);
7673
7674 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7675 "i=%d, format=%s, alt='%s'\n",
7676 i, fmts[j], current));
7677
7678
7679 /* If it returned regular digits, give up on this format, to try
7680 * the next candidate one */
7681 if (strpbrk(current, "0123456789")) {
7682 j++;
7683 goto redo;
7684 }
7685
7686 /* If there is a leading alternate zero, skip past it, to get the
7687 * second one in the string. The first 'alt0_len' bytes in 'sv'
7688 * will be the alternate-zero representation */
7689 if (strnEQ(current, SvPVX(sv), alt0_len)) {
7690 current += alt0_len;
7691 }
7692
7693 /* Append this number to the ongoing list, including the separator.
7694 * */
7695 sv_catpv_nomg (sv, current);
7696 sv_catpvn_nomg (sv, ";", 1);
7697 } /* End of loop generating ALT_DIGIT strings */
7698
7699 /* Above we accepted 0 for alt-0 in case the locale doesn't have a
7700 * zero, but we rejected any other ASCII digits. Now that we have
7701 * processed everything, if that 0 is the only thing we found, it was a
7702 * false positive, and the locale doesn't have alternate digits */
7703 if (SvCUR(sv) == alt0_len + 1) {
7704 SvCUR_set(sv, 0);
7705 }
7706
7707 SvREFCNT_dec_NN(alt_dig_sv);
7708
7709 restore_toggled_locale_c_unless_locking(LC_TIME, orig_TIME_locale);
7710
7711 retval_type = RETVAL_IN_sv;
7712 break;
7713
7714 # endif /* End of CAN_BE_ALT_DIGITS */
7715 # endif /* End of HAS_STRFTIME */
7716
7717 } /* End of braced group for outer switch 'default:' case */
7718
7719 # endif
7720
7721 } /* Giant switch() of nl_langinfo() items */
7722
7723 GCC_DIAG_RESTORE_STMT;
7724
7725 if (sv != PL_scratch_langinfo) { /* Caller wants return in 'sv' */
7726 if (! isRETVAL_IN_sv(retval_type)) {
7727 sv_setpv(sv, retval);
7728 SvUTF8_off(sv);
7729 }
7730
7731 if (utf8ness) {
7732 *utf8ness = is_utf8;
7733 if (is_utf8 == UTF8NESS_YES) {
7734 SvUTF8_on(sv);
7735 }
7736 }
7737
7738 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7739 "Leaving emulate_langinfo item=%jd, using locale %s\n",
7740 item, locale));
7741
7742 /* The caller shouldn't also be wanting a 'retval'; make sure segfaults
7743 * if they call this wrong */
7744 return NULL;
7745 }
7746
7747 /* Here, wants a 'retval' return. Extract that if not already there. */
7748 if (! isRETVAL_IN_retval(retval_type)) {
7749 retval = SvPV_nolen(sv);
7750 }
7751
7752 /* Here, 'retval' started as a simple value, or has been converted into
7753 * being simple */
7754 if (utf8ness) {
7755 *utf8ness = is_utf8;
7756 }
7757
7758 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
7759 "Leaving emulate_langinfo item=%jd, using locale %s\n",
7760 item, locale));
7761 return retval;
7762
7763 # undef RETVAL_IN_retval
7764 # undef RETVAL_IN_BOTH
7765 # undef RETVAL_IN_sv
7766 # undef isRETVAL_IN_sv
7767 # undef isRETVAL_IN_retval
7768
7769 }
7770
7771 #endif /* Needs emulate_langinfo() */
7772 #ifndef HAS_DEFINITIVE_UTF8NESS_DETERMINATION
7773
7774 STATIC bool
S_maybe_override_codeset(pTHX_ const char * codeset,const char * locale,const char ** new_codeset)7775 S_maybe_override_codeset(pTHX_ const char * codeset,
7776 const char * locale,
7777 const char ** new_codeset)
7778 {
7779 # define NAME_INDICATES_UTF8 0x1
7780 # define MB_CUR_MAX_SUGGESTS_UTF8 0x2
7781
7782 /* Override 'codeset' with UTF-8 if this routine guesses that it should be.
7783 * Conversely (but rarely), "UTF-8" in the locale name might be wrong. We
7784 * return "" as the code set name if we find that to be the case. */
7785
7786 unsigned int lean_towards_being_utf8 = 0;
7787 if (is_codeset_name_UTF8(codeset)) {
7788 lean_towards_being_utf8 |= NAME_INDICATES_UTF8;
7789 }
7790
7791 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
7792
7793 /* For this portion of the file to compile, some C99 functions aren't
7794 * available to us, even though we now require C99. So, something must be
7795 * wrong with them. The code here should be good enough to work around
7796 * this issue, but should the need arise, comments in S_is_locale_utf8()
7797 * list some alternative C99 functions that could be tried.
7798 *
7799 * But MB_CUR_MAX is a C89 construct that helps a lot, is simple for a
7800 * vendor to implement, and our experience with it is that it works well on
7801 * a variety of platforms. We have found that it returns a too-large
7802 * number on some platforms for the C locale, but for no others. That
7803 * locale was already ruled out in the code that called this function. (If
7804 * MB_CUR_MAX returned too small a number, that would break a lot of
7805 * things, and likely would be quickly corrected by the vendor.) khw has
7806 * some confidence that it doesn't return >1 when 1 is meant, as that would
7807 * trigger a Perl warning, and we've had no reports of invalid occurrences
7808 * of such. */
7809 # ifdef MB_CUR_MAX
7810
7811 /* If there are fewer bytes available in this locale than are required to
7812 * represent the largest legal UTF-8 code point, this definitely isn't a
7813 * UTF-8 locale, even if the locale name says it is. */
7814 const int mb_cur_max = MB_CUR_MAX;
7815 if (mb_cur_max < (int) UNISKIP(PERL_UNICODE_MAX)) {
7816 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7817
7818 if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
7819 *new_codeset = ""; /* The name is wrong; override */
7820 return true;
7821 }
7822
7823 return false;
7824 }
7825
7826 /* But if the locale could be UTF-8, and also the name corroborates this,
7827 * assume it is so */
7828 if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
7829 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
7830 return false;
7831 }
7832
7833 restore_toggled_locale_c_if_locking(LC_CTYPE, orig_CTYPE_locale);
7834
7835 /* Here, the name doesn't indicate UTF-8, but MB_CUR_MAX indicates it could
7836 * be. khw knows of only two other locales in the world, EUC-TW and GB
7837 * 18030, that legitimately require this many bytes (4). So, if the name
7838 * is one of those, MB_CUR_MAX has corroborated that. */
7839 bool name_implies_non_utf8 = false;
7840 if (foldEQ(codeset, "GB", 2)) {
7841 const char * s = codeset + 2;
7842 if (*s == '-' || *s == '_') {
7843 s++;
7844 }
7845
7846 if strEQ(s, "18030") {
7847 name_implies_non_utf8 = true;
7848 }
7849 }
7850 else if (foldEQ(codeset, "EUC", 3)) {
7851 const char * s = codeset + 3;
7852 if (*s == '-' || *s == '_') {
7853 s++;
7854 }
7855
7856 if (foldEQ(s, "TW", 2)) {
7857 name_implies_non_utf8 = true;
7858 }
7859 }
7860
7861 /* Otherwise, the locale is likely UTF-8 */
7862 if (! name_implies_non_utf8) {
7863 lean_towards_being_utf8 |= MB_CUR_MAX_SUGGESTS_UTF8;
7864 }
7865
7866 /* (In both those two other multibyte locales, the single byte characters
7867 * are the same as ASCII. No multi-byte character in EUC-TW is legal UTF-8
7868 * (since the first byte of each is a continuation). GB 18030 has no three
7869 * byte sequences, and none of the four byte ones is legal UTF-8 (as the
7870 * second byte for these is a non-continuation). But every legal UTF-8 two
7871 * byte sequence is also legal in GB 18030, though none have the same
7872 * meaning, and no Han code point expressed in UTF-8 is two byte. So the
7873 * further tests below which look for native expressions of currency and
7874 * time will not return two byte sequences, hence they will reliably rule
7875 * out such a locale as being UTF-8, even if the code set name checked
7876 * above isn't correct.) */
7877
7878 # endif /* has MB_CUR_MAX */
7879
7880 /* Here, MB_CUR_MAX is not available, or was inconclusive. What we do is
7881 * to look at various strings associated with the locale:
7882 * 1) If any are illegal UTF-8, the locale can't be UTF-8.
7883 * 2) If all are legal UTF-8, and some non-ASCII characters are present,
7884 * it is likely to be UTF-8, because of the strictness of UTF-8
7885 * syntax. So assume it is UTF-8
7886 * 3) If all are ASCII and the locale name and/or MB_CUR_MAX indicate
7887 * UTF-8, assume the locale is UTF-8.
7888 * 4) Otherwise, assume the locale isn't UTF-8
7889 *
7890 * To save cycles, if the locale name indicates it is a UTF-8 locale, we
7891 * stop looking at the first instance with legal non-ASCII UTF-8. It is
7892 * very unlikely this combination is coincidental. */
7893
7894 utf8ness_t strings_utf8ness = UTF8NESS_UNKNOWN;
7895
7896 /* List of strings to look at */
7897 const int trials[] = {
7898
7899 # if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
7900
7901 /* The first string tried is the locale currency name. Often that will
7902 * be in the native script.
7903 *
7904 * But this is usable only if localeconv() is available, as that's the
7905 * way we find out the currency symbol. */
7906
7907 CRNCYSTR,
7908
7909 # endif
7910 # ifdef USE_LOCALE_TIME
7911
7912 /* We can also try various strings associated with LC_TIME, like the names
7913 * of months or days of the week */
7914
7915 DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
7916 MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
7917 MON_9, MON_10, MON_11, MON_12,
7918 ALT_DIGITS, AM_STR, PM_STR,
7919 ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, ABDAY_7,
7920 ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
7921 ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
7922
7923 # endif
7924
7925 };
7926
7927 # ifdef USE_LOCALE_TIME
7928
7929 /* The code in the recursive call below can handle switching the locales,
7930 * but by doing it now here, that code will check and discover that there
7931 * is no need to switch then restore, avoiding those each loop iteration.
7932 *
7933 * But don't do this if toggling actually creates a critical section, so as
7934 * to minimize the amount of time spent in each critical section. */
7935 const char * orig_TIME_locale =
7936 toggle_locale_c_unless_locking(LC_TIME, locale);
7937
7938 # endif
7939
7940 /* The trials array may consist of strings from two different locale
7941 * categories. The call to langinfo_i() below needs to pass the proper
7942 * category for each string. There is a max of 1 trial for LC_MONETARY;
7943 * the rest are LC_TIME. So the array is arranged so the LC_MONETARY item
7944 * (if any) is first, and all subsequent iterations will use LC_TIME.
7945 * These #ifdefs set up the values for all possible combinations. */
7946 # if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
7947
7948 locale_category_index cat_index = LC_MONETARY_INDEX_;
7949
7950 # ifdef USE_LOCALE_TIME
7951
7952 const locale_category_index follow_on_cat_index = LC_TIME_INDEX_;
7953 assert(trials[1] == DAY_1); /* Make sure only a single non-time entry */
7954
7955 # else
7956
7957 /* Effectively out-of-bounds, as there is only the monetary entry */
7958 const locale_category_index follow_on_cat_index = LC_ALL_INDEX_;
7959
7960 # endif
7961 # elif defined(USE_LOCALE_TIME)
7962
7963 locale_category_index cat_index = LC_TIME_INDEX_;
7964 const locale_category_index follow_on_cat_index = LC_TIME_INDEX_;
7965
7966 # else
7967
7968 /* Effectively out-of-bounds, as here there are no trial entries at all.
7969 * This allows this code to compile, but there are no strings to test, and
7970 * so the answer will always be non-UTF-8. */
7971 locale_category_index cat_index = LC_ALL_INDEX_;
7972 const locale_category_index follow_on_cat_index = LC_ALL_INDEX_;
7973
7974 # endif
7975
7976 /* We will need to use the reentrant interface. */
7977 SV * sv = newSVpvs("");
7978
7979 /* Everything set up; look through all the strings */
7980 for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(trials); i++) {
7981
7982 /* To prevent infinite recursive calls, we don't ask for the UTF-8ness
7983 * of the string. Instead we examine the result below */
7984 langinfo_sv_i(trials[i], cat_index, locale, sv, NULL);
7985
7986 cat_index = follow_on_cat_index;
7987
7988 const char * result = SvPVX(sv);
7989 const Size_t len = strlen(result);
7990 const U8 * first_variant;
7991
7992 /* If the string is identical whether or not it is encoded as UTF-8, it
7993 * isn't helpful in determining UTF8ness. */
7994 if (is_utf8_invariant_string_loc((U8 *) result, len, &first_variant))
7995 {
7996 continue;
7997 }
7998
7999 /* Here, has non-ASCII. If not legal UTF-8, isn't a UTF-8 locale */
8000 if (! is_strict_utf8_string(first_variant,
8001 len - (first_variant - (U8 *) result)))
8002 {
8003 strings_utf8ness = UTF8NESS_NO;
8004 break;
8005 }
8006
8007 /* Here, is a legal non-ASCII UTF-8 string; tentatively set the return
8008 * to YES; possibly overridden by later iterations */
8009 strings_utf8ness = UTF8NESS_YES;
8010
8011 /* But if this corroborates our expectation, quit now */
8012 if (lean_towards_being_utf8 & NAME_INDICATES_UTF8) {
8013 break;
8014 }
8015 }
8016
8017 # ifdef USE_LOCALE_TIME
8018
8019 restore_toggled_locale_c_unless_locking(LC_TIME, orig_TIME_locale);
8020
8021 # endif
8022
8023 restore_toggled_locale_c_unless_locking(LC_CTYPE, orig_CTYPE_locale);
8024
8025 if (strings_utf8ness == UTF8NESS_NO) {
8026 return false; /* No override */
8027 }
8028
8029 /* Here all tested strings are legal UTF-8.
8030 *
8031 * Above we set UTF8NESS_YES if any string wasn't ASCII. But even if they
8032 * are all ascii, and the locale name indicates it is a UTF-8 locale,
8033 * assume the locale is UTF-8. */
8034 if (lean_towards_being_utf8) {
8035 strings_utf8ness = UTF8NESS_YES;
8036 }
8037
8038 if (strings_utf8ness == UTF8NESS_YES) {
8039 *new_codeset = "UTF-8";
8040 return true;
8041 }
8042
8043 /* Here, nothing examined indicates that the codeset is or isn't UTF-8.
8044 * But what is it? The other locale categories are not likely to be of
8045 * further help:
8046 *
8047 * LC_NUMERIC Only a few locales in the world have a non-ASCII radix or
8048 * group separator.
8049 * LC_CTYPE This code wouldn't be compiled if mbtowc() existed and was
8050 * reliable. This is unlikely in C99. There are other
8051 * functions that could be used instead, but are they going to
8052 * exist, and be able to distinguish between UTF-8 and 8859-1?
8053 * Deal with this only if it becomes necessary.
8054 * LC_MESSAGES The strings returned from strerror() would seem likely
8055 * candidates, but experience has shown that many systems
8056 * don't actually have translations installed for them. They
8057 * are instead always in English, so everything in them is
8058 * ASCII, which is of no help to us. A Configure probe could
8059 * possibly be written to see if this platform has non-ASCII
8060 * error messages. But again, wait until it turns out to be
8061 * an actual problem.
8062 *
8063 * Things like YESSTR, NOSTR, might not be in ASCII, but need
8064 * nl_langinfo() to access, which we don't have.
8065 */
8066
8067 /* Otherwise, assume the locale isn't UTF-8. This can be wrong if we don't
8068 * have MB_CUR_MAX, and the locale is English without UTF-8 in its name,
8069 * and with a dollar currency symbol. */
8070 return false; /* No override */
8071 }
8072
8073 # endif /* ! HAS_DEFINITIVE_UTF8NESS_DETERMINATION */
8074
8075 /*
8076 =for apidoc_section $time
8077 =for apidoc sv_strftime_tm
8078 =for apidoc_item my_strftime
8079
8080 These implement the libc strftime().
8081
8082 On failure, they return NULL, and set C<errno> to C<EINVAL>.
8083
8084 C<sv_strftime_tm> is preferred, as it transparently handles the UTF-8ness of
8085 the current locale, the input C<fmt>, and the returned result. Only if the
8086 current C<LC_TIME> locale is a UTF-8 one (and S<C<use bytes>> is not in effect)
8087 will the result be marked as UTF-8.
8088
8089 C<sv_strftime_tm> takes a pointer to a filled-in S<C<struct tm>> parameter. It
8090 ignores the values of the C<wday> and C<yday> fields in it. The other fields
8091 give enough information to accurately calculate these values, and are used for
8092 that purpose.
8093
8094 The caller assumes ownership of the returned SV with a reference count of 1.
8095
8096 C<my_strftime> is kept for backwards compatibility. Knowing if its result
8097 should be considered UTF-8 or not requires significant extra logic.
8098
8099 The return value is a pointer to the formatted result (which MUST be arranged
8100 to be FREED BY THE CALLER). This allows this function to increase the buffer
8101 size as needed, so that the caller doesn't have to worry about that, unlike
8102 libc C<strftime()>.
8103
8104 The C<wday>, C<yday>, and C<isdst> parameters are ignored by C<my_strftime>.
8105 Daylight savings time is never considered to exist, and the values returned for
8106 the other two fields (if C<fmt> even calls for them) are calculated from the
8107 other parameters, without need for referencing these.
8108
8109 Note that both functions are always executed in the underlying
8110 C<LC_TIME> locale of the program, giving results based on that locale.
8111
8112 =cut
8113 */
8114
8115 char *
Perl_my_strftime(pTHX_ const char * fmt,int sec,int min,int hour,int mday,int mon,int year,int wday,int yday,int isdst)8116 Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour,
8117 int mday, int mon, int year, int wday, int yday,
8118 int isdst)
8119 { /* Documented above */
8120 PERL_ARGS_ASSERT_MY_STRFTIME;
8121
8122 #ifdef USE_LOCALE_TIME
8123 const char * locale = querylocale_c(LC_TIME);
8124 #else
8125 const char * locale = "C";
8126 #endif
8127
8128 struct tm mytm;
8129 ints_to_tm(&mytm, locale, sec, min, hour, mday, mon, year, wday, yday,
8130 isdst);
8131 if (! strftime_tm(fmt, PL_scratch_langinfo, locale, &mytm)) {
8132 return NULL;
8133 }
8134
8135 return savepv(SvPVX(PL_scratch_langinfo));
8136 }
8137
8138 SV *
Perl_sv_strftime_ints(pTHX_ SV * fmt,int sec,int min,int hour,int mday,int mon,int year,int wday,int yday,int isdst)8139 Perl_sv_strftime_ints(pTHX_ SV * fmt, int sec, int min, int hour,
8140 int mday, int mon, int year, int wday,
8141 int yday, int isdst)
8142 { /* Documented above */
8143 PERL_ARGS_ASSERT_SV_STRFTIME_INTS;
8144
8145 #ifdef USE_LOCALE_TIME
8146 const char * locale = querylocale_c(LC_TIME);
8147 #else
8148 const char * locale = "C";
8149 #endif
8150
8151 struct tm mytm;
8152 ints_to_tm(&mytm, locale, sec, min, hour, mday, mon, year, wday, yday,
8153 isdst);
8154 return sv_strftime_common(fmt, locale, &mytm);
8155 }
8156
8157 SV *
Perl_sv_strftime_tm(pTHX_ SV * fmt,const struct tm * mytm)8158 Perl_sv_strftime_tm(pTHX_ SV * fmt, const struct tm * mytm)
8159 { /* Documented above */
8160 PERL_ARGS_ASSERT_SV_STRFTIME_TM;
8161
8162 #ifdef USE_LOCALE_TIME
8163
8164 return sv_strftime_common(fmt, querylocale_c(LC_TIME), mytm);
8165
8166 #else
8167
8168 return sv_strftime_common(fmt, "C", mytm);
8169
8170 #endif
8171
8172 }
8173
8174 SV *
S_sv_strftime_common(pTHX_ SV * fmt,const char * locale,const struct tm * mytm)8175 S_sv_strftime_common(pTHX_ SV * fmt,
8176 const char * locale,
8177 const struct tm * mytm)
8178 { /* Documented above */
8179 PERL_ARGS_ASSERT_SV_STRFTIME_COMMON;
8180
8181 STRLEN fmt_cur;
8182 const char *fmt_str = SvPV_const(fmt, fmt_cur);
8183
8184 utf8ness_t fmt_utf8ness = (SvUTF8(fmt) && LIKELY(! IN_BYTES))
8185 ? UTF8NESS_YES
8186 : UTF8NESS_UNKNOWN;
8187
8188 utf8ness_t result_utf8ness;
8189
8190 /* Use a fairly generous guess as to how big the buffer needs to be, so as
8191 * to get almost all the typical returns to fit without the called function
8192 * having to realloc; this is a somewhat educated guess, but feel free to
8193 * tweak it. */
8194 SV* sv = newSV(MAX(fmt_cur * 2, 64));
8195 if (! strftime8(fmt_str,
8196 sv,
8197 locale,
8198 mytm,
8199 fmt_utf8ness,
8200 &result_utf8ness,
8201 true /* calling from sv_strftime */ ))
8202 {
8203 return NULL;
8204 }
8205
8206
8207 if (result_utf8ness == UTF8NESS_YES) {
8208 SvUTF8_on(sv);
8209 }
8210
8211 return sv;
8212 }
8213
8214 STATIC void
S_ints_to_tm(pTHX_ struct tm * mytm,const char * locale,int sec,int min,int hour,int mday,int mon,int year,int wday,int yday,int isdst)8215 S_ints_to_tm(pTHX_ struct tm * mytm,
8216 const char * locale,
8217 int sec, int min, int hour, int mday, int mon, int year,
8218 int wday, int yday, int isdst)
8219 {
8220 /* Create a struct tm structure from the input time-related integer
8221 * variables for 'locale' */
8222
8223 /* Override with the passed-in values */
8224 Zero(mytm, 1, struct tm);
8225 mytm->tm_sec = sec;
8226 mytm->tm_min = min;
8227 mytm->tm_hour = hour;
8228 mytm->tm_mday = mday;
8229 mytm->tm_mon = mon;
8230 mytm->tm_year = year;
8231 mytm->tm_wday = wday;
8232 mytm->tm_yday = yday;
8233 mytm->tm_isdst = isdst;
8234
8235 /* Long-standing behavior is to ignore the effects of locale (in
8236 * particular, daylight savings time) on the input, so we use mini_mktime.
8237 * See GH #22062. */
8238 mini_mktime(mytm);
8239
8240 /* But some of those effect are deemed desirable, so use libc to get the
8241 * values for tm_gmtoff and tm_zone on platforms that have them [perl
8242 * #18238] */
8243 #if defined(HAS_MKTIME) \
8244 && (defined(HAS_TM_TM_GMTOFF) || defined(HAS_TM_TM_ZONE))
8245
8246 const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
8247 struct tm mytm2 = *mytm;
8248 MKTIME_LOCK;
8249 mktime(&mytm2);
8250 MKTIME_UNLOCK;
8251 restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
8252
8253 # ifdef HAS_TM_TM_GMTOFF
8254 mytm->tm_gmtoff = mytm2.tm_gmtoff;
8255 # endif
8256 # ifdef HAS_TM_TM_ZONE
8257 mytm->tm_zone = mytm2.tm_zone;
8258 # endif
8259 #endif
8260
8261 return;
8262 }
8263
8264 STATIC bool
S_strftime_tm(pTHX_ const char * fmt,SV * sv,const char * locale,const struct tm * mytm)8265 S_strftime_tm(pTHX_ const char *fmt,
8266 SV * sv,
8267 const char *locale,
8268 const struct tm *mytm)
8269 {
8270 PERL_ARGS_ASSERT_STRFTIME_TM;
8271
8272 /* Execute strftime() based on the input struct tm, and the current LC_TIME
8273 * locale.
8274 *
8275 * Returns 'true' if succeeded, with the PV pointer in 'sv' filled with the
8276 * result, and all other C<OK> bits disabled, and not marked as UTF-8.
8277 * Determining the UTF-8ness must be done at a higher level.
8278 *
8279 * 'false' is returned if if fails; the state of 'sv' is unspecified. */
8280
8281 /* An empty format yields an empty result */
8282 const Size_t fmtlen = strlen(fmt);
8283 if (fmtlen == 0) {
8284 sv_setpvs(sv, "");
8285 SvUTF8_off(sv);
8286 return true;
8287 }
8288
8289 bool succeeded = false;
8290
8291 #ifndef HAS_STRFTIME
8292 Perl_croak(aTHX_ "panic: no strftime");
8293 #endif
8294
8295 start_DEALING_WITH_MISMATCHED_CTYPE(locale);
8296
8297 #if defined(USE_LOCALE_TIME)
8298
8299 const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
8300
8301 # define LC_TIME_TEARDOWN \
8302 restore_toggled_locale_c(LC_TIME, orig_TIME_locale)
8303 #else
8304 PERL_UNUSED_ARG(locale);
8305 # define LC_TIME_TEARDOWN
8306 #endif
8307
8308 /* Assume the caller has furnished a reasonable sized guess, but guard
8309 * against one that won't work */
8310 Size_t bufsize = MAX(2, SvLEN(sv));
8311 SvUPGRADE(sv, SVt_PV);
8312 SvPOK_only(sv);
8313
8314 do {
8315 char * buf = SvGROW(sv, bufsize);
8316
8317 /* allowing user-supplied (rather than literal) formats is normally
8318 * frowned upon as a potential security risk; but this is part of the
8319 * API so we have to allow it (and the available formats have a much
8320 * lower chance of doing something bad than the ones for printf etc. */
8321 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
8322
8323 #ifdef WIN32 /* Windows will tell you if the input is invalid */
8324
8325 /* Needed because the LOCK might (or might not) save/restore errno */
8326 bool strftime_failed = false;
8327
8328 STRFTIME_LOCK;
8329 dSAVE_ERRNO;
8330 errno = 0;
8331
8332 Size_t len = strftime(buf, bufsize, fmt, mytm);
8333 if (errno == EINVAL) {
8334 strftime_failed = true;
8335 }
8336
8337 RESTORE_ERRNO;
8338 STRFTIME_UNLOCK;
8339
8340 if (strftime_failed) {
8341 goto strftime_failed;
8342 }
8343
8344 #else
8345 STRFTIME_LOCK;
8346 Size_t len = strftime(buf, bufsize, fmt, mytm);
8347 STRFTIME_UNLOCK;
8348 #endif
8349
8350 GCC_DIAG_RESTORE_STMT;
8351
8352 /* A non-zero return indicates success. But to make sure we're not
8353 * dealing with some rogue strftime that returns how much space it
8354 * needs instead of 0 when there isn't enough, check that the return
8355 * indicates we have at least one byte of spare space (which will be
8356 * used for the terminating NUL). */
8357 if (inRANGE(len, 1, bufsize - 1)) {
8358 succeeded = true;
8359 SvCUR_set(sv, len);
8360 goto strftime_return;
8361 }
8362
8363 /* There are several possible reasons for a 0 return code for a
8364 * non-empty format, and they are not trivial to tease apart. This
8365 * issue is a known bug in the strftime() API. What we do to cope is
8366 * to assume that the reason is not enough space in the buffer, so
8367 * increase it and try again. */
8368 bufsize *= 2;
8369
8370 /* But don't just keep increasing the size indefinitely. Stop when it
8371 * becomes obvious that the reason for failure is something besides not
8372 * enough space. The most likely largest expanding format is %c. On
8373 * khw's Linux box, the maximum result of this is 67 characters, in the
8374 * km_KH locale. If a new script comes along that uses 4 UTF-8 bytes
8375 * per character, and with a similar expansion factor, that would be a
8376 * 268:2 byte ratio, or a bit more than 128:1 = 2**7:1. Some strftime
8377 * implementations allow you to say %1000c to pad to 1000 bytes. This
8378 * shows that it is impossible to implement this without a heuristic
8379 * (which can fail). But it indicates we need to be generous in the
8380 * upper limit before failing. The previous heuristic used was too
8381 * stingy. Since the size doubles per iteration, it doesn't take many
8382 * to reach the limit */
8383 } while (bufsize < ((1 << 11) + 1) * fmtlen);
8384
8385 /* Here, strftime() returned 0, and it likely wasn't for lack of space.
8386 * There are two possible reasons:
8387 *
8388 * First is that the result is legitimately 0 length. This can happen
8389 * when the format is precisely "%p". That is the only documented format
8390 * that can have an empty result. */
8391 if (strEQ(fmt, "%p")) {
8392 sv_setpvs(sv, "");
8393 SvUTF8_off(sv);
8394 succeeded = true;
8395 goto strftime_return;
8396 }
8397
8398 /* The other reason is that the format string is malformed. Probably it is
8399 * that the string is syntactically invalid for the locale. On some
8400 * platforms an invalid conversion specifier '%?' (for all illegal '?') is
8401 * treated as a literal, but others may fail when '?' is illegal */
8402
8403 #ifdef WIN32
8404 strftime_failed:
8405 #endif
8406
8407 SET_EINVAL;
8408 succeeded = false;
8409
8410 strftime_return:
8411
8412 LC_TIME_TEARDOWN;
8413 end_DEALING_WITH_MISMATCHED_CTYPE(locale);
8414
8415 return succeeded;
8416 }
8417
8418 STATIC bool
S_strftime8(pTHX_ const char * fmt,SV * sv,const char * locale,const struct tm * mytm,const utf8ness_t fmt_utf8ness,utf8ness_t * result_utf8ness,const bool called_externally)8419 S_strftime8(pTHX_ const char * fmt,
8420 SV * sv,
8421 const char * locale,
8422 const struct tm * mytm,
8423 const utf8ness_t fmt_utf8ness,
8424 utf8ness_t * result_utf8ness,
8425 const bool called_externally)
8426 {
8427 PERL_ARGS_ASSERT_STRFTIME8;
8428
8429 /* Wrap strftime_tm, taking into account the input and output UTF-8ness */
8430
8431 #ifdef USE_LOCALE_TIME
8432 # define INDEX_TO_USE LC_TIME_INDEX_
8433
8434 locale_utf8ness_t locale_utf8ness = LOCALE_UTF8NESS_UNKNOWN;
8435
8436 #else
8437 # define INDEX_TO_USE LC_ALL_INDEX_ /* Effectively out of bounds */
8438
8439 locale_utf8ness_t locale_utf8ness = LOCALE_NOT_UTF8;
8440
8441 #endif
8442
8443 switch (fmt_utf8ness) {
8444 case UTF8NESS_IMMATERIAL:
8445 break;
8446
8447 case UTF8NESS_NO: /* Known not to be UTF-8; must not be UTF-8 locale */
8448 if (is_locale_utf8(locale)) {
8449 SET_EINVAL;
8450 return false;
8451 }
8452
8453 locale_utf8ness = LOCALE_NOT_UTF8;
8454 break;
8455
8456 case UTF8NESS_YES: /* Known to be UTF-8; must be UTF-8 locale if can't
8457 downgrade. */
8458 if (! is_locale_utf8(locale)) {
8459 locale_utf8ness = LOCALE_NOT_UTF8;
8460
8461 bool is_utf8 = true;
8462 Size_t fmt_len = strlen(fmt);
8463 fmt = (char *) bytes_from_utf8((U8 *) fmt, &fmt_len, &is_utf8);
8464 if (is_utf8) {
8465 SET_EINVAL;
8466 return false;
8467 }
8468
8469 SAVEFREEPV(fmt);
8470 }
8471 else {
8472 locale_utf8ness = LOCALE_IS_UTF8;
8473 }
8474
8475 break;
8476
8477 case UTF8NESS_UNKNOWN:
8478 if (! is_locale_utf8(locale)) {
8479 locale_utf8ness = LOCALE_NOT_UTF8;
8480 }
8481 else {
8482 locale_utf8ness = LOCALE_IS_UTF8;
8483 if (called_externally) {
8484
8485 /* All internal calls from this file use ASCII-only formats;
8486 * but otherwise the format could be anything, so make sure to
8487 * upgrade it to UTF-8 for a UTF-8 locale. Otherwise the
8488 * locale would find any UTF-8 variant characters to be
8489 * malformed */
8490 Size_t fmt_len = strlen(fmt);
8491 fmt = (char *) bytes_to_utf8((U8 *) fmt, &fmt_len);
8492 SAVEFREEPV(fmt);
8493 }
8494 }
8495
8496 break;
8497 }
8498
8499 if (! strftime_tm(fmt, sv, locale, mytm)) {
8500 return false;
8501 }
8502
8503 *result_utf8ness = get_locale_string_utf8ness_i(SvPVX(sv),
8504 locale_utf8ness,
8505 locale,
8506 INDEX_TO_USE);
8507 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
8508 "fmt=%s, retval=%s; utf8ness=%d",
8509 fmt,
8510 ((is_strict_utf8_string((U8 *) SvPVX(sv), 0))
8511 ? SvPVX(sv)
8512 :_byte_dump_string((U8 *) SvPVX(sv), SvCUR(sv) ,0)),
8513 *result_utf8ness));
8514 return true;
8515
8516 #undef INDEX_TO_USE
8517
8518 }
8519
8520 #ifdef USE_LOCALE
8521
8522 STATIC void
S_give_perl_locale_control(pTHX_ const char * lc_all_string,const line_t caller_line)8523 S_give_perl_locale_control(pTHX_
8524 # ifdef LC_ALL
8525 const char * lc_all_string,
8526 # else
8527 const char ** locales,
8528 # endif
8529 const line_t caller_line)
8530 {
8531 PERL_UNUSED_ARG(caller_line);
8532
8533 /* This is called when the program is in the global locale and are
8534 * switching to per-thread (if available). And it is called at
8535 * initialization time to do the same.
8536 */
8537
8538 # if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
8539
8540 /* On Windows, convert to per-thread behavior. This isn't necessary in
8541 * POSIX 2008, as the conversion gets done automatically in the
8542 * void_setlocale_i() calls below. */
8543 if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
8544 locale_panic_("_configthreadlocale returned an error");
8545 }
8546
8547 # endif
8548 # if ! defined(USE_THREAD_SAFE_LOCALE) \
8549 && ! defined(USE_POSIX_2008_LOCALE)
8550 # if defined(LC_ALL)
8551 PERL_UNUSED_ARG(lc_all_string);
8552 # else
8553 PERL_UNUSED_ARG(locales);
8554 # endif
8555 # else
8556
8557 /* This platform has per-thread locale handling. Do the conversion. */
8558
8559 # if defined(LC_ALL)
8560
8561 void_setlocale_c_with_caller(LC_ALL, lc_all_string, __FILE__, caller_line);
8562
8563 # else
8564
8565 for_all_individual_category_indexes(i) {
8566 void_setlocale_i_with_caller(i, locales[i], __FILE__, caller_line);
8567 }
8568
8569 # endif
8570 # endif
8571
8572 /* Finally, update our remaining records. 'true' => force recalculation.
8573 * This is needed because we don't know what's happened while Perl hasn't
8574 * had control, so we need to figure out the current state */
8575
8576 # if defined(LC_ALL)
8577
8578 new_LC_ALL(lc_all_string, true);
8579
8580 # else
8581
8582 new_LC_ALL(calculate_LC_ALL_string(locales,
8583 INTERNAL_FORMAT,
8584 WANT_TEMP_PV,
8585 caller_line),
8586 true);
8587 # endif
8588
8589 }
8590
8591 STATIC void
S_output_check_environment_warning(pTHX_ const char * const language,const char * const lc_all,const char * const lang)8592 S_output_check_environment_warning(pTHX_ const char * const language,
8593 const char * const lc_all,
8594 const char * const lang)
8595 {
8596 PerlIO_printf(Perl_error_log,
8597 "perl: warning: Please check that your locale settings:\n");
8598
8599 # ifdef __GLIBC__
8600
8601 PerlIO_printf(Perl_error_log, "\tLANGUAGE = %c%s%c,\n",
8602 language ? '"' : '(',
8603 language ? language : "unset",
8604 language ? '"' : ')');
8605 # else
8606 PERL_UNUSED_ARG(language);
8607 # endif
8608
8609 PerlIO_printf(Perl_error_log, "\tLC_ALL = %c%s%c,\n",
8610 lc_all ? '"' : '(',
8611 lc_all ? lc_all : "unset",
8612 lc_all ? '"' : ')');
8613
8614 for_all_individual_category_indexes(i) {
8615 const char * value = PerlEnv_getenv(category_names[i]);
8616 PerlIO_printf(Perl_error_log,
8617 "\t%s = %c%s%c,\n",
8618 category_names[i],
8619 value ? '"' : '(',
8620 value ? value : "unset",
8621 value ? '"' : ')');
8622 }
8623
8624 PerlIO_printf(Perl_error_log, "\tLANG = %c%s%c\n",
8625 lang ? '"' : '(',
8626 lang ? lang : "unset",
8627 lang ? '"' : ')');
8628 PerlIO_printf(Perl_error_log,
8629 " are supported and installed on your system.\n");
8630 }
8631
8632 #endif
8633
8634 /* A helper macro for the next function. Needed because would be called in two
8635 * places. Knows about the internal workings of the function */
8636 #define GET_DESCRIPTION(trial, name) \
8637 ((isNAME_C_OR_POSIX(name)) \
8638 ? "the standard locale" \
8639 : ((trial == (system_default_trial) \
8640 ? "the system default locale" \
8641 : "a fallback locale")))
8642
8643 /*
8644 * Initialize locale awareness.
8645 */
8646 int
Perl_init_i18nl10n(pTHX_ int printwarn)8647 Perl_init_i18nl10n(pTHX_ int printwarn)
8648 {
8649 /* printwarn is:
8650 * 0 if not to output warning when setup locale is bad
8651 * 1 if to output warning based on value of PERL_BADLANG
8652 * >1 if to output regardless of PERL_BADLANG
8653 *
8654 * returns
8655 * 1 = set ok or not applicable,
8656 * 0 = fallback to a locale of lower priority
8657 * -1 = fallback to all locales failed, not even to the C locale
8658 *
8659 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
8660 * set, debugging information is output.
8661 *
8662 * This routine effectively does the following in most cases:
8663 *
8664 * basic initialization;
8665 * asserts that the compiled tables are consistent;
8666 * initialize data structures;
8667 * make sure we are in the global locale;
8668 * setlocale(LC_ALL, "");
8669 * switch to per-thread locale if applicable;
8670 *
8671 * The "" causes the locale to be set to what the environment variables at
8672 * the time say it should be.
8673 *
8674 * To handle possible failures, the setlocale is expanded to be like:
8675 *
8676 * trial_locale = pre-first-trial;
8677 * while (has_another_trial()) {
8678 * trial_locale = next_trial();
8679 * if setlocale(LC_ALL, trial_locale) {
8680 * ok = true;
8681 * break;
8682 * }
8683 *
8684 * had_failure = true;
8685 * warn();
8686 * }
8687 *
8688 * if (had_failure) {
8689 * warn_even_more();
8690 * if (! ok) warn_still_more();
8691 * }
8692 *
8693 * The first trial is either:
8694 * "" to examine the environment variables for the locale
8695 * NULL to use the values already set for the locale by the program
8696 * embedding this perl instantiation.
8697 *
8698 * Something is wrong if this trial fails, but there is a sequence of
8699 * fallbacks to try should that happen. They are given in the enum below.
8700
8701 * If there is no LC_ALL defined on the system, the setlocale() above is
8702 * replaced by a loop setting each individual category separately.
8703 *
8704 * In a non-embeded environment, this code is executed exactly once. It
8705 * sets up the global locale environment. At the end, if some sort of
8706 * thread-safety is in effect, it will turn thread 0 into using that, with
8707 * the same locale as the global initially. thread 0 can then change its
8708 * locale at will without affecting the global one.
8709 *
8710 * At destruction time, thread 0 will revert to the global locale as the
8711 * other threads die.
8712 *
8713 * Care must be taken in an embedded environment. This code will be
8714 * executed for each instantiation. Since it changes the global locale, it
8715 * could clash with another running instantiation that isn't using
8716 * per-thread locales. perlembed suggests having the controlling program
8717 * set each instantiation's locale and set PERL_SKIP_LOCALE_INIT so this
8718 * code uses that without actually changing anything. Then the onus is on
8719 * the controlling program to prevent any races. The code below does
8720 * enough locking so as to prevent system calls from overwriting data
8721 * before it is safely copied here, but that isn't a general solution.
8722 */
8723
8724 if (PL_langinfo_sv == NULL) {
8725 PL_langinfo_sv = newSVpvs("");
8726 }
8727 if (PL_scratch_langinfo == NULL) {
8728 PL_scratch_langinfo = newSVpvs("");
8729 }
8730
8731 #ifndef USE_LOCALE
8732
8733 PERL_UNUSED_ARG(printwarn);
8734 const int ok = 1;
8735
8736 #else /* USE_LOCALE to near the end of the routine */
8737
8738 int ok = 0;
8739
8740 # ifdef __GLIBC__
8741
8742 const char * const language = PerlEnv_getenv("LANGUAGE");
8743
8744 # else
8745 const char * const language = NULL; /* Unused placeholder */
8746 # endif
8747
8748 /* A later getenv() could zap this, so only use here */
8749 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
8750
8751 const bool locwarn = (printwarn > 1
8752 || ( printwarn
8753 && ( ! bad_lang_use_once
8754 || (
8755 /* disallow with "" or "0" */
8756 *bad_lang_use_once
8757 && strNE("0", bad_lang_use_once)))));
8758
8759 # ifndef DEBUGGING
8760 # define DEBUG_LOCALE_INIT(a,b,c)
8761 # else
8762
8763 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
8764
8765 # define DEBUG_LOCALE_INIT(cat_index, locale, result) \
8766 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", \
8767 setlocale_debug_string_i(cat_index, locale, result)));
8768
8769 # ifdef LC_ALL
8770 assert(categories[LC_ALL_INDEX_] == LC_ALL);
8771 assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
8772 # ifdef USE_POSIX_2008_LOCALE
8773 assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
8774 # endif
8775 # endif
8776
8777 for_all_individual_category_indexes(i) {
8778 assert(category_name_lengths[i] == strlen(category_names[i]));
8779 }
8780
8781 # endif /* DEBUGGING */
8782
8783 /* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for
8784 * why these particular incantations are used. */
8785 # ifdef HAS_MBRLEN
8786 memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
8787 # endif
8788 # ifdef HAS_MBRTOWC
8789 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
8790 # endif
8791 # ifdef HAS_WCTOMBR
8792 wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
8793 # endif
8794 # ifdef USE_PL_CURLOCALES
8795
8796 for (unsigned int i = 0; i <= LC_ALL_INDEX_; i++) {
8797 PL_curlocales[i] = savepv("C");
8798 }
8799
8800 # endif
8801 # ifdef USE_PL_CUR_LC_ALL
8802
8803 PL_cur_LC_ALL = savepv("C");
8804
8805 # endif
8806 # if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS) && defined(LC_ALL)
8807
8808 LOCALE_LOCK;
8809
8810 /* If we haven't done so already, translate the LC_ALL positions of
8811 * categories into our internal indices. */
8812 if (map_LC_ALL_position_to_index[0] == LC_ALL_INDEX_) {
8813
8814 # ifdef PERL_LC_ALL_CATEGORY_POSITIONS_INIT
8815 /* Use this array, initialized by a config.h constant */
8816 int lc_all_category_positions[] = PERL_LC_ALL_CATEGORY_POSITIONS_INIT;
8817 STATIC_ASSERT_STMT( C_ARRAY_LENGTH(lc_all_category_positions)
8818 == LC_ALL_INDEX_);
8819
8820 for (unsigned int i = 0;
8821 i < C_ARRAY_LENGTH(lc_all_category_positions);
8822 i++)
8823 {
8824 map_LC_ALL_position_to_index[i] =
8825 get_category_index(lc_all_category_positions[i]);
8826 }
8827 # else
8828 /* It is possible for both PERL_LC_ALL_USES_NAME_VALUE_PAIRS and
8829 * PERL_LC_ALL_CATEGORY_POSITIONS_INIT not to be defined, e.g. on
8830 * systems with only a C locale during ./Configure. Assume that this
8831 * can only happen as part of some sort of bootstrapping so allow
8832 * compilation to succeed by ignoring correctness.
8833 */
8834 for (unsigned int i = 0;
8835 i < C_ARRAY_LENGTH(map_LC_ALL_position_to_index);
8836 i++)
8837 {
8838 map_LC_ALL_position_to_index[i] = 0;
8839 }
8840 # endif
8841
8842 }
8843
8844 LOCALE_UNLOCK;
8845
8846 # endif
8847 # ifdef USE_POSIX_2008_LOCALE
8848
8849 /* This is a global, so be sure to keep another instance from zapping it */
8850 LOCALE_LOCK;
8851 if (PL_C_locale_obj) {
8852 LOCALE_UNLOCK;
8853 }
8854 else {
8855 PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
8856 if (! PL_C_locale_obj) {
8857 LOCALE_UNLOCK;
8858 locale_panic_("Cannot create POSIX 2008 C locale object");
8859 }
8860 LOCALE_UNLOCK;
8861
8862 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
8863 PL_C_locale_obj));
8864 }
8865
8866 /* Switch to using the POSIX 2008 interface now. This would happen below
8867 * anyway, but deferring it can lead to leaks of memory that would also get
8868 * malloc'd in the interim. We arbitrarily switch to the C locale,
8869 * overridden below */
8870 if (! uselocale(PL_C_locale_obj)) {
8871 locale_panic_(Perl_form(aTHX_
8872 "Can't uselocale(%p), LC_ALL supposed to"
8873 " be 'C'",
8874 PL_C_locale_obj));
8875 }
8876
8877 # ifdef MULTIPLICITY
8878
8879 PL_cur_locale_obj = PL_C_locale_obj;
8880
8881 # endif
8882 # endif
8883
8884 /* Now initialize some data structures. This is entirely so that
8885 * later-executed code doesn't have to concern itself with things not being
8886 * initialized. Arbitrarily use the C locale (which we know has to exist
8887 * on the system). */
8888
8889 # ifdef USE_LOCALE_NUMERIC
8890
8891 PL_numeric_radix_sv = newSV(1);
8892 PL_underlying_radix_sv = newSV(1);
8893 Newxz(PL_numeric_name, 1, char); /* Single NUL character */
8894
8895 # endif
8896 # ifdef USE_LOCALE_COLLATE
8897
8898 Newxz(PL_collation_name, 1, char);
8899
8900 # endif
8901 # ifdef USE_LOCALE_CTYPE
8902
8903 Newxz(PL_ctype_name, 1, char);
8904
8905 # endif
8906
8907 new_LC_ALL("C", true /* Don't shortcut */);
8908
8909 /*===========================================================================*/
8910
8911 /* Now ready to override the initialization with the values that the user
8912 * wants. This is done in the global locale as explained in the
8913 * introductory comments to this function */
8914 switch_to_global_locale();
8915
8916 const char * const lc_all = PerlEnv_getenv("LC_ALL");
8917 const char * const lang = PerlEnv_getenv("LANG");
8918
8919 /* We try each locale in the enum, in order, until we get one that works,
8920 * or exhaust the list. Normally the loop is executed just once.
8921 *
8922 * Each enum value is +1 from the previous */
8923 typedef enum {
8924 dummy_trial = -1,
8925 environment_trial = 0, /* "" or NULL; code below assumes value
8926 0 is the first real trial */
8927 LC_ALL_trial, /* ENV{LC_ALL} */
8928 LANG_trial, /* ENV{LANG} */
8929 system_default_trial, /* Windows .ACP */
8930 C_trial, /* C locale */
8931 beyond_final_trial,
8932 } trials;
8933
8934 trials trial;
8935 unsigned int already_checked = 0;
8936 const char * checked[C_trial];
8937
8938 # ifdef LC_ALL
8939 const char * lc_all_string;
8940 # else
8941 const char * curlocales[LC_ALL_INDEX_];
8942 # endif
8943
8944 /* Loop through the initial setting and all the possible fallbacks,
8945 * breaking out of the loop on success */
8946 trial = dummy_trial;
8947 while (trial != beyond_final_trial) {
8948
8949 /* Each time through compute the next trial to use based on the one in
8950 * the previous iteration and switch to the new one. This enforces the
8951 * order in which the fallbacks are applied */
8952 next_trial:
8953 trial = (trials) ((int) trial + 1); /* Casts are needed for g++ */
8954
8955 const char * locale = NULL;
8956
8957 /* Set up the parameters for this trial */
8958 switch (trial) {
8959 case dummy_trial:
8960 locale_panic_("Unexpectedly got 'dummy_trial");
8961 break;
8962
8963 case environment_trial:
8964 /* This is either "" to get the values from the environment, or
8965 * NULL if the calling program has initialized the values already.
8966 * */
8967 locale = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
8968 ? NULL
8969 : "";
8970 break;
8971
8972 case LC_ALL_trial:
8973 if (! lc_all || strEQ(lc_all, "")) {
8974 continue; /* No-op */
8975 }
8976
8977 locale = lc_all;
8978 break;
8979
8980 case LANG_trial:
8981 if (! lang || strEQ(lang, "")) {
8982 continue; /* No-op */
8983 }
8984
8985 locale = lang;
8986 break;
8987
8988 case system_default_trial:
8989
8990 # if ! defined(WIN32) || ! defined(LC_ALL)
8991
8992 continue; /* No-op */
8993
8994 # else
8995 /* For Windows, we also try the system default locale before "C".
8996 * (If there exists a Windows without LC_ALL we skip this because
8997 * it gets too complicated. For those, "C" is the next fallback
8998 * possibility). */
8999 locale = ".ACP";
9000 # endif
9001 break;
9002
9003 case C_trial:
9004 locale = "C";
9005 break;
9006
9007 case beyond_final_trial:
9008 continue; /* No-op, causes loop to exit */
9009 }
9010
9011 /* If the locale is a substantive name, don't try the same locale
9012 * twice. */
9013 if (locale && strNE(locale, "")) {
9014 for (unsigned int i = 0; i < already_checked; i++) {
9015 if (strEQ(checked[i], locale)) {
9016 goto next_trial;
9017 }
9018 }
9019
9020 /* And, for future iterations, indicate we've tried this locale */
9021 assert(already_checked < C_ARRAY_LENGTH(checked));
9022 checked[already_checked] = savepv(locale);
9023 SAVEFREEPV(checked[already_checked]);
9024 already_checked++;
9025 }
9026
9027 # ifdef LC_ALL
9028
9029 STDIZED_SETLOCALE_LOCK;
9030 lc_all_string = savepv(stdized_setlocale(LC_ALL, locale));
9031 STDIZED_SETLOCALE_UNLOCK;
9032
9033 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, locale, lc_all_string);
9034
9035 if (LIKELY(lc_all_string)) { /* Succeeded */
9036 ok = 1;
9037 break;
9038 }
9039
9040 if (trial == 0 && locwarn) {
9041 PerlIO_printf(Perl_error_log,
9042 "perl: warning: Setting locale failed.\n");
9043 output_check_environment_warning(language, lc_all, lang);
9044 }
9045
9046 # else /* Below is ! LC_ALL */
9047
9048 bool setlocale_failure = FALSE; /* This trial hasn't failed so far */
9049 bool dowarn = trial == 0 && locwarn;
9050
9051 for_all_individual_category_indexes(j) {
9052 STDIZED_SETLOCALE_LOCK;
9053 curlocales[j] = savepv(stdized_setlocale(categories[j], locale));
9054 STDIZED_SETLOCALE_UNLOCK;
9055
9056 DEBUG_LOCALE_INIT(j, locale, curlocales[j]);
9057
9058 if (UNLIKELY(! curlocales[j])) {
9059 setlocale_failure = TRUE;
9060
9061 /* If are going to warn below, continue to loop so all failures
9062 * are included in the message */
9063 if (! dowarn) {
9064 break;
9065 }
9066 }
9067 }
9068
9069 if (LIKELY(! setlocale_failure)) { /* All succeeded */
9070 ok = 1;
9071 break; /* Exit trial_locales loop */
9072 }
9073
9074 /* Here, this trial failed */
9075
9076 if (dowarn) {
9077 PerlIO_printf(Perl_error_log,
9078 "perl: warning: Setting locale failed for the categories:\n");
9079
9080 for_all_individual_category_indexes(j) {
9081 if (! curlocales[j]) {
9082 PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
9083 }
9084 }
9085
9086 output_check_environment_warning(language, lc_all, lang);
9087 } /* end of warning on first failure */
9088
9089 # endif /* LC_ALL */
9090
9091 } /* end of looping through the trial locales */
9092
9093 /* If we had to do more than the first trial, it means that one failed, and
9094 * we may need to output a warning, and, if none worked, do more */
9095 if (UNLIKELY(trial != 0)) {
9096 if (locwarn) {
9097 const char * description = "a fallback locale";
9098 const char * name = NULL;;
9099
9100 /* If we didn't find a good fallback, list all we tried */
9101 if (! ok && already_checked > 0) {
9102 PerlIO_printf(Perl_error_log, "perl: warning: Failed to fall"
9103 " back to ");
9104 if (already_checked > 1) { /* more than one was tried */
9105 PerlIO_printf(Perl_error_log, "any of:\n");
9106 }
9107
9108 while (already_checked > 0) {
9109 name = checked[--already_checked];
9110 description = GET_DESCRIPTION(trial, name);
9111 PerlIO_printf(Perl_error_log, "%s (\"%s\")\n",
9112 description, name);
9113 }
9114 }
9115
9116 if (ok) {
9117
9118 /* Here, a fallback worked. So we have saved its name, and the
9119 * trial that succeeded is still valid */
9120 # ifdef LC_ALL
9121 const char * individ_locales[LC_ALL_INDEX_] = { NULL };
9122
9123 /* Even though we know the valid string for LC_ALL that worked,
9124 * translate it into our internal format, which is the
9125 * name=value pairs notation. This is easier for a human to
9126 * decipher than the positional notation. Some platforms
9127 * can return "C C C C C C" for LC_ALL. This code also
9128 * standardizes that result into plain "C". */
9129 switch (parse_LC_ALL_string(lc_all_string,
9130 (const char **) &individ_locales,
9131 no_override,
9132 false, /* Return only [0] if
9133 suffices */
9134 false, /* Don't panic on error */
9135 __LINE__))
9136 {
9137 case invalid:
9138
9139 /* Here, the parse failed, which shouldn't happen, but if
9140 * it does, we have an easy fallback that allows us to keep
9141 * going. */
9142 name = lc_all_string;
9143 break;
9144
9145 case no_array: /* The original is a single locale */
9146 name = lc_all_string;
9147 break;
9148
9149 case only_element_0: /* element[0] is a single locale valid
9150 for all categories */
9151 SAVEFREEPV(individ_locales[0]);
9152 name = individ_locales[0];
9153 break;
9154
9155 case full_array:
9156 name = calculate_LC_ALL_string(individ_locales,
9157 INTERNAL_FORMAT,
9158 WANT_TEMP_PV,
9159 __LINE__);
9160 for_all_individual_category_indexes(j) {
9161 Safefree(individ_locales[j]);
9162 }
9163 }
9164 # else
9165 name = calculate_LC_ALL_string(curlocales,
9166 INTERNAL_FORMAT,
9167 WANT_TEMP_PV,
9168 __LINE__);
9169 # endif
9170 description = GET_DESCRIPTION(trial, name);
9171 }
9172 else {
9173
9174 /* Nothing seems to be working, yet we want to continue
9175 * executing. It may well be that locales are mostly
9176 * irrelevant to this particular program, and there must be
9177 * some locale underlying the program. Figure it out as best
9178 * we can, by querying the system's current locale */
9179
9180 # ifdef LC_ALL
9181
9182 STDIZED_SETLOCALE_LOCK;
9183 name = stdized_setlocale(LC_ALL, NULL);
9184 STDIZED_SETLOCALE_UNLOCK;
9185
9186 if (UNLIKELY(! name)) {
9187 name = "locale name not determinable";
9188 }
9189
9190 # else /* Below is ! LC_ALL */
9191
9192 const char * system_locales[LC_ALL_INDEX_] = { NULL };
9193
9194 for_all_individual_category_indexes(j) {
9195 STDIZED_SETLOCALE_LOCK;
9196 system_locales[j] = savepv(stdized_setlocale(categories[j],
9197 NULL));
9198 STDIZED_SETLOCALE_UNLOCK;
9199
9200 if (UNLIKELY(! system_locales[j])) {
9201 system_locales[j] = "not determinable";
9202 }
9203 }
9204
9205 /* We use the name=value form for the string, as that is more
9206 * human readable than the positional notation */
9207 name = calculate_LC_ALL_string(system_locales,
9208 INTERNAL_FORMAT,
9209 WANT_TEMP_PV,
9210 __LINE__);
9211 description = "what the system says";
9212
9213 for_all_individual_category_indexes(j) {
9214 Safefree(system_locales[j]);
9215 }
9216 # endif
9217 }
9218
9219 PerlIO_printf(Perl_error_log,
9220 "perl: warning: Falling back to %s (\"%s\").\n",
9221 description, name);
9222
9223 /* Here, ok being true indicates that the first attempt failed, but
9224 * a fallback succeeded; false => nothing working. Translate to
9225 * API return values. */
9226 ok = (ok) ? 0 : -1;
9227 }
9228 }
9229
9230 # ifdef LC_ALL
9231
9232 give_perl_locale_control(lc_all_string, __LINE__);
9233 Safefree(lc_all_string);
9234
9235 # else
9236
9237 give_perl_locale_control((const char **) &curlocales, __LINE__);
9238
9239 for_all_individual_category_indexes(j) {
9240 Safefree(curlocales[j]);
9241 }
9242
9243 # endif
9244 # if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
9245
9246 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
9247 * locale is UTF-8. give_perl_locale_control() just above has already
9248 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
9249 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
9250 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
9251 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */
9252 PL_utf8locale = PL_in_utf8_CTYPE_locale;
9253
9254 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
9255 This is an alternative to using the -C command line switch
9256 (the -C if present will override this). */
9257 {
9258 const char *p = PerlEnv_getenv("PERL_UNICODE");
9259 PL_unicode = p ? parse_unicode_opts(&p) : 0;
9260 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
9261 PL_utf8cache = -1;
9262 }
9263
9264 # endif
9265 # if defined(USE_POSIX_2008_LOCALE) && defined(MULTIPLICITY)
9266 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
9267 "finished Perl_init_i18nl10n; actual obj=%p,"
9268 " expected obj=%p, initial=%s\n",
9269 uselocale(0), PL_cur_locale_obj,
9270 get_LC_ALL_display()));
9271 # endif
9272
9273 /* So won't continue to output stuff */
9274 DEBUG_INITIALIZATION_set(FALSE);
9275
9276 #endif /* USE_LOCALE */
9277
9278 return ok;
9279 }
9280
9281 #undef GET_DESCRIPTION
9282 #ifdef USE_LOCALE_COLLATE
9283
9284 STATIC void
S_compute_collxfrm_coefficients(pTHX)9285 S_compute_collxfrm_coefficients(pTHX)
9286 {
9287
9288 /* A locale collation definition includes primary, secondary, tertiary,
9289 * etc. weights for each character. To sort, the primary weights are used,
9290 * and only if they compare equal, then the secondary weights are used, and
9291 * only if they compare equal, then the tertiary, etc.
9292 *
9293 * strxfrm() works by taking the input string, say ABC, and creating an
9294 * output transformed string consisting of first the primary weights,
9295 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the tertiary,
9296 * etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters may not have
9297 * weights at every level. In our example, let's say B doesn't have a
9298 * tertiary weight, and A doesn't have a secondary weight. The constructed
9299 * string is then going to be
9300 * A¹B¹C¹ B²C² A³C³ ....
9301 * This has the desired effect that strcmp() will look at the secondary or
9302 * tertiary weights only if the strings compare equal at all higher
9303 * priority weights. The spaces shown here, like in
9304 * "A¹B¹C¹ A²B²C² "
9305 * are not just for readability. In the general case, these must actually
9306 * be bytes, which we will call here 'separator weights'; and they must be
9307 * smaller than any other weight value, but since these are C strings, only
9308 * the terminating one can be a NUL (some implementations may include a
9309 * non-NUL separator weight just before the NUL). Implementations tend to
9310 * reserve 01 for the separator weights. They are needed so that a shorter
9311 * string's secondary weights won't be misconstrued as primary weights of a
9312 * longer string, etc. By making them smaller than any other weight, the
9313 * shorter string will sort first. (Actually, if all secondary weights are
9314 * smaller than all primary ones, there is no need for a separator weight
9315 * between those two levels, etc.)
9316 *
9317 * The length of the transformed string is roughly a linear function of the
9318 * input string. It's not exactly linear because some characters don't
9319 * have weights at all levels. When we call strxfrm() we have to allocate
9320 * some memory to hold the transformed string. The calculations below try
9321 * to find coefficients 'm' and 'b' for this locale so that m*x + b equals
9322 * how much space we need, given the size of the input string in 'x'. If
9323 * we calculate too small, we increase the size as needed, and call
9324 * strxfrm() again, but it is better to get it right the first time to
9325 * avoid wasted expensive string transformations.
9326 *
9327 * We use the string below to find how long the transformation of it is.
9328 * Almost all locales are supersets of ASCII, or at least the ASCII
9329 * letters. We use all of them, half upper half lower, because if we used
9330 * fewer, we might hit just the ones that are outliers in a particular
9331 * locale. Most of the strings being collated will contain a preponderance
9332 * of letters, and even if they are above-ASCII, they are likely to have
9333 * the same number of weight levels as the ASCII ones. It turns out that
9334 * digits tend to have fewer levels, and some punctuation has more, but
9335 * those are relatively sparse in text, and khw believes this gives a
9336 * reasonable result, but it could be changed if experience so dictates. */
9337 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
9338 char * x_longer; /* Transformed 'longer' */
9339 Size_t x_len_longer; /* Length of 'x_longer' */
9340
9341 char * x_shorter; /* We also transform a substring of 'longer' */
9342 Size_t x_len_shorter;
9343
9344 PL_in_utf8_COLLATE_locale = (PL_collation_standard)
9345 ? 0
9346 : is_locale_utf8(PL_collation_name);
9347 PL_strxfrm_NUL_replacement = '\0';
9348 PL_strxfrm_max_cp = 0;
9349
9350 /* mem_collxfrm_() is used get the transformation (though here we are
9351 * interested only in its length). It is used because it has the
9352 * intelligence to handle all cases, but to work, it needs some values of
9353 * 'm' and 'b' to get it started. For the purposes of this calculation we
9354 * use a very conservative estimate of 'm' and 'b'. This assumes a weight
9355 * can be multiple bytes, enough to hold any UV on the platform, and there
9356 * are 5 levels, 4 weight bytes, and a trailing NUL. */
9357 PL_collxfrm_base = 5;
9358 PL_collxfrm_mult = 5 * sizeof(UV);
9359
9360 /* Find out how long the transformation really is */
9361 x_longer = mem_collxfrm_(longer,
9362 sizeof(longer) - 1,
9363 &x_len_longer,
9364
9365 /* We avoid converting to UTF-8 in the called
9366 * function by telling it the string is in UTF-8
9367 * if the locale is a UTF-8 one. Since the string
9368 * passed here is invariant under UTF-8, we can
9369 * claim it's UTF-8 even if it isn't. */
9370 PL_in_utf8_COLLATE_locale);
9371 Safefree(x_longer);
9372
9373 /* Find out how long the transformation of a substring of 'longer' is.
9374 * Together the lengths of these transformations are sufficient to
9375 * calculate 'm' and 'b'. The substring is all of 'longer' except the
9376 * first character. This minimizes the chances of being swayed by outliers
9377 * */
9378 x_shorter = mem_collxfrm_(longer + 1,
9379 sizeof(longer) - 2,
9380 &x_len_shorter,
9381 PL_in_utf8_COLLATE_locale);
9382 Safefree(x_shorter);
9383
9384 /* If the results are nonsensical for this simple test, the whole locale
9385 * definition is suspect. Mark it so that locale collation is not active
9386 * at all for it. XXX Should we warn? */
9387 if ( x_len_shorter == 0
9388 || x_len_longer == 0
9389 || x_len_shorter >= x_len_longer)
9390 {
9391 PL_collxfrm_mult = 0;
9392 PL_collxfrm_base = 1;
9393 DEBUG_L(PerlIO_printf(Perl_debug_log,
9394 "Disabling locale collation for LC_COLLATE='%s';"
9395 " length for shorter sample=%zu; longer=%zu\n",
9396 PL_collation_name, x_len_shorter, x_len_longer));
9397 }
9398 else {
9399 SSize_t base; /* Temporary */
9400
9401 /* We have both: m * strlen(longer) + b = x_len_longer
9402 * m * strlen(shorter) + b = x_len_shorter;
9403 * subtracting yields:
9404 * m * (strlen(longer) - strlen(shorter))
9405 * = x_len_longer - x_len_shorter
9406 * But we have set things up so that 'shorter' is 1 byte smaller than
9407 * 'longer'. Hence:
9408 * m = x_len_longer - x_len_shorter
9409 *
9410 * But if something went wrong, make sure the multiplier is at least 1.
9411 */
9412 if (x_len_longer > x_len_shorter) {
9413 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
9414 }
9415 else {
9416 PL_collxfrm_mult = 1;
9417 }
9418
9419 /* mx + b = len
9420 * so: b = len - mx
9421 * but in case something has gone wrong, make sure it is non-negative
9422 * */
9423 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
9424 if (base < 0) {
9425 base = 0;
9426 }
9427
9428 /* Add 1 for the trailing NUL */
9429 PL_collxfrm_base = base + 1;
9430 }
9431
9432 DEBUG_L(PerlIO_printf(Perl_debug_log,
9433 "?UTF-8 locale=%d; x_len_shorter=%zu, "
9434 "x_len_longer=%zu,"
9435 " collate multipler=%zu, collate base=%zu\n",
9436 PL_in_utf8_COLLATE_locale,
9437 x_len_shorter, x_len_longer,
9438 PL_collxfrm_mult, PL_collxfrm_base));
9439 }
9440
9441 char *
Perl_mem_collxfrm_(pTHX_ const char * input_string,STRLEN len,STRLEN * xlen,bool utf8)9442 Perl_mem_collxfrm_(pTHX_ const char *input_string,
9443 STRLEN len, /* Length of 'input_string' */
9444 STRLEN *xlen, /* Set to length of returned string
9445 (not including the collation index
9446 prefix) */
9447 bool utf8 /* Is the input in UTF-8? */
9448 )
9449 {
9450 /* mem_collxfrm_() is like strxfrm() but with two important differences.
9451 * First, it handles embedded NULs. Second, it allocates a bit more memory
9452 * than needed for the transformed data itself. The real transformed data
9453 * begins at offset COLLXFRM_HDR_LEN. *xlen is set to the length of that,
9454 * and doesn't include the collation index size.
9455 *
9456 * It is the caller's responsibility to eventually free the memory returned
9457 * by this function.
9458 *
9459 * Please see sv_collxfrm() to see how this is used. */
9460
9461 # define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
9462
9463 char * s = (char *) input_string;
9464 STRLEN s_strlen = strlen(input_string);
9465 char *xbuf = NULL;
9466 STRLEN xAlloc; /* xalloc is a reserved word in VC */
9467 STRLEN length_in_chars;
9468 bool first_time = TRUE; /* Cleared after first loop iteration */
9469
9470 # ifdef USE_LOCALE_CTYPE
9471 const char * orig_CTYPE_locale = NULL;
9472 # endif
9473
9474 # if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
9475 locale_t constructed_locale = (locale_t) 0;
9476 # endif
9477
9478 PERL_ARGS_ASSERT_MEM_COLLXFRM_;
9479
9480 /* Must be NUL-terminated */
9481 assert(*(input_string + len) == '\0');
9482
9483 if (PL_collxfrm_mult == 0) { /* unknown or bad */
9484 if (PL_collxfrm_base != 0) { /* bad collation => skip */
9485 DEBUG_L(PerlIO_printf(Perl_debug_log,
9486 "mem_collxfrm_: locale's collation is defective\n"));
9487 goto bad;
9488 }
9489
9490 /* (mult, base) == (0,0) means we need to calculate mult and base
9491 * before proceeding */
9492 S_compute_collxfrm_coefficients(aTHX);
9493 }
9494
9495 /* Replace any embedded NULs with the control that sorts before any others.
9496 * This will give as good as possible results on strings that don't
9497 * otherwise contain that character, but otherwise there may be
9498 * less-than-perfect results with that character and NUL. This is
9499 * unavoidable unless we replace strxfrm with our own implementation. */
9500 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
9501 NUL */
9502 char * e = s + len;
9503 char * sans_nuls;
9504 STRLEN sans_nuls_len;
9505 int try_non_controls;
9506 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
9507 making sure 2nd byte is NUL.
9508 */
9509 STRLEN this_replacement_len;
9510
9511 /* If we don't know what non-NUL control character sorts lowest for
9512 * this locale, find it */
9513 if (PL_strxfrm_NUL_replacement == '\0') {
9514 int j;
9515 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
9516 includes the collation index
9517 prefixed. */
9518
9519 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
9520
9521 /* Unlikely, but it may be that no control will work to replace
9522 * NUL, in which case we instead look for any character. Controls
9523 * are preferred because collation order is, in general, context
9524 * sensitive, with adjoining characters affecting the order, and
9525 * controls are less likely to have such interactions, allowing the
9526 * NUL-replacement to stand on its own. (Another way to look at it
9527 * is to imagine what would happen if the NUL were replaced by a
9528 * combining character; it wouldn't work out all that well.) */
9529 for (try_non_controls = 0;
9530 try_non_controls < 2;
9531 try_non_controls++)
9532 {
9533
9534 # ifdef USE_LOCALE_CTYPE
9535
9536 /* In this case we use isCNTRL_LC() below, which relies on
9537 * LC_CTYPE, so that must be switched to correspond with the
9538 * LC_COLLATE locale */
9539 const bool need_to_toggle = ( ! try_non_controls
9540 && ! PL_in_utf8_COLLATE_locale);
9541 if (need_to_toggle) {
9542 orig_CTYPE_locale = toggle_locale_c(LC_CTYPE,
9543 PL_collation_name);
9544 }
9545 # endif
9546 /* Look through all legal code points (NUL isn't) */
9547 for (j = 1; j < 256; j++) {
9548 char * x; /* j's xfrm plus collation index */
9549 STRLEN x_len; /* length of 'x' */
9550 STRLEN trial_len = 1;
9551 char cur_source[] = { '\0', '\0' };
9552
9553 /* Skip non-controls the first time through the loop. The
9554 * controls in a UTF-8 locale are the L1 ones */
9555 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
9556 ? ! isCNTRL_L1(j)
9557 : ! isCNTRL_LC(j))
9558 {
9559 continue;
9560 }
9561
9562 /* Create a 1-char string of the current code point */
9563 cur_source[0] = (char) j;
9564
9565 /* Then transform it */
9566 x = mem_collxfrm_(cur_source, trial_len, &x_len,
9567 0 /* The string is not in UTF-8 */);
9568
9569 /* Ignore any character that didn't successfully transform.
9570 * */
9571 if (! x) {
9572 continue;
9573 }
9574
9575 /* If this character's transformation is lower than
9576 * the current lowest, this one becomes the lowest */
9577 if ( cur_min_x == NULL
9578 || strLT(x + COLLXFRM_HDR_LEN,
9579 cur_min_x + COLLXFRM_HDR_LEN))
9580 {
9581 PL_strxfrm_NUL_replacement = j;
9582 Safefree(cur_min_x);
9583 cur_min_x = x;
9584 }
9585 else {
9586 Safefree(x);
9587 }
9588 } /* end of loop through all 255 characters */
9589
9590 # ifdef USE_LOCALE_CTYPE
9591
9592 if (need_to_toggle) {
9593 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
9594 }
9595 # endif
9596
9597 /* Stop looking if found */
9598 if (cur_min_x) {
9599 break;
9600 }
9601
9602 /* Unlikely, but possible, if there aren't any controls that
9603 * work in the locale, repeat the loop, looking for any
9604 * character that works */
9605 DEBUG_L(PerlIO_printf(Perl_debug_log,
9606 "mem_collxfrm_: No control worked. Trying non-controls\n"));
9607 } /* End of loop to try first the controls, then any char */
9608
9609 if (! cur_min_x) {
9610 DEBUG_L(PerlIO_printf(Perl_debug_log,
9611 "mem_collxfrm_: Couldn't find any character to replace"
9612 " embedded NULs in locale %s with", PL_collation_name));
9613 goto bad;
9614 }
9615
9616 DEBUG_L(PerlIO_printf(Perl_debug_log,
9617 "mem_collxfrm_: Replacing embedded NULs in locale %s with "
9618 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
9619
9620 Safefree(cur_min_x);
9621 } /* End of determining the character that is to replace NULs */
9622
9623 /* If the replacement is variant under UTF-8, it must match the
9624 * UTF8-ness of the original */
9625 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
9626 this_replacement_char[0] =
9627 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
9628 this_replacement_char[1] =
9629 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
9630 this_replacement_len = 2;
9631 }
9632 else {
9633 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
9634 /* this_replacement_char[1] = '\0' was done at initialization */
9635 this_replacement_len = 1;
9636 }
9637
9638 /* The worst case length for the replaced string would be if every
9639 * character in it is NUL. Multiply that by the length of each
9640 * replacement, and allow for a trailing NUL */
9641 sans_nuls_len = (len * this_replacement_len) + 1;
9642 Newx(sans_nuls, sans_nuls_len, char);
9643 *sans_nuls = '\0';
9644
9645 /* Replace each NUL with the lowest collating control. Loop until have
9646 * exhausted all the NULs */
9647 while (s + s_strlen < e) {
9648 my_strlcat(sans_nuls, s, sans_nuls_len);
9649
9650 /* Do the actual replacement */
9651 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
9652
9653 /* Move past the input NUL */
9654 s += s_strlen + 1;
9655 s_strlen = strlen(s);
9656 }
9657
9658 /* And add anything that trails the final NUL */
9659 my_strlcat(sans_nuls, s, sans_nuls_len);
9660
9661 /* Switch so below we transform this modified string */
9662 s = sans_nuls;
9663 len = strlen(s);
9664 } /* End of replacing NULs */
9665
9666 /* Make sure the UTF8ness of the string and locale match */
9667 if (utf8 != PL_in_utf8_COLLATE_locale) {
9668 /* XXX convert above Unicode to 10FFFF? */
9669 const char * const t = s; /* Temporary so we can later find where the
9670 input was */
9671
9672 /* Here they don't match. Change the string's to be what the locale is
9673 * expecting */
9674
9675 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
9676 s = (char *) bytes_to_utf8((const U8 *) s, &len);
9677 utf8 = TRUE;
9678 }
9679 else { /* locale is not UTF-8; but input is; downgrade the input */
9680
9681 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
9682
9683 /* If the downgrade was successful we are done, but if the input
9684 * contains things that require UTF-8 to represent, have to do
9685 * damage control ... */
9686 if (UNLIKELY(utf8)) {
9687
9688 /* What we do is construct a non-UTF-8 string with
9689 * 1) the characters representable by a single byte converted
9690 * to be so (if necessary);
9691 * 2) and the rest converted to collate the same as the
9692 * highest collating representable character. That makes
9693 * them collate at the end. This is similar to how we
9694 * handle embedded NULs, but we use the highest collating
9695 * code point instead of the smallest. Like the NUL case,
9696 * this isn't perfect, but is the best we can reasonably
9697 * do. Every above-255 code point will sort the same as
9698 * the highest-sorting 0-255 code point. If that code
9699 * point can combine in a sequence with some other code
9700 * points for weight calculations, us changing something to
9701 * be it can adversely affect the results. But in most
9702 * cases, it should work reasonably. And note that this is
9703 * really an illegal situation: using code points above 255
9704 * on a locale where only 0-255 are valid. If two strings
9705 * sort entirely equal, then the sort order for the
9706 * above-255 code points will be in code point order. */
9707
9708 utf8 = FALSE;
9709
9710 /* If we haven't calculated the code point with the maximum
9711 * collating order for this locale, do so now */
9712 if (! PL_strxfrm_max_cp) {
9713 int j;
9714
9715 /* The current transformed string that collates the
9716 * highest (except it also includes the prefixed collation
9717 * index. */
9718 char * cur_max_x = NULL;
9719
9720 /* Look through all legal code points (NUL isn't) */
9721 for (j = 1; j < 256; j++) {
9722 char * x;
9723 STRLEN x_len;
9724 char cur_source[] = { '\0', '\0' };
9725
9726 /* Create a 1-char string of the current code point */
9727 cur_source[0] = (char) j;
9728
9729 /* Then transform it */
9730 x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
9731
9732 /* If something went wrong (which it shouldn't), just
9733 * ignore this code point */
9734 if (! x) {
9735 continue;
9736 }
9737
9738 /* If this character's transformation is higher than
9739 * the current highest, this one becomes the highest */
9740 if ( cur_max_x == NULL
9741 || strGT(x + COLLXFRM_HDR_LEN,
9742 cur_max_x + COLLXFRM_HDR_LEN))
9743 {
9744 PL_strxfrm_max_cp = j;
9745 Safefree(cur_max_x);
9746 cur_max_x = x;
9747 }
9748 else {
9749 Safefree(x);
9750 }
9751 }
9752
9753 if (! cur_max_x) {
9754 DEBUG_L(PerlIO_printf(Perl_debug_log,
9755 "mem_collxfrm_: Couldn't find any character to"
9756 " replace above-Latin1 chars in locale %s with",
9757 PL_collation_name));
9758 goto bad;
9759 }
9760
9761 DEBUG_L(PerlIO_printf(Perl_debug_log,
9762 "mem_collxfrm_: highest 1-byte collating character"
9763 " in locale %s is 0x%02X\n",
9764 PL_collation_name,
9765 PL_strxfrm_max_cp));
9766
9767 Safefree(cur_max_x);
9768 }
9769
9770 /* Here we know which legal code point collates the highest.
9771 * We are ready to construct the non-UTF-8 string. The length
9772 * will be at least 1 byte smaller than the input string
9773 * (because we changed at least one 2-byte character into a
9774 * single byte), but that is eaten up by the trailing NUL */
9775 Newx(s, len, char);
9776
9777 {
9778 STRLEN i;
9779 STRLEN d= 0;
9780 char * e = (char *) t + len;
9781
9782 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
9783 U8 cur_char = t[i];
9784 if (UTF8_IS_INVARIANT(cur_char)) {
9785 s[d++] = cur_char;
9786 }
9787 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
9788 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
9789 }
9790 else { /* Replace illegal cp with highest collating
9791 one */
9792 s[d++] = PL_strxfrm_max_cp;
9793 }
9794 }
9795 s[d++] = '\0';
9796 Renew(s, d, char); /* Free up unused space */
9797 }
9798 }
9799 }
9800
9801 /* Here, we have constructed a modified version of the input. It could
9802 * be that we already had a modified copy before we did this version.
9803 * If so, that copy is no longer needed */
9804 if (t != input_string) {
9805 Safefree(t);
9806 }
9807 }
9808
9809 length_in_chars = (utf8)
9810 ? utf8_length((U8 *) s, (U8 *) s + len)
9811 : len;
9812
9813 /* The first element in the output is the collation id, used by
9814 * sv_collxfrm(); then comes the space for the transformed string. The
9815 * equation should give us a good estimate as to how much is needed */
9816 xAlloc = COLLXFRM_HDR_LEN
9817 + PL_collxfrm_base
9818 + (PL_collxfrm_mult * length_in_chars);
9819 Newx(xbuf, xAlloc, char);
9820 if (UNLIKELY(! xbuf)) {
9821 DEBUG_L(PerlIO_printf(Perl_debug_log,
9822 "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
9823 goto bad;
9824 }
9825
9826 /* Store the collation id */
9827 *(PERL_UINTMAX_T *)xbuf = PL_collation_ix;
9828
9829 # if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
9830 # ifdef USE_LOCALE_CTYPE
9831
9832 constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
9833 duplocale(use_curlocale_scratch()));
9834 # else
9835
9836 constructed_locale = duplocale(use_curlocale_scratch());
9837
9838 # endif
9839 # define my_strxfrm(dest, src, n) strxfrm_l(dest, src, n, \
9840 constructed_locale)
9841 # define CLEANUP_STRXFRM \
9842 STMT_START { \
9843 if (constructed_locale != (locale_t) 0) \
9844 freelocale(constructed_locale); \
9845 } STMT_END
9846 # else
9847 # define my_strxfrm(dest, src, n) strxfrm(dest, src, n)
9848 # ifdef USE_LOCALE_CTYPE
9849
9850 orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
9851
9852 # define CLEANUP_STRXFRM \
9853 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
9854 # else
9855 # define CLEANUP_STRXFRM NOOP
9856 # endif
9857 # endif
9858
9859 /* Then the transformation of the input. We loop until successful, or we
9860 * give up */
9861 for (;;) {
9862
9863 errno = 0;
9864 *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN,
9865 s,
9866 xAlloc - COLLXFRM_HDR_LEN);
9867
9868
9869 /* If the transformed string occupies less space than we told strxfrm()
9870 * was available, it means it transformed the whole string. */
9871 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
9872
9873 /* But there still could have been a problem */
9874 if (errno != 0) {
9875 DEBUG_L(PerlIO_printf(Perl_debug_log,
9876 "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
9877 PL_collation_name, errno,
9878 _byte_dump_string((U8 *) s, len, 0)));
9879 goto bad;
9880 }
9881
9882 /* Here, the transformation was successful. Some systems include a
9883 * trailing NUL in the returned length. Ignore it, using a loop in
9884 * case multiple trailing NULs are returned. */
9885 while ( (*xlen) > 0
9886 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
9887 {
9888 (*xlen)--;
9889 }
9890
9891 /* If the first try didn't get it, it means our prediction was low.
9892 * Modify the coefficients so that we predict a larger value in any
9893 * future transformations */
9894 if (! first_time) {
9895 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
9896 STRLEN computed_guess = PL_collxfrm_base
9897 + (PL_collxfrm_mult * length_in_chars);
9898
9899 /* On zero-length input, just keep current slope instead of
9900 * dividing by 0 */
9901 const STRLEN new_m = (length_in_chars != 0)
9902 ? needed / length_in_chars
9903 : PL_collxfrm_mult;
9904
9905 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
9906 "initial size of %zu bytes for a length "
9907 "%zu string was insufficient, %zu needed\n",
9908 computed_guess, length_in_chars, needed));
9909
9910 /* If slope increased, use it, but discard this result for
9911 * length 1 strings, as we can't be sure that it's a real slope
9912 * change */
9913 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
9914
9915 # ifdef DEBUGGING
9916
9917 STRLEN old_m = PL_collxfrm_mult;
9918 STRLEN old_b = PL_collxfrm_base;
9919
9920 # endif
9921
9922 PL_collxfrm_mult = new_m;
9923 PL_collxfrm_base = 1; /* +1 For trailing NUL */
9924 computed_guess = PL_collxfrm_base
9925 + (PL_collxfrm_mult * length_in_chars);
9926 if (computed_guess < needed) {
9927 PL_collxfrm_base += needed - computed_guess;
9928 }
9929
9930 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
9931 "slope is now %zu; was %zu, base "
9932 "is now %zu; was %zu\n",
9933 PL_collxfrm_mult, old_m,
9934 PL_collxfrm_base, old_b));
9935 }
9936 else { /* Slope didn't change, but 'b' did */
9937 const STRLEN new_b = needed
9938 - computed_guess
9939 + PL_collxfrm_base;
9940 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
9941 "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
9942 PL_collxfrm_base = new_b;
9943 }
9944 }
9945
9946 break;
9947 }
9948
9949 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
9950 DEBUG_L(PerlIO_printf(Perl_debug_log,
9951 "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
9952 *xlen, PERL_INT_MAX));
9953 goto bad;
9954 }
9955
9956 /* A well-behaved strxfrm() returns exactly how much space it needs
9957 * (usually not including the trailing NUL) when it fails due to not
9958 * enough space being provided. Assume that this is the case unless
9959 * it's been proven otherwise */
9960 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
9961 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
9962 }
9963 else { /* Here, either:
9964 * 1) The strxfrm() has previously shown bad behavior; or
9965 * 2) It isn't the first time through the loop, which means
9966 * that the strxfrm() is now showing bad behavior, because
9967 * we gave it what it said was needed in the previous
9968 * iteration, and it came back saying it needed still more.
9969 * (Many versions of cygwin fit this. When the buffer size
9970 * isn't sufficient, they return the input size instead of
9971 * how much is needed.)
9972 * Increase the buffer size by a fixed percentage and try again.
9973 * */
9974 xAlloc += (xAlloc / 4) + 1;
9975 PL_strxfrm_is_behaved = FALSE;
9976
9977 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
9978 "mem_collxfrm_ required more space than previously"
9979 " calculated for locale %s, trying again with new"
9980 " guess=%zu+%zu\n",
9981 PL_collation_name, COLLXFRM_HDR_LEN,
9982 xAlloc - COLLXFRM_HDR_LEN));
9983 }
9984
9985 Renew(xbuf, xAlloc, char);
9986 if (UNLIKELY(! xbuf)) {
9987 DEBUG_L(PerlIO_printf(Perl_debug_log,
9988 "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
9989 goto bad;
9990 }
9991
9992 first_time = FALSE;
9993 }
9994
9995 CLEANUP_STRXFRM;
9996
9997 DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
9998
9999 /* Free up unneeded space; retain enough for trailing NUL */
10000 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
10001
10002 if (s != input_string) {
10003 Safefree(s);
10004 }
10005
10006 return xbuf;
10007
10008 bad:
10009
10010 CLEANUP_STRXFRM;
10011 DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
10012
10013 Safefree(xbuf);
10014 if (s != input_string) {
10015 Safefree(s);
10016 }
10017 *xlen = 0;
10018
10019 return NULL;
10020 }
10021
10022 # ifdef DEBUGGING
10023
10024 STATIC void
S_print_collxfrm_input_and_return(pTHX_ const char * s,const char * e,const char * xbuf,const STRLEN xlen,const bool is_utf8)10025 S_print_collxfrm_input_and_return(pTHX_
10026 const char * s,
10027 const char * e,
10028 const char * xbuf,
10029 const STRLEN xlen,
10030 const bool is_utf8)
10031 {
10032
10033 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
10034
10035 PerlIO_printf(Perl_debug_log,
10036 "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n"
10037 " input=%s\n return=%s\n return len=%zu\n",
10038 (UV) PL_collation_ix, PL_collation_name,
10039 get_displayable_string(s, e, is_utf8),
10040 ((xbuf == NULL)
10041 ? "(null)"
10042 : ((xlen == 0)
10043 ? "(empty)"
10044 : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
10045 xlen, 0))),
10046 xlen);
10047 }
10048
10049 # endif /* DEBUGGING */
10050
10051 SV *
Perl_strxfrm(pTHX_ SV * src)10052 Perl_strxfrm(pTHX_ SV * src)
10053 {
10054 PERL_ARGS_ASSERT_STRXFRM;
10055
10056 /* For use by POSIX::strxfrm(). If they differ, toggle LC_CTYPE to
10057 * LC_COLLATE to avoid potential mojibake.
10058 *
10059 * If we can't calculate a collation, 'src' is instead returned, so that
10060 * future comparisons will be by code point order */
10061
10062 # ifdef USE_LOCALE_CTYPE
10063
10064 const char * orig_ctype = toggle_locale_c(LC_CTYPE,
10065 querylocale_c(LC_COLLATE));
10066 # endif
10067
10068 SV * dst = src;
10069 STRLEN dstlen;
10070 STRLEN srclen;
10071 const char *p = SvPV_const(src, srclen);
10072 const U32 utf8_flag = SvUTF8(src);
10073 char *d = mem_collxfrm_(p, srclen, &dstlen, cBOOL(utf8_flag));
10074
10075 assert(utf8_flag == 0 || utf8_flag == SVf_UTF8);
10076
10077 if (d != NULL) {
10078 assert(dstlen > 0);
10079 dst =newSVpvn_flags(d + COLLXFRM_HDR_LEN,
10080 dstlen, SVs_TEMP|utf8_flag);
10081 Safefree(d);
10082 }
10083
10084 # ifdef USE_LOCALE_CTYPE
10085
10086 restore_toggled_locale_c(LC_CTYPE, orig_ctype);
10087
10088 # endif
10089
10090 return dst;
10091 }
10092
10093 #endif /* USE_LOCALE_COLLATE */
10094
10095 /* my_strerror() returns a mortalized copy of the text of the error message
10096 * associated with 'errnum'.
10097 *
10098 * If not called from within the scope of 'use locale', it uses the text from
10099 * the C locale. If Perl is compiled to not pay attention to LC_CTYPE nor
10100 * LC_MESSAGES, it uses whatever strerror() returns. Otherwise the text is
10101 * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
10102 *
10103 * It returns in *utf8ness the result's UTF-8ness
10104 *
10105 * The function just calls strerror(), but temporarily switches locales, if
10106 * needed. Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
10107 * CODESET in order for the return from strerror() to not contain '?' symbols,
10108 * or worse, mojibaked. It's cheaper to just use the stricter criteria of
10109 * being in the same locale. So the code below uses a common locale for both
10110 * categories. Again, that is C if not within 'use locale' scope; or the
10111 * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
10112 * don't have LC_MESSAGES; and whatever strerror returns if we don't have
10113 * either category.
10114 *
10115 * There are two sets of implementations. The first below is if we have
10116 * strerror_l(). This is the simpler. We just use the already-built C locale
10117 * object if not in locale scope, or build up a custom one otherwise.
10118 *
10119 * When strerror_l() is not available, we may have to swap locales temporarily
10120 * to bring the two categories into sync with each other, and possibly to the C
10121 * locale.
10122 *
10123 * Because the prepropessing directives to conditionally compile this function
10124 * would greatly obscure the logic of the various implementations, the whole
10125 * function is repeated for each configuration, with some common macros. */
10126
10127 /* Used to shorten the definitions of the following implementations of
10128 * my_strerror() */
10129 #define DEBUG_STRERROR_ENTER(errnum, in_locale) \
10130 DEBUG_Lv(PerlIO_printf(Perl_debug_log, \
10131 "my_strerror called with errnum %d;" \
10132 " Within locale scope=%d\n", \
10133 errnum, in_locale))
10134
10135 #define DEBUG_STRERROR_RETURN(errstr, utf8ness) \
10136 DEBUG_Lv(PerlIO_printf(Perl_debug_log, \
10137 "Strerror returned; saving a copy: '%s';" \
10138 " utf8ness=%d\n", \
10139 get_displayable_string(errstr, \
10140 errstr + strlen(errstr), \
10141 *utf8ness), \
10142 (int) *utf8ness))
10143
10144 /* On platforms that have precisely one of these categories (Windows
10145 * qualifies), these yield the correct one */
10146 #if defined(USE_LOCALE_CTYPE)
10147 # define WHICH_LC_INDEX LC_CTYPE_INDEX_
10148 #elif defined(USE_LOCALE_MESSAGES)
10149 # define WHICH_LC_INDEX LC_MESSAGES_INDEX_
10150 #endif
10151
10152 /*===========================================================================*/
10153 /* First set of implementations, when have strerror_l() */
10154
10155 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
10156
10157 # if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
10158
10159 /* Here, neither category is defined: use the C locale */
10160 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10161 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10162 {
10163 PERL_ARGS_ASSERT_MY_STRERROR;
10164
10165 DEBUG_STRERROR_ENTER(errnum, 0);
10166
10167 const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
10168 *utf8ness = UTF8NESS_IMMATERIAL;
10169
10170 DEBUG_STRERROR_RETURN(errstr, utf8ness);
10171
10172 SAVEFREEPV(errstr);
10173 return errstr;
10174 }
10175
10176 # elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
10177
10178 /*--------------------------------------------------------------------------*/
10179
10180 /* Here one or the other of CTYPE or MESSAGES is defined, but not both. If we
10181 * are not within 'use locale' scope of the only one defined, we use the C
10182 * locale; otherwise use the current locale object */
10183
10184 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10185 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10186 {
10187 PERL_ARGS_ASSERT_MY_STRERROR;
10188
10189 DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
10190
10191 /* Use C if not within locale scope; Otherwise, use current locale */
10192 const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
10193 ? PL_C_locale_obj
10194 : use_curlocale_scratch();
10195
10196 const char *errstr = savepv(strerror_l(errnum, which_obj));
10197 *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
10198 NULL, WHICH_LC_INDEX);
10199 DEBUG_STRERROR_RETURN(errstr, utf8ness);
10200
10201 SAVEFREEPV(errstr);
10202 return errstr;
10203 }
10204
10205 /*--------------------------------------------------------------------------*/
10206 # else /* Are using both categories. Place them in the same CODESET,
10207 * either C or the LC_MESSAGES locale */
10208
10209 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10210 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10211 {
10212 PERL_ARGS_ASSERT_MY_STRERROR;
10213
10214 DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
10215
10216 const char *errstr;
10217 if (! IN_LC(LC_MESSAGES)) { /* Use C if not within locale scope */
10218 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
10219 *utf8ness = UTF8NESS_IMMATERIAL;
10220 }
10221 else { /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
10222 matches */
10223 locale_t cur = duplocale(use_curlocale_scratch());
10224
10225 const char * locale = querylocale_c(LC_MESSAGES);
10226 cur = newlocale(LC_CTYPE_MASK, locale, cur);
10227 errstr = savepv(strerror_l(errnum, cur));
10228 *utf8ness = get_locale_string_utf8ness_i(errstr,
10229 LOCALE_UTF8NESS_UNKNOWN,
10230 locale,
10231 LC_MESSAGES_INDEX_);
10232 freelocale(cur);
10233 }
10234
10235 DEBUG_STRERROR_RETURN(errstr, utf8ness);
10236
10237 SAVEFREEPV(errstr);
10238 return errstr;
10239 }
10240 # endif /* Above is using strerror_l */
10241 /*===========================================================================*/
10242 #else /* Below is not using strerror_l */
10243 # if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
10244
10245 /* If not using using either of the categories, return plain, unadorned
10246 * strerror */
10247
10248 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10249 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10250 {
10251 PERL_ARGS_ASSERT_MY_STRERROR;
10252
10253 DEBUG_STRERROR_ENTER(errnum, 0);
10254
10255 const char *errstr = savepv(Strerror(errnum));
10256 *utf8ness = UTF8NESS_IMMATERIAL;
10257
10258 DEBUG_STRERROR_RETURN(errstr, utf8ness);
10259
10260 SAVEFREEPV(errstr);
10261 return errstr;
10262 }
10263
10264 /*--------------------------------------------------------------------------*/
10265 # elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
10266
10267 /* Here one or the other of CTYPE or MESSAGES is defined, but not both. If we
10268 * are not within 'use locale' scope of the only one defined, we use the C
10269 * locale; otherwise use the current locale */
10270
10271 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10272 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10273 {
10274 PERL_ARGS_ASSERT_MY_STRERROR;
10275
10276 DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
10277
10278 const char *errstr;
10279 if (IN_LC(categories[WHICH_LC_INDEX])) {
10280 errstr = savepv(Strerror(errnum));
10281 *utf8ness = get_locale_string_utf8ness_i(errstr,
10282 LOCALE_UTF8NESS_UNKNOWN,
10283 NULL, WHICH_LC_INDEX);
10284 }
10285 else {
10286
10287 LOCALE_LOCK;
10288
10289 const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
10290
10291 errstr = savepv(Strerror(errnum));
10292
10293 restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
10294
10295 LOCALE_UNLOCK;
10296
10297 *utf8ness = UTF8NESS_IMMATERIAL;
10298 }
10299
10300 DEBUG_STRERROR_RETURN(errstr, utf8ness);
10301
10302 SAVEFREEPV(errstr);
10303 return errstr;
10304 }
10305
10306 /*--------------------------------------------------------------------------*/
10307 # else
10308
10309 /* Below, have both LC_CTYPE and LC_MESSAGES. Place them in the same CODESET,
10310 * either C or the LC_MESSAGES locale */
10311
10312 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)10313 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
10314 {
10315 PERL_ARGS_ASSERT_MY_STRERROR;
10316
10317 DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
10318
10319 const char * desired_locale = (IN_LC(LC_MESSAGES))
10320 ? querylocale_c(LC_MESSAGES)
10321 : "C";
10322 /* XXX Can fail on z/OS */
10323
10324 LOCALE_LOCK;
10325
10326 const char* orig_CTYPE_locale = toggle_locale_c(LC_CTYPE,
10327 desired_locale);
10328 const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
10329 desired_locale);
10330 const char *errstr = savepv(Strerror(errnum));
10331
10332 restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
10333 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
10334
10335 LOCALE_UNLOCK;
10336
10337 *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
10338 desired_locale,
10339 LC_MESSAGES_INDEX_);
10340 DEBUG_STRERROR_RETURN(errstr, utf8ness);
10341
10342 SAVEFREEPV(errstr);
10343 return errstr;
10344 }
10345
10346 /*--------------------------------------------------------------------------*/
10347 # endif /* end of not using strerror_l() */
10348 #endif /* end of all the my_strerror() implementations */
10349
10350 bool
Perl__is_in_locale_category(pTHX_ const bool compiling,const int category)10351 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
10352 {
10353 /* Internal function which returns if we are in the scope of a pragma that
10354 * enables the locale category 'category'. 'compiling' should indicate if
10355 * this is during the compilation phase (TRUE) or not (FALSE). */
10356
10357 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
10358
10359 SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
10360 if (! these_categories || these_categories == &PL_sv_placeholder) {
10361 return FALSE;
10362 }
10363
10364 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
10365 * a valid unsigned */
10366 assert(category >= -1);
10367 return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
10368 }
10369
10370 /*
10371
10372 =for apidoc_section $locale
10373 =for apidoc switch_to_global_locale
10374
10375 This function copies the locale state of the calling thread into the program's
10376 global locale, and converts the thread to use that global locale.
10377
10378 It is intended so that Perl can safely be used with C libraries that access the
10379 global locale and which can't be converted to not access it. Effectively, this
10380 means libraries that call C<L<setlocale(3)>> on non-Windows systems. (For
10381 portability, it is a good idea to use it on Windows as well.)
10382
10383 A downside of using it is that it disables the services that Perl provides to
10384 hide locale gotchas from your code. The service you most likely will miss
10385 regards the radix character (decimal point) in floating point numbers. Code
10386 executed after this function is called can no longer just assume that this
10387 character is correct for the current circumstances.
10388
10389 To return to Perl control, and restart the gotcha prevention services, call
10390 C<L</sync_locale>>. Behavior is undefined for any pure Perl code that executes
10391 while the switch is in effect.
10392
10393 The global locale and the per-thread locales are independent. As long as just
10394 one thread converts to the global locale, everything works smoothly. But if
10395 more than one does, they can easily interfere with each other, and races are
10396 likely. On Windows systems prior to Visual Studio 15 (at which point Microsoft
10397 fixed a bug), races can occur (even if only one thread has been converted to
10398 the global locale), but only if you use the following operations:
10399
10400 =over
10401
10402 =item L<POSIX::localeconv|POSIX/localeconv>
10403
10404 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
10405
10406 =item L<perlapi/sv_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
10407
10408 =back
10409
10410 The first item is not fixable (except by upgrading to a later Visual Studio
10411 release), but it would be possible to work around the latter two items by
10412 having Perl change its algorithm for calculating these to use Windows API
10413 functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches
10414 welcome.
10415
10416 XS code should never call plain C<setlocale>, but should instead be converted
10417 to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in
10418 for the system C<setlocale>) or use the methods given in L<perlcall> to call
10419 L<C<POSIX::setlocale>|POSIX/setlocale>. Either one will transparently properly
10420 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
10421
10422 =cut
10423 */
10424
10425 #if defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)
10426 # define CHANGE_SYSTEM_LOCALE_TO_GLOBAL \
10427 STMT_START { \
10428 if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE) == -1) { \
10429 locale_panic_("_configthreadlocale returned an error"); \
10430 } \
10431 } STMT_END
10432 #elif defined(USE_POSIX_2008_LOCALE)
10433 # define CHANGE_SYSTEM_LOCALE_TO_GLOBAL \
10434 STMT_START { \
10435 locale_t old_locale = uselocale(LC_GLOBAL_LOCALE); \
10436 if (! old_locale) { \
10437 locale_panic_("Could not change to global locale"); \
10438 } \
10439 \
10440 /* Free the per-thread memory */ \
10441 if ( old_locale != LC_GLOBAL_LOCALE \
10442 && old_locale != PL_C_locale_obj) \
10443 { \
10444 freelocale(old_locale); \
10445 } \
10446 } STMT_END
10447 #else
10448 # define CHANGE_SYSTEM_LOCALE_TO_GLOBAL
10449 #endif
10450
10451 void
Perl_switch_to_global_locale(pTHX)10452 Perl_switch_to_global_locale(pTHX)
10453 {
10454
10455 #ifdef USE_LOCALE
10456
10457 DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n",
10458 get_LC_ALL_display()));
10459
10460 /* In these cases, we use the system state to determine if we are in the
10461 * global locale or not. */
10462 # ifdef USE_POSIX_2008_LOCALE
10463
10464 const bool perl_controls = (LC_GLOBAL_LOCALE != uselocale((locale_t) 0));
10465
10466 # elif defined(USE_THREAD_SAFE_LOCALE) && defined(WIN32)
10467
10468 int config_return = _configthreadlocale(0);
10469 if (config_return == -1) {
10470 locale_panic_("_configthreadlocale returned an error");
10471 }
10472 const bool perl_controls = (config_return == _ENABLE_PER_THREAD_LOCALE);
10473
10474 # else
10475
10476 const bool perl_controls = false;
10477
10478 # endif
10479
10480 /* No-op if already in global */
10481 if (! perl_controls) {
10482 return;
10483 }
10484
10485 # ifdef LC_ALL
10486
10487 const char * thread_locale = calculate_LC_ALL_string(NULL,
10488 EXTERNAL_FORMAT_FOR_SET,
10489 WANT_TEMP_PV,
10490 __LINE__);
10491 CHANGE_SYSTEM_LOCALE_TO_GLOBAL;
10492 posix_setlocale(LC_ALL, thread_locale);
10493
10494 # else /* Must be USE_POSIX_2008_LOCALE) */
10495
10496 const char * cur_thread_locales[LC_ALL_INDEX_];
10497
10498 /* Save each category's current per-thread state */
10499 for_all_individual_category_indexes(i) {
10500 cur_thread_locales[i] = querylocale_i(i);
10501 }
10502
10503 CHANGE_SYSTEM_LOCALE_TO_GLOBAL;
10504
10505 /* Set the global to what was our per-thread state */
10506 POSIX_SETLOCALE_LOCK;
10507 for_all_individual_category_indexes(i) {
10508 posix_setlocale(categories[i], cur_thread_locales[i]);
10509 }
10510 POSIX_SETLOCALE_UNLOCK;
10511
10512 # endif
10513 # ifdef USE_LOCALE_NUMERIC
10514
10515 /* Switch to the underlying C numeric locale; the application is on its
10516 * own. */
10517 POSIX_SETLOCALE_LOCK;
10518 posix_setlocale(LC_NUMERIC, PL_numeric_name);
10519 POSIX_SETLOCALE_UNLOCK;
10520
10521 # endif
10522 #endif
10523
10524 }
10525
10526 /*
10527
10528 =for apidoc sync_locale
10529
10530 This function copies the state of the program global locale into the calling
10531 thread, and converts that thread to using per-thread locales, if it wasn't
10532 already, and the platform supports them. The LC_NUMERIC locale is toggled into
10533 the standard state (using the C locale's conventions), if not within the
10534 lexical scope of S<C<use locale>>.
10535
10536 Perl will now consider itself to have control of the locale.
10537
10538 Since unthreaded perls have only a global locale, this function is a no-op
10539 without threads.
10540
10541 This function is intended for use with C libraries that do locale manipulation.
10542 It allows Perl to accommodate the use of them. Call this function before
10543 transferring back to Perl space so that it knows what state the C code has left
10544 things in.
10545
10546 XS code should not manipulate the locale on its own. Instead,
10547 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
10548 change the locale (though changing the locale is antisocial and dangerous on
10549 multi-threaded systems that don't have multi-thread safe locale operations.
10550 (See L<perllocale/Multi-threaded operation>).
10551
10552 Using the libc L<C<setlocale(3)>> function should be avoided. Nevertheless,
10553 certain non-Perl libraries called from XS, do call it, and their behavior may
10554 not be able to be changed. This function, along with
10555 C<L</switch_to_global_locale>>, can be used to get seamless behavior in these
10556 circumstances, as long as only one thread is involved.
10557
10558 If the library has an option to turn off its locale manipulation, doing that is
10559 preferable to using this mechanism. C<Gtk> is such a library.
10560
10561 The return value is a boolean: TRUE if the global locale at the time of call
10562 was in effect for the caller; and FALSE if a per-thread locale was in effect.
10563
10564 =cut
10565 */
10566
10567 bool
Perl_sync_locale(pTHX)10568 Perl_sync_locale(pTHX)
10569 {
10570
10571 #ifndef USE_LOCALE
10572
10573 return TRUE;
10574
10575 #else
10576
10577 bool was_in_global = TRUE;
10578
10579 # ifdef USE_THREAD_SAFE_LOCALE
10580 # if defined(WIN32)
10581
10582 int config_return = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
10583 if (config_return == -1) {
10584 locale_panic_("_configthreadlocale returned an error");
10585 }
10586 was_in_global = (config_return == _DISABLE_PER_THREAD_LOCALE);
10587
10588 # elif defined(USE_POSIX_2008_LOCALE)
10589
10590 was_in_global = (LC_GLOBAL_LOCALE == uselocale(LC_GLOBAL_LOCALE));
10591
10592 # else
10593 # error Unexpected Configuration
10594 # endif
10595 # endif /* USE_THREAD_SAFE_LOCALE */
10596
10597 /* Here, we are in the global locale. Get and save the values for each
10598 * category, and convert the current thread to use them */
10599
10600 # ifdef LC_ALL
10601
10602 STDIZED_SETLOCALE_LOCK;
10603 const char * lc_all_string = savepv(stdized_setlocale(LC_ALL, NULL));
10604 STDIZED_SETLOCALE_UNLOCK;
10605
10606 give_perl_locale_control(lc_all_string, __LINE__);
10607 Safefree(lc_all_string);
10608
10609 # else
10610
10611 const char * current_globals[LC_ALL_INDEX_];
10612 for_all_individual_category_indexes(i) {
10613 STDIZED_SETLOCALE_LOCK;
10614 current_globals[i] = savepv(stdized_setlocale(categories[i], NULL));
10615 STDIZED_SETLOCALE_UNLOCK;
10616 }
10617
10618 give_perl_locale_control((const char **) ¤t_globals, __LINE__);
10619
10620 for_all_individual_category_indexes(i) {
10621 Safefree(current_globals[i]);
10622 }
10623
10624 # endif
10625
10626 return was_in_global;
10627
10628 #endif
10629
10630 }
10631
10632 #ifdef USE_PERL_SWITCH_LOCALE_CONTEXT
10633
10634 void
Perl_switch_locale_context(pTHX)10635 Perl_switch_locale_context(pTHX)
10636 {
10637 /* libc keeps per-thread locale status information in some configurations.
10638 * So, we can't just switch out aTHX to switch to a new thread. libc has
10639 * to follow along. This routine does that based on per-interpreter
10640 * variables we keep just for this purpose.
10641 *
10642 * There are two implementations where this is an issue. For the other
10643 * implementations, it doesn't matter because libc is using global values
10644 * that all threads know about.
10645 *
10646 * The two implementations are where libc keeps thread-specific information
10647 * on its own. These are
10648 *
10649 * POSIX 2008: The current locale is kept by libc as an object. We save
10650 * a copy of that in the per-thread PL_cur_locale_obj, and so
10651 * this routine uses that copy to tell the thread it should be
10652 * operating with that object
10653 * Windows thread-safe locales: A given thread in Windows can be being run
10654 * with per-thread locales, or not. When the thread context
10655 * changes, libc doesn't automatically know if the thread is
10656 * using per-thread locales, nor does it know what the new
10657 * thread's locale is. We keep that information in the
10658 * per-thread variables:
10659 * PL_controls_locale indicates if this thread is using
10660 * per-thread locales or not
10661 * PL_cur_LC_ALL indicates what the locale should be
10662 * if it is a per-thread locale.
10663 */
10664
10665 if (UNLIKELY( PL_veto_switch_non_tTHX_context
10666 || PL_phase == PERL_PHASE_CONSTRUCT))
10667 {
10668 return;
10669 }
10670
10671 # ifdef USE_POSIX_2008_LOCALE
10672
10673 if (! uselocale(PL_cur_locale_obj)) {
10674 locale_panic_(Perl_form(aTHX_
10675 "Can't uselocale(%p), LC_ALL supposed to"
10676 " be '%s'",
10677 PL_cur_locale_obj, get_LC_ALL_display()));
10678 }
10679
10680 # elif defined(WIN32)
10681
10682 if (! bool_setlocale_c(LC_ALL, PL_cur_LC_ALL)) {
10683 locale_panic_(Perl_form(aTHX_ "Can't setlocale(%s)", PL_cur_LC_ALL));
10684 }
10685
10686 # endif
10687
10688 }
10689
10690 #endif
10691 #ifdef USE_THREADS
10692
10693 void
Perl_thread_locale_init(pTHX)10694 Perl_thread_locale_init(pTHX)
10695 {
10696
10697 # ifdef USE_THREAD_SAFE_LOCALE
10698 # ifdef USE_POSIX_2008_LOCALE
10699
10700 /* Called from a thread on startup.
10701 *
10702 * The operations here have to be done from within the calling thread, as
10703 * they affect libc's knowledge of the thread; libc has no knowledge of
10704 * aTHX */
10705
10706 DEBUG_L(PerlIO_printf(Perl_debug_log,
10707 "new thread, initial locale is %s;"
10708 " calling setlocale(LC_ALL, \"C\")\n",
10709 get_LC_ALL_display()));
10710
10711 if (! uselocale(PL_C_locale_obj)) {
10712
10713 /* Not being able to change to the C locale is severe; don't keep
10714 * going. */
10715 locale_panic_(Perl_form(aTHX_
10716 "Can't uselocale(%p), 'C'", PL_C_locale_obj));
10717 NOT_REACHED; /* NOTREACHED */
10718 }
10719
10720 PL_cur_locale_obj = PL_C_locale_obj;
10721
10722 # elif defined(WIN32)
10723
10724 /* On Windows, make sure new thread has per-thread locales enabled */
10725 if (_configthreadlocale(_ENABLE_PER_THREAD_LOCALE) == -1) {
10726 locale_panic_("_configthreadlocale returned an error");
10727 }
10728 void_setlocale_c(LC_ALL, "C");
10729
10730 # endif
10731 # endif
10732
10733 }
10734
10735 void
Perl_thread_locale_term(pTHX)10736 Perl_thread_locale_term(pTHX)
10737 {
10738 /* Called from a thread as it gets ready to terminate.
10739 *
10740 * The operations here have to be done from within the calling thread, as
10741 * they affect libc's knowledge of the thread; libc has no knowledge of
10742 * aTHX */
10743
10744 # if defined(USE_POSIX_2008_LOCALE)
10745
10746 /* Switch to the global locale, so can free up the per-thread object */
10747 locale_t actual_obj = uselocale(LC_GLOBAL_LOCALE);
10748 if (actual_obj != LC_GLOBAL_LOCALE && actual_obj != PL_C_locale_obj) {
10749 freelocale(actual_obj);
10750 }
10751
10752 /* Prevent leaks even if something has gone wrong */
10753 locale_t expected_obj = PL_cur_locale_obj;
10754 if (UNLIKELY( expected_obj != actual_obj
10755 && expected_obj != LC_GLOBAL_LOCALE
10756 && expected_obj != PL_C_locale_obj))
10757 {
10758 freelocale(expected_obj);
10759 }
10760
10761 PL_cur_locale_obj = LC_GLOBAL_LOCALE;
10762
10763 # endif
10764 # ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
10765
10766 /* When faking the mingw implementation, we coerce this function into doing
10767 * something completely different from its intent -- namely to free up our
10768 * static buffer to avoid a leak. This function gets called for each
10769 * thread that is terminating, so will give us a chance to free the buffer
10770 * from the appropriate pool. On unthreaded systems, it gets called by the
10771 * mutex termination code. */
10772
10773 if (aTHX != wsetlocale_buf_aTHX) {
10774 return;
10775 }
10776
10777 if (wsetlocale_buf_size > 0) {
10778 Safefree(wsetlocale_buf);
10779 wsetlocale_buf_size = 0;
10780 }
10781
10782 # endif
10783
10784 }
10785
10786 #endif
10787
10788 /*
10789 * ex: set ts=8 sts=4 sw=4 et:
10790 */
10791