1 /* locale.c
2 *
3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11 /*
12 * A Elbereth Gilthoniel,
13 * silivren penna míriel
14 * o menel aglar elenath!
15 * Na-chaered palan-díriel
16 * o galadhremmin ennorath,
17 * Fanuilos, le linnathon
18 * nef aear, si nef aearon!
19 *
20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
21 */
22
23 /* utility functions for handling locale-specific stuff like what
24 * character represents the decimal point.
25 *
26 * All C programs have an underlying locale. Perl code generally doesn't pay
27 * any attention to it except within the scope of a 'use locale'. For most
28 * categories, it accomplishes this by just using different operations if it is
29 * in such scope than if not. However, various libc functions called by Perl
30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that
31 * are used to toggle between the current locale and the C locale depending on
32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is
33 * switched to the C locale for outputting the message unless within the scope
34 * of 'use locale'.
35 *
36 * This code now has multi-thread-safe locale handling on systems that support
37 * that. This is completely transparent to most XS code. On earlier systems,
38 * it would be possible to emulate thread-safe locales, but this likely would
39 * involve a lot of locale switching, and would require XS code changes.
40 * Macros could be written so that the code wouldn't have to know which type of
41 * system is being used.
42 *
43 * Table-driven code is used for simplicity and clarity, as many operations
44 * differ only in which category is being worked on. However the system
45 * categories need not be small contiguous integers, so do not lend themselves
46 * to table lookup. Instead we have created our own equivalent values which
47 * are all small contiguous non-negative integers, and translation functions
48 * between the two sets. For category 'LC_foo', the name of our index is
49 * LC_foo_INDEX_. Various parallel tables, indexed by these, are used.
50 *
51 * Many of the macros and functions in this file have one of the suffixes '_c',
52 * '_r', or '_i'. khw found these useful in remembering what type of locale
53 * category to use as their parameter. '_r' takes an int category number as
54 * passed to setlocale(), like LC_ALL, LC_CTYPE, etc. The 'r' indicates that
55 * the value isn't known until runtime. '_c' also indicates such a category
56 * number, but its value is known at compile time. These are both converted
57 * into unsigned indexes into various tables of category information, where the
58 * real work is generally done. The tables are generated at compile-time based
59 * on platform characteristics and Configure options. They hide from the code
60 * many of the vagaries of the different locale implementations out there. You
61 * may have already guessed that '_i' indicates the parameter is such an
62 * unsigned index. Converting from '_r' to '_i' requires run-time lookup.
63 * '_c' is used to get cpp to do this at compile time. To avoid the runtime
64 * expense, the code is structured to use '_r' at the API level, and once
65 * converted, everything possible is done using the table indexes.
66 *
67 * On unthreaded perls, most operations expand out to just the basic
68 * setlocale() calls. The same is true on threaded perls on modern Windows
69 * systems where the same API, after set up, is used for thread-safe locale
70 * handling. On other systems, there is a completely different API, specified
71 * in POSIX 2008, to do thread-safe locales. On these systems, our
72 * emulate_setlocale_i() function is used to hide the different API from the
73 * outside. This makes it completely transparent to most XS code.
74 *
75 * A huge complicating factor is that the LC_NUMERIC category is normally held
76 * in the C locale, except during those relatively rare times when it needs to
77 * be in the underlying locale. There is a bunch of code to accomplish this,
78 * and to allow easy switches from one state to the other.
79 *
80 * In addition, the setlocale equivalents have versions for the return context,
81 * 'void' and 'bool', besides the full return value. This can present
82 * opportunities for avoiding work. We don't have to necessarily create a safe
83 * copy to return if no return is desired.
84 *
85 * There are 3.5 major implementations here; which one chosen depends on what
86 * the platform has available, and Configuration options.
87 *
88 * 1) Raw my_setlocale(). Here the layer adds nothing. This is used for
89 * unthreaded perls, and when the API for safe locale threading is identical
90 * to the unsafe API (Windows, currently).
91 *
92 * 2) A minimal layer that makes my_setlocale() uninterruptible and returns a
93 * per-thread/per-category value.
94 *
95 * 3a and 3b) A layer that implements POSIX 2008 thread-safe locale handling,
96 * mapping the setlocale() API to them. This automatically makes almost all
97 * code thread-safe without need for changes. This layer is chosen on
98 * threaded perls when the platform supports the POSIX 2008 functions, and
99 * when there is no manual override in Configure.
100 *
101 * 3a) is when the platform has a reliable querylocale() function or
102 * equivalent that is selected to be used.
103 * 3b) is when we have to emulate that functionality.
104 *
105 * z/OS (os390) is an outlier. Locales really don't work under threads when
106 * either the radix character isn't a dot, or attempts are made to change
107 * locales after the first thread is created. The reason is that IBM has made
108 * it thread-safe by refusing to change locales (returning failure if
109 * attempted) any time after an application has called pthread_create() to
110 * create another thread. The expectation is that an application will set up
111 * its locale information before the first fork, and be stable thereafter. But
112 * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
113 * the other toggles, which are less common.
114 */
115
116 /* If the environment says to, we can output debugging information during
117 * initialization. This is done before option parsing, and before any thread
118 * creation, so can be a file-level static. (Must come before #including
119 * perl.h) */
120 #ifdef DEBUGGING
121 static int debug_initialization = 0;
122 # define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
123 # define DEBUG_LOCALE_INITIALIZATION_ debug_initialization
124 /* C standards seem to say that __LINE__ is merely "an integer constant",
125 * which means it might be either int, long (with L suffix), or long long
126 * (or their corresponding unsigned type). So, we have to explicitly cast
127 * __LINE__ to a particular integer type to pass it reliably to variadic
128 * functions like (PerlIO_)printf, as below: */
129 # ifdef USE_LOCALE_THREADS
130 # define DEBUG_PRE_STMTS \
131 dSAVE_ERRNO; dTHX; PerlIO_printf(Perl_debug_log,"\n%s: %" LINE_Tf ": %p: ",\
132 __FILE__, (line_t)__LINE__, aTHX);
133 # else
134 # define DEBUG_PRE_STMTS \
135 dSAVE_ERRNO; dTHX; PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": ", \
136 __FILE__, (line_t)__LINE__);
137 # endif
138 # define DEBUG_POST_STMTS RESTORE_ERRNO;
139 #else
140 # define debug_initialization 0
141 # define DEBUG_INITIALIZATION_set(v)
142 # define DEBUG_PRE_STMTS
143 # define DEBUG_POST_STMTS
144 #endif
145
146 #include "EXTERN.h"
147 #define PERL_IN_LOCALE_C
148 #include "perl.h"
149
150 #include "reentr.h"
151
152 #ifdef I_WCHAR
153 # include <wchar.h>
154 #endif
155 #ifdef I_WCTYPE
156 # include <wctype.h>
157 #endif
158
159 /* The main errno that gets used is this one, on platforms that support it */
160 #ifdef EINVAL
161 # define SET_EINVAL SETERRNO(EINVAL, LIB_INVARG)
162 #else
163 # define SET_EINVAL
164 #endif
165
166 /* If we have any of these library functions, we can reliably determine is a
167 * locale is a UTF-8 one or not. And if we aren't using locales at all, we act
168 * as if everything is the C locale, so the answer there is always "No, it
169 * isn't UTF-8"; this too is reliably accurate */
170 #if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) \
171 || ! defined(USE_LOCALE)
172 # define HAS_RELIABLE_UTF8NESS_DETERMINATION
173 #endif
174
175 #ifdef USE_LOCALE
176
177 PERL_STATIC_INLINE const char *
S_mortalized_pv_copy(pTHX_ const char * const pv)178 S_mortalized_pv_copy(pTHX_ const char * const pv)
179 {
180 PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
181
182 /* Copies the input pv, and arranges for it to be freed at an unspecified
183 * later time. */
184
185 if (pv == NULL) {
186 return NULL;
187 }
188
189 const char * copy = savepv(pv);
190 SAVEFREEPV(copy);
191 return copy;
192 }
193
194 #endif
195
196 /* Returns the Unix errno portion; ignoring any others. This is a macro here
197 * instead of putting it into perl.h, because unclear to khw what should be
198 * done generally. */
199 #define GET_ERRNO saved_errno
200
201 /* Default values come from the C locale */
202 #define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
203 a single instance, so is a #define */
204 static const char C_decimal_point[] = ".";
205 static const char C_thousands_sep[] = "";
206
207 /* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
208 * return of setlocale(), then this is extremely likely to be the C or POSIX
209 * locale. However, the output of setlocale() is documented to be opaque, but
210 * the odds are extremely small that it would return these two strings for some
211 * other locale. Note that VMS in these two locales includes many non-ASCII
212 * characters as controls and punctuation (below are hex bytes):
213 * cntrl: 84-97 9B-9F
214 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
215 * Oddly, none there are listed as alphas, though some represent alphabetics
216 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
217 #define isNAME_C_OR_POSIX(name) \
218 ( (name) != NULL \
219 && (( *(name) == 'C' && (*(name + 1)) == '\0') \
220 || strEQ((name), "POSIX")))
221
222 #if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
223 # define HAS_SOME_LANGINFO
224 #endif
225
226 #define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
227 my_langinfo_i(item, category##_INDEX_, locale, retbufp, \
228 retbuf_sizep, utf8ness)
229
230 #ifdef USE_LOCALE
231
232 # ifdef DEBUGGING
233 # define setlocale_debug_string_i(index, locale, result) \
234 my_setlocale_debug_string_i(index, locale, result, __LINE__)
235 # define setlocale_debug_string_c(category, locale, result) \
236 setlocale_debug_string_i(category##_INDEX_, locale, result)
237 # define setlocale_debug_string_r(category, locale, result) \
238 setlocale_debug_string_i(get_category_index(category, locale), \
239 locale, result)
240 # endif
241
242 # define toggle_locale_i(index, locale) \
243 S_toggle_locale_i(aTHX_ index, locale, __LINE__)
244 # define toggle_locale_c(cat, locale) toggle_locale_i(cat##_INDEX_, locale)
245 # define restore_toggled_locale_i(index, locale) \
246 S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
247 # define restore_toggled_locale_c(cat, locale) \
248 restore_toggled_locale_i(cat##_INDEX_, locale)
249
250 /* Two parallel arrays indexed by our mapping of category numbers into small
251 * non-negative indexes; first the locale categories Perl uses on this system,
252 * used to do the inverse mapping. The second array is their names. These
253 * arrays are in mostly arbitrary order. */
254
255 STATIC const int categories[] = {
256
257 # ifdef USE_LOCALE_CTYPE
258 LC_CTYPE,
259 # endif
260 # ifdef USE_LOCALE_NUMERIC
261 LC_NUMERIC,
262 # endif
263 # ifdef USE_LOCALE_COLLATE
264 LC_COLLATE,
265 # endif
266 # ifdef USE_LOCALE_TIME
267 LC_TIME,
268 # endif
269 # ifdef USE_LOCALE_MESSAGES
270 LC_MESSAGES,
271 # endif
272 # ifdef USE_LOCALE_MONETARY
273 LC_MONETARY,
274 # endif
275 # ifdef USE_LOCALE_ADDRESS
276 LC_ADDRESS,
277 # endif
278 # ifdef USE_LOCALE_IDENTIFICATION
279 LC_IDENTIFICATION,
280 # endif
281 # ifdef USE_LOCALE_MEASUREMENT
282 LC_MEASUREMENT,
283 # endif
284 # ifdef USE_LOCALE_PAPER
285 LC_PAPER,
286 # endif
287 # ifdef USE_LOCALE_TELEPHONE
288 LC_TELEPHONE,
289 # endif
290 # ifdef USE_LOCALE_NAME
291 LC_NAME,
292 # endif
293 # ifdef USE_LOCALE_SYNTAX
294 LC_SYNTAX,
295 # endif
296 # ifdef USE_LOCALE_TOD
297 LC_TOD,
298 # endif
299 # ifdef LC_ALL
300 LC_ALL,
301 # endif
302
303 /* Placeholder as a precaution if code fails to check the return of
304 * get_category_index(), which returns this element to indicate an error */
305 -1
306 };
307
308 /* The top-most real element is LC_ALL */
309
310 STATIC const char * const category_names[] = {
311
312 # ifdef USE_LOCALE_CTYPE
313 "LC_CTYPE",
314 # endif
315 # ifdef USE_LOCALE_NUMERIC
316 "LC_NUMERIC",
317 # endif
318 # ifdef USE_LOCALE_COLLATE
319 "LC_COLLATE",
320 # endif
321 # ifdef USE_LOCALE_TIME
322 "LC_TIME",
323 # endif
324 # ifdef USE_LOCALE_MESSAGES
325 "LC_MESSAGES",
326 # endif
327 # ifdef USE_LOCALE_MONETARY
328 "LC_MONETARY",
329 # endif
330 # ifdef USE_LOCALE_ADDRESS
331 "LC_ADDRESS",
332 # endif
333 # ifdef USE_LOCALE_IDENTIFICATION
334 "LC_IDENTIFICATION",
335 # endif
336 # ifdef USE_LOCALE_MEASUREMENT
337 "LC_MEASUREMENT",
338 # endif
339 # ifdef USE_LOCALE_PAPER
340 "LC_PAPER",
341 # endif
342 # ifdef USE_LOCALE_TELEPHONE
343 "LC_TELEPHONE",
344 # endif
345 # ifdef USE_LOCALE_NAME
346 "LC_NAME",
347 # endif
348 # ifdef USE_LOCALE_SYNTAX
349 "LC_SYNTAX",
350 # endif
351 # ifdef USE_LOCALE_TOD
352 "LC_TOD",
353 # endif
354 # ifdef LC_ALL
355 "LC_ALL",
356 # endif
357
358 /* Placeholder as a precaution if code fails to check the return of
359 * get_category_index(), which returns this element to indicate an error */
360 NULL
361 };
362
363 /* A few categories require additional setup when they are changed. This table
364 * points to the functions that do that setup */
365 STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = {
366 # ifdef USE_LOCALE_CTYPE
367 S_new_ctype,
368 # endif
369 # ifdef USE_LOCALE_NUMERIC
370 S_new_numeric,
371 # endif
372 # ifdef USE_LOCALE_COLLATE
373 S_new_collate,
374 # endif
375 # ifdef USE_LOCALE_TIME
376 NULL,
377 # endif
378 # ifdef USE_LOCALE_MESSAGES
379 NULL,
380 # endif
381 # ifdef USE_LOCALE_MONETARY
382 NULL,
383 # endif
384 # ifdef USE_LOCALE_ADDRESS
385 NULL,
386 # endif
387 # ifdef USE_LOCALE_IDENTIFICATION
388 NULL,
389 # endif
390 # ifdef USE_LOCALE_MEASUREMENT
391 NULL,
392 # endif
393 # ifdef USE_LOCALE_PAPER
394 NULL,
395 # endif
396 # ifdef USE_LOCALE_TELEPHONE
397 NULL,
398 # endif
399 # ifdef USE_LOCALE_NAME
400 NULL,
401 # endif
402 # ifdef USE_LOCALE_SYNTAX
403 NULL,
404 # endif
405 # ifdef USE_LOCALE_TOD
406 NULL,
407 # endif
408 /* No harm done to have this even without an LC_ALL */
409 S_new_LC_ALL,
410
411 /* Placeholder as a precaution if code fails to check the return of
412 * get_category_index(), which returns this element to indicate an error */
413 NULL
414 };
415
416 # ifdef LC_ALL
417
418 /* On systems with LC_ALL, it is kept in the highest index position. (-2
419 * to account for the final unused placeholder element.) */
420 # define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
421 # else
422
423 /* On systems without LC_ALL, we pretend it is there, one beyond the real
424 * top element, hence in the unused placeholder element. */
425 # define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
426 # endif
427
428 /* Pretending there is an LC_ALL element just above allows us to avoid most
429 * special cases. Most loops through these arrays in the code below are
430 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
431 * on either type of system. But the code must be written to not access the
432 * element at 'LC_ALL_INDEX_' except on platforms that have it. This can be
433 * checked for at compile time by using the #define LC_ALL_INDEX_ which is only
434 * defined if we do have LC_ALL. */
435
436 STATIC int
S_get_category_index_nowarn(const int category)437 S_get_category_index_nowarn(const int category)
438 {
439 /* Given a category, return the equivalent internal index we generally use
440 * instead, or negative if not found.
441 *
442 * Some sort of hash could be used instead of this loop, but the number of
443 * elements is so far at most 12 */
444
445 unsigned int i;
446
447 PERL_ARGS_ASSERT_GET_CATEGORY_INDEX;
448
449 # ifdef LC_ALL
450 for (i = 0; i <= LC_ALL_INDEX_; i++)
451 # else
452 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)
453 # endif
454 {
455 if (category == categories[i]) {
456 dTHX_DEBUGGING;
457 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
458 "index of category %d (%s) is %d\n",
459 category, category_names[i], i));
460 return i;
461 }
462 }
463
464 return -1;
465 }
466
467 STATIC unsigned int
S_get_category_index(const int category,const char * locale)468 S_get_category_index(const int category, const char * locale)
469 {
470 /* Given a category, return the equivalent internal index we generally use
471 * instead.
472 *
473 * 'locale' is for use in any generated diagnostics, and may be NULL
474 */
475
476 const char * conditional_warn_text = "; can't set it to ";
477 const int index = get_category_index_nowarn(category);
478
479 if (index >= 0) {
480 return index;
481 }
482
483 /* Here, we don't know about this category, so can't handle it. */
484
485 if (! locale) {
486 locale = "";
487 conditional_warn_text = "";
488 }
489
490 /* diag_listed_as: Unknown locale category %d; can't set it to %s */
491 Perl_warner_nocontext(packWARN(WARN_LOCALE),
492 "Unknown locale category %d%s%s",
493 category, conditional_warn_text, locale);
494
495 SET_EINVAL;
496
497 /* Return an out-of-bounds value */
498 return NOMINAL_LC_ALL_INDEX + 1;
499 }
500
501 #endif /* ifdef USE_LOCALE */
502
503 void
Perl_force_locale_unlock()504 Perl_force_locale_unlock()
505 {
506
507 #if defined(USE_LOCALE_THREADS)
508
509 dTHX;
510
511 /* If recursively locked, clear all at once */
512 if (PL_locale_mutex_depth > 1) {
513 PL_locale_mutex_depth = 1;
514 }
515
516 if (PL_locale_mutex_depth > 0) {
517 LOCALE_UNLOCK_;
518 }
519
520 #endif
521
522 }
523
524 #ifdef USE_POSIX_2008_LOCALE
525
526 STATIC locale_t
S_use_curlocale_scratch(pTHX)527 S_use_curlocale_scratch(pTHX)
528 {
529 /* This function is used to hide from the caller the case where the current
530 * locale_t object in POSIX 2008 is the global one, which is illegal in
531 * many of the P2008 API calls. This checks for that and, if necessary
532 * creates a proper P2008 object. Any prior object is deleted, as is any
533 * remaining object during global destruction. */
534
535 locale_t cur = uselocale((locale_t) 0);
536
537 if (cur != LC_GLOBAL_LOCALE) {
538 return cur;
539 }
540
541 if (PL_scratch_locale_obj) {
542 freelocale(PL_scratch_locale_obj);
543 }
544
545 PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
546 return PL_scratch_locale_obj;
547 }
548
549 #endif
550
551 void
Perl_locale_panic(const char * msg,const char * file_name,const line_t line,const int errnum)552 Perl_locale_panic(const char * msg,
553 const char * file_name,
554 const line_t line,
555 const int errnum)
556 {
557 dTHX;
558
559 PERL_ARGS_ASSERT_LOCALE_PANIC;
560
561 force_locale_unlock();
562
563 #ifdef USE_C_BACKTRACE
564 dump_c_backtrace(Perl_debug_log, 20, 1);
565 #endif
566
567 /* diag_listed_as: panic: %s */
568 Perl_croak(aTHX_ "%s: %" LINE_Tf ": panic: %s; errno=%d\n",
569 file_name, line, msg, errnum);
570 }
571
572 #define setlocale_failure_panic_c( \
573 cat, current, failed, caller_0_line, caller_1_line) \
574 setlocale_failure_panic_i(cat##_INDEX_, current, failed, \
575 caller_0_line, caller_1_line)
576
577 /* posix_setlocale() presents a consistent POSIX-compliant interface to
578 * setlocale(). Windows requres a customized base-level setlocale(). Any
579 * necessary mutex locking needs to be done at a higher level */
580 #ifdef WIN32
581 # define posix_setlocale(cat, locale) win32_setlocale(cat, locale)
582 #else
583 # define posix_setlocale(cat, locale) ((const char *) setlocale(cat, locale))
584 #endif
585
586 /* The next layer up is to catch vagaries and bugs in the libc setlocale return
587 * value. Again, any necessary mutex locking needs to be done at a higher
588 * level */
589 #ifdef stdize_locale
590 # define stdized_setlocale(cat, locale) \
591 stdize_locale(cat, posix_setlocale(cat, locale), \
592 &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__)
593 #else
594 # define stdized_setlocale(cat, locale) posix_setlocale(cat, locale)
595 #endif
596
597 /* The next many lines form a layer above the close-to-the-metal 'posix'
598 * and 'stdized' macros. They are used to present a uniform API to the rest of
599 * the code in this file in spite of the disparate underlying implementations.
600 * */
601
602 #if (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE)) \
603 || ( defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE))
604
605 /* For non-threaded perls, the added layer just expands to the base-level
606 * functions, except if we are supposed to use the POSIX 2008 interface anyway.
607 * On perls where threading is invisible to us, the base-level functions are
608 * used regardless of threading. Currently this is only on later Windows
609 * versions.
610 *
611 * See the introductory comments in this file for the meaning of the suffixes
612 * '_c', '_r', '_i'. */
613
614 # define setlocale_r(cat, locale) stdized_setlocale(cat, locale)
615 # define setlocale_i(i, locale) setlocale_r(categories[i], locale)
616 # define setlocale_c(cat, locale) setlocale_r(cat, locale)
617
618 # define void_setlocale_i(i, locale) \
619 STMT_START { \
620 if (! posix_setlocale(categories[i], locale)) { \
621 setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0); \
622 NOT_REACHED; /* NOTREACHED */ \
623 } \
624 } STMT_END
625 # define void_setlocale_c(cat, locale) \
626 void_setlocale_i(cat##_INDEX_, locale)
627 # define void_setlocale_r(cat, locale) \
628 void_setlocale_i(get_category_index(cat, locale), locale)
629
630 # define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale))
631 # define bool_setlocale_i(i, locale) \
632 bool_setlocale_c(categories[i], locale)
633 # define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
634
635 /* All the querylocale...() forms return a mortalized copy. If you need
636 * something stable across calls, you need to savepv() the result yourself */
637
638 # define querylocale_r(cat) mortalized_pv_copy(setlocale_r(cat, NULL))
639 # define querylocale_c(cat) querylocale_r(cat)
640 # define querylocale_i(i) querylocale_c(categories[i])
641
642 #elif defined(USE_LOCALE_THREADS) \
643 && ! defined(USE_THREAD_SAFE_LOCALE)
644
645 /* Here, there are threads, and there is no support for thread-safe
646 * operation. This is a dangerous situation, which perl is documented as
647 * not supporting, but it arises in practice. We can do a modicum of
648 * automatic mitigation by making sure there is a per-thread return from
649 * setlocale(), and that a mutex protects it from races */
650 STATIC const char *
S_less_dicey_setlocale_r(pTHX_ const int category,const char * locale)651 S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale)
652 {
653 const char * retval;
654
655 PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R;
656
657 POSIX_SETLOCALE_LOCK;
658
659 retval = stdized_setlocale(category, locale);
660
661 /* We reuse PL_stdize_locale_buf as it doesn't conflict, but the call may
662 * already have used it, in which case we don't have to do anything further
663 * */
664 retval = save_to_buffer(retval,
665 &PL_stdize_locale_buf, &PL_stdize_locale_bufsize);
666
667 POSIX_SETLOCALE_UNLOCK;
668
669 return retval;
670 }
671
672 # define setlocale_r(cat, locale) less_dicey_setlocale_r(cat, locale)
673 # define setlocale_c(cat, locale) setlocale_r(cat, locale)
674 # define setlocale_i(i, locale) setlocale_r(categories[i], locale)
675
676 # define querylocale_r(cat) mortalized_pv_copy(setlocale_r(cat, NULL))
677 # define querylocale_c(cat) querylocale_r(cat)
678 # define querylocale_i(i) querylocale_r(categories[i])
679
680 STATIC void
S_less_dicey_void_setlocale_i(pTHX_ const unsigned cat_index,const char * locale,const line_t line)681 S_less_dicey_void_setlocale_i(pTHX_ const unsigned cat_index,
682 const char * locale,
683 const line_t line)
684 {
685 PERL_ARGS_ASSERT_LESS_DICEY_VOID_SETLOCALE_I;
686
687 POSIX_SETLOCALE_LOCK;
688 if (! posix_setlocale(categories[cat_index], locale)) {
689 POSIX_SETLOCALE_UNLOCK;
690 setlocale_failure_panic_i(cat_index, NULL, locale, __LINE__, line);
691 }
692 POSIX_SETLOCALE_UNLOCK;
693 }
694
695 # define void_setlocale_i(i, locale) \
696 less_dicey_void_setlocale_i(i, locale, __LINE__)
697 # define void_setlocale_c(cat, locale) \
698 void_setlocale_i(cat##_INDEX_, locale)
699 # define void_setlocale_r(cat, locale) \
700 void_setlocale_i(get_category_index(cat, locale), locale)
701
702 # if 0 /* Not currently used */
703
704 STATIC bool
705 S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale)
706 {
707 bool retval;
708
709 PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R;
710
711 POSIX_SETLOCALE_LOCK;
712 retval = cBOOL(posix_setlocale(cat, locale));
713 POSIX_SETLOCALE_UNLOCK;
714
715 return retval;
716 }
717
718 # endif
719 # define bool_setlocale_r(cat, locale) \
720 less_dicey_bool_setlocale_r(cat, locale)
721 # define bool_setlocale_i(i, locale) \
722 bool_setlocale_r(categories[i], locale)
723 # define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
724 #else
725
726 /* Here, there is a completely different API to get thread-safe locales. We
727 * emulate the setlocale() API with our own function(s). setlocale categories,
728 * like LC_NUMERIC, are not valid here for the POSIX 2008 API. Instead, there
729 * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to
730 * by using get_category_index() followed by table lookup. */
731
732 # define emulate_setlocale_c(cat, locale, recalc_LC_ALL, line) \
733 emulate_setlocale_i(cat##_INDEX_, locale, recalc_LC_ALL, line)
734
735 /* A wrapper for the macros below. */
736 # define common_emulate_setlocale(i, locale) \
737 emulate_setlocale_i(i, locale, YES_RECALC_LC_ALL, __LINE__)
738
739 # define setlocale_i(i, locale) \
740 save_to_buffer(common_emulate_setlocale(i, locale), \
741 &PL_stdize_locale_buf, \
742 &PL_stdize_locale_bufsize)
743 # define setlocale_c(cat, locale) setlocale_i(cat##_INDEX_, locale)
744 # define setlocale_r(cat, locale) \
745 setlocale_i(get_category_index(cat, locale), locale)
746
747 # define void_setlocale_i(i, locale) \
748 ((void) common_emulate_setlocale(i, locale))
749 # define void_setlocale_c(cat, locale) \
750 void_setlocale_i(cat##_INDEX_, locale)
751 # define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale))
752
753 # define bool_setlocale_i(i, locale) \
754 cBOOL(common_emulate_setlocale(i, locale))
755 # define bool_setlocale_c(cat, locale) \
756 bool_setlocale_i(cat##_INDEX_, locale)
757 # define bool_setlocale_r(cat, locale) cBOOL(setlocale_r(cat, locale))
758
759 # define querylocale_i(i) mortalized_pv_copy(my_querylocale_i(i))
760 # define querylocale_c(cat) querylocale_i(cat##_INDEX_)
761 # define querylocale_r(cat) querylocale_i(get_category_index(cat,NULL))
762
763 # ifdef USE_QUERYLOCALE
764 # define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask)
765
766 /* This code used to think querylocale() was valid on LC_ALL. Make sure
767 * all instances of that have been removed */
768 # define QUERYLOCALE_ASSERT(index) \
769 __ASSERT_(isSINGLE_BIT_SET(category_masks[index]))
770 # if ! defined(HAS_QUERYLOCALE) && ( defined(_NL_LOCALE_NAME) \
771 && defined(HAS_NL_LANGINFO_L))
772 # define querylocale_l(index, locale_obj) \
773 (QUERYLOCALE_ASSERT(index) \
774 mortalized_pv_copy(nl_langinfo_l( \
775 _NL_LOCALE_NAME(categories[index]), locale_obj)))
776 # else
777 # define querylocale_l(index, locale_obj) \
778 (QUERYLOCALE_ASSERT(index) \
779 mortalized_pv_copy(querylocale(category_masks[index], locale_obj)))
780 # endif
781 # endif
782 # if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
783 # define HAS_GLIBC_LC_MESSAGES_BUG
784 # include <libintl.h>
785 # endif
786
787 /* A fourth array, parallel to the ones above to map from category to its
788 * equivalent mask */
789 STATIC const int category_masks[] = {
790 # ifdef USE_LOCALE_CTYPE
791 LC_CTYPE_MASK,
792 # endif
793 # ifdef USE_LOCALE_NUMERIC
794 LC_NUMERIC_MASK,
795 # endif
796 # ifdef USE_LOCALE_COLLATE
797 LC_COLLATE_MASK,
798 # endif
799 # ifdef USE_LOCALE_TIME
800 LC_TIME_MASK,
801 # endif
802 # ifdef USE_LOCALE_MESSAGES
803 LC_MESSAGES_MASK,
804 # endif
805 # ifdef USE_LOCALE_MONETARY
806 LC_MONETARY_MASK,
807 # endif
808 # ifdef USE_LOCALE_ADDRESS
809 LC_ADDRESS_MASK,
810 # endif
811 # ifdef USE_LOCALE_IDENTIFICATION
812 LC_IDENTIFICATION_MASK,
813 # endif
814 # ifdef USE_LOCALE_MEASUREMENT
815 LC_MEASUREMENT_MASK,
816 # endif
817 # ifdef USE_LOCALE_PAPER
818 LC_PAPER_MASK,
819 # endif
820 # ifdef USE_LOCALE_TELEPHONE
821 LC_TELEPHONE_MASK,
822 # endif
823 # ifdef USE_LOCALE_NAME
824 LC_NAME_MASK,
825 # endif
826 # ifdef USE_LOCALE_SYNTAX
827 LC_SYNTAX_MASK,
828 # endif
829 # ifdef USE_LOCALE_TOD
830 LC_TOD_MASK,
831 # endif
832 /* LC_ALL can't be turned off by a Configure
833 * option, and in Posix 2008, should always be
834 * here, so compile it in unconditionally.
835 * This could catch some glitches at compile
836 * time */
837 LC_ALL_MASK,
838
839 /* Placeholder as a precaution if code fails to check the return of
840 * get_category_index(), which returns this element to indicate an error */
841 0
842 };
843
844 # define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_)
845
846 STATIC const char *
S_my_querylocale_i(pTHX_ const unsigned int index)847 S_my_querylocale_i(pTHX_ const unsigned int index)
848 {
849 /* This function returns the name of the locale category given by the input
850 * index into our parallel tables of them.
851 *
852 * POSIX 2008, for some sick reason, chose not to provide a method to find
853 * the category name of a locale, discarding a basic linguistic tenet that
854 * for any object, people will create a name for it. Some vendors have
855 * created a querylocale() function to do just that. This function is a
856 * lot simpler to implement on systems that have this. Otherwise, we have
857 * to keep track of what the locale has been set to, so that we can return
858 * its name so as to emulate setlocale(). It's also possible for C code in
859 * some library to change the locale without us knowing it, though as of
860 * September 2017, there are no occurrences in CPAN of uselocale(). Some
861 * libraries do use setlocale(), but that changes the global locale, and
862 * threads using per-thread locales will just ignore those changes. */
863
864 int category;
865 const locale_t cur_obj = uselocale((locale_t) 0);
866 const char * retval;
867
868 PERL_ARGS_ASSERT_MY_QUERYLOCALE_I;
869 assert(index <= NOMINAL_LC_ALL_INDEX);
870
871 category = categories[index];
872
873 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_querylocale_i(%s) on %p\n",
874 category_names[index], cur_obj));
875 if (cur_obj == LC_GLOBAL_LOCALE) {
876 POSIX_SETLOCALE_LOCK;
877 retval = posix_setlocale(category, NULL);
878 POSIX_SETLOCALE_UNLOCK;
879 }
880 else {
881
882 # ifdef USE_QUERYLOCALE
883
884 /* We don't currently keep records when there is querylocale(), so have
885 * to get it anew each time */
886 retval = (index == LC_ALL_INDEX_)
887 ? calculate_LC_ALL(cur_obj)
888 : querylocale_l(index, cur_obj);
889
890 # else
891
892 /* But we do have up-to-date values when we keep our own records
893 * (except some times in initialization, where we get the value from
894 * the system. */
895 const char ** which = (index == LC_ALL_INDEX_)
896 ? &PL_cur_LC_ALL
897 : &PL_curlocales[index];
898 if (*which == NULL) {
899 retval = stdized_setlocale(category, NULL);
900 *which = savepv(retval);
901 }
902 else {
903 retval = *which;
904 }
905
906 # endif
907
908 }
909
910 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
911 "my_querylocale_i(%s) returning '%s'\n",
912 category_names[index], retval));
913 assert(strNE(retval, ""));
914 return retval;
915 }
916
917 # ifdef USE_PL_CURLOCALES
918
919 STATIC const char *
S_update_PL_curlocales_i(pTHX_ const unsigned int index,const char * new_locale,recalc_lc_all_t recalc_LC_ALL)920 S_update_PL_curlocales_i(pTHX_
921 const unsigned int index,
922 const char * new_locale,
923 recalc_lc_all_t recalc_LC_ALL)
924 {
925 /* This is a helper function for emulate_setlocale_i(), mostly used to
926 * make that function easier to read. */
927
928 PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
929 assert(index <= NOMINAL_LC_ALL_INDEX);
930
931 if (index == LC_ALL_INDEX_) {
932 unsigned int i;
933
934 /* For LC_ALL, we change all individual categories to correspond */
935 /* PL_curlocales is a parallel array, so has same
936 * length as 'categories' */
937 for (i = 0; i < LC_ALL_INDEX_; i++) {
938 Safefree(PL_curlocales[i]);
939 PL_curlocales[i] = savepv(new_locale);
940 }
941
942 Safefree(PL_cur_LC_ALL);
943 PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales));
944 return PL_cur_LC_ALL;
945 }
946
947 /* Update the single category's record */
948 Safefree(PL_curlocales[index]);
949 PL_curlocales[index] = savepv(new_locale);
950
951 /* And also LC_ALL if the input says to, including if this is the final
952 * iteration of a loop updating all sub-categories */
953 if ( recalc_LC_ALL == YES_RECALC_LC_ALL
954 || ( recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
955 && index == NOMINAL_LC_ALL_INDEX - 1))
956 {
957 Safefree(PL_cur_LC_ALL);
958 PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales));
959 }
960
961 return PL_curlocales[index];
962 }
963
964 # endif /* Need PL_curlocales[] */
965
966 STATIC const char *
S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale,const line_t line)967 S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale, const line_t line)
968 {
969 /* This function parses the value of the LC_ALL locale, assuming glibc
970 * syntax, and sets each individual category on the system to the proper
971 * value.
972 *
973 * This is likely to only ever be called from one place, so exists to make
974 * the calling function easier to read by moving this ancillary code out of
975 * the main line.
976 *
977 * The locale for each category is independent of the other categories.
978 * Often, they are all the same, but certainly not always. Perl, in fact,
979 * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
980 * locale. LC_ALL has to be able to represent the case of when there are
981 * varying locales. Platforms have differing ways of representing this.
982 * Because of this, the code in this file goes to lengths to avoid the
983 * issue, generally looping over the component categories instead of
984 * referring to them in the aggregate, wherever possible. However, there
985 * are cases where we have to parse our own constructed aggregates, which use
986 * the glibc syntax. */
987
988 const char * locale_on_entry = querylocale_c(LC_ALL);
989
990 PERL_ARGS_ASSERT_SETLOCALE_FROM_AGGREGATE_LC_ALL;
991
992 /* If the string that gives what to set doesn't include all categories,
993 * the omitted ones get set to "C". To get this behavior, first set
994 * all the individual categories to "C", and override the furnished
995 * ones below. FALSE => No need to recalculate LC_ALL, as this is a
996 * temporary state */
997 if (! emulate_setlocale_c(LC_ALL, "C", DONT_RECALC_LC_ALL, line)) {
998 setlocale_failure_panic_c(LC_ALL, locale_on_entry,
999 "C", __LINE__, line);
1000 NOT_REACHED; /* NOTREACHED */
1001 }
1002
1003 const char * s = locale;
1004 const char * e = locale + strlen(locale);
1005 while (s < e) {
1006 const char * p = s;
1007
1008 /* Parse through the category */
1009 while (isWORDCHAR(*p)) {
1010 p++;
1011 }
1012
1013 const char * category_end = p;
1014
1015 if (*p++ != '=') {
1016 locale_panic_(Perl_form(aTHX_
1017 "Unexpected character in locale category name '%s"
1018 "<-- HERE",
1019 get_displayable_string(s, p - 1, 0)));
1020 }
1021
1022 /* Parse through the locale name */
1023 const char * name_start = p;
1024 while (p < e && *p != ';') {
1025 p++;
1026 }
1027 if (UNLIKELY( p < e && *p != ';')) {
1028 locale_panic_(Perl_form(aTHX_
1029 "Unexpected character in locale name '%s<-- HERE",
1030 get_displayable_string(s, p, 0)));
1031 }
1032
1033 const char * name_end = p;
1034
1035 /* Space past the semi-colon */
1036 if (p < e) {
1037 p++;
1038 }
1039
1040 /* Find the index of the category name in our lists */
1041 for (PERL_UINT_FAST8_T i = 0; i < LC_ALL_INDEX_; i++) {
1042
1043 /* Keep going if this index doesn't point to the category being
1044 * parsed. The strnNE() avoids a Perl_form(), but would fail if
1045 * ever a category name could be a substring of another one, e.g.,
1046 * if there were a "LC_TIME_DATE" */
1047 if strnNE(s, category_names[i], category_end - s) {
1048 continue;
1049 }
1050
1051 /* Here i points to the category being parsed. Now isolate the
1052 * locale it is being changed to */
1053 const char * individ_locale = Perl_form(aTHX_ "%.*s",
1054 (int) (name_end - name_start), name_start);
1055
1056 /* And do the change. Don't recalculate LC_ALL; we'll do it
1057 * ourselves after the loop */
1058 if (! emulate_setlocale_i(i, individ_locale,
1059 DONT_RECALC_LC_ALL, line))
1060 {
1061
1062 /* But if we have to back out, do fix up LC_ALL */
1063 if (! emulate_setlocale_c(LC_ALL, locale_on_entry,
1064 YES_RECALC_LC_ALL, line))
1065 {
1066 setlocale_failure_panic_i(i, individ_locale,
1067 locale, __LINE__, line);
1068 NOT_REACHED; /* NOTREACHED */
1069 }
1070
1071 /* Reverting to the entry value succeeded, but the operation
1072 * failed to go to the requested locale. */
1073 return NULL;
1074 }
1075
1076 /* Found and handled the desired category. Quit the inner loop to
1077 * try the next category */
1078 break;
1079 }
1080
1081 /* Finished with this category; iterate to the next one in the input */
1082 s = p;
1083 }
1084
1085 # ifdef USE_PL_CURLOCALES
1086
1087 /* Here we have set all the individual categories. Update the LC_ALL entry
1088 * as well. We can't just use the input 'locale' as the value may omit
1089 * categories whose locale is 'C'. khw thinks it's better to store a
1090 * complete LC_ALL. So calculate it. */
1091 const char * retval = savepv(calculate_LC_ALL(PL_curlocales));
1092 Safefree(PL_cur_LC_ALL);
1093 PL_cur_LC_ALL = retval;
1094
1095 # else
1096
1097 const char * retval = querylocale_c(LC_ALL);
1098
1099 # endif
1100
1101 return retval;
1102 }
1103
1104 STATIC const char *
S_emulate_setlocale_i(pTHX_ const unsigned int index,const char * new_locale,const recalc_lc_all_t recalc_LC_ALL,const line_t line)1105 S_emulate_setlocale_i(pTHX_
1106
1107 /* Our internal index of the 'category' setlocale is called with */
1108 const unsigned int index,
1109
1110 const char * new_locale, /* The locale to set the category to */
1111 const recalc_lc_all_t recalc_LC_ALL, /* Explained below */
1112 const line_t line /* Called from this line number */
1113 )
1114 {
1115 PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I;
1116 assert(index <= NOMINAL_LC_ALL_INDEX);
1117
1118 /* Otherwise could have undefined behavior, as the return of this function
1119 * may be copied to this buffer, which this function could change in the
1120 * middle of its work */
1121 assert(new_locale != PL_stdize_locale_buf);
1122
1123 /* This function effectively performs a setlocale() on just the current
1124 * thread; thus it is thread-safe. It does this by using the POSIX 2008
1125 * locale functions to emulate the behavior of setlocale(). Similar to
1126 * regular setlocale(), the return from this function points to memory that
1127 * can be overwritten by other system calls, so needs to be copied
1128 * immediately if you need to retain it. The difference here is that
1129 * system calls besides another setlocale() can overwrite it.
1130 *
1131 * By doing this, most locale-sensitive functions become thread-safe. The
1132 * exceptions are mostly those that return a pointer to static memory.
1133 *
1134 * This function may be called in a tight loop that iterates over all
1135 * categories. Because LC_ALL is not a "real" category, but merely the sum
1136 * of all the other ones, such loops don't include LC_ALL. On systems that
1137 * have querylocale() or similar, the current LC_ALL value is immediately
1138 * retrievable; on systems lacking that feature, we have to keep track of
1139 * LC_ALL ourselves. We could do that on each iteration, only to throw it
1140 * away on the next, but the calculation is more than a trivial amount of
1141 * work. Instead, the 'recalc_LC_ALL' parameter is set to
1142 * RECALCULATE_LC_ALL_ON_FINAL_INTERATION to only do the calculation once.
1143 * This function calls itself recursively in such a loop.
1144 *
1145 * When not in such a loop, the parameter is set to the other enum values
1146 * DONT_RECALC_LC_ALL or YES_RECALC_LC_ALL. */
1147
1148 int mask = category_masks[index];
1149 const locale_t entry_obj = uselocale((locale_t) 0);
1150 const char * locale_on_entry = querylocale_i(index);
1151
1152 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1153 "emulate_setlocale_i input=%d (%s), mask=0x%x,"
1154 " new locale=\"%s\", current locale=\"%s\","
1155 "index=%d, object=%p\n",
1156 categories[index], category_names[index], mask,
1157 ((new_locale == NULL) ? "(nil)" : new_locale),
1158 locale_on_entry, index, entry_obj));
1159
1160 /* Return the already-calculated info if just querying what the existing
1161 * locale is */
1162 if (new_locale == NULL) {
1163 return locale_on_entry;
1164 }
1165
1166 /* Here, trying to change the locale, but it is a no-op if the new boss is
1167 * the same as the old boss. Except this routine is called when converting
1168 * from the global locale, so in that case we will create a per-thread
1169 * locale below (with the current values). It also seemed that newlocale()
1170 * could free up the basis locale memory if we called it with the new and
1171 * old being the same, but khw now thinks that this was due to some other
1172 * bug, since fixed, as there are other places where newlocale() gets
1173 * similarly called without problems. */
1174 if ( entry_obj != LC_GLOBAL_LOCALE
1175 && locale_on_entry
1176 && strEQ(new_locale, locale_on_entry))
1177 {
1178 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1179 "(%" LINE_Tf "): emulate_setlocale_i"
1180 " no-op to change to what it already was\n",
1181 line));
1182
1183 # ifdef USE_PL_CURLOCALES
1184
1185 /* On the final iteration of a loop that needs to recalculate LC_ALL, do
1186 * so. If no iteration changed anything, LC_ALL also doesn't change,
1187 * but khw believes the complexity needed to keep track of that isn't
1188 * worth it. */
1189 if (UNLIKELY( recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
1190 && index == NOMINAL_LC_ALL_INDEX - 1))
1191 {
1192 Safefree(PL_cur_LC_ALL);
1193 PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales));
1194 }
1195
1196 # endif
1197
1198 return locale_on_entry;
1199 }
1200
1201 # ifndef USE_QUERYLOCALE
1202
1203 /* Without a querylocale() mechanism, we have to figure out ourselves what
1204 * happens with setting a locale to "" */
1205 if (strEQ(new_locale, "")) {
1206 new_locale = find_locale_from_environment(index);
1207 }
1208
1209 # endif
1210
1211 /* So far, it has worked that a semi-colon in the locale name means that
1212 * the category is LC_ALL and it subsumes categories which don't all have
1213 * the same locale. This is the glibc syntax. */
1214 if (strchr(new_locale, ';')) {
1215 assert(index == LC_ALL_INDEX_);
1216 return setlocale_from_aggregate_LC_ALL(new_locale, line);
1217 }
1218
1219 # ifdef HAS_GLIBC_LC_MESSAGES_BUG
1220
1221 /* For this bug, if the LC_MESSAGES locale changes, we have to do an
1222 * expensive workaround. Save the current value so we can later determine
1223 * if it changed. */
1224 const char * old_messages_locale = NULL;
1225 if ( (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
1226 && LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
1227 {
1228 old_messages_locale = querylocale_c(LC_MESSAGES);
1229 }
1230
1231 # endif
1232
1233 assert(PL_C_locale_obj);
1234
1235 /* Now ready to switch to the input 'new_locale' */
1236
1237 /* Switching locales generally entails freeing the current one's space (at
1238 * the C library's discretion), hence we can't be using that locale at the
1239 * time of the switch (this wasn't obvious to khw from the man pages). So
1240 * switch to a known locale object that we don't otherwise mess with. */
1241 if (! uselocale(PL_C_locale_obj)) {
1242
1243 /* Not being able to change to the C locale is severe; don't keep
1244 * going. */
1245 setlocale_failure_panic_i(index, locale_on_entry, "C", __LINE__, line);
1246 NOT_REACHED; /* NOTREACHED */
1247 }
1248
1249 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1250 "(%" LINE_Tf "): emulate_setlocale_i now using C"
1251 " object=%p\n", line, PL_C_locale_obj));
1252
1253 locale_t new_obj;
1254
1255 /* We created a (never changing) object at start-up for LC_ALL being in the
1256 * C locale. If this call is to switch to LC_ALL=>C, simply use that
1257 * object. But in fact, we already have switched to it just above, in
1258 * preparation for the general case. Since we're already there, no need to
1259 * do further switching. */
1260 if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
1261 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "):"
1262 " emulate_setlocale_i will stay"
1263 " in C object\n", line));
1264 new_obj = PL_C_locale_obj;
1265
1266 /* And free the old object if it isn't a special one */
1267 if (entry_obj != LC_GLOBAL_LOCALE && entry_obj != PL_C_locale_obj) {
1268 freelocale(entry_obj);
1269 }
1270 }
1271 else { /* Here is the general case, not to LC_ALL=>C */
1272 locale_t basis_obj = entry_obj;
1273
1274 /* Specially handle two objects */
1275 if (basis_obj == LC_GLOBAL_LOCALE || basis_obj == PL_C_locale_obj) {
1276
1277 /* For these two objects, we make duplicates to hand to newlocale()
1278 * below. For LC_GLOBAL_LOCALE, this is because newlocale()
1279 * doesn't necessarily accept it as input (the results are
1280 * undefined). For PL_C_locale_obj, it is so that it never gets
1281 * modified, as otherwise newlocale() is free to do so */
1282 basis_obj = duplocale(basis_obj);
1283 if (! basis_obj) {
1284 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): duplocale failed",
1285 line));
1286 NOT_REACHED; /* NOTREACHED */
1287 }
1288
1289 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1290 "(%" LINE_Tf "): emulate_setlocale_i"
1291 " created %p by duping the input\n",
1292 line, basis_obj));
1293 }
1294
1295 /* Ready to create a new locale by modification of the existing one.
1296 *
1297 * NOTE: This code may incorrectly show up as a leak under the address
1298 * sanitizer. We do not free this object under normal teardown, however
1299 * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
1300 */
1301 new_obj = newlocale(mask, new_locale, basis_obj);
1302
1303 if (! new_obj) {
1304 DEBUG_L(PerlIO_printf(Perl_debug_log,
1305 " (%" LINE_Tf "): emulate_setlocale_i"
1306 " creating new object from %p failed:"
1307 " errno=%d\n",
1308 line, basis_obj, GET_ERRNO));
1309
1310 /* Failed. Likely this is because the proposed new locale isn't
1311 * valid on this system. But we earlier switched to the LC_ALL=>C
1312 * locale in anticipation of it succeeding, Now have to switch
1313 * back to the state upon entry */
1314 if (! uselocale(entry_obj)) {
1315 setlocale_failure_panic_i(index, "switching back to",
1316 locale_on_entry, __LINE__, line);
1317 NOT_REACHED; /* NOTREACHED */
1318 }
1319
1320 # ifdef USE_PL_CURLOCALES
1321
1322 if (entry_obj == LC_GLOBAL_LOCALE) {
1323
1324 /* Here, we are back in the global locale. We may never have
1325 * set PL_curlocales. If the locale change had succeeded, the
1326 * code would have then set them up, but since it didn't, do so
1327 * here. khw isn't sure if this prevents some issues or not,
1328 * This will calculate LC_ALL's entry only on the final
1329 * iteration */
1330 POSIX_SETLOCALE_LOCK;
1331 for (PERL_UINT_FAST8_T i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1332 update_PL_curlocales_i(i,
1333 posix_setlocale(categories[i], NULL),
1334 RECALCULATE_LC_ALL_ON_FINAL_INTERATION);
1335 }
1336 POSIX_SETLOCALE_UNLOCK;
1337 }
1338 # endif
1339
1340 return NULL;
1341 }
1342
1343 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1344 "(%" LINE_Tf "): emulate_setlocale_i created %p"
1345 " while freeing %p\n", line, new_obj, basis_obj));
1346
1347 /* Here, successfully created an object representing the desired
1348 * locale; now switch into it */
1349 if (! uselocale(new_obj)) {
1350 freelocale(new_obj);
1351 locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): emulate_setlocale_i"
1352 " switching into new locale failed",
1353 line));
1354 }
1355 }
1356
1357 /* Here, we are using 'new_obj' which matches the input 'new_locale'. */
1358 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1359 "(%" LINE_Tf "): emulate_setlocale_i now using %p\n",
1360 line, new_obj));
1361
1362 #ifdef MULTIPLICITY
1363 PL_cur_locale_obj = new_obj;
1364 #endif
1365
1366 /* We are done, except for updating our records (if the system doesn't keep
1367 * them) and in the case of locale "", we don't actually know what the
1368 * locale that got switched to is, as it came from the environment. So
1369 * have to find it */
1370
1371 # ifdef USE_QUERYLOCALE
1372
1373 if (strEQ(new_locale, "")) {
1374 new_locale = querylocale_i(index);
1375 }
1376
1377 PERL_UNUSED_ARG(recalc_LC_ALL);
1378
1379 # else
1380
1381 new_locale = update_PL_curlocales_i(index, new_locale, recalc_LC_ALL);
1382
1383 # endif
1384 # ifdef HAS_GLIBC_LC_MESSAGES_BUG
1385
1386 /* Invalidate the glibc cache of loaded translations if the locale has
1387 * changed, see [perl #134264] */
1388 if (old_messages_locale) {
1389 if (strNE(old_messages_locale, my_querylocale_c(LC_MESSAGES))) {
1390 textdomain(textdomain(NULL));
1391 }
1392 }
1393
1394 # endif
1395
1396 return new_locale;
1397 }
1398
1399 #endif /* End of the various implementations of the setlocale and
1400 querylocale macros used in the remainder of this program */
1401
1402 #ifdef USE_LOCALE
1403
1404 /* So far, the locale strings returned by modern 2008-compliant systems have
1405 * been fine */
1406
1407 STATIC const char *
S_stdize_locale(pTHX_ const int category,const char * input_locale,const char ** buf,Size_t * buf_size,const line_t caller_line)1408 S_stdize_locale(pTHX_ const int category,
1409 const char *input_locale,
1410 const char **buf,
1411 Size_t *buf_size,
1412 const line_t caller_line)
1413 {
1414 /* The return value of setlocale() is opaque, but is required to be usable
1415 * as input to a future setlocale() to create the same state.
1416 * Unfortunately not all systems are compliant. But most often they are of
1417 * a very restricted set of forms that this file has been coded to expect.
1418 *
1419 * There are some outliers, though, that this function tries to tame:
1420 *
1421 * 1) A new-line. This function chomps any \n characters
1422 * 2) foo=bar. 'bar' is what is generally meant, and the foo= part is
1423 * stripped. This form is legal for LC_ALL. When found in
1424 * that category group, the function calls itself
1425 * recursively on each possible component category to make
1426 * sure the individual categories are ok.
1427 *
1428 * If no changes to the input were made, it is returned; otherwise the
1429 * changed version is stored into memory at *buf, with *buf_size set to its
1430 * new value, and *buf is returned.
1431 */
1432
1433 const char * first_bad;
1434 const char * retval;
1435
1436 PERL_ARGS_ASSERT_STDIZE_LOCALE;
1437
1438 if (input_locale == NULL) {
1439 return NULL;
1440 }
1441
1442 first_bad = strpbrk(input_locale, "=\n");
1443
1444 /* Most likely, there isn't a problem with the input */
1445 if (LIKELY(! first_bad)) {
1446 return input_locale;
1447 }
1448
1449 # ifdef LC_ALL
1450
1451 /* But if there is, and the category is LC_ALL, we have to look at each
1452 * component category */
1453 if (category == LC_ALL) {
1454 const char * individ_locales[LC_ALL_INDEX_];
1455 bool made_changes = FALSE;
1456 unsigned int i;
1457
1458 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1459 Size_t this_size = 0;
1460 individ_locales[i] = stdize_locale(categories[i],
1461 posix_setlocale(categories[i],
1462 NULL),
1463 &individ_locales[i],
1464 &this_size,
1465 caller_line);
1466
1467 /* If the size didn't change, it means this category did not have
1468 * to be adjusted, and individ_locales[i] points to the buffer
1469 * returned by posix_setlocale(); we have to copy that before
1470 * it's called again in the next iteration */
1471 if (this_size == 0) {
1472 individ_locales[i] = savepv(individ_locales[i]);
1473 }
1474 else {
1475 made_changes = TRUE;
1476 }
1477 }
1478
1479 /* If all the individual categories were ok as-is, this was a false
1480 * alarm. We must have seen an '=' which was a legal occurrence in
1481 * this combination locale */
1482 if (! made_changes) {
1483 retval = input_locale; /* The input can be returned unchanged */
1484 }
1485 else {
1486 retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size);
1487 }
1488
1489 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1490 Safefree(individ_locales[i]);
1491 }
1492
1493 return retval;
1494 }
1495
1496 # else /* else no LC_ALL */
1497
1498 PERL_UNUSED_ARG(category);
1499 PERL_UNUSED_ARG(caller_line);
1500
1501 # endif
1502
1503 /* Here, there was a problem in an individual category. This means that at
1504 * least one adjustment will be necessary. Create a modifiable copy */
1505 retval = save_to_buffer(input_locale, buf, buf_size);
1506
1507 if (*first_bad != '=') {
1508
1509 /* Translate the found position into terms of the copy */
1510 first_bad = retval + (first_bad - input_locale);
1511 }
1512 else { /* An '=' */
1513
1514 /* It is unlikely that the return is so screwed-up that it contains
1515 * multiple equals signs, but handle that case by stripping all of
1516 * them. */
1517 const char * final_equals = strrchr(retval, '=');
1518
1519 /* The length passed here causes the move to include the terminating
1520 * NUL */
1521 Move(final_equals + 1, retval, strlen(final_equals), char);
1522
1523 /* See if there are additional problems; if not, we're good to return.
1524 * */
1525 first_bad = strpbrk(retval, "\n");
1526
1527 if (! first_bad) {
1528 return retval;
1529 }
1530 }
1531
1532 /* Here, the problem must be a \n. Get rid of it and what follows.
1533 * (Originally, only a trailing \n was stripped. Unsure what to do if not
1534 * trailing) */
1535 *((char *) first_bad) = '\0';
1536 return retval;
1537 }
1538
1539 #if defined(WIN32) || ( defined(USE_POSIX_2008_LOCALE) \
1540 && ! defined(USE_QUERYLOCALE))
1541
1542 STATIC const char *
S_find_locale_from_environment(pTHX_ const unsigned int index)1543 S_find_locale_from_environment(pTHX_ const unsigned int index)
1544 {
1545 /* NB: This function may actually change the locale on Windows.
1546 *
1547 * On Windows systems, the concept of the POSIX ordering of environment
1548 * variables is missing. To increase portability of programs across
1549 * platforms, the POSIX ordering is emulated on Windows.
1550 *
1551 * And on POSIX 2008 systems without querylocale(), it is problematic
1552 * getting the results of the POSIX 2008 equivalent of
1553 * setlocale(category, "")
1554 * (which gets the locale from the environment).
1555 *
1556 * To ensure that we know exactly what those values are, we do the setting
1557 * ourselves, using the documented algorithm (assuming the documentation is
1558 * correct) rather than use "" as the locale. This will lead to results
1559 * that differ from native behavior if the native behavior differs from the
1560 * standard documented value, but khw believes it is better to know what's
1561 * going on, even if different from native, than to just guess.
1562 *
1563 * Another option for the POSIX 2008 case would be, in a critical section,
1564 * to save the global locale's current value, and do a straight
1565 * setlocale(LC_ALL, ""). That would return our desired values, destroying
1566 * the global locale's, which we would then restore. But that could cause
1567 * races with any other thread that is using the global locale and isn't
1568 * using the mutex. And, the only reason someone would have done that is
1569 * because they are calling a library function, like in gtk, that calls
1570 * setlocale(), and which can't be changed to use the mutex. That wouldn't
1571 * be a problem if this were to be done before any threads had switched,
1572 * say during perl construction time. But this code would still be needed
1573 * for the general case.
1574 *
1575 * The Windows and POSIX 2008 differ in that the ultimate fallback is "C"
1576 * in POSIX, and is the system default locale in Windows. To get that
1577 * system default value, we actually have to call setlocale() on Windows.
1578 */
1579
1580 /* We rely on PerlEnv_getenv() returning a mortalized copy */
1581 const char * const lc_all = PerlEnv_getenv("LC_ALL");
1582
1583 /* Use any "LC_ALL" environment variable, as it overrides everything
1584 * else. */
1585 if (lc_all && strNE(lc_all, "")) {
1586 return lc_all;
1587 }
1588
1589 /* If setting an individual category, use its corresponding value found in
1590 * the environment, if any */
1591 if (index != LC_ALL_INDEX_) {
1592 const char * const new_value = PerlEnv_getenv(category_names[index]);
1593
1594 if (new_value && strNE(new_value, "")) {
1595 return new_value;
1596 }
1597
1598 /* If no corresponding environment variable, see if LANG exists. If
1599 * so, use it. */
1600 const char * default_name = PerlEnv_getenv("LANG");
1601 if (default_name && strNE(default_name, "")) {
1602 return default_name;
1603 }
1604
1605 /* If no LANG, use "C" on POSIX 2008, the system default on Windows */
1606 # ifndef WIN32
1607 return "C";
1608 # else
1609 return wrap_wsetlocale(categories[index], "");
1610 # endif
1611
1612 }
1613
1614 /* Here is LC_ALL, and no LC_ALL environment variable. LANG is used as a
1615 * default, but overridden for individual categories that have
1616 * corresponding environment variables. If no LANG exists, the default is
1617 * "C" on POSIX 2008, or the system default for the category on Windows. */
1618 const char * default_name = PerlEnv_getenv("LANG");
1619
1620 /* Convert "" to NULL to save conditionals in the loop below */
1621 if (default_name != NULL && strEQ(default_name, "")) {
1622 default_name = NULL;
1623 }
1624
1625 /* Loop through all the individual categories, setting each to any
1626 * corresponding environment variable; or to the default if none exists for
1627 * the category */
1628 const char * locale_names[LC_ALL_INDEX_];
1629 for (unsigned i = 0; i < LC_ALL_INDEX_; i++) {
1630 const char * const env_override = PerlEnv_getenv(category_names[i]);
1631
1632 if (env_override && strNE(env_override, "")) {
1633 locale_names[i] = env_override;
1634 }
1635 else if (default_name) {
1636 locale_names[i] = default_name;
1637 }
1638 else {
1639
1640 # ifndef WIN32
1641 locale_names[i] = "C";
1642 # else
1643 locale_names[i] = wrap_wsetlocale(categories[index], "");
1644 # endif
1645
1646 }
1647
1648 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1649 "find_locale_from_environment i=%d, name=%s, locale=%s\n",
1650 i, category_names[i], locale_names[i]));
1651 }
1652
1653 return calculate_LC_ALL(locale_names);
1654 }
1655
1656 #endif
1657 #if defined(WIN32) || defined(USE_POSIX_2008_LOCALE) || ! defined(LC_ALL)
1658
1659 STATIC
1660 const char *
1661
1662 # ifdef USE_QUERYLOCALE
S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)1663 S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)
1664 # else
1665 S_calculate_LC_ALL(pTHX_ const char ** individ_locales)
1666 # endif
1667
1668 {
1669 /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed.
1670 * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
1671 * So we have to construct the answer ourselves based on the passed in
1672 * data, which is either a locale_t object, for systems with querylocale(),
1673 * or an array we keep updated to the proper values, otherwise.
1674 *
1675 * For Windows, we also may need to construct an LC_ALL when setting the
1676 * locale to the system default.
1677 *
1678 * This function returns a mortalized string containing the locale name(s)
1679 * of LC_ALL.
1680 *
1681 * If all individual categories are the same locale, we can just set LC_ALL
1682 * to that locale. But if not, we have to create an aggregation of all the
1683 * categories on the system. Platforms differ as to the syntax they use
1684 * for these non-uniform locales for LC_ALL. Some use a '/' or other
1685 * delimiter of the locales with a predetermined order of categories; a
1686 * Configure probe would be needed to tell us how to decipher those. glibc
1687 * and Windows use a series of name=value pairs, like
1688 * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
1689 * This function returns that syntax, which is suitable for input to the
1690 * Windows setlocale(). It could also be suitable for glibc, but because
1691 * the non-Windows code is common to systems that use a different syntax,
1692 * we don't depend on it for glibc. Instead we take care not to use the
1693 * native setlocale() function on whatever non-Windows style is chosen.
1694 * But, it would be possible for someone to call Perl_setlocale() using a
1695 * native style we don't understand. So far no one has complained.
1696 *
1697 * For systems that have categories we don't know about, the algorithm
1698 * below won't know about those missing categories, leading to potential
1699 * bugs for code that looks at them. If there is an environment variable
1700 * that sets that category, we won't know to look for it, and so our use of
1701 * LANG or "C" improperly overrides it. On the other hand, if we don't do
1702 * what is done here, and there is no environment variable, the category's
1703 * locale should be set to LANG or "C". So there is no good solution. khw
1704 * thinks the best is to make sure we have a complete list of possible
1705 * categories, adding new ones as they show up on obscure platforms.
1706 */
1707
1708 unsigned int i;
1709 Size_t names_len = 0;
1710 bool are_all_categories_the_same_locale = TRUE;
1711 char * aggregate_locale;
1712 char * previous_start = NULL;
1713 char * this_start = NULL;
1714 Size_t entry_len = 0;
1715
1716 PERL_ARGS_ASSERT_CALCULATE_LC_ALL;
1717
1718 /* First calculate the needed size for the string listing the categories
1719 * and their locales. */
1720 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1721
1722 # ifdef USE_QUERYLOCALE
1723 const char * entry = querylocale_l(i, cur_obj);
1724 # else
1725 const char * entry = individ_locales[i];
1726 # endif
1727
1728 names_len += strlen(category_names[i])
1729 + 1 /* '=' */
1730 + strlen(entry)
1731 + 1; /* ';' */
1732 }
1733
1734 names_len++; /* Trailing '\0' */
1735
1736 /* Allocate enough space for the aggregated string */
1737 Newxz(aggregate_locale, names_len, char);
1738 SAVEFREEPV(aggregate_locale);
1739
1740 /* Then fill it in */
1741 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1742 Size_t new_len;
1743
1744 # ifdef USE_QUERYLOCALE
1745 const char * entry = querylocale_l(i, cur_obj);
1746 # else
1747 const char * entry = individ_locales[i];
1748 # endif
1749
1750 new_len = my_strlcat(aggregate_locale, category_names[i], names_len);
1751 assert(new_len <= names_len);
1752 new_len = my_strlcat(aggregate_locale, "=", names_len);
1753 assert(new_len <= names_len);
1754
1755 this_start = aggregate_locale + strlen(aggregate_locale);
1756 entry_len = strlen(entry);
1757
1758 new_len = my_strlcat(aggregate_locale, entry, names_len);
1759 assert(new_len <= names_len);
1760 new_len = my_strlcat(aggregate_locale, ";", names_len);
1761 assert(new_len <= names_len);
1762 PERL_UNUSED_VAR(new_len); /* Only used in DEBUGGING */
1763
1764 if ( i > 0
1765 && are_all_categories_the_same_locale
1766 && memNE(previous_start, this_start, entry_len + 1))
1767 {
1768 are_all_categories_the_same_locale = FALSE;
1769 }
1770 else {
1771 previous_start = this_start;
1772 }
1773 }
1774
1775 /* If they are all the same, just return any one of them */
1776 if (are_all_categories_the_same_locale) {
1777 aggregate_locale = this_start;
1778 aggregate_locale[entry_len] = '\0';
1779 }
1780
1781 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
1782 "calculate_LC_ALL returning '%s'\n",
1783 aggregate_locale));
1784
1785 return aggregate_locale;
1786 }
1787
1788 #endif
1789 #if defined(USE_LOCALE) && ( defined(DEBUGGING) \
1790 || defined(USE_PERL_SWITCH_LOCALE_CONTEXT))
1791
1792 STATIC const char *
S_get_LC_ALL_display(pTHX)1793 S_get_LC_ALL_display(pTHX)
1794 {
1795
1796 # ifdef LC_ALL
1797
1798 return querylocale_c(LC_ALL);
1799
1800 # else
1801
1802 const char * curlocales[NOMINAL_LC_ALL_INDEX];
1803
1804 for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
1805 curlocales[i] = querylocale_i(i);
1806 }
1807
1808 return calculate_LC_ALL(curlocales);
1809
1810 # endif
1811
1812 }
1813
1814 #endif
1815
1816 STATIC void
S_setlocale_failure_panic_i(pTHX_ const unsigned int cat_index,const char * current,const char * failed,const line_t caller_0_line,const line_t caller_1_line)1817 S_setlocale_failure_panic_i(pTHX_
1818 const unsigned int cat_index,
1819 const char * current,
1820 const char * failed,
1821 const line_t caller_0_line,
1822 const line_t caller_1_line)
1823 {
1824 dSAVE_ERRNO;
1825 const int cat = categories[cat_index];
1826 const char * name = category_names[cat_index];
1827
1828 PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I;
1829
1830 if (current == NULL) {
1831 current = querylocale_i(cat_index);
1832 }
1833
1834 Perl_locale_panic(Perl_form(aTHX_ "(%" LINE_Tf
1835 "): Can't change locale for %s(%d)"
1836 " from '%s' to '%s'",
1837 caller_1_line, name, cat,
1838 current, failed),
1839 __FILE__, caller_0_line, GET_ERRNO);
1840 NOT_REACHED; /* NOTREACHED */
1841 }
1842
1843 /* Any of these will allow us to find the RADIX */
1844 # if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_SOME_LANGINFO) \
1845 || defined(HAS_LOCALECONV) \
1846 || defined(HAS_SNPRINTF))
1847 # define CAN_CALCULATE_RADIX
1848 # endif
1849 # ifdef USE_LOCALE_NUMERIC
1850
1851 STATIC void
S_new_numeric(pTHX_ const char * newnum,bool force)1852 S_new_numeric(pTHX_ const char *newnum, bool force)
1853 {
1854 PERL_ARGS_ASSERT_NEW_NUMERIC;
1855
1856 /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
1857 * core Perl this and that 'newnum' is the name of the new locale, and we
1858 * are switched into it. It installs this locale as the current underlying
1859 * default, and then switches to the C locale, if necessary, so that the
1860 * code that has traditionally expected the radix character to be a dot may
1861 * continue to do so.
1862 *
1863 * The default locale and the C locale can be toggled between by use of the
1864 * set_numeric_underlying() and set_numeric_standard() functions, which
1865 * should probably not be called directly, but only via macros like
1866 * SET_NUMERIC_STANDARD() in perl.h.
1867 *
1868 * The toggling is necessary mainly so that a non-dot radix decimal point
1869 * character can be input and output, while allowing internal calculations
1870 * to use a dot.
1871 *
1872 * This sets several interpreter-level variables:
1873 * PL_numeric_name The underlying locale's name: a copy of 'newnum'
1874 * PL_numeric_underlying A boolean indicating if the toggled state is such
1875 * that the current locale is the program's underlying
1876 * locale
1877 * PL_numeric_standard An int indicating if the toggled state is such
1878 * that the current locale is the C locale or
1879 * indistinguishable from the C locale. If non-zero, it
1880 * is in C; if > 1, it means it may not be toggled away
1881 * from C.
1882 * PL_numeric_underlying_is_standard A bool kept by this function
1883 * indicating that the underlying locale and the standard
1884 * C locale are indistinguishable for the purposes of
1885 * LC_NUMERIC. This happens when both of the above two
1886 * variables are true at the same time. (Toggling is a
1887 * no-op under these circumstances.) This variable is
1888 * used to avoid having to recalculate.
1889 * PL_numeric_radix_sv Contains the string that code should use for the
1890 * decimal point. It is set to either a dot or the
1891 * program's underlying locale's radix character string,
1892 * depending on the situation.
1893 * PL_underlying_radix_sv Contains the program's underlying locale's radix
1894 * character string. This is copied into
1895 * PL_numeric_radix_sv when the situation warrants. It
1896 * exists to avoid having to recalculate it when toggling.
1897 * PL_underlying_numeric_obj = (only on POSIX 2008 platforms) An object
1898 * with everything set up properly so as to avoid work on
1899 * such platforms.
1900 */
1901
1902 DEBUG_L( PerlIO_printf(Perl_debug_log,
1903 "Called new_numeric with %s, PL_numeric_name=%s\n",
1904 newnum, PL_numeric_name));
1905
1906 /* If not forcing this procedure, and there isn't actually a change from
1907 * our records, do nothing. (Our records can be wrong when sync'ing to the
1908 * locale set up by an external library, hence the 'force' parameter) */
1909 if (! force && strEQ(PL_numeric_name, newnum)) {
1910 return;
1911 }
1912
1913 Safefree(PL_numeric_name);
1914 PL_numeric_name = savepv(newnum);
1915
1916 /* Handle the trivial case. Since this is called at process
1917 * initialization, be aware that this bit can't rely on much being
1918 * available. */
1919 if (isNAME_C_OR_POSIX(PL_numeric_name)) {
1920 PL_numeric_standard = TRUE;
1921 PL_numeric_underlying_is_standard = TRUE;
1922 PL_numeric_underlying = TRUE;
1923 sv_setpv(PL_numeric_radix_sv, C_decimal_point);
1924 sv_setpv(PL_underlying_radix_sv, C_decimal_point);
1925 return;
1926 }
1927
1928 /* We are in the underlying locale until changed at the end of this
1929 * function */
1930 PL_numeric_underlying = TRUE;
1931
1932 # ifdef USE_POSIX_2008_LOCALE
1933
1934 /* We keep a special object for easy switching to.
1935 *
1936 * NOTE: This code may incorrectly show up as a leak under the address
1937 * sanitizer. We do not free this object under normal teardown, however
1938 * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
1939 */
1940 PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
1941 PL_numeric_name,
1942 PL_underlying_numeric_obj);
1943
1944 # endif
1945
1946 const char * radix = NULL;
1947 utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
1948
1949 /* Find and save this locale's radix character. */
1950 my_langinfo_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
1951 &radix, NULL, &utf8ness);
1952 sv_setpv(PL_underlying_radix_sv, radix);
1953
1954 if (utf8ness == UTF8NESS_YES) {
1955 SvUTF8_on(PL_underlying_radix_sv);
1956 }
1957
1958 DEBUG_L(PerlIO_printf(Perl_debug_log,
1959 "Locale radix is '%s', ?UTF-8=%d\n",
1960 SvPVX(PL_underlying_radix_sv),
1961 cBOOL(SvUTF8(PL_underlying_radix_sv))));
1962
1963 /* This locale is indistinguishable from C (for numeric purposes) if both
1964 * the radix character and the thousands separator are the same as C's.
1965 * Start with the radix. */
1966 PL_numeric_underlying_is_standard = strEQ(C_decimal_point, radix);
1967 Safefree(radix);
1968
1969 # ifndef TS_W32_BROKEN_LOCALECONV
1970
1971 /* If the radix isn't the same as C's, we know it is distinguishable from
1972 * C; otherwise check the thousands separator too. Only if both are the
1973 * same as C's is the locale indistinguishable from C.
1974 *
1975 * But on earlier Windows versions, there is a potential race. This code
1976 * knows that localeconv() (elsewhere in this file) will be used to extract
1977 * the needed value, and localeconv() was buggy for quite a while, and that
1978 * code in this file hence uses a workaround. And that workaround may have
1979 * an (unlikely) race. Gathering the radix uses a different workaround on
1980 * Windows that doesn't involve a race. It might be possible to do the
1981 * same for this (patches welcome).
1982 *
1983 * Until then khw doesn't think it's worth even the small risk of a race to
1984 * get this value, which doesn't appear to be used in any of the Microsoft
1985 * library routines anyway. */
1986
1987 const char * scratch_buffer = NULL;
1988 if (PL_numeric_underlying_is_standard) {
1989 PL_numeric_underlying_is_standard = strEQ(C_thousands_sep,
1990 my_langinfo_c(THOUSEP, LC_NUMERIC,
1991 PL_numeric_name,
1992 &scratch_buffer,
1993 NULL, NULL));
1994 }
1995 Safefree(scratch_buffer);
1996
1997 # else
1998 PERL_UNUSED_VAR(C_thousands_sep);
1999 # endif
2000
2001 PL_numeric_standard = PL_numeric_underlying_is_standard;
2002
2003 /* Keep LC_NUMERIC so that it has the C locale radix and thousands
2004 * separator. This is for XS modules, so they don't have to worry about
2005 * the radix being a non-dot. (Core operations that need the underlying
2006 * locale change to it temporarily). */
2007 if (! PL_numeric_standard) {
2008 set_numeric_standard();
2009 }
2010
2011 }
2012
2013 # endif
2014
2015 void
Perl_set_numeric_standard(pTHX)2016 Perl_set_numeric_standard(pTHX)
2017 {
2018
2019 # ifdef USE_LOCALE_NUMERIC
2020
2021 /* Unconditionally toggle the LC_NUMERIC locale to the C locale
2022 *
2023 * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
2024 * instead of calling this directly. The macro avoids calling this routine
2025 * if toggling isn't necessary according to our records (which could be
2026 * wrong if some XS code has changed the locale behind our back) */
2027
2028 DEBUG_L(PerlIO_printf(Perl_debug_log,
2029 "Setting LC_NUMERIC locale to standard C\n"));
2030
2031 void_setlocale_c(LC_NUMERIC, "C");
2032 PL_numeric_standard = TRUE;
2033 sv_setpv(PL_numeric_radix_sv, C_decimal_point);
2034
2035 PL_numeric_underlying = PL_numeric_underlying_is_standard;
2036
2037 # endif /* USE_LOCALE_NUMERIC */
2038
2039 }
2040
2041 void
Perl_set_numeric_underlying(pTHX)2042 Perl_set_numeric_underlying(pTHX)
2043 {
2044
2045 # ifdef USE_LOCALE_NUMERIC
2046
2047 /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
2048 * default.
2049 *
2050 * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
2051 * instead of calling this directly. The macro avoids calling this routine
2052 * if toggling isn't necessary according to our records (which could be
2053 * wrong if some XS code has changed the locale behind our back) */
2054
2055 DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n",
2056 PL_numeric_name));
2057
2058 void_setlocale_c(LC_NUMERIC, PL_numeric_name);
2059 PL_numeric_underlying = TRUE;
2060 sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
2061
2062 PL_numeric_standard = PL_numeric_underlying_is_standard;
2063
2064 # endif /* USE_LOCALE_NUMERIC */
2065
2066 }
2067
2068 # ifdef USE_LOCALE_CTYPE
2069
2070 STATIC void
S_new_ctype(pTHX_ const char * newctype,bool force)2071 S_new_ctype(pTHX_ const char *newctype, bool force)
2072 {
2073 PERL_ARGS_ASSERT_NEW_CTYPE;
2074 PERL_UNUSED_ARG(force);
2075
2076 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell
2077 * core Perl this and that 'newctype' is the name of the new locale.
2078 *
2079 * This function sets up the folding arrays for all 256 bytes, assuming
2080 * that tofold() is tolc() since fold case is not a concept in POSIX,
2081 */
2082
2083 DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", newctype));
2084
2085 /* No change means no-op */
2086 if (strEQ(PL_ctype_name, newctype)) {
2087 return;
2088 }
2089
2090 /* We will replace any bad locale warning with 1) nothing if the new one is
2091 * ok; or 2) a new warning for the bad new locale */
2092 if (PL_warn_locale) {
2093 SvREFCNT_dec_NN(PL_warn_locale);
2094 PL_warn_locale = NULL;
2095 }
2096
2097 /* Clear cache */
2098 Safefree(PL_ctype_name);
2099 PL_ctype_name = "";
2100
2101 PL_in_utf8_turkic_locale = FALSE;
2102
2103 /* For the C locale, just use the standard folds, and we know there are no
2104 * glitches possible, so return early. Since this is called at process
2105 * initialization, be aware that this bit can't rely on much being
2106 * available. */
2107 if (isNAME_C_OR_POSIX(newctype)) {
2108 Copy(PL_fold, PL_fold_locale, 256, U8);
2109 PL_ctype_name = savepv(newctype);
2110 PL_in_utf8_CTYPE_locale = FALSE;
2111 return;
2112 }
2113
2114 /* The cache being cleared signals this function to compute a new value */
2115 PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
2116
2117 PL_ctype_name = savepv(newctype);
2118 bool maybe_utf8_turkic = FALSE;
2119
2120 /* Don't check for problems if we are suppressing the warnings */
2121 bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
2122
2123 if (PL_in_utf8_CTYPE_locale) {
2124
2125 /* A UTF-8 locale gets standard rules. But note that code still has to
2126 * handle this specially because of the three problematic code points
2127 * */
2128 Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
2129
2130 /* UTF-8 locales can have special handling for 'I' and 'i' if they are
2131 * Turkic. Make sure these two are the only anomalies. (We don't
2132 * require towupper and towlower because they aren't in C89.) */
2133
2134 # if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
2135
2136 if (towupper('i') == 0x130 && towlower('I') == 0x131)
2137
2138 # else
2139
2140 if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
2141
2142 # endif
2143
2144 {
2145 /* This is how we determine it really is Turkic */
2146 check_for_problems = TRUE;
2147 maybe_utf8_turkic = TRUE;
2148 }
2149 }
2150 else { /* Not a canned locale we know the values for. Compute them */
2151
2152 # ifdef DEBUGGING
2153
2154 bool has_non_ascii_fold = FALSE;
2155 bool found_unexpected = FALSE;
2156
2157 /* Under -DLv, see if there are any folds outside the ASCII range.
2158 * This factoid is used below */
2159 if (DEBUG_Lv_TEST) {
2160 for (unsigned i = 128; i < 256; i++) {
2161 int j = LATIN1_TO_NATIVE(i);
2162 if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
2163 has_non_ascii_fold = TRUE;
2164 break;
2165 }
2166 }
2167 }
2168
2169 # endif
2170
2171 for (unsigned i = 0; i < 256; i++) {
2172 if (isU8_UPPER_LC(i))
2173 PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
2174 else if (isU8_LOWER_LC(i))
2175 PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
2176 else
2177 PL_fold_locale[i] = (U8) i;
2178
2179 # ifdef DEBUGGING
2180
2181 /* Most locales these days are supersets of ASCII. When debugging
2182 * with -DLv, it is helpful to know what the exceptions to that are
2183 * in this locale */
2184 if (DEBUG_Lv_TEST) {
2185 bool unexpected = FALSE;
2186
2187 if (isUPPER_L1(i)) {
2188 if (isUPPER_A(i)) {
2189 if (PL_fold_locale[i] != toLOWER_A(i)) {
2190 unexpected = TRUE;
2191 }
2192 }
2193 else if (has_non_ascii_fold) {
2194 if (PL_fold_locale[i] != toLOWER_L1(i)) {
2195 unexpected = TRUE;
2196 }
2197 }
2198 else if (PL_fold_locale[i] != i) {
2199 unexpected = TRUE;
2200 }
2201 }
2202 else if ( isLOWER_L1(i)
2203 && i != LATIN_SMALL_LETTER_SHARP_S
2204 && i != MICRO_SIGN)
2205 {
2206 if (isLOWER_A(i)) {
2207 if (PL_fold_locale[i] != toUPPER_A(i)) {
2208 unexpected = TRUE;
2209 }
2210 }
2211 else if (has_non_ascii_fold) {
2212 if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
2213 unexpected = TRUE;
2214 }
2215 }
2216 else if (PL_fold_locale[i] != i) {
2217 unexpected = TRUE;
2218 }
2219 }
2220 else if (PL_fold_locale[i] != i) {
2221 unexpected = TRUE;
2222 }
2223
2224 if (unexpected) {
2225 found_unexpected = TRUE;
2226 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2227 "For %s, fold of %02x is %02x\n",
2228 newctype, i, PL_fold_locale[i]));
2229 }
2230 }
2231 }
2232
2233 if (found_unexpected) {
2234 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2235 "All bytes not mentioned above either fold to"
2236 " themselves or are the expected ASCII or"
2237 " Latin1 ones\n"));
2238 }
2239 else {
2240 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2241 "No nonstandard folds were found\n"));
2242 # endif
2243
2244 }
2245 }
2246
2247 # ifdef MB_CUR_MAX
2248
2249 /* We only handle single-byte locales (outside of UTF-8 ones); so if this
2250 * locale requires more than one byte, there are going to be BIG problems.
2251 * */
2252
2253 if (MB_CUR_MAX > 1 && ! PL_in_utf8_CTYPE_locale
2254
2255 /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
2256 * Just assume that the implementation for them (plus for POSIX) is
2257 * correct and the > 1 value is spurious. (Since these are
2258 * specially handled to never be considered UTF-8 locales, as long
2259 * as this is the only problem, everything should work fine */
2260 && ! isNAME_C_OR_POSIX(newctype))
2261 {
2262 DEBUG_L(PerlIO_printf(Perl_debug_log,
2263 "Unsupported, MB_CUR_MAX=%d\n", (int) MB_CUR_MAX));
2264
2265 Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE),
2266 "Locale '%s' is unsupported, and may crash the"
2267 " interpreter.\n",
2268 newctype);
2269 }
2270
2271 # endif
2272
2273 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
2274 check_for_problems));
2275
2276 /* We don't populate the other lists if a UTF-8 locale, but do check that
2277 * everything works as expected, unless checking turned off */
2278 if (check_for_problems) {
2279 /* Assume enough space for every character being bad. 4 spaces each
2280 * for the 94 printable characters that are output like "'x' "; and 5
2281 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
2282 * NUL */
2283 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
2284 unsigned int bad_count = 0; /* Count of bad characters */
2285
2286 for (unsigned i = 0; i < 256; i++) {
2287
2288 /* If checking for locale problems, see if the native ASCII-range
2289 * printables plus \n and \t are in their expected categories in
2290 * the new locale. If not, this could mean big trouble, upending
2291 * Perl's and most programs' assumptions, like having a
2292 * metacharacter with special meaning become a \w. Fortunately,
2293 * it's very rare to find locales that aren't supersets of ASCII
2294 * nowadays. It isn't a problem for most controls to be changed
2295 * into something else; we check only \n and \t, though perhaps \r
2296 * could be an issue as well. */
2297 if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
2298 bool is_bad = FALSE;
2299 char name[4] = { '\0' };
2300
2301 /* Convert the name into a string */
2302 if (isGRAPH_A(i)) {
2303 name[0] = i;
2304 name[1] = '\0';
2305 }
2306 else if (i == '\n') {
2307 my_strlcpy(name, "\\n", sizeof(name));
2308 }
2309 else if (i == '\t') {
2310 my_strlcpy(name, "\\t", sizeof(name));
2311 }
2312 else {
2313 assert(i == ' ');
2314 my_strlcpy(name, "' '", sizeof(name));
2315 }
2316
2317 /* Check each possibe class */
2318 if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) !=
2319 cBOOL(isALPHANUMERIC_A(i))))
2320 {
2321 is_bad = TRUE;
2322 DEBUG_L(PerlIO_printf(Perl_debug_log,
2323 "isalnum('%s') unexpectedly is %x\n",
2324 name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
2325 }
2326 if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i)))) {
2327 is_bad = TRUE;
2328 DEBUG_L(PerlIO_printf(Perl_debug_log,
2329 "isalpha('%s') unexpectedly is %x\n",
2330 name, cBOOL(isU8_ALPHA_LC(i))));
2331 }
2332 if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i)))) {
2333 is_bad = TRUE;
2334 DEBUG_L(PerlIO_printf(Perl_debug_log,
2335 "isdigit('%s') unexpectedly is %x\n",
2336 name, cBOOL(isU8_DIGIT_LC(i))));
2337 }
2338 if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i)))) {
2339 is_bad = TRUE;
2340 DEBUG_L(PerlIO_printf(Perl_debug_log,
2341 "isgraph('%s') unexpectedly is %x\n",
2342 name, cBOOL(isU8_GRAPH_LC(i))));
2343 }
2344 if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i)))) {
2345 is_bad = TRUE;
2346 DEBUG_L(PerlIO_printf(Perl_debug_log,
2347 "islower('%s') unexpectedly is %x\n",
2348 name, cBOOL(isU8_LOWER_LC(i))));
2349 }
2350 if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i)))) {
2351 is_bad = TRUE;
2352 DEBUG_L(PerlIO_printf(Perl_debug_log,
2353 "isprint('%s') unexpectedly is %x\n",
2354 name, cBOOL(isU8_PRINT_LC(i))));
2355 }
2356 if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i)))) {
2357 is_bad = TRUE;
2358 DEBUG_L(PerlIO_printf(Perl_debug_log,
2359 "ispunct('%s') unexpectedly is %x\n",
2360 name, cBOOL(isU8_PUNCT_LC(i))));
2361 }
2362 if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i)))) {
2363 is_bad = TRUE;
2364 DEBUG_L(PerlIO_printf(Perl_debug_log,
2365 "isspace('%s') unexpectedly is %x\n",
2366 name, cBOOL(isU8_SPACE_LC(i))));
2367 }
2368 if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i)))) {
2369 is_bad = TRUE;
2370 DEBUG_L(PerlIO_printf(Perl_debug_log,
2371 "isupper('%s') unexpectedly is %x\n",
2372 name, cBOOL(isU8_UPPER_LC(i))));
2373 }
2374 if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) {
2375 is_bad = TRUE;
2376 DEBUG_L(PerlIO_printf(Perl_debug_log,
2377 "isxdigit('%s') unexpectedly is %x\n",
2378 name, cBOOL(isU8_XDIGIT_LC(i))));
2379 }
2380 if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
2381 is_bad = TRUE;
2382 DEBUG_L(PerlIO_printf(Perl_debug_log,
2383 "tolower('%s')=0x%x instead of the expected 0x%x\n",
2384 name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
2385 }
2386 if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
2387 is_bad = TRUE;
2388 DEBUG_L(PerlIO_printf(Perl_debug_log,
2389 "toupper('%s')=0x%x instead of the expected 0x%x\n",
2390 name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
2391 }
2392 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) {
2393 is_bad = TRUE;
2394 DEBUG_L(PerlIO_printf(Perl_debug_log,
2395 "'\\n' (=%02X) is not a control\n", (int) i));
2396 }
2397
2398 /* Add to the list; Separate multiple entries with a blank */
2399 if (is_bad) {
2400 if (bad_count) {
2401 my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list));
2402 }
2403 my_strlcat(bad_chars_list, name, sizeof(bad_chars_list));
2404 bad_count++;
2405 }
2406 }
2407 }
2408
2409 if (bad_count == 2 && maybe_utf8_turkic) {
2410 bad_count = 0;
2411 *bad_chars_list = '\0';
2412
2413 /* The casts are because otherwise some compilers warn:
2414 gcc.gnu.org/bugzilla/show_bug.cgi?id=99950
2415 gcc.gnu.org/bugzilla/show_bug.cgi?id=94182
2416 */
2417 PL_fold_locale[ (U8) 'I' ] = 'I';
2418 PL_fold_locale[ (U8) 'i' ] = 'i';
2419 PL_in_utf8_turkic_locale = TRUE;
2420 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
2421 }
2422
2423 /* If we found problems and we want them output, do so */
2424 if ( (UNLIKELY(bad_count))
2425 && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
2426 {
2427 /* WARNING. If you change the wording of these; be sure to update
2428 * t/loc_tools.pl correspondingly */
2429
2430 if (PL_in_utf8_CTYPE_locale) {
2431 PL_warn_locale = Perl_newSVpvf(aTHX_
2432 "Locale '%s' contains (at least) the following characters"
2433 " which have\nunexpected meanings: %s\nThe Perl program"
2434 " will use the expected meanings",
2435 newctype, bad_chars_list);
2436 }
2437 else {
2438 PL_warn_locale =
2439 Perl_newSVpvf(aTHX_
2440 "\nThe following characters (and maybe"
2441 " others) may not have the same meaning as"
2442 " the Perl program expects: %s\n",
2443 bad_chars_list
2444 );
2445 }
2446
2447 # ifdef HAS_SOME_LANGINFO
2448
2449 const char * scratch_buffer = NULL;
2450 Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
2451 my_langinfo_c(CODESET, LC_CTYPE,
2452 newctype,
2453 &scratch_buffer, NULL,
2454 NULL));
2455 Safefree(scratch_buffer);
2456
2457 # endif
2458
2459 Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n");
2460
2461 /* If we are actually in the scope of the locale or are debugging,
2462 * output the message now. If not in that scope, we save the
2463 * message to be output at the first operation using this locale,
2464 * if that actually happens. Most programs don't use locales, so
2465 * they are immune to bad ones. */
2466 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
2467
2468 /* The '0' below suppresses a bogus gcc compiler warning */
2469 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
2470 0);
2471
2472 if (IN_LC(LC_CTYPE)) {
2473 SvREFCNT_dec_NN(PL_warn_locale);
2474 PL_warn_locale = NULL;
2475 }
2476 }
2477 }
2478 }
2479 }
2480
2481 # endif /* USE_LOCALE_CTYPE */
2482
2483 void
Perl__warn_problematic_locale()2484 Perl__warn_problematic_locale()
2485 {
2486
2487 # ifdef USE_LOCALE_CTYPE
2488
2489 dTHX;
2490
2491 /* Internal-to-core function that outputs the message in PL_warn_locale,
2492 * and then NULLS it. Should be called only through the macro
2493 * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
2494
2495 if (PL_warn_locale) {
2496 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2497 SvPVX(PL_warn_locale),
2498 0 /* dummy to avoid compiler warning */ );
2499 SvREFCNT_dec_NN(PL_warn_locale);
2500 PL_warn_locale = NULL;
2501 }
2502
2503 # endif
2504
2505 }
2506
2507 STATIC void
S_new_LC_ALL(pTHX_ const char * unused,bool force)2508 S_new_LC_ALL(pTHX_ const char *unused, bool force)
2509 {
2510 PERL_ARGS_ASSERT_NEW_LC_ALL;
2511 PERL_UNUSED_ARG(unused);
2512
2513 /* LC_ALL updates all the things we care about. */
2514
2515 for (unsigned int i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
2516 if (update_functions[i]) {
2517 const char * this_locale = querylocale_i(i);
2518 update_functions[i](aTHX_ this_locale, force);
2519 }
2520 }
2521 }
2522
2523 # ifdef USE_LOCALE_COLLATE
2524
2525 STATIC void
S_new_collate(pTHX_ const char * newcoll,bool force)2526 S_new_collate(pTHX_ const char *newcoll, bool force)
2527 {
2528 PERL_ARGS_ASSERT_NEW_COLLATE;
2529 PERL_UNUSED_ARG(force);
2530
2531 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell
2532 * core Perl this and that 'newcoll' is the name of the new locale.
2533 *
2534 * The design of locale collation is that every locale change is given an
2535 * index 'PL_collation_ix'. The first time a string participates in an
2536 * operation that requires collation while locale collation is active, it
2537 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
2538 * magic includes the collation index, and the transformation of the string
2539 * by strxfrm(), q.v. That transformation is used when doing comparisons,
2540 * instead of the string itself. If a string changes, the magic is
2541 * cleared. The next time the locale changes, the index is incremented,
2542 * and so we know during a comparison that the transformation is not
2543 * necessarily still valid, and so is recomputed. Note that if the locale
2544 * changes enough times, the index could wrap (a U32), and it is possible
2545 * that a transformation would improperly be considered valid, leading to
2546 * an unlikely bug */
2547
2548 /* Return if the locale isn't changing */
2549 if (strEQ(PL_collation_name, newcoll)) {
2550 return;
2551 }
2552
2553 Safefree(PL_collation_name);
2554 PL_collation_name = savepv(newcoll);
2555 ++PL_collation_ix;
2556
2557 /* Set the new one up if trivial. Since this is called at process
2558 * initialization, be aware that this bit can't rely on much being
2559 * available. */
2560 PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
2561 if (PL_collation_standard) {
2562 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
2563 "Setting PL_collation name='%s'\n",
2564 PL_collation_name));
2565 PL_collxfrm_base = 0;
2566 PL_collxfrm_mult = 2;
2567 PL_in_utf8_COLLATE_locale = FALSE;
2568 PL_strxfrm_NUL_replacement = '\0';
2569 PL_strxfrm_max_cp = 0;
2570 return;
2571 }
2572
2573 /* Flag that the remainder of the set up is being deferred until first
2574 * need. */
2575 PL_collxfrm_mult = 0;
2576 PL_collxfrm_base = 0;
2577
2578 }
2579
2580 # endif /* USE_LOCALE_COLLATE */
2581 #endif /* USE_LOCALE */
2582
2583 #ifdef WIN32
2584
2585 wchar_t *
S_Win_byte_string_to_wstring(const UINT code_page,const char * byte_string)2586 S_Win_byte_string_to_wstring(const UINT code_page, const char * byte_string)
2587 {
2588 /* Caller must arrange to free the returned string */
2589
2590 int req_size = MultiByteToWideChar(code_page, 0, byte_string, -1, NULL, 0);
2591 if (! req_size) {
2592 SET_EINVAL;
2593 return NULL;
2594 }
2595
2596 wchar_t *wstring;
2597 Newx(wstring, req_size, wchar_t);
2598
2599 if (! MultiByteToWideChar(code_page, 0, byte_string, -1, wstring, req_size))
2600 {
2601 Safefree(wstring);
2602 SET_EINVAL;
2603 return NULL;
2604 }
2605
2606 return wstring;
2607 }
2608
2609 #define Win_utf8_string_to_wstring(s) Win_byte_string_to_wstring(CP_UTF8, (s))
2610
2611 char *
S_Win_wstring_to_byte_string(const UINT code_page,const wchar_t * wstring)2612 S_Win_wstring_to_byte_string(const UINT code_page, const wchar_t * wstring)
2613 {
2614 /* Caller must arrange to free the returned string */
2615
2616 int req_size =
2617 WideCharToMultiByte(code_page, 0, wstring, -1, NULL, 0, NULL, NULL);
2618
2619 char *byte_string;
2620 Newx(byte_string, req_size, char);
2621
2622 if (! WideCharToMultiByte(code_page, 0, wstring, -1, byte_string,
2623 req_size, NULL, NULL))
2624 {
2625 Safefree(byte_string);
2626 SET_EINVAL;
2627 return NULL;
2628 }
2629
2630 return byte_string;
2631 }
2632
2633 #define Win_wstring_to_utf8_string(ws) Win_wstring_to_byte_string(CP_UTF8, (ws))
2634
2635 STATIC const char *
S_wrap_wsetlocale(pTHX_ const int category,const char * locale)2636 S_wrap_wsetlocale(pTHX_ const int category, const char *locale)
2637 {
2638 PERL_ARGS_ASSERT_WRAP_WSETLOCALE;
2639
2640 /* Calls _wsetlocale(), converting the parameters/return to/from
2641 * Perl-expected forms as if plain setlocale() were being called instead.
2642 */
2643
2644 const wchar_t * wlocale = NULL;
2645
2646 if (locale) {
2647 wlocale = Win_utf8_string_to_wstring(locale);
2648 if (! wlocale) {
2649 return NULL;
2650 }
2651 }
2652
2653 WSETLOCALE_LOCK;
2654 const wchar_t * wresult = _wsetlocale(category, wlocale);
2655 Safefree(wlocale);
2656
2657 if (! wresult) {
2658 WSETLOCALE_UNLOCK;
2659 return NULL;
2660 }
2661
2662 const char * result = Win_wstring_to_utf8_string(wresult);
2663 WSETLOCALE_UNLOCK;
2664
2665 SAVEFREEPV(result); /* is there something better we can do here? Answer:
2666 Without restructuring, returning a unique value each
2667 call is required. See GH #20434 */
2668 return result;
2669 }
2670
2671 STATIC const char *
S_win32_setlocale(pTHX_ int category,const char * locale)2672 S_win32_setlocale(pTHX_ int category, const char* locale)
2673 {
2674 /* This, for Windows, emulates POSIX setlocale() behavior. There is no
2675 * difference between the two unless the input locale is "", which normally
2676 * means on Windows to get the machine default, which is set via the
2677 * computer's "Regional and Language Options" (or its current equivalent).
2678 * In POSIX, it instead means to find the locale from the user's
2679 * environment. This routine changes the Windows behavior to first look in
2680 * the environment, and, if anything is found, use that instead of going to
2681 * the machine default. If there is no environment override, the machine
2682 * default is used, by calling the real setlocale() with "".
2683 *
2684 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to
2685 * use the particular category's variable if set; otherwise to use the LANG
2686 * variable. */
2687
2688 if (locale == NULL) {
2689 return wrap_wsetlocale(category, NULL);
2690 }
2691
2692 if (strEQ(locale, "")) {
2693 /* Note this function may change the locale, but that's ok because we
2694 * are about to change it anyway */
2695 locale = find_locale_from_environment(get_category_index(category, ""));
2696 }
2697
2698 const char * result = wrap_wsetlocale(category, locale);
2699 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2700 setlocale_debug_string_r(category, locale, result)));
2701
2702 # ifdef USE_PL_CUR_LC_ALL
2703
2704 /* If we need to keep track of LC_ALL, update it to the new value. */
2705 Safefree(PL_cur_LC_ALL);
2706 if (category == LC_ALL) {
2707 PL_cur_LC_ALL = savepv(result);
2708 }
2709 else {
2710 PL_cur_LC_ALL = savepv(wrap_wsetlocale(LC_ALL, NULL));
2711 }
2712
2713 # endif
2714
2715 return result;
2716 }
2717
2718 #endif
2719
2720 /*
2721 =for apidoc Perl_setlocale
2722
2723 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>,
2724 taking the same parameters, and returning the same information, except that it
2725 returns the correct underlying C<LC_NUMERIC> locale. Regular C<setlocale> will
2726 instead return C<C> if the underlying locale has a non-dot decimal point
2727 character, or a non-empty thousands separator for displaying floating point
2728 numbers. This is because perl keeps that locale category such that it has a
2729 dot and empty separator, changing the locale briefly during the operations
2730 where the underlying one is required. C<Perl_setlocale> knows about this, and
2731 compensates; regular C<setlocale> doesn't.
2732
2733 Another reason it isn't completely a drop-in replacement is that it is
2734 declared to return S<C<const char *>>, whereas the system setlocale omits the
2735 C<const> (presumably because its API was specified long ago, and can't be
2736 updated; it is illegal to change the information C<setlocale> returns; doing
2737 so leads to segfaults.)
2738
2739 Finally, C<Perl_setlocale> works under all circumstances, whereas plain
2740 C<setlocale> can be completely ineffective on some platforms under some
2741 configurations.
2742
2743 Changing the locale is not a good idea when more than one thread is running,
2744 except on systems where the predefined variable C<${^SAFE_LOCALES}> is 1.
2745 This is because on such systems the locale is global to the whole process and
2746 not local to just the thread calling the function. So changing it in one
2747 thread instantaneously changes it in all. On some such systems, the system
2748 C<setlocale()> is ineffective, returning the wrong information, and failing to
2749 actually change the locale. z/OS refuses to try to change the locale once a
2750 second thread is created. C<Perl_setlocale>, should give you accurate results
2751 of what actually happened on these problematic platforms, returning NULL if the
2752 system forbade the locale change.
2753
2754 The return points to a per-thread static buffer, which is overwritten the next
2755 time C<Perl_setlocale> is called from the same thread.
2756
2757 =cut
2758
2759 */
2760
2761 #ifndef USE_LOCALE_NUMERIC
2762 # define affects_LC_NUMERIC(cat) 0
2763 #elif defined(LC_ALL)
2764 # define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC || cat == LC_ALL)
2765 #else
2766 # define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC)
2767 #endif
2768
2769 const char *
Perl_setlocale(const int category,const char * locale)2770 Perl_setlocale(const int category, const char * locale)
2771 {
2772 /* This wraps POSIX::setlocale() */
2773
2774 #ifndef USE_LOCALE
2775
2776 PERL_UNUSED_ARG(category);
2777 PERL_UNUSED_ARG(locale);
2778
2779 return "C";
2780
2781 #else
2782
2783 const char * retval;
2784 dTHX;
2785
2786 DEBUG_L(PerlIO_printf(Perl_debug_log,
2787 "Entering Perl_setlocale(%d, \"%s\")\n",
2788 category, locale));
2789
2790 /* A NULL locale means only query what the current one is. */
2791 if (locale == NULL) {
2792
2793 # ifndef USE_LOCALE_NUMERIC
2794
2795 /* Without LC_NUMERIC, it's trivial; we just return the value */
2796 return save_to_buffer(querylocale_r(category),
2797 &PL_setlocale_buf, &PL_setlocale_bufsize);
2798 # else
2799
2800 /* We have the LC_NUMERIC name saved, because we are normally switched
2801 * into the C locale (or equivalent) for it. */
2802 if (category == LC_NUMERIC) {
2803 DEBUG_L(PerlIO_printf(Perl_debug_log,
2804 "Perl_setlocale(LC_NUMERIC, NULL) returning stashed '%s'\n",
2805 PL_numeric_name));
2806
2807 /* We don't have to copy this return value, as it is a per-thread
2808 * variable, and won't change until a future setlocale */
2809 return PL_numeric_name;
2810 }
2811
2812 # ifndef LC_ALL
2813
2814 /* Without LC_ALL, just return the value */
2815 return save_to_buffer(querylocale_r(category),
2816 &PL_setlocale_buf, &PL_setlocale_bufsize);
2817
2818 # else
2819
2820 /* Here, LC_ALL is available on this platform. It's the one
2821 * complicating category (because it can contain a toggled LC_NUMERIC
2822 * value), for all the remaining ones (we took care of LC_NUMERIC
2823 * above), just return the value */
2824 if (category != LC_ALL) {
2825 return save_to_buffer(querylocale_r(category),
2826 &PL_setlocale_buf, &PL_setlocale_bufsize);
2827 }
2828
2829 bool toggled = FALSE;
2830
2831 /* For an LC_ALL query, switch back to the underlying numeric locale
2832 * (if we aren't there already) so as to get the correct results. Our
2833 * records for all the other categories are valid without switching */
2834 if (! PL_numeric_underlying) {
2835 set_numeric_underlying();
2836 toggled = TRUE;
2837 }
2838
2839 retval = querylocale_c(LC_ALL);
2840
2841 if (toggled) {
2842 set_numeric_standard();
2843 }
2844
2845 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2846 setlocale_debug_string_r(category, locale, retval)));
2847
2848 return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2849
2850 # endif /* Has LC_ALL */
2851 # endif /* Has LC_NUMERIC */
2852
2853 } /* End of querying the current locale */
2854
2855
2856 unsigned int cat_index = get_category_index(category, NULL);
2857 retval = querylocale_i(cat_index);
2858
2859 /* If the new locale is the same as the current one, nothing is actually
2860 * being changed, so do nothing. */
2861 if ( retval != NULL && strEQ(retval, locale)
2862 && ( ! affects_LC_NUMERIC(category)
2863
2864 # ifdef USE_LOCALE_NUMERIC
2865
2866 || strEQ(locale, PL_numeric_name)
2867
2868 # endif
2869
2870 )) {
2871 DEBUG_L(PerlIO_printf(Perl_debug_log,
2872 "Already in requested locale: no action taken\n"));
2873 return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2874 }
2875
2876 /* Here, an actual change is being requested. Do it */
2877 retval = setlocale_i(cat_index, locale);
2878
2879 if (! retval) {
2880 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
2881 setlocale_debug_string_i(cat_index, locale, "NULL")));
2882 return NULL;
2883 }
2884
2885 assert(strNE(retval, ""));
2886 retval = save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
2887
2888 /* Now that have changed locales, we have to update our records to
2889 * correspond. Only certain categories have extra work to update. */
2890 if (update_functions[cat_index]) {
2891 update_functions[cat_index](aTHX_ retval, false);
2892 }
2893
2894 DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", retval));
2895
2896 return retval;
2897
2898 #endif
2899
2900 }
2901
2902 STATIC utf8ness_t
S_get_locale_string_utf8ness_i(pTHX_ const char * string,const locale_utf8ness_t known_utf8,const char * locale,const unsigned cat_index)2903 S_get_locale_string_utf8ness_i(pTHX_ const char * string,
2904 const locale_utf8ness_t known_utf8,
2905 const char * locale,
2906 const unsigned cat_index)
2907 {
2908 PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
2909
2910 #ifndef USE_LOCALE
2911
2912 return UTF8NESS_NO;
2913 PERL_UNUSED_ARG(string);
2914 PERL_UNUSED_ARG(known_utf8);
2915 PERL_UNUSED_ARG(locale);
2916 PERL_UNUSED_ARG(cat_index);
2917
2918 #else
2919
2920 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
2921
2922 /* Return to indicate if 'string' in the locale given by the input
2923 * arguments should be considered UTF-8 or not.
2924 *
2925 * If the input 'locale' is not NULL, use that for the locale; otherwise
2926 * use the current locale for the category specified by 'cat_index'.
2927 */
2928
2929 if (string == NULL) {
2930 return UTF8NESS_NO;
2931 }
2932
2933 if (IN_BYTES) { /* respect 'use bytes' */
2934 return UTF8NESS_NO;
2935 }
2936
2937 Size_t len = strlen(string);
2938
2939 /* UTF8ness is immaterial if the representation doesn't vary */
2940 const U8 * first_variant = NULL;
2941 if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
2942 return UTF8NESS_IMMATERIAL;
2943 }
2944
2945 /* Can't be UTF-8 if invalid */
2946 if (! is_utf8_string((U8 *) first_variant,
2947 len - ((char *) first_variant - string)))
2948 {
2949 return UTF8NESS_NO;
2950 }
2951
2952 /* Here and below, we know the string is legal UTF-8, containing at least
2953 * one character requiring a sequence of two or more bytes. It is quite
2954 * likely to be UTF-8. But it pays to be paranoid and do further checking.
2955 *
2956 * If we already know the UTF-8ness of the locale, then we immediately know
2957 * what the string is */
2958 if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
2959 if (known_utf8 == LOCALE_IS_UTF8) {
2960 return UTF8NESS_YES;
2961 }
2962 else {
2963 return UTF8NESS_NO;
2964 }
2965 }
2966
2967 # ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION
2968
2969 /* Here, we have available the libc functions that can be used to
2970 * accurately determine the UTF8ness of the underlying locale. If it is a
2971 * UTF-8 locale, the string is UTF-8; otherwise it was coincidental that
2972 * the string is legal UTF-8
2973 *
2974 * However, if the perl is compiled to not pay attention to the category
2975 * being passed in, you might think that that locale is essentially always
2976 * the C locale, so it would make sense to say it isn't UTF-8. But to get
2977 * here, the string has to contain characters unknown in the C locale. And
2978 * in fact, Windows boxes are compiled without LC_MESSAGES, as their
2979 * message catalog isn't really a part of the locale system. But those
2980 * messages really could be UTF-8, and given that the odds are rather small
2981 * of something not being UTF-8 but being syntactically valid UTF-8, khw
2982 * has decided to call such strings as UTF-8. */
2983
2984 if (locale == NULL) {
2985 locale = querylocale_i(cat_index);
2986 }
2987 if (is_locale_utf8(locale)) {
2988 return UTF8NESS_YES;
2989 }
2990
2991 return UTF8NESS_NO;
2992
2993 # else
2994
2995 /* Here, we have a valid UTF-8 string containing non-ASCII characters, and
2996 * don't have access to functions to check if the locale is UTF-8 or not.
2997 * Assume that it is. khw tried adding a check that the string is entirely
2998 * in a single Unicode script, but discovered the strftime() timezone is
2999 * user-settable through the environment, which may be in a different
3000 * script than the locale-expected value. */
3001 PERL_UNUSED_ARG(locale);
3002 PERL_UNUSED_ARG(cat_index);
3003
3004 return UTF8NESS_YES;
3005
3006 # endif
3007 #endif
3008
3009 }
3010
3011 STATIC bool
S_is_locale_utf8(pTHX_ const char * locale)3012 S_is_locale_utf8(pTHX_ const char * locale)
3013 {
3014 /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. It uses
3015 * my_langinfo(), which employs various methods to get this information
3016 * if nl_langinfo() isn't available, using heuristics as a last resort, in
3017 * which case, the result will very likely be correct for locales for
3018 * languages that have commonly used non-ASCII characters, but for notably
3019 * English, it comes down to if the locale's name ends in something like
3020 * "UTF-8". It errs on the side of not being a UTF-8 locale. */
3021
3022 # if ! defined(USE_LOCALE) \
3023 || ! defined(USE_LOCALE_CTYPE) \
3024 || defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
3025
3026 PERL_UNUSED_ARG(locale);
3027
3028 return FALSE;
3029
3030 # else
3031
3032 const char * scratch_buffer = NULL;
3033 const char * codeset;
3034 bool retval;
3035
3036 PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
3037
3038 if (strEQ(locale, PL_ctype_name)) {
3039 return PL_in_utf8_CTYPE_locale;
3040 }
3041
3042 codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
3043 &scratch_buffer, NULL, NULL);
3044 retval = is_codeset_name_UTF8(codeset);
3045
3046 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3047 "found codeset=%s, is_utf8=%d\n", codeset, retval));
3048
3049 Safefree(scratch_buffer);
3050 return retval;
3051
3052 # endif
3053
3054 }
3055
3056 #ifdef USE_LOCALE
3057
3058 STATIC const char *
S_save_to_buffer(const char * string,const char ** buf,Size_t * buf_size)3059 S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size)
3060 {
3061 /* Copy the NUL-terminated 'string' to a buffer whose address before this
3062 * call began at *buf, and whose available length before this call was
3063 * *buf_size.
3064 *
3065 * If the length of 'string' is greater than the space available, the
3066 * buffer is grown accordingly, which may mean that it gets relocated.
3067 * *buf and *buf_size will be updated to reflect this.
3068 *
3069 * Regardless, the function returns a pointer to where 'string' is now
3070 * stored.
3071 *
3072 * 'string' may be NULL, which means no action gets taken, and NULL is
3073 * returned.
3074 *
3075 * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
3076 * empty, and memory is malloc'd. 'buf-size' being NULL is to be used
3077 * when this is a single use buffer, which will shortly be freed by the
3078 * caller.
3079 */
3080
3081 Size_t string_size;
3082
3083 PERL_ARGS_ASSERT_SAVE_TO_BUFFER;
3084
3085 if (! string) {
3086 return NULL;
3087 }
3088
3089 /* No-op to copy over oneself */
3090 if (string == *buf) {
3091 return string;
3092 }
3093
3094 string_size = strlen(string) + 1;
3095
3096 if (buf_size == NULL) {
3097 Newx(*buf, string_size, char);
3098 }
3099 else if (*buf_size == 0) {
3100 Newx(*buf, string_size, char);
3101 *buf_size = string_size;
3102 }
3103 else if (string_size > *buf_size) {
3104 Renew(*buf, string_size, char);
3105 *buf_size = string_size;
3106 }
3107
3108 {
3109 dTHX_DEBUGGING;
3110 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3111 "Copying '%s' to %p\n",
3112 ((is_utf8_string((U8 *) string, 0))
3113 ? string
3114 :_byte_dump_string((U8 *) string, strlen(string), 0)),
3115 *buf));
3116 }
3117
3118 # ifdef DEBUGGING
3119
3120 /* Catch glitches. Usually this is because LC_CTYPE needs to be the same
3121 * locale as whatever is being worked on */
3122 if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) {
3123 dTHX_DEBUGGING;
3124
3125 locale_panic_(Perl_form(aTHX_
3126 "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s",
3127 string, get_LC_ALL_display()));
3128 }
3129
3130 # endif
3131
3132 Copy(string, *buf, string_size, char);
3133 return *buf;
3134 }
3135
3136 # ifdef WIN32
3137
3138 bool
Perl_get_win32_message_utf8ness(pTHX_ const char * string)3139 Perl_get_win32_message_utf8ness(pTHX_ const char * string)
3140 {
3141 /* NULL => locale irrelevant, 0 => category irrelevant
3142 * so returns based on the UTF-8 legality of the input string, ignoring the
3143 * locale and category completely.
3144 *
3145 * This is because Windows doesn't have LC_MESSAGES */
3146 return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8, NULL, 0);
3147 }
3148
3149 # endif
3150 #endif /* USE_LOCALE */
3151
3152
3153 int
Perl_mbtowc_(pTHX_ const wchar_t * pwc,const char * s,const Size_t len)3154 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
3155 {
3156
3157 #if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
3158
3159 PERL_UNUSED_ARG(pwc);
3160 PERL_UNUSED_ARG(s);
3161 PERL_UNUSED_ARG(len);
3162 return -1;
3163
3164 #else /* Below we have some form of mbtowc() */
3165 # if defined(HAS_MBRTOWC) \
3166 && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
3167 # define USE_MBRTOWC
3168 # else
3169 # undef USE_MBRTOWC
3170 # endif
3171
3172 int retval = -1;
3173
3174 if (s == NULL) { /* Initialize the shift state to all zeros in
3175 PL_mbrtowc_ps. */
3176
3177 # if defined(USE_MBRTOWC)
3178
3179 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
3180 return 0;
3181
3182 # else
3183
3184 MBTOWC_LOCK_;
3185 SETERRNO(0, 0);
3186 retval = mbtowc(NULL, NULL, 0);
3187 MBTOWC_UNLOCK_;
3188 return retval;
3189
3190 # endif
3191
3192 }
3193
3194 # if defined(USE_MBRTOWC)
3195
3196 SETERRNO(0, 0);
3197 retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
3198
3199 # else
3200
3201 /* Locking prevents races, but locales can be switched out without locking,
3202 * so this isn't a cure all */
3203 MBTOWC_LOCK_;
3204 SETERRNO(0, 0);
3205 retval = mbtowc((wchar_t *) pwc, s, len);
3206 MBTOWC_UNLOCK_;
3207
3208 # endif
3209
3210 return retval;
3211
3212 #endif
3213
3214 }
3215
3216 /*
3217 =for apidoc Perl_localeconv
3218
3219 This is a thread-safe version of the libc L<localeconv(3)>. It is the same as
3220 L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
3221 fields), but directly callable from XS code.
3222
3223 =cut
3224 */
3225
3226 HV *
Perl_localeconv(pTHX)3227 Perl_localeconv(pTHX)
3228 {
3229
3230 #if ! defined(HAS_LOCALECONV)
3231
3232 return newHV();
3233
3234 #else
3235
3236 return my_localeconv(0);
3237
3238 #endif
3239
3240 }
3241
3242 #if defined(HAS_LOCALECONV)
3243
3244 HV *
S_my_localeconv(pTHX_ const int item)3245 S_my_localeconv(pTHX_ const int item)
3246 {
3247 PERL_ARGS_ASSERT_MY_LOCALECONV;
3248
3249 /* This returns a mortalized hash containing all or one of the elements
3250 * returned by localeconv(). It is used by Perl_localeconv() and
3251 * POSIX::localeconv() and is thread-safe.
3252 *
3253 * There are two use cases:
3254 * 1) Called from POSIX::locale_conv(). This returns the lconv structure
3255 * copied to a hash, based on the current underlying locales for
3256 * LC_NUMERIC and LC_MONETARY. An input item==0 signifies this case, or
3257 * on many platforms it is the only use case compiled.
3258 * 2) Certain items that nl_langinfo() provides are also derivable from
3259 * the return of localeconv(). Windows notably doesn't have
3260 * nl_langinfo(), so on that, and actually any platform lacking it,
3261 * my_localeconv() is used also to emulate it for those particular
3262 * items. The code to do this is compiled only on such platforms.
3263 * Rather than going to the expense of creating a full hash when only
3264 * one item is needed, the returned hash has just the desired item in
3265 * it.
3266 *
3267 * To access all the localeconv() struct lconv fields, there is a data
3268 * structure that contains every commonly documented field in it. (Maybe
3269 * some minority platforms have extra fields. Those could be added here
3270 * without harm; they would just be ignored on platforms lacking them.)
3271 *
3272 * Our structure is compiled to make looping through the fields easier by
3273 * pointing each name to its value's offset within lconv, e.g.,
3274 { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) }
3275 */
3276 # define LCONV_ENTRY(name) \
3277 {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
3278
3279 /* These synonyms are just for clarity, and to make it easier in case
3280 * something needs to change in the future */
3281 # define LCONV_NUMERIC_ENTRY(name) LCONV_ENTRY(name)
3282 # define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name)
3283
3284 /* There are just a few fields for NUMERIC strings */
3285 const lconv_offset_t lconv_numeric_strings[] = {
3286 # ifndef NO_LOCALECONV_GROUPING
3287 LCONV_NUMERIC_ENTRY(grouping),
3288 # endif
3289 LCONV_NUMERIC_ENTRY(thousands_sep),
3290 LCONV_NUMERIC_ENTRY(decimal_point),
3291 {NULL, 0}
3292 };
3293
3294 /* When used to implement nl_langinfo(), we save time by only populating
3295 * the hash with the field(s) needed. Thus we would need a data structure
3296 * of just:
3297 * LCONV_NUMERIC_ENTRY(decimal_point),
3298 * {NULL, 0}
3299 *
3300 * By placing the decimal_point field last in the full structure, we can
3301 * use just the tail for this bit of it, saving space. This macro yields
3302 * the address of the sub structure. */
3303 # define DECIMAL_POINT_ADDRESS \
3304 &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)]
3305
3306 /* And the MONETARY string fields */
3307 const lconv_offset_t lconv_monetary_strings[] = {
3308 LCONV_MONETARY_ENTRY(int_curr_symbol),
3309 LCONV_MONETARY_ENTRY(mon_decimal_point),
3310 # ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
3311 LCONV_MONETARY_ENTRY(mon_thousands_sep),
3312 # endif
3313 # ifndef NO_LOCALECONV_MON_GROUPING
3314 LCONV_MONETARY_ENTRY(mon_grouping),
3315 # endif
3316 LCONV_MONETARY_ENTRY(positive_sign),
3317 LCONV_MONETARY_ENTRY(negative_sign),
3318 LCONV_MONETARY_ENTRY(currency_symbol),
3319 {NULL, 0}
3320 };
3321
3322 /* Like above, this field being last can be used as a sub structure */
3323 # define CURRENCY_SYMBOL_ADDRESS \
3324 &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)]
3325
3326 /* Finally there are integer fields, all are for monetary purposes */
3327 const lconv_offset_t lconv_integers[] = {
3328 LCONV_ENTRY(int_frac_digits),
3329 LCONV_ENTRY(frac_digits),
3330 LCONV_ENTRY(p_sep_by_space),
3331 LCONV_ENTRY(n_cs_precedes),
3332 LCONV_ENTRY(n_sep_by_space),
3333 LCONV_ENTRY(p_sign_posn),
3334 LCONV_ENTRY(n_sign_posn),
3335 # ifdef HAS_LC_MONETARY_2008
3336 LCONV_ENTRY(int_p_cs_precedes),
3337 LCONV_ENTRY(int_p_sep_by_space),
3338 LCONV_ENTRY(int_n_cs_precedes),
3339 LCONV_ENTRY(int_n_sep_by_space),
3340 LCONV_ENTRY(int_p_sign_posn),
3341 LCONV_ENTRY(int_n_sign_posn),
3342 # endif
3343 LCONV_ENTRY(p_cs_precedes),
3344 {NULL, 0}
3345 };
3346
3347 /* Like above, this field being last can be used as a sub structure */
3348 # define P_CS_PRECEDES_ADDRESS \
3349 &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)]
3350
3351 /* If we aren't paying attention to a given category, use LC_CTYPE instead;
3352 * If not paying attention to that either, the code below should end up not
3353 * using this. Make sure that things blow up if that avoidance gets lost,
3354 * by setting the category to -1 */
3355 unsigned int numeric_index;
3356 unsigned int monetary_index;
3357
3358 # ifdef USE_LOCALE_NUMERIC
3359 numeric_index = LC_NUMERIC_INDEX_;
3360 # elif defined(USE_LOCALE_CTYPE)
3361 numeric_index = LC_CTYPE_INDEX_;
3362 # else
3363 numeric_index = (unsigned) -1;
3364 # endif
3365 # ifdef USE_LOCALE_MONETARY
3366 monetary_index = LC_MONETARY_INDEX_;
3367 # elif defined(USE_LOCALE_CTYPE)
3368 monetary_index = LC_CTYPE_INDEX_;
3369 # else
3370 monetary_index = (unsigned) -1;
3371 # endif
3372
3373 /* Some platforms, for correct non-mojibake results, require LC_CTYPE's
3374 * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's
3375 * for the monetary ones. What happens if LC_NUMERIC and LC_MONETARY
3376 * aren't compatible? Wrong results. To avoid that, we call localeconv()
3377 * twice, once for each locale, setting LC_CTYPE to match the category.
3378 * But if the locales of both categories are the same, there is no need for
3379 * a second call. Assume this is the case unless overridden below */
3380 bool requires_2nd_localeconv = false;
3381
3382 /* The actual hash populating is done by S_populate_hash_from_localeconv().
3383 * It gets passed an array of length two containing the data structure it
3384 * is supposed to use to get the key names to fill the hash with. One
3385 * element is alwasy for the NUMERIC strings (or NULL if none to use), and
3386 * the other element similarly for the MONETARY ones. */
3387 # define NUMERIC_STRING_OFFSET 0
3388 # define MONETARY_STRING_OFFSET 1
3389 const lconv_offset_t * strings[2] = { NULL, NULL };
3390
3391 /* This is a mask, with one bit to tell S_populate_hash_from_localeconv to
3392 * populate the NUMERIC items; another bit for the MONETARY ones. This way
3393 * it can choose which (or both) to populate from */
3394 U32 index_bits = 0;
3395
3396 /* This converts from a locale index to its bit position in the above mask.
3397 * */
3398 # define INDEX_TO_BIT(i) (1 << (i))
3399
3400 /* The two categories can have disparate locales. Initialize them to C and
3401 * override later whichever one(s) we pay attention to */
3402 const char * numeric_locale = "C";
3403 const char * monetary_locale = "C";
3404
3405 /* This will be either 'numeric_locale' or 'monetary_locale' depending on
3406 * what we are working on at the moment */
3407 const char * locale;
3408
3409 /* The LC_MONETARY category also has some integer-valued fields, whose
3410 * information is kept in a separate list */
3411 const lconv_offset_t * integers;
3412
3413 # ifdef HAS_SOME_LANGINFO
3414
3415 /* If the only use-case for this is the full localeconv(), the 'item'
3416 * parameter is ignored. */
3417 PERL_UNUSED_ARG(item);
3418
3419 # else
3420
3421 /* This only gets compiled for the use-case of using localeconv() to
3422 * emulate an nl_langinfo() missing from the platform.
3423 *
3424 * We need this substructure to only return this field for the THOUSEP
3425 * item. The other items also need substructures, but they were handled
3426 * above by placing the substructure's item at the end of the full one, so
3427 * the data structure could do double duty. However, both this and
3428 * RADIXCHAR would need to be in the final position of the same full
3429 * structure; an impossibility. So make this into a separate structure */
3430 const lconv_offset_t thousands_sep_string[] = {
3431 LCONV_NUMERIC_ENTRY(thousands_sep),
3432 {NULL, 0}
3433 };
3434
3435 /* End of all the initialization of datastructures. Now for actual code.
3436 *
3437 * Without nl_langinfo(), the call to my_localeconv() could be for just one
3438 * of the following 3 items to emulate nl_langinfo(). This is compiled
3439 * only when using perl_langinfo.h, which we control, and it has been
3440 * constructed so that no item is numbered 0.
3441 *
3442 * For each, setup the appropriate parameters for the call below to
3443 * S_populate_hash_from_localeconv() */
3444 if (item != 0) switch (item) {
3445 default:
3446 locale_panic_(Perl_form(aTHX_
3447 "Unexpected item passed to my_localeconv: %d", item));
3448 break;
3449
3450 # ifdef USE_LOCALE_NUMERIC
3451
3452 case RADIXCHAR:
3453 locale = numeric_locale = PL_numeric_name;
3454 index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
3455 strings[NUMERIC_STRING_OFFSET] = DECIMAL_POINT_ADDRESS;
3456 integers = NULL;
3457 break;
3458
3459 case THOUSEP:
3460 index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
3461 locale = numeric_locale = PL_numeric_name;
3462 strings[NUMERIC_STRING_OFFSET] = thousands_sep_string;
3463 integers = NULL;
3464 break;
3465
3466 # endif
3467 # ifdef USE_LOCALE_MONETARY
3468
3469 case CRNCYSTR:
3470 index_bits = INDEX_TO_BIT(LC_MONETARY_INDEX_);
3471 locale = monetary_locale = querylocale_i(LC_MONETARY_INDEX_);
3472
3473 /* This item needs the values for both the currency symbol, and another
3474 * one used to construct the nl_langino()-compatible return */
3475 strings[MONETARY_STRING_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
3476 integers = P_CS_PRECEDES_ADDRESS;
3477 break;
3478
3479 # endif
3480
3481 } /* End of switch() */
3482
3483 else /* End of for just one item to emulate nl_langinfo() */
3484
3485 # endif
3486
3487 { /* Here, the call is for all of localeconv(). It has a bunch of
3488 * items. As in the individual item case, set up the parameters for
3489 * S_populate_hash_from_localeconv(); */
3490
3491 # ifdef USE_LOCALE_NUMERIC
3492 numeric_locale = PL_numeric_name;
3493 # elif defined(USE_LOCALE_CTYPE)
3494 numeric_locale = querylocale_i(numeric_index);
3495 # endif
3496 # if defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_CTYPE)
3497 monetary_locale = querylocale_i(monetary_index);
3498 # endif
3499
3500 /* The first call to S_populate_hash_from_localeconv() will be for the
3501 * MONETARY values */
3502 index_bits = INDEX_TO_BIT(monetary_index);
3503 locale = monetary_locale;
3504
3505 /* And if the locales for the two categories are the same, we can also
3506 * do the NUMERIC values in the same call */
3507 if (strEQ(numeric_locale, monetary_locale)) {
3508 index_bits |= INDEX_TO_BIT(numeric_index);
3509 }
3510 else {
3511 requires_2nd_localeconv = true;
3512 }
3513
3514 /* We always pass both sets of strings. 'index_bits' tells
3515 * S_populate_hash_from_localeconv which to actually look at */
3516 strings[NUMERIC_STRING_OFFSET] = lconv_numeric_strings;
3517 strings[MONETARY_STRING_OFFSET] = lconv_monetary_strings;
3518
3519 /* And pass the integer values to populate; again 'index_bits' will
3520 * say to use them or not */
3521 integers = lconv_integers;
3522
3523 } /* End of call is for localeconv() */
3524
3525 /* The code above has determined the parameters to
3526 S_populate_hash_from_localeconv() for both cases of an individual item
3527 and for the entire structure. Below is code common to both */
3528
3529 HV * hv = newHV(); /* The returned hash, initially empty */
3530 sv_2mortal((SV*)hv);
3531
3532 /* Call localeconv() and copy its results into the hash. All the
3533 * parameters have been initialized above */
3534 populate_hash_from_localeconv(hv,
3535 locale,
3536 index_bits,
3537 strings,
3538 integers
3539 );
3540
3541 /* The above call may have done all the hash fields, but not always, as
3542 * already explained. If we need a second call it is always for the
3543 * NUMERIC fields */
3544 if (requires_2nd_localeconv) {
3545 populate_hash_from_localeconv(hv,
3546 numeric_locale,
3547 INDEX_TO_BIT(numeric_index),
3548 strings,
3549 NULL /* There are No NUMERIC integer
3550 fields */
3551 );
3552 }
3553
3554 /* Here, the hash has been completely populated.
3555 *
3556 * Now go through all the items and:
3557 * a) For string items, see if they should be marked as UTF-8 or not.
3558 * This would have been more convenient and faster to do while
3559 * populating the hash in the first place, but that operation has to be
3560 * done within a critical section, keeping other threads from
3561 * executing, so only the minimal amount of work necessary is done at
3562 * that time.
3563 * b) For integer items, convert the C CHAR_MAX value into -1. Again,
3564 * this could have been done in the critical section, but was deferred
3565 * to here to keep to the bare minimum amount the time spent owning the
3566 * processor. CHAR_MAX is a C concept for an 8-bit character type.
3567 * Perl has no such type; the closest fit is a -1.
3568 *
3569 * XXX On unthreaded perls, this code could be #ifdef'd out, and the
3570 * corrections determined at hash population time, at an extra maintenance
3571 * cost which khw doesn't think is worth it
3572 */
3573 for (unsigned int i = 0; i < 2; i++) { /* Try both types of strings */
3574 if (! strings[i]) { /* Skip if no strings of this type */
3575 continue;
3576 }
3577
3578 locale = (i == NUMERIC_STRING_OFFSET)
3579 ? numeric_locale
3580 : monetary_locale;
3581
3582 locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
3583
3584 # ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION
3585
3586 /* It saves time in the loop below to have predetermined the UTF8ness
3587 * of the locale. But only do so if the platform reliably has this
3588 * information; otherwise to do it, this could recurse indefinitely.
3589 *
3590 * When we don't do it here, it will be done on a per-element basis in
3591 * the loop. The per-element check is intelligent enough to not
3592 * recurse */
3593
3594 locale_is_utf8 = (is_locale_utf8(locale))
3595 ? LOCALE_IS_UTF8
3596 : LOCALE_NOT_UTF8;
3597
3598 if (locale_is_utf8 == LOCALE_NOT_UTF8) {
3599 continue; /* No string can be UTF-8 if the locale isn't */
3600 }
3601
3602 # endif
3603
3604 /* Examine each string */
3605 while (1) {
3606 const char * name = strings[i]->name;
3607
3608 if (! name) { /* Reached the end */
3609 break;
3610 }
3611
3612 /* 'value' will contain the string that may need to be marked as
3613 * UTF-8 */
3614 SV ** value = hv_fetch(hv, name, strlen(name), true);
3615 if (! value) {
3616 continue;
3617 }
3618
3619 /* Determine if the string should be marked as UTF-8. */
3620 if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value),
3621 locale_is_utf8,
3622 NULL, 0)))
3623 {
3624 SvUTF8_on(*value);
3625 }
3626
3627 strings[i]++; /* Iterate */
3628 }
3629 } /* End of fixing up UTF8ness */
3630
3631
3632 /* Examine each integer */
3633 if (integers) while (1) {
3634 const char * name = integers->name;
3635
3636 if (! name) { /* Reached the end */
3637 break;
3638 }
3639
3640 SV ** value = hv_fetch(hv, name, strlen(name), true);
3641 if (! value) {
3642 continue;
3643 }
3644
3645 /* Change CHAR_MAX to -1 */
3646 if (SvIV(*value) == CHAR_MAX) {
3647 sv_setiv(*value, -1);
3648 }
3649
3650 integers++; /* Iterate */
3651 }
3652
3653 return hv;
3654 }
3655
3656 STATIC void
S_populate_hash_from_localeconv(pTHX_ HV * hv,const char * locale,const U32 which_mask,const lconv_offset_t * strings[2],const lconv_offset_t * integers)3657 S_populate_hash_from_localeconv(pTHX_ HV * hv,
3658
3659 /* Switch to this locale to run
3660 * localeconv() from */
3661 const char * locale,
3662
3663 /* bit mask of which categories to
3664 * populate */
3665 const U32 which_mask,
3666
3667 /* strings[0] points the numeric
3668 * string fields; [1] to the monetary */
3669 const lconv_offset_t * strings[2],
3670
3671 /* And to the monetary integer fields */
3672 const lconv_offset_t * integers)
3673 {
3674 PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV;
3675 PERL_UNUSED_ARG(which_mask); /* Some configurations don't use this;
3676 complicated to figure out which */
3677
3678 /* Run localeconv() and copy some or all of its results to the input 'hv'
3679 * hash. Most localeconv() implementations return the values in a global
3680 * static buffer, so the operation must be performed in a critical section,
3681 * ending only after the copy is completed. There are so many locks
3682 * because localeconv() deals with two categories, and returns in a single
3683 * global static buffer. Some locks might be no-ops on this platform, but
3684 * not others. We need to lock if any one isn't a no-op. */
3685
3686 # ifdef USE_LOCALE_CTYPE
3687
3688 /* Some platforms require LC_CTYPE to be congruent with the category we are
3689 * looking for */
3690 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
3691
3692 # endif
3693 # ifdef USE_LOCALE_NUMERIC
3694
3695 /* We need to toggle to the underlying NUMERIC locale if we are getting
3696 * NUMERIC strings */
3697 const char * orig_NUMERIC_locale = NULL;
3698 if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
3699 LC_NUMERIC_LOCK(0);
3700 orig_NUMERIC_locale = toggle_locale_i(LC_NUMERIC_INDEX_, locale);
3701 }
3702
3703 # endif
3704
3705 /* Finally ready to do the actual localeconv(). Lock to prevent other
3706 * accesses until we have made a copy of its returned static buffer */
3707 gwLOCALE_LOCK;
3708
3709 # ifdef TS_W32_BROKEN_LOCALECONV
3710
3711 /* This is a workaround for another bug in Windows. localeconv() was
3712 * broken with thread-safe locales prior to VS 15. It looks at the global
3713 * locale instead of the thread one. As a work-around, we toggle to the
3714 * global locale; populate the return; then toggle back. We have to use
3715 * LC_ALL instead of the individual categories because of yet another bug
3716 * in Windows. And this all has to be done in a critical section.
3717 *
3718 * This introduces a potential race with any other thread that has also
3719 * converted to use the global locale, and doesn't protect its locale calls
3720 * with mutexes. khw can't think of any reason for a thread to do so on
3721 * Windows, as the locale API is the same regardless of thread-safety, except
3722 * if the code is ported from working on another platform where there might
3723 * be some reason to do this. But this is typically due to some
3724 * alien-to-Perl library that thinks it owns locale setting. Such a
3725 * library isn't likely to exist on Windows, so such an application is
3726 * unlikely to be run on Windows
3727 */
3728 bool restore_per_thread = FALSE;
3729
3730 /* Save the per-thread locale state */
3731 const char * save_thread = querylocale_c(LC_ALL);
3732
3733 /* Change to the global locale, and note if we already were there */
3734 if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
3735 != _DISABLE_PER_THREAD_LOCALE)
3736 {
3737 restore_per_thread = TRUE;
3738 }
3739
3740 /* Save the state of the global locale; then convert to our desired
3741 * state. */
3742 const char * save_global = querylocale_c(LC_ALL);
3743 void_setlocale_c(LC_ALL, save_thread);
3744
3745 # endif /* TS_W32_BROKEN_LOCALECONV */
3746
3747 /* Finally, do the actual localeconv */
3748 const char *lcbuf_as_string = (const char *) localeconv();
3749
3750 /* Fill in the string fields of the HV* */
3751 for (unsigned int i = 0; i < 2; i++) {
3752
3753 # ifdef USE_LOCALE_NUMERIC
3754
3755 /* One iteration is only for the numeric string fields */
3756 if ( i == NUMERIC_STRING_OFFSET
3757 && (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) == 0)
3758 {
3759 continue;
3760 }
3761
3762 # endif
3763 # ifdef USE_LOCALE_MONETARY
3764
3765 /* The other iteration is only for the monetary string fields */
3766 if ( i == MONETARY_STRING_OFFSET
3767 && (which_mask & INDEX_TO_BIT(LC_MONETARY_INDEX_)) == 0)
3768 {
3769 continue;
3770 }
3771
3772 # endif
3773
3774 /* For each field for the given category ... */
3775 const lconv_offset_t * category_strings = strings[i];
3776 while (1) {
3777 const char * name = category_strings->name;
3778 if (! name) { /* Quit at the end */
3779 break;
3780 }
3781
3782 /* we have set things up so that we know where in the returned
3783 * structure, when viewed as a string, the corresponding value is.
3784 * */
3785 const char *value = *((const char **)( lcbuf_as_string
3786 + category_strings->offset));
3787
3788 /* Set to get next string on next iteration */
3789 category_strings++;
3790
3791 /* Skip if this platform doesn't have this field. */
3792 if (! value) {
3793 continue;
3794 }
3795
3796 /* Copy to the hash */
3797 (void) hv_store(hv,
3798 name, strlen(name),
3799 newSVpv(value, strlen(value)),
3800 0);
3801 }
3802
3803 /* Add any int fields to the HV* */
3804 if (i == MONETARY_STRING_OFFSET && integers) {
3805 while (integers->name) {
3806 const char value = *((const char *)( lcbuf_as_string
3807 + integers->offset));
3808 (void) hv_store(hv, integers->name,
3809 strlen(integers->name), newSViv(value), 0);
3810 integers++;
3811 }
3812 }
3813 } /* End of loop through the fields */
3814
3815 /* Done with copying to the hash. Can unwind the critical section locks */
3816
3817 # ifdef TS_W32_BROKEN_LOCALECONV
3818
3819 /* Restore the global locale's prior state */
3820 void_setlocale_c(LC_ALL, save_global);
3821
3822 /* And back to per-thread locales */
3823 if (restore_per_thread) {
3824 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
3825 }
3826
3827 /* Restore the per-thread locale state */
3828 void_setlocale_c(LC_ALL, save_thread);
3829
3830 # endif /* TS_W32_BROKEN_LOCALECONV */
3831
3832 gwLOCALE_UNLOCK; /* Finished with the critical section of a
3833 globally-accessible buffer */
3834
3835 # ifdef USE_LOCALE_NUMERIC
3836
3837 restore_toggled_locale_i(LC_NUMERIC_INDEX_, orig_NUMERIC_locale);
3838 if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
3839 LC_NUMERIC_UNLOCK;
3840 }
3841
3842 # endif
3843 # ifdef USE_LOCALE_CTYPE
3844
3845 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
3846
3847 # endif
3848
3849 }
3850
3851 #endif /* defined(HAS_LOCALECONV) */
3852 #ifndef HAS_SOME_LANGINFO
3853
3854 typedef int nl_item; /* Substitute 'int' for emulated nl_langinfo() */
3855
3856 #endif
3857
3858 /*
3859
3860 =for apidoc Perl_langinfo
3861 =for apidoc_item Perl_langinfo8
3862
3863 C<Perl_langinfo> is an (almost) drop-in replacement for the system
3864 C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
3865 the same information. But it is more thread-safe than regular
3866 C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
3867 code, and can be used on systems that lack a native C<nl_langinfo>.
3868
3869 However, you should instead use the improved version of this:
3870 L</Perl_langinfo8>, which behaves identically except for an additional
3871 parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
3872 returns to you how you should treat the returned string with regards to it
3873 being encoded in UTF-8 or not.
3874
3875 Concerning the differences between these and plain C<nl_langinfo()>:
3876
3877 =over
3878
3879 =item a.
3880
3881 C<Perl_langinfo8> has an extra parameter, described above. Besides this, the
3882 other reason they aren't quite a drop-in replacement is actually an advantage.
3883 The C<const>ness of the return allows the compiler to catch attempts to write
3884 into the returned buffer, which is illegal and could cause run-time crashes.
3885
3886 =item b.
3887
3888 They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
3889 without you having to write extra code. The reason for the extra code would be
3890 because these are from the C<LC_NUMERIC> locale category, which is normally
3891 kept set by Perl so that the radix is a dot, and the separator is the empty
3892 string, no matter what the underlying locale is supposed to be, and so to get
3893 the expected results, you have to temporarily toggle into the underlying
3894 locale, and later toggle back. (You could use plain C<nl_langinfo> and
3895 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get
3896 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
3897 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
3898 (decimal point) character to be a dot.)
3899
3900 =item c.
3901
3902 The system function they replace can have its static return buffer trashed,
3903 not only by a subsequent call to that function, but by a C<freelocale>,
3904 C<setlocale>, or other locale change. The returned buffer of these functions
3905 is not changed until the next call to one or the other, so the buffer is never
3906 in a trashed state.
3907
3908 =item d.
3909
3910 The return buffer is per-thread, so it also is never overwritten by a call to
3911 these functions from another thread; unlike the function it replaces.
3912
3913 =item e.
3914
3915 But most importantly, they work on systems that don't have C<nl_langinfo>, such
3916 as Windows, hence making your code more portable. Of the fifty-some possible
3917 items specified by the POSIX 2008 standard,
3918 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
3919 only one is completely unimplemented, though on non-Windows platforms, another
3920 significant one is not fully implemented). They use various techniques to
3921 recover the other items, including calling C<L<localeconv(3)>>, and
3922 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
3923 available. Later C<strftime()> versions have additional capabilities; What the
3924 C locale yields or C<""> is returned for any item not available on your system.
3925
3926 It is important to note that, when called with an item that is recovered by
3927 using C<localeconv>, the buffer from any previous explicit call to
3928 C<L<localeconv(3)>> will be overwritten. But you shouldn't be using
3929 C<localeconv> anyway because it is is very much not thread-safe, and suffers
3930 from the same problems outlined in item 'b.' above for the fields it returns that
3931 are controlled by the LC_NUMERIC locale category. Instead, avoid all of those
3932 problems by calling L</Perl_localeconv>, which is thread-safe; or by using the
3933 methods given in L<perlcall> to call
3934 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
3935
3936 =back
3937
3938 The details for those items which may deviate from what this emulation returns
3939 and what a native C<nl_langinfo()> would return are specified in
3940 L<I18N::Langinfo>.
3941
3942 When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
3943 have a native C<nl_langinfo()>, you must
3944
3945 #include "perl_langinfo.h"
3946
3947 before the C<perl.h> C<#include>. You can replace your F<langinfo.h>
3948 C<#include> with this one. (Doing it this way keeps out the symbols that plain
3949 F<langinfo.h> would try to import into the namespace for code that doesn't need
3950 it.)
3951
3952 =cut
3953
3954 */
3955
3956 const char *
Perl_langinfo(const nl_item item)3957 Perl_langinfo(const nl_item item)
3958 {
3959 return Perl_langinfo8(item, NULL);
3960 }
3961
3962 const char *
Perl_langinfo8(const nl_item item,utf8ness_t * utf8ness)3963 Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
3964 {
3965 dTHX;
3966 unsigned cat_index;
3967
3968 PERL_ARGS_ASSERT_PERL_LANGINFO8;
3969
3970 if (utf8ness) { /* Assume for now */
3971 *utf8ness = UTF8NESS_IMMATERIAL;
3972 }
3973
3974 /* Find the locale category that controls the input 'item'. If we are not
3975 * paying attention to that category, instead return a default value. Also
3976 * return the default value if there is no way for us to figure out the
3977 * correct value. If we have some form of nl_langinfo(), we can always
3978 * figure it out, but lacking that, there may be alternative methods that
3979 * can be used to recover most of the possible items. Some of those
3980 * methods need libc functions, which may or may not be available. If
3981 * unavailable, we can't compute the correct value, so must here return the
3982 * default. */
3983 switch (item) {
3984
3985 case CODESET:
3986
3987 #ifdef USE_LOCALE_CTYPE
3988
3989 cat_index = LC_CTYPE_INDEX_;
3990 break;
3991
3992 #else
3993 return C_codeset;
3994 #endif
3995 #if defined(USE_LOCALE_MESSAGES) && defined(HAS_SOME_LANGINFO)
3996
3997 case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
3998 cat_index = LC_MESSAGES_INDEX_;
3999 break;
4000 #else
4001 case YESEXPR: return "^[+1yY]";
4002 case YESSTR: return "yes";
4003 case NOEXPR: return "^[-0nN]";
4004 case NOSTR: return "no";
4005 #endif
4006
4007 case CRNCYSTR:
4008
4009 #if defined(USE_LOCALE_MONETARY) \
4010 && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV))
4011
4012 cat_index = LC_MONETARY_INDEX_;
4013 break;
4014 #else
4015 return "-";
4016 #endif
4017
4018 case RADIXCHAR:
4019
4020 #ifdef CAN_CALCULATE_RADIX
4021
4022 cat_index = LC_NUMERIC_INDEX_;
4023 break;
4024 #else
4025 return C_decimal_point;
4026 #endif
4027
4028 case THOUSEP:
4029
4030 #if defined(USE_LOCALE_NUMERIC) \
4031 && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV))
4032
4033 cat_index = LC_NUMERIC_INDEX_;
4034 break;
4035 #else
4036 return C_thousands_sep;
4037 #endif
4038
4039 /* The other possible items are all in LC_TIME. */
4040 #ifdef USE_LOCALE_TIME
4041
4042 default:
4043 cat_index = LC_TIME_INDEX_;
4044 break;
4045
4046 #endif
4047 #if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO)
4048
4049 /* If not using LC_TIME, hard code the rest. Or, if there is no
4050 * nl_langinfo(), we use strftime() as an alternative, and it is missing
4051 * functionality to get every single one, so hard-code those */
4052
4053 case ERA: return ""; /* Unimplemented; for use with strftime() %E
4054 modifier */
4055
4056 /* These formats are defined by C89, so we assume that strftime supports
4057 * them, and so are returned unconditionally; they may not be what the
4058 * locale actually says, but should give good enough results for someone
4059 * using them as formats (as opposed to trying to parse them to figure
4060 * out what the locale says). The other format items are actually tested
4061 * to verify they work on the platform */
4062 case D_FMT: return "%x";
4063 case T_FMT: return "%X";
4064 case D_T_FMT: return "%c";
4065
4066 # if defined(WIN32) || ! defined(USE_LOCALE_TIME)
4067
4068 /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions
4069 * that would allow it to recover these */
4070 case ERA_D_FMT: return "%x";
4071 case ERA_T_FMT: return "%X";
4072 case ERA_D_T_FMT: return "%c";
4073 case ALT_DIGITS: return "0";
4074
4075 # endif
4076 # ifndef USE_LOCALE_TIME
4077
4078 case T_FMT_AMPM: return "%r";
4079 case ABDAY_1: return "Sun";
4080 case ABDAY_2: return "Mon";
4081 case ABDAY_3: return "Tue";
4082 case ABDAY_4: return "Wed";
4083 case ABDAY_5: return "Thu";
4084 case ABDAY_6: return "Fri";
4085 case ABDAY_7: return "Sat";
4086 case AM_STR: return "AM";
4087 case PM_STR: return "PM";
4088 case ABMON_1: return "Jan";
4089 case ABMON_2: return "Feb";
4090 case ABMON_3: return "Mar";
4091 case ABMON_4: return "Apr";
4092 case ABMON_5: return "May";
4093 case ABMON_6: return "Jun";
4094 case ABMON_7: return "Jul";
4095 case ABMON_8: return "Aug";
4096 case ABMON_9: return "Sep";
4097 case ABMON_10: return "Oct";
4098 case ABMON_11: return "Nov";
4099 case ABMON_12: return "Dec";
4100 case DAY_1: return "Sunday";
4101 case DAY_2: return "Monday";
4102 case DAY_3: return "Tuesday";
4103 case DAY_4: return "Wednesday";
4104 case DAY_5: return "Thursday";
4105 case DAY_6: return "Friday";
4106 case DAY_7: return "Saturday";
4107 case MON_1: return "January";
4108 case MON_2: return "February";
4109 case MON_3: return "March";
4110 case MON_4: return "April";
4111 case MON_5: return "May";
4112 case MON_6: return "June";
4113 case MON_7: return "July";
4114 case MON_8: return "August";
4115 case MON_9: return "September";
4116 case MON_10: return "October";
4117 case MON_11: return "November";
4118 case MON_12: return "December";
4119
4120 # endif
4121 #endif
4122
4123 } /* End of switch on item */
4124
4125 #ifndef USE_LOCALE
4126
4127 Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item);
4128 NOT_REACHED; /* NOTREACHED */
4129 PERL_UNUSED_VAR(cat_index);
4130
4131 #else
4132 # ifdef USE_LOCALE_NUMERIC
4133
4134 /* Use either the underlying numeric, or the other underlying categories */
4135 if (cat_index == LC_NUMERIC_INDEX_) {
4136 return my_langinfo_c(item, LC_NUMERIC, PL_numeric_name,
4137 &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
4138 }
4139 else
4140
4141 # endif
4142
4143 {
4144 return my_langinfo_i(item, cat_index, querylocale_i(cat_index),
4145 &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
4146 }
4147
4148 #endif
4149
4150 }
4151
4152 #ifdef USE_LOCALE
4153
4154 /* There are several implementations of my_langinfo, depending on the
4155 * Configuration. They all share the same beginning of the function */
4156 STATIC const char *
S_my_langinfo_i(pTHX_ const nl_item item,const unsigned int cat_index,const char * locale,const char ** retbufp,Size_t * retbuf_sizep,utf8ness_t * utf8ness)4157 S_my_langinfo_i(pTHX_
4158 const nl_item item, /* The item to look up */
4159 const unsigned int cat_index, /* The locale category that
4160 controls it */
4161 /* The locale to look up 'item' in. */
4162 const char * locale,
4163
4164 /* Where to store the result, and where the size of that buffer
4165 * is stored, updated on exit. retbuf_sizep may be NULL for an
4166 * empty-on-entry, single use buffer whose size we don't need
4167 * to keep track of */
4168 const char ** retbufp,
4169 Size_t * retbuf_sizep,
4170
4171 /* If not NULL, the location to store the UTF8-ness of 'item's
4172 * value, as documented */
4173 utf8ness_t * utf8ness)
4174 {
4175 const char * retval = NULL;
4176
4177 PERL_ARGS_ASSERT_MY_LANGINFO_I;
4178 assert(cat_index < NOMINAL_LC_ALL_INDEX);
4179
4180 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4181 "Entering my_langinfo item=%ld, using locale %s\n",
4182 (long) item, locale));
4183 /*--------------------------------------------------------------------------*/
4184 /* Above is the common beginning to all the implementations of my_langinfo().
4185 * Below are the various completions.
4186 *
4187 * Some platforms don't deal well with non-ASCII strings in locale X when
4188 * LC_CTYPE is not in X. (Actually it is probably when X is UTF-8 and LC_CTYPE
4189 * isn't, or vice versa). There is explicit code to bring the categories into
4190 * sync. This doesn't seem to be a problem with nl_langinfo(), so that
4191 * implementation doesn't currently worry about it. But it is a problem on
4192 * Windows boxes, which don't have nl_langinfo(). */
4193
4194 /*--------------------------------------------------------------------------*/
4195 # if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
4196 # ifdef USE_LOCALE_CTYPE
4197
4198 /* Ths function sorts out if things actually have to be switched or not,
4199 * for both calls. */
4200 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
4201
4202 # endif
4203
4204 const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
4205
4206 gwLOCALE_LOCK;
4207 retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
4208 gwLOCALE_UNLOCK;
4209
4210 if (utf8ness) {
4211 *utf8ness = get_locale_string_utf8ness_i(retval,
4212 LOCALE_UTF8NESS_UNKNOWN,
4213 locale, cat_index);
4214 }
4215
4216 restore_toggled_locale_i(cat_index, orig_switched_locale);
4217
4218 # ifdef USE_LOCALE_CTYPE
4219 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
4220 # endif
4221
4222 return retval;
4223 /*--------------------------------------------------------------------------*/
4224 # else /* Below, emulate nl_langinfo as best we can */
4225
4226 /* And the third and final completion is where we have to emulate
4227 * nl_langinfo(). There are various possibilities depending on the
4228 * Configuration */
4229
4230 # ifdef USE_LOCALE_CTYPE
4231
4232 const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
4233
4234 # endif
4235
4236 const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
4237
4238 /* Here, we are in the locale we want information about */
4239
4240 /* Almost all the items will have ASCII return values. Set that here, and
4241 * override if necessary */
4242 utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
4243
4244 switch (item) {
4245 default:
4246 assert(item < 0); /* Make sure using perl_langinfo.h */
4247 retval = "";
4248 break;
4249
4250 case RADIXCHAR:
4251
4252 # if defined(HAS_SNPRINTF) \
4253 && (! defined(HAS_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
4254
4255 {
4256 /* snprintf() can be used to find the radix character by outputting
4257 * a known simple floating point number to a buffer, and parsing
4258 * it, inferring the radix as the bytes separating the integer and
4259 * fractional parts. But localeconv() is more direct, not
4260 * requiring inference, so use it instead of the code just below,
4261 * if (likely) it is available and works ok */
4262
4263 char * floatbuf = NULL;
4264 const Size_t initial_size = 10;
4265
4266 Newx(floatbuf, initial_size, char);
4267
4268 /* 1.5 is exactly representable on binary computers */
4269 Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
4270
4271 /* If our guess wasn't big enough, increase and try again, based on
4272 * the real number that snprintf() is supposed to return */
4273 if (UNLIKELY(needed_size >= initial_size)) {
4274 needed_size++; /* insurance */
4275 Renew(floatbuf, needed_size, char);
4276 Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f", 1.5);
4277 assert(new_needed <= needed_size);
4278 needed_size = new_needed;
4279 }
4280
4281 char * s = floatbuf;
4282 char * e = floatbuf + needed_size;
4283
4284 /* Find the '1' */
4285 while (s < e && *s != '1') {
4286 s++;
4287 }
4288
4289 if (LIKELY(s < e)) {
4290 s++;
4291 }
4292
4293 /* Find the '5' */
4294 char * item_start = s;
4295 while (s < e && *s != '5') {
4296 s++;
4297 }
4298
4299 /* Everything in between is the radix string */
4300 if (LIKELY(s < e)) {
4301 *s = '\0';
4302 retval = save_to_buffer(item_start, retbufp, retbuf_sizep);
4303 Safefree(floatbuf);
4304
4305 if (utf8ness) {
4306 is_utf8 = get_locale_string_utf8ness_i(retval,
4307 LOCALE_UTF8NESS_UNKNOWN,
4308 locale, cat_index);
4309 }
4310
4311 break;
4312 }
4313
4314 Safefree(floatbuf);
4315 }
4316
4317 # ifdef HAS_LOCALECONV /* snprintf() failed; drop down to use
4318 localeconv() */
4319
4320 /* FALLTHROUGH */
4321
4322 # else /* snprintf() failed and no localeconv() */
4323
4324 retval = C_decimal_point;
4325 break;
4326
4327 # endif
4328 # endif
4329 # ifdef HAS_LOCALECONV
4330
4331 /* These items are available from localeconv(). (To avoid using
4332 * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
4333 * GetCurrencyFormat; patches welcome) */
4334
4335 # define P_CS_PRECEDES "p_cs_precedes"
4336 # define CURRENCY_SYMBOL "currency_symbol"
4337
4338 /* case RADIXCHAR: // May drop down to here in some configurations */
4339 case THOUSEP:
4340 case CRNCYSTR:
4341 {
4342
4343 /* The hash gets populated with just the field(s) related to 'item'. */
4344 HV * result_hv = my_localeconv(item);
4345
4346 SV* string;
4347 if (item != CRNCYSTR) {
4348
4349 /* These items have been populated with just one key => value */
4350 (void) hv_iterinit(result_hv);
4351 HE * entry = hv_iternext(result_hv);
4352 string = hv_iterval(result_hv, entry);
4353 }
4354 else {
4355
4356 /* But CRNCYSTR localeconv() returns a slightly different value
4357 * than the nl_langinfo() API calls for, so have to modify this one
4358 * to conform. We need another value from localeconv() to know
4359 * what to change it to. my_localeconv() has populated the hash
4360 * with exactly both fields. Delete this one, leaving just the
4361 * CRNCYSTR one in the hash */
4362 SV* precedes = hv_delete(result_hv,
4363 P_CS_PRECEDES, STRLENs(P_CS_PRECEDES),
4364 0);
4365 if (! precedes) {
4366 locale_panic_("my_localeconv() unexpectedly didn't return"
4367 " a value for " P_CS_PRECEDES);
4368 }
4369
4370 /* The modification is to prefix the localeconv() return with a
4371 * single byte, calculated as follows: */
4372 char prefix = (LIKELY(SvIV(precedes) != -1))
4373 ? ((precedes != 0) ? '-' : '+')
4374
4375 /* khw couldn't find any documentation that
4376 * CHAR_MAX (which we modify to -1) is the signal,
4377 * but cygwin uses it thusly, and it makes sense
4378 * given that CHAR_MAX indicates the value isn't
4379 * used, so it neither precedes nor succeeds */
4380 : '.';
4381
4382 /* Now get CRNCYSTR */
4383 (void) hv_iterinit(result_hv);
4384 HE * entry = hv_iternext(result_hv);
4385 string = hv_iterval(result_hv, entry);
4386
4387 /* And perform the modification */
4388 Perl_sv_setpvf(aTHX_ string, "%c%s", prefix, SvPV_nolen(string));
4389 }
4390
4391 /* Here, 'string' contains the value we want to return */
4392 retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
4393
4394 if (utf8ness) {
4395 is_utf8 = (SvUTF8(string))
4396 ? UTF8NESS_YES
4397 : (is_utf8_invariant_string( (U8 *) retval,
4398 strlen(retval)))
4399 ? UTF8NESS_IMMATERIAL
4400 : UTF8NESS_NO;
4401 }
4402
4403 break;
4404
4405 }
4406
4407 # endif /* Some form of localeconv */
4408 # ifdef HAS_STRFTIME
4409
4410 /* These formats are only available in later strftime's */
4411 case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
4412
4413 /* The rest can be gotten from most versions of strftime(). */
4414 case ABDAY_1: case ABDAY_2: case ABDAY_3:
4415 case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
4416 case ALT_DIGITS:
4417 case AM_STR: case PM_STR:
4418 case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
4419 case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
4420 case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
4421 case DAY_1: case DAY_2: case DAY_3: case DAY_4:
4422 case DAY_5: case DAY_6: case DAY_7:
4423 case MON_1: case MON_2: case MON_3: case MON_4:
4424 case MON_5: case MON_6: case MON_7: case MON_8:
4425 case MON_9: case MON_10: case MON_11: case MON_12:
4426 {
4427 const char * format;
4428 bool return_format = FALSE;
4429 int mon = 0;
4430 int mday = 1;
4431 int hour = 6;
4432
4433 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
4434
4435 switch (item) {
4436 default:
4437 locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item));
4438 NOT_REACHED; /* NOTREACHED */
4439
4440 case PM_STR: hour = 18;
4441 case AM_STR:
4442 format = "%p";
4443 break;
4444 case ABDAY_7: mday++;
4445 case ABDAY_6: mday++;
4446 case ABDAY_5: mday++;
4447 case ABDAY_4: mday++;
4448 case ABDAY_3: mday++;
4449 case ABDAY_2: mday++;
4450 case ABDAY_1:
4451 format = "%a";
4452 break;
4453 case DAY_7: mday++;
4454 case DAY_6: mday++;
4455 case DAY_5: mday++;
4456 case DAY_4: mday++;
4457 case DAY_3: mday++;
4458 case DAY_2: mday++;
4459 case DAY_1:
4460 format = "%A";
4461 break;
4462 case ABMON_12: mon++;
4463 case ABMON_11: mon++;
4464 case ABMON_10: mon++;
4465 case ABMON_9: mon++;
4466 case ABMON_8: mon++;
4467 case ABMON_7: mon++;
4468 case ABMON_6: mon++;
4469 case ABMON_5: mon++;
4470 case ABMON_4: mon++;
4471 case ABMON_3: mon++;
4472 case ABMON_2: mon++;
4473 case ABMON_1:
4474 format = "%b";
4475 break;
4476 case MON_12: mon++;
4477 case MON_11: mon++;
4478 case MON_10: mon++;
4479 case MON_9: mon++;
4480 case MON_8: mon++;
4481 case MON_7: mon++;
4482 case MON_6: mon++;
4483 case MON_5: mon++;
4484 case MON_4: mon++;
4485 case MON_3: mon++;
4486 case MON_2: mon++;
4487 case MON_1:
4488 format = "%B";
4489 break;
4490 case T_FMT_AMPM:
4491 format = "%r";
4492 return_format = TRUE;
4493 break;
4494 case ERA_D_FMT:
4495 format = "%Ex";
4496 return_format = TRUE;
4497 break;
4498 case ERA_T_FMT:
4499 format = "%EX";
4500 return_format = TRUE;
4501 break;
4502 case ERA_D_T_FMT:
4503 format = "%Ec";
4504 return_format = TRUE;
4505 break;
4506 case ALT_DIGITS:
4507 format = "%Ow"; /* Find the alternate digit for 0 */
4508 break;
4509 }
4510
4511 GCC_DIAG_RESTORE_STMT;
4512
4513 /* The year was deliberately chosen so that January 1 is on the
4514 * first day of the week. Since we're only getting one thing at a
4515 * time, it all works */
4516 const char * temp = my_strftime8_temp(format, 30, 30, hour, mday, mon,
4517 2011, 0, 0, 0, &is_utf8);
4518 retval = save_to_buffer(temp, retbufp, retbuf_sizep);
4519 Safefree(temp);
4520
4521 /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
4522 * format for wday 0. If the value is the same as the normal 0,
4523 * there isn't an alternate, so clear the buffer.
4524 *
4525 * (wday was chosen because its range is all a single digit.
4526 * Things like tm_sec have two digits as the minimum: '00'.) */
4527 if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
4528 retval = "";
4529 break;
4530 }
4531
4532 /* ALT_DIGITS is problematic. Experiments on it showed that
4533 * strftime() did not always work properly when going from alt-9 to
4534 * alt-10. Only a few locales have this item defined, and in all
4535 * of them on Linux that khw was able to find, nl_langinfo() merely
4536 * returned the alt-0 character, possibly doubled. Most Unicode
4537 * digits are in blocks of 10 consecutive code points, so that is
4538 * sufficient information for such scripts, as we can infer alt-1,
4539 * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
4540 * returned, and the CJK digits are not in code point order, so you
4541 * can't really infer anything. The localedef for this locale did
4542 * specify the succeeding digits, so that strftime() works properly
4543 * on them, without needing to infer anything. But the
4544 * nl_langinfo() return did not give sufficient information for the
4545 * caller to understand what's going on. So until there is
4546 * evidence that it should work differently, this returns the alt-0
4547 * string for ALT_DIGITS. */
4548
4549 if (return_format) {
4550
4551 /* If to return the format, not the value, overwrite the buffer
4552 * with it. But some strftime()s will keep the original format
4553 * if illegal, so change those to "" */
4554 if (strEQ(*retbufp, format)) {
4555 retval = "";
4556 }
4557 else {
4558 retval = format;
4559 }
4560
4561 /* A format is always in ASCII */
4562 is_utf8 = UTF8NESS_IMMATERIAL;
4563 }
4564
4565 break;
4566 }
4567
4568 # endif
4569
4570 case CODESET:
4571
4572 /* The trivial case */
4573 if (isNAME_C_OR_POSIX(locale)) {
4574 retval = C_codeset;
4575 break;
4576 }
4577
4578 # ifdef WIN32
4579
4580 /* This function retrieves the code page. It is subject to change, but
4581 * is documented and has been stable for many releases */
4582 UINT ___lc_codepage_func(void);
4583
4584 retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()),
4585 retbufp, retbuf_sizep);
4586 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
4587 locale, retval));
4588 break;
4589
4590 # else
4591
4592 /* The codeset is important, but khw did not figure out a way for it to
4593 * be retrieved on non-Windows boxes without nl_langinfo(). But even
4594 * if we can't get it directly, we can usually determine if it is a
4595 * UTF-8 locale or not. If it is UTF-8, we (correctly) use that for
4596 * the code set. */
4597
4598 # if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4599
4600 /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT
4601 * CHARACTER as that Unicode code point, this has to be a UTF-8 locale.
4602 * */
4603 wchar_t wc = 0;
4604 (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
4605 int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
4606 STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
4607 if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
4608 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4609 "mbtowc returned REPLACEMENT\n"));
4610 retval = "UTF-8";
4611 break;
4612 }
4613
4614 /* Here, it isn't a UTF-8 locale. */
4615
4616 # else /* mbtowc() is not available. */
4617
4618 /* Sling together several possibilities, depending on platform
4619 * capabilities and what we found.
4620 *
4621 * For non-English locales or non-dollar currency locales, we likely
4622 * will find out whether a locale is UTF-8 or not */
4623
4624 utf8ness_t is_utf8 = UTF8NESS_UNKNOWN;
4625 const char * scratch_buf = NULL;
4626
4627 # if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
4628
4629 /* Can't use this method unless localeconv() is available, as that's
4630 * the way we find out the currency symbol. */
4631
4632 /* First try looking at the currency symbol (via a recursive call) to
4633 * see if it disambiguates things. Often that will be in the native
4634 * script, and if the symbol isn't legal UTF-8, we know that the locale
4635 * isn't either. */
4636 (void) my_langinfo_c(CRNCYSTR, LC_MONETARY, locale, &scratch_buf, NULL,
4637 &is_utf8);
4638 Safefree(scratch_buf);
4639
4640 # endif
4641 # ifdef USE_LOCALE_TIME
4642
4643 /* If we have ruled out being UTF-8, no point in checking further. */
4644 if (is_utf8 != UTF8NESS_NO) {
4645
4646 /* But otherwise do check more. This is done even if the currency
4647 * symbol looks to be UTF-8, just in case that's a false positive.
4648 *
4649 * Look at the LC_TIME entries, like the names of the months or
4650 * weekdays. We quit at the first one that is illegal UTF-8 */
4651
4652 utf8ness_t this_is_utf8 = UTF8NESS_UNKNOWN;
4653 const int times[] = {
4654 DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
4655 MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
4656 MON_9, MON_10, MON_11, MON_12,
4657 ALT_DIGITS, AM_STR, PM_STR,
4658 ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6,
4659 ABDAY_7,
4660 ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
4661 ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
4662 };
4663
4664 /* The code in the recursive call can handle switching the locales,
4665 * but by doing it here, we avoid switching each iteration of the
4666 * loop */
4667 const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
4668
4669 for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(times); i++) {
4670 scratch_buf = NULL;
4671 (void) my_langinfo_c(times[i], LC_TIME, locale, &scratch_buf,
4672 NULL, &this_is_utf8);
4673 Safefree(scratch_buf);
4674 if (this_is_utf8 == UTF8NESS_NO) {
4675 is_utf8 = UTF8NESS_NO;
4676 break;
4677 }
4678
4679 if (this_is_utf8 == UTF8NESS_YES) {
4680 is_utf8 = UTF8NESS_YES;
4681 }
4682 }
4683
4684 /* Here we have gone through all the LC_TIME elements. is_utf8 has
4685 * been set as follows:
4686 * UTF8NESS_NO If at least one is't legal UTF-8
4687 * UTF8NESS_IMMMATERIAL If all are ASCII
4688 * UTF8NESS_YES If all are legal UTF-8 (including
4689 * ASCIIi), and at least one isn't
4690 * ASCII. */
4691
4692 restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
4693 }
4694
4695 # endif /* LC_TIME */
4696
4697 /* If nothing examined above rules out it being UTF-8, and at least one
4698 * thing fits as UTF-8 (and not plain ASCII), assume the codeset is
4699 * UTF-8. */
4700 if (is_utf8 == UTF8NESS_YES) {
4701 retval = "UTF-8";
4702 break;
4703 }
4704
4705 /* Here, nothing examined indicates that the codeset is UTF-8. But
4706 * what is it? The other locale categories are not likely to be of
4707 * further help:
4708 *
4709 * LC_NUMERIC Only a few locales in the world have a non-ASCII radix
4710 * or group separator.
4711 * LC_CTYPE This code wouldn't be compiled if mbtowc() existed and
4712 * was reliable. This is unlikely in C99. There are
4713 * other functions that could be used instead, but are
4714 * they going to exist, and be able to distinguish between
4715 * UTF-8 and 8859-1? Deal with this only if it becomes
4716 * necessary.
4717 * LC_MESSAGES The strings returned from strerror() would seem likely
4718 * candidates, but experience has shown that many systems
4719 * don't actually have translations installed for them.
4720 * They are instead always in English, so everything in
4721 * them is ASCII, which is of no help to us. A Configure
4722 * probe could possibly be written to see if this platform
4723 * has non-ASCII error messages. But again, wait until it
4724 * turns out to be an actual problem. */
4725
4726 # endif /* ! mbtowc() */
4727
4728 /* Rejoin the mbtowc available/not-available cases.
4729 *
4730 * We got here only because we haven't been able to find the codeset.
4731 * The only other option khw could think of is to see if the codeset is
4732 * part of the locale name. This is very less than ideal; often there
4733 * is no code set in the name; and at other times they even lie.
4734 *
4735 * But there is an XPG standard syntax, which many locales follow:
4736 *
4737 * language[_territory[.codeset]][@modifier]
4738 *
4739 * So we take the part between the dot and any '@' */
4740 retval = (const char *) strchr(locale, '.');
4741 if (! retval) {
4742 retval = ""; /* Alas, no dot */
4743 break;
4744 }
4745
4746 /* Don't include the dot */
4747 retval++;
4748
4749 /* And stop before any '@' */
4750 const char * modifier = strchr(retval, '@');
4751 if (modifier) {
4752 char * code_set_name;
4753 const Size_t name_len = modifier - retval;
4754 Newx(code_set_name, name_len + 1, char); /* +1 for NUL */
4755 my_strlcpy(code_set_name, retval, name_len + 1);
4756 SAVEFREEPV(code_set_name);
4757 retval = code_set_name;
4758 }
4759
4760 # if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
4761
4762 /* When these functions, are available, they were tried earlier and
4763 * indicated that the locale did not act like a proper UTF-8 one. So
4764 * if it claims to be UTF-8, it is a lie */
4765 if (is_codeset_name_UTF8(retval)) {
4766 retval = "";
4767 break;
4768 }
4769
4770 # endif
4771
4772 /* Otherwise the code set name is considered to be everything between
4773 * the dot and the '@' */
4774 retval = save_to_buffer(retval, retbufp, retbuf_sizep);
4775
4776 break;
4777
4778 # endif
4779
4780 } /* Giant switch() of nl_langinfo() items */
4781
4782 restore_toggled_locale_i(cat_index, orig_switched_locale);
4783
4784 # ifdef USE_LOCALE_CTYPE
4785 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
4786 # endif
4787
4788 if (utf8ness) {
4789 *utf8ness = is_utf8;
4790 }
4791
4792 return retval;
4793
4794 # endif /* All the implementations of my_langinfo() */
4795
4796 /*--------------------------------------------------------------------------*/
4797
4798 } /* my_langinfo() */
4799
4800 #endif /* USE_LOCALE */
4801
4802 char *
Perl_my_strftime(pTHX_ const char * fmt,int sec,int min,int hour,int mday,int mon,int year,int wday,int yday,int isdst)4803 Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour, int mday, int mon, int year, int wday, int yday, int isdst)
4804 {
4805 #ifdef HAS_STRFTIME
4806
4807 /*
4808 =for apidoc_section $time
4809 =for apidoc my_strftime
4810
4811 strftime(), but with a different API so that the return value is a pointer
4812 to the formatted result (which MUST be arranged to be FREED BY THE
4813 CALLER). This allows this function to increase the buffer size as needed,
4814 so that the caller doesn't have to worry about that.
4815
4816 On failure it returns NULL.
4817
4818 Note that yday and wday effectively are ignored by this function, as
4819 mini_mktime() overwrites them.
4820
4821 Also note that it is always executed in the underlying C<LC_TIME> locale of
4822 the program, giving results based on that locale.
4823
4824 =cut
4825 */
4826 PERL_ARGS_ASSERT_MY_STRFTIME;
4827
4828 /* An empty format yields an empty result */
4829 const int fmtlen = strlen(fmt);
4830 if (fmtlen == 0) {
4831 char *ret;
4832 Newxz (ret, 1, char);
4833 return ret;
4834 }
4835
4836 /* Set mytm to now */
4837 struct tm mytm;
4838 init_tm(&mytm); /* XXX workaround - see Perl_init_tm() */
4839
4840 /* Override with the passed-in values */
4841 mytm.tm_sec = sec;
4842 mytm.tm_min = min;
4843 mytm.tm_hour = hour;
4844 mytm.tm_mday = mday;
4845 mytm.tm_mon = mon;
4846 mytm.tm_year = year;
4847 mytm.tm_wday = wday;
4848 mytm.tm_yday = yday;
4849 mytm.tm_isdst = isdst;
4850 mini_mktime(&mytm);
4851
4852 /* use libc to get the values for tm_gmtoff and tm_zone on platforms that
4853 * have them [perl #18238] */
4854 #if defined(HAS_MKTIME) && (defined(HAS_TM_TM_GMTOFF) || defined(HAS_TM_TM_ZONE))
4855 struct tm mytm2;
4856 mytm2 = mytm;
4857 MKTIME_LOCK;
4858 mktime(&mytm2);
4859 MKTIME_UNLOCK;
4860 # ifdef HAS_TM_TM_GMTOFF
4861 mytm.tm_gmtoff = mytm2.tm_gmtoff;
4862 # endif
4863 # ifdef HAS_TM_TM_ZONE
4864 mytm.tm_zone = mytm2.tm_zone;
4865 # endif
4866 #endif
4867 #if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME)
4868
4869 const char * orig_CTYPE_LOCALE = toggle_locale_c(LC_CTYPE,
4870 querylocale_c(LC_TIME));
4871 #endif
4872
4873 /* Guess an initial size for the returned string based on an expansion
4874 * factor of the input format, but with a minimum that should handle most
4875 * common cases. If this guess is too small, we will try again with a
4876 * larger one */
4877 int bufsize = MAX(fmtlen * 2, 64);
4878
4879 char *buf = NULL; /* Makes Renew() act as Newx() on the first iteration */
4880 do {
4881 Renew(buf, bufsize, char);
4882
4883 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
4884
4885 STRFTIME_LOCK;
4886 int len = strftime(buf, bufsize, fmt, &mytm);
4887 STRFTIME_UNLOCK;
4888
4889 GCC_DIAG_RESTORE_STMT;
4890
4891 /* A non-zero return indicates success. But to make sure we're not
4892 * dealing with some rogue strftime that returns how much space it
4893 * needs instead of 0 when there isn't enough, check that the return
4894 * indicates we have at least one byte of spare space (which will be
4895 * used for the terminating NUL). */
4896 if (inRANGE(len, 1, bufsize - 1)) {
4897 goto strftime_success;
4898 }
4899
4900 /* There are several possible reasons for a 0 return code for a
4901 * non-empty format, and they are not trivial to tease apart. This
4902 * issue is a known bug in the strftime() API. What we do to cope is
4903 * to assume that the reason is not enough space in the buffer, so
4904 * increase it and try again. */
4905 bufsize *= 2;
4906
4907 /* But don't just keep increasing the size indefinitely. Stop when it
4908 * becomes obvious that the reason for failure is something besides not
4909 * enough space. The most likely largest expanding format is %c. On
4910 * khw's Linux box, the maximum result of this is 67 characters, in the
4911 * km_KH locale. If a new script comes along that uses 4 UTF-8 bytes
4912 * per character, and with a similar expansion factor, that would be a
4913 * 268:2 byte ratio, or a bit more than 128:1 = 2**7:1. Some strftime
4914 * implementations allow you to say %1000c to pad to 1000 bytes. This
4915 * shows that it is impossible to implement this without a heuristic
4916 * (that can fail). But it indicates we need to be generous in the
4917 * upper limit before failing. The previous heuristic used was too
4918 * stingy. Since the size doubles per iteration, it doesn't take many
4919 * to reach the limit */
4920 } while (bufsize < ((1 << 11) + 1) * fmtlen);
4921
4922 /* Here, strftime() returned 0, and it likely wasn't for lack of space.
4923 * There are two possible reasons:
4924 *
4925 * First is that the result is legitimately 0 length. This can happen
4926 * when the format is precisely "%p". That is the only documented format
4927 * that can have an empty result. */
4928 if (strEQ(fmt, "%p")) {
4929 Renew(buf, 1, char);
4930 *buf = '\0';
4931 goto strftime_success;
4932 }
4933
4934 /* The other reason is that the format string is malformed. Probably it is
4935 * an illegal conversion specifier.) */
4936 Safefree(buf);
4937 return NULL;
4938
4939 strftime_success:
4940
4941 #if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME)
4942
4943 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_LOCALE);
4944
4945 #endif
4946 return buf;
4947
4948 #else
4949 Perl_croak(aTHX_ "panic: no strftime");
4950 return NULL;
4951 #endif
4952
4953 }
4954
4955 char *
Perl_my_strftime8_temp(pTHX_ const char * fmt,int sec,int min,int hour,int mday,int mon,int year,int wday,int yday,int isdst,utf8ness_t * utf8ness)4956 Perl_my_strftime8_temp(pTHX_ const char *fmt, int sec, int min, int hour, int mday,
4957 int mon, int year, int wday, int yday, int isdst,
4958 utf8ness_t * utf8ness)
4959 { /* Documented above */
4960 char * retval = my_strftime(fmt, sec, min, hour, mday, mon, year, wday,
4961 yday, isdst);
4962
4963 PERL_ARGS_ASSERT_MY_STRFTIME8_TEMP;
4964
4965 if (utf8ness) {
4966
4967 #ifdef USE_LOCALE_TIME
4968 *utf8ness = get_locale_string_utf8ness_i(retval,
4969 LOCALE_UTF8NESS_UNKNOWN,
4970 NULL, LC_TIME_INDEX_);
4971 #else
4972 *utf8ness = UTF8NESS_IMMATERIAL;
4973 #endif
4974
4975 }
4976
4977 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4978 "fmt=%s, retval=%s", fmt,
4979 ((is_utf8_string((U8 *) retval, 0))
4980 ? retval
4981 :_byte_dump_string((U8 *) retval, strlen(retval), 0)));
4982 if (utf8ness) PerlIO_printf(Perl_debug_log, "; utf8ness=%d",
4983 (int) *utf8ness);
4984 PerlIO_printf(Perl_debug_log, "\n");
4985 );
4986
4987 return retval;
4988 }
4989
4990 /*
4991 * Initialize locale awareness.
4992 */
4993 int
Perl_init_i18nl10n(pTHX_ int printwarn)4994 Perl_init_i18nl10n(pTHX_ int printwarn)
4995 {
4996 /* printwarn is
4997 *
4998 * 0 if not to output warning when setup locale is bad
4999 * 1 if to output warning based on value of PERL_BADLANG
5000 * >1 if to output regardless of PERL_BADLANG
5001 *
5002 * returns
5003 * 1 = set ok or not applicable,
5004 * 0 = fallback to a locale of lower priority
5005 * -1 = fallback to all locales failed, not even to the C locale
5006 *
5007 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is
5008 * set, debugging information is output.
5009 *
5010 * This looks more complicated than it is, mainly due to the #ifdefs and
5011 * error handling.
5012 *
5013 * Besides some asserts, data structure initialization, and specific
5014 * platform complications, this routine is effectively represented by this
5015 * pseudo-code:
5016 *
5017 * setlocale(LC_ALL, ""); x
5018 * foreach (subcategory) { x
5019 * curlocales[f(subcategory)] = setlocale(subcategory, NULL); x
5020 * } x
5021 * if (platform_so_requires) {
5022 * foreach (subcategory) {
5023 * PL_curlocales[f(subcategory)] = curlocales[f(subcategory)]
5024 * }
5025 * }
5026 * foreach (subcategory) {
5027 * if (needs_special_handling[f(subcategory)] &this_subcat_handler
5028 * }
5029 *
5030 * This sets all the categories to the values in the current environment,
5031 * saves them temporarily in curlocales[] until they can be handled and/or
5032 * on some platforms saved in a per-thread array PL_curlocales[].
5033 *
5034 * f(foo) is a mapping from the opaque system category numbers to small
5035 * non-negative integers used most everywhere in this file as indices into
5036 * arrays (such as curlocales[]) so the program doesn't have to otherwise
5037 * deal with the opaqueness.
5038 *
5039 * If the platform doesn't have LC_ALL, the lines marked 'x' above are
5040 * effectively replaced by:
5041 * foreach (subcategory) { y
5042 * curlocales[f(subcategory)] = setlocale(subcategory, ""); y
5043 * } y
5044 *
5045 * The only differences being the lack of an LC_ALL call, and using ""
5046 * instead of NULL in the setlocale calls.
5047 *
5048 * But there are, of course, complications.
5049 *
5050 * it has to deal with if this is an embedded perl, whose locale doesn't
5051 * come from the environment, but has been set up by the caller. This is
5052 * pretty simply handled: the "" in the setlocale calls is not a string
5053 * constant, but a variable which is set to NULL in the embedded case.
5054 *
5055 * But the major complication is handling failure and doing fallback. All
5056 * the code marked 'x' or 'y' above is actually enclosed in an outer loop,
5057 * using the array trial_locales[]. On entry, trial_locales[] is
5058 * initialized to just one entry, containing the NULL or "" locale argument
5059 * shown above. If, as is almost always the case, everything works, it
5060 * exits after just the one iteration, going on to the next step.
5061 *
5062 * But if there is a failure, the code tries its best to honor the
5063 * environment as much as possible. It self-modifies trial_locales[] to
5064 * have more elements, one for each of the POSIX-specified settings from
5065 * the environment, such as LANG, ending in the ultimate fallback, the C
5066 * locale. Thus if there is something bogus with a higher priority
5067 * environment variable, it will try with the next highest, until something
5068 * works. If everything fails, it limps along with whatever state it got
5069 * to.
5070 *
5071 * A further complication is that Windows has an additional fallback, the
5072 * user-default ANSI code page obtained from the operating system. This is
5073 * added as yet another loop iteration, just before the final "C"
5074 *
5075 * A slight complication is that in embedded Perls, the locale may already
5076 * be set-up, and we don't want to get it from the normal environment
5077 * variables. This is handled by having a special environment variable
5078 * indicate we're in this situation. We simply set setlocale's 2nd
5079 * parameter to be a NULL instead of "". That indicates to setlocale that
5080 * it is not to change anything, but to return the current value,
5081 * effectively initializing perl's db to what the locale already is.
5082 *
5083 * We play the same trick with NULL if a LC_ALL succeeds. We call
5084 * setlocale() on the individual categories with NULL to get their existing
5085 * values for our db, instead of trying to change them.
5086 * */
5087
5088 int ok = 1;
5089
5090 #ifndef USE_LOCALE
5091
5092 PERL_UNUSED_ARG(printwarn);
5093
5094 #else /* USE_LOCALE */
5095 # ifdef __GLIBC__
5096
5097 const char * const language = PerlEnv_getenv("LANGUAGE");
5098
5099 # endif
5100
5101 /* NULL uses the existing already set up locale */
5102 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
5103 ? NULL
5104 : "";
5105 typedef struct trial_locales_struct_s {
5106 const char* trial_locale;
5107 const char* fallback_desc;
5108 const char* fallback_name;
5109 } trial_locales_struct;
5110 /* 5 = 1 each for "", LC_ALL, LANG, (Win32) system default locale, C */
5111 trial_locales_struct trial_locales[5];
5112 unsigned int trial_locales_count;
5113 const char * const lc_all = PerlEnv_getenv("LC_ALL");
5114 const char * const lang = PerlEnv_getenv("LANG");
5115 bool setlocale_failure = FALSE;
5116 unsigned int i;
5117
5118 /* A later getenv() could zap this, so only use here */
5119 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG");
5120
5121 const bool locwarn = (printwarn > 1
5122 || ( printwarn
5123 && ( ! bad_lang_use_once
5124 || (
5125 /* disallow with "" or "0" */
5126 *bad_lang_use_once
5127 && strNE("0", bad_lang_use_once)))));
5128
5129 /* current locale for given category; should have been copied so aren't
5130 * volatile */
5131 const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
5132
5133 # ifndef DEBUGGING
5134 # define DEBUG_LOCALE_INIT(a,b,c)
5135 # else
5136
5137 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
5138
5139 # define DEBUG_LOCALE_INIT(cat_index, locale, result) \
5140 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", \
5141 setlocale_debug_string_i(cat_index, locale, result)));
5142
5143 /* Make sure the parallel arrays are properly set up */
5144 # ifdef USE_LOCALE_NUMERIC
5145 assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC);
5146 assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC"));
5147 # ifdef USE_POSIX_2008_LOCALE
5148 assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK);
5149 # endif
5150 # endif
5151 # ifdef USE_LOCALE_CTYPE
5152 assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE);
5153 assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE"));
5154 # ifdef USE_POSIX_2008_LOCALE
5155 assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK);
5156 # endif
5157 # endif
5158 # ifdef USE_LOCALE_COLLATE
5159 assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE);
5160 assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE"));
5161 # ifdef USE_POSIX_2008_LOCALE
5162 assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK);
5163 # endif
5164 # endif
5165 # ifdef USE_LOCALE_TIME
5166 assert(categories[LC_TIME_INDEX_] == LC_TIME);
5167 assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME"));
5168 # ifdef USE_POSIX_2008_LOCALE
5169 assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK);
5170 # endif
5171 # endif
5172 # ifdef USE_LOCALE_MESSAGES
5173 assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES);
5174 assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES"));
5175 # ifdef USE_POSIX_2008_LOCALE
5176 assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK);
5177 # endif
5178 # endif
5179 # ifdef USE_LOCALE_MONETARY
5180 assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY);
5181 assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY"));
5182 # ifdef USE_POSIX_2008_LOCALE
5183 assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK);
5184 # endif
5185 # endif
5186 # ifdef USE_LOCALE_ADDRESS
5187 assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS);
5188 assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS"));
5189 # ifdef USE_POSIX_2008_LOCALE
5190 assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK);
5191 # endif
5192 # endif
5193 # ifdef USE_LOCALE_IDENTIFICATION
5194 assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION);
5195 assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION"));
5196 # ifdef USE_POSIX_2008_LOCALE
5197 assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK);
5198 # endif
5199 # endif
5200 # ifdef USE_LOCALE_MEASUREMENT
5201 assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT);
5202 assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT"));
5203 # ifdef USE_POSIX_2008_LOCALE
5204 assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK);
5205 # endif
5206 # endif
5207 # ifdef USE_LOCALE_PAPER
5208 assert(categories[LC_PAPER_INDEX_] == LC_PAPER);
5209 assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER"));
5210 # ifdef USE_POSIX_2008_LOCALE
5211 assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK);
5212 # endif
5213 # endif
5214 # ifdef USE_LOCALE_TELEPHONE
5215 assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE);
5216 assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE"));
5217 # ifdef USE_POSIX_2008_LOCALE
5218 assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK);
5219 # endif
5220 # endif
5221 # ifdef USE_LOCALE_NAME
5222 assert(categories[LC_NAME_INDEX_] == LC_NAME);
5223 assert(strEQ(category_names[LC_NAME_INDEX_], "LC_NAME"));
5224 # ifdef USE_POSIX_2008_LOCALE
5225 assert(category_masks[LC_NAME_INDEX_] == LC_NAME_MASK);
5226 # endif
5227 # endif
5228 # ifdef USE_LOCALE_SYNTAX
5229 assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX);
5230 assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX"));
5231 # ifdef USE_POSIX_2008_LOCALE
5232 assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK);
5233 # endif
5234 # endif
5235 # ifdef USE_LOCALE_TOD
5236 assert(categories[LC_TOD_INDEX_] == LC_TOD);
5237 assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD"));
5238 # ifdef USE_POSIX_2008_LOCALE
5239 assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK);
5240 # endif
5241 # endif
5242 # ifdef LC_ALL
5243 assert(categories[LC_ALL_INDEX_] == LC_ALL);
5244 assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
5245 STATIC_ASSERT_STMT(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_);
5246 # ifdef USE_POSIX_2008_LOCALE
5247 assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
5248 # endif
5249 # endif
5250 # endif /* DEBUGGING */
5251
5252 /* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for
5253 * why these particular incantations are used. */
5254 # ifdef HAS_MBRLEN
5255 memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
5256 # endif
5257 # ifdef HAS_MBRTOWC
5258 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
5259 # endif
5260 # ifdef HAS_WCTOMBR
5261 wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
5262 # endif
5263 # ifdef USE_THREAD_SAFE_LOCALE
5264 # ifdef WIN32
5265
5266 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
5267
5268 # endif
5269 # endif
5270 # ifdef USE_POSIX_2008_LOCALE
5271
5272 if (! PL_C_locale_obj) {
5273 PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
5274 }
5275 if (! PL_C_locale_obj) {
5276 locale_panic_(Perl_form(aTHX_
5277 "Cannot create POSIX 2008 C locale object"));
5278 }
5279
5280 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
5281 PL_C_locale_obj));
5282
5283 /* Switch to using the POSIX 2008 interface now. This would happen below
5284 * anyway, but deferring it can lead to leaks of memory that would also get
5285 * malloc'd in the interim */
5286 uselocale(PL_C_locale_obj);
5287
5288 # ifdef USE_LOCALE_NUMERIC
5289
5290 PL_underlying_numeric_obj = duplocale(PL_C_locale_obj);
5291
5292 # endif
5293 # endif
5294 # ifdef USE_LOCALE_NUMERIC
5295
5296 PL_numeric_radix_sv = newSV(1);
5297 PL_underlying_radix_sv = newSV(1);
5298 Newxz(PL_numeric_name, 1, char); /* Single NUL character */
5299 new_numeric("C", false);
5300
5301 # endif
5302 # ifdef USE_LOCALE_COLLATE
5303
5304 Newxz(PL_collation_name, 1, char);
5305 new_collate("C", false);
5306
5307 # endif
5308 # ifdef USE_LOCALE_CTYPE
5309
5310 Newxz(PL_ctype_name, 1, char);
5311 new_ctype("C", false);
5312
5313 # endif
5314 # ifdef USE_PL_CURLOCALES
5315
5316 /* Initialize our records. */
5317 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5318 (void) emulate_setlocale_i(i, posix_setlocale(categories[i], NULL),
5319 RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
5320 __LINE__);
5321 }
5322
5323 # endif
5324
5325 /* We try each locale in the list until we get one that works, or exhaust
5326 * the list. Normally the loop is executed just once. But if setting the
5327 * locale fails, inside the loop we add fallback trials to the array and so
5328 * will execute the loop multiple times */
5329 trial_locales_struct ts = {
5330 .trial_locale = setlocale_init,
5331 .fallback_desc = NULL,
5332 .fallback_name = NULL,
5333 };
5334 trial_locales[0] = ts;
5335 trial_locales_count = 1;
5336
5337 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5338 curlocales[i] = NULL;
5339 }
5340
5341 for (i= 0; i < trial_locales_count; i++) {
5342 const char * trial_locale = trial_locales[i].trial_locale;
5343 setlocale_failure = FALSE;
5344
5345 # ifdef LC_ALL
5346
5347 /* setlocale() return vals; not copied so must be looked at
5348 * immediately. */
5349 const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
5350 sl_result[LC_ALL_INDEX_] = stdized_setlocale(LC_ALL, trial_locale);
5351 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]);
5352 if (! sl_result[LC_ALL_INDEX_]) {
5353 setlocale_failure = TRUE;
5354 }
5355 else {
5356 /* Since LC_ALL succeeded, it should have changed all the other
5357 * categories it can to its value; so we massage things so that the
5358 * setlocales below just return their category's current values.
5359 * This adequately handles the case in NetBSD where LC_COLLATE may
5360 * not be defined for a locale, and setting it individually will
5361 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to
5362 * the POSIX locale. */
5363 trial_locale = NULL;
5364 }
5365
5366 # endif /* LC_ALL */
5367
5368 if (! setlocale_failure) {
5369 unsigned int j;
5370 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
5371 curlocales[j] = stdized_setlocale(categories[j], trial_locale);
5372 if (! curlocales[j]) {
5373 setlocale_failure = TRUE;
5374 }
5375 curlocales[j] = savepv(curlocales[j]);
5376 DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]);
5377 }
5378
5379 if (LIKELY(! setlocale_failure)) { /* All succeeded */
5380 break; /* Exit trial_locales loop */
5381 }
5382 }
5383
5384 /* Here, something failed; will need to try a fallback. */
5385 ok = 0;
5386
5387 if (i == 0) {
5388 unsigned int j;
5389
5390 if (locwarn) { /* Output failure info only on the first one */
5391
5392 # ifdef LC_ALL
5393
5394 PerlIO_printf(Perl_error_log,
5395 "perl: warning: Setting locale failed.\n");
5396
5397 # else /* !LC_ALL */
5398
5399 PerlIO_printf(Perl_error_log,
5400 "perl: warning: Setting locale failed for the categories:\n");
5401
5402 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
5403 if (! curlocales[j]) {
5404 PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
5405 }
5406 }
5407
5408 # endif /* LC_ALL */
5409
5410 PerlIO_printf(Perl_error_log,
5411 "perl: warning: Please check that your locale settings:\n");
5412
5413 # ifdef __GLIBC__
5414
5415 PerlIO_printf(Perl_error_log,
5416 "\tLANGUAGE = %c%s%c,\n",
5417 language ? '"' : '(',
5418 language ? language : "unset",
5419 language ? '"' : ')');
5420 # endif
5421
5422 PerlIO_printf(Perl_error_log,
5423 "\tLC_ALL = %c%s%c,\n",
5424 lc_all ? '"' : '(',
5425 lc_all ? lc_all : "unset",
5426 lc_all ? '"' : ')');
5427
5428 # if defined(USE_ENVIRON_ARRAY)
5429
5430 {
5431 char **e;
5432
5433 /* Look through the environment for any variables of the
5434 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was
5435 * already handled above. These are assumed to be locale
5436 * settings. Output them and their values. */
5437 for (e = environ; *e; e++) {
5438 const STRLEN prefix_len = sizeof("LC_") - 1;
5439 STRLEN uppers_len;
5440
5441 if ( strBEGINs(*e, "LC_")
5442 && ! strBEGINs(*e, "LC_ALL=")
5443 && (uppers_len = strspn(*e + prefix_len,
5444 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
5445 && ((*e)[prefix_len + uppers_len] == '='))
5446 {
5447 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n",
5448 (int) (prefix_len + uppers_len), *e,
5449 *e + prefix_len + uppers_len + 1);
5450 }
5451 }
5452 }
5453
5454 # else
5455
5456 PerlIO_printf(Perl_error_log,
5457 "\t(possibly more locale environment variables)\n");
5458
5459 # endif
5460
5461 PerlIO_printf(Perl_error_log,
5462 "\tLANG = %c%s%c\n",
5463 lang ? '"' : '(',
5464 lang ? lang : "unset",
5465 lang ? '"' : ')');
5466
5467 PerlIO_printf(Perl_error_log,
5468 " are supported and installed on your system.\n");
5469 }
5470
5471 /* Calculate what fallback locales to try. We have avoided this
5472 * until we have to, because failure is quite unlikely. This will
5473 * usually change the upper bound of the loop we are in.
5474 *
5475 * Since the system's default way of setting the locale has not
5476 * found one that works, We use Perl's defined ordering: LC_ALL,
5477 * LANG, and the C locale. We don't try the same locale twice, so
5478 * don't add to the list if already there. (On POSIX systems, the
5479 * LC_ALL element will likely be a repeat of the 0th element "",
5480 * but there's no harm done by doing it explicitly.
5481 *
5482 * Note that this tries the LC_ALL environment variable even on
5483 * systems which have no LC_ALL locale setting. This may or may
5484 * not have been originally intentional, but there's no real need
5485 * to change the behavior. */
5486 if (lc_all) {
5487 for (j = 0; j < trial_locales_count; j++) {
5488 if (strEQ(lc_all, trial_locales[j].trial_locale)) {
5489 goto done_lc_all;
5490 }
5491 }
5492 trial_locales_struct ts = {
5493 .trial_locale = lc_all,
5494 .fallback_desc = (strEQ(lc_all, "C")
5495 ? "the standard locale"
5496 : "a fallback locale"),
5497 .fallback_name = lc_all,
5498 };
5499 trial_locales[trial_locales_count++] = ts;
5500 }
5501 done_lc_all:
5502
5503 if (lang) {
5504 for (j = 0; j < trial_locales_count; j++) {
5505 if (strEQ(lang, trial_locales[j].trial_locale)) {
5506 goto done_lang;
5507 }
5508 }
5509 trial_locales_struct ts = {
5510 .trial_locale = lang,
5511 .fallback_desc = (strEQ(lang, "C")
5512 ? "the standard locale"
5513 : "a fallback locale"),
5514 .fallback_name = lang,
5515 };
5516 trial_locales[trial_locales_count++] = ts;
5517 }
5518 done_lang:
5519
5520 # if defined(WIN32) && defined(LC_ALL)
5521
5522 /* For Windows, we also try the system default locale before "C".
5523 * (If there exists a Windows without LC_ALL we skip this because
5524 * it gets too complicated. For those, the "C" is the next
5525 * fallback possibility). */
5526 {
5527 /* Note that this may change the locale, but we are going to do
5528 * that anyway.
5529 *
5530 * Our normal Windows setlocale() implementation ignores the
5531 * system default locale to make things work like POSIX. This
5532 * is the only place where we want to consider it, so have to
5533 * use wrap_wsetlocale(). */
5534 const char *system_default_locale =
5535 stdize_locale(LC_ALL,
5536 wrap_wsetlocale(LC_ALL, ""),
5537 &PL_stdize_locale_buf,
5538 &PL_stdize_locale_bufsize,
5539 __LINE__);
5540 DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale);
5541
5542 /* Skip if invalid or if it's already on the list of locales to
5543 * try */
5544 if (! system_default_locale) {
5545 goto done_system_default;
5546 }
5547 for (j = 0; j < trial_locales_count; j++) {
5548 if (strEQ(system_default_locale, trial_locales[j].trial_locale)) {
5549 goto done_system_default;
5550 }
5551 }
5552
5553 trial_locales_struct ts = {
5554 .trial_locale = system_default_locale,
5555 .fallback_desc = (strEQ(system_default_locale, "C")
5556 ? "the standard locale"
5557 : "the system default locale"),
5558 .fallback_name = system_default_locale,
5559 };
5560 trial_locales[trial_locales_count++] = ts;
5561 }
5562 done_system_default:
5563
5564 # endif
5565
5566 for (j = 0; j < trial_locales_count; j++) {
5567 if (strEQ("C", trial_locales[j].trial_locale)) {
5568 goto done_C;
5569 }
5570 }
5571 {
5572 /* new scope to avoid C++ complaining about
5573 initialization being bypassed by goto.
5574 */
5575 trial_locales_struct ts = {
5576 .trial_locale = "C",
5577 .fallback_desc = "the standard locale",
5578 .fallback_name = "C",
5579 };
5580 trial_locales[trial_locales_count++] = ts;
5581 }
5582 done_C: ;
5583 } /* end of first time through the loop */
5584 } /* end of looping through the trial locales */
5585
5586 if (ok < 1) { /* If we tried to fallback */
5587 const char* msg;
5588 if (! setlocale_failure) { /* fallback succeeded */
5589 msg = "Falling back to";
5590 }
5591 else { /* fallback failed */
5592 unsigned int j;
5593
5594 /* We dropped off the end of the loop, so have to decrement i to
5595 * get back to the value the last time through */
5596 i--;
5597
5598 ok = -1;
5599 msg = "Failed to fall back to";
5600
5601 /* To continue, we should use whatever values we've got */
5602
5603 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
5604 Safefree(curlocales[j]);
5605 curlocales[j] = savepv(stdized_setlocale(categories[j], NULL));
5606 DEBUG_LOCALE_INIT(j, NULL, curlocales[j]);
5607 }
5608 }
5609
5610 if (locwarn) {
5611 const char * description = trial_locales[i].fallback_desc;
5612 const char * name = trial_locales[i].fallback_name;
5613
5614 if (name && strNE(name, "")) {
5615 PerlIO_printf(Perl_error_log,
5616 "perl: warning: %s %s (\"%s\").\n", msg, description, name);
5617 }
5618 else {
5619 PerlIO_printf(Perl_error_log,
5620 "perl: warning: %s %s.\n", msg, description);
5621 }
5622 }
5623 } /* End of tried to fallback */
5624
5625 # ifdef USE_POSIX_2008_LOCALE
5626
5627 /* The stdized setlocales haven't affected the P2008 locales. Initialize
5628 * them now, calculating LC_ALL only on the final go round, when all have
5629 * been set. */
5630 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5631 (void) emulate_setlocale_i(i, curlocales[i],
5632 RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
5633 __LINE__);
5634 }
5635
5636 # endif
5637
5638 /* Done with finding the locales; update the auxiliary records */
5639 new_LC_ALL(NULL, false);
5640
5641 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
5642 Safefree(curlocales[i]);
5643 }
5644
5645 # if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE)
5646
5647 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE
5648 * locale is UTF-8. The call to new_ctype() just above has already
5649 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If
5650 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE})
5651 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on
5652 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */
5653 PL_utf8locale = PL_in_utf8_CTYPE_locale;
5654
5655 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO.
5656 This is an alternative to using the -C command line switch
5657 (the -C if present will override this). */
5658 {
5659 const char *p = PerlEnv_getenv("PERL_UNICODE");
5660 PL_unicode = p ? parse_unicode_opts(&p) : 0;
5661 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG)
5662 PL_utf8cache = -1;
5663 }
5664
5665 # endif
5666 #endif /* USE_LOCALE */
5667
5668 /* So won't continue to output stuff */
5669 DEBUG_INITIALIZATION_set(FALSE);
5670
5671 return ok;
5672 }
5673
5674 #ifdef USE_LOCALE_COLLATE
5675
5676 STATIC void
S_compute_collxfrm_coefficients(pTHX)5677 S_compute_collxfrm_coefficients(pTHX)
5678 {
5679
5680 /* A locale collation definition includes primary, secondary, tertiary,
5681 * etc. weights for each character. To sort, the primary weights are used,
5682 * and only if they compare equal, then the secondary weights are used, and
5683 * only if they compare equal, then the tertiary, etc.
5684 *
5685 * strxfrm() works by taking the input string, say ABC, and creating an
5686 * output transformed string consisting of first the primary weights,
5687 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the tertiary,
5688 * etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters may not have
5689 * weights at every level. In our example, let's say B doesn't have a
5690 * tertiary weight, and A doesn't have a secondary weight. The constructed
5691 * string is then going to be
5692 * A¹B¹C¹ B²C² A³C³ ....
5693 * This has the desired effect that strcmp() will look at the secondary or
5694 * tertiary weights only if the strings compare equal at all higher
5695 * priority weights. The spaces shown here, like in
5696 * "A¹B¹C¹ A²B²C² "
5697 * are not just for readability. In the general case, these must actually
5698 * be bytes, which we will call here 'separator weights'; and they must be
5699 * smaller than any other weight value, but since these are C strings, only
5700 * the terminating one can be a NUL (some implementations may include a
5701 * non-NUL separator weight just before the NUL). Implementations tend to
5702 * reserve 01 for the separator weights. They are needed so that a shorter
5703 * string's secondary weights won't be misconstrued as primary weights of a
5704 * longer string, etc. By making them smaller than any other weight, the
5705 * shorter string will sort first. (Actually, if all secondary weights are
5706 * smaller than all primary ones, there is no need for a separator weight
5707 * between those two levels, etc.)
5708 *
5709 * The length of the transformed string is roughly a linear function of the
5710 * input string. It's not exactly linear because some characters don't
5711 * have weights at all levels. When we call strxfrm() we have to allocate
5712 * some memory to hold the transformed string. The calculations below try
5713 * to find coefficients 'm' and 'b' for this locale so that m*x + b equals
5714 * how much space we need, given the size of the input string in 'x'. If
5715 * we calculate too small, we increase the size as needed, and call
5716 * strxfrm() again, but it is better to get it right the first time to
5717 * avoid wasted expensive string transformations.
5718 *
5719 * We use the string below to find how long the transformation of it is.
5720 * Almost all locales are supersets of ASCII, or at least the ASCII
5721 * letters. We use all of them, half upper half lower, because if we used
5722 * fewer, we might hit just the ones that are outliers in a particular
5723 * locale. Most of the strings being collated will contain a preponderance
5724 * of letters, and even if they are above-ASCII, they are likely to have
5725 * the same number of weight levels as the ASCII ones. It turns out that
5726 * digits tend to have fewer levels, and some punctuation has more, but
5727 * those are relatively sparse in text, and khw believes this gives a
5728 * reasonable result, but it could be changed if experience so dictates. */
5729 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
5730 char * x_longer; /* Transformed 'longer' */
5731 Size_t x_len_longer; /* Length of 'x_longer' */
5732
5733 char * x_shorter; /* We also transform a substring of 'longer' */
5734 Size_t x_len_shorter;
5735
5736 PL_in_utf8_COLLATE_locale = (PL_collation_standard)
5737 ? 0
5738 : is_locale_utf8(PL_collation_name);
5739 PL_strxfrm_NUL_replacement = '\0';
5740 PL_strxfrm_max_cp = 0;
5741
5742 /* mem_collxfrm_() is used get the transformation (though here we are
5743 * interested only in its length). It is used because it has the
5744 * intelligence to handle all cases, but to work, it needs some values of
5745 * 'm' and 'b' to get it started. For the purposes of this calculation we
5746 * use a very conservative estimate of 'm' and 'b'. This assumes a weight
5747 * can be multiple bytes, enough to hold any UV on the platform, and there
5748 * are 5 levels, 4 weight bytes, and a trailing NUL. */
5749 PL_collxfrm_base = 5;
5750 PL_collxfrm_mult = 5 * sizeof(UV);
5751
5752 /* Find out how long the transformation really is */
5753 x_longer = mem_collxfrm_(longer,
5754 sizeof(longer) - 1,
5755 &x_len_longer,
5756
5757 /* We avoid converting to UTF-8 in the called
5758 * function by telling it the string is in UTF-8
5759 * if the locale is a UTF-8 one. Since the string
5760 * passed here is invariant under UTF-8, we can
5761 * claim it's UTF-8 even though it isn't. */
5762 PL_in_utf8_COLLATE_locale);
5763 Safefree(x_longer);
5764
5765 /* Find out how long the transformation of a substring of 'longer' is.
5766 * Together the lengths of these transformations are sufficient to
5767 * calculate 'm' and 'b'. The substring is all of 'longer' except the
5768 * first character. This minimizes the chances of being swayed by outliers
5769 * */
5770 x_shorter = mem_collxfrm_(longer + 1,
5771 sizeof(longer) - 2,
5772 &x_len_shorter,
5773 PL_in_utf8_COLLATE_locale);
5774 Safefree(x_shorter);
5775
5776 /* If the results are nonsensical for this simple test, the whole locale
5777 * definition is suspect. Mark it so that locale collation is not active
5778 * at all for it. XXX Should we warn? */
5779 if ( x_len_shorter == 0
5780 || x_len_longer == 0
5781 || x_len_shorter >= x_len_longer)
5782 {
5783 PL_collxfrm_mult = 0;
5784 PL_collxfrm_base = 1;
5785 DEBUG_L(PerlIO_printf(Perl_debug_log,
5786 "Disabling locale collation for LC_COLLATE='%s';"
5787 " length for shorter sample=%zu; longer=%zu\n",
5788 PL_collation_name, x_len_shorter, x_len_longer));
5789 }
5790 else {
5791 SSize_t base; /* Temporary */
5792
5793 /* We have both: m * strlen(longer) + b = x_len_longer
5794 * m * strlen(shorter) + b = x_len_shorter;
5795 * subtracting yields:
5796 * m * (strlen(longer) - strlen(shorter))
5797 * = x_len_longer - x_len_shorter
5798 * But we have set things up so that 'shorter' is 1 byte smaller than
5799 * 'longer'. Hence:
5800 * m = x_len_longer - x_len_shorter
5801 *
5802 * But if something went wrong, make sure the multiplier is at least 1.
5803 */
5804 if (x_len_longer > x_len_shorter) {
5805 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
5806 }
5807 else {
5808 PL_collxfrm_mult = 1;
5809 }
5810
5811 /* mx + b = len
5812 * so: b = len - mx
5813 * but in case something has gone wrong, make sure it is non-negative
5814 * */
5815 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
5816 if (base < 0) {
5817 base = 0;
5818 }
5819
5820 /* Add 1 for the trailing NUL */
5821 PL_collxfrm_base = base + 1;
5822 }
5823
5824 DEBUG_L(PerlIO_printf(Perl_debug_log,
5825 "?UTF-8 locale=%d; x_len_shorter=%zu, "
5826 "x_len_longer=%zu,"
5827 " collate multipler=%zu, collate base=%zu\n",
5828 PL_in_utf8_COLLATE_locale,
5829 x_len_shorter, x_len_longer,
5830 PL_collxfrm_mult, PL_collxfrm_base));
5831 }
5832
5833 char *
Perl_mem_collxfrm_(pTHX_ const char * input_string,STRLEN len,STRLEN * xlen,bool utf8)5834 Perl_mem_collxfrm_(pTHX_ const char *input_string,
5835 STRLEN len, /* Length of 'input_string' */
5836 STRLEN *xlen, /* Set to length of returned string
5837 (not including the collation index
5838 prefix) */
5839 bool utf8 /* Is the input in UTF-8? */
5840 )
5841 {
5842 /* mem_collxfrm_() is like strxfrm() but with two important differences.
5843 * First, it handles embedded NULs. Second, it allocates a bit more memory
5844 * than needed for the transformed data itself. The real transformed data
5845 * begins at offset COLLXFRM_HDR_LEN. *xlen is set to the length of that,
5846 * and doesn't include the collation index size.
5847 *
5848 * It is the caller's responsibility to eventually free the memory returned
5849 * by this function.
5850 *
5851 * Please see sv_collxfrm() to see how this is used. */
5852
5853 # define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
5854
5855 char * s = (char *) input_string;
5856 STRLEN s_strlen = strlen(input_string);
5857 char *xbuf = NULL;
5858 STRLEN xAlloc; /* xalloc is a reserved word in VC */
5859 STRLEN length_in_chars;
5860 bool first_time = TRUE; /* Cleared after first loop iteration */
5861
5862 # ifdef USE_LOCALE_CTYPE
5863 const char * orig_CTYPE_locale = NULL;
5864 # endif
5865
5866 # if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
5867 locale_t constructed_locale = (locale_t) 0;
5868 # endif
5869
5870 PERL_ARGS_ASSERT_MEM_COLLXFRM_;
5871
5872 /* Must be NUL-terminated */
5873 assert(*(input_string + len) == '\0');
5874
5875 if (PL_collxfrm_mult == 0) { /* unknown or bad */
5876 if (PL_collxfrm_base != 0) { /* bad collation => skip */
5877 DEBUG_L(PerlIO_printf(Perl_debug_log,
5878 "mem_collxfrm_: locale's collation is defective\n"));
5879 goto bad;
5880 }
5881
5882 /* (mult, base) == (0,0) means we need to calculate mult and base
5883 * before proceeding */
5884 S_compute_collxfrm_coefficients(aTHX);
5885 }
5886
5887 /* Replace any embedded NULs with the control that sorts before any others.
5888 * This will give as good as possible results on strings that don't
5889 * otherwise contain that character, but otherwise there may be
5890 * less-than-perfect results with that character and NUL. This is
5891 * unavoidable unless we replace strxfrm with our own implementation. */
5892 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
5893 NUL */
5894 char * e = s + len;
5895 char * sans_nuls;
5896 STRLEN sans_nuls_len;
5897 int try_non_controls;
5898 char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
5899 making sure 2nd byte is NUL.
5900 */
5901 STRLEN this_replacement_len;
5902
5903 /* If we don't know what non-NUL control character sorts lowest for
5904 * this locale, find it */
5905 if (PL_strxfrm_NUL_replacement == '\0') {
5906 int j;
5907 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
5908 includes the collation index
5909 prefixed. */
5910
5911 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n"));
5912
5913 /* Unlikely, but it may be that no control will work to replace
5914 * NUL, in which case we instead look for any character. Controls
5915 * are preferred because collation order is, in general, context
5916 * sensitive, with adjoining characters affecting the order, and
5917 * controls are less likely to have such interactions, allowing the
5918 * NUL-replacement to stand on its own. (Another way to look at it
5919 * is to imagine what would happen if the NUL were replaced by a
5920 * combining character; it wouldn't work out all that well.) */
5921 for (try_non_controls = 0;
5922 try_non_controls < 2;
5923 try_non_controls++)
5924 {
5925
5926 # ifdef USE_LOCALE_CTYPE
5927
5928 /* In this case we use isCNTRL_LC() below, which relies on
5929 * LC_CTYPE, so that must be switched to correspond with the
5930 * LC_COLLATE locale */
5931 if (! try_non_controls && ! PL_in_utf8_COLLATE_locale) {
5932 orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
5933 }
5934 # endif
5935 /* Look through all legal code points (NUL isn't) */
5936 for (j = 1; j < 256; j++) {
5937 char * x; /* j's xfrm plus collation index */
5938 STRLEN x_len; /* length of 'x' */
5939 STRLEN trial_len = 1;
5940 char cur_source[] = { '\0', '\0' };
5941
5942 /* Skip non-controls the first time through the loop. The
5943 * controls in a UTF-8 locale are the L1 ones */
5944 if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
5945 ? ! isCNTRL_L1(j)
5946 : ! isCNTRL_LC(j))
5947 {
5948 continue;
5949 }
5950
5951 /* Create a 1-char string of the current code point */
5952 cur_source[0] = (char) j;
5953
5954 /* Then transform it */
5955 x = mem_collxfrm_(cur_source, trial_len, &x_len,
5956 0 /* The string is not in UTF-8 */);
5957
5958 /* Ignore any character that didn't successfully transform.
5959 * */
5960 if (! x) {
5961 continue;
5962 }
5963
5964 /* If this character's transformation is lower than
5965 * the current lowest, this one becomes the lowest */
5966 if ( cur_min_x == NULL
5967 || strLT(x + COLLXFRM_HDR_LEN,
5968 cur_min_x + COLLXFRM_HDR_LEN))
5969 {
5970 PL_strxfrm_NUL_replacement = j;
5971 Safefree(cur_min_x);
5972 cur_min_x = x;
5973 }
5974 else {
5975 Safefree(x);
5976 }
5977 } /* end of loop through all 255 characters */
5978
5979 # ifdef USE_LOCALE_CTYPE
5980 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
5981 # endif
5982
5983 /* Stop looking if found */
5984 if (cur_min_x) {
5985 break;
5986 }
5987
5988 /* Unlikely, but possible, if there aren't any controls that
5989 * work in the locale, repeat the loop, looking for any
5990 * character that works */
5991 DEBUG_L(PerlIO_printf(Perl_debug_log,
5992 "mem_collxfrm_: No control worked. Trying non-controls\n"));
5993 } /* End of loop to try first the controls, then any char */
5994
5995 if (! cur_min_x) {
5996 DEBUG_L(PerlIO_printf(Perl_debug_log,
5997 "mem_collxfrm_: Couldn't find any character to replace"
5998 " embedded NULs in locale %s with", PL_collation_name));
5999 goto bad;
6000 }
6001
6002 DEBUG_L(PerlIO_printf(Perl_debug_log,
6003 "mem_collxfrm_: Replacing embedded NULs in locale %s with "
6004 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
6005
6006 Safefree(cur_min_x);
6007 } /* End of determining the character that is to replace NULs */
6008
6009 /* If the replacement is variant under UTF-8, it must match the
6010 * UTF8-ness of the original */
6011 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
6012 this_replacement_char[0] =
6013 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
6014 this_replacement_char[1] =
6015 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
6016 this_replacement_len = 2;
6017 }
6018 else {
6019 this_replacement_char[0] = PL_strxfrm_NUL_replacement;
6020 /* this_replacement_char[1] = '\0' was done at initialization */
6021 this_replacement_len = 1;
6022 }
6023
6024 /* The worst case length for the replaced string would be if every
6025 * character in it is NUL. Multiply that by the length of each
6026 * replacement, and allow for a trailing NUL */
6027 sans_nuls_len = (len * this_replacement_len) + 1;
6028 Newx(sans_nuls, sans_nuls_len, char);
6029 *sans_nuls = '\0';
6030
6031 /* Replace each NUL with the lowest collating control. Loop until have
6032 * exhausted all the NULs */
6033 while (s + s_strlen < e) {
6034 my_strlcat(sans_nuls, s, sans_nuls_len);
6035
6036 /* Do the actual replacement */
6037 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len);
6038
6039 /* Move past the input NUL */
6040 s += s_strlen + 1;
6041 s_strlen = strlen(s);
6042 }
6043
6044 /* And add anything that trails the final NUL */
6045 my_strlcat(sans_nuls, s, sans_nuls_len);
6046
6047 /* Switch so below we transform this modified string */
6048 s = sans_nuls;
6049 len = strlen(s);
6050 } /* End of replacing NULs */
6051
6052 /* Make sure the UTF8ness of the string and locale match */
6053 if (utf8 != PL_in_utf8_COLLATE_locale) {
6054 /* XXX convert above Unicode to 10FFFF? */
6055 const char * const t = s; /* Temporary so we can later find where the
6056 input was */
6057
6058 /* Here they don't match. Change the string's to be what the locale is
6059 * expecting */
6060
6061 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
6062 s = (char *) bytes_to_utf8((const U8 *) s, &len);
6063 utf8 = TRUE;
6064 }
6065 else { /* locale is not UTF-8; but input is; downgrade the input */
6066
6067 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
6068
6069 /* If the downgrade was successful we are done, but if the input
6070 * contains things that require UTF-8 to represent, have to do
6071 * damage control ... */
6072 if (UNLIKELY(utf8)) {
6073
6074 /* What we do is construct a non-UTF-8 string with
6075 * 1) the characters representable by a single byte converted
6076 * to be so (if necessary);
6077 * 2) and the rest converted to collate the same as the
6078 * highest collating representable character. That makes
6079 * them collate at the end. This is similar to how we
6080 * handle embedded NULs, but we use the highest collating
6081 * code point instead of the smallest. Like the NUL case,
6082 * this isn't perfect, but is the best we can reasonably
6083 * do. Every above-255 code point will sort the same as
6084 * the highest-sorting 0-255 code point. If that code
6085 * point can combine in a sequence with some other code
6086 * points for weight calculations, us changing something to
6087 * be it can adversely affect the results. But in most
6088 * cases, it should work reasonably. And note that this is
6089 * really an illegal situation: using code points above 255
6090 * on a locale where only 0-255 are valid. If two strings
6091 * sort entirely equal, then the sort order for the
6092 * above-255 code points will be in code point order. */
6093
6094 utf8 = FALSE;
6095
6096 /* If we haven't calculated the code point with the maximum
6097 * collating order for this locale, do so now */
6098 if (! PL_strxfrm_max_cp) {
6099 int j;
6100
6101 /* The current transformed string that collates the
6102 * highest (except it also includes the prefixed collation
6103 * index. */
6104 char * cur_max_x = NULL;
6105
6106 /* Look through all legal code points (NUL isn't) */
6107 for (j = 1; j < 256; j++) {
6108 char * x;
6109 STRLEN x_len;
6110 char cur_source[] = { '\0', '\0' };
6111
6112 /* Create a 1-char string of the current code point */
6113 cur_source[0] = (char) j;
6114
6115 /* Then transform it */
6116 x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
6117
6118 /* If something went wrong (which it shouldn't), just
6119 * ignore this code point */
6120 if (! x) {
6121 continue;
6122 }
6123
6124 /* If this character's transformation is higher than
6125 * the current highest, this one becomes the highest */
6126 if ( cur_max_x == NULL
6127 || strGT(x + COLLXFRM_HDR_LEN,
6128 cur_max_x + COLLXFRM_HDR_LEN))
6129 {
6130 PL_strxfrm_max_cp = j;
6131 Safefree(cur_max_x);
6132 cur_max_x = x;
6133 }
6134 else {
6135 Safefree(x);
6136 }
6137 }
6138
6139 if (! cur_max_x) {
6140 DEBUG_L(PerlIO_printf(Perl_debug_log,
6141 "mem_collxfrm_: Couldn't find any character to"
6142 " replace above-Latin1 chars in locale %s with",
6143 PL_collation_name));
6144 goto bad;
6145 }
6146
6147 DEBUG_L(PerlIO_printf(Perl_debug_log,
6148 "mem_collxfrm_: highest 1-byte collating character"
6149 " in locale %s is 0x%02X\n",
6150 PL_collation_name,
6151 PL_strxfrm_max_cp));
6152
6153 Safefree(cur_max_x);
6154 }
6155
6156 /* Here we know which legal code point collates the highest.
6157 * We are ready to construct the non-UTF-8 string. The length
6158 * will be at least 1 byte smaller than the input string
6159 * (because we changed at least one 2-byte character into a
6160 * single byte), but that is eaten up by the trailing NUL */
6161 Newx(s, len, char);
6162
6163 {
6164 STRLEN i;
6165 STRLEN d= 0;
6166 char * e = (char *) t + len;
6167
6168 for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
6169 U8 cur_char = t[i];
6170 if (UTF8_IS_INVARIANT(cur_char)) {
6171 s[d++] = cur_char;
6172 }
6173 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
6174 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
6175 }
6176 else { /* Replace illegal cp with highest collating
6177 one */
6178 s[d++] = PL_strxfrm_max_cp;
6179 }
6180 }
6181 s[d++] = '\0';
6182 Renew(s, d, char); /* Free up unused space */
6183 }
6184 }
6185 }
6186
6187 /* Here, we have constructed a modified version of the input. It could
6188 * be that we already had a modified copy before we did this version.
6189 * If so, that copy is no longer needed */
6190 if (t != input_string) {
6191 Safefree(t);
6192 }
6193 }
6194
6195 length_in_chars = (utf8)
6196 ? utf8_length((U8 *) s, (U8 *) s + len)
6197 : len;
6198
6199 /* The first element in the output is the collation id, used by
6200 * sv_collxfrm(); then comes the space for the transformed string. The
6201 * equation should give us a good estimate as to how much is needed */
6202 xAlloc = COLLXFRM_HDR_LEN
6203 + PL_collxfrm_base
6204 + (PL_collxfrm_mult * length_in_chars);
6205 Newx(xbuf, xAlloc, char);
6206 if (UNLIKELY(! xbuf)) {
6207 DEBUG_L(PerlIO_printf(Perl_debug_log,
6208 "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
6209 goto bad;
6210 }
6211
6212 /* Store the collation id */
6213 *(U32*)xbuf = PL_collation_ix;
6214
6215 # if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
6216 # ifdef USE_LOCALE_CTYPE
6217
6218 constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
6219 duplocale(use_curlocale_scratch()));
6220 # else
6221
6222 constructed_locale = duplocale(use_curlocale_scratch());
6223
6224 # endif
6225 # define my_strxfrm(dest, src, n) strxfrm_l(dest, src, n, \
6226 constructed_locale)
6227 # define CLEANUP_STRXFRM \
6228 STMT_START { \
6229 if (constructed_locale != (locale_t) 0) \
6230 freelocale(constructed_locale); \
6231 } STMT_END
6232 # else
6233 # define my_strxfrm(dest, src, n) strxfrm(dest, src, n)
6234 # ifdef USE_LOCALE_CTYPE
6235
6236 orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
6237
6238 # define CLEANUP_STRXFRM \
6239 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
6240 # else
6241 # define CLEANUP_STRXFRM NOOP
6242 # endif
6243 # endif
6244
6245 /* Then the transformation of the input. We loop until successful, or we
6246 * give up */
6247 for (;;) {
6248
6249 errno = 0;
6250 *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
6251
6252 /* If the transformed string occupies less space than we told strxfrm()
6253 * was available, it means it transformed the whole string. */
6254 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
6255
6256 /* But there still could have been a problem */
6257 if (errno != 0) {
6258 DEBUG_L(PerlIO_printf(Perl_debug_log,
6259 "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
6260 PL_collation_name, errno,
6261 _byte_dump_string((U8 *) s, len, 0)));
6262 goto bad;
6263 }
6264
6265 /* Here, the transformation was successful. Some systems include a
6266 * trailing NUL in the returned length. Ignore it, using a loop in
6267 * case multiple trailing NULs are returned. */
6268 while ( (*xlen) > 0
6269 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
6270 {
6271 (*xlen)--;
6272 }
6273
6274 /* If the first try didn't get it, it means our prediction was low.
6275 * Modify the coefficients so that we predict a larger value in any
6276 * future transformations */
6277 if (! first_time) {
6278 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */
6279 STRLEN computed_guess = PL_collxfrm_base
6280 + (PL_collxfrm_mult * length_in_chars);
6281
6282 /* On zero-length input, just keep current slope instead of
6283 * dividing by 0 */
6284 const STRLEN new_m = (length_in_chars != 0)
6285 ? needed / length_in_chars
6286 : PL_collxfrm_mult;
6287
6288 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6289 "initial size of %zu bytes for a length "
6290 "%zu string was insufficient, %zu needed\n",
6291 computed_guess, length_in_chars, needed));
6292
6293 /* If slope increased, use it, but discard this result for
6294 * length 1 strings, as we can't be sure that it's a real slope
6295 * change */
6296 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) {
6297
6298 # ifdef DEBUGGING
6299
6300 STRLEN old_m = PL_collxfrm_mult;
6301 STRLEN old_b = PL_collxfrm_base;
6302
6303 # endif
6304
6305 PL_collxfrm_mult = new_m;
6306 PL_collxfrm_base = 1; /* +1 For trailing NUL */
6307 computed_guess = PL_collxfrm_base
6308 + (PL_collxfrm_mult * length_in_chars);
6309 if (computed_guess < needed) {
6310 PL_collxfrm_base += needed - computed_guess;
6311 }
6312
6313 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6314 "slope is now %zu; was %zu, base "
6315 "is now %zu; was %zu\n",
6316 PL_collxfrm_mult, old_m,
6317 PL_collxfrm_base, old_b));
6318 }
6319 else { /* Slope didn't change, but 'b' did */
6320 const STRLEN new_b = needed
6321 - computed_guess
6322 + PL_collxfrm_base;
6323 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6324 "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
6325 PL_collxfrm_base = new_b;
6326 }
6327 }
6328
6329 break;
6330 }
6331
6332 if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
6333 DEBUG_L(PerlIO_printf(Perl_debug_log,
6334 "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
6335 *xlen, PERL_INT_MAX));
6336 goto bad;
6337 }
6338
6339 /* A well-behaved strxfrm() returns exactly how much space it needs
6340 * (usually not including the trailing NUL) when it fails due to not
6341 * enough space being provided. Assume that this is the case unless
6342 * it's been proven otherwise */
6343 if (LIKELY(PL_strxfrm_is_behaved) && first_time) {
6344 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1;
6345 }
6346 else { /* Here, either:
6347 * 1) The strxfrm() has previously shown bad behavior; or
6348 * 2) It isn't the first time through the loop, which means
6349 * that the strxfrm() is now showing bad behavior, because
6350 * we gave it what it said was needed in the previous
6351 * iteration, and it came back saying it needed still more.
6352 * (Many versions of cygwin fit this. When the buffer size
6353 * isn't sufficient, they return the input size instead of
6354 * how much is needed.)
6355 * Increase the buffer size by a fixed percentage and try again.
6356 * */
6357 xAlloc += (xAlloc / 4) + 1;
6358 PL_strxfrm_is_behaved = FALSE;
6359
6360 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6361 "mem_collxfrm_ required more space than previously"
6362 " calculated for locale %s, trying again with new"
6363 " guess=%zu+%zu\n",
6364 PL_collation_name, COLLXFRM_HDR_LEN,
6365 xAlloc - COLLXFRM_HDR_LEN));
6366 }
6367
6368 Renew(xbuf, xAlloc, char);
6369 if (UNLIKELY(! xbuf)) {
6370 DEBUG_L(PerlIO_printf(Perl_debug_log,
6371 "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
6372 goto bad;
6373 }
6374
6375 first_time = FALSE;
6376 }
6377
6378 CLEANUP_STRXFRM;
6379
6380 DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
6381
6382 /* Free up unneeded space; retain enough for trailing NUL */
6383 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
6384
6385 if (s != input_string) {
6386 Safefree(s);
6387 }
6388
6389 return xbuf;
6390
6391 bad:
6392
6393 CLEANUP_STRXFRM;
6394 DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
6395
6396 Safefree(xbuf);
6397 if (s != input_string) {
6398 Safefree(s);
6399 }
6400 *xlen = 0;
6401
6402 return NULL;
6403 }
6404
6405 # ifdef DEBUGGING
6406
6407 STATIC void
S_print_collxfrm_input_and_return(pTHX_ const char * s,const char * e,const char * xbuf,const STRLEN xlen,const bool is_utf8)6408 S_print_collxfrm_input_and_return(pTHX_
6409 const char * s,
6410 const char * e,
6411 const char * xbuf,
6412 const STRLEN xlen,
6413 const bool is_utf8)
6414 {
6415
6416 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
6417
6418 PerlIO_printf(Perl_debug_log,
6419 "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n"
6420 " input=%s\n return=%s\n return len=%zu\n",
6421 (UV) PL_collation_ix, PL_collation_name,
6422 get_displayable_string(s, e, is_utf8),
6423 ((xbuf == NULL)
6424 ? "(null)"
6425 : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, xlen, 0)),
6426 xlen);
6427 }
6428
6429 # endif /* DEBUGGING */
6430
6431 SV *
Perl_strxfrm(pTHX_ SV * src)6432 Perl_strxfrm(pTHX_ SV * src)
6433 {
6434 PERL_ARGS_ASSERT_STRXFRM;
6435
6436 /* For use by POSIX::strxfrm(). If they differ, toggle LC_CTYPE to
6437 * LC_COLLATE to avoid potential mojibake.
6438 *
6439 * If we can't calculate a collation, 'src' is instead returned, so that
6440 * future comparisons will be by code point order */
6441
6442 # ifdef USE_LOCALE_CTYPE
6443
6444 const char * orig_ctype = toggle_locale_c(LC_CTYPE,
6445 querylocale_c(LC_COLLATE));
6446 # endif
6447
6448 SV * dst = src;
6449 STRLEN dstlen;
6450 STRLEN srclen;
6451 const char *p = SvPV_const(src,srclen);
6452 const U32 utf8_flag = SvUTF8(src);
6453 char *d = mem_collxfrm_(p, srclen, &dstlen, cBOOL(utf8_flag));
6454
6455 assert(utf8_flag == 0 || utf8_flag == SVf_UTF8);
6456
6457 if (d != NULL) {
6458 assert(dstlen > 0);
6459 dst =newSVpvn_flags(d + COLLXFRM_HDR_LEN,
6460 dstlen, SVs_TEMP|utf8_flag);
6461 Safefree(d);
6462 }
6463
6464 # ifdef USE_LOCALE_CTYPE
6465
6466 restore_toggled_locale_c(LC_CTYPE, orig_ctype);
6467
6468 # endif
6469
6470 return dst;
6471 }
6472
6473 #endif /* USE_LOCALE_COLLATE */
6474 #if defined(DEBUGGING) || defined(USE_POSIX_2008_LOCALE)
6475
6476 STATIC const char *
S_get_displayable_string(pTHX_ const char * const s,const char * const e,const bool is_utf8)6477 S_get_displayable_string(pTHX_
6478 const char * const s,
6479 const char * const e,
6480 const bool is_utf8)
6481 {
6482 PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING;
6483
6484 const char * t = s;
6485 bool prev_was_printable = TRUE;
6486 bool first_time = TRUE;
6487 char * ret;
6488
6489 if (e <= s) {
6490 return "";
6491 }
6492
6493 /* Worst case scenario: All are non-printable so have a blank between each.
6494 * If UTF-8, all are the largest possible code point; otherwise all are a
6495 * single byte. '(2 + 1)' is from each byte takes 2 characters to
6496 * display, and a blank (or NUL for the final one) after it */
6497 Newxz(ret, (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1), char);
6498 SAVEFREEPV(ret);
6499
6500 while (t < e) {
6501 UV cp = (is_utf8)
6502 ? utf8_to_uvchr_buf((U8 *) t, e, NULL)
6503 : * (U8 *) t;
6504 if (isPRINT(cp)) {
6505 if (! prev_was_printable) {
6506 my_strlcat(ret, " ", sizeof(ret));
6507 }
6508
6509 /* Escape these to avoid any ambiguity */
6510 if (cp == ' ' || cp == '\\') {
6511 my_strlcat(ret, "\\", sizeof(ret));
6512 }
6513 my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), sizeof(ret));
6514 prev_was_printable = TRUE;
6515 }
6516 else {
6517 if (! first_time) {
6518 my_strlcat(ret, " ", sizeof(ret));
6519 }
6520 my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), sizeof(ret));
6521 prev_was_printable = FALSE;
6522 }
6523 t += (is_utf8) ? UTF8SKIP(t) : 1;
6524 first_time = FALSE;
6525 }
6526
6527 return ret;
6528 }
6529
6530 #endif
6531 #ifdef USE_LOCALE
6532
6533 STATIC const char *
S_toggle_locale_i(pTHX_ const unsigned cat_index,const char * new_locale,const line_t caller_line)6534 S_toggle_locale_i(pTHX_ const unsigned cat_index,
6535 const char * new_locale,
6536 const line_t caller_line)
6537 {
6538 /* Changes the locale for the category specified by 'index' to 'new_locale,
6539 * if they aren't already the same.
6540 *
6541 * Returns a copy of the name of the original locale for 'cat_index'
6542 * so can be switched back to with the companion function
6543 * restore_toggled_locale_i(), (NULL if no restoral is necessary.) */
6544
6545 const char * locale_to_restore_to = NULL;
6546
6547 PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
6548 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6549
6550 /* Find the original locale of the category we may need to change, so that
6551 * it can be restored to later */
6552
6553 locale_to_restore_to = querylocale_i(cat_index);
6554
6555 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6556 "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
6557 " actual=%s\n",
6558 caller_line, cat_index, category_names[cat_index],
6559 new_locale, locale_to_restore_to));
6560
6561 if (! locale_to_restore_to) {
6562 locale_panic_(Perl_form(aTHX_
6563 "Could not find current %s locale, errno=%d",
6564 category_names[cat_index], errno));
6565 }
6566
6567 /* If the locales are the same, there's nothing to do */
6568 if (strEQ(locale_to_restore_to, new_locale)) {
6569 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6570 "(%" LINE_Tf "): %s locale unchanged as %s\n",
6571 caller_line, category_names[cat_index],
6572 new_locale));
6573
6574 return NULL;
6575 }
6576
6577 /* Finally, change the locale to the new one */
6578 void_setlocale_i(cat_index, new_locale);
6579
6580 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6581 "(%" LINE_Tf "): %s locale switched to %s\n",
6582 caller_line, category_names[cat_index], new_locale));
6583
6584 return locale_to_restore_to;
6585
6586 # ifndef DEBUGGING
6587 PERL_UNUSED_ARG(caller_line);
6588 # endif
6589
6590 }
6591
6592 STATIC void
S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index,const char * restore_locale,const line_t caller_line)6593 S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index,
6594 const char * restore_locale,
6595 const line_t caller_line)
6596 {
6597 /* Restores the locale for LC_category corresponding to cat_indes to
6598 * 'restore_locale' (which is a copy that will be freed by this function),
6599 * or do nothing if the latter parameter is NULL */
6600
6601 PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
6602 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
6603
6604 if (restore_locale == NULL) {
6605 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6606 "(%" LINE_Tf "): No need to restore %s\n",
6607 caller_line, category_names[cat_index]));
6608 return;
6609 }
6610
6611 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
6612 "(%" LINE_Tf "): %s restoring locale to %s\n",
6613 caller_line, category_names[cat_index],
6614 restore_locale));
6615
6616 void_setlocale_i(cat_index, restore_locale);
6617
6618 # ifndef DEBUGGING
6619 PERL_UNUSED_ARG(caller_line);
6620 # endif
6621
6622 }
6623
6624 # ifdef USE_LOCALE_CTYPE
6625
6626 STATIC bool
S_is_codeset_name_UTF8(const char * name)6627 S_is_codeset_name_UTF8(const char * name)
6628 {
6629 /* Return a boolean as to if the passed-in name indicates it is a UTF-8
6630 * code set. Several variants are possible */
6631 const Size_t len = strlen(name);
6632
6633 PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
6634
6635 # ifdef WIN32
6636
6637 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
6638 if (memENDs(name, len, "65001")) {
6639 return TRUE;
6640 }
6641
6642 # endif
6643 /* 'UTF8' or 'UTF-8' */
6644 return ( inRANGE(len, 4, 5)
6645 && name[len-1] == '8'
6646 && ( memBEGINs(name, len, "UTF")
6647 || memBEGINs(name, len, "utf"))
6648 && (len == 4 || name[3] == '-'));
6649 }
6650
6651 # endif
6652 #endif /* USE_LOCALE */
6653
6654 bool
Perl__is_in_locale_category(pTHX_ const bool compiling,const int category)6655 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
6656 {
6657 /* Internal function which returns if we are in the scope of a pragma that
6658 * enables the locale category 'category'. 'compiling' should indicate if
6659 * this is during the compilation phase (TRUE) or not (FALSE). */
6660
6661 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
6662
6663 SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
6664 if (! these_categories || these_categories == &PL_sv_placeholder) {
6665 return FALSE;
6666 }
6667
6668 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
6669 * a valid unsigned */
6670 assert(category >= -1);
6671 return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
6672 }
6673
6674 /* my_strerror() returns a mortalized copy of the text of the error message
6675 * associated with 'errnum'.
6676 *
6677 * If not called from within the scope of 'use locale', it uses the text from
6678 * the C locale. If Perl is compiled to not pay attention to LC_CTYPE nor
6679 * LC_MESSAGES, it uses whatever strerror() returns. Otherwise the text is
6680 * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
6681 *
6682 * It returns in *utf8ness the result's UTF-8ness
6683 *
6684 * The function just calls strerror(), but temporarily switches locales, if
6685 * needed. Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
6686 * CODESET in order for the return from strerror() to not contain '?' symbols,
6687 * or worse, mojibaked. It's cheaper to just use the stricter criteria of
6688 * being in the same locale. So the code below uses a common locale for both
6689 * categories. Again, that is C if not within 'use locale' scope; or the
6690 * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
6691 * don't have LC_MESSAGES; and whatever strerror returns if we don't have
6692 * either category.
6693 *
6694 * There are two sets of implementations. The first below is if we have
6695 * strerror_l(). This is the simpler. We just use the already-built C locale
6696 * object if not in locale scope, or build up a custom one otherwise.
6697 *
6698 * When strerror_l() is not available, we may have to swap locales temporarily
6699 * to bring the two categories into sync with each other, and possibly to the C
6700 * locale.
6701 *
6702 * Because the prepropessing directives to conditionally compile this function
6703 * would greatly obscure the logic of the various implementations, the whole
6704 * function is repeated for each configuration, with some common macros. */
6705
6706 /* Used to shorten the definitions of the following implementations of
6707 * my_strerror() */
6708 #define DEBUG_STRERROR_ENTER(errnum, in_locale) \
6709 DEBUG_Lv(PerlIO_printf(Perl_debug_log, \
6710 "my_strerror called with errnum %d;" \
6711 " Within locale scope=%d\n", \
6712 errnum, in_locale))
6713 #define DEBUG_STRERROR_RETURN(errstr, utf8ness) \
6714 DEBUG_Lv(PerlIO_printf(Perl_debug_log, \
6715 "Strerror returned; saving a copy: '%s';" \
6716 " utf8ness=%d\n", \
6717 get_displayable_string(errstr, \
6718 errstr + strlen(errstr), \
6719 *utf8ness), \
6720 (int) *utf8ness))
6721
6722 /* On platforms that have precisely one of these categories (Windows
6723 * qualifies), these yield the correct one */
6724 #if defined(USE_LOCALE_CTYPE)
6725 # define WHICH_LC_INDEX LC_CTYPE_INDEX_
6726 #elif defined(USE_LOCALE_MESSAGES)
6727 # define WHICH_LC_INDEX LC_MESSAGES_INDEX_
6728 #endif
6729
6730 /*==========================================================================*/
6731 /* First set of implementations, when have strerror_l() */
6732
6733 #if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
6734
6735 # if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
6736
6737 /* Here, neither category is defined: use the C locale */
6738 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6739 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6740 {
6741 PERL_ARGS_ASSERT_MY_STRERROR;
6742
6743 DEBUG_STRERROR_ENTER(errnum, 0);
6744
6745 const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6746 *utf8ness = UTF8NESS_IMMATERIAL;
6747
6748 DEBUG_STRERROR_RETURN(errstr, utf8ness);
6749
6750 SAVEFREEPV(errstr);
6751 return errstr;
6752 }
6753
6754 # elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
6755
6756 /*--------------------------------------------------------------------------*/
6757
6758 /* Here one or the other of CTYPE or MESSAGES is defined, but not both. If we
6759 * are not within 'use locale' scope of the only one defined, we use the C
6760 * locale; otherwise use the current locale object */
6761
6762 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6763 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6764 {
6765 PERL_ARGS_ASSERT_MY_STRERROR;
6766
6767 DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
6768
6769 /* Use C if not within locale scope; Otherwise, use current locale */
6770 const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
6771 ? PL_C_locale_obj
6772 : use_curlocale_scratch();
6773
6774 const char *errstr = savepv(strerror_l(errnum, which_obj));
6775 *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
6776 NULL, WHICH_LC_INDEX);
6777 DEBUG_STRERROR_RETURN(errstr, utf8ness);
6778
6779 SAVEFREEPV(errstr);
6780 return errstr;
6781 }
6782
6783 /*--------------------------------------------------------------------------*/
6784 # else /* Are using both categories. Place them in the same CODESET,
6785 * either C or the LC_MESSAGES locale */
6786
6787 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6788 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6789 {
6790 PERL_ARGS_ASSERT_MY_STRERROR;
6791
6792 DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
6793
6794 const char *errstr;
6795 if (! IN_LC(LC_MESSAGES)) { /* Use C if not within locale scope */
6796 errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
6797 *utf8ness = UTF8NESS_IMMATERIAL;
6798 }
6799 else { /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
6800 matches */
6801 locale_t cur = duplocale(use_curlocale_scratch());
6802
6803 cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur);
6804 errstr = savepv(strerror_l(errnum, cur));
6805 *utf8ness = get_locale_string_utf8ness_i(errstr,
6806 LOCALE_UTF8NESS_UNKNOWN,
6807 NULL, LC_MESSAGES_INDEX_);
6808 freelocale(cur);
6809 }
6810
6811 DEBUG_STRERROR_RETURN(errstr, utf8ness);
6812
6813 SAVEFREEPV(errstr);
6814 return errstr;
6815 }
6816 # endif /* Above is using strerror_l */
6817 /*==========================================================================*/
6818 #else /* Below is not using strerror_l */
6819 # if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
6820
6821 /* If not using using either of the categories, return plain, unadorned
6822 * strerror */
6823
6824 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6825 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6826 {
6827 PERL_ARGS_ASSERT_MY_STRERROR;
6828
6829 DEBUG_STRERROR_ENTER(errnum, 0);
6830
6831 const char *errstr = savepv(Strerror(errnum));
6832 *utf8ness = UTF8NESS_IMMATERIAL;
6833
6834 DEBUG_STRERROR_RETURN(errstr, utf8ness);
6835
6836 SAVEFREEPV(errstr);
6837 return errstr;
6838 }
6839
6840 /*--------------------------------------------------------------------------*/
6841 # elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
6842
6843 /* Here one or the other of CTYPE or MESSAGES is defined, but not both. If we
6844 * are not within 'use locale' scope of the only one defined, we use the C
6845 * locale; otherwise use the current locale */
6846
6847 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6848 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6849 {
6850 PERL_ARGS_ASSERT_MY_STRERROR;
6851
6852 DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
6853
6854 const char *errstr;
6855 if (IN_LC(categories[WHICH_LC_INDEX])) {
6856 errstr = savepv(Strerror(errnum));
6857 *utf8ness = get_locale_string_utf8ness_i(errstr,
6858 LOCALE_UTF8NESS_UNKNOWN,
6859 NULL, WHICH_LC_INDEX);
6860 }
6861 else {
6862
6863 SETLOCALE_LOCK;
6864
6865 const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
6866
6867 errstr = savepv(Strerror(errnum));
6868
6869 restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
6870
6871 SETLOCALE_UNLOCK;
6872
6873 *utf8ness = UTF8NESS_IMMATERIAL;
6874
6875 }
6876
6877 DEBUG_STRERROR_RETURN(errstr, utf8ness);
6878
6879 SAVEFREEPV(errstr);
6880 return errstr;
6881 }
6882
6883 /*--------------------------------------------------------------------------*/
6884 # else
6885
6886 /* Below, have both LC_CTYPE and LC_MESSAGES. Place them in the same CODESET,
6887 * either C or the LC_MESSAGES locale */
6888
6889 const char *
Perl_my_strerror(pTHX_ const int errnum,utf8ness_t * utf8ness)6890 Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
6891 {
6892 PERL_ARGS_ASSERT_MY_STRERROR;
6893
6894 DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
6895
6896 const char * desired_locale = (IN_LC(LC_MESSAGES))
6897 ? querylocale_c(LC_MESSAGES)
6898 : "C";
6899 /* XXX Can fail on z/OS */
6900
6901 SETLOCALE_LOCK;
6902
6903 const char* orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, desired_locale);
6904 const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
6905 desired_locale);
6906 const char *errstr = savepv(Strerror(errnum));
6907
6908 restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
6909 restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
6910
6911 SETLOCALE_UNLOCK;
6912
6913 *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
6914 NULL, LC_MESSAGES_INDEX_);
6915 DEBUG_STRERROR_RETURN(errstr, utf8ness);
6916
6917 SAVEFREEPV(errstr);
6918 return errstr;
6919 }
6920
6921 /*--------------------------------------------------------------------------*/
6922 # endif /* end of not using strerror_l() */
6923 #endif /* end of all the my_strerror() implementations */
6924
6925 /*
6926
6927 =for apidoc switch_to_global_locale
6928
6929 This function copies the locale state of the calling thread into the program's
6930 global locale, and converts the thread to use that global locale.
6931
6932 It is intended so that Perl can safely be used with C libraries that access the
6933 global locale and which can't be converted to not access it. Effectively, this
6934 means libraries that call C<L<setlocale(3)>> on non-Windows systems. (For
6935 portability, it is a good idea to use it on Windows as well.)
6936
6937 A downside of using it is that it disables the services that Perl provides to
6938 hide locale gotchas from your code. The service you most likely will miss
6939 regards the radix character (decimal point) in floating point numbers. Code
6940 executed after this function is called can no longer just assume that this
6941 character is correct for the current circumstances.
6942
6943 To return to Perl control, and restart the gotcha prevention services, call
6944 C<L</sync_locale>>. Behavior is undefined for any pure Perl code that executes
6945 while the switch is in effect.
6946
6947 The global locale and the per-thread locales are independent. As long as just
6948 one thread converts to the global locale, everything works smoothly. But if
6949 more than one does, they can easily interfere with each other, and races are
6950 likely. On Windows systems prior to Visual Studio 15 (at which point Microsoft
6951 fixed a bug), races can occur (even if only one thread has been converted to
6952 the global locale), but only if you use the following operations:
6953
6954 =over
6955
6956 =item L<POSIX::localeconv|POSIX/localeconv>
6957
6958 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6959
6960 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP>
6961
6962 =back
6963
6964 The first item is not fixable (except by upgrading to a later Visual Studio
6965 release), but it would be possible to work around the latter two items by
6966 having Perl change its algorithm for calculating these to use Windows API
6967 functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches
6968 welcome.
6969
6970 XS code should never call plain C<setlocale>, but should instead be converted
6971 to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in
6972 for the system C<setlocale>) or use the methods given in L<perlcall> to call
6973 L<C<POSIX::setlocale>|POSIX/setlocale>. Either one will transparently properly
6974 handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
6975
6976 =cut
6977 */
6978
6979 void
Perl_switch_to_global_locale(pTHX)6980 Perl_switch_to_global_locale(pTHX)
6981 {
6982
6983 #ifdef USE_LOCALE
6984
6985 DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n",
6986 get_LC_ALL_display()));
6987 bool perl_controls = false;
6988
6989 # ifdef USE_THREAD_SAFE_LOCALE
6990
6991 /* In these cases, we use the system state to determine if we are in the
6992 * global locale or not. */
6993
6994 # ifdef USE_POSIX_2008_LOCALE
6995
6996 perl_controls = (LC_GLOBAL_LOCALE != uselocale((locale_t) 0));
6997
6998 # elif defined(WIN32)
6999
7000 perl_controls = (_configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE);
7001
7002 # else
7003 # error Unexpected Configuration
7004 # endif
7005 # endif
7006
7007 /* No-op if already in global */
7008 if (! perl_controls) {
7009 return;
7010 }
7011
7012 # ifdef USE_THREAD_SAFE_LOCALE
7013 # if defined(WIN32)
7014
7015 const char * thread_locale = posix_setlocale(LC_ALL, NULL);
7016 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
7017 posix_setlocale(LC_ALL, thread_locale);
7018
7019 # else /* Must be USE_POSIX_2008_LOCALE) */
7020
7021 const char * cur_thread_locales[NOMINAL_LC_ALL_INDEX + 1];
7022
7023 /* Save each category's current per-thread state */
7024 for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
7025 cur_thread_locales[i] = querylocale_i(i);
7026 }
7027
7028 /* Now switch to global */
7029 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Switching to global locale\n"));
7030
7031 locale_t old_locale = uselocale(LC_GLOBAL_LOCALE);
7032 if (! old_locale) {
7033 locale_panic_("Could not change to global locale");
7034 }
7035
7036 /* Free the per-thread memory */
7037 if (old_locale != LC_GLOBAL_LOCALE && old_locale != PL_C_locale_obj) {
7038 freelocale(old_locale);
7039 }
7040
7041 /* Set the global to what was our per-thread state */
7042 POSIX_SETLOCALE_LOCK;
7043 for (unsigned int i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
7044 posix_setlocale(categories[i], cur_thread_locales[i]);
7045 }
7046 POSIX_SETLOCALE_UNLOCK;
7047
7048 # endif
7049 # endif
7050 # ifdef USE_LOCALE_NUMERIC
7051
7052 /* Switch to the underlying C numeric locale; the application is on its
7053 * own. */
7054 POSIX_SETLOCALE_LOCK;
7055 posix_setlocale(LC_NUMERIC, PL_numeric_name);
7056 POSIX_SETLOCALE_UNLOCK;
7057
7058 # endif
7059 #endif
7060
7061 }
7062
7063 /*
7064
7065 =for apidoc sync_locale
7066
7067 This function copies the state of the program global locale into the calling
7068 thread, and converts that thread to using per-thread locales, if it wasn't
7069 already, and the platform supports them. The LC_NUMERIC locale is toggled into
7070 the standard state (using the C locale's conventions), if not within the
7071 lexical scope of S<C<use locale>>.
7072
7073 Perl will now consider itself to have control of the locale.
7074
7075 Since unthreaded perls have only a global locale, this function is a no-op
7076 without threads.
7077
7078 This function is intended for use with C libraries that do locale manipulation.
7079 It allows Perl to accommodate the use of them. Call this function before
7080 transferring back to Perl space so that it knows what state the C code has left
7081 things in.
7082
7083 XS code should not manipulate the locale on its own. Instead,
7084 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
7085 change the locale (though changing the locale is antisocial and dangerous on
7086 multi-threaded systems that don't have multi-thread safe locale operations.
7087 (See L<perllocale/Multi-threaded operation>).
7088
7089 Using the libc L<C<setlocale(3)>> function should be avoided. Nevertheless,
7090 certain non-Perl libraries called from XS, do call it, and their behavior may
7091 not be able to be changed. This function, along with
7092 C<L</switch_to_global_locale>>, can be used to get seamless behavior in these
7093 circumstances, as long as only one thread is involved.
7094
7095 If the library has an option to turn off its locale manipulation, doing that is
7096 preferable to using this mechanism. C<Gtk> is such a library.
7097
7098 The return value is a boolean: TRUE if the global locale at the time of call
7099 was in effect for the caller; and FALSE if a per-thread locale was in effect.
7100
7101 =cut
7102 */
7103
7104 bool
Perl_sync_locale(pTHX)7105 Perl_sync_locale(pTHX)
7106 {
7107
7108 #ifndef USE_LOCALE
7109
7110 return TRUE;
7111
7112 #else
7113
7114 bool was_in_global = TRUE;
7115
7116 # ifdef USE_THREAD_SAFE_LOCALE
7117 # if defined(WIN32)
7118
7119 was_in_global = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
7120 == _DISABLE_PER_THREAD_LOCALE;
7121
7122 # elif defined(USE_POSIX_2008_LOCALE)
7123
7124 was_in_global = (LC_GLOBAL_LOCALE == uselocale((locale_t) 0));
7125
7126 # else
7127 # error Unexpected Configuration
7128 # endif
7129 # endif /* USE_THREAD_SAFE_LOCALE */
7130
7131 /* Here, we are in the global locale. Get and save the values for each
7132 * category. */
7133 const char * current_globals[NOMINAL_LC_ALL_INDEX];
7134 for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
7135 POSIX_SETLOCALE_LOCK;
7136 current_globals[i] = savepv(stdized_setlocale(categories[i], NULL));
7137 POSIX_SETLOCALE_UNLOCK;
7138 }
7139
7140 /* Now we have to convert the current thread to use them */
7141
7142 # if defined(WIN32)
7143
7144 /* On Windows, convert to per-thread behavior. This isn't necessary in
7145 * POSIX 2008, as the conversion gets done automatically in the loop below.
7146 * */
7147 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
7148
7149 # endif
7150
7151 for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
7152 setlocale_i(i, current_globals[i]);
7153 Safefree(current_globals[i]);
7154 }
7155
7156 /* And update our remaining records. 'true' => force recalculation */
7157 new_LC_ALL(NULL, true);
7158
7159 return was_in_global;
7160
7161 #endif
7162
7163 }
7164
7165 #if defined(DEBUGGING) && defined(USE_LOCALE)
7166
7167 STATIC char *
S_my_setlocale_debug_string_i(pTHX_ const unsigned cat_index,const char * locale,const char * retval,const line_t line)7168 S_my_setlocale_debug_string_i(pTHX_
7169 const unsigned cat_index,
7170 const char* locale, /* Optional locale name */
7171
7172 /* return value from setlocale() when attempting
7173 * to set 'category' to 'locale' */
7174 const char* retval,
7175
7176 const line_t line)
7177 {
7178 /* Returns a pointer to a NUL-terminated string in static storage with
7179 * added text about the info passed in. This is not thread safe and will
7180 * be overwritten by the next call, so this should be used just to
7181 * formulate a string to immediately print or savepv() on. */
7182
7183 const char * locale_quote;
7184 const char * retval_quote;
7185
7186 assert(cat_index <= NOMINAL_LC_ALL_INDEX);
7187
7188 if (locale == NULL) {
7189 locale_quote = "";
7190 locale = "NULL";
7191 }
7192 else {
7193 locale_quote = "\"";
7194 }
7195
7196 if (retval == NULL) {
7197 retval_quote = "";
7198 retval = "NULL";
7199 }
7200 else {
7201 retval_quote = "\"";
7202 }
7203
7204 # ifdef USE_LOCALE_THREADS
7205 # define THREAD_FORMAT "%p:"
7206 # define THREAD_ARGUMENT aTHX_
7207 # else
7208 # define THREAD_FORMAT
7209 # define THREAD_ARGUMENT
7210 # endif
7211
7212 return Perl_form(aTHX_
7213 "%s:%" LINE_Tf ": " THREAD_FORMAT
7214 " setlocale(%s[%d], %s%s%s) returned %s%s%s\n",
7215
7216 __FILE__, line, THREAD_ARGUMENT
7217 category_names[cat_index], categories[cat_index],
7218 locale_quote, locale, locale_quote,
7219 retval_quote, retval, retval_quote);
7220 }
7221
7222 #endif
7223 #ifdef USE_PERL_SWITCH_LOCALE_CONTEXT
7224
7225 void
Perl_switch_locale_context()7226 Perl_switch_locale_context()
7227 {
7228 /* libc keeps per-thread locale status information in some configurations.
7229 * So, we can't just switch out aTHX to switch to a new thread. libc has
7230 * to follow along. This routine does that based on per-interpreter
7231 * variables we keep just for this purpose */
7232
7233 /* Can't use pTHX, because we may be called from a place where that
7234 * isn't available */
7235 dTHX;
7236
7237 if (UNLIKELY( aTHX == NULL
7238 || PL_veto_switch_non_tTHX_context
7239 || PL_phase == PERL_PHASE_CONSTRUCT))
7240 {
7241 return;
7242 }
7243
7244 # ifdef USE_POSIX_2008_LOCALE
7245
7246 if (! uselocale(PL_cur_locale_obj)) {
7247 locale_panic_(Perl_form(aTHX_
7248 "Can't uselocale(%p), LC_ALL supposed to be '%s",
7249 PL_cur_locale_obj, get_LC_ALL_display()));
7250 }
7251
7252 # elif defined(WIN32)
7253
7254 if (! bool_setlocale_c(LC_ALL, PL_cur_LC_ALL)) {
7255 locale_panic_(Perl_form(aTHX_ "Can't setlocale(%s)", PL_cur_LC_ALL));
7256 }
7257
7258 # endif
7259
7260 }
7261
7262 #endif
7263
7264 void
Perl_thread_locale_init(pTHX)7265 Perl_thread_locale_init(pTHX)
7266 {
7267
7268 #ifdef USE_THREAD_SAFE_LOCALE
7269 # ifdef USE_POSIX_2008_LOCALE
7270
7271 /* Called from a thread on startup.
7272 *
7273 * The operations here have to be done from within the calling thread, as
7274 * they affect libc's knowledge of the thread; libc has no knowledge of
7275 * aTHX */
7276
7277 DEBUG_L(PerlIO_printf(Perl_debug_log,
7278 "new thread, initial locale is %s;"
7279 " calling setlocale(LC_ALL, \"C\")\n",
7280 get_LC_ALL_display()));
7281
7282 uselocale(PL_C_locale_obj);
7283
7284 # elif defined(WIN32)
7285
7286 /* On Windows, make sure new thread has per-thread locales enabled */
7287 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
7288 void_setlocale_c(LC_ALL, "C");
7289
7290 # endif
7291 #endif
7292
7293 }
7294
7295 void
Perl_thread_locale_term(pTHX)7296 Perl_thread_locale_term(pTHX)
7297 {
7298 /* Called from a thread as it gets ready to terminate.
7299 *
7300 * The operations here have to be done from within the calling thread, as
7301 * they affect libc's knowledge of the thread; libc has no knowledge of
7302 * aTHX */
7303
7304 #ifdef USE_POSIX_2008_LOCALE
7305
7306 /* C starts the new thread in the global C locale. If we are thread-safe,
7307 * we want to not be in the global locale */
7308
7309 /* Free up */
7310 locale_t actual_obj = uselocale(LC_GLOBAL_LOCALE);
7311 if (actual_obj != LC_GLOBAL_LOCALE && actual_obj != PL_C_locale_obj) {
7312 freelocale(actual_obj);
7313 }
7314
7315 /* Prevent leaks even if something has gone wrong */
7316 locale_t expected_obj = PL_cur_locale_obj;
7317 if (UNLIKELY( expected_obj != actual_obj
7318 && expected_obj != LC_GLOBAL_LOCALE
7319 && expected_obj != PL_C_locale_obj))
7320 {
7321 freelocale(expected_obj);
7322 }
7323
7324 PL_cur_locale_obj = LC_GLOBAL_LOCALE;
7325
7326 #endif
7327
7328 }
7329
7330 /*
7331 * ex: set ts=8 sts=4 sw=4 et:
7332 */
7333